{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.8433179723502304,
"eval_steps": 500,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0018433179723502304,
"grad_norm": 2.02589750289917,
"learning_rate": 1.8404907975460125e-07,
"loss": 1.9595,
"step": 1
},
{
"epoch": 0.003686635944700461,
"grad_norm": 1.9972498416900635,
"learning_rate": 3.680981595092025e-07,
"loss": 1.9238,
"step": 2
},
{
"epoch": 0.005529953917050691,
"grad_norm": 1.9820266962051392,
"learning_rate": 5.521472392638038e-07,
"loss": 1.9764,
"step": 3
},
{
"epoch": 0.007373271889400922,
"grad_norm": 1.9296741485595703,
"learning_rate": 7.36196319018405e-07,
"loss": 1.9161,
"step": 4
},
{
"epoch": 0.009216589861751152,
"grad_norm": 2.019129514694214,
"learning_rate": 9.202453987730061e-07,
"loss": 1.9629,
"step": 5
},
{
"epoch": 0.011059907834101382,
"grad_norm": 1.9039007425308228,
"learning_rate": 1.1042944785276075e-06,
"loss": 1.9158,
"step": 6
},
{
"epoch": 0.012903225806451613,
"grad_norm": 1.9537602663040161,
"learning_rate": 1.2883435582822088e-06,
"loss": 1.9572,
"step": 7
},
{
"epoch": 0.014746543778801843,
"grad_norm": 1.9474399089813232,
"learning_rate": 1.47239263803681e-06,
"loss": 1.9453,
"step": 8
},
{
"epoch": 0.016589861751152075,
"grad_norm": 1.8985520601272583,
"learning_rate": 1.656441717791411e-06,
"loss": 1.9377,
"step": 9
},
{
"epoch": 0.018433179723502304,
"grad_norm": 1.9258599281311035,
"learning_rate": 1.8404907975460122e-06,
"loss": 1.9312,
"step": 10
},
{
"epoch": 0.020276497695852536,
"grad_norm": 1.8695780038833618,
"learning_rate": 2.0245398773006137e-06,
"loss": 1.9273,
"step": 11
},
{
"epoch": 0.022119815668202765,
"grad_norm": 1.8287005424499512,
"learning_rate": 2.208588957055215e-06,
"loss": 1.8951,
"step": 12
},
{
"epoch": 0.023963133640552997,
"grad_norm": 1.7955673933029175,
"learning_rate": 2.392638036809816e-06,
"loss": 1.9054,
"step": 13
},
{
"epoch": 0.025806451612903226,
"grad_norm": 1.8065942525863647,
"learning_rate": 2.5766871165644175e-06,
"loss": 1.9343,
"step": 14
},
{
"epoch": 0.027649769585253458,
"grad_norm": 1.7342883348464966,
"learning_rate": 2.7607361963190186e-06,
"loss": 1.8964,
"step": 15
},
{
"epoch": 0.029493087557603687,
"grad_norm": 1.7930785417556763,
"learning_rate": 2.94478527607362e-06,
"loss": 1.9385,
"step": 16
},
{
"epoch": 0.03133640552995392,
"grad_norm": 1.6384600400924683,
"learning_rate": 3.1288343558282206e-06,
"loss": 1.9145,
"step": 17
},
{
"epoch": 0.03317972350230415,
"grad_norm": 1.5618699789047241,
"learning_rate": 3.312883435582822e-06,
"loss": 1.9392,
"step": 18
},
{
"epoch": 0.035023041474654376,
"grad_norm": 1.5488923788070679,
"learning_rate": 3.496932515337423e-06,
"loss": 1.8919,
"step": 19
},
{
"epoch": 0.03686635944700461,
"grad_norm": 1.3122743368148804,
"learning_rate": 3.6809815950920245e-06,
"loss": 1.8735,
"step": 20
},
{
"epoch": 0.03870967741935484,
"grad_norm": 1.2238839864730835,
"learning_rate": 3.865030674846626e-06,
"loss": 1.9463,
"step": 21
},
{
"epoch": 0.04055299539170507,
"grad_norm": 1.0287063121795654,
"learning_rate": 4.049079754601227e-06,
"loss": 1.8726,
"step": 22
},
{
"epoch": 0.0423963133640553,
"grad_norm": 0.8679629564285278,
"learning_rate": 4.233128834355828e-06,
"loss": 1.8418,
"step": 23
},
{
"epoch": 0.04423963133640553,
"grad_norm": 0.7079375386238098,
"learning_rate": 4.41717791411043e-06,
"loss": 1.8251,
"step": 24
},
{
"epoch": 0.04608294930875576,
"grad_norm": 0.9220630526542664,
"learning_rate": 4.601226993865031e-06,
"loss": 1.7802,
"step": 25
},
{
"epoch": 0.047926267281105994,
"grad_norm": 0.8715929985046387,
"learning_rate": 4.785276073619632e-06,
"loss": 1.8777,
"step": 26
},
{
"epoch": 0.04976958525345622,
"grad_norm": 0.9550811052322388,
"learning_rate": 4.969325153374233e-06,
"loss": 1.8371,
"step": 27
},
{
"epoch": 0.05161290322580645,
"grad_norm": 0.90284264087677,
"learning_rate": 5.153374233128835e-06,
"loss": 1.9045,
"step": 28
},
{
"epoch": 0.053456221198156684,
"grad_norm": 0.8552197217941284,
"learning_rate": 5.337423312883436e-06,
"loss": 1.8053,
"step": 29
},
{
"epoch": 0.055299539170506916,
"grad_norm": 0.8264591693878174,
"learning_rate": 5.521472392638037e-06,
"loss": 1.8451,
"step": 30
},
{
"epoch": 0.05714285714285714,
"grad_norm": 0.8775638341903687,
"learning_rate": 5.7055214723926385e-06,
"loss": 1.8552,
"step": 31
},
{
"epoch": 0.05898617511520737,
"grad_norm": 0.807953417301178,
"learning_rate": 5.88957055214724e-06,
"loss": 1.8164,
"step": 32
},
{
"epoch": 0.060829493087557605,
"grad_norm": 0.7050386071205139,
"learning_rate": 6.0736196319018406e-06,
"loss": 1.7819,
"step": 33
},
{
"epoch": 0.06267281105990784,
"grad_norm": 0.6614200472831726,
"learning_rate": 6.257668711656441e-06,
"loss": 1.8147,
"step": 34
},
{
"epoch": 0.06451612903225806,
"grad_norm": 0.5420933365821838,
"learning_rate": 6.4417177914110434e-06,
"loss": 1.7534,
"step": 35
},
{
"epoch": 0.0663594470046083,
"grad_norm": 0.42041122913360596,
"learning_rate": 6.625766871165644e-06,
"loss": 1.7973,
"step": 36
},
{
"epoch": 0.06820276497695853,
"grad_norm": 0.3737146854400635,
"learning_rate": 6.8098159509202454e-06,
"loss": 1.8,
"step": 37
},
{
"epoch": 0.07004608294930875,
"grad_norm": 0.37116268277168274,
"learning_rate": 6.993865030674846e-06,
"loss": 1.713,
"step": 38
},
{
"epoch": 0.07188940092165899,
"grad_norm": 0.37946197390556335,
"learning_rate": 7.177914110429448e-06,
"loss": 1.7656,
"step": 39
},
{
"epoch": 0.07373271889400922,
"grad_norm": 0.39086607098579407,
"learning_rate": 7.361963190184049e-06,
"loss": 1.7899,
"step": 40
},
{
"epoch": 0.07557603686635944,
"grad_norm": 0.4194660186767578,
"learning_rate": 7.54601226993865e-06,
"loss": 1.7247,
"step": 41
},
{
"epoch": 0.07741935483870968,
"grad_norm": 0.41651099920272827,
"learning_rate": 7.730061349693252e-06,
"loss": 1.7437,
"step": 42
},
{
"epoch": 0.0792626728110599,
"grad_norm": 0.4337350130081177,
"learning_rate": 7.914110429447852e-06,
"loss": 1.7758,
"step": 43
},
{
"epoch": 0.08110599078341015,
"grad_norm": 0.39606988430023193,
"learning_rate": 8.098159509202455e-06,
"loss": 1.7264,
"step": 44
},
{
"epoch": 0.08294930875576037,
"grad_norm": 0.35638228058815,
"learning_rate": 8.282208588957055e-06,
"loss": 1.6925,
"step": 45
},
{
"epoch": 0.0847926267281106,
"grad_norm": 0.3481025695800781,
"learning_rate": 8.466257668711656e-06,
"loss": 1.7767,
"step": 46
},
{
"epoch": 0.08663594470046083,
"grad_norm": 0.3551636040210724,
"learning_rate": 8.650306748466258e-06,
"loss": 1.8833,
"step": 47
},
{
"epoch": 0.08847926267281106,
"grad_norm": 0.3395654261112213,
"learning_rate": 8.83435582822086e-06,
"loss": 1.7341,
"step": 48
},
{
"epoch": 0.09032258064516129,
"grad_norm": 0.3334786891937256,
"learning_rate": 9.01840490797546e-06,
"loss": 1.7423,
"step": 49
},
{
"epoch": 0.09216589861751152,
"grad_norm": 0.3836233615875244,
"learning_rate": 9.202453987730062e-06,
"loss": 1.7198,
"step": 50
},
{
"epoch": 0.09400921658986175,
"grad_norm": 0.3345714807510376,
"learning_rate": 9.386503067484664e-06,
"loss": 1.7294,
"step": 51
},
{
"epoch": 0.09585253456221199,
"grad_norm": 0.3579282760620117,
"learning_rate": 9.570552147239264e-06,
"loss": 1.7277,
"step": 52
},
{
"epoch": 0.09769585253456221,
"grad_norm": 0.3569463789463043,
"learning_rate": 9.754601226993865e-06,
"loss": 1.7979,
"step": 53
},
{
"epoch": 0.09953917050691244,
"grad_norm": 0.3132180869579315,
"learning_rate": 9.938650306748466e-06,
"loss": 1.7392,
"step": 54
},
{
"epoch": 0.10138248847926268,
"grad_norm": 0.3075284957885742,
"learning_rate": 1.0122699386503068e-05,
"loss": 1.6929,
"step": 55
},
{
"epoch": 0.1032258064516129,
"grad_norm": 0.3144418001174927,
"learning_rate": 1.030674846625767e-05,
"loss": 1.7455,
"step": 56
},
{
"epoch": 0.10506912442396313,
"grad_norm": 0.30215781927108765,
"learning_rate": 1.0490797546012269e-05,
"loss": 1.7015,
"step": 57
},
{
"epoch": 0.10691244239631337,
"grad_norm": 0.3008810579776764,
"learning_rate": 1.0674846625766871e-05,
"loss": 1.702,
"step": 58
},
{
"epoch": 0.10875576036866359,
"grad_norm": 0.2983757555484772,
"learning_rate": 1.0858895705521474e-05,
"loss": 1.7593,
"step": 59
},
{
"epoch": 0.11059907834101383,
"grad_norm": 0.3113389015197754,
"learning_rate": 1.1042944785276074e-05,
"loss": 1.7446,
"step": 60
},
{
"epoch": 0.11244239631336406,
"grad_norm": 0.2977043390274048,
"learning_rate": 1.1226993865030675e-05,
"loss": 1.7666,
"step": 61
},
{
"epoch": 0.11428571428571428,
"grad_norm": 0.3056432902812958,
"learning_rate": 1.1411042944785277e-05,
"loss": 1.7226,
"step": 62
},
{
"epoch": 0.11612903225806452,
"grad_norm": 0.2962339520454407,
"learning_rate": 1.1595092024539878e-05,
"loss": 1.7503,
"step": 63
},
{
"epoch": 0.11797235023041475,
"grad_norm": 0.30185747146606445,
"learning_rate": 1.177914110429448e-05,
"loss": 1.7478,
"step": 64
},
{
"epoch": 0.11981566820276497,
"grad_norm": 0.29040098190307617,
"learning_rate": 1.1963190184049079e-05,
"loss": 1.779,
"step": 65
},
{
"epoch": 0.12165898617511521,
"grad_norm": 0.2708752155303955,
"learning_rate": 1.2147239263803681e-05,
"loss": 1.6816,
"step": 66
},
{
"epoch": 0.12350230414746544,
"grad_norm": 0.2978411018848419,
"learning_rate": 1.2331288343558283e-05,
"loss": 1.7439,
"step": 67
},
{
"epoch": 0.12534562211981568,
"grad_norm": 0.28008463978767395,
"learning_rate": 1.2515337423312882e-05,
"loss": 1.6918,
"step": 68
},
{
"epoch": 0.1271889400921659,
"grad_norm": 0.29284995794296265,
"learning_rate": 1.2699386503067485e-05,
"loss": 1.7043,
"step": 69
},
{
"epoch": 0.12903225806451613,
"grad_norm": 0.3163938522338867,
"learning_rate": 1.2883435582822087e-05,
"loss": 1.7501,
"step": 70
},
{
"epoch": 0.13087557603686636,
"grad_norm": 0.28190693259239197,
"learning_rate": 1.3067484662576687e-05,
"loss": 1.7396,
"step": 71
},
{
"epoch": 0.1327188940092166,
"grad_norm": 0.27740147709846497,
"learning_rate": 1.3251533742331288e-05,
"loss": 1.7276,
"step": 72
},
{
"epoch": 0.13456221198156681,
"grad_norm": 0.29363468289375305,
"learning_rate": 1.343558282208589e-05,
"loss": 1.7103,
"step": 73
},
{
"epoch": 0.13640552995391705,
"grad_norm": 0.28017935156822205,
"learning_rate": 1.3619631901840491e-05,
"loss": 1.7418,
"step": 74
},
{
"epoch": 0.1382488479262673,
"grad_norm": 0.2688934803009033,
"learning_rate": 1.3803680981595093e-05,
"loss": 1.7143,
"step": 75
},
{
"epoch": 0.1400921658986175,
"grad_norm": 0.28602007031440735,
"learning_rate": 1.3987730061349692e-05,
"loss": 1.7034,
"step": 76
},
{
"epoch": 0.14193548387096774,
"grad_norm": 0.2809165120124817,
"learning_rate": 1.4171779141104294e-05,
"loss": 1.726,
"step": 77
},
{
"epoch": 0.14377880184331798,
"grad_norm": 0.2808230519294739,
"learning_rate": 1.4355828220858897e-05,
"loss": 1.7184,
"step": 78
},
{
"epoch": 0.1456221198156682,
"grad_norm": 0.2852628827095032,
"learning_rate": 1.4539877300613497e-05,
"loss": 1.7506,
"step": 79
},
{
"epoch": 0.14746543778801843,
"grad_norm": 0.28406837582588196,
"learning_rate": 1.4723926380368098e-05,
"loss": 1.667,
"step": 80
},
{
"epoch": 0.14930875576036867,
"grad_norm": 0.2859112322330475,
"learning_rate": 1.49079754601227e-05,
"loss": 1.7629,
"step": 81
},
{
"epoch": 0.15115207373271888,
"grad_norm": 0.2811156213283539,
"learning_rate": 1.50920245398773e-05,
"loss": 1.7055,
"step": 82
},
{
"epoch": 0.15299539170506912,
"grad_norm": 0.27561160922050476,
"learning_rate": 1.52760736196319e-05,
"loss": 1.7149,
"step": 83
},
{
"epoch": 0.15483870967741936,
"grad_norm": 0.28458908200263977,
"learning_rate": 1.5460122699386504e-05,
"loss": 1.702,
"step": 84
},
{
"epoch": 0.15668202764976957,
"grad_norm": 0.2881270945072174,
"learning_rate": 1.5644171779141104e-05,
"loss": 1.7771,
"step": 85
},
{
"epoch": 0.1585253456221198,
"grad_norm": 0.2840360701084137,
"learning_rate": 1.5828220858895705e-05,
"loss": 1.704,
"step": 86
},
{
"epoch": 0.16036866359447005,
"grad_norm": 0.28690823912620544,
"learning_rate": 1.601226993865031e-05,
"loss": 1.7032,
"step": 87
},
{
"epoch": 0.1622119815668203,
"grad_norm": 0.28307586908340454,
"learning_rate": 1.619631901840491e-05,
"loss": 1.7504,
"step": 88
},
{
"epoch": 0.1640552995391705,
"grad_norm": 0.2825368046760559,
"learning_rate": 1.638036809815951e-05,
"loss": 1.6755,
"step": 89
},
{
"epoch": 0.16589861751152074,
"grad_norm": 0.2756441533565521,
"learning_rate": 1.656441717791411e-05,
"loss": 1.6542,
"step": 90
},
{
"epoch": 0.16774193548387098,
"grad_norm": 0.2785171866416931,
"learning_rate": 1.674846625766871e-05,
"loss": 1.7041,
"step": 91
},
{
"epoch": 0.1695852534562212,
"grad_norm": 0.2845054864883423,
"learning_rate": 1.693251533742331e-05,
"loss": 1.664,
"step": 92
},
{
"epoch": 0.17142857142857143,
"grad_norm": 0.2794802486896515,
"learning_rate": 1.7116564417177916e-05,
"loss": 1.674,
"step": 93
},
{
"epoch": 0.17327188940092167,
"grad_norm": 0.30158528685569763,
"learning_rate": 1.7300613496932516e-05,
"loss": 1.6702,
"step": 94
},
{
"epoch": 0.17511520737327188,
"grad_norm": 0.3102332353591919,
"learning_rate": 1.7484662576687117e-05,
"loss": 1.6675,
"step": 95
},
{
"epoch": 0.17695852534562212,
"grad_norm": 0.30239519476890564,
"learning_rate": 1.766871165644172e-05,
"loss": 1.735,
"step": 96
},
{
"epoch": 0.17880184331797236,
"grad_norm": 0.3213733732700348,
"learning_rate": 1.785276073619632e-05,
"loss": 1.7476,
"step": 97
},
{
"epoch": 0.18064516129032257,
"grad_norm": 0.30181968212127686,
"learning_rate": 1.803680981595092e-05,
"loss": 1.7693,
"step": 98
},
{
"epoch": 0.1824884792626728,
"grad_norm": 0.29706043004989624,
"learning_rate": 1.8220858895705523e-05,
"loss": 1.6862,
"step": 99
},
{
"epoch": 0.18433179723502305,
"grad_norm": 0.31851696968078613,
"learning_rate": 1.8404907975460123e-05,
"loss": 1.713,
"step": 100
},
{
"epoch": 0.18617511520737326,
"grad_norm": 0.2876022756099701,
"learning_rate": 1.8588957055214724e-05,
"loss": 1.6564,
"step": 101
},
{
"epoch": 0.1880184331797235,
"grad_norm": 0.30441394448280334,
"learning_rate": 1.8773006134969328e-05,
"loss": 1.7286,
"step": 102
},
{
"epoch": 0.18986175115207374,
"grad_norm": 0.2926851809024811,
"learning_rate": 1.8957055214723928e-05,
"loss": 1.6485,
"step": 103
},
{
"epoch": 0.19170506912442398,
"grad_norm": 0.2925703525543213,
"learning_rate": 1.914110429447853e-05,
"loss": 1.7372,
"step": 104
},
{
"epoch": 0.1935483870967742,
"grad_norm": 0.29036155343055725,
"learning_rate": 1.9325153374233126e-05,
"loss": 1.6868,
"step": 105
},
{
"epoch": 0.19539170506912443,
"grad_norm": 0.3051284849643707,
"learning_rate": 1.950920245398773e-05,
"loss": 1.7132,
"step": 106
},
{
"epoch": 0.19723502304147467,
"grad_norm": 0.295520156621933,
"learning_rate": 1.969325153374233e-05,
"loss": 1.6949,
"step": 107
},
{
"epoch": 0.19907834101382488,
"grad_norm": 0.3010528087615967,
"learning_rate": 1.987730061349693e-05,
"loss": 1.6948,
"step": 108
},
{
"epoch": 0.20092165898617512,
"grad_norm": 0.3026741147041321,
"learning_rate": 2.0061349693251535e-05,
"loss": 1.7014,
"step": 109
},
{
"epoch": 0.20276497695852536,
"grad_norm": 0.32000070810317993,
"learning_rate": 2.0245398773006136e-05,
"loss": 1.709,
"step": 110
},
{
"epoch": 0.20460829493087557,
"grad_norm": 0.29906362295150757,
"learning_rate": 2.0429447852760736e-05,
"loss": 1.6834,
"step": 111
},
{
"epoch": 0.2064516129032258,
"grad_norm": 0.31326961517333984,
"learning_rate": 2.061349693251534e-05,
"loss": 1.5777,
"step": 112
},
{
"epoch": 0.20829493087557605,
"grad_norm": 0.3156152069568634,
"learning_rate": 2.0797546012269938e-05,
"loss": 1.6934,
"step": 113
},
{
"epoch": 0.21013824884792626,
"grad_norm": 0.31258904933929443,
"learning_rate": 2.0981595092024538e-05,
"loss": 1.6717,
"step": 114
},
{
"epoch": 0.2119815668202765,
"grad_norm": 0.30550417304039,
"learning_rate": 2.1165644171779142e-05,
"loss": 1.6857,
"step": 115
},
{
"epoch": 0.21382488479262673,
"grad_norm": 0.3166547417640686,
"learning_rate": 2.1349693251533743e-05,
"loss": 1.6959,
"step": 116
},
{
"epoch": 0.21566820276497695,
"grad_norm": 0.30488964915275574,
"learning_rate": 2.1533742331288343e-05,
"loss": 1.6764,
"step": 117
},
{
"epoch": 0.21751152073732719,
"grad_norm": 0.30913662910461426,
"learning_rate": 2.1717791411042947e-05,
"loss": 1.6252,
"step": 118
},
{
"epoch": 0.21935483870967742,
"grad_norm": 0.33525151014328003,
"learning_rate": 2.1901840490797548e-05,
"loss": 1.6692,
"step": 119
},
{
"epoch": 0.22119815668202766,
"grad_norm": 0.329086035490036,
"learning_rate": 2.208588957055215e-05,
"loss": 1.6462,
"step": 120
},
{
"epoch": 0.22304147465437787,
"grad_norm": 0.3236452341079712,
"learning_rate": 2.226993865030675e-05,
"loss": 1.7658,
"step": 121
},
{
"epoch": 0.2248847926267281,
"grad_norm": 0.3473586142063141,
"learning_rate": 2.245398773006135e-05,
"loss": 1.6979,
"step": 122
},
{
"epoch": 0.22672811059907835,
"grad_norm": 0.2963578402996063,
"learning_rate": 2.263803680981595e-05,
"loss": 1.692,
"step": 123
},
{
"epoch": 0.22857142857142856,
"grad_norm": 0.3080257475376129,
"learning_rate": 2.2822085889570554e-05,
"loss": 1.6947,
"step": 124
},
{
"epoch": 0.2304147465437788,
"grad_norm": 0.38422197103500366,
"learning_rate": 2.3006134969325155e-05,
"loss": 1.7041,
"step": 125
},
{
"epoch": 0.23225806451612904,
"grad_norm": 0.30777859687805176,
"learning_rate": 2.3190184049079755e-05,
"loss": 1.7073,
"step": 126
},
{
"epoch": 0.23410138248847925,
"grad_norm": 0.43660104274749756,
"learning_rate": 2.337423312883436e-05,
"loss": 1.6824,
"step": 127
},
{
"epoch": 0.2359447004608295,
"grad_norm": 0.30751895904541016,
"learning_rate": 2.355828220858896e-05,
"loss": 1.7052,
"step": 128
},
{
"epoch": 0.23778801843317973,
"grad_norm": 0.3926418125629425,
"learning_rate": 2.3742331288343557e-05,
"loss": 1.6765,
"step": 129
},
{
"epoch": 0.23963133640552994,
"grad_norm": 0.32357603311538696,
"learning_rate": 2.3926380368098158e-05,
"loss": 1.6804,
"step": 130
},
{
"epoch": 0.24147465437788018,
"grad_norm": 0.3306221663951874,
"learning_rate": 2.411042944785276e-05,
"loss": 1.709,
"step": 131
},
{
"epoch": 0.24331797235023042,
"grad_norm": 0.38641783595085144,
"learning_rate": 2.4294478527607362e-05,
"loss": 1.6391,
"step": 132
},
{
"epoch": 0.24516129032258063,
"grad_norm": 0.3362821638584137,
"learning_rate": 2.4478527607361963e-05,
"loss": 1.6555,
"step": 133
},
{
"epoch": 0.24700460829493087,
"grad_norm": 0.37096643447875977,
"learning_rate": 2.4662576687116567e-05,
"loss": 1.7266,
"step": 134
},
{
"epoch": 0.2488479262672811,
"grad_norm": 0.3151519000530243,
"learning_rate": 2.4846625766871167e-05,
"loss": 1.7161,
"step": 135
},
{
"epoch": 0.25069124423963135,
"grad_norm": 0.32760757207870483,
"learning_rate": 2.5030674846625765e-05,
"loss": 1.641,
"step": 136
},
{
"epoch": 0.25253456221198156,
"grad_norm": 0.37584084272384644,
"learning_rate": 2.521472392638037e-05,
"loss": 1.7422,
"step": 137
},
{
"epoch": 0.2543778801843318,
"grad_norm": 0.31268641352653503,
"learning_rate": 2.539877300613497e-05,
"loss": 1.7261,
"step": 138
},
{
"epoch": 0.25622119815668204,
"grad_norm": 0.36679184436798096,
"learning_rate": 2.558282208588957e-05,
"loss": 1.7018,
"step": 139
},
{
"epoch": 0.25806451612903225,
"grad_norm": 0.31372857093811035,
"learning_rate": 2.5766871165644174e-05,
"loss": 1.6838,
"step": 140
},
{
"epoch": 0.25990783410138246,
"grad_norm": 0.35413920879364014,
"learning_rate": 2.5950920245398774e-05,
"loss": 1.7003,
"step": 141
},
{
"epoch": 0.26175115207373273,
"grad_norm": 0.3250369727611542,
"learning_rate": 2.6134969325153375e-05,
"loss": 1.6799,
"step": 142
},
{
"epoch": 0.26359447004608294,
"grad_norm": 0.33544111251831055,
"learning_rate": 2.631901840490798e-05,
"loss": 1.6357,
"step": 143
},
{
"epoch": 0.2654377880184332,
"grad_norm": 0.31145980954170227,
"learning_rate": 2.6503067484662576e-05,
"loss": 1.6062,
"step": 144
},
{
"epoch": 0.2672811059907834,
"grad_norm": 0.31925103068351746,
"learning_rate": 2.6687116564417177e-05,
"loss": 1.7424,
"step": 145
},
{
"epoch": 0.26912442396313363,
"grad_norm": 0.3047640919685364,
"learning_rate": 2.687116564417178e-05,
"loss": 1.7195,
"step": 146
},
{
"epoch": 0.2709677419354839,
"grad_norm": 0.29877611994743347,
"learning_rate": 2.705521472392638e-05,
"loss": 1.6467,
"step": 147
},
{
"epoch": 0.2728110599078341,
"grad_norm": 0.31033533811569214,
"learning_rate": 2.7239263803680982e-05,
"loss": 1.6234,
"step": 148
},
{
"epoch": 0.2746543778801843,
"grad_norm": 0.3081960380077362,
"learning_rate": 2.7423312883435586e-05,
"loss": 1.6945,
"step": 149
},
{
"epoch": 0.2764976958525346,
"grad_norm": 0.30935296416282654,
"learning_rate": 2.7607361963190186e-05,
"loss": 1.6172,
"step": 150
},
{
"epoch": 0.2783410138248848,
"grad_norm": 0.3287109136581421,
"learning_rate": 2.7791411042944787e-05,
"loss": 1.6771,
"step": 151
},
{
"epoch": 0.280184331797235,
"grad_norm": 0.3306177258491516,
"learning_rate": 2.7975460122699384e-05,
"loss": 1.6522,
"step": 152
},
{
"epoch": 0.2820276497695853,
"grad_norm": 0.30568936467170715,
"learning_rate": 2.8159509202453988e-05,
"loss": 1.6456,
"step": 153
},
{
"epoch": 0.2838709677419355,
"grad_norm": 0.31628063321113586,
"learning_rate": 2.834355828220859e-05,
"loss": 1.6521,
"step": 154
},
{
"epoch": 0.2857142857142857,
"grad_norm": 0.3133561909198761,
"learning_rate": 2.852760736196319e-05,
"loss": 1.6517,
"step": 155
},
{
"epoch": 0.28755760368663597,
"grad_norm": 0.327334463596344,
"learning_rate": 2.8711656441717793e-05,
"loss": 1.6422,
"step": 156
},
{
"epoch": 0.2894009216589862,
"grad_norm": 0.3071538209915161,
"learning_rate": 2.8895705521472394e-05,
"loss": 1.6693,
"step": 157
},
{
"epoch": 0.2912442396313364,
"grad_norm": 0.3373123109340668,
"learning_rate": 2.9079754601226994e-05,
"loss": 1.7079,
"step": 158
},
{
"epoch": 0.29308755760368665,
"grad_norm": 0.32550349831581116,
"learning_rate": 2.92638036809816e-05,
"loss": 1.7135,
"step": 159
},
{
"epoch": 0.29493087557603687,
"grad_norm": 0.2980889678001404,
"learning_rate": 2.9447852760736196e-05,
"loss": 1.6375,
"step": 160
},
{
"epoch": 0.2967741935483871,
"grad_norm": 0.3041258156299591,
"learning_rate": 2.9631901840490796e-05,
"loss": 1.6542,
"step": 161
},
{
"epoch": 0.29861751152073734,
"grad_norm": 0.32162049412727356,
"learning_rate": 2.98159509202454e-05,
"loss": 1.6308,
"step": 162
},
{
"epoch": 0.30046082949308756,
"grad_norm": 0.31242045760154724,
"learning_rate": 3e-05,
"loss": 1.7211,
"step": 163
},
{
"epoch": 0.30230414746543777,
"grad_norm": 0.32431089878082275,
"learning_rate": 2.9999965416241516e-05,
"loss": 1.6273,
"step": 164
},
{
"epoch": 0.30414746543778803,
"grad_norm": 0.34065431356430054,
"learning_rate": 2.999986166512553e-05,
"loss": 1.7136,
"step": 165
},
{
"epoch": 0.30599078341013825,
"grad_norm": 0.3061734437942505,
"learning_rate": 2.9999688747130467e-05,
"loss": 1.6912,
"step": 166
},
{
"epoch": 0.30783410138248846,
"grad_norm": 0.31865042448043823,
"learning_rate": 2.999944666305367e-05,
"loss": 1.6703,
"step": 167
},
{
"epoch": 0.3096774193548387,
"grad_norm": 0.32718777656555176,
"learning_rate": 2.999913541401143e-05,
"loss": 1.5595,
"step": 168
},
{
"epoch": 0.31152073732718893,
"grad_norm": 0.31636691093444824,
"learning_rate": 2.9998755001438975e-05,
"loss": 1.6433,
"step": 169
},
{
"epoch": 0.31336405529953915,
"grad_norm": 0.35521432757377625,
"learning_rate": 2.999830542709045e-05,
"loss": 1.6257,
"step": 170
},
{
"epoch": 0.3152073732718894,
"grad_norm": 0.34638574719429016,
"learning_rate": 2.9997786693038913e-05,
"loss": 1.6341,
"step": 171
},
{
"epoch": 0.3170506912442396,
"grad_norm": 0.3070574700832367,
"learning_rate": 2.9997198801676335e-05,
"loss": 1.646,
"step": 172
},
{
"epoch": 0.31889400921658984,
"grad_norm": 0.3159651458263397,
"learning_rate": 2.9996541755713585e-05,
"loss": 1.6753,
"step": 173
},
{
"epoch": 0.3207373271889401,
"grad_norm": 0.32200679183006287,
"learning_rate": 2.999581555818041e-05,
"loss": 1.6883,
"step": 174
},
{
"epoch": 0.3225806451612903,
"grad_norm": 0.30645352602005005,
"learning_rate": 2.9995020212425432e-05,
"loss": 1.656,
"step": 175
},
{
"epoch": 0.3244239631336406,
"grad_norm": 0.34474891424179077,
"learning_rate": 2.9994155722116118e-05,
"loss": 1.725,
"step": 176
},
{
"epoch": 0.3262672811059908,
"grad_norm": 0.37686192989349365,
"learning_rate": 2.999322209123878e-05,
"loss": 1.7542,
"step": 177
},
{
"epoch": 0.328110599078341,
"grad_norm": 0.3260898292064667,
"learning_rate": 2.9992219324098545e-05,
"loss": 1.6049,
"step": 178
},
{
"epoch": 0.32995391705069127,
"grad_norm": 0.37945932149887085,
"learning_rate": 2.9991147425319346e-05,
"loss": 1.637,
"step": 179
},
{
"epoch": 0.3317972350230415,
"grad_norm": 0.36349308490753174,
"learning_rate": 2.9990006399843884e-05,
"loss": 1.7051,
"step": 180
},
{
"epoch": 0.3336405529953917,
"grad_norm": 0.35796141624450684,
"learning_rate": 2.998879625293362e-05,
"loss": 1.7094,
"step": 181
},
{
"epoch": 0.33548387096774196,
"grad_norm": 0.35643306374549866,
"learning_rate": 2.9987516990168743e-05,
"loss": 1.6021,
"step": 182
},
{
"epoch": 0.33732718894009217,
"grad_norm": 0.3745727837085724,
"learning_rate": 2.9986168617448153e-05,
"loss": 1.6267,
"step": 183
},
{
"epoch": 0.3391705069124424,
"grad_norm": 0.32228630781173706,
"learning_rate": 2.9984751140989417e-05,
"loss": 1.662,
"step": 184
},
{
"epoch": 0.34101382488479265,
"grad_norm": 0.3829532563686371,
"learning_rate": 2.9983264567328756e-05,
"loss": 1.6909,
"step": 185
},
{
"epoch": 0.34285714285714286,
"grad_norm": 0.3452930152416229,
"learning_rate": 2.9981708903321017e-05,
"loss": 1.6502,
"step": 186
},
{
"epoch": 0.34470046082949307,
"grad_norm": 0.3659391701221466,
"learning_rate": 2.9980084156139625e-05,
"loss": 1.6409,
"step": 187
},
{
"epoch": 0.34654377880184334,
"grad_norm": 0.38383013010025024,
"learning_rate": 2.9978390333276565e-05,
"loss": 1.6765,
"step": 188
},
{
"epoch": 0.34838709677419355,
"grad_norm": 0.34483417868614197,
"learning_rate": 2.9976627442542325e-05,
"loss": 1.7486,
"step": 189
},
{
"epoch": 0.35023041474654376,
"grad_norm": 0.3946613669395447,
"learning_rate": 2.997479549206591e-05,
"loss": 1.6863,
"step": 190
},
{
"epoch": 0.35207373271889403,
"grad_norm": 0.3084375262260437,
"learning_rate": 2.9972894490294738e-05,
"loss": 1.6223,
"step": 191
},
{
"epoch": 0.35391705069124424,
"grad_norm": 0.42278456687927246,
"learning_rate": 2.9970924445994645e-05,
"loss": 1.7044,
"step": 192
},
{
"epoch": 0.35576036866359445,
"grad_norm": 0.3211970925331116,
"learning_rate": 2.9968885368249847e-05,
"loss": 1.6907,
"step": 193
},
{
"epoch": 0.3576036866359447,
"grad_norm": 0.3954881727695465,
"learning_rate": 2.9966777266462863e-05,
"loss": 1.7002,
"step": 194
},
{
"epoch": 0.35944700460829493,
"grad_norm": 0.3460248112678528,
"learning_rate": 2.9964600150354512e-05,
"loss": 1.639,
"step": 195
},
{
"epoch": 0.36129032258064514,
"grad_norm": 0.3247598707675934,
"learning_rate": 2.9962354029963835e-05,
"loss": 1.679,
"step": 196
},
{
"epoch": 0.3631336405529954,
"grad_norm": 0.4037436544895172,
"learning_rate": 2.9960038915648076e-05,
"loss": 1.7343,
"step": 197
},
{
"epoch": 0.3649769585253456,
"grad_norm": 0.3309732973575592,
"learning_rate": 2.9957654818082615e-05,
"loss": 1.6759,
"step": 198
},
{
"epoch": 0.36682027649769583,
"grad_norm": 0.38870948553085327,
"learning_rate": 2.9955201748260923e-05,
"loss": 1.7189,
"step": 199
},
{
"epoch": 0.3686635944700461,
"grad_norm": 0.2937754988670349,
"learning_rate": 2.9952679717494516e-05,
"loss": 1.6882,
"step": 200
},
{
"epoch": 0.3705069124423963,
"grad_norm": 0.3606261909008026,
"learning_rate": 2.9950088737412898e-05,
"loss": 1.6536,
"step": 201
},
{
"epoch": 0.3723502304147465,
"grad_norm": 0.29472067952156067,
"learning_rate": 2.9947428819963526e-05,
"loss": 1.6957,
"step": 202
},
{
"epoch": 0.3741935483870968,
"grad_norm": 0.2893930673599243,
"learning_rate": 2.994469997741171e-05,
"loss": 1.6434,
"step": 203
},
{
"epoch": 0.376036866359447,
"grad_norm": 0.36216485500335693,
"learning_rate": 2.994190222234061e-05,
"loss": 1.6897,
"step": 204
},
{
"epoch": 0.3778801843317972,
"grad_norm": 0.2846887409687042,
"learning_rate": 2.9939035567651146e-05,
"loss": 1.6727,
"step": 205
},
{
"epoch": 0.3797235023041475,
"grad_norm": 0.34425318241119385,
"learning_rate": 2.9936100026561933e-05,
"loss": 1.6824,
"step": 206
},
{
"epoch": 0.3815668202764977,
"grad_norm": 0.31369149684906006,
"learning_rate": 2.9933095612609253e-05,
"loss": 1.6703,
"step": 207
},
{
"epoch": 0.38341013824884795,
"grad_norm": 0.31020811200141907,
"learning_rate": 2.993002233964696e-05,
"loss": 1.7284,
"step": 208
},
{
"epoch": 0.38525345622119817,
"grad_norm": 0.3632746934890747,
"learning_rate": 2.9926880221846435e-05,
"loss": 1.6617,
"step": 209
},
{
"epoch": 0.3870967741935484,
"grad_norm": 0.2814621329307556,
"learning_rate": 2.9923669273696506e-05,
"loss": 1.5947,
"step": 210
},
{
"epoch": 0.38894009216589864,
"grad_norm": 0.32484638690948486,
"learning_rate": 2.9920389510003395e-05,
"loss": 1.6403,
"step": 211
},
{
"epoch": 0.39078341013824885,
"grad_norm": 0.31668680906295776,
"learning_rate": 2.9917040945890638e-05,
"loss": 1.7241,
"step": 212
},
{
"epoch": 0.39262672811059907,
"grad_norm": 0.29519209265708923,
"learning_rate": 2.9913623596799032e-05,
"loss": 1.6997,
"step": 213
},
{
"epoch": 0.39447004608294933,
"grad_norm": 0.3414634168148041,
"learning_rate": 2.9910137478486545e-05,
"loss": 1.6451,
"step": 214
},
{
"epoch": 0.39631336405529954,
"grad_norm": 0.2943251132965088,
"learning_rate": 2.990658260702826e-05,
"loss": 1.6784,
"step": 215
},
{
"epoch": 0.39815668202764976,
"grad_norm": 0.3138771653175354,
"learning_rate": 2.9902958998816274e-05,
"loss": 1.7088,
"step": 216
},
{
"epoch": 0.4,
"grad_norm": 0.296895831823349,
"learning_rate": 2.989926667055966e-05,
"loss": 1.6855,
"step": 217
},
{
"epoch": 0.40184331797235023,
"grad_norm": 0.29469338059425354,
"learning_rate": 2.989550563928436e-05,
"loss": 1.6437,
"step": 218
},
{
"epoch": 0.40368663594470044,
"grad_norm": 0.3117813169956207,
"learning_rate": 2.9891675922333125e-05,
"loss": 1.6708,
"step": 219
},
{
"epoch": 0.4055299539170507,
"grad_norm": 0.2911444902420044,
"learning_rate": 2.9887777537365416e-05,
"loss": 1.6655,
"step": 220
},
{
"epoch": 0.4073732718894009,
"grad_norm": 0.29274192452430725,
"learning_rate": 2.9883810502357346e-05,
"loss": 1.6737,
"step": 221
},
{
"epoch": 0.40921658986175113,
"grad_norm": 0.325166255235672,
"learning_rate": 2.9879774835601574e-05,
"loss": 1.6562,
"step": 222
},
{
"epoch": 0.4110599078341014,
"grad_norm": 0.3723132312297821,
"learning_rate": 2.987567055570724e-05,
"loss": 1.696,
"step": 223
},
{
"epoch": 0.4129032258064516,
"grad_norm": 0.2948567867279053,
"learning_rate": 2.987149768159987e-05,
"loss": 1.5771,
"step": 224
},
{
"epoch": 0.4147465437788018,
"grad_norm": 0.3130621314048767,
"learning_rate": 2.986725623252128e-05,
"loss": 1.7274,
"step": 225
},
{
"epoch": 0.4165898617511521,
"grad_norm": 0.283635675907135,
"learning_rate": 2.9862946228029507e-05,
"loss": 1.6277,
"step": 226
},
{
"epoch": 0.4184331797235023,
"grad_norm": 0.2891738712787628,
"learning_rate": 2.9858567687998702e-05,
"loss": 1.6161,
"step": 227
},
{
"epoch": 0.4202764976958525,
"grad_norm": 0.3050073981285095,
"learning_rate": 2.9854120632619053e-05,
"loss": 1.6358,
"step": 228
},
{
"epoch": 0.4221198156682028,
"grad_norm": 0.29393428564071655,
"learning_rate": 2.9849605082396678e-05,
"loss": 1.7176,
"step": 229
},
{
"epoch": 0.423963133640553,
"grad_norm": 0.29933053255081177,
"learning_rate": 2.9845021058153532e-05,
"loss": 1.6292,
"step": 230
},
{
"epoch": 0.4258064516129032,
"grad_norm": 0.2925868034362793,
"learning_rate": 2.984036858102732e-05,
"loss": 1.6453,
"step": 231
},
{
"epoch": 0.42764976958525347,
"grad_norm": 0.30412405729293823,
"learning_rate": 2.98356476724714e-05,
"loss": 1.7311,
"step": 232
},
{
"epoch": 0.4294930875576037,
"grad_norm": 0.29768475890159607,
"learning_rate": 2.9830858354254672e-05,
"loss": 1.632,
"step": 233
},
{
"epoch": 0.4313364055299539,
"grad_norm": 0.3007776141166687,
"learning_rate": 2.9826000648461484e-05,
"loss": 1.6307,
"step": 234
},
{
"epoch": 0.43317972350230416,
"grad_norm": 0.32172518968582153,
"learning_rate": 2.982107457749153e-05,
"loss": 1.6314,
"step": 235
},
{
"epoch": 0.43502304147465437,
"grad_norm": 0.28600960969924927,
"learning_rate": 2.9816080164059758e-05,
"loss": 1.6417,
"step": 236
},
{
"epoch": 0.4368663594470046,
"grad_norm": 0.2792605757713318,
"learning_rate": 2.981101743119624e-05,
"loss": 1.5736,
"step": 237
},
{
"epoch": 0.43870967741935485,
"grad_norm": 0.3138410747051239,
"learning_rate": 2.9805886402246084e-05,
"loss": 1.6921,
"step": 238
},
{
"epoch": 0.44055299539170506,
"grad_norm": 0.2832198739051819,
"learning_rate": 2.9800687100869334e-05,
"loss": 1.642,
"step": 239
},
{
"epoch": 0.4423963133640553,
"grad_norm": 0.29424023628234863,
"learning_rate": 2.9795419551040836e-05,
"loss": 1.6786,
"step": 240
},
{
"epoch": 0.44423963133640554,
"grad_norm": 0.30614927411079407,
"learning_rate": 2.9790083777050148e-05,
"loss": 1.6565,
"step": 241
},
{
"epoch": 0.44608294930875575,
"grad_norm": 0.29164189100265503,
"learning_rate": 2.9784679803501416e-05,
"loss": 1.7311,
"step": 242
},
{
"epoch": 0.447926267281106,
"grad_norm": 0.29889318346977234,
"learning_rate": 2.977920765531327e-05,
"loss": 1.6551,
"step": 243
},
{
"epoch": 0.4497695852534562,
"grad_norm": 0.2796136438846588,
"learning_rate": 2.9773667357718706e-05,
"loss": 1.6495,
"step": 244
},
{
"epoch": 0.45161290322580644,
"grad_norm": 0.3036425709724426,
"learning_rate": 2.9768058936264967e-05,
"loss": 1.6847,
"step": 245
},
{
"epoch": 0.4534562211981567,
"grad_norm": 0.2765255868434906,
"learning_rate": 2.976238241681342e-05,
"loss": 1.642,
"step": 246
},
{
"epoch": 0.4552995391705069,
"grad_norm": 0.28592586517333984,
"learning_rate": 2.9756637825539453e-05,
"loss": 1.5912,
"step": 247
},
{
"epoch": 0.45714285714285713,
"grad_norm": 0.29503703117370605,
"learning_rate": 2.9750825188932334e-05,
"loss": 1.6017,
"step": 248
},
{
"epoch": 0.4589861751152074,
"grad_norm": 0.2970612645149231,
"learning_rate": 2.9744944533795112e-05,
"loss": 1.6603,
"step": 249
},
{
"epoch": 0.4608294930875576,
"grad_norm": 0.28196001052856445,
"learning_rate": 2.973899588724448e-05,
"loss": 1.6234,
"step": 250
},
{
"epoch": 0.4626728110599078,
"grad_norm": 0.28284621238708496,
"learning_rate": 2.973297927671063e-05,
"loss": 1.6411,
"step": 251
},
{
"epoch": 0.4645161290322581,
"grad_norm": 0.28718075156211853,
"learning_rate": 2.9726894729937177e-05,
"loss": 1.6307,
"step": 252
},
{
"epoch": 0.4663594470046083,
"grad_norm": 0.28701338171958923,
"learning_rate": 2.972074227498098e-05,
"loss": 1.6697,
"step": 253
},
{
"epoch": 0.4682027649769585,
"grad_norm": 0.28013625741004944,
"learning_rate": 2.971452194021204e-05,
"loss": 1.6441,
"step": 254
},
{
"epoch": 0.4700460829493088,
"grad_norm": 0.28640949726104736,
"learning_rate": 2.9708233754313365e-05,
"loss": 1.6774,
"step": 255
},
{
"epoch": 0.471889400921659,
"grad_norm": 0.2968595027923584,
"learning_rate": 2.9701877746280843e-05,
"loss": 1.693,
"step": 256
},
{
"epoch": 0.4737327188940092,
"grad_norm": 0.28468331694602966,
"learning_rate": 2.9695453945423087e-05,
"loss": 1.6944,
"step": 257
},
{
"epoch": 0.47557603686635946,
"grad_norm": 0.31435340642929077,
"learning_rate": 2.9688962381361317e-05,
"loss": 1.6628,
"step": 258
},
{
"epoch": 0.4774193548387097,
"grad_norm": 0.2782823443412781,
"learning_rate": 2.968240308402923e-05,
"loss": 1.6312,
"step": 259
},
{
"epoch": 0.4792626728110599,
"grad_norm": 0.288622111082077,
"learning_rate": 2.967577608367285e-05,
"loss": 1.6166,
"step": 260
},
{
"epoch": 0.48110599078341015,
"grad_norm": 0.2862098217010498,
"learning_rate": 2.9669081410850378e-05,
"loss": 1.5918,
"step": 261
},
{
"epoch": 0.48294930875576036,
"grad_norm": 0.2974812686443329,
"learning_rate": 2.966231909643208e-05,
"loss": 1.6475,
"step": 262
},
{
"epoch": 0.4847926267281106,
"grad_norm": 0.31905004382133484,
"learning_rate": 2.9655489171600118e-05,
"loss": 1.6218,
"step": 263
},
{
"epoch": 0.48663594470046084,
"grad_norm": 0.29999393224716187,
"learning_rate": 2.9648591667848428e-05,
"loss": 1.7007,
"step": 264
},
{
"epoch": 0.48847926267281105,
"grad_norm": 0.31066837906837463,
"learning_rate": 2.9641626616982555e-05,
"loss": 1.6758,
"step": 265
},
{
"epoch": 0.49032258064516127,
"grad_norm": 0.30834177136421204,
"learning_rate": 2.9634594051119515e-05,
"loss": 1.6889,
"step": 266
},
{
"epoch": 0.49216589861751153,
"grad_norm": 0.29685091972351074,
"learning_rate": 2.9627494002687653e-05,
"loss": 1.7099,
"step": 267
},
{
"epoch": 0.49400921658986174,
"grad_norm": 0.3066437244415283,
"learning_rate": 2.9620326504426476e-05,
"loss": 1.6494,
"step": 268
},
{
"epoch": 0.49585253456221196,
"grad_norm": 0.28298285603523254,
"learning_rate": 2.9613091589386526e-05,
"loss": 1.6435,
"step": 269
},
{
"epoch": 0.4976958525345622,
"grad_norm": 0.2950513958930969,
"learning_rate": 2.9605789290929214e-05,
"loss": 1.6588,
"step": 270
},
{
"epoch": 0.49953917050691243,
"grad_norm": 0.2809874713420868,
"learning_rate": 2.9598419642726655e-05,
"loss": 1.6463,
"step": 271
},
{
"epoch": 0.5013824884792627,
"grad_norm": 0.29350385069847107,
"learning_rate": 2.9590982678761544e-05,
"loss": 1.6022,
"step": 272
},
{
"epoch": 0.5032258064516129,
"grad_norm": 0.28711917996406555,
"learning_rate": 2.958347843332696e-05,
"loss": 1.6602,
"step": 273
},
{
"epoch": 0.5050691244239631,
"grad_norm": 0.2757432162761688,
"learning_rate": 2.957590694102624e-05,
"loss": 1.6223,
"step": 274
},
{
"epoch": 0.5069124423963134,
"grad_norm": 0.27851778268814087,
"learning_rate": 2.9568268236772816e-05,
"loss": 1.6716,
"step": 275
},
{
"epoch": 0.5087557603686635,
"grad_norm": 0.28100845217704773,
"learning_rate": 2.956056235579002e-05,
"loss": 1.6326,
"step": 276
},
{
"epoch": 0.5105990783410138,
"grad_norm": 0.2892681360244751,
"learning_rate": 2.955278933361097e-05,
"loss": 1.6584,
"step": 277
},
{
"epoch": 0.5124423963133641,
"grad_norm": 0.27751055359840393,
"learning_rate": 2.9544949206078372e-05,
"loss": 1.6457,
"step": 278
},
{
"epoch": 0.5142857142857142,
"grad_norm": 0.26928141713142395,
"learning_rate": 2.9537042009344376e-05,
"loss": 1.6027,
"step": 279
},
{
"epoch": 0.5161290322580645,
"grad_norm": 0.26717764139175415,
"learning_rate": 2.9529067779870385e-05,
"loss": 1.6157,
"step": 280
},
{
"epoch": 0.5179723502304148,
"grad_norm": 0.26703840494155884,
"learning_rate": 2.952102655442692e-05,
"loss": 1.6148,
"step": 281
},
{
"epoch": 0.5198156682027649,
"grad_norm": 0.2838272452354431,
"learning_rate": 2.9512918370093407e-05,
"loss": 1.6785,
"step": 282
},
{
"epoch": 0.5216589861751152,
"grad_norm": 0.281730592250824,
"learning_rate": 2.950474326425805e-05,
"loss": 1.5828,
"step": 283
},
{
"epoch": 0.5235023041474655,
"grad_norm": 0.359958678483963,
"learning_rate": 2.949650127461764e-05,
"loss": 1.6606,
"step": 284
},
{
"epoch": 0.5253456221198156,
"grad_norm": 0.276723712682724,
"learning_rate": 2.948819243917737e-05,
"loss": 1.7019,
"step": 285
},
{
"epoch": 0.5271889400921659,
"grad_norm": 0.28240787982940674,
"learning_rate": 2.947981679625067e-05,
"loss": 1.7214,
"step": 286
},
{
"epoch": 0.5290322580645161,
"grad_norm": 0.278328001499176,
"learning_rate": 2.947137438445904e-05,
"loss": 1.6599,
"step": 287
},
{
"epoch": 0.5308755760368664,
"grad_norm": 0.2821551561355591,
"learning_rate": 2.9462865242731856e-05,
"loss": 1.6602,
"step": 288
},
{
"epoch": 0.5327188940092166,
"grad_norm": 0.2736065685749054,
"learning_rate": 2.9454289410306202e-05,
"loss": 1.5909,
"step": 289
},
{
"epoch": 0.5345622119815668,
"grad_norm": 0.32163509726524353,
"learning_rate": 2.944564692672667e-05,
"loss": 1.7039,
"step": 290
},
{
"epoch": 0.5364055299539171,
"grad_norm": 0.2821354568004608,
"learning_rate": 2.9436937831845217e-05,
"loss": 1.6789,
"step": 291
},
{
"epoch": 0.5382488479262673,
"grad_norm": 0.30141276121139526,
"learning_rate": 2.942816216582093e-05,
"loss": 1.6341,
"step": 292
},
{
"epoch": 0.5400921658986175,
"grad_norm": 0.2816147208213806,
"learning_rate": 2.9419319969119875e-05,
"loss": 1.5926,
"step": 293
},
{
"epoch": 0.5419354838709678,
"grad_norm": 0.2912384569644928,
"learning_rate": 2.9410411282514913e-05,
"loss": 1.6507,
"step": 294
},
{
"epoch": 0.543778801843318,
"grad_norm": 0.3174484670162201,
"learning_rate": 2.940143614708549e-05,
"loss": 1.6504,
"step": 295
},
{
"epoch": 0.5456221198156682,
"grad_norm": 0.2888404130935669,
"learning_rate": 2.939239460421746e-05,
"loss": 1.6762,
"step": 296
},
{
"epoch": 0.5474654377880185,
"grad_norm": 0.31422436237335205,
"learning_rate": 2.93832866956029e-05,
"loss": 1.6301,
"step": 297
},
{
"epoch": 0.5493087557603686,
"grad_norm": 0.3254394829273224,
"learning_rate": 2.9374112463239896e-05,
"loss": 1.7101,
"step": 298
},
{
"epoch": 0.5511520737327189,
"grad_norm": 0.3167421817779541,
"learning_rate": 2.9364871949432378e-05,
"loss": 1.6871,
"step": 299
},
{
"epoch": 0.5529953917050692,
"grad_norm": 0.3247944712638855,
"learning_rate": 2.9355565196789906e-05,
"loss": 1.7028,
"step": 300
},
{
"epoch": 0.5548387096774193,
"grad_norm": 0.27095088362693787,
"learning_rate": 2.9346192248227476e-05,
"loss": 1.6229,
"step": 301
},
{
"epoch": 0.5566820276497696,
"grad_norm": 0.31862378120422363,
"learning_rate": 2.9336753146965327e-05,
"loss": 1.666,
"step": 302
},
{
"epoch": 0.5585253456221199,
"grad_norm": 0.28205209970474243,
"learning_rate": 2.9327247936528742e-05,
"loss": 1.5925,
"step": 303
},
{
"epoch": 0.56036866359447,
"grad_norm": 0.29554882645606995,
"learning_rate": 2.9317676660747837e-05,
"loss": 1.6605,
"step": 304
},
{
"epoch": 0.5622119815668203,
"grad_norm": 0.2860583961009979,
"learning_rate": 2.9308039363757372e-05,
"loss": 1.6371,
"step": 305
},
{
"epoch": 0.5640552995391706,
"grad_norm": 0.27211877703666687,
"learning_rate": 2.9298336089996538e-05,
"loss": 1.7176,
"step": 306
},
{
"epoch": 0.5658986175115207,
"grad_norm": 0.3138637840747833,
"learning_rate": 2.9288566884208766e-05,
"loss": 1.6378,
"step": 307
},
{
"epoch": 0.567741935483871,
"grad_norm": 0.2751595973968506,
"learning_rate": 2.9278731791441497e-05,
"loss": 1.6313,
"step": 308
},
{
"epoch": 0.5695852534562212,
"grad_norm": 0.28140899538993835,
"learning_rate": 2.9268830857045997e-05,
"loss": 1.6114,
"step": 309
},
{
"epoch": 0.5714285714285714,
"grad_norm": 0.2734344005584717,
"learning_rate": 2.9258864126677132e-05,
"loss": 1.6438,
"step": 310
},
{
"epoch": 0.5732718894009217,
"grad_norm": 0.30163639783859253,
"learning_rate": 2.9248831646293174e-05,
"loss": 1.6521,
"step": 311
},
{
"epoch": 0.5751152073732719,
"grad_norm": 0.28159695863723755,
"learning_rate": 2.9238733462155564e-05,
"loss": 1.6399,
"step": 312
},
{
"epoch": 0.5769585253456221,
"grad_norm": 0.2891719341278076,
"learning_rate": 2.9228569620828735e-05,
"loss": 1.6316,
"step": 313
},
{
"epoch": 0.5788018433179724,
"grad_norm": 0.27711349725723267,
"learning_rate": 2.921834016917986e-05,
"loss": 1.5787,
"step": 314
},
{
"epoch": 0.5806451612903226,
"grad_norm": 0.2825881540775299,
"learning_rate": 2.920804515437865e-05,
"loss": 1.6223,
"step": 315
},
{
"epoch": 0.5824884792626728,
"grad_norm": 0.2809242010116577,
"learning_rate": 2.9197684623897157e-05,
"loss": 1.6368,
"step": 316
},
{
"epoch": 0.584331797235023,
"grad_norm": 0.2902085781097412,
"learning_rate": 2.9187258625509518e-05,
"loss": 1.6855,
"step": 317
},
{
"epoch": 0.5861751152073733,
"grad_norm": 0.2923787236213684,
"learning_rate": 2.917676720729177e-05,
"loss": 1.6448,
"step": 318
},
{
"epoch": 0.5880184331797235,
"grad_norm": 0.2834003269672394,
"learning_rate": 2.916621041762159e-05,
"loss": 1.6295,
"step": 319
},
{
"epoch": 0.5898617511520737,
"grad_norm": 0.2824580669403076,
"learning_rate": 2.9155588305178113e-05,
"loss": 1.5738,
"step": 320
},
{
"epoch": 0.591705069124424,
"grad_norm": 0.30301326513290405,
"learning_rate": 2.9144900918941687e-05,
"loss": 1.6247,
"step": 321
},
{
"epoch": 0.5935483870967742,
"grad_norm": 0.2766891121864319,
"learning_rate": 2.9134148308193637e-05,
"loss": 1.7135,
"step": 322
},
{
"epoch": 0.5953917050691244,
"grad_norm": 0.2816697061061859,
"learning_rate": 2.9123330522516053e-05,
"loss": 1.6522,
"step": 323
},
{
"epoch": 0.5972350230414747,
"grad_norm": 0.28478461503982544,
"learning_rate": 2.9112447611791563e-05,
"loss": 1.6347,
"step": 324
},
{
"epoch": 0.5990783410138248,
"grad_norm": 0.27743953466415405,
"learning_rate": 2.9101499626203102e-05,
"loss": 1.6071,
"step": 325
},
{
"epoch": 0.6009216589861751,
"grad_norm": 0.27698153257369995,
"learning_rate": 2.9090486616233654e-05,
"loss": 1.6191,
"step": 326
},
{
"epoch": 0.6027649769585254,
"grad_norm": 0.2867109477519989,
"learning_rate": 2.907940863266607e-05,
"loss": 1.6427,
"step": 327
},
{
"epoch": 0.6046082949308755,
"grad_norm": 0.26966315507888794,
"learning_rate": 2.906826572658278e-05,
"loss": 1.5825,
"step": 328
},
{
"epoch": 0.6064516129032258,
"grad_norm": 0.2749760150909424,
"learning_rate": 2.9057057949365602e-05,
"loss": 1.6189,
"step": 329
},
{
"epoch": 0.6082949308755761,
"grad_norm": 0.30331194400787354,
"learning_rate": 2.904578535269547e-05,
"loss": 1.6485,
"step": 330
},
{
"epoch": 0.6101382488479262,
"grad_norm": 0.2790120244026184,
"learning_rate": 2.9034447988552227e-05,
"loss": 1.6874,
"step": 331
},
{
"epoch": 0.6119815668202765,
"grad_norm": 0.2863958477973938,
"learning_rate": 2.902304590921435e-05,
"loss": 1.6805,
"step": 332
},
{
"epoch": 0.6138248847926268,
"grad_norm": 0.28442642092704773,
"learning_rate": 2.9011579167258756e-05,
"loss": 1.6611,
"step": 333
},
{
"epoch": 0.6156682027649769,
"grad_norm": 0.27127203345298767,
"learning_rate": 2.90000478155605e-05,
"loss": 1.5686,
"step": 334
},
{
"epoch": 0.6175115207373272,
"grad_norm": 0.28976717591285706,
"learning_rate": 2.8988451907292594e-05,
"loss": 1.6636,
"step": 335
},
{
"epoch": 0.6193548387096774,
"grad_norm": 0.2731335461139679,
"learning_rate": 2.8976791495925704e-05,
"loss": 1.7131,
"step": 336
},
{
"epoch": 0.6211981566820276,
"grad_norm": 0.2786687910556793,
"learning_rate": 2.896506663522795e-05,
"loss": 1.6664,
"step": 337
},
{
"epoch": 0.6230414746543779,
"grad_norm": 0.2858924865722656,
"learning_rate": 2.8953277379264633e-05,
"loss": 1.6567,
"step": 338
},
{
"epoch": 0.6248847926267281,
"grad_norm": 0.2715083956718445,
"learning_rate": 2.8941423782397987e-05,
"loss": 1.6504,
"step": 339
},
{
"epoch": 0.6267281105990783,
"grad_norm": 0.2730218470096588,
"learning_rate": 2.892950589928694e-05,
"loss": 1.6381,
"step": 340
},
{
"epoch": 0.6285714285714286,
"grad_norm": 0.2796657383441925,
"learning_rate": 2.8917523784886846e-05,
"loss": 1.6845,
"step": 341
},
{
"epoch": 0.6304147465437788,
"grad_norm": 0.28790879249572754,
"learning_rate": 2.890547749444925e-05,
"loss": 1.6751,
"step": 342
},
{
"epoch": 0.632258064516129,
"grad_norm": 0.27353277802467346,
"learning_rate": 2.8893367083521616e-05,
"loss": 1.6247,
"step": 343
},
{
"epoch": 0.6341013824884792,
"grad_norm": 0.2717505395412445,
"learning_rate": 2.888119260794708e-05,
"loss": 1.6086,
"step": 344
},
{
"epoch": 0.6359447004608295,
"grad_norm": 0.27940690517425537,
"learning_rate": 2.8868954123864194e-05,
"loss": 1.653,
"step": 345
},
{
"epoch": 0.6377880184331797,
"grad_norm": 0.265103280544281,
"learning_rate": 2.885665168770666e-05,
"loss": 1.6432,
"step": 346
},
{
"epoch": 0.6396313364055299,
"grad_norm": 0.26981207728385925,
"learning_rate": 2.8844285356203074e-05,
"loss": 1.6346,
"step": 347
},
{
"epoch": 0.6414746543778802,
"grad_norm": 0.2731129229068756,
"learning_rate": 2.8831855186376672e-05,
"loss": 1.6907,
"step": 348
},
{
"epoch": 0.6433179723502304,
"grad_norm": 0.2778747081756592,
"learning_rate": 2.8819361235545047e-05,
"loss": 1.699,
"step": 349
},
{
"epoch": 0.6451612903225806,
"grad_norm": 0.27246907353401184,
"learning_rate": 2.8806803561319903e-05,
"loss": 1.6464,
"step": 350
},
{
"epoch": 0.6470046082949309,
"grad_norm": 0.2664584517478943,
"learning_rate": 2.8794182221606784e-05,
"loss": 1.5384,
"step": 351
},
{
"epoch": 0.6488479262672812,
"grad_norm": 0.2673085033893585,
"learning_rate": 2.878149727460481e-05,
"loss": 1.571,
"step": 352
},
{
"epoch": 0.6506912442396313,
"grad_norm": 0.28247496485710144,
"learning_rate": 2.876874877880639e-05,
"loss": 1.5831,
"step": 353
},
{
"epoch": 0.6525345622119816,
"grad_norm": 0.2843359112739563,
"learning_rate": 2.8755936792996987e-05,
"loss": 1.6923,
"step": 354
},
{
"epoch": 0.6543778801843319,
"grad_norm": 0.27128326892852783,
"learning_rate": 2.8743061376254813e-05,
"loss": 1.6356,
"step": 355
},
{
"epoch": 0.656221198156682,
"grad_norm": 0.2898774743080139,
"learning_rate": 2.873012258795057e-05,
"loss": 1.6479,
"step": 356
},
{
"epoch": 0.6580645161290323,
"grad_norm": 0.28426289558410645,
"learning_rate": 2.8717120487747193e-05,
"loss": 1.629,
"step": 357
},
{
"epoch": 0.6599078341013825,
"grad_norm": 0.2633942663669586,
"learning_rate": 2.870405513559954e-05,
"loss": 1.5984,
"step": 358
},
{
"epoch": 0.6617511520737327,
"grad_norm": 0.31980207562446594,
"learning_rate": 2.8690926591754142e-05,
"loss": 1.5927,
"step": 359
},
{
"epoch": 0.663594470046083,
"grad_norm": 0.2780967056751251,
"learning_rate": 2.8677734916748927e-05,
"loss": 1.6457,
"step": 360
},
{
"epoch": 0.6654377880184332,
"grad_norm": 0.3023218810558319,
"learning_rate": 2.866448017141291e-05,
"loss": 1.6161,
"step": 361
},
{
"epoch": 0.6672811059907834,
"grad_norm": 0.2830914556980133,
"learning_rate": 2.865116241686595e-05,
"loss": 1.6897,
"step": 362
},
{
"epoch": 0.6691244239631337,
"grad_norm": 0.29188162088394165,
"learning_rate": 2.863778171451845e-05,
"loss": 1.6293,
"step": 363
},
{
"epoch": 0.6709677419354839,
"grad_norm": 0.3087140619754791,
"learning_rate": 2.8624338126071073e-05,
"loss": 1.6143,
"step": 364
},
{
"epoch": 0.6728110599078341,
"grad_norm": 0.26930543780326843,
"learning_rate": 2.861083171351446e-05,
"loss": 1.5878,
"step": 365
},
{
"epoch": 0.6746543778801843,
"grad_norm": 0.30629977583885193,
"learning_rate": 2.8597262539128947e-05,
"loss": 1.6939,
"step": 366
},
{
"epoch": 0.6764976958525346,
"grad_norm": 0.2783128321170807,
"learning_rate": 2.858363066548427e-05,
"loss": 1.66,
"step": 367
},
{
"epoch": 0.6783410138248848,
"grad_norm": 0.27261894941329956,
"learning_rate": 2.856993615543929e-05,
"loss": 1.6183,
"step": 368
},
{
"epoch": 0.680184331797235,
"grad_norm": 0.29020169377326965,
"learning_rate": 2.8556179072141693e-05,
"loss": 1.6187,
"step": 369
},
{
"epoch": 0.6820276497695853,
"grad_norm": 0.2697452902793884,
"learning_rate": 2.8542359479027693e-05,
"loss": 1.5613,
"step": 370
},
{
"epoch": 0.6838709677419355,
"grad_norm": 0.27904996275901794,
"learning_rate": 2.8528477439821753e-05,
"loss": 1.6317,
"step": 371
},
{
"epoch": 0.6857142857142857,
"grad_norm": 0.2748742997646332,
"learning_rate": 2.8514533018536286e-05,
"loss": 1.6531,
"step": 372
},
{
"epoch": 0.687557603686636,
"grad_norm": 0.30121949315071106,
"learning_rate": 2.8500526279471362e-05,
"loss": 1.6045,
"step": 373
},
{
"epoch": 0.6894009216589861,
"grad_norm": 0.26736557483673096,
"learning_rate": 2.8486457287214403e-05,
"loss": 1.6746,
"step": 374
},
{
"epoch": 0.6912442396313364,
"grad_norm": 0.2701917886734009,
"learning_rate": 2.8472326106639896e-05,
"loss": 1.6882,
"step": 375
},
{
"epoch": 0.6930875576036867,
"grad_norm": 0.2666465640068054,
"learning_rate": 2.8458132802909075e-05,
"loss": 1.5905,
"step": 376
},
{
"epoch": 0.6949308755760368,
"grad_norm": 0.2787630259990692,
"learning_rate": 2.8443877441469653e-05,
"loss": 1.6351,
"step": 377
},
{
"epoch": 0.6967741935483871,
"grad_norm": 0.2640557289123535,
"learning_rate": 2.8429560088055502e-05,
"loss": 1.6291,
"step": 378
},
{
"epoch": 0.6986175115207374,
"grad_norm": 0.2768750488758087,
"learning_rate": 2.8415180808686326e-05,
"loss": 1.6113,
"step": 379
},
{
"epoch": 0.7004608294930875,
"grad_norm": 0.27341142296791077,
"learning_rate": 2.84007396696674e-05,
"loss": 1.6397,
"step": 380
},
{
"epoch": 0.7023041474654378,
"grad_norm": 0.2810218334197998,
"learning_rate": 2.8386236737589244e-05,
"loss": 1.6255,
"step": 381
},
{
"epoch": 0.7041474654377881,
"grad_norm": 0.2625837028026581,
"learning_rate": 2.8371672079327304e-05,
"loss": 1.5909,
"step": 382
},
{
"epoch": 0.7059907834101382,
"grad_norm": 0.28470954298973083,
"learning_rate": 2.835704576204167e-05,
"loss": 1.668,
"step": 383
},
{
"epoch": 0.7078341013824885,
"grad_norm": 0.2870181202888489,
"learning_rate": 2.8342357853176742e-05,
"loss": 1.655,
"step": 384
},
{
"epoch": 0.7096774193548387,
"grad_norm": 0.2737495005130768,
"learning_rate": 2.8327608420460933e-05,
"loss": 1.6606,
"step": 385
},
{
"epoch": 0.7115207373271889,
"grad_norm": 0.2716140151023865,
"learning_rate": 2.8312797531906346e-05,
"loss": 1.6487,
"step": 386
},
{
"epoch": 0.7133640552995392,
"grad_norm": 0.2937834560871124,
"learning_rate": 2.8297925255808484e-05,
"loss": 1.5784,
"step": 387
},
{
"epoch": 0.7152073732718894,
"grad_norm": 0.25758716464042664,
"learning_rate": 2.82829916607459e-05,
"loss": 1.5551,
"step": 388
},
{
"epoch": 0.7170506912442396,
"grad_norm": 0.26855534315109253,
"learning_rate": 2.826799681557991e-05,
"loss": 1.6242,
"step": 389
},
{
"epoch": 0.7188940092165899,
"grad_norm": 0.29118281602859497,
"learning_rate": 2.8252940789454268e-05,
"loss": 1.6404,
"step": 390
},
{
"epoch": 0.7207373271889401,
"grad_norm": 0.27645623683929443,
"learning_rate": 2.823782365179482e-05,
"loss": 1.7135,
"step": 391
},
{
"epoch": 0.7225806451612903,
"grad_norm": 0.2710598409175873,
"learning_rate": 2.822264547230924e-05,
"loss": 1.6472,
"step": 392
},
{
"epoch": 0.7244239631336405,
"grad_norm": 0.2802148163318634,
"learning_rate": 2.820740632098665e-05,
"loss": 1.5996,
"step": 393
},
{
"epoch": 0.7262672811059908,
"grad_norm": 0.28701063990592957,
"learning_rate": 2.8192106268097336e-05,
"loss": 1.6192,
"step": 394
},
{
"epoch": 0.728110599078341,
"grad_norm": 0.28330931067466736,
"learning_rate": 2.8176745384192417e-05,
"loss": 1.6183,
"step": 395
},
{
"epoch": 0.7299539170506912,
"grad_norm": 0.26512524485588074,
"learning_rate": 2.8161323740103495e-05,
"loss": 1.6092,
"step": 396
},
{
"epoch": 0.7317972350230415,
"grad_norm": 0.27630165219306946,
"learning_rate": 2.814584140694237e-05,
"loss": 1.6938,
"step": 397
},
{
"epoch": 0.7336405529953917,
"grad_norm": 0.2831325829029083,
"learning_rate": 2.8130298456100667e-05,
"loss": 1.6665,
"step": 398
},
{
"epoch": 0.7354838709677419,
"grad_norm": 0.2725171744823456,
"learning_rate": 2.811469495924955e-05,
"loss": 1.6464,
"step": 399
},
{
"epoch": 0.7373271889400922,
"grad_norm": 0.2628806233406067,
"learning_rate": 2.8099030988339353e-05,
"loss": 1.5455,
"step": 400
},
{
"epoch": 0.7391705069124423,
"grad_norm": 0.26690706610679626,
"learning_rate": 2.8083306615599283e-05,
"loss": 1.6348,
"step": 401
},
{
"epoch": 0.7410138248847926,
"grad_norm": 0.2794962227344513,
"learning_rate": 2.8067521913537047e-05,
"loss": 1.6365,
"step": 402
},
{
"epoch": 0.7428571428571429,
"grad_norm": 0.2658675014972687,
"learning_rate": 2.8051676954938574e-05,
"loss": 1.6348,
"step": 403
},
{
"epoch": 0.744700460829493,
"grad_norm": 0.27006804943084717,
"learning_rate": 2.8035771812867613e-05,
"loss": 1.6384,
"step": 404
},
{
"epoch": 0.7465437788018433,
"grad_norm": 0.2776554822921753,
"learning_rate": 2.801980656066545e-05,
"loss": 1.6978,
"step": 405
},
{
"epoch": 0.7483870967741936,
"grad_norm": 0.2728254497051239,
"learning_rate": 2.8003781271950535e-05,
"loss": 1.6924,
"step": 406
},
{
"epoch": 0.7502304147465437,
"grad_norm": 0.2657453715801239,
"learning_rate": 2.7987696020618163e-05,
"loss": 1.5734,
"step": 407
},
{
"epoch": 0.752073732718894,
"grad_norm": 0.26352396607398987,
"learning_rate": 2.7971550880840138e-05,
"loss": 1.6261,
"step": 408
},
{
"epoch": 0.7539170506912443,
"grad_norm": 0.2759632468223572,
"learning_rate": 2.79553459270644e-05,
"loss": 1.6264,
"step": 409
},
{
"epoch": 0.7557603686635944,
"grad_norm": 0.27137723565101624,
"learning_rate": 2.7939081234014708e-05,
"loss": 1.6432,
"step": 410
},
{
"epoch": 0.7576036866359447,
"grad_norm": 0.26721593737602234,
"learning_rate": 2.7922756876690298e-05,
"loss": 1.6903,
"step": 411
},
{
"epoch": 0.759447004608295,
"grad_norm": 0.2769939601421356,
"learning_rate": 2.790637293036552e-05,
"loss": 1.6626,
"step": 412
},
{
"epoch": 0.7612903225806451,
"grad_norm": 0.2946414351463318,
"learning_rate": 2.7889929470589494e-05,
"loss": 1.6489,
"step": 413
},
{
"epoch": 0.7631336405529954,
"grad_norm": 0.27718386054039,
"learning_rate": 2.7873426573185777e-05,
"loss": 1.664,
"step": 414
},
{
"epoch": 0.7649769585253456,
"grad_norm": 0.2768406271934509,
"learning_rate": 2.7856864314251994e-05,
"loss": 1.6475,
"step": 415
},
{
"epoch": 0.7668202764976959,
"grad_norm": 0.2640882134437561,
"learning_rate": 2.78402427701595e-05,
"loss": 1.6332,
"step": 416
},
{
"epoch": 0.7686635944700461,
"grad_norm": 0.26694199442863464,
"learning_rate": 2.782356201755303e-05,
"loss": 1.6633,
"step": 417
},
{
"epoch": 0.7705069124423963,
"grad_norm": 0.26702558994293213,
"learning_rate": 2.780682213335033e-05,
"loss": 1.6281,
"step": 418
},
{
"epoch": 0.7723502304147466,
"grad_norm": 0.2785816490650177,
"learning_rate": 2.7790023194741812e-05,
"loss": 1.6733,
"step": 419
},
{
"epoch": 0.7741935483870968,
"grad_norm": 0.27653270959854126,
"learning_rate": 2.7773165279190206e-05,
"loss": 1.6269,
"step": 420
},
{
"epoch": 0.776036866359447,
"grad_norm": 0.27750319242477417,
"learning_rate": 2.7756248464430186e-05,
"loss": 1.6292,
"step": 421
},
{
"epoch": 0.7778801843317973,
"grad_norm": 0.2917342185974121,
"learning_rate": 2.7739272828468022e-05,
"loss": 1.6159,
"step": 422
},
{
"epoch": 0.7797235023041474,
"grad_norm": 0.26250651478767395,
"learning_rate": 2.7722238449581227e-05,
"loss": 1.6183,
"step": 423
},
{
"epoch": 0.7815668202764977,
"grad_norm": 0.28476646542549133,
"learning_rate": 2.7705145406318167e-05,
"loss": 1.7191,
"step": 424
},
{
"epoch": 0.783410138248848,
"grad_norm": 0.2587452232837677,
"learning_rate": 2.7687993777497747e-05,
"loss": 1.5733,
"step": 425
},
{
"epoch": 0.7852534562211981,
"grad_norm": 0.2674744427204132,
"learning_rate": 2.7670783642208996e-05,
"loss": 1.6225,
"step": 426
},
{
"epoch": 0.7870967741935484,
"grad_norm": 0.2689223289489746,
"learning_rate": 2.7653515079810744e-05,
"loss": 1.6964,
"step": 427
},
{
"epoch": 0.7889400921658987,
"grad_norm": 0.26800140738487244,
"learning_rate": 2.7636188169931217e-05,
"loss": 1.6728,
"step": 428
},
{
"epoch": 0.7907834101382488,
"grad_norm": 0.27116596698760986,
"learning_rate": 2.7618802992467718e-05,
"loss": 1.6971,
"step": 429
},
{
"epoch": 0.7926267281105991,
"grad_norm": 0.26951897144317627,
"learning_rate": 2.760135962758621e-05,
"loss": 1.6763,
"step": 430
},
{
"epoch": 0.7944700460829494,
"grad_norm": 0.26193779706954956,
"learning_rate": 2.7583858155720977e-05,
"loss": 1.6291,
"step": 431
},
{
"epoch": 0.7963133640552995,
"grad_norm": 0.27084240317344666,
"learning_rate": 2.756629865757424e-05,
"loss": 1.6718,
"step": 432
},
{
"epoch": 0.7981566820276498,
"grad_norm": 0.2781943380832672,
"learning_rate": 2.7548681214115798e-05,
"loss": 1.5723,
"step": 433
},
{
"epoch": 0.8,
"grad_norm": 0.27311035990715027,
"learning_rate": 2.7531005906582628e-05,
"loss": 1.6833,
"step": 434
},
{
"epoch": 0.8018433179723502,
"grad_norm": 0.27680864930152893,
"learning_rate": 2.7513272816478554e-05,
"loss": 1.6166,
"step": 435
},
{
"epoch": 0.8036866359447005,
"grad_norm": 0.26824140548706055,
"learning_rate": 2.7495482025573817e-05,
"loss": 1.6716,
"step": 436
},
{
"epoch": 0.8055299539170507,
"grad_norm": 0.27081307768821716,
"learning_rate": 2.7477633615904744e-05,
"loss": 1.6573,
"step": 437
},
{
"epoch": 0.8073732718894009,
"grad_norm": 0.2916286587715149,
"learning_rate": 2.7459727669773344e-05,
"loss": 1.6609,
"step": 438
},
{
"epoch": 0.8092165898617512,
"grad_norm": 0.2842768132686615,
"learning_rate": 2.7441764269746946e-05,
"loss": 1.6401,
"step": 439
},
{
"epoch": 0.8110599078341014,
"grad_norm": 0.2689710557460785,
"learning_rate": 2.7423743498657794e-05,
"loss": 1.6313,
"step": 440
},
{
"epoch": 0.8129032258064516,
"grad_norm": 0.28198468685150146,
"learning_rate": 2.7405665439602695e-05,
"loss": 1.729,
"step": 441
},
{
"epoch": 0.8147465437788018,
"grad_norm": 0.28444287180900574,
"learning_rate": 2.7387530175942604e-05,
"loss": 1.6436,
"step": 442
},
{
"epoch": 0.8165898617511521,
"grad_norm": 0.289588987827301,
"learning_rate": 2.7369337791302272e-05,
"loss": 1.6492,
"step": 443
},
{
"epoch": 0.8184331797235023,
"grad_norm": 0.2716604173183441,
"learning_rate": 2.7351088369569833e-05,
"loss": 1.6683,
"step": 444
},
{
"epoch": 0.8202764976958525,
"grad_norm": 0.2785453796386719,
"learning_rate": 2.7332781994896438e-05,
"loss": 1.6595,
"step": 445
},
{
"epoch": 0.8221198156682028,
"grad_norm": 0.26734933257102966,
"learning_rate": 2.7314418751695845e-05,
"loss": 1.6208,
"step": 446
},
{
"epoch": 0.823963133640553,
"grad_norm": 0.27588117122650146,
"learning_rate": 2.7295998724644058e-05,
"loss": 1.6085,
"step": 447
},
{
"epoch": 0.8258064516129032,
"grad_norm": 0.26697295904159546,
"learning_rate": 2.7277521998678904e-05,
"loss": 1.6348,
"step": 448
},
{
"epoch": 0.8276497695852535,
"grad_norm": 0.27423718571662903,
"learning_rate": 2.725898865899967e-05,
"loss": 1.6787,
"step": 449
},
{
"epoch": 0.8294930875576036,
"grad_norm": 0.26683908700942993,
"learning_rate": 2.72403987910667e-05,
"loss": 1.6271,
"step": 450
},
{
"epoch": 0.8313364055299539,
"grad_norm": 0.26200321316719055,
"learning_rate": 2.722175248060099e-05,
"loss": 1.6035,
"step": 451
},
{
"epoch": 0.8331797235023042,
"grad_norm": 0.2699339985847473,
"learning_rate": 2.7203049813583803e-05,
"loss": 1.5928,
"step": 452
},
{
"epoch": 0.8350230414746543,
"grad_norm": 0.27287527918815613,
"learning_rate": 2.7184290876256278e-05,
"loss": 1.6073,
"step": 453
},
{
"epoch": 0.8368663594470046,
"grad_norm": 0.2751379907131195,
"learning_rate": 2.716547575511903e-05,
"loss": 1.6385,
"step": 454
},
{
"epoch": 0.8387096774193549,
"grad_norm": 0.2756018340587616,
"learning_rate": 2.714660453693173e-05,
"loss": 1.6921,
"step": 455
},
{
"epoch": 0.840552995391705,
"grad_norm": 0.28198951482772827,
"learning_rate": 2.7127677308712733e-05,
"loss": 1.6651,
"step": 456
},
{
"epoch": 0.8423963133640553,
"grad_norm": 0.28402063250541687,
"learning_rate": 2.710869415773867e-05,
"loss": 1.5813,
"step": 457
},
{
"epoch": 0.8442396313364056,
"grad_norm": 0.29829660058021545,
"learning_rate": 2.7089655171544026e-05,
"loss": 1.6971,
"step": 458
},
{
"epoch": 0.8460829493087557,
"grad_norm": 0.2694368362426758,
"learning_rate": 2.707056043792077e-05,
"loss": 1.6268,
"step": 459
},
{
"epoch": 0.847926267281106,
"grad_norm": 0.2761029303073883,
"learning_rate": 2.705141004491792e-05,
"loss": 1.6883,
"step": 460
},
{
"epoch": 0.8497695852534562,
"grad_norm": 0.280799001455307,
"learning_rate": 2.703220408084115e-05,
"loss": 1.6409,
"step": 461
},
{
"epoch": 0.8516129032258064,
"grad_norm": 0.2578011453151703,
"learning_rate": 2.7012942634252384e-05,
"loss": 1.5454,
"step": 462
},
{
"epoch": 0.8534562211981567,
"grad_norm": 0.30007144808769226,
"learning_rate": 2.6993625793969383e-05,
"loss": 1.6845,
"step": 463
},
{
"epoch": 0.8552995391705069,
"grad_norm": 0.26995283365249634,
"learning_rate": 2.697425364906534e-05,
"loss": 1.6339,
"step": 464
},
{
"epoch": 0.8571428571428571,
"grad_norm": 0.3060062527656555,
"learning_rate": 2.6954826288868463e-05,
"loss": 1.6226,
"step": 465
},
{
"epoch": 0.8589861751152074,
"grad_norm": 0.27042827010154724,
"learning_rate": 2.693534380296158e-05,
"loss": 1.5902,
"step": 466
},
{
"epoch": 0.8608294930875576,
"grad_norm": 0.2701798975467682,
"learning_rate": 2.6915806281181688e-05,
"loss": 1.6444,
"step": 467
},
{
"epoch": 0.8626728110599078,
"grad_norm": 0.2839266061782837,
"learning_rate": 2.6896213813619592e-05,
"loss": 1.631,
"step": 468
},
{
"epoch": 0.864516129032258,
"grad_norm": 0.2714848816394806,
"learning_rate": 2.6876566490619437e-05,
"loss": 1.5984,
"step": 469
},
{
"epoch": 0.8663594470046083,
"grad_norm": 0.26444998383522034,
"learning_rate": 2.685686440277833e-05,
"loss": 1.6318,
"step": 470
},
{
"epoch": 0.8682027649769585,
"grad_norm": 0.28271374106407166,
"learning_rate": 2.6837107640945904e-05,
"loss": 1.6931,
"step": 471
},
{
"epoch": 0.8700460829493087,
"grad_norm": 0.26922810077667236,
"learning_rate": 2.681729629622391e-05,
"loss": 1.5986,
"step": 472
},
{
"epoch": 0.871889400921659,
"grad_norm": 0.2678123712539673,
"learning_rate": 2.6797430459965766e-05,
"loss": 1.6511,
"step": 473
},
{
"epoch": 0.8737327188940092,
"grad_norm": 0.2775745093822479,
"learning_rate": 2.6777510223776187e-05,
"loss": 1.6248,
"step": 474
},
{
"epoch": 0.8755760368663594,
"grad_norm": 0.2708311378955841,
"learning_rate": 2.6757535679510727e-05,
"loss": 1.6032,
"step": 475
},
{
"epoch": 0.8774193548387097,
"grad_norm": 0.27130743861198425,
"learning_rate": 2.6737506919275363e-05,
"loss": 1.6658,
"step": 476
},
{
"epoch": 0.8792626728110599,
"grad_norm": 0.32221710681915283,
"learning_rate": 2.6717424035426054e-05,
"loss": 1.6324,
"step": 477
},
{
"epoch": 0.8811059907834101,
"grad_norm": 0.27446186542510986,
"learning_rate": 2.6697287120568364e-05,
"loss": 1.6608,
"step": 478
},
{
"epoch": 0.8829493087557604,
"grad_norm": 0.2836889922618866,
"learning_rate": 2.6677096267556984e-05,
"loss": 1.5962,
"step": 479
},
{
"epoch": 0.8847926267281107,
"grad_norm": 0.29300132393836975,
"learning_rate": 2.6656851569495316e-05,
"loss": 1.6496,
"step": 480
},
{
"epoch": 0.8866359447004608,
"grad_norm": 0.2937772572040558,
"learning_rate": 2.6636553119735066e-05,
"loss": 1.6164,
"step": 481
},
{
"epoch": 0.8884792626728111,
"grad_norm": 0.30279168486595154,
"learning_rate": 2.6616201011875792e-05,
"loss": 1.6317,
"step": 482
},
{
"epoch": 0.8903225806451613,
"grad_norm": 0.2589039206504822,
"learning_rate": 2.6595795339764478e-05,
"loss": 1.6076,
"step": 483
},
{
"epoch": 0.8921658986175115,
"grad_norm": 0.29674026370048523,
"learning_rate": 2.6575336197495098e-05,
"loss": 1.6106,
"step": 484
},
{
"epoch": 0.8940092165898618,
"grad_norm": 0.2732203006744385,
"learning_rate": 2.6554823679408195e-05,
"loss": 1.6597,
"step": 485
},
{
"epoch": 0.895852534562212,
"grad_norm": 0.26109176874160767,
"learning_rate": 2.653425788009043e-05,
"loss": 1.5526,
"step": 486
},
{
"epoch": 0.8976958525345622,
"grad_norm": 0.3179691433906555,
"learning_rate": 2.6513638894374158e-05,
"loss": 1.687,
"step": 487
},
{
"epoch": 0.8995391705069125,
"grad_norm": 0.26771122217178345,
"learning_rate": 2.6492966817336977e-05,
"loss": 1.619,
"step": 488
},
{
"epoch": 0.9013824884792627,
"grad_norm": 0.2821449637413025,
"learning_rate": 2.6472241744301304e-05,
"loss": 1.5945,
"step": 489
},
{
"epoch": 0.9032258064516129,
"grad_norm": 0.28389427065849304,
"learning_rate": 2.645146377083393e-05,
"loss": 1.5934,
"step": 490
},
{
"epoch": 0.9050691244239631,
"grad_norm": 0.25905099511146545,
"learning_rate": 2.6430632992745577e-05,
"loss": 1.6376,
"step": 491
},
{
"epoch": 0.9069124423963134,
"grad_norm": 0.2909289598464966,
"learning_rate": 2.6409749506090456e-05,
"loss": 1.6398,
"step": 492
},
{
"epoch": 0.9087557603686636,
"grad_norm": 0.30152249336242676,
"learning_rate": 2.638881340716583e-05,
"loss": 1.7085,
"step": 493
},
{
"epoch": 0.9105990783410138,
"grad_norm": 0.2955312430858612,
"learning_rate": 2.6367824792511565e-05,
"loss": 1.663,
"step": 494
},
{
"epoch": 0.9124423963133641,
"grad_norm": 0.3120115101337433,
"learning_rate": 2.6346783758909683e-05,
"loss": 1.6809,
"step": 495
},
{
"epoch": 0.9142857142857143,
"grad_norm": 0.26377126574516296,
"learning_rate": 2.632569040338392e-05,
"loss": 1.6235,
"step": 496
},
{
"epoch": 0.9161290322580645,
"grad_norm": 0.2728709876537323,
"learning_rate": 2.6304544823199282e-05,
"loss": 1.6263,
"step": 497
},
{
"epoch": 0.9179723502304148,
"grad_norm": 0.28993189334869385,
"learning_rate": 2.6283347115861586e-05,
"loss": 1.6395,
"step": 498
},
{
"epoch": 0.919815668202765,
"grad_norm": 0.27762892842292786,
"learning_rate": 2.6262097379117015e-05,
"loss": 1.6613,
"step": 499
},
{
"epoch": 0.9216589861751152,
"grad_norm": 0.26995396614074707,
"learning_rate": 2.624079571095167e-05,
"loss": 1.6483,
"step": 500
},
{
"epoch": 0.9235023041474655,
"grad_norm": 0.27732783555984497,
"learning_rate": 2.6219442209591123e-05,
"loss": 1.6918,
"step": 501
},
{
"epoch": 0.9253456221198156,
"grad_norm": 0.28199324011802673,
"learning_rate": 2.619803697349994e-05,
"loss": 1.6222,
"step": 502
},
{
"epoch": 0.9271889400921659,
"grad_norm": 0.28517088294029236,
"learning_rate": 2.6176580101381273e-05,
"loss": 1.6017,
"step": 503
},
{
"epoch": 0.9290322580645162,
"grad_norm": 0.2852960526943207,
"learning_rate": 2.6155071692176348e-05,
"loss": 1.6117,
"step": 504
},
{
"epoch": 0.9308755760368663,
"grad_norm": 0.2706369459629059,
"learning_rate": 2.613351184506405e-05,
"loss": 1.6483,
"step": 505
},
{
"epoch": 0.9327188940092166,
"grad_norm": 0.31629517674446106,
"learning_rate": 2.6111900659460455e-05,
"loss": 1.63,
"step": 506
},
{
"epoch": 0.9345622119815669,
"grad_norm": 0.2743918001651764,
"learning_rate": 2.6090238235018365e-05,
"loss": 1.617,
"step": 507
},
{
"epoch": 0.936405529953917,
"grad_norm": 0.26636356115341187,
"learning_rate": 2.6068524671626856e-05,
"loss": 1.6671,
"step": 508
},
{
"epoch": 0.9382488479262673,
"grad_norm": 0.2727503478527069,
"learning_rate": 2.6046760069410806e-05,
"loss": 1.6101,
"step": 509
},
{
"epoch": 0.9400921658986175,
"grad_norm": 0.27820461988449097,
"learning_rate": 2.6024944528730453e-05,
"loss": 1.5903,
"step": 510
},
{
"epoch": 0.9419354838709677,
"grad_norm": 0.2720506191253662,
"learning_rate": 2.6003078150180922e-05,
"loss": 1.6722,
"step": 511
},
{
"epoch": 0.943778801843318,
"grad_norm": 0.2729189097881317,
"learning_rate": 2.598116103459174e-05,
"loss": 1.6232,
"step": 512
},
{
"epoch": 0.9456221198156682,
"grad_norm": 0.2625363767147064,
"learning_rate": 2.595919328302641e-05,
"loss": 1.5969,
"step": 513
},
{
"epoch": 0.9474654377880184,
"grad_norm": 0.26478803157806396,
"learning_rate": 2.5937174996781927e-05,
"loss": 1.5817,
"step": 514
},
{
"epoch": 0.9493087557603687,
"grad_norm": 0.28460946679115295,
"learning_rate": 2.5915106277388293e-05,
"loss": 1.5845,
"step": 515
},
{
"epoch": 0.9511520737327189,
"grad_norm": 0.2615947127342224,
"learning_rate": 2.5892987226608082e-05,
"loss": 1.6227,
"step": 516
},
{
"epoch": 0.9529953917050691,
"grad_norm": 0.25825098156929016,
"learning_rate": 2.5870817946435953e-05,
"loss": 1.5853,
"step": 517
},
{
"epoch": 0.9548387096774194,
"grad_norm": 0.2917359471321106,
"learning_rate": 2.5848598539098164e-05,
"loss": 1.6514,
"step": 518
},
{
"epoch": 0.9566820276497696,
"grad_norm": 0.25932732224464417,
"learning_rate": 2.5826329107052144e-05,
"loss": 1.603,
"step": 519
},
{
"epoch": 0.9585253456221198,
"grad_norm": 0.25399070978164673,
"learning_rate": 2.5804009752985975e-05,
"loss": 1.6073,
"step": 520
},
{
"epoch": 0.96036866359447,
"grad_norm": 0.28060877323150635,
"learning_rate": 2.5781640579817946e-05,
"loss": 1.6337,
"step": 521
},
{
"epoch": 0.9622119815668203,
"grad_norm": 0.28021928668022156,
"learning_rate": 2.5759221690696062e-05,
"loss": 1.6345,
"step": 522
},
{
"epoch": 0.9640552995391705,
"grad_norm": 0.27043914794921875,
"learning_rate": 2.573675318899759e-05,
"loss": 1.6471,
"step": 523
},
{
"epoch": 0.9658986175115207,
"grad_norm": 0.2874245345592499,
"learning_rate": 2.5714235178328554e-05,
"loss": 1.6632,
"step": 524
},
{
"epoch": 0.967741935483871,
"grad_norm": 0.2559823989868164,
"learning_rate": 2.5691667762523284e-05,
"loss": 1.6133,
"step": 525
},
{
"epoch": 0.9695852534562212,
"grad_norm": 0.2857271432876587,
"learning_rate": 2.566905104564393e-05,
"loss": 1.628,
"step": 526
},
{
"epoch": 0.9714285714285714,
"grad_norm": 0.27135902643203735,
"learning_rate": 2.564638513197995e-05,
"loss": 1.5717,
"step": 527
},
{
"epoch": 0.9732718894009217,
"grad_norm": 0.25891175866127014,
"learning_rate": 2.562367012604769e-05,
"loss": 1.6104,
"step": 528
},
{
"epoch": 0.9751152073732718,
"grad_norm": 0.27881482243537903,
"learning_rate": 2.5600906132589846e-05,
"loss": 1.6325,
"step": 529
},
{
"epoch": 0.9769585253456221,
"grad_norm": 0.2634258270263672,
"learning_rate": 2.557809325657501e-05,
"loss": 1.605,
"step": 530
},
{
"epoch": 0.9788018433179724,
"grad_norm": 0.2687437832355499,
"learning_rate": 2.555523160319719e-05,
"loss": 1.6302,
"step": 531
},
{
"epoch": 0.9806451612903225,
"grad_norm": 0.27842116355895996,
"learning_rate": 2.5532321277875305e-05,
"loss": 1.6813,
"step": 532
},
{
"epoch": 0.9824884792626728,
"grad_norm": 0.26950204372406006,
"learning_rate": 2.5509362386252702e-05,
"loss": 1.6166,
"step": 533
},
{
"epoch": 0.9843317972350231,
"grad_norm": 0.2951159179210663,
"learning_rate": 2.5486355034196686e-05,
"loss": 1.669,
"step": 534
},
{
"epoch": 0.9861751152073732,
"grad_norm": 0.2618483603000641,
"learning_rate": 2.5463299327798015e-05,
"loss": 1.6714,
"step": 535
},
{
"epoch": 0.9880184331797235,
"grad_norm": 0.25726941227912903,
"learning_rate": 2.544019537337043e-05,
"loss": 1.6314,
"step": 536
},
{
"epoch": 0.9898617511520738,
"grad_norm": 0.2638774812221527,
"learning_rate": 2.541704327745013e-05,
"loss": 1.6458,
"step": 537
},
{
"epoch": 0.9917050691244239,
"grad_norm": 0.2752140760421753,
"learning_rate": 2.539384314679532e-05,
"loss": 1.6564,
"step": 538
},
{
"epoch": 0.9935483870967742,
"grad_norm": 0.27124327421188354,
"learning_rate": 2.5370595088385696e-05,
"loss": 1.6071,
"step": 539
},
{
"epoch": 0.9953917050691244,
"grad_norm": 0.26434096693992615,
"learning_rate": 2.5347299209421955e-05,
"loss": 1.611,
"step": 540
},
{
"epoch": 0.9972350230414746,
"grad_norm": 0.2855331003665924,
"learning_rate": 2.53239556173253e-05,
"loss": 1.6311,
"step": 541
},
{
"epoch": 0.9990783410138249,
"grad_norm": 0.2693633437156677,
"learning_rate": 2.530056441973696e-05,
"loss": 1.5659,
"step": 542
},
{
"epoch": 1.0009216589861751,
"grad_norm": 0.26715287566185,
"learning_rate": 2.5277125724517665e-05,
"loss": 1.6523,
"step": 543
},
{
"epoch": 1.0027649769585254,
"grad_norm": 0.30740463733673096,
"learning_rate": 2.525363963974717e-05,
"loss": 1.655,
"step": 544
},
{
"epoch": 1.0046082949308757,
"grad_norm": 0.2541782557964325,
"learning_rate": 2.523010627372376e-05,
"loss": 1.5368,
"step": 545
},
{
"epoch": 1.0064516129032257,
"grad_norm": 0.27700361609458923,
"learning_rate": 2.520652573496373e-05,
"loss": 1.6131,
"step": 546
},
{
"epoch": 1.008294930875576,
"grad_norm": 0.2821163237094879,
"learning_rate": 2.51828981322009e-05,
"loss": 1.6299,
"step": 547
},
{
"epoch": 1.0101382488479262,
"grad_norm": 0.26375892758369446,
"learning_rate": 2.5159223574386117e-05,
"loss": 1.6282,
"step": 548
},
{
"epoch": 1.0119815668202765,
"grad_norm": 0.26267147064208984,
"learning_rate": 2.513550217068673e-05,
"loss": 1.6306,
"step": 549
},
{
"epoch": 1.0138248847926268,
"grad_norm": 0.2676134407520294,
"learning_rate": 2.5111734030486127e-05,
"loss": 1.6022,
"step": 550
},
{
"epoch": 1.015668202764977,
"grad_norm": 0.27351808547973633,
"learning_rate": 2.508791926338317e-05,
"loss": 1.6113,
"step": 551
},
{
"epoch": 1.017511520737327,
"grad_norm": 0.2688004970550537,
"learning_rate": 2.5064057979191766e-05,
"loss": 1.6101,
"step": 552
},
{
"epoch": 1.0193548387096774,
"grad_norm": 0.26147159934043884,
"learning_rate": 2.5040150287940286e-05,
"loss": 1.611,
"step": 553
},
{
"epoch": 1.0211981566820276,
"grad_norm": 0.272300660610199,
"learning_rate": 2.5016196299871115e-05,
"loss": 1.6068,
"step": 554
},
{
"epoch": 1.023041474654378,
"grad_norm": 0.26897957921028137,
"learning_rate": 2.49921961254401e-05,
"loss": 1.6466,
"step": 555
},
{
"epoch": 1.0248847926267282,
"grad_norm": 0.26811644434928894,
"learning_rate": 2.496814987531609e-05,
"loss": 1.651,
"step": 556
},
{
"epoch": 1.0267281105990784,
"grad_norm": 0.265045702457428,
"learning_rate": 2.4944057660380363e-05,
"loss": 1.6455,
"step": 557
},
{
"epoch": 1.0285714285714285,
"grad_norm": 0.2766599953174591,
"learning_rate": 2.4919919591726175e-05,
"loss": 1.6231,
"step": 558
},
{
"epoch": 1.0304147465437787,
"grad_norm": 0.27361610531806946,
"learning_rate": 2.489573578065821e-05,
"loss": 1.6258,
"step": 559
},
{
"epoch": 1.032258064516129,
"grad_norm": 0.26939770579338074,
"learning_rate": 2.487150633869207e-05,
"loss": 1.5856,
"step": 560
},
{
"epoch": 1.0341013824884793,
"grad_norm": 0.28874027729034424,
"learning_rate": 2.484723137755379e-05,
"loss": 1.5899,
"step": 561
},
{
"epoch": 1.0359447004608295,
"grad_norm": 0.2698168158531189,
"learning_rate": 2.482291100917928e-05,
"loss": 1.7224,
"step": 562
},
{
"epoch": 1.0377880184331798,
"grad_norm": 0.26175767183303833,
"learning_rate": 2.4798545345713837e-05,
"loss": 1.6187,
"step": 563
},
{
"epoch": 1.0396313364055298,
"grad_norm": 0.27548477053642273,
"learning_rate": 2.4774134499511636e-05,
"loss": 1.7049,
"step": 564
},
{
"epoch": 1.0414746543778801,
"grad_norm": 0.257304847240448,
"learning_rate": 2.4749678583135175e-05,
"loss": 1.5474,
"step": 565
},
{
"epoch": 1.0433179723502304,
"grad_norm": 0.266632080078125,
"learning_rate": 2.472517770935479e-05,
"loss": 1.623,
"step": 566
},
{
"epoch": 1.0451612903225806,
"grad_norm": 0.2716248631477356,
"learning_rate": 2.4700631991148126e-05,
"loss": 1.5814,
"step": 567
},
{
"epoch": 1.047004608294931,
"grad_norm": 0.2753863036632538,
"learning_rate": 2.46760415416996e-05,
"loss": 1.644,
"step": 568
},
{
"epoch": 1.0488479262672812,
"grad_norm": 0.2726069390773773,
"learning_rate": 2.465140647439991e-05,
"loss": 1.6133,
"step": 569
},
{
"epoch": 1.0506912442396312,
"grad_norm": 0.28230923414230347,
"learning_rate": 2.4626726902845477e-05,
"loss": 1.6963,
"step": 570
},
{
"epoch": 1.0525345622119815,
"grad_norm": 0.25983119010925293,
"learning_rate": 2.4602002940837948e-05,
"loss": 1.5626,
"step": 571
},
{
"epoch": 1.0543778801843318,
"grad_norm": 0.2676817774772644,
"learning_rate": 2.4577234702383666e-05,
"loss": 1.5422,
"step": 572
},
{
"epoch": 1.056221198156682,
"grad_norm": 0.26019108295440674,
"learning_rate": 2.4552422301693128e-05,
"loss": 1.5826,
"step": 573
},
{
"epoch": 1.0580645161290323,
"grad_norm": 0.26668256521224976,
"learning_rate": 2.452756585318048e-05,
"loss": 1.596,
"step": 574
},
{
"epoch": 1.0599078341013826,
"grad_norm": 0.28593432903289795,
"learning_rate": 2.4502665471462983e-05,
"loss": 1.6028,
"step": 575
},
{
"epoch": 1.0617511520737328,
"grad_norm": 0.2791599929332733,
"learning_rate": 2.447772127136046e-05,
"loss": 1.5927,
"step": 576
},
{
"epoch": 1.0635944700460829,
"grad_norm": 0.28675881028175354,
"learning_rate": 2.4452733367894816e-05,
"loss": 1.5879,
"step": 577
},
{
"epoch": 1.0654377880184331,
"grad_norm": 0.29501160979270935,
"learning_rate": 2.4427701876289465e-05,
"loss": 1.5583,
"step": 578
},
{
"epoch": 1.0672811059907834,
"grad_norm": 0.2674134075641632,
"learning_rate": 2.440262691196881e-05,
"loss": 1.6205,
"step": 579
},
{
"epoch": 1.0691244239631337,
"grad_norm": 0.32356998324394226,
"learning_rate": 2.437750859055773e-05,
"loss": 1.6112,
"step": 580
},
{
"epoch": 1.070967741935484,
"grad_norm": 0.2775920629501343,
"learning_rate": 2.4352347027881003e-05,
"loss": 1.6036,
"step": 581
},
{
"epoch": 1.072811059907834,
"grad_norm": 0.28417059779167175,
"learning_rate": 2.4327142339962827e-05,
"loss": 1.6073,
"step": 582
},
{
"epoch": 1.0746543778801843,
"grad_norm": 0.316342294216156,
"learning_rate": 2.430189464302625e-05,
"loss": 1.6312,
"step": 583
},
{
"epoch": 1.0764976958525345,
"grad_norm": 0.2634347081184387,
"learning_rate": 2.4276604053492636e-05,
"loss": 1.6042,
"step": 584
},
{
"epoch": 1.0783410138248848,
"grad_norm": 0.2889562249183655,
"learning_rate": 2.425127068798113e-05,
"loss": 1.586,
"step": 585
},
{
"epoch": 1.080184331797235,
"grad_norm": 0.2724316716194153,
"learning_rate": 2.422589466330814e-05,
"loss": 1.6629,
"step": 586
},
{
"epoch": 1.0820276497695853,
"grad_norm": 0.263497531414032,
"learning_rate": 2.4200476096486774e-05,
"loss": 1.5843,
"step": 587
},
{
"epoch": 1.0838709677419356,
"grad_norm": 0.27481377124786377,
"learning_rate": 2.4175015104726306e-05,
"loss": 1.6378,
"step": 588
},
{
"epoch": 1.0857142857142856,
"grad_norm": 0.28347697854042053,
"learning_rate": 2.414951180543164e-05,
"loss": 1.7082,
"step": 589
},
{
"epoch": 1.087557603686636,
"grad_norm": 0.2818866968154907,
"learning_rate": 2.4123966316202768e-05,
"loss": 1.5482,
"step": 590
},
{
"epoch": 1.0894009216589862,
"grad_norm": 0.26917752623558044,
"learning_rate": 2.4098378754834227e-05,
"loss": 1.6042,
"step": 591
},
{
"epoch": 1.0912442396313364,
"grad_norm": 0.2925183176994324,
"learning_rate": 2.4072749239314565e-05,
"loss": 1.5839,
"step": 592
},
{
"epoch": 1.0930875576036867,
"grad_norm": 0.2812125086784363,
"learning_rate": 2.4047077887825765e-05,
"loss": 1.5705,
"step": 593
},
{
"epoch": 1.094930875576037,
"grad_norm": 0.2660687565803528,
"learning_rate": 2.402136481874275e-05,
"loss": 1.6325,
"step": 594
},
{
"epoch": 1.096774193548387,
"grad_norm": 0.308992862701416,
"learning_rate": 2.399561015063278e-05,
"loss": 1.5755,
"step": 595
},
{
"epoch": 1.0986175115207373,
"grad_norm": 0.2750917971134186,
"learning_rate": 2.3969814002254965e-05,
"loss": 1.6258,
"step": 596
},
{
"epoch": 1.1004608294930875,
"grad_norm": 0.277424156665802,
"learning_rate": 2.3943976492559675e-05,
"loss": 1.6046,
"step": 597
},
{
"epoch": 1.1023041474654378,
"grad_norm": 0.2793235182762146,
"learning_rate": 2.3918097740687987e-05,
"loss": 1.6198,
"step": 598
},
{
"epoch": 1.104147465437788,
"grad_norm": 0.30500084161758423,
"learning_rate": 2.3892177865971183e-05,
"loss": 1.5345,
"step": 599
},
{
"epoch": 1.1059907834101383,
"grad_norm": 0.2782265841960907,
"learning_rate": 2.386621698793015e-05,
"loss": 1.6041,
"step": 600
},
{
"epoch": 1.1078341013824884,
"grad_norm": 0.2845817506313324,
"learning_rate": 2.3840215226274847e-05,
"loss": 1.5975,
"step": 601
},
{
"epoch": 1.1096774193548387,
"grad_norm": 0.31969916820526123,
"learning_rate": 2.3814172700903775e-05,
"loss": 1.6021,
"step": 602
},
{
"epoch": 1.111520737327189,
"grad_norm": 0.26726001501083374,
"learning_rate": 2.3788089531903372e-05,
"loss": 1.5317,
"step": 603
},
{
"epoch": 1.1133640552995392,
"grad_norm": 0.2735467553138733,
"learning_rate": 2.3761965839547515e-05,
"loss": 1.5867,
"step": 604
},
{
"epoch": 1.1152073732718895,
"grad_norm": 0.31699496507644653,
"learning_rate": 2.3735801744296934e-05,
"loss": 1.6256,
"step": 605
},
{
"epoch": 1.1170506912442397,
"grad_norm": 0.27312713861465454,
"learning_rate": 2.3709597366798662e-05,
"loss": 1.6208,
"step": 606
},
{
"epoch": 1.1188940092165898,
"grad_norm": 0.2782924473285675,
"learning_rate": 2.3683352827885472e-05,
"loss": 1.6535,
"step": 607
},
{
"epoch": 1.12073732718894,
"grad_norm": 0.30579322576522827,
"learning_rate": 2.365706824857535e-05,
"loss": 1.606,
"step": 608
},
{
"epoch": 1.1225806451612903,
"grad_norm": 0.28099164366722107,
"learning_rate": 2.3630743750070892e-05,
"loss": 1.5968,
"step": 609
},
{
"epoch": 1.1244239631336406,
"grad_norm": 0.27450433373451233,
"learning_rate": 2.360437945375878e-05,
"loss": 1.6303,
"step": 610
},
{
"epoch": 1.1262672811059908,
"grad_norm": 0.27543413639068604,
"learning_rate": 2.3577975481209214e-05,
"loss": 1.6004,
"step": 611
},
{
"epoch": 1.128110599078341,
"grad_norm": 0.27525603771209717,
"learning_rate": 2.3551531954175335e-05,
"loss": 1.6507,
"step": 612
},
{
"epoch": 1.1299539170506911,
"grad_norm": 0.26268866658210754,
"learning_rate": 2.3525048994592684e-05,
"loss": 1.5314,
"step": 613
},
{
"epoch": 1.1317972350230414,
"grad_norm": 0.2847149968147278,
"learning_rate": 2.3498526724578637e-05,
"loss": 1.5997,
"step": 614
},
{
"epoch": 1.1336405529953917,
"grad_norm": 0.2706824243068695,
"learning_rate": 2.3471965266431824e-05,
"loss": 1.6192,
"step": 615
},
{
"epoch": 1.135483870967742,
"grad_norm": 0.28432345390319824,
"learning_rate": 2.3445364742631592e-05,
"loss": 1.5632,
"step": 616
},
{
"epoch": 1.1373271889400922,
"grad_norm": 0.2760394811630249,
"learning_rate": 2.3418725275837413e-05,
"loss": 1.6104,
"step": 617
},
{
"epoch": 1.1391705069124425,
"grad_norm": 0.26547399163246155,
"learning_rate": 2.3392046988888345e-05,
"loss": 1.5942,
"step": 618
},
{
"epoch": 1.1410138248847925,
"grad_norm": 0.28383272886276245,
"learning_rate": 2.3365330004802443e-05,
"loss": 1.6284,
"step": 619
},
{
"epoch": 1.1428571428571428,
"grad_norm": 0.2690708339214325,
"learning_rate": 2.33385744467762e-05,
"loss": 1.5903,
"step": 620
},
{
"epoch": 1.144700460829493,
"grad_norm": 0.2783527970314026,
"learning_rate": 2.331178043818399e-05,
"loss": 1.6339,
"step": 621
},
{
"epoch": 1.1465437788018433,
"grad_norm": 0.2668425738811493,
"learning_rate": 2.328494810257748e-05,
"loss": 1.5174,
"step": 622
},
{
"epoch": 1.1483870967741936,
"grad_norm": 0.2701328992843628,
"learning_rate": 2.3258077563685072e-05,
"loss": 1.6126,
"step": 623
},
{
"epoch": 1.1502304147465439,
"grad_norm": 0.26650527119636536,
"learning_rate": 2.3231168945411326e-05,
"loss": 1.5872,
"step": 624
},
{
"epoch": 1.1520737327188941,
"grad_norm": 0.27082017064094543,
"learning_rate": 2.320422237183641e-05,
"loss": 1.6007,
"step": 625
},
{
"epoch": 1.1539170506912442,
"grad_norm": 0.28844255208969116,
"learning_rate": 2.317723796721547e-05,
"loss": 1.5988,
"step": 626
},
{
"epoch": 1.1557603686635944,
"grad_norm": 0.26136210560798645,
"learning_rate": 2.315021585597815e-05,
"loss": 1.5385,
"step": 627
},
{
"epoch": 1.1576036866359447,
"grad_norm": 0.28600579500198364,
"learning_rate": 2.3123156162727923e-05,
"loss": 1.6156,
"step": 628
},
{
"epoch": 1.159447004608295,
"grad_norm": 0.27295541763305664,
"learning_rate": 2.3096059012241583e-05,
"loss": 1.5353,
"step": 629
},
{
"epoch": 1.1612903225806452,
"grad_norm": 0.2765044867992401,
"learning_rate": 2.3068924529468638e-05,
"loss": 1.6577,
"step": 630
},
{
"epoch": 1.1631336405529953,
"grad_norm": 0.28675732016563416,
"learning_rate": 2.3041752839530735e-05,
"loss": 1.6112,
"step": 631
},
{
"epoch": 1.1649769585253456,
"grad_norm": 0.2791755795478821,
"learning_rate": 2.3014544067721096e-05,
"loss": 1.5268,
"step": 632
},
{
"epoch": 1.1668202764976958,
"grad_norm": 0.2844353914260864,
"learning_rate": 2.298729833950394e-05,
"loss": 1.5635,
"step": 633
},
{
"epoch": 1.168663594470046,
"grad_norm": 0.2774640917778015,
"learning_rate": 2.2960015780513893e-05,
"loss": 1.6243,
"step": 634
},
{
"epoch": 1.1705069124423964,
"grad_norm": 0.27996620535850525,
"learning_rate": 2.2932696516555396e-05,
"loss": 1.5647,
"step": 635
},
{
"epoch": 1.1723502304147466,
"grad_norm": 0.2877368927001953,
"learning_rate": 2.2905340673602184e-05,
"loss": 1.5705,
"step": 636
},
{
"epoch": 1.1741935483870969,
"grad_norm": 0.2850167751312256,
"learning_rate": 2.287794837779662e-05,
"loss": 1.6524,
"step": 637
},
{
"epoch": 1.176036866359447,
"grad_norm": 0.2903353273868561,
"learning_rate": 2.2850519755449183e-05,
"loss": 1.57,
"step": 638
},
{
"epoch": 1.1778801843317972,
"grad_norm": 0.2763937711715698,
"learning_rate": 2.282305493303785e-05,
"loss": 1.6409,
"step": 639
},
{
"epoch": 1.1797235023041475,
"grad_norm": 0.2943311929702759,
"learning_rate": 2.2795554037207528e-05,
"loss": 1.6925,
"step": 640
},
{
"epoch": 1.1815668202764977,
"grad_norm": 0.2771497070789337,
"learning_rate": 2.2768017194769466e-05,
"loss": 1.5796,
"step": 641
},
{
"epoch": 1.183410138248848,
"grad_norm": 0.26944899559020996,
"learning_rate": 2.2740444532700657e-05,
"loss": 1.6039,
"step": 642
},
{
"epoch": 1.185253456221198,
"grad_norm": 0.2843589782714844,
"learning_rate": 2.271283617814328e-05,
"loss": 1.6457,
"step": 643
},
{
"epoch": 1.1870967741935483,
"grad_norm": 0.2836996614933014,
"learning_rate": 2.268519225840409e-05,
"loss": 1.5728,
"step": 644
},
{
"epoch": 1.1889400921658986,
"grad_norm": 0.28848952054977417,
"learning_rate": 2.2657512900953832e-05,
"loss": 1.617,
"step": 645
},
{
"epoch": 1.1907834101382488,
"grad_norm": 0.2769070267677307,
"learning_rate": 2.2629798233426677e-05,
"loss": 1.6127,
"step": 646
},
{
"epoch": 1.192626728110599,
"grad_norm": 0.2685301601886749,
"learning_rate": 2.26020483836196e-05,
"loss": 1.5747,
"step": 647
},
{
"epoch": 1.1944700460829494,
"grad_norm": 0.2858518660068512,
"learning_rate": 2.2574263479491816e-05,
"loss": 1.6335,
"step": 648
},
{
"epoch": 1.1963133640552996,
"grad_norm": 0.27150848507881165,
"learning_rate": 2.2546443649164186e-05,
"loss": 1.5749,
"step": 649
},
{
"epoch": 1.1981566820276497,
"grad_norm": 0.27711644768714905,
"learning_rate": 2.2518589020918612e-05,
"loss": 1.6022,
"step": 650
},
{
"epoch": 1.2,
"grad_norm": 0.27468806505203247,
"learning_rate": 2.2490699723197454e-05,
"loss": 1.6034,
"step": 651
},
{
"epoch": 1.2018433179723502,
"grad_norm": 0.2741892635822296,
"learning_rate": 2.2462775884602954e-05,
"loss": 1.6301,
"step": 652
},
{
"epoch": 1.2036866359447005,
"grad_norm": 0.2700754702091217,
"learning_rate": 2.243481763389661e-05,
"loss": 1.5741,
"step": 653
},
{
"epoch": 1.2055299539170508,
"grad_norm": 0.28095367550849915,
"learning_rate": 2.24068250999986e-05,
"loss": 1.5861,
"step": 654
},
{
"epoch": 1.2073732718894008,
"grad_norm": 0.27654770016670227,
"learning_rate": 2.2378798411987218e-05,
"loss": 1.6016,
"step": 655
},
{
"epoch": 1.209216589861751,
"grad_norm": 0.27032870054244995,
"learning_rate": 2.2350737699098203e-05,
"loss": 1.6194,
"step": 656
},
{
"epoch": 1.2110599078341013,
"grad_norm": 0.2779473066329956,
"learning_rate": 2.2322643090724218e-05,
"loss": 1.6285,
"step": 657
},
{
"epoch": 1.2129032258064516,
"grad_norm": 0.2657751739025116,
"learning_rate": 2.229451471641422e-05,
"loss": 1.6217,
"step": 658
},
{
"epoch": 1.2147465437788019,
"grad_norm": 0.27549269795417786,
"learning_rate": 2.226635270587286e-05,
"loss": 1.5244,
"step": 659
},
{
"epoch": 1.2165898617511521,
"grad_norm": 0.2700861692428589,
"learning_rate": 2.2238157188959893e-05,
"loss": 1.5988,
"step": 660
},
{
"epoch": 1.2184331797235024,
"grad_norm": 0.27551552653312683,
"learning_rate": 2.2209928295689582e-05,
"loss": 1.6695,
"step": 661
},
{
"epoch": 1.2202764976958524,
"grad_norm": 0.27419742941856384,
"learning_rate": 2.2181666156230082e-05,
"loss": 1.5763,
"step": 662
},
{
"epoch": 1.2221198156682027,
"grad_norm": 0.26684898138046265,
"learning_rate": 2.2153370900902872e-05,
"loss": 1.5866,
"step": 663
},
{
"epoch": 1.223963133640553,
"grad_norm": 0.2631971836090088,
"learning_rate": 2.2125042660182115e-05,
"loss": 1.555,
"step": 664
},
{
"epoch": 1.2258064516129032,
"grad_norm": 0.26897555589675903,
"learning_rate": 2.2096681564694087e-05,
"loss": 1.6108,
"step": 665
},
{
"epoch": 1.2276497695852535,
"grad_norm": 0.2753186523914337,
"learning_rate": 2.2068287745216552e-05,
"loss": 1.6178,
"step": 666
},
{
"epoch": 1.2294930875576038,
"grad_norm": 0.272468626499176,
"learning_rate": 2.203986133267818e-05,
"loss": 1.5851,
"step": 667
},
{
"epoch": 1.2313364055299538,
"grad_norm": 0.28555935621261597,
"learning_rate": 2.2011402458157935e-05,
"loss": 1.657,
"step": 668
},
{
"epoch": 1.233179723502304,
"grad_norm": 0.2571600079536438,
"learning_rate": 2.198291125288445e-05,
"loss": 1.5385,
"step": 669
},
{
"epoch": 1.2350230414746544,
"grad_norm": 0.2795640528202057,
"learning_rate": 2.1954387848235455e-05,
"loss": 1.5856,
"step": 670
},
{
"epoch": 1.2368663594470046,
"grad_norm": 0.2699826657772064,
"learning_rate": 2.1925832375737168e-05,
"loss": 1.587,
"step": 671
},
{
"epoch": 1.238709677419355,
"grad_norm": 0.27142706513404846,
"learning_rate": 2.1897244967063653e-05,
"loss": 1.6016,
"step": 672
},
{
"epoch": 1.2405529953917052,
"grad_norm": 0.2868463099002838,
"learning_rate": 2.1868625754036256e-05,
"loss": 1.5758,
"step": 673
},
{
"epoch": 1.2423963133640552,
"grad_norm": 0.2734906077384949,
"learning_rate": 2.1839974868622956e-05,
"loss": 1.5834,
"step": 674
},
{
"epoch": 1.2442396313364055,
"grad_norm": 0.27085962891578674,
"learning_rate": 2.1811292442937808e-05,
"loss": 1.5689,
"step": 675
},
{
"epoch": 1.2460829493087557,
"grad_norm": 0.2795475125312805,
"learning_rate": 2.1782578609240286e-05,
"loss": 1.5531,
"step": 676
},
{
"epoch": 1.247926267281106,
"grad_norm": 0.27187928557395935,
"learning_rate": 2.1753833499934694e-05,
"loss": 1.5728,
"step": 677
},
{
"epoch": 1.2497695852534563,
"grad_norm": 0.2647460103034973,
"learning_rate": 2.1725057247569552e-05,
"loss": 1.5917,
"step": 678
},
{
"epoch": 1.2516129032258063,
"grad_norm": 0.2762637436389923,
"learning_rate": 2.1696249984836993e-05,
"loss": 1.6209,
"step": 679
},
{
"epoch": 1.2534562211981566,
"grad_norm": 0.27510347962379456,
"learning_rate": 2.166741184457214e-05,
"loss": 1.6489,
"step": 680
},
{
"epoch": 1.2552995391705069,
"grad_norm": 0.2649478316307068,
"learning_rate": 2.1638542959752485e-05,
"loss": 1.5935,
"step": 681
},
{
"epoch": 1.2571428571428571,
"grad_norm": 0.263662189245224,
"learning_rate": 2.160964346349731e-05,
"loss": 1.6304,
"step": 682
},
{
"epoch": 1.2589861751152074,
"grad_norm": 0.280752956867218,
"learning_rate": 2.1580713489067043e-05,
"loss": 1.6311,
"step": 683
},
{
"epoch": 1.2608294930875577,
"grad_norm": 0.2848096191883087,
"learning_rate": 2.155175316986265e-05,
"loss": 1.6682,
"step": 684
},
{
"epoch": 1.262672811059908,
"grad_norm": 0.2672868072986603,
"learning_rate": 2.1522762639425012e-05,
"loss": 1.5798,
"step": 685
},
{
"epoch": 1.2645161290322582,
"grad_norm": 0.2747519612312317,
"learning_rate": 2.1493742031434343e-05,
"loss": 1.5585,
"step": 686
},
{
"epoch": 1.2663594470046082,
"grad_norm": 0.27021849155426025,
"learning_rate": 2.1464691479709534e-05,
"loss": 1.5789,
"step": 687
},
{
"epoch": 1.2682027649769585,
"grad_norm": 0.26815730333328247,
"learning_rate": 2.1435611118207546e-05,
"loss": 1.564,
"step": 688
},
{
"epoch": 1.2700460829493088,
"grad_norm": 0.2694461941719055,
"learning_rate": 2.140650108102281e-05,
"loss": 1.5709,
"step": 689
},
{
"epoch": 1.271889400921659,
"grad_norm": 0.27626311779022217,
"learning_rate": 2.137736150238659e-05,
"loss": 1.6146,
"step": 690
},
{
"epoch": 1.2737327188940093,
"grad_norm": 0.2797856628894806,
"learning_rate": 2.1348192516666376e-05,
"loss": 1.6126,
"step": 691
},
{
"epoch": 1.2755760368663593,
"grad_norm": 0.2678052484989166,
"learning_rate": 2.1318994258365253e-05,
"loss": 1.5817,
"step": 692
},
{
"epoch": 1.2774193548387096,
"grad_norm": 0.2734876275062561,
"learning_rate": 2.128976686212129e-05,
"loss": 1.5634,
"step": 693
},
{
"epoch": 1.2792626728110599,
"grad_norm": 0.2710317373275757,
"learning_rate": 2.1260510462706914e-05,
"loss": 1.6467,
"step": 694
},
{
"epoch": 1.2811059907834101,
"grad_norm": 0.2857086956501007,
"learning_rate": 2.12312251950283e-05,
"loss": 1.5887,
"step": 695
},
{
"epoch": 1.2829493087557604,
"grad_norm": 0.26261481642723083,
"learning_rate": 2.120191119412472e-05,
"loss": 1.6167,
"step": 696
},
{
"epoch": 1.2847926267281107,
"grad_norm": 0.26819396018981934,
"learning_rate": 2.117256859516795e-05,
"loss": 1.5946,
"step": 697
},
{
"epoch": 1.286635944700461,
"grad_norm": 0.2797357439994812,
"learning_rate": 2.1143197533461655e-05,
"loss": 1.5888,
"step": 698
},
{
"epoch": 1.288479262672811,
"grad_norm": 0.2809047996997833,
"learning_rate": 2.1113798144440712e-05,
"loss": 1.5984,
"step": 699
},
{
"epoch": 1.2903225806451613,
"grad_norm": 0.2751614451408386,
"learning_rate": 2.108437056367064e-05,
"loss": 1.6601,
"step": 700
},
{
"epoch": 1.2921658986175115,
"grad_norm": 0.26571017503738403,
"learning_rate": 2.1054914926846957e-05,
"loss": 1.5355,
"step": 701
},
{
"epoch": 1.2940092165898618,
"grad_norm": 0.3031296133995056,
"learning_rate": 2.1025431369794546e-05,
"loss": 1.6608,
"step": 702
},
{
"epoch": 1.295852534562212,
"grad_norm": 0.28314271569252014,
"learning_rate": 2.0995920028467027e-05,
"loss": 1.6063,
"step": 703
},
{
"epoch": 1.297695852534562,
"grad_norm": 0.27367842197418213,
"learning_rate": 2.096638103894616e-05,
"loss": 1.5948,
"step": 704
},
{
"epoch": 1.2995391705069124,
"grad_norm": 0.2784302234649658,
"learning_rate": 2.0936814537441173e-05,
"loss": 1.5953,
"step": 705
},
{
"epoch": 1.3013824884792626,
"grad_norm": 0.2915882468223572,
"learning_rate": 2.0907220660288166e-05,
"loss": 1.5376,
"step": 706
},
{
"epoch": 1.303225806451613,
"grad_norm": 0.26861122250556946,
"learning_rate": 2.087759954394948e-05,
"loss": 1.6224,
"step": 707
},
{
"epoch": 1.3050691244239632,
"grad_norm": 0.2893337309360504,
"learning_rate": 2.084795132501304e-05,
"loss": 1.6294,
"step": 708
},
{
"epoch": 1.3069124423963134,
"grad_norm": 0.2968802750110626,
"learning_rate": 2.081827614019177e-05,
"loss": 1.5813,
"step": 709
},
{
"epoch": 1.3087557603686637,
"grad_norm": 0.2807689309120178,
"learning_rate": 2.0788574126322928e-05,
"loss": 1.596,
"step": 710
},
{
"epoch": 1.3105990783410137,
"grad_norm": 0.27932870388031006,
"learning_rate": 2.0758845420367474e-05,
"loss": 1.5958,
"step": 711
},
{
"epoch": 1.312442396313364,
"grad_norm": 0.3168034553527832,
"learning_rate": 2.0729090159409467e-05,
"loss": 1.5696,
"step": 712
},
{
"epoch": 1.3142857142857143,
"grad_norm": 0.2953466475009918,
"learning_rate": 2.0699308480655397e-05,
"loss": 1.5669,
"step": 713
},
{
"epoch": 1.3161290322580645,
"grad_norm": 0.26360318064689636,
"learning_rate": 2.06695005214336e-05,
"loss": 1.6061,
"step": 714
},
{
"epoch": 1.3179723502304148,
"grad_norm": 0.31883785128593445,
"learning_rate": 2.0639666419193565e-05,
"loss": 1.6457,
"step": 715
},
{
"epoch": 1.3198156682027649,
"grad_norm": 0.30148056149482727,
"learning_rate": 2.0609806311505345e-05,
"loss": 1.6045,
"step": 716
},
{
"epoch": 1.3216589861751151,
"grad_norm": 0.2783588767051697,
"learning_rate": 2.057992033605891e-05,
"loss": 1.6246,
"step": 717
},
{
"epoch": 1.3235023041474654,
"grad_norm": 0.2826476991176605,
"learning_rate": 2.0550008630663507e-05,
"loss": 1.6577,
"step": 718
},
{
"epoch": 1.3253456221198157,
"grad_norm": 0.32222914695739746,
"learning_rate": 2.0520071333247025e-05,
"loss": 1.6668,
"step": 719
},
{
"epoch": 1.327188940092166,
"grad_norm": 0.2784786522388458,
"learning_rate": 2.049010858185537e-05,
"loss": 1.6636,
"step": 720
},
{
"epoch": 1.3290322580645162,
"grad_norm": 0.27896296977996826,
"learning_rate": 2.0460120514651814e-05,
"loss": 1.5561,
"step": 721
},
{
"epoch": 1.3308755760368665,
"grad_norm": 0.2953389585018158,
"learning_rate": 2.0430107269916368e-05,
"loss": 1.5208,
"step": 722
},
{
"epoch": 1.3327188940092167,
"grad_norm": 0.27998074889183044,
"learning_rate": 2.0400068986045142e-05,
"loss": 1.6206,
"step": 723
},
{
"epoch": 1.3345622119815668,
"grad_norm": 0.2782033383846283,
"learning_rate": 2.03700058015497e-05,
"loss": 1.6127,
"step": 724
},
{
"epoch": 1.336405529953917,
"grad_norm": 0.2825208008289337,
"learning_rate": 2.0339917855056428e-05,
"loss": 1.5904,
"step": 725
},
{
"epoch": 1.3382488479262673,
"grad_norm": 0.2724984586238861,
"learning_rate": 2.0309805285305905e-05,
"loss": 1.5929,
"step": 726
},
{
"epoch": 1.3400921658986176,
"grad_norm": 0.2638327479362488,
"learning_rate": 2.0279668231152233e-05,
"loss": 1.5806,
"step": 727
},
{
"epoch": 1.3419354838709676,
"grad_norm": 0.27951404452323914,
"learning_rate": 2.024950683156243e-05,
"loss": 1.6097,
"step": 728
},
{
"epoch": 1.3437788018433179,
"grad_norm": 0.2717166841030121,
"learning_rate": 2.021932122561577e-05,
"loss": 1.5724,
"step": 729
},
{
"epoch": 1.3456221198156681,
"grad_norm": 0.2744804620742798,
"learning_rate": 2.0189111552503142e-05,
"loss": 1.6343,
"step": 730
},
{
"epoch": 1.3474654377880184,
"grad_norm": 0.2739951014518738,
"learning_rate": 2.015887795152643e-05,
"loss": 1.609,
"step": 731
},
{
"epoch": 1.3493087557603687,
"grad_norm": 0.2745543420314789,
"learning_rate": 2.0128620562097834e-05,
"loss": 1.634,
"step": 732
},
{
"epoch": 1.351152073732719,
"grad_norm": 0.2853536903858185,
"learning_rate": 2.009833952373925e-05,
"loss": 1.6915,
"step": 733
},
{
"epoch": 1.3529953917050692,
"grad_norm": 0.2606966495513916,
"learning_rate": 2.0068034976081637e-05,
"loss": 1.5641,
"step": 734
},
{
"epoch": 1.3548387096774195,
"grad_norm": 0.2707135081291199,
"learning_rate": 2.0037707058864343e-05,
"loss": 1.5901,
"step": 735
},
{
"epoch": 1.3566820276497695,
"grad_norm": 0.2738732397556305,
"learning_rate": 2.0007355911934473e-05,
"loss": 1.5878,
"step": 736
},
{
"epoch": 1.3585253456221198,
"grad_norm": 0.27152132987976074,
"learning_rate": 1.997698167524628e-05,
"loss": 1.6212,
"step": 737
},
{
"epoch": 1.36036866359447,
"grad_norm": 0.279140442609787,
"learning_rate": 1.9946584488860454e-05,
"loss": 1.5909,
"step": 738
},
{
"epoch": 1.3622119815668203,
"grad_norm": 0.2838742434978485,
"learning_rate": 1.9916164492943518e-05,
"loss": 1.6337,
"step": 739
},
{
"epoch": 1.3640552995391704,
"grad_norm": 0.2730039358139038,
"learning_rate": 1.9885721827767185e-05,
"loss": 1.674,
"step": 740
},
{
"epoch": 1.3658986175115206,
"grad_norm": 0.27932366728782654,
"learning_rate": 1.9855256633707692e-05,
"loss": 1.6264,
"step": 741
},
{
"epoch": 1.367741935483871,
"grad_norm": 0.2915544807910919,
"learning_rate": 1.9824769051245157e-05,
"loss": 1.6138,
"step": 742
},
{
"epoch": 1.3695852534562212,
"grad_norm": 0.2782309949398041,
"learning_rate": 1.979425922096294e-05,
"loss": 1.6153,
"step": 743
},
{
"epoch": 1.3714285714285714,
"grad_norm": 0.29362061619758606,
"learning_rate": 1.976372728354699e-05,
"loss": 1.6308,
"step": 744
},
{
"epoch": 1.3732718894009217,
"grad_norm": 0.2930099368095398,
"learning_rate": 1.9733173379785188e-05,
"loss": 1.5748,
"step": 745
},
{
"epoch": 1.375115207373272,
"grad_norm": 0.27453601360321045,
"learning_rate": 1.9702597650566723e-05,
"loss": 1.5993,
"step": 746
},
{
"epoch": 1.3769585253456222,
"grad_norm": 0.281548410654068,
"learning_rate": 1.9672000236881397e-05,
"loss": 1.6467,
"step": 747
},
{
"epoch": 1.3788018433179723,
"grad_norm": 0.28580373525619507,
"learning_rate": 1.9641381279819028e-05,
"loss": 1.6643,
"step": 748
},
{
"epoch": 1.3806451612903226,
"grad_norm": 0.2795877754688263,
"learning_rate": 1.9610740920568764e-05,
"loss": 1.6006,
"step": 749
},
{
"epoch": 1.3824884792626728,
"grad_norm": 0.27349579334259033,
"learning_rate": 1.9580079300418444e-05,
"loss": 1.654,
"step": 750
},
{
"epoch": 1.384331797235023,
"grad_norm": 0.27888205647468567,
"learning_rate": 1.954939656075394e-05,
"loss": 1.6131,
"step": 751
},
{
"epoch": 1.3861751152073734,
"grad_norm": 0.27265042066574097,
"learning_rate": 1.9518692843058514e-05,
"loss": 1.6203,
"step": 752
},
{
"epoch": 1.3880184331797234,
"grad_norm": 0.2736769914627075,
"learning_rate": 1.9487968288912164e-05,
"loss": 1.6011,
"step": 753
},
{
"epoch": 1.3898617511520737,
"grad_norm": 0.27037787437438965,
"learning_rate": 1.9457223039990963e-05,
"loss": 1.5475,
"step": 754
},
{
"epoch": 1.391705069124424,
"grad_norm": 0.27303317189216614,
"learning_rate": 1.942645723806641e-05,
"loss": 1.6335,
"step": 755
},
{
"epoch": 1.3935483870967742,
"grad_norm": 0.27159225940704346,
"learning_rate": 1.9395671025004777e-05,
"loss": 1.5606,
"step": 756
},
{
"epoch": 1.3953917050691245,
"grad_norm": 0.2682175934314728,
"learning_rate": 1.936486454276647e-05,
"loss": 1.555,
"step": 757
},
{
"epoch": 1.3972350230414747,
"grad_norm": 0.27098003029823303,
"learning_rate": 1.9334037933405337e-05,
"loss": 1.5385,
"step": 758
},
{
"epoch": 1.399078341013825,
"grad_norm": 0.28845879435539246,
"learning_rate": 1.9303191339068048e-05,
"loss": 1.6211,
"step": 759
},
{
"epoch": 1.400921658986175,
"grad_norm": 0.2876651883125305,
"learning_rate": 1.9272324901993436e-05,
"loss": 1.6319,
"step": 760
},
{
"epoch": 1.4027649769585253,
"grad_norm": 0.2796455919742584,
"learning_rate": 1.9241438764511805e-05,
"loss": 1.6263,
"step": 761
},
{
"epoch": 1.4046082949308756,
"grad_norm": 0.27480548620224,
"learning_rate": 1.9210533069044334e-05,
"loss": 1.613,
"step": 762
},
{
"epoch": 1.4064516129032258,
"grad_norm": 0.28065502643585205,
"learning_rate": 1.9179607958102356e-05,
"loss": 1.5789,
"step": 763
},
{
"epoch": 1.4082949308755761,
"grad_norm": 0.28731146454811096,
"learning_rate": 1.9148663574286757e-05,
"loss": 1.5297,
"step": 764
},
{
"epoch": 1.4101382488479262,
"grad_norm": 0.2914833724498749,
"learning_rate": 1.911770006028728e-05,
"loss": 1.5977,
"step": 765
},
{
"epoch": 1.4119815668202764,
"grad_norm": 0.300193190574646,
"learning_rate": 1.908671755888188e-05,
"loss": 1.6296,
"step": 766
},
{
"epoch": 1.4138248847926267,
"grad_norm": 0.29380565881729126,
"learning_rate": 1.9055716212936075e-05,
"loss": 1.6149,
"step": 767
},
{
"epoch": 1.415668202764977,
"grad_norm": 0.32037287950515747,
"learning_rate": 1.9024696165402272e-05,
"loss": 1.6513,
"step": 768
},
{
"epoch": 1.4175115207373272,
"grad_norm": 0.29914116859436035,
"learning_rate": 1.899365755931911e-05,
"loss": 1.5963,
"step": 769
},
{
"epoch": 1.4193548387096775,
"grad_norm": 0.26687341928482056,
"learning_rate": 1.8962600537810824e-05,
"loss": 1.536,
"step": 770
},
{
"epoch": 1.4211981566820278,
"grad_norm": 0.28808560967445374,
"learning_rate": 1.893152524408653e-05,
"loss": 1.6214,
"step": 771
},
{
"epoch": 1.4230414746543778,
"grad_norm": 0.28396207094192505,
"learning_rate": 1.8900431821439644e-05,
"loss": 1.6478,
"step": 772
},
{
"epoch": 1.424884792626728,
"grad_norm": 0.2826618552207947,
"learning_rate": 1.886932041324714e-05,
"loss": 1.5832,
"step": 773
},
{
"epoch": 1.4267281105990783,
"grad_norm": 0.2729291617870331,
"learning_rate": 1.883819116296895e-05,
"loss": 1.5696,
"step": 774
},
{
"epoch": 1.4285714285714286,
"grad_norm": 0.28617042303085327,
"learning_rate": 1.880704421414726e-05,
"loss": 1.606,
"step": 775
},
{
"epoch": 1.4304147465437789,
"grad_norm": 0.30494067072868347,
"learning_rate": 1.8775879710405893e-05,
"loss": 1.6557,
"step": 776
},
{
"epoch": 1.432258064516129,
"grad_norm": 0.2694535255432129,
"learning_rate": 1.8744697795449588e-05,
"loss": 1.5447,
"step": 777
},
{
"epoch": 1.4341013824884792,
"grad_norm": 0.2713358998298645,
"learning_rate": 1.8713498613063403e-05,
"loss": 1.5635,
"step": 778
},
{
"epoch": 1.4359447004608294,
"grad_norm": 0.27467259764671326,
"learning_rate": 1.8682282307111988e-05,
"loss": 1.6066,
"step": 779
},
{
"epoch": 1.4377880184331797,
"grad_norm": 0.2681735157966614,
"learning_rate": 1.865104902153898e-05,
"loss": 1.5669,
"step": 780
},
{
"epoch": 1.43963133640553,
"grad_norm": 0.2794997990131378,
"learning_rate": 1.8619798900366298e-05,
"loss": 1.6059,
"step": 781
},
{
"epoch": 1.4414746543778802,
"grad_norm": 0.27287647128105164,
"learning_rate": 1.8588532087693485e-05,
"loss": 1.5776,
"step": 782
},
{
"epoch": 1.4433179723502305,
"grad_norm": 0.2879515290260315,
"learning_rate": 1.8557248727697068e-05,
"loss": 1.6362,
"step": 783
},
{
"epoch": 1.4451612903225808,
"grad_norm": 0.28515344858169556,
"learning_rate": 1.852594896462987e-05,
"loss": 1.5876,
"step": 784
},
{
"epoch": 1.4470046082949308,
"grad_norm": 0.2730424702167511,
"learning_rate": 1.849463294282035e-05,
"loss": 1.5707,
"step": 785
},
{
"epoch": 1.448847926267281,
"grad_norm": 0.27559229731559753,
"learning_rate": 1.8463300806671936e-05,
"loss": 1.5538,
"step": 786
},
{
"epoch": 1.4506912442396314,
"grad_norm": 0.28002747893333435,
"learning_rate": 1.8431952700662375e-05,
"loss": 1.6236,
"step": 787
},
{
"epoch": 1.4525345622119816,
"grad_norm": 0.28345340490341187,
"learning_rate": 1.840058876934303e-05,
"loss": 1.6436,
"step": 788
},
{
"epoch": 1.4543778801843317,
"grad_norm": 0.26679205894470215,
"learning_rate": 1.8369209157338262e-05,
"loss": 1.49,
"step": 789
},
{
"epoch": 1.456221198156682,
"grad_norm": 0.2743077874183655,
"learning_rate": 1.8337814009344716e-05,
"loss": 1.592,
"step": 790
},
{
"epoch": 1.4580645161290322,
"grad_norm": 0.2735365927219391,
"learning_rate": 1.83064034701307e-05,
"loss": 1.5771,
"step": 791
},
{
"epoch": 1.4599078341013825,
"grad_norm": 0.27030524611473083,
"learning_rate": 1.8274977684535478e-05,
"loss": 1.5751,
"step": 792
},
{
"epoch": 1.4617511520737327,
"grad_norm": 0.28183117508888245,
"learning_rate": 1.824353679746861e-05,
"loss": 1.5485,
"step": 793
},
{
"epoch": 1.463594470046083,
"grad_norm": 0.2746550738811493,
"learning_rate": 1.821208095390931e-05,
"loss": 1.5738,
"step": 794
},
{
"epoch": 1.4654377880184333,
"grad_norm": 0.2818034887313843,
"learning_rate": 1.8180610298905758e-05,
"loss": 1.6364,
"step": 795
},
{
"epoch": 1.4672811059907835,
"grad_norm": 0.2696681320667267,
"learning_rate": 1.8149124977574417e-05,
"loss": 1.6,
"step": 796
},
{
"epoch": 1.4691244239631336,
"grad_norm": 0.2887495458126068,
"learning_rate": 1.8117625135099386e-05,
"loss": 1.6686,
"step": 797
},
{
"epoch": 1.4709677419354839,
"grad_norm": 0.2815185785293579,
"learning_rate": 1.8086110916731724e-05,
"loss": 1.6131,
"step": 798
},
{
"epoch": 1.4728110599078341,
"grad_norm": 0.27256280183792114,
"learning_rate": 1.805458246778878e-05,
"loss": 1.5867,
"step": 799
},
{
"epoch": 1.4746543778801844,
"grad_norm": 0.2779887914657593,
"learning_rate": 1.802303993365353e-05,
"loss": 1.5557,
"step": 800
},
{
"epoch": 1.4764976958525344,
"grad_norm": 0.27857843041419983,
"learning_rate": 1.7991483459773887e-05,
"loss": 1.6668,
"step": 801
},
{
"epoch": 1.4783410138248847,
"grad_norm": 0.26913055777549744,
"learning_rate": 1.795991319166204e-05,
"loss": 1.6072,
"step": 802
},
{
"epoch": 1.480184331797235,
"grad_norm": 0.2818872630596161,
"learning_rate": 1.79283292748938e-05,
"loss": 1.5957,
"step": 803
},
{
"epoch": 1.4820276497695852,
"grad_norm": 0.2774152159690857,
"learning_rate": 1.7896731855107908e-05,
"loss": 1.5923,
"step": 804
},
{
"epoch": 1.4838709677419355,
"grad_norm": 0.2798556983470917,
"learning_rate": 1.7865121078005365e-05,
"loss": 1.5798,
"step": 805
},
{
"epoch": 1.4857142857142858,
"grad_norm": 0.2692021429538727,
"learning_rate": 1.7833497089348772e-05,
"loss": 1.6172,
"step": 806
},
{
"epoch": 1.487557603686636,
"grad_norm": 0.267347514629364,
"learning_rate": 1.780186003496164e-05,
"loss": 1.6114,
"step": 807
},
{
"epoch": 1.4894009216589863,
"grad_norm": 0.2735111713409424,
"learning_rate": 1.7770210060727748e-05,
"loss": 1.5757,
"step": 808
},
{
"epoch": 1.4912442396313363,
"grad_norm": 0.27940452098846436,
"learning_rate": 1.7738547312590426e-05,
"loss": 1.6085,
"step": 809
},
{
"epoch": 1.4930875576036866,
"grad_norm": 0.2675354480743408,
"learning_rate": 1.770687193655192e-05,
"loss": 1.6032,
"step": 810
},
{
"epoch": 1.4949308755760369,
"grad_norm": 0.276353120803833,
"learning_rate": 1.7675184078672714e-05,
"loss": 1.6087,
"step": 811
},
{
"epoch": 1.4967741935483871,
"grad_norm": 0.276836097240448,
"learning_rate": 1.7643483885070827e-05,
"loss": 1.6077,
"step": 812
},
{
"epoch": 1.4986175115207372,
"grad_norm": 0.27562999725341797,
"learning_rate": 1.7611771501921174e-05,
"loss": 1.6598,
"step": 813
},
{
"epoch": 1.5004608294930875,
"grad_norm": 0.2816264033317566,
"learning_rate": 1.7580047075454877e-05,
"loss": 1.6591,
"step": 814
},
{
"epoch": 1.5023041474654377,
"grad_norm": 0.2649102807044983,
"learning_rate": 1.7548310751958588e-05,
"loss": 1.5475,
"step": 815
},
{
"epoch": 1.504147465437788,
"grad_norm": 0.26588475704193115,
"learning_rate": 1.751656267777382e-05,
"loss": 1.6011,
"step": 816
},
{
"epoch": 1.5059907834101383,
"grad_norm": 0.2885441482067108,
"learning_rate": 1.748480299929627e-05,
"loss": 1.6321,
"step": 817
},
{
"epoch": 1.5078341013824885,
"grad_norm": 0.2782214283943176,
"learning_rate": 1.7453031862975146e-05,
"loss": 1.5943,
"step": 818
},
{
"epoch": 1.5096774193548388,
"grad_norm": 0.2720677852630615,
"learning_rate": 1.742124941531249e-05,
"loss": 1.5845,
"step": 819
},
{
"epoch": 1.511520737327189,
"grad_norm": 0.27073559165000916,
"learning_rate": 1.73894558028625e-05,
"loss": 1.6024,
"step": 820
},
{
"epoch": 1.5133640552995393,
"grad_norm": 0.2795216739177704,
"learning_rate": 1.7357651172230852e-05,
"loss": 1.5477,
"step": 821
},
{
"epoch": 1.5152073732718894,
"grad_norm": 0.27710703015327454,
"learning_rate": 1.7325835670074044e-05,
"loss": 1.6505,
"step": 822
},
{
"epoch": 1.5170506912442396,
"grad_norm": 0.2705666124820709,
"learning_rate": 1.729400944309869e-05,
"loss": 1.5482,
"step": 823
},
{
"epoch": 1.51889400921659,
"grad_norm": 0.27922290563583374,
"learning_rate": 1.7262172638060865e-05,
"loss": 1.6243,
"step": 824
},
{
"epoch": 1.52073732718894,
"grad_norm": 0.27905556559562683,
"learning_rate": 1.7230325401765415e-05,
"loss": 1.5902,
"step": 825
},
{
"epoch": 1.5225806451612902,
"grad_norm": 0.2822147607803345,
"learning_rate": 1.7198467881065292e-05,
"loss": 1.6411,
"step": 826
},
{
"epoch": 1.5244239631336405,
"grad_norm": 0.27511006593704224,
"learning_rate": 1.7166600222860876e-05,
"loss": 1.6384,
"step": 827
},
{
"epoch": 1.5262672811059907,
"grad_norm": 0.27862241864204407,
"learning_rate": 1.713472257409928e-05,
"loss": 1.5762,
"step": 828
},
{
"epoch": 1.528110599078341,
"grad_norm": 0.27879902720451355,
"learning_rate": 1.7102835081773686e-05,
"loss": 1.6238,
"step": 829
},
{
"epoch": 1.5299539170506913,
"grad_norm": 0.29454904794692993,
"learning_rate": 1.707093789292269e-05,
"loss": 1.6545,
"step": 830
},
{
"epoch": 1.5317972350230415,
"grad_norm": 0.2856805920600891,
"learning_rate": 1.7039031154629567e-05,
"loss": 1.5693,
"step": 831
},
{
"epoch": 1.5336405529953918,
"grad_norm": 0.27673226594924927,
"learning_rate": 1.700711501402164e-05,
"loss": 1.5427,
"step": 832
},
{
"epoch": 1.535483870967742,
"grad_norm": 0.29206371307373047,
"learning_rate": 1.6975189618269592e-05,
"loss": 1.6024,
"step": 833
},
{
"epoch": 1.5373271889400921,
"grad_norm": 0.2787723243236542,
"learning_rate": 1.6943255114586788e-05,
"loss": 1.5581,
"step": 834
},
{
"epoch": 1.5391705069124424,
"grad_norm": 0.27273157238960266,
"learning_rate": 1.6911311650228574e-05,
"loss": 1.5769,
"step": 835
},
{
"epoch": 1.5410138248847927,
"grad_norm": 0.2830120325088501,
"learning_rate": 1.687935937249163e-05,
"loss": 1.5915,
"step": 836
},
{
"epoch": 1.5428571428571427,
"grad_norm": 0.27038127183914185,
"learning_rate": 1.6847398428713256e-05,
"loss": 1.5609,
"step": 837
},
{
"epoch": 1.544700460829493,
"grad_norm": 0.2764512896537781,
"learning_rate": 1.681542896627075e-05,
"loss": 1.6441,
"step": 838
},
{
"epoch": 1.5465437788018432,
"grad_norm": 0.27623921632766724,
"learning_rate": 1.678345113258065e-05,
"loss": 1.6269,
"step": 839
},
{
"epoch": 1.5483870967741935,
"grad_norm": 0.26323872804641724,
"learning_rate": 1.6751465075098115e-05,
"loss": 1.5342,
"step": 840
},
{
"epoch": 1.5502304147465438,
"grad_norm": 0.27324774861335754,
"learning_rate": 1.6719470941316228e-05,
"loss": 1.6072,
"step": 841
},
{
"epoch": 1.552073732718894,
"grad_norm": 0.2702793776988983,
"learning_rate": 1.668746887876531e-05,
"loss": 1.5937,
"step": 842
},
{
"epoch": 1.5539170506912443,
"grad_norm": 0.2792288362979889,
"learning_rate": 1.6655459035012237e-05,
"loss": 1.5874,
"step": 843
},
{
"epoch": 1.5557603686635946,
"grad_norm": 0.2696288824081421,
"learning_rate": 1.662344155765977e-05,
"loss": 1.5788,
"step": 844
},
{
"epoch": 1.5576036866359448,
"grad_norm": 0.2754669189453125,
"learning_rate": 1.659141659434587e-05,
"loss": 1.6263,
"step": 845
},
{
"epoch": 1.5594470046082949,
"grad_norm": 0.2766124904155731,
"learning_rate": 1.655938429274302e-05,
"loss": 1.6164,
"step": 846
},
{
"epoch": 1.5612903225806452,
"grad_norm": 0.2690373361110687,
"learning_rate": 1.6527344800557534e-05,
"loss": 1.5735,
"step": 847
},
{
"epoch": 1.5631336405529954,
"grad_norm": 0.27754339575767517,
"learning_rate": 1.6495298265528883e-05,
"loss": 1.6258,
"step": 848
},
{
"epoch": 1.5649769585253455,
"grad_norm": 0.27151575684547424,
"learning_rate": 1.646324483542902e-05,
"loss": 1.6568,
"step": 849
},
{
"epoch": 1.5668202764976957,
"grad_norm": 0.28297892212867737,
"learning_rate": 1.64311846580617e-05,
"loss": 1.6342,
"step": 850
},
{
"epoch": 1.568663594470046,
"grad_norm": 0.27492207288742065,
"learning_rate": 1.639911788126177e-05,
"loss": 1.665,
"step": 851
},
{
"epoch": 1.5705069124423963,
"grad_norm": 0.2828580439090729,
"learning_rate": 1.6367044652894515e-05,
"loss": 1.5696,
"step": 852
},
{
"epoch": 1.5723502304147465,
"grad_norm": 0.2775357961654663,
"learning_rate": 1.6334965120854986e-05,
"loss": 1.6489,
"step": 853
},
{
"epoch": 1.5741935483870968,
"grad_norm": 0.28201982378959656,
"learning_rate": 1.6302879433067274e-05,
"loss": 1.6067,
"step": 854
},
{
"epoch": 1.576036866359447,
"grad_norm": 0.273805171251297,
"learning_rate": 1.6270787737483877e-05,
"loss": 1.636,
"step": 855
},
{
"epoch": 1.5778801843317973,
"grad_norm": 0.27378591895103455,
"learning_rate": 1.623869018208499e-05,
"loss": 1.5383,
"step": 856
},
{
"epoch": 1.5797235023041476,
"grad_norm": 0.2734542787075043,
"learning_rate": 1.6206586914877816e-05,
"loss": 1.5959,
"step": 857
},
{
"epoch": 1.5815668202764976,
"grad_norm": 0.2755284905433655,
"learning_rate": 1.6174478083895922e-05,
"loss": 1.5877,
"step": 858
},
{
"epoch": 1.583410138248848,
"grad_norm": 0.2930653393268585,
"learning_rate": 1.6142363837198504e-05,
"loss": 1.6763,
"step": 859
},
{
"epoch": 1.5852534562211982,
"grad_norm": 0.28103727102279663,
"learning_rate": 1.6110244322869746e-05,
"loss": 1.6154,
"step": 860
},
{
"epoch": 1.5870967741935482,
"grad_norm": 0.2758486568927765,
"learning_rate": 1.607811968901812e-05,
"loss": 1.5919,
"step": 861
},
{
"epoch": 1.5889400921658985,
"grad_norm": 0.27371156215667725,
"learning_rate": 1.6045990083775703e-05,
"loss": 1.6255,
"step": 862
},
{
"epoch": 1.5907834101382488,
"grad_norm": 0.2752973139286041,
"learning_rate": 1.6013855655297498e-05,
"loss": 1.6017,
"step": 863
},
{
"epoch": 1.592626728110599,
"grad_norm": 0.26971954107284546,
"learning_rate": 1.5981716551760735e-05,
"loss": 1.5115,
"step": 864
},
{
"epoch": 1.5944700460829493,
"grad_norm": 0.2720949053764343,
"learning_rate": 1.5949572921364226e-05,
"loss": 1.573,
"step": 865
},
{
"epoch": 1.5963133640552996,
"grad_norm": 0.2740275263786316,
"learning_rate": 1.5917424912327644e-05,
"loss": 1.6024,
"step": 866
},
{
"epoch": 1.5981566820276498,
"grad_norm": 0.2787199318408966,
"learning_rate": 1.5885272672890842e-05,
"loss": 1.6263,
"step": 867
},
{
"epoch": 1.6,
"grad_norm": 0.2864430248737335,
"learning_rate": 1.58531163513132e-05,
"loss": 1.6032,
"step": 868
},
{
"epoch": 1.6018433179723504,
"grad_norm": 0.26860958337783813,
"learning_rate": 1.5820956095872914e-05,
"loss": 1.6096,
"step": 869
},
{
"epoch": 1.6036866359447006,
"grad_norm": 0.27173295617103577,
"learning_rate": 1.5788792054866314e-05,
"loss": 1.5589,
"step": 870
},
{
"epoch": 1.6055299539170507,
"grad_norm": 0.27752557396888733,
"learning_rate": 1.5756624376607193e-05,
"loss": 1.5585,
"step": 871
},
{
"epoch": 1.607373271889401,
"grad_norm": 0.2819308042526245,
"learning_rate": 1.5724453209426108e-05,
"loss": 1.578,
"step": 872
},
{
"epoch": 1.6092165898617512,
"grad_norm": 0.2847549617290497,
"learning_rate": 1.5692278701669712e-05,
"loss": 1.6011,
"step": 873
},
{
"epoch": 1.6110599078341012,
"grad_norm": 0.27277764678001404,
"learning_rate": 1.566010100170007e-05,
"loss": 1.571,
"step": 874
},
{
"epoch": 1.6129032258064515,
"grad_norm": 0.2827809751033783,
"learning_rate": 1.5627920257893934e-05,
"loss": 1.5961,
"step": 875
},
{
"epoch": 1.6147465437788018,
"grad_norm": 0.28699159622192383,
"learning_rate": 1.5595736618642126e-05,
"loss": 1.6229,
"step": 876
},
{
"epoch": 1.616589861751152,
"grad_norm": 0.2775685787200928,
"learning_rate": 1.5563550232348813e-05,
"loss": 1.5469,
"step": 877
},
{
"epoch": 1.6184331797235023,
"grad_norm": 0.2769649922847748,
"learning_rate": 1.553136124743081e-05,
"loss": 1.591,
"step": 878
},
{
"epoch": 1.6202764976958526,
"grad_norm": 0.26347190141677856,
"learning_rate": 1.5499169812316937e-05,
"loss": 1.5349,
"step": 879
},
{
"epoch": 1.6221198156682028,
"grad_norm": 0.2721926271915436,
"learning_rate": 1.5466976075447295e-05,
"loss": 1.5831,
"step": 880
},
{
"epoch": 1.6239631336405531,
"grad_norm": 0.2716809809207916,
"learning_rate": 1.5434780185272616e-05,
"loss": 1.5148,
"step": 881
},
{
"epoch": 1.6258064516129034,
"grad_norm": 0.2808883786201477,
"learning_rate": 1.5402582290253547e-05,
"loss": 1.57,
"step": 882
},
{
"epoch": 1.6276497695852534,
"grad_norm": 0.2672334313392639,
"learning_rate": 1.537038253885998e-05,
"loss": 1.5688,
"step": 883
},
{
"epoch": 1.6294930875576037,
"grad_norm": 0.27970781922340393,
"learning_rate": 1.533818107957038e-05,
"loss": 1.5723,
"step": 884
},
{
"epoch": 1.631336405529954,
"grad_norm": 0.2877601087093353,
"learning_rate": 1.5305978060871083e-05,
"loss": 1.6315,
"step": 885
},
{
"epoch": 1.633179723502304,
"grad_norm": 0.2766900658607483,
"learning_rate": 1.5273773631255602e-05,
"loss": 1.632,
"step": 886
},
{
"epoch": 1.6350230414746543,
"grad_norm": 0.26466235518455505,
"learning_rate": 1.524156793922396e-05,
"loss": 1.546,
"step": 887
},
{
"epoch": 1.6368663594470045,
"grad_norm": 0.27303576469421387,
"learning_rate": 1.5209361133282022e-05,
"loss": 1.574,
"step": 888
},
{
"epoch": 1.6387096774193548,
"grad_norm": 0.28713706135749817,
"learning_rate": 1.517715336194077e-05,
"loss": 1.6679,
"step": 889
},
{
"epoch": 1.640552995391705,
"grad_norm": 0.271852046251297,
"learning_rate": 1.5144944773715635e-05,
"loss": 1.5074,
"step": 890
},
{
"epoch": 1.6423963133640553,
"grad_norm": 0.2695780098438263,
"learning_rate": 1.511273551712583e-05,
"loss": 1.5874,
"step": 891
},
{
"epoch": 1.6442396313364056,
"grad_norm": 0.27548617124557495,
"learning_rate": 1.5080525740693635e-05,
"loss": 1.5366,
"step": 892
},
{
"epoch": 1.6460829493087559,
"grad_norm": 0.27405300736427307,
"learning_rate": 1.5048315592943743e-05,
"loss": 1.6149,
"step": 893
},
{
"epoch": 1.6479262672811061,
"grad_norm": 0.2837236821651459,
"learning_rate": 1.5016105222402546e-05,
"loss": 1.574,
"step": 894
},
{
"epoch": 1.6497695852534562,
"grad_norm": 0.274830162525177,
"learning_rate": 1.4983894777597461e-05,
"loss": 1.6566,
"step": 895
},
{
"epoch": 1.6516129032258065,
"grad_norm": 0.2712487578392029,
"learning_rate": 1.495168440705626e-05,
"loss": 1.5382,
"step": 896
},
{
"epoch": 1.6534562211981567,
"grad_norm": 0.26845115423202515,
"learning_rate": 1.4919474259306362e-05,
"loss": 1.5384,
"step": 897
},
{
"epoch": 1.6552995391705068,
"grad_norm": 0.2784380316734314,
"learning_rate": 1.4887264482874173e-05,
"loss": 1.5575,
"step": 898
},
{
"epoch": 1.657142857142857,
"grad_norm": 0.2723720967769623,
"learning_rate": 1.4855055226284367e-05,
"loss": 1.5714,
"step": 899
},
{
"epoch": 1.6589861751152073,
"grad_norm": 0.2713598608970642,
"learning_rate": 1.4822846638059234e-05,
"loss": 1.5896,
"step": 900
},
{
"epoch": 1.6608294930875576,
"grad_norm": 0.28683602809906006,
"learning_rate": 1.4790638866717984e-05,
"loss": 1.6283,
"step": 901
},
{
"epoch": 1.6626728110599078,
"grad_norm": 0.2853665053844452,
"learning_rate": 1.4758432060776044e-05,
"loss": 1.5921,
"step": 902
},
{
"epoch": 1.664516129032258,
"grad_norm": 0.279269278049469,
"learning_rate": 1.4726226368744404e-05,
"loss": 1.595,
"step": 903
},
{
"epoch": 1.6663594470046084,
"grad_norm": 0.2805596590042114,
"learning_rate": 1.4694021939128925e-05,
"loss": 1.589,
"step": 904
},
{
"epoch": 1.6682027649769586,
"grad_norm": 0.2930944859981537,
"learning_rate": 1.466181892042962e-05,
"loss": 1.6052,
"step": 905
},
{
"epoch": 1.670046082949309,
"grad_norm": 0.27433332800865173,
"learning_rate": 1.462961746114002e-05,
"loss": 1.6106,
"step": 906
},
{
"epoch": 1.671889400921659,
"grad_norm": 0.29066669940948486,
"learning_rate": 1.4597417709746454e-05,
"loss": 1.6252,
"step": 907
},
{
"epoch": 1.6737327188940092,
"grad_norm": 0.2851257026195526,
"learning_rate": 1.4565219814727388e-05,
"loss": 1.5807,
"step": 908
},
{
"epoch": 1.6755760368663595,
"grad_norm": 0.2750568389892578,
"learning_rate": 1.4533023924552706e-05,
"loss": 1.5701,
"step": 909
},
{
"epoch": 1.6774193548387095,
"grad_norm": 0.28285452723503113,
"learning_rate": 1.4500830187683066e-05,
"loss": 1.6027,
"step": 910
},
{
"epoch": 1.6792626728110598,
"grad_norm": 0.28591400384902954,
"learning_rate": 1.4468638752569193e-05,
"loss": 1.6226,
"step": 911
},
{
"epoch": 1.68110599078341,
"grad_norm": 0.279861718416214,
"learning_rate": 1.4436449767651191e-05,
"loss": 1.5525,
"step": 912
},
{
"epoch": 1.6829493087557603,
"grad_norm": 0.26922115683555603,
"learning_rate": 1.4404263381357873e-05,
"loss": 1.5962,
"step": 913
},
{
"epoch": 1.6847926267281106,
"grad_norm": 0.2725696265697479,
"learning_rate": 1.437207974210607e-05,
"loss": 1.5739,
"step": 914
},
{
"epoch": 1.6866359447004609,
"grad_norm": 0.280691921710968,
"learning_rate": 1.4339898998299936e-05,
"loss": 1.6128,
"step": 915
},
{
"epoch": 1.6884792626728111,
"grad_norm": 0.27789539098739624,
"learning_rate": 1.4307721298330284e-05,
"loss": 1.5863,
"step": 916
},
{
"epoch": 1.6903225806451614,
"grad_norm": 0.2718709707260132,
"learning_rate": 1.4275546790573895e-05,
"loss": 1.5724,
"step": 917
},
{
"epoch": 1.6921658986175117,
"grad_norm": 0.2684226930141449,
"learning_rate": 1.4243375623392808e-05,
"loss": 1.5473,
"step": 918
},
{
"epoch": 1.6940092165898617,
"grad_norm": 0.28611379861831665,
"learning_rate": 1.4211207945133685e-05,
"loss": 1.6016,
"step": 919
},
{
"epoch": 1.695852534562212,
"grad_norm": 0.28246620297431946,
"learning_rate": 1.417904390412709e-05,
"loss": 1.5635,
"step": 920
},
{
"epoch": 1.6976958525345622,
"grad_norm": 0.2792202830314636,
"learning_rate": 1.41468836486868e-05,
"loss": 1.5302,
"step": 921
},
{
"epoch": 1.6995391705069123,
"grad_norm": 0.2743517756462097,
"learning_rate": 1.411472732710916e-05,
"loss": 1.5837,
"step": 922
},
{
"epoch": 1.7013824884792625,
"grad_norm": 0.28153350949287415,
"learning_rate": 1.4082575087672363e-05,
"loss": 1.6095,
"step": 923
},
{
"epoch": 1.7032258064516128,
"grad_norm": 0.30301469564437866,
"learning_rate": 1.4050427078635777e-05,
"loss": 1.5882,
"step": 924
},
{
"epoch": 1.705069124423963,
"grad_norm": 0.27385833859443665,
"learning_rate": 1.4018283448239266e-05,
"loss": 1.5997,
"step": 925
},
{
"epoch": 1.7069124423963133,
"grad_norm": 0.2762070298194885,
"learning_rate": 1.398614434470251e-05,
"loss": 1.5694,
"step": 926
},
{
"epoch": 1.7087557603686636,
"grad_norm": 0.27967438101768494,
"learning_rate": 1.3954009916224299e-05,
"loss": 1.6191,
"step": 927
},
{
"epoch": 1.7105990783410139,
"grad_norm": 0.2745825946331024,
"learning_rate": 1.3921880310981878e-05,
"loss": 1.5921,
"step": 928
},
{
"epoch": 1.7124423963133641,
"grad_norm": 0.2806471586227417,
"learning_rate": 1.3889755677130253e-05,
"loss": 1.6333,
"step": 929
},
{
"epoch": 1.7142857142857144,
"grad_norm": 0.28569644689559937,
"learning_rate": 1.3857636162801499e-05,
"loss": 1.6297,
"step": 930
},
{
"epoch": 1.7161290322580647,
"grad_norm": 0.27753588557243347,
"learning_rate": 1.3825521916104082e-05,
"loss": 1.5691,
"step": 931
},
{
"epoch": 1.7179723502304147,
"grad_norm": 0.2785869538784027,
"learning_rate": 1.3793413085122183e-05,
"loss": 1.6432,
"step": 932
},
{
"epoch": 1.719815668202765,
"grad_norm": 0.2777843773365021,
"learning_rate": 1.3761309817915017e-05,
"loss": 1.6167,
"step": 933
},
{
"epoch": 1.7216589861751153,
"grad_norm": 0.2760448157787323,
"learning_rate": 1.3729212262516124e-05,
"loss": 1.6157,
"step": 934
},
{
"epoch": 1.7235023041474653,
"grad_norm": 0.2856210172176361,
"learning_rate": 1.3697120566932727e-05,
"loss": 1.5703,
"step": 935
},
{
"epoch": 1.7253456221198156,
"grad_norm": 0.27198657393455505,
"learning_rate": 1.3665034879145022e-05,
"loss": 1.6183,
"step": 936
},
{
"epoch": 1.7271889400921658,
"grad_norm": 0.2798546254634857,
"learning_rate": 1.3632955347105487e-05,
"loss": 1.6312,
"step": 937
},
{
"epoch": 1.729032258064516,
"grad_norm": 0.2872222065925598,
"learning_rate": 1.3600882118738232e-05,
"loss": 1.6336,
"step": 938
},
{
"epoch": 1.7308755760368664,
"grad_norm": 0.2799721360206604,
"learning_rate": 1.3568815341938303e-05,
"loss": 1.6183,
"step": 939
},
{
"epoch": 1.7327188940092166,
"grad_norm": 0.2797256410121918,
"learning_rate": 1.3536755164570977e-05,
"loss": 1.6386,
"step": 940
},
{
"epoch": 1.734562211981567,
"grad_norm": 0.2698177099227905,
"learning_rate": 1.3504701734471117e-05,
"loss": 1.5957,
"step": 941
},
{
"epoch": 1.7364055299539172,
"grad_norm": 0.2770727872848511,
"learning_rate": 1.3472655199442473e-05,
"loss": 1.5977,
"step": 942
},
{
"epoch": 1.7382488479262674,
"grad_norm": 0.2778874635696411,
"learning_rate": 1.3440615707256984e-05,
"loss": 1.5497,
"step": 943
},
{
"epoch": 1.7400921658986175,
"grad_norm": 0.27074167132377625,
"learning_rate": 1.340858340565413e-05,
"loss": 1.5602,
"step": 944
},
{
"epoch": 1.7419354838709677,
"grad_norm": 0.2706243395805359,
"learning_rate": 1.3376558442340233e-05,
"loss": 1.5655,
"step": 945
},
{
"epoch": 1.743778801843318,
"grad_norm": 0.27816662192344666,
"learning_rate": 1.3344540964987766e-05,
"loss": 1.5792,
"step": 946
},
{
"epoch": 1.745622119815668,
"grad_norm": 0.27811580896377563,
"learning_rate": 1.331253112123469e-05,
"loss": 1.6535,
"step": 947
},
{
"epoch": 1.7474654377880183,
"grad_norm": 0.28679028153419495,
"learning_rate": 1.3280529058683778e-05,
"loss": 1.5837,
"step": 948
},
{
"epoch": 1.7493087557603686,
"grad_norm": 0.2821553647518158,
"learning_rate": 1.3248534924901887e-05,
"loss": 1.5671,
"step": 949
},
{
"epoch": 1.7511520737327189,
"grad_norm": 0.27457195520401,
"learning_rate": 1.3216548867419352e-05,
"loss": 1.6098,
"step": 950
},
{
"epoch": 1.7529953917050691,
"grad_norm": 0.27999773621559143,
"learning_rate": 1.3184571033729253e-05,
"loss": 1.5503,
"step": 951
},
{
"epoch": 1.7548387096774194,
"grad_norm": 0.27848124504089355,
"learning_rate": 1.3152601571286746e-05,
"loss": 1.5739,
"step": 952
},
{
"epoch": 1.7566820276497697,
"grad_norm": 0.2803337275981903,
"learning_rate": 1.3120640627508376e-05,
"loss": 1.5847,
"step": 953
},
{
"epoch": 1.75852534562212,
"grad_norm": 0.27653852105140686,
"learning_rate": 1.3088688349771425e-05,
"loss": 1.6444,
"step": 954
},
{
"epoch": 1.7603686635944702,
"grad_norm": 0.27364403009414673,
"learning_rate": 1.3056744885413216e-05,
"loss": 1.603,
"step": 955
},
{
"epoch": 1.7622119815668202,
"grad_norm": 0.286454439163208,
"learning_rate": 1.3024810381730409e-05,
"loss": 1.6084,
"step": 956
},
{
"epoch": 1.7640552995391705,
"grad_norm": 0.27595254778862,
"learning_rate": 1.2992884985978363e-05,
"loss": 1.6451,
"step": 957
},
{
"epoch": 1.7658986175115208,
"grad_norm": 0.27956917881965637,
"learning_rate": 1.2960968845370443e-05,
"loss": 1.5732,
"step": 958
},
{
"epoch": 1.7677419354838708,
"grad_norm": 0.2734554708003998,
"learning_rate": 1.2929062107077315e-05,
"loss": 1.5397,
"step": 959
},
{
"epoch": 1.769585253456221,
"grad_norm": 0.27924489974975586,
"learning_rate": 1.2897164918226311e-05,
"loss": 1.552,
"step": 960
},
{
"epoch": 1.7714285714285714,
"grad_norm": 0.26942330598831177,
"learning_rate": 1.2865277425900725e-05,
"loss": 1.5747,
"step": 961
},
{
"epoch": 1.7732718894009216,
"grad_norm": 0.2752183675765991,
"learning_rate": 1.2833399777139128e-05,
"loss": 1.5018,
"step": 962
},
{
"epoch": 1.7751152073732719,
"grad_norm": 0.29580366611480713,
"learning_rate": 1.2801532118934708e-05,
"loss": 1.5727,
"step": 963
},
{
"epoch": 1.7769585253456222,
"grad_norm": 0.2765806019306183,
"learning_rate": 1.276967459823459e-05,
"loss": 1.5706,
"step": 964
},
{
"epoch": 1.7788018433179724,
"grad_norm": 0.2774519622325897,
"learning_rate": 1.273782736193914e-05,
"loss": 1.6598,
"step": 965
},
{
"epoch": 1.7806451612903227,
"grad_norm": 0.27766409516334534,
"learning_rate": 1.2705990556901311e-05,
"loss": 1.584,
"step": 966
},
{
"epoch": 1.782488479262673,
"grad_norm": 0.2720947265625,
"learning_rate": 1.2674164329925961e-05,
"loss": 1.5988,
"step": 967
},
{
"epoch": 1.784331797235023,
"grad_norm": 0.2917366623878479,
"learning_rate": 1.2642348827769152e-05,
"loss": 1.5834,
"step": 968
},
{
"epoch": 1.7861751152073733,
"grad_norm": 0.2705937623977661,
"learning_rate": 1.2610544197137502e-05,
"loss": 1.5643,
"step": 969
},
{
"epoch": 1.7880184331797235,
"grad_norm": 0.2769400477409363,
"learning_rate": 1.257875058468751e-05,
"loss": 1.6284,
"step": 970
},
{
"epoch": 1.7898617511520736,
"grad_norm": 0.28370919823646545,
"learning_rate": 1.2546968137024856e-05,
"loss": 1.6223,
"step": 971
},
{
"epoch": 1.7917050691244238,
"grad_norm": 0.27669093012809753,
"learning_rate": 1.251519700070373e-05,
"loss": 1.5396,
"step": 972
},
{
"epoch": 1.793548387096774,
"grad_norm": 0.272204726934433,
"learning_rate": 1.2483437322226178e-05,
"loss": 1.5131,
"step": 973
},
{
"epoch": 1.7953917050691244,
"grad_norm": 0.27512574195861816,
"learning_rate": 1.2451689248041416e-05,
"loss": 1.6107,
"step": 974
},
{
"epoch": 1.7972350230414746,
"grad_norm": 0.2767089307308197,
"learning_rate": 1.2419952924545125e-05,
"loss": 1.5571,
"step": 975
},
{
"epoch": 1.799078341013825,
"grad_norm": 0.2801191806793213,
"learning_rate": 1.2388228498078827e-05,
"loss": 1.6405,
"step": 976
},
{
"epoch": 1.8009216589861752,
"grad_norm": 0.27706649899482727,
"learning_rate": 1.2356516114929176e-05,
"loss": 1.6042,
"step": 977
},
{
"epoch": 1.8027649769585254,
"grad_norm": 0.2814716100692749,
"learning_rate": 1.2324815921327288e-05,
"loss": 1.6023,
"step": 978
},
{
"epoch": 1.8046082949308757,
"grad_norm": 0.2865283191204071,
"learning_rate": 1.2293128063448078e-05,
"loss": 1.5884,
"step": 979
},
{
"epoch": 1.8064516129032258,
"grad_norm": 0.26941248774528503,
"learning_rate": 1.2261452687409576e-05,
"loss": 1.603,
"step": 980
},
{
"epoch": 1.808294930875576,
"grad_norm": 0.2709951400756836,
"learning_rate": 1.2229789939272253e-05,
"loss": 1.5548,
"step": 981
},
{
"epoch": 1.8101382488479263,
"grad_norm": 0.28433525562286377,
"learning_rate": 1.2198139965038356e-05,
"loss": 1.6292,
"step": 982
},
{
"epoch": 1.8119815668202763,
"grad_norm": 0.2793513536453247,
"learning_rate": 1.2166502910651232e-05,
"loss": 1.6037,
"step": 983
},
{
"epoch": 1.8138248847926266,
"grad_norm": 0.2772260308265686,
"learning_rate": 1.2134878921994634e-05,
"loss": 1.5707,
"step": 984
},
{
"epoch": 1.8156682027649769,
"grad_norm": 0.2733345031738281,
"learning_rate": 1.210326814489209e-05,
"loss": 1.5857,
"step": 985
},
{
"epoch": 1.8175115207373271,
"grad_norm": 0.27603036165237427,
"learning_rate": 1.2071670725106203e-05,
"loss": 1.533,
"step": 986
},
{
"epoch": 1.8193548387096774,
"grad_norm": 0.282755047082901,
"learning_rate": 1.2040086808337965e-05,
"loss": 1.5974,
"step": 987
},
{
"epoch": 1.8211981566820277,
"grad_norm": 0.2730749249458313,
"learning_rate": 1.2008516540226115e-05,
"loss": 1.5904,
"step": 988
},
{
"epoch": 1.823041474654378,
"grad_norm": 0.28955116868019104,
"learning_rate": 1.1976960066346474e-05,
"loss": 1.5456,
"step": 989
},
{
"epoch": 1.8248847926267282,
"grad_norm": 0.2911273241043091,
"learning_rate": 1.194541753221122e-05,
"loss": 1.5873,
"step": 990
},
{
"epoch": 1.8267281105990785,
"grad_norm": 0.2708721458911896,
"learning_rate": 1.1913889083268278e-05,
"loss": 1.517,
"step": 991
},
{
"epoch": 1.8285714285714287,
"grad_norm": 0.27571648359298706,
"learning_rate": 1.1882374864900616e-05,
"loss": 1.5257,
"step": 992
},
{
"epoch": 1.8304147465437788,
"grad_norm": 0.2964298725128174,
"learning_rate": 1.1850875022425587e-05,
"loss": 1.5693,
"step": 993
},
{
"epoch": 1.832258064516129,
"grad_norm": 0.2749471664428711,
"learning_rate": 1.1819389701094241e-05,
"loss": 1.5939,
"step": 994
},
{
"epoch": 1.8341013824884793,
"grad_norm": 0.27718687057495117,
"learning_rate": 1.1787919046090686e-05,
"loss": 1.501,
"step": 995
},
{
"epoch": 1.8359447004608294,
"grad_norm": 0.2840654253959656,
"learning_rate": 1.1756463202531392e-05,
"loss": 1.6314,
"step": 996
},
{
"epoch": 1.8377880184331796,
"grad_norm": 0.27605095505714417,
"learning_rate": 1.1725022315464528e-05,
"loss": 1.5555,
"step": 997
},
{
"epoch": 1.83963133640553,
"grad_norm": 0.30078697204589844,
"learning_rate": 1.16935965298693e-05,
"loss": 1.6408,
"step": 998
},
{
"epoch": 1.8414746543778802,
"grad_norm": 0.2889043092727661,
"learning_rate": 1.1662185990655285e-05,
"loss": 1.5627,
"step": 999
},
{
"epoch": 1.8433179723502304,
"grad_norm": 0.2852741777896881,
"learning_rate": 1.1630790842661742e-05,
"loss": 1.5737,
"step": 1000
}
],
"logging_steps": 1.0,
"max_steps": 1626,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.2068108629966848e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}