f1_avg_all / trainer_state.json
sedrickkeh's picture
End of training
f19fcbd verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.971537001897533,
"eval_steps": 500,
"global_step": 655,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007590132827324478,
"grad_norm": 5.824297945739171,
"learning_rate": 1.2121212121212122e-06,
"loss": 0.9218,
"step": 1
},
{
"epoch": 0.015180265654648957,
"grad_norm": 5.892263786026164,
"learning_rate": 2.4242424242424244e-06,
"loss": 0.9293,
"step": 2
},
{
"epoch": 0.022770398481973434,
"grad_norm": 5.8114634272581736,
"learning_rate": 3.6363636363636366e-06,
"loss": 0.9311,
"step": 3
},
{
"epoch": 0.030360531309297913,
"grad_norm": 5.296649545877873,
"learning_rate": 4.848484848484849e-06,
"loss": 0.9125,
"step": 4
},
{
"epoch": 0.03795066413662239,
"grad_norm": 3.7809427076070765,
"learning_rate": 6.060606060606061e-06,
"loss": 0.8611,
"step": 5
},
{
"epoch": 0.04554079696394687,
"grad_norm": 2.1730858437477893,
"learning_rate": 7.272727272727273e-06,
"loss": 0.8592,
"step": 6
},
{
"epoch": 0.05313092979127135,
"grad_norm": 4.234904575535682,
"learning_rate": 8.484848484848486e-06,
"loss": 0.8607,
"step": 7
},
{
"epoch": 0.06072106261859583,
"grad_norm": 4.53949743914793,
"learning_rate": 9.696969696969698e-06,
"loss": 0.8652,
"step": 8
},
{
"epoch": 0.0683111954459203,
"grad_norm": 4.029238499849355,
"learning_rate": 1.0909090909090909e-05,
"loss": 0.8177,
"step": 9
},
{
"epoch": 0.07590132827324478,
"grad_norm": 3.9784894633891312,
"learning_rate": 1.2121212121212122e-05,
"loss": 0.8151,
"step": 10
},
{
"epoch": 0.08349146110056926,
"grad_norm": 2.6969063855035493,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.7815,
"step": 11
},
{
"epoch": 0.09108159392789374,
"grad_norm": 1.6447992531334745,
"learning_rate": 1.4545454545454546e-05,
"loss": 0.7618,
"step": 12
},
{
"epoch": 0.09867172675521822,
"grad_norm": 1.5209576084591174,
"learning_rate": 1.575757575757576e-05,
"loss": 0.7486,
"step": 13
},
{
"epoch": 0.1062618595825427,
"grad_norm": 1.2740153307577036,
"learning_rate": 1.6969696969696972e-05,
"loss": 0.7247,
"step": 14
},
{
"epoch": 0.11385199240986717,
"grad_norm": 0.9320600982322024,
"learning_rate": 1.8181818181818182e-05,
"loss": 0.711,
"step": 15
},
{
"epoch": 0.12144212523719165,
"grad_norm": 1.0702991390933831,
"learning_rate": 1.9393939393939395e-05,
"loss": 0.7028,
"step": 16
},
{
"epoch": 0.12903225806451613,
"grad_norm": 0.9459102023256077,
"learning_rate": 2.0606060606060608e-05,
"loss": 0.6918,
"step": 17
},
{
"epoch": 0.1366223908918406,
"grad_norm": 0.8949299902760269,
"learning_rate": 2.1818181818181818e-05,
"loss": 0.6782,
"step": 18
},
{
"epoch": 0.1442125237191651,
"grad_norm": 0.876771721092771,
"learning_rate": 2.3030303030303034e-05,
"loss": 0.6772,
"step": 19
},
{
"epoch": 0.15180265654648956,
"grad_norm": 1.0086928620416316,
"learning_rate": 2.4242424242424244e-05,
"loss": 0.6644,
"step": 20
},
{
"epoch": 0.15939278937381404,
"grad_norm": 0.9465491775161774,
"learning_rate": 2.5454545454545457e-05,
"loss": 0.6628,
"step": 21
},
{
"epoch": 0.16698292220113853,
"grad_norm": 0.8899418440526895,
"learning_rate": 2.6666666666666667e-05,
"loss": 0.6579,
"step": 22
},
{
"epoch": 0.174573055028463,
"grad_norm": 0.9194240562361043,
"learning_rate": 2.7878787878787883e-05,
"loss": 0.6488,
"step": 23
},
{
"epoch": 0.18216318785578747,
"grad_norm": 1.140269189956398,
"learning_rate": 2.9090909090909093e-05,
"loss": 0.6545,
"step": 24
},
{
"epoch": 0.18975332068311196,
"grad_norm": 1.2817416105473125,
"learning_rate": 3.0303030303030306e-05,
"loss": 0.6505,
"step": 25
},
{
"epoch": 0.19734345351043645,
"grad_norm": 0.6815058078206016,
"learning_rate": 3.151515151515152e-05,
"loss": 0.6317,
"step": 26
},
{
"epoch": 0.2049335863377609,
"grad_norm": 1.1874255778058744,
"learning_rate": 3.272727272727273e-05,
"loss": 0.6432,
"step": 27
},
{
"epoch": 0.2125237191650854,
"grad_norm": 0.9363859174853021,
"learning_rate": 3.3939393939393945e-05,
"loss": 0.6312,
"step": 28
},
{
"epoch": 0.22011385199240988,
"grad_norm": 0.8935811457744806,
"learning_rate": 3.515151515151515e-05,
"loss": 0.6285,
"step": 29
},
{
"epoch": 0.22770398481973433,
"grad_norm": 1.2762386300886945,
"learning_rate": 3.6363636363636364e-05,
"loss": 0.6275,
"step": 30
},
{
"epoch": 0.23529411764705882,
"grad_norm": 0.9210641452165423,
"learning_rate": 3.7575757575757584e-05,
"loss": 0.6264,
"step": 31
},
{
"epoch": 0.2428842504743833,
"grad_norm": 1.2440524082474191,
"learning_rate": 3.878787878787879e-05,
"loss": 0.6144,
"step": 32
},
{
"epoch": 0.2504743833017078,
"grad_norm": 1.3065985154977695,
"learning_rate": 4e-05,
"loss": 0.6141,
"step": 33
},
{
"epoch": 0.25806451612903225,
"grad_norm": 0.8172081904989663,
"learning_rate": 4.1212121212121216e-05,
"loss": 0.6092,
"step": 34
},
{
"epoch": 0.2656546489563567,
"grad_norm": 1.167931775101708,
"learning_rate": 4.242424242424242e-05,
"loss": 0.6134,
"step": 35
},
{
"epoch": 0.2732447817836812,
"grad_norm": 1.681922162159049,
"learning_rate": 4.3636363636363636e-05,
"loss": 0.6164,
"step": 36
},
{
"epoch": 0.2808349146110057,
"grad_norm": 1.1257832927645395,
"learning_rate": 4.484848484848485e-05,
"loss": 0.6011,
"step": 37
},
{
"epoch": 0.2884250474383302,
"grad_norm": 1.790614581178023,
"learning_rate": 4.606060606060607e-05,
"loss": 0.6094,
"step": 38
},
{
"epoch": 0.29601518026565465,
"grad_norm": 1.127806608018945,
"learning_rate": 4.727272727272728e-05,
"loss": 0.6011,
"step": 39
},
{
"epoch": 0.3036053130929791,
"grad_norm": 2.093380998039163,
"learning_rate": 4.848484848484849e-05,
"loss": 0.615,
"step": 40
},
{
"epoch": 0.3111954459203036,
"grad_norm": 0.9384468154974619,
"learning_rate": 4.96969696969697e-05,
"loss": 0.5974,
"step": 41
},
{
"epoch": 0.3187855787476281,
"grad_norm": 2.4981151616674686,
"learning_rate": 5.0909090909090914e-05,
"loss": 0.6002,
"step": 42
},
{
"epoch": 0.32637571157495254,
"grad_norm": 1.6534765579286679,
"learning_rate": 5.212121212121213e-05,
"loss": 0.6062,
"step": 43
},
{
"epoch": 0.33396584440227706,
"grad_norm": 2.4370056170762453,
"learning_rate": 5.333333333333333e-05,
"loss": 0.6068,
"step": 44
},
{
"epoch": 0.3415559772296015,
"grad_norm": 2.183022857065241,
"learning_rate": 5.4545454545454546e-05,
"loss": 0.5993,
"step": 45
},
{
"epoch": 0.349146110056926,
"grad_norm": 1.8264041908559345,
"learning_rate": 5.5757575757575766e-05,
"loss": 0.5967,
"step": 46
},
{
"epoch": 0.3567362428842505,
"grad_norm": 1.9185286131763832,
"learning_rate": 5.696969696969698e-05,
"loss": 0.6048,
"step": 47
},
{
"epoch": 0.36432637571157495,
"grad_norm": 1.5433175224735158,
"learning_rate": 5.8181818181818185e-05,
"loss": 0.5991,
"step": 48
},
{
"epoch": 0.3719165085388994,
"grad_norm": 1.6301636930901502,
"learning_rate": 5.93939393939394e-05,
"loss": 0.5973,
"step": 49
},
{
"epoch": 0.3795066413662239,
"grad_norm": 1.6154604740395921,
"learning_rate": 6.060606060606061e-05,
"loss": 0.5839,
"step": 50
},
{
"epoch": 0.3870967741935484,
"grad_norm": 1.5375798706049526,
"learning_rate": 6.181818181818182e-05,
"loss": 0.6014,
"step": 51
},
{
"epoch": 0.3946869070208729,
"grad_norm": 1.8926585193561105,
"learning_rate": 6.303030303030304e-05,
"loss": 0.5903,
"step": 52
},
{
"epoch": 0.40227703984819735,
"grad_norm": 0.9591201704735197,
"learning_rate": 6.424242424242424e-05,
"loss": 0.5787,
"step": 53
},
{
"epoch": 0.4098671726755218,
"grad_norm": 2.3504740289658144,
"learning_rate": 6.545454545454546e-05,
"loss": 0.5836,
"step": 54
},
{
"epoch": 0.4174573055028463,
"grad_norm": 1.9833219660676837,
"learning_rate": 6.666666666666667e-05,
"loss": 0.6021,
"step": 55
},
{
"epoch": 0.4250474383301708,
"grad_norm": 1.979773818430796,
"learning_rate": 6.787878787878789e-05,
"loss": 0.5745,
"step": 56
},
{
"epoch": 0.43263757115749524,
"grad_norm": 1.6918535634940701,
"learning_rate": 6.90909090909091e-05,
"loss": 0.5802,
"step": 57
},
{
"epoch": 0.44022770398481975,
"grad_norm": 1.896304739161675,
"learning_rate": 7.03030303030303e-05,
"loss": 0.5967,
"step": 58
},
{
"epoch": 0.4478178368121442,
"grad_norm": 1.7127150877307569,
"learning_rate": 7.151515151515152e-05,
"loss": 0.5873,
"step": 59
},
{
"epoch": 0.45540796963946867,
"grad_norm": 1.7288522680268184,
"learning_rate": 7.272727272727273e-05,
"loss": 0.5822,
"step": 60
},
{
"epoch": 0.4629981024667932,
"grad_norm": 2.4113594863743777,
"learning_rate": 7.393939393939395e-05,
"loss": 0.5892,
"step": 61
},
{
"epoch": 0.47058823529411764,
"grad_norm": 1.5610600634117355,
"learning_rate": 7.515151515151517e-05,
"loss": 0.5888,
"step": 62
},
{
"epoch": 0.4781783681214421,
"grad_norm": 1.554510024355238,
"learning_rate": 7.636363636363637e-05,
"loss": 0.5748,
"step": 63
},
{
"epoch": 0.4857685009487666,
"grad_norm": 1.4238723235068915,
"learning_rate": 7.757575757575758e-05,
"loss": 0.5752,
"step": 64
},
{
"epoch": 0.49335863377609107,
"grad_norm": 3.2737964188798,
"learning_rate": 7.87878787878788e-05,
"loss": 0.5991,
"step": 65
},
{
"epoch": 0.5009487666034156,
"grad_norm": 1.3673718679696243,
"learning_rate": 8e-05,
"loss": 0.587,
"step": 66
},
{
"epoch": 0.50853889943074,
"grad_norm": 3.10214817390346,
"learning_rate": 7.999943101853146e-05,
"loss": 0.5968,
"step": 67
},
{
"epoch": 0.5161290322580645,
"grad_norm": 2.4426856945858635,
"learning_rate": 7.999772409031277e-05,
"loss": 0.6063,
"step": 68
},
{
"epoch": 0.523719165085389,
"grad_norm": 2.384951983454804,
"learning_rate": 7.999487926390452e-05,
"loss": 0.5968,
"step": 69
},
{
"epoch": 0.5313092979127134,
"grad_norm": 2.470269943289222,
"learning_rate": 7.999089662023934e-05,
"loss": 0.5976,
"step": 70
},
{
"epoch": 0.538899430740038,
"grad_norm": 2.0615837527679926,
"learning_rate": 7.99857762726198e-05,
"loss": 0.5892,
"step": 71
},
{
"epoch": 0.5464895635673624,
"grad_norm": 1.4595469442640645,
"learning_rate": 7.997951836671498e-05,
"loss": 0.5763,
"step": 72
},
{
"epoch": 0.5540796963946869,
"grad_norm": 1.6686147644039993,
"learning_rate": 7.997212308055656e-05,
"loss": 0.5885,
"step": 73
},
{
"epoch": 0.5616698292220114,
"grad_norm": 1.1588798823385231,
"learning_rate": 7.996359062453354e-05,
"loss": 0.5816,
"step": 74
},
{
"epoch": 0.5692599620493358,
"grad_norm": 2.139844499195118,
"learning_rate": 7.995392124138642e-05,
"loss": 0.5815,
"step": 75
},
{
"epoch": 0.5768500948766604,
"grad_norm": 1.6540433397238854,
"learning_rate": 7.994311520620017e-05,
"loss": 0.5782,
"step": 76
},
{
"epoch": 0.5844402277039848,
"grad_norm": 1.04883299144272,
"learning_rate": 7.993117282639648e-05,
"loss": 0.5782,
"step": 77
},
{
"epoch": 0.5920303605313093,
"grad_norm": 2.724444333560736,
"learning_rate": 7.9918094441725e-05,
"loss": 0.5861,
"step": 78
},
{
"epoch": 0.5996204933586338,
"grad_norm": 1.8249890665939426,
"learning_rate": 7.990388042425367e-05,
"loss": 0.58,
"step": 79
},
{
"epoch": 0.6072106261859582,
"grad_norm": 2.602399399727078,
"learning_rate": 7.988853117835806e-05,
"loss": 0.5814,
"step": 80
},
{
"epoch": 0.6148007590132827,
"grad_norm": 1.5944678851416663,
"learning_rate": 7.987204714071006e-05,
"loss": 0.5826,
"step": 81
},
{
"epoch": 0.6223908918406073,
"grad_norm": 2.2610913780974546,
"learning_rate": 7.985442878026524e-05,
"loss": 0.5754,
"step": 82
},
{
"epoch": 0.6299810246679317,
"grad_norm": 1.7537341638428399,
"learning_rate": 7.983567659824962e-05,
"loss": 0.5845,
"step": 83
},
{
"epoch": 0.6375711574952562,
"grad_norm": 1.8121108815331453,
"learning_rate": 7.981579112814541e-05,
"loss": 0.585,
"step": 84
},
{
"epoch": 0.6451612903225806,
"grad_norm": 1.467756636378608,
"learning_rate": 7.97947729356758e-05,
"loss": 0.5777,
"step": 85
},
{
"epoch": 0.6527514231499051,
"grad_norm": 1.5365204832241453,
"learning_rate": 7.977262261878892e-05,
"loss": 0.5763,
"step": 86
},
{
"epoch": 0.6603415559772297,
"grad_norm": 1.4259830475580915,
"learning_rate": 7.974934080764075e-05,
"loss": 0.5662,
"step": 87
},
{
"epoch": 0.6679316888045541,
"grad_norm": 1.290860497369316,
"learning_rate": 7.972492816457723e-05,
"loss": 0.5627,
"step": 88
},
{
"epoch": 0.6755218216318786,
"grad_norm": 1.1578178204522984,
"learning_rate": 7.969938538411543e-05,
"loss": 0.5611,
"step": 89
},
{
"epoch": 0.683111954459203,
"grad_norm": 1.8928883460003019,
"learning_rate": 7.967271319292382e-05,
"loss": 0.5715,
"step": 90
},
{
"epoch": 0.6907020872865275,
"grad_norm": 1.5577040910573858,
"learning_rate": 7.96449123498015e-05,
"loss": 0.5712,
"step": 91
},
{
"epoch": 0.698292220113852,
"grad_norm": 1.064793253865779,
"learning_rate": 7.96159836456567e-05,
"loss": 0.5675,
"step": 92
},
{
"epoch": 0.7058823529411765,
"grad_norm": 2.0170128081260406,
"learning_rate": 7.958592790348425e-05,
"loss": 0.5755,
"step": 93
},
{
"epoch": 0.713472485768501,
"grad_norm": 1.3379111611740009,
"learning_rate": 7.955474597834217e-05,
"loss": 0.5604,
"step": 94
},
{
"epoch": 0.7210626185958254,
"grad_norm": 1.4656800007307322,
"learning_rate": 7.952243875732735e-05,
"loss": 0.5655,
"step": 95
},
{
"epoch": 0.7286527514231499,
"grad_norm": 1.2799455534799504,
"learning_rate": 7.948900715955025e-05,
"loss": 0.5629,
"step": 96
},
{
"epoch": 0.7362428842504743,
"grad_norm": 1.6331551992017197,
"learning_rate": 7.94544521361089e-05,
"loss": 0.5589,
"step": 97
},
{
"epoch": 0.7438330170777988,
"grad_norm": 1.8686747850955692,
"learning_rate": 7.941877467006168e-05,
"loss": 0.5644,
"step": 98
},
{
"epoch": 0.7514231499051234,
"grad_norm": 1.1116521915214885,
"learning_rate": 7.938197577639942e-05,
"loss": 0.5559,
"step": 99
},
{
"epoch": 0.7590132827324478,
"grad_norm": 1.5062245938638401,
"learning_rate": 7.934405650201658e-05,
"loss": 0.5723,
"step": 100
},
{
"epoch": 0.7666034155597723,
"grad_norm": 1.1108744133424633,
"learning_rate": 7.930501792568138e-05,
"loss": 0.5545,
"step": 101
},
{
"epoch": 0.7741935483870968,
"grad_norm": 1.5427714103721983,
"learning_rate": 7.926486115800511e-05,
"loss": 0.556,
"step": 102
},
{
"epoch": 0.7817836812144212,
"grad_norm": 1.764775365031586,
"learning_rate": 7.922358734141064e-05,
"loss": 0.5596,
"step": 103
},
{
"epoch": 0.7893738140417458,
"grad_norm": 1.2296630078252206,
"learning_rate": 7.918119765009979e-05,
"loss": 0.5598,
"step": 104
},
{
"epoch": 0.7969639468690702,
"grad_norm": 1.2833682166627998,
"learning_rate": 7.913769329002e-05,
"loss": 0.5489,
"step": 105
},
{
"epoch": 0.8045540796963947,
"grad_norm": 1.1872477219429831,
"learning_rate": 7.909307549883002e-05,
"loss": 0.5646,
"step": 106
},
{
"epoch": 0.8121442125237192,
"grad_norm": 1.820761375614486,
"learning_rate": 7.904734554586464e-05,
"loss": 0.5556,
"step": 107
},
{
"epoch": 0.8197343453510436,
"grad_norm": 1.1423898687342118,
"learning_rate": 7.900050473209868e-05,
"loss": 0.5483,
"step": 108
},
{
"epoch": 0.8273244781783681,
"grad_norm": 1.476252579811037,
"learning_rate": 7.895255439010987e-05,
"loss": 0.5479,
"step": 109
},
{
"epoch": 0.8349146110056926,
"grad_norm": 1.3278512325760372,
"learning_rate": 7.890349588404102e-05,
"loss": 0.5499,
"step": 110
},
{
"epoch": 0.8425047438330171,
"grad_norm": 0.8671841713875902,
"learning_rate": 7.885333060956117e-05,
"loss": 0.5571,
"step": 111
},
{
"epoch": 0.8500948766603416,
"grad_norm": 1.0738508848999515,
"learning_rate": 7.88020599938259e-05,
"loss": 0.5449,
"step": 112
},
{
"epoch": 0.857685009487666,
"grad_norm": 1.7715748163298473,
"learning_rate": 7.87496854954367e-05,
"loss": 0.5491,
"step": 113
},
{
"epoch": 0.8652751423149905,
"grad_norm": 1.0525784243440264,
"learning_rate": 7.869620860439956e-05,
"loss": 0.543,
"step": 114
},
{
"epoch": 0.872865275142315,
"grad_norm": 2.0621859992760427,
"learning_rate": 7.864163084208245e-05,
"loss": 0.5622,
"step": 115
},
{
"epoch": 0.8804554079696395,
"grad_norm": 1.363047653212627,
"learning_rate": 7.858595376117214e-05,
"loss": 0.5515,
"step": 116
},
{
"epoch": 0.888045540796964,
"grad_norm": 1.7242002751506365,
"learning_rate": 7.852917894563e-05,
"loss": 0.5599,
"step": 117
},
{
"epoch": 0.8956356736242884,
"grad_norm": 1.4061990696892013,
"learning_rate": 7.847130801064694e-05,
"loss": 0.5605,
"step": 118
},
{
"epoch": 0.9032258064516129,
"grad_norm": 1.7767323380908933,
"learning_rate": 7.84123426025974e-05,
"loss": 0.5494,
"step": 119
},
{
"epoch": 0.9108159392789373,
"grad_norm": 1.1684328222434068,
"learning_rate": 7.835228439899264e-05,
"loss": 0.546,
"step": 120
},
{
"epoch": 0.9184060721062619,
"grad_norm": 1.9834381552810127,
"learning_rate": 7.829113510843288e-05,
"loss": 0.5551,
"step": 121
},
{
"epoch": 0.9259962049335864,
"grad_norm": 1.4942107378630478,
"learning_rate": 7.82288964705588e-05,
"loss": 0.5454,
"step": 122
},
{
"epoch": 0.9335863377609108,
"grad_norm": 1.631303090634789,
"learning_rate": 7.816557025600196e-05,
"loss": 0.5403,
"step": 123
},
{
"epoch": 0.9411764705882353,
"grad_norm": 1.2779932620673164,
"learning_rate": 7.81011582663345e-05,
"loss": 0.5551,
"step": 124
},
{
"epoch": 0.9487666034155597,
"grad_norm": 0.826316123440516,
"learning_rate": 7.803566233401784e-05,
"loss": 0.5468,
"step": 125
},
{
"epoch": 0.9563567362428842,
"grad_norm": 1.5355038345605292,
"learning_rate": 7.796908432235056e-05,
"loss": 0.5588,
"step": 126
},
{
"epoch": 0.9639468690702088,
"grad_norm": 1.6053485472330935,
"learning_rate": 7.79014261254154e-05,
"loss": 0.5457,
"step": 127
},
{
"epoch": 0.9715370018975332,
"grad_norm": 0.8709812572017568,
"learning_rate": 7.783268966802539e-05,
"loss": 0.5482,
"step": 128
},
{
"epoch": 0.9791271347248577,
"grad_norm": 1.0328203561237506,
"learning_rate": 7.776287690566906e-05,
"loss": 0.5516,
"step": 129
},
{
"epoch": 0.9867172675521821,
"grad_norm": 1.421726756731164,
"learning_rate": 7.769198982445478e-05,
"loss": 0.5644,
"step": 130
},
{
"epoch": 0.9943074003795066,
"grad_norm": 0.9699818427155015,
"learning_rate": 7.762003044105435e-05,
"loss": 0.5333,
"step": 131
},
{
"epoch": 1.0018975332068312,
"grad_norm": 2.203324310322431,
"learning_rate": 7.754700080264554e-05,
"loss": 0.6801,
"step": 132
},
{
"epoch": 1.0094876660341556,
"grad_norm": 1.2850623970507653,
"learning_rate": 7.747290298685392e-05,
"loss": 0.5231,
"step": 133
},
{
"epoch": 1.01707779886148,
"grad_norm": 1.0733692629279126,
"learning_rate": 7.739773910169366e-05,
"loss": 0.526,
"step": 134
},
{
"epoch": 1.0246679316888045,
"grad_norm": 1.3517159638201317,
"learning_rate": 7.732151128550767e-05,
"loss": 0.5374,
"step": 135
},
{
"epoch": 1.032258064516129,
"grad_norm": 0.9043349347274219,
"learning_rate": 7.724422170690668e-05,
"loss": 0.5316,
"step": 136
},
{
"epoch": 1.0398481973434535,
"grad_norm": 1.2575116166876772,
"learning_rate": 7.716587256470759e-05,
"loss": 0.5264,
"step": 137
},
{
"epoch": 1.047438330170778,
"grad_norm": 1.151643956702767,
"learning_rate": 7.708646608787091e-05,
"loss": 0.5236,
"step": 138
},
{
"epoch": 1.0550284629981024,
"grad_norm": 1.1533411140892482,
"learning_rate": 7.700600453543731e-05,
"loss": 0.5327,
"step": 139
},
{
"epoch": 1.0626185958254268,
"grad_norm": 1.5703445128955635,
"learning_rate": 7.692449019646341e-05,
"loss": 0.5189,
"step": 140
},
{
"epoch": 1.0702087286527515,
"grad_norm": 1.503708861643817,
"learning_rate": 7.684192538995664e-05,
"loss": 0.5208,
"step": 141
},
{
"epoch": 1.077798861480076,
"grad_norm": 0.6891325431467323,
"learning_rate": 7.675831246480923e-05,
"loss": 0.5176,
"step": 142
},
{
"epoch": 1.0853889943074004,
"grad_norm": 1.862959746082954,
"learning_rate": 7.667365379973142e-05,
"loss": 0.519,
"step": 143
},
{
"epoch": 1.092979127134725,
"grad_norm": 0.9255777898780981,
"learning_rate": 7.658795180318381e-05,
"loss": 0.5306,
"step": 144
},
{
"epoch": 1.1005692599620494,
"grad_norm": 1.2860696781263434,
"learning_rate": 7.650120891330878e-05,
"loss": 0.5231,
"step": 145
},
{
"epoch": 1.1081593927893738,
"grad_norm": 0.9866085500546973,
"learning_rate": 7.641342759786116e-05,
"loss": 0.5134,
"step": 146
},
{
"epoch": 1.1157495256166983,
"grad_norm": 1.6012070200344108,
"learning_rate": 7.632461035413805e-05,
"loss": 0.5225,
"step": 147
},
{
"epoch": 1.1233396584440227,
"grad_norm": 1.0880689445644633,
"learning_rate": 7.623475970890775e-05,
"loss": 0.52,
"step": 148
},
{
"epoch": 1.1309297912713472,
"grad_norm": 1.0388918530802034,
"learning_rate": 7.614387821833786e-05,
"loss": 0.5234,
"step": 149
},
{
"epoch": 1.1385199240986716,
"grad_norm": 1.3834068969901858,
"learning_rate": 7.605196846792256e-05,
"loss": 0.52,
"step": 150
},
{
"epoch": 1.146110056925996,
"grad_norm": 1.0808645625405662,
"learning_rate": 7.59590330724091e-05,
"loss": 0.5199,
"step": 151
},
{
"epoch": 1.1537001897533208,
"grad_norm": 0.8748353485698048,
"learning_rate": 7.586507467572339e-05,
"loss": 0.5054,
"step": 152
},
{
"epoch": 1.1612903225806452,
"grad_norm": 0.9809721493446659,
"learning_rate": 7.577009595089472e-05,
"loss": 0.5156,
"step": 153
},
{
"epoch": 1.1688804554079697,
"grad_norm": 1.385065545391808,
"learning_rate": 7.567409959997984e-05,
"loss": 0.5125,
"step": 154
},
{
"epoch": 1.1764705882352942,
"grad_norm": 1.1835810733031538,
"learning_rate": 7.557708835398595e-05,
"loss": 0.5089,
"step": 155
},
{
"epoch": 1.1840607210626186,
"grad_norm": 1.0550638017889524,
"learning_rate": 7.547906497279315e-05,
"loss": 0.5085,
"step": 156
},
{
"epoch": 1.191650853889943,
"grad_norm": 1.0668629873488273,
"learning_rate": 7.538003224507579e-05,
"loss": 0.5151,
"step": 157
},
{
"epoch": 1.1992409867172675,
"grad_norm": 1.2773079106743754,
"learning_rate": 7.52799929882232e-05,
"loss": 0.5217,
"step": 158
},
{
"epoch": 1.206831119544592,
"grad_norm": 1.0653233150213854,
"learning_rate": 7.517895004825956e-05,
"loss": 0.5142,
"step": 159
},
{
"epoch": 1.2144212523719164,
"grad_norm": 1.1811879803660237,
"learning_rate": 7.507690629976291e-05,
"loss": 0.516,
"step": 160
},
{
"epoch": 1.222011385199241,
"grad_norm": 0.9358140704136899,
"learning_rate": 7.497386464578329e-05,
"loss": 0.5116,
"step": 161
},
{
"epoch": 1.2296015180265654,
"grad_norm": 1.236267972600389,
"learning_rate": 7.486982801776032e-05,
"loss": 0.5176,
"step": 162
},
{
"epoch": 1.23719165085389,
"grad_norm": 1.1810121004464773,
"learning_rate": 7.476479937543967e-05,
"loss": 0.5208,
"step": 163
},
{
"epoch": 1.2447817836812145,
"grad_norm": 1.0715306401128548,
"learning_rate": 7.465878170678887e-05,
"loss": 0.5149,
"step": 164
},
{
"epoch": 1.252371916508539,
"grad_norm": 1.4554615426026292,
"learning_rate": 7.455177802791237e-05,
"loss": 0.5176,
"step": 165
},
{
"epoch": 1.2599620493358634,
"grad_norm": 0.8300456250776146,
"learning_rate": 7.444379138296572e-05,
"loss": 0.5111,
"step": 166
},
{
"epoch": 1.2675521821631879,
"grad_norm": 0.8301260998594161,
"learning_rate": 7.433482484406887e-05,
"loss": 0.5149,
"step": 167
},
{
"epoch": 1.2751423149905123,
"grad_norm": 1.036861982897111,
"learning_rate": 7.42248815112189e-05,
"loss": 0.5074,
"step": 168
},
{
"epoch": 1.2827324478178368,
"grad_norm": 1.1061999056879284,
"learning_rate": 7.411396451220177e-05,
"loss": 0.5014,
"step": 169
},
{
"epoch": 1.2903225806451613,
"grad_norm": 1.3047827647572592,
"learning_rate": 7.400207700250333e-05,
"loss": 0.5144,
"step": 170
},
{
"epoch": 1.2979127134724857,
"grad_norm": 0.7526970536905354,
"learning_rate": 7.388922216521953e-05,
"loss": 0.5132,
"step": 171
},
{
"epoch": 1.3055028462998102,
"grad_norm": 0.7452267427677111,
"learning_rate": 7.377540321096595e-05,
"loss": 0.5022,
"step": 172
},
{
"epoch": 1.3130929791271346,
"grad_norm": 1.0513789114160723,
"learning_rate": 7.366062337778637e-05,
"loss": 0.5039,
"step": 173
},
{
"epoch": 1.3206831119544593,
"grad_norm": 1.3299701167289224,
"learning_rate": 7.354488593106068e-05,
"loss": 0.5039,
"step": 174
},
{
"epoch": 1.3282732447817835,
"grad_norm": 0.9881183562854784,
"learning_rate": 7.342819416341202e-05,
"loss": 0.5161,
"step": 175
},
{
"epoch": 1.3358633776091082,
"grad_norm": 1.3838355156124555,
"learning_rate": 7.331055139461305e-05,
"loss": 0.5128,
"step": 176
},
{
"epoch": 1.3434535104364327,
"grad_norm": 0.706807050794008,
"learning_rate": 7.319196097149153e-05,
"loss": 0.4995,
"step": 177
},
{
"epoch": 1.3510436432637571,
"grad_norm": 1.2072275318255294,
"learning_rate": 7.307242626783514e-05,
"loss": 0.5117,
"step": 178
},
{
"epoch": 1.3586337760910816,
"grad_norm": 0.8736304969731823,
"learning_rate": 7.295195068429539e-05,
"loss": 0.5093,
"step": 179
},
{
"epoch": 1.366223908918406,
"grad_norm": 1.118370322707032,
"learning_rate": 7.283053764829106e-05,
"loss": 0.513,
"step": 180
},
{
"epoch": 1.3738140417457305,
"grad_norm": 1.2165754217336513,
"learning_rate": 7.270819061391049e-05,
"loss": 0.5061,
"step": 181
},
{
"epoch": 1.381404174573055,
"grad_norm": 1.0662810244952639,
"learning_rate": 7.258491306181346e-05,
"loss": 0.5074,
"step": 182
},
{
"epoch": 1.3889943074003794,
"grad_norm": 1.550093405647991,
"learning_rate": 7.24607084991321e-05,
"loss": 0.5169,
"step": 183
},
{
"epoch": 1.396584440227704,
"grad_norm": 0.7232302048062569,
"learning_rate": 7.233558045937113e-05,
"loss": 0.5187,
"step": 184
},
{
"epoch": 1.4041745730550286,
"grad_norm": 1.3301692157689138,
"learning_rate": 7.220953250230733e-05,
"loss": 0.5101,
"step": 185
},
{
"epoch": 1.4117647058823528,
"grad_norm": 0.9469277615633731,
"learning_rate": 7.208256821388831e-05,
"loss": 0.5115,
"step": 186
},
{
"epoch": 1.4193548387096775,
"grad_norm": 1.461657389888908,
"learning_rate": 7.195469120613041e-05,
"loss": 0.518,
"step": 187
},
{
"epoch": 1.426944971537002,
"grad_norm": 0.7145042956694666,
"learning_rate": 7.182590511701604e-05,
"loss": 0.5002,
"step": 188
},
{
"epoch": 1.4345351043643264,
"grad_norm": 0.9602590784255072,
"learning_rate": 7.169621361039009e-05,
"loss": 0.4932,
"step": 189
},
{
"epoch": 1.4421252371916509,
"grad_norm": 0.9348247562699835,
"learning_rate": 7.156562037585576e-05,
"loss": 0.5045,
"step": 190
},
{
"epoch": 1.4497153700189753,
"grad_norm": 1.5691729872812523,
"learning_rate": 7.143412912866954e-05,
"loss": 0.5146,
"step": 191
},
{
"epoch": 1.4573055028462998,
"grad_norm": 0.7191513604989822,
"learning_rate": 7.130174360963562e-05,
"loss": 0.5031,
"step": 192
},
{
"epoch": 1.4648956356736242,
"grad_norm": 1.6999162113253339,
"learning_rate": 7.116846758499933e-05,
"loss": 0.5103,
"step": 193
},
{
"epoch": 1.4724857685009487,
"grad_norm": 1.0965769424195349,
"learning_rate": 7.103430484634009e-05,
"loss": 0.5101,
"step": 194
},
{
"epoch": 1.4800759013282732,
"grad_norm": 1.042633463565035,
"learning_rate": 7.089925921046348e-05,
"loss": 0.5133,
"step": 195
},
{
"epoch": 1.4876660341555978,
"grad_norm": 1.5277163845081705,
"learning_rate": 7.076333451929275e-05,
"loss": 0.5166,
"step": 196
},
{
"epoch": 1.495256166982922,
"grad_norm": 0.7588665368653583,
"learning_rate": 7.062653463975938e-05,
"loss": 0.5028,
"step": 197
},
{
"epoch": 1.5028462998102468,
"grad_norm": 1.4802097799655463,
"learning_rate": 7.048886346369321e-05,
"loss": 0.5173,
"step": 198
},
{
"epoch": 1.510436432637571,
"grad_norm": 0.8989137638919413,
"learning_rate": 7.035032490771165e-05,
"loss": 0.5058,
"step": 199
},
{
"epoch": 1.5180265654648957,
"grad_norm": 1.3727603969798114,
"learning_rate": 7.021092291310821e-05,
"loss": 0.5196,
"step": 200
},
{
"epoch": 1.5256166982922201,
"grad_norm": 0.95363755185113,
"learning_rate": 7.007066144574052e-05,
"loss": 0.5205,
"step": 201
},
{
"epoch": 1.5332068311195446,
"grad_norm": 1.1663040985006814,
"learning_rate": 6.992954449591731e-05,
"loss": 0.5093,
"step": 202
},
{
"epoch": 1.540796963946869,
"grad_norm": 0.7636048619329266,
"learning_rate": 6.978757607828509e-05,
"loss": 0.506,
"step": 203
},
{
"epoch": 1.5483870967741935,
"grad_norm": 1.1069490833534057,
"learning_rate": 6.964476023171378e-05,
"loss": 0.516,
"step": 204
},
{
"epoch": 1.5559772296015182,
"grad_norm": 0.6735693040775705,
"learning_rate": 6.95011010191819e-05,
"loss": 0.507,
"step": 205
},
{
"epoch": 1.5635673624288424,
"grad_norm": 0.7757347897129492,
"learning_rate": 6.935660252766092e-05,
"loss": 0.5181,
"step": 206
},
{
"epoch": 1.571157495256167,
"grad_norm": 0.7414965427945387,
"learning_rate": 6.921126886799903e-05,
"loss": 0.5074,
"step": 207
},
{
"epoch": 1.5787476280834913,
"grad_norm": 0.8131364204912126,
"learning_rate": 6.906510417480422e-05,
"loss": 0.5153,
"step": 208
},
{
"epoch": 1.586337760910816,
"grad_norm": 0.8512550944337758,
"learning_rate": 6.891811260632653e-05,
"loss": 0.5054,
"step": 209
},
{
"epoch": 1.5939278937381403,
"grad_norm": 0.7855183043381698,
"learning_rate": 6.877029834433992e-05,
"loss": 0.5047,
"step": 210
},
{
"epoch": 1.601518026565465,
"grad_norm": 0.8992512717445637,
"learning_rate": 6.862166559402318e-05,
"loss": 0.5025,
"step": 211
},
{
"epoch": 1.6091081593927894,
"grad_norm": 0.9210792646776457,
"learning_rate": 6.847221858384032e-05,
"loss": 0.4974,
"step": 212
},
{
"epoch": 1.6166982922201139,
"grad_norm": 0.9424266330757026,
"learning_rate": 6.832196156542033e-05,
"loss": 0.5062,
"step": 213
},
{
"epoch": 1.6242884250474383,
"grad_norm": 1.0966101994750281,
"learning_rate": 6.817089881343613e-05,
"loss": 0.5054,
"step": 214
},
{
"epoch": 1.6318785578747628,
"grad_norm": 1.009163727768516,
"learning_rate": 6.801903462548308e-05,
"loss": 0.5034,
"step": 215
},
{
"epoch": 1.6394686907020875,
"grad_norm": 0.9725332248811417,
"learning_rate": 6.786637332195659e-05,
"loss": 0.5115,
"step": 216
},
{
"epoch": 1.6470588235294117,
"grad_norm": 1.0170207658600694,
"learning_rate": 6.771291924592929e-05,
"loss": 0.5066,
"step": 217
},
{
"epoch": 1.6546489563567364,
"grad_norm": 0.9422861500618195,
"learning_rate": 6.755867676302747e-05,
"loss": 0.504,
"step": 218
},
{
"epoch": 1.6622390891840606,
"grad_norm": 0.9158879164554034,
"learning_rate": 6.740365026130684e-05,
"loss": 0.5032,
"step": 219
},
{
"epoch": 1.6698292220113853,
"grad_norm": 0.7780361297463692,
"learning_rate": 6.724784415112774e-05,
"loss": 0.4888,
"step": 220
},
{
"epoch": 1.6774193548387095,
"grad_norm": 0.5692137929082299,
"learning_rate": 6.709126286502965e-05,
"loss": 0.5022,
"step": 221
},
{
"epoch": 1.6850094876660342,
"grad_norm": 0.5004905918093622,
"learning_rate": 6.693391085760506e-05,
"loss": 0.4995,
"step": 222
},
{
"epoch": 1.6925996204933587,
"grad_norm": 0.5848868016251021,
"learning_rate": 6.677579260537277e-05,
"loss": 0.5055,
"step": 223
},
{
"epoch": 1.7001897533206831,
"grad_norm": 0.734294502408837,
"learning_rate": 6.661691260665057e-05,
"loss": 0.5008,
"step": 224
},
{
"epoch": 1.7077798861480076,
"grad_norm": 0.9781085041990851,
"learning_rate": 6.64572753814272e-05,
"loss": 0.5082,
"step": 225
},
{
"epoch": 1.715370018975332,
"grad_norm": 1.1839289754443743,
"learning_rate": 6.629688547123381e-05,
"loss": 0.4966,
"step": 226
},
{
"epoch": 1.7229601518026565,
"grad_norm": 0.6203375151514526,
"learning_rate": 6.613574743901472e-05,
"loss": 0.4976,
"step": 227
},
{
"epoch": 1.730550284629981,
"grad_norm": 0.37377037948651215,
"learning_rate": 6.597386586899766e-05,
"loss": 0.4907,
"step": 228
},
{
"epoch": 1.7381404174573056,
"grad_norm": 0.5842003288831636,
"learning_rate": 6.58112453665633e-05,
"loss": 0.5,
"step": 229
},
{
"epoch": 1.7457305502846299,
"grad_norm": 1.1216009200042196,
"learning_rate": 6.564789055811422e-05,
"loss": 0.5118,
"step": 230
},
{
"epoch": 1.7533206831119545,
"grad_norm": 1.1503531175618553,
"learning_rate": 6.54838060909434e-05,
"loss": 0.4856,
"step": 231
},
{
"epoch": 1.7609108159392788,
"grad_norm": 0.5953762660752571,
"learning_rate": 6.531899663310187e-05,
"loss": 0.4933,
"step": 232
},
{
"epoch": 1.7685009487666035,
"grad_norm": 0.4946507234843489,
"learning_rate": 6.515346687326602e-05,
"loss": 0.488,
"step": 233
},
{
"epoch": 1.776091081593928,
"grad_norm": 0.6911888286239702,
"learning_rate": 6.498722152060411e-05,
"loss": 0.5024,
"step": 234
},
{
"epoch": 1.7836812144212524,
"grad_norm": 0.9524817833599729,
"learning_rate": 6.482026530464244e-05,
"loss": 0.497,
"step": 235
},
{
"epoch": 1.7912713472485768,
"grad_norm": 1.056867827538452,
"learning_rate": 6.465260297513059e-05,
"loss": 0.5001,
"step": 236
},
{
"epoch": 1.7988614800759013,
"grad_norm": 0.9341896474591933,
"learning_rate": 6.448423930190653e-05,
"loss": 0.5056,
"step": 237
},
{
"epoch": 1.8064516129032258,
"grad_norm": 0.7998775078188581,
"learning_rate": 6.431517907476073e-05,
"loss": 0.4965,
"step": 238
},
{
"epoch": 1.8140417457305502,
"grad_norm": 0.6024227793682277,
"learning_rate": 6.414542710330004e-05,
"loss": 0.4918,
"step": 239
},
{
"epoch": 1.821631878557875,
"grad_norm": 0.5054296948703985,
"learning_rate": 6.397498821681073e-05,
"loss": 0.4987,
"step": 240
},
{
"epoch": 1.8292220113851991,
"grad_norm": 0.4915898095283207,
"learning_rate": 6.380386726412122e-05,
"loss": 0.489,
"step": 241
},
{
"epoch": 1.8368121442125238,
"grad_norm": 0.5191126165622191,
"learning_rate": 6.363206911346405e-05,
"loss": 0.5062,
"step": 242
},
{
"epoch": 1.844402277039848,
"grad_norm": 0.591888201694542,
"learning_rate": 6.345959865233742e-05,
"loss": 0.4928,
"step": 243
},
{
"epoch": 1.8519924098671727,
"grad_norm": 0.6103884601516754,
"learning_rate": 6.328646078736614e-05,
"loss": 0.4983,
"step": 244
},
{
"epoch": 1.8595825426944972,
"grad_norm": 0.5676870354041681,
"learning_rate": 6.311266044416205e-05,
"loss": 0.493,
"step": 245
},
{
"epoch": 1.8671726755218216,
"grad_norm": 0.5025577878236349,
"learning_rate": 6.293820256718388e-05,
"loss": 0.4936,
"step": 246
},
{
"epoch": 1.874762808349146,
"grad_norm": 0.5343665402941907,
"learning_rate": 6.276309211959657e-05,
"loss": 0.4976,
"step": 247
},
{
"epoch": 1.8823529411764706,
"grad_norm": 0.684168766812062,
"learning_rate": 6.25873340831301e-05,
"loss": 0.4986,
"step": 248
},
{
"epoch": 1.889943074003795,
"grad_norm": 0.971664414920718,
"learning_rate": 6.241093345793777e-05,
"loss": 0.4923,
"step": 249
},
{
"epoch": 1.8975332068311195,
"grad_norm": 1.3291099108661037,
"learning_rate": 6.22338952624539e-05,
"loss": 0.5085,
"step": 250
},
{
"epoch": 1.9051233396584442,
"grad_norm": 0.5887944838607679,
"learning_rate": 6.205622453325113e-05,
"loss": 0.4901,
"step": 251
},
{
"epoch": 1.9127134724857684,
"grad_norm": 0.5766670451808246,
"learning_rate": 6.18779263248971e-05,
"loss": 0.4923,
"step": 252
},
{
"epoch": 1.920303605313093,
"grad_norm": 1.1307550162308162,
"learning_rate": 6.169900570981057e-05,
"loss": 0.4991,
"step": 253
},
{
"epoch": 1.9278937381404173,
"grad_norm": 1.138869550845278,
"learning_rate": 6.151946777811729e-05,
"loss": 0.4998,
"step": 254
},
{
"epoch": 1.935483870967742,
"grad_norm": 0.6269758422232977,
"learning_rate": 6.133931763750509e-05,
"loss": 0.4933,
"step": 255
},
{
"epoch": 1.9430740037950665,
"grad_norm": 0.7710149723845751,
"learning_rate": 6.11585604130785e-05,
"loss": 0.4944,
"step": 256
},
{
"epoch": 1.950664136622391,
"grad_norm": 0.9641556034924468,
"learning_rate": 6.097720124721311e-05,
"loss": 0.4915,
"step": 257
},
{
"epoch": 1.9582542694497154,
"grad_norm": 0.8101487252514183,
"learning_rate": 6.079524529940911e-05,
"loss": 0.4788,
"step": 258
},
{
"epoch": 1.9658444022770398,
"grad_norm": 0.6731500817613972,
"learning_rate": 6.0612697746144664e-05,
"loss": 0.4887,
"step": 259
},
{
"epoch": 1.9734345351043643,
"grad_norm": 0.66266631987093,
"learning_rate": 6.0429563780728476e-05,
"loss": 0.4888,
"step": 260
},
{
"epoch": 1.9810246679316887,
"grad_norm": 0.5402551506844365,
"learning_rate": 6.02458486131522e-05,
"loss": 0.4831,
"step": 261
},
{
"epoch": 1.9886148007590134,
"grad_norm": 0.6879216139275022,
"learning_rate": 6.006155746994212e-05,
"loss": 0.491,
"step": 262
},
{
"epoch": 1.9962049335863377,
"grad_norm": 0.9539606050998473,
"learning_rate": 5.98766955940105e-05,
"loss": 0.5341,
"step": 263
},
{
"epoch": 2.0037950664136623,
"grad_norm": 1.2929340536370602,
"learning_rate": 5.969126824450643e-05,
"loss": 0.5524,
"step": 264
},
{
"epoch": 2.0113851992409866,
"grad_norm": 0.6792026166979978,
"learning_rate": 5.9505280696666174e-05,
"loss": 0.4671,
"step": 265
},
{
"epoch": 2.0189753320683113,
"grad_norm": 0.6570978500488273,
"learning_rate": 5.931873824166316e-05,
"loss": 0.458,
"step": 266
},
{
"epoch": 2.0265654648956355,
"grad_norm": 0.8625246084442377,
"learning_rate": 5.913164618645738e-05,
"loss": 0.4646,
"step": 267
},
{
"epoch": 2.03415559772296,
"grad_norm": 0.8463370840972069,
"learning_rate": 5.894400985364444e-05,
"loss": 0.4503,
"step": 268
},
{
"epoch": 2.041745730550285,
"grad_norm": 0.5846678229118594,
"learning_rate": 5.875583458130417e-05,
"loss": 0.452,
"step": 269
},
{
"epoch": 2.049335863377609,
"grad_norm": 0.48959366327046705,
"learning_rate": 5.856712572284868e-05,
"loss": 0.4608,
"step": 270
},
{
"epoch": 2.0569259962049338,
"grad_norm": 0.5808495151777524,
"learning_rate": 5.8377888646870154e-05,
"loss": 0.4572,
"step": 271
},
{
"epoch": 2.064516129032258,
"grad_norm": 0.5154615059210003,
"learning_rate": 5.818812873698809e-05,
"loss": 0.4555,
"step": 272
},
{
"epoch": 2.0721062618595827,
"grad_norm": 0.5247505353737575,
"learning_rate": 5.799785139169606e-05,
"loss": 0.4493,
"step": 273
},
{
"epoch": 2.079696394686907,
"grad_norm": 0.6700114330504865,
"learning_rate": 5.7807062024208256e-05,
"loss": 0.4593,
"step": 274
},
{
"epoch": 2.0872865275142316,
"grad_norm": 0.6564087028803952,
"learning_rate": 5.761576606230538e-05,
"loss": 0.4543,
"step": 275
},
{
"epoch": 2.094876660341556,
"grad_norm": 0.6170822532663903,
"learning_rate": 5.742396894818031e-05,
"loss": 0.4585,
"step": 276
},
{
"epoch": 2.1024667931688805,
"grad_norm": 0.5359408843960233,
"learning_rate": 5.723167613828324e-05,
"loss": 0.4571,
"step": 277
},
{
"epoch": 2.1100569259962048,
"grad_norm": 0.42551634695058566,
"learning_rate": 5.7038893103166425e-05,
"loss": 0.4553,
"step": 278
},
{
"epoch": 2.1176470588235294,
"grad_norm": 0.25776313987894806,
"learning_rate": 5.684562532732859e-05,
"loss": 0.4467,
"step": 279
},
{
"epoch": 2.1252371916508537,
"grad_norm": 0.27351669144074725,
"learning_rate": 5.665187830905888e-05,
"loss": 0.4415,
"step": 280
},
{
"epoch": 2.1328273244781784,
"grad_norm": 0.41764999814129333,
"learning_rate": 5.645765756028045e-05,
"loss": 0.459,
"step": 281
},
{
"epoch": 2.140417457305503,
"grad_norm": 0.4715282529881882,
"learning_rate": 5.626296860639364e-05,
"loss": 0.4535,
"step": 282
},
{
"epoch": 2.1480075901328273,
"grad_norm": 0.45181614089506017,
"learning_rate": 5.606781698611879e-05,
"loss": 0.4557,
"step": 283
},
{
"epoch": 2.155597722960152,
"grad_norm": 0.3928688694632629,
"learning_rate": 5.587220825133867e-05,
"loss": 0.4529,
"step": 284
},
{
"epoch": 2.163187855787476,
"grad_norm": 0.3422352007203858,
"learning_rate": 5.567614796694056e-05,
"loss": 0.4478,
"step": 285
},
{
"epoch": 2.170777988614801,
"grad_norm": 0.3858181479438661,
"learning_rate": 5.5479641710657867e-05,
"loss": 0.461,
"step": 286
},
{
"epoch": 2.178368121442125,
"grad_norm": 0.4901941376432685,
"learning_rate": 5.528269507291152e-05,
"loss": 0.4533,
"step": 287
},
{
"epoch": 2.18595825426945,
"grad_norm": 0.6077838701042644,
"learning_rate": 5.5085313656650856e-05,
"loss": 0.4565,
"step": 288
},
{
"epoch": 2.193548387096774,
"grad_norm": 0.6334250948792183,
"learning_rate": 5.48875030771943e-05,
"loss": 0.4526,
"step": 289
},
{
"epoch": 2.2011385199240987,
"grad_norm": 0.5394180746780861,
"learning_rate": 5.468926896206955e-05,
"loss": 0.4474,
"step": 290
},
{
"epoch": 2.2087286527514234,
"grad_norm": 0.3688187782872463,
"learning_rate": 5.4490616950853484e-05,
"loss": 0.4486,
"step": 291
},
{
"epoch": 2.2163187855787476,
"grad_norm": 0.28612624363569344,
"learning_rate": 5.4291552695011786e-05,
"loss": 0.4473,
"step": 292
},
{
"epoch": 2.2239089184060723,
"grad_norm": 0.3786323162444375,
"learning_rate": 5.409208185773806e-05,
"loss": 0.4537,
"step": 293
},
{
"epoch": 2.2314990512333965,
"grad_norm": 0.45998197157742643,
"learning_rate": 5.389221011379281e-05,
"loss": 0.445,
"step": 294
},
{
"epoch": 2.239089184060721,
"grad_norm": 0.4227537195267863,
"learning_rate": 5.3691943149341976e-05,
"loss": 0.4524,
"step": 295
},
{
"epoch": 2.2466793168880455,
"grad_norm": 0.3375900744876679,
"learning_rate": 5.3491286661795104e-05,
"loss": 0.4543,
"step": 296
},
{
"epoch": 2.25426944971537,
"grad_norm": 0.3936283250723083,
"learning_rate": 5.3290246359643365e-05,
"loss": 0.4549,
"step": 297
},
{
"epoch": 2.2618595825426944,
"grad_norm": 0.4158907529340202,
"learning_rate": 5.3088827962297055e-05,
"loss": 0.4615,
"step": 298
},
{
"epoch": 2.269449715370019,
"grad_norm": 0.3573969971167834,
"learning_rate": 5.288703719992296e-05,
"loss": 0.4627,
"step": 299
},
{
"epoch": 2.2770398481973433,
"grad_norm": 0.29339247941077856,
"learning_rate": 5.2684879813281324e-05,
"loss": 0.4527,
"step": 300
},
{
"epoch": 2.284629981024668,
"grad_norm": 0.3811958753473836,
"learning_rate": 5.248236155356244e-05,
"loss": 0.4511,
"step": 301
},
{
"epoch": 2.292220113851992,
"grad_norm": 0.3947350372974727,
"learning_rate": 5.227948818222317e-05,
"loss": 0.4551,
"step": 302
},
{
"epoch": 2.299810246679317,
"grad_norm": 0.2959934006358651,
"learning_rate": 5.207626547082294e-05,
"loss": 0.451,
"step": 303
},
{
"epoch": 2.3074003795066416,
"grad_norm": 0.3009410854470416,
"learning_rate": 5.1872699200859606e-05,
"loss": 0.4504,
"step": 304
},
{
"epoch": 2.314990512333966,
"grad_norm": 0.38647651793826754,
"learning_rate": 5.1668795163604924e-05,
"loss": 0.4575,
"step": 305
},
{
"epoch": 2.3225806451612905,
"grad_norm": 0.34305172614808316,
"learning_rate": 5.1464559159939814e-05,
"loss": 0.4513,
"step": 306
},
{
"epoch": 2.3301707779886147,
"grad_norm": 0.3120007036591175,
"learning_rate": 5.125999700018934e-05,
"loss": 0.4601,
"step": 307
},
{
"epoch": 2.3377609108159394,
"grad_norm": 0.31088228173825794,
"learning_rate": 5.105511450395742e-05,
"loss": 0.4605,
"step": 308
},
{
"epoch": 2.3453510436432636,
"grad_norm": 0.24185319509946887,
"learning_rate": 5.084991749996121e-05,
"loss": 0.4544,
"step": 309
},
{
"epoch": 2.3529411764705883,
"grad_norm": 0.3141319871949889,
"learning_rate": 5.064441182586538e-05,
"loss": 0.4477,
"step": 310
},
{
"epoch": 2.3605313092979125,
"grad_norm": 0.3437798737764119,
"learning_rate": 5.0438603328115915e-05,
"loss": 0.438,
"step": 311
},
{
"epoch": 2.3681214421252372,
"grad_norm": 0.3413170865670166,
"learning_rate": 5.023249786177388e-05,
"loss": 0.4496,
"step": 312
},
{
"epoch": 2.375711574952562,
"grad_norm": 0.32816099400302223,
"learning_rate": 5.002610129034883e-05,
"loss": 0.4457,
"step": 313
},
{
"epoch": 2.383301707779886,
"grad_norm": 0.23652738280230934,
"learning_rate": 4.981941948563197e-05,
"loss": 0.4518,
"step": 314
},
{
"epoch": 2.3908918406072104,
"grad_norm": 0.3332470802079381,
"learning_rate": 4.961245832752916e-05,
"loss": 0.4553,
"step": 315
},
{
"epoch": 2.398481973434535,
"grad_norm": 0.30703993672772734,
"learning_rate": 4.940522370389355e-05,
"loss": 0.4511,
"step": 316
},
{
"epoch": 2.4060721062618597,
"grad_norm": 0.3458797214503799,
"learning_rate": 4.919772151035819e-05,
"loss": 0.4483,
"step": 317
},
{
"epoch": 2.413662239089184,
"grad_norm": 0.33817212823710935,
"learning_rate": 4.898995765016822e-05,
"loss": 0.4602,
"step": 318
},
{
"epoch": 2.4212523719165087,
"grad_norm": 0.28768592124027254,
"learning_rate": 4.878193803401294e-05,
"loss": 0.441,
"step": 319
},
{
"epoch": 2.428842504743833,
"grad_norm": 0.24625871004420682,
"learning_rate": 4.85736685798577e-05,
"loss": 0.4447,
"step": 320
},
{
"epoch": 2.4364326375711576,
"grad_norm": 0.3114815791554252,
"learning_rate": 4.836515521277548e-05,
"loss": 0.4506,
"step": 321
},
{
"epoch": 2.444022770398482,
"grad_norm": 0.43608825596037326,
"learning_rate": 4.8156403864778376e-05,
"loss": 0.4559,
"step": 322
},
{
"epoch": 2.4516129032258065,
"grad_norm": 0.3872177355726424,
"learning_rate": 4.7947420474648826e-05,
"loss": 0.4596,
"step": 323
},
{
"epoch": 2.4592030360531307,
"grad_norm": 0.2265303368613466,
"learning_rate": 4.773821098777061e-05,
"loss": 0.4529,
"step": 324
},
{
"epoch": 2.4667931688804554,
"grad_norm": 0.26489937931522084,
"learning_rate": 4.7528781355959836e-05,
"loss": 0.4462,
"step": 325
},
{
"epoch": 2.47438330170778,
"grad_norm": 0.32008600117514796,
"learning_rate": 4.731913753729543e-05,
"loss": 0.4489,
"step": 326
},
{
"epoch": 2.4819734345351043,
"grad_norm": 0.30655482675440676,
"learning_rate": 4.710928549594979e-05,
"loss": 0.4542,
"step": 327
},
{
"epoch": 2.489563567362429,
"grad_norm": 0.24961472010620386,
"learning_rate": 4.689923120201907e-05,
"loss": 0.455,
"step": 328
},
{
"epoch": 2.4971537001897532,
"grad_norm": 0.3196073862864069,
"learning_rate": 4.668898063135327e-05,
"loss": 0.4401,
"step": 329
},
{
"epoch": 2.504743833017078,
"grad_norm": 0.277810170883558,
"learning_rate": 4.647853976538635e-05,
"loss": 0.4429,
"step": 330
},
{
"epoch": 2.512333965844402,
"grad_norm": 0.2770203193332356,
"learning_rate": 4.626791459096592e-05,
"loss": 0.4509,
"step": 331
},
{
"epoch": 2.519924098671727,
"grad_norm": 0.26941306970885837,
"learning_rate": 4.605711110018307e-05,
"loss": 0.4485,
"step": 332
},
{
"epoch": 2.527514231499051,
"grad_norm": 0.2128205627033176,
"learning_rate": 4.584613529020177e-05,
"loss": 0.4567,
"step": 333
},
{
"epoch": 2.5351043643263758,
"grad_norm": 0.2612809484941453,
"learning_rate": 4.563499316308832e-05,
"loss": 0.4454,
"step": 334
},
{
"epoch": 2.5426944971537004,
"grad_norm": 0.2611991188114079,
"learning_rate": 4.542369072564062e-05,
"loss": 0.4527,
"step": 335
},
{
"epoch": 2.5502846299810247,
"grad_norm": 0.21775843029252434,
"learning_rate": 4.5212233989217217e-05,
"loss": 0.4533,
"step": 336
},
{
"epoch": 2.557874762808349,
"grad_norm": 0.24689100702507727,
"learning_rate": 4.500062896956632e-05,
"loss": 0.4564,
"step": 337
},
{
"epoch": 2.5654648956356736,
"grad_norm": 0.26478079829629153,
"learning_rate": 4.47888816866547e-05,
"loss": 0.4529,
"step": 338
},
{
"epoch": 2.5730550284629983,
"grad_norm": 0.27076572953883926,
"learning_rate": 4.457699816449632e-05,
"loss": 0.443,
"step": 339
},
{
"epoch": 2.5806451612903225,
"grad_norm": 0.2578704602776011,
"learning_rate": 4.436498443098108e-05,
"loss": 0.4474,
"step": 340
},
{
"epoch": 2.588235294117647,
"grad_norm": 0.22049010549186773,
"learning_rate": 4.4152846517703265e-05,
"loss": 0.45,
"step": 341
},
{
"epoch": 2.5958254269449714,
"grad_norm": 0.24125071259305053,
"learning_rate": 4.394059045978994e-05,
"loss": 0.4481,
"step": 342
},
{
"epoch": 2.603415559772296,
"grad_norm": 0.226901700766956,
"learning_rate": 4.372822229572927e-05,
"loss": 0.4457,
"step": 343
},
{
"epoch": 2.6110056925996203,
"grad_norm": 0.2538357888941769,
"learning_rate": 4.3515748067198734e-05,
"loss": 0.4467,
"step": 344
},
{
"epoch": 2.618595825426945,
"grad_norm": 0.24051209192684073,
"learning_rate": 4.33031738188933e-05,
"loss": 0.4612,
"step": 345
},
{
"epoch": 2.6261859582542693,
"grad_norm": 0.1851624882291598,
"learning_rate": 4.309050559835335e-05,
"loss": 0.4447,
"step": 346
},
{
"epoch": 2.633776091081594,
"grad_norm": 0.23729717589403226,
"learning_rate": 4.287774945579268e-05,
"loss": 0.4546,
"step": 347
},
{
"epoch": 2.6413662239089186,
"grad_norm": 0.2414632155732589,
"learning_rate": 4.266491144392646e-05,
"loss": 0.4547,
"step": 348
},
{
"epoch": 2.648956356736243,
"grad_norm": 0.1961262640553002,
"learning_rate": 4.245199761779889e-05,
"loss": 0.4528,
"step": 349
},
{
"epoch": 2.656546489563567,
"grad_norm": 0.2519131470563294,
"learning_rate": 4.223901403461104e-05,
"loss": 0.4468,
"step": 350
},
{
"epoch": 2.6641366223908918,
"grad_norm": 0.2871531404330494,
"learning_rate": 4.202596675354851e-05,
"loss": 0.4524,
"step": 351
},
{
"epoch": 2.6717267552182165,
"grad_norm": 0.2328323960543026,
"learning_rate": 4.1812861835609055e-05,
"loss": 0.4477,
"step": 352
},
{
"epoch": 2.6793168880455407,
"grad_norm": 0.34379368220276085,
"learning_rate": 4.1599705343430126e-05,
"loss": 0.4473,
"step": 353
},
{
"epoch": 2.6869070208728654,
"grad_norm": 0.3077170581200678,
"learning_rate": 4.138650334111641e-05,
"loss": 0.4482,
"step": 354
},
{
"epoch": 2.6944971537001896,
"grad_norm": 0.27170499447729773,
"learning_rate": 4.117326189406733e-05,
"loss": 0.4456,
"step": 355
},
{
"epoch": 2.7020872865275143,
"grad_norm": 0.23574025133073204,
"learning_rate": 4.095998706880449e-05,
"loss": 0.441,
"step": 356
},
{
"epoch": 2.709677419354839,
"grad_norm": 0.24705781444445205,
"learning_rate": 4.0746684932799035e-05,
"loss": 0.4546,
"step": 357
},
{
"epoch": 2.717267552182163,
"grad_norm": 0.27539198214734206,
"learning_rate": 4.05333615542991e-05,
"loss": 0.4531,
"step": 358
},
{
"epoch": 2.7248576850094874,
"grad_norm": 0.23526848668490832,
"learning_rate": 4.032002300215715e-05,
"loss": 0.4453,
"step": 359
},
{
"epoch": 2.732447817836812,
"grad_norm": 0.23006904681882934,
"learning_rate": 4.01066753456573e-05,
"loss": 0.4498,
"step": 360
},
{
"epoch": 2.740037950664137,
"grad_norm": 0.2370274892416526,
"learning_rate": 3.989332465434272e-05,
"loss": 0.4453,
"step": 361
},
{
"epoch": 2.747628083491461,
"grad_norm": 0.21309601710918213,
"learning_rate": 3.9679976997842875e-05,
"loss": 0.4477,
"step": 362
},
{
"epoch": 2.7552182163187857,
"grad_norm": 0.24539024717561866,
"learning_rate": 3.946663844570091e-05,
"loss": 0.4476,
"step": 363
},
{
"epoch": 2.76280834914611,
"grad_norm": 0.18994827411578624,
"learning_rate": 3.925331506720097e-05,
"loss": 0.4464,
"step": 364
},
{
"epoch": 2.7703984819734346,
"grad_norm": 0.23585418564376148,
"learning_rate": 3.9040012931195515e-05,
"loss": 0.4518,
"step": 365
},
{
"epoch": 2.777988614800759,
"grad_norm": 0.2781600210524234,
"learning_rate": 3.8826738105932674e-05,
"loss": 0.4446,
"step": 366
},
{
"epoch": 2.7855787476280836,
"grad_norm": 0.22272463213128638,
"learning_rate": 3.8613496658883593e-05,
"loss": 0.4593,
"step": 367
},
{
"epoch": 2.793168880455408,
"grad_norm": 0.21131624600480586,
"learning_rate": 3.8400294656569894e-05,
"loss": 0.4553,
"step": 368
},
{
"epoch": 2.8007590132827325,
"grad_norm": 0.31839013252012877,
"learning_rate": 3.818713816439096e-05,
"loss": 0.4548,
"step": 369
},
{
"epoch": 2.808349146110057,
"grad_norm": 0.28626008068744174,
"learning_rate": 3.7974033246451496e-05,
"loss": 0.4454,
"step": 370
},
{
"epoch": 2.8159392789373814,
"grad_norm": 0.16805045978558228,
"learning_rate": 3.7760985965388975e-05,
"loss": 0.4533,
"step": 371
},
{
"epoch": 2.8235294117647056,
"grad_norm": 0.27576140446733527,
"learning_rate": 3.7548002382201126e-05,
"loss": 0.4528,
"step": 372
},
{
"epoch": 2.8311195445920303,
"grad_norm": 0.2587367405759775,
"learning_rate": 3.7335088556073554e-05,
"loss": 0.4525,
"step": 373
},
{
"epoch": 2.838709677419355,
"grad_norm": 0.2177015252391427,
"learning_rate": 3.712225054420732e-05,
"loss": 0.4466,
"step": 374
},
{
"epoch": 2.846299810246679,
"grad_norm": 0.22016300401684588,
"learning_rate": 3.690949440164667e-05,
"loss": 0.4507,
"step": 375
},
{
"epoch": 2.853889943074004,
"grad_norm": 0.2168768585860928,
"learning_rate": 3.669682618110671e-05,
"loss": 0.4537,
"step": 376
},
{
"epoch": 2.861480075901328,
"grad_norm": 0.23069857787158976,
"learning_rate": 3.648425193280128e-05,
"loss": 0.4514,
"step": 377
},
{
"epoch": 2.869070208728653,
"grad_norm": 0.2308590735052973,
"learning_rate": 3.627177770427075e-05,
"loss": 0.4517,
"step": 378
},
{
"epoch": 2.8766603415559775,
"grad_norm": 0.16332486719450007,
"learning_rate": 3.6059409540210075e-05,
"loss": 0.4437,
"step": 379
},
{
"epoch": 2.8842504743833017,
"grad_norm": 0.2372916479550056,
"learning_rate": 3.5847153482296734e-05,
"loss": 0.4516,
"step": 380
},
{
"epoch": 2.891840607210626,
"grad_norm": 0.25887011649794794,
"learning_rate": 3.563501556901892e-05,
"loss": 0.4484,
"step": 381
},
{
"epoch": 2.8994307400379506,
"grad_norm": 0.19423313672626585,
"learning_rate": 3.5423001835503696e-05,
"loss": 0.4489,
"step": 382
},
{
"epoch": 2.9070208728652753,
"grad_norm": 0.2299380518686083,
"learning_rate": 3.521111831334532e-05,
"loss": 0.4458,
"step": 383
},
{
"epoch": 2.9146110056925996,
"grad_norm": 0.19304361317692467,
"learning_rate": 3.4999371030433694e-05,
"loss": 0.4527,
"step": 384
},
{
"epoch": 2.9222011385199242,
"grad_norm": 0.19969286229808306,
"learning_rate": 3.47877660107828e-05,
"loss": 0.4417,
"step": 385
},
{
"epoch": 2.9297912713472485,
"grad_norm": 0.26755657071262773,
"learning_rate": 3.4576309274359394e-05,
"loss": 0.4611,
"step": 386
},
{
"epoch": 2.937381404174573,
"grad_norm": 0.20374725518267028,
"learning_rate": 3.436500683691168e-05,
"loss": 0.4582,
"step": 387
},
{
"epoch": 2.9449715370018974,
"grad_norm": 0.2258822384811313,
"learning_rate": 3.4153864709798234e-05,
"loss": 0.4475,
"step": 388
},
{
"epoch": 2.952561669829222,
"grad_norm": 0.18638500489261803,
"learning_rate": 3.394288889981695e-05,
"loss": 0.445,
"step": 389
},
{
"epoch": 2.9601518026565463,
"grad_norm": 0.2080901768578384,
"learning_rate": 3.373208540903409e-05,
"loss": 0.4515,
"step": 390
},
{
"epoch": 2.967741935483871,
"grad_norm": 0.22482754779566425,
"learning_rate": 3.3521460234613664e-05,
"loss": 0.4476,
"step": 391
},
{
"epoch": 2.9753320683111957,
"grad_norm": 0.21178017233679877,
"learning_rate": 3.331101936864674e-05,
"loss": 0.4503,
"step": 392
},
{
"epoch": 2.98292220113852,
"grad_norm": 0.1947806663328843,
"learning_rate": 3.310076879798095e-05,
"loss": 0.4415,
"step": 393
},
{
"epoch": 2.990512333965844,
"grad_norm": 0.18579209774749325,
"learning_rate": 3.2890714504050216e-05,
"loss": 0.446,
"step": 394
},
{
"epoch": 2.998102466793169,
"grad_norm": 0.23138108723134526,
"learning_rate": 3.268086246270458e-05,
"loss": 0.5364,
"step": 395
},
{
"epoch": 3.0056925996204935,
"grad_norm": 0.26045279058003457,
"learning_rate": 3.2471218644040184e-05,
"loss": 0.4487,
"step": 396
},
{
"epoch": 3.0132827324478177,
"grad_norm": 0.21695656813890685,
"learning_rate": 3.2261789012229394e-05,
"loss": 0.4084,
"step": 397
},
{
"epoch": 3.0208728652751424,
"grad_norm": 0.23426334273283442,
"learning_rate": 3.205257952535119e-05,
"loss": 0.4079,
"step": 398
},
{
"epoch": 3.0284629981024667,
"grad_norm": 0.26850763056052446,
"learning_rate": 3.184359613522163e-05,
"loss": 0.4223,
"step": 399
},
{
"epoch": 3.0360531309297913,
"grad_norm": 0.2683676862064319,
"learning_rate": 3.1634844787224525e-05,
"loss": 0.4182,
"step": 400
},
{
"epoch": 3.0436432637571156,
"grad_norm": 0.26248839521550266,
"learning_rate": 3.1426331420142314e-05,
"loss": 0.4171,
"step": 401
},
{
"epoch": 3.0512333965844403,
"grad_norm": 0.24770151611304786,
"learning_rate": 3.121806196598706e-05,
"loss": 0.4023,
"step": 402
},
{
"epoch": 3.0588235294117645,
"grad_norm": 0.2613619938203551,
"learning_rate": 3.10100423498318e-05,
"loss": 0.4095,
"step": 403
},
{
"epoch": 3.066413662239089,
"grad_norm": 0.2563470656186417,
"learning_rate": 3.0802278489641816e-05,
"loss": 0.4101,
"step": 404
},
{
"epoch": 3.074003795066414,
"grad_norm": 0.22055572471052182,
"learning_rate": 3.0594776296106464e-05,
"loss": 0.4105,
"step": 405
},
{
"epoch": 3.081593927893738,
"grad_norm": 0.22294658539566273,
"learning_rate": 3.0387541672470857e-05,
"loss": 0.4038,
"step": 406
},
{
"epoch": 3.0891840607210628,
"grad_norm": 0.24713851379128027,
"learning_rate": 3.0180580514368037e-05,
"loss": 0.406,
"step": 407
},
{
"epoch": 3.096774193548387,
"grad_norm": 0.20233125052294548,
"learning_rate": 2.997389870965118e-05,
"loss": 0.4067,
"step": 408
},
{
"epoch": 3.1043643263757117,
"grad_norm": 0.2268353462063368,
"learning_rate": 2.976750213822613e-05,
"loss": 0.4069,
"step": 409
},
{
"epoch": 3.111954459203036,
"grad_norm": 0.22067595331379308,
"learning_rate": 2.9561396671884105e-05,
"loss": 0.414,
"step": 410
},
{
"epoch": 3.1195445920303606,
"grad_norm": 0.19168123310877352,
"learning_rate": 2.9355588174134627e-05,
"loss": 0.4052,
"step": 411
},
{
"epoch": 3.127134724857685,
"grad_norm": 0.21765986129210937,
"learning_rate": 2.9150082500038794e-05,
"loss": 0.4084,
"step": 412
},
{
"epoch": 3.1347248576850095,
"grad_norm": 0.17465524431494953,
"learning_rate": 2.8944885496042593e-05,
"loss": 0.4039,
"step": 413
},
{
"epoch": 3.1423149905123338,
"grad_norm": 0.16136937176528135,
"learning_rate": 2.874000299981067e-05,
"loss": 0.4077,
"step": 414
},
{
"epoch": 3.1499051233396584,
"grad_norm": 0.1842039752946862,
"learning_rate": 2.8535440840060196e-05,
"loss": 0.4114,
"step": 415
},
{
"epoch": 3.157495256166983,
"grad_norm": 0.16275539565392577,
"learning_rate": 2.83312048363951e-05,
"loss": 0.4122,
"step": 416
},
{
"epoch": 3.1650853889943074,
"grad_norm": 0.17613818747948046,
"learning_rate": 2.812730079914041e-05,
"loss": 0.4078,
"step": 417
},
{
"epoch": 3.172675521821632,
"grad_norm": 0.1802212697189579,
"learning_rate": 2.7923734529177076e-05,
"loss": 0.4105,
"step": 418
},
{
"epoch": 3.1802656546489563,
"grad_norm": 0.14800969306703707,
"learning_rate": 2.772051181777684e-05,
"loss": 0.4153,
"step": 419
},
{
"epoch": 3.187855787476281,
"grad_norm": 0.17753235380567503,
"learning_rate": 2.7517638446437574e-05,
"loss": 0.4184,
"step": 420
},
{
"epoch": 3.195445920303605,
"grad_norm": 0.16819549000064993,
"learning_rate": 2.7315120186718686e-05,
"loss": 0.4065,
"step": 421
},
{
"epoch": 3.20303605313093,
"grad_norm": 0.18462973385672243,
"learning_rate": 2.7112962800077034e-05,
"loss": 0.4076,
"step": 422
},
{
"epoch": 3.210626185958254,
"grad_norm": 0.16361853377388974,
"learning_rate": 2.6911172037702962e-05,
"loss": 0.4095,
"step": 423
},
{
"epoch": 3.218216318785579,
"grad_norm": 0.18569025030207767,
"learning_rate": 2.6709753640356652e-05,
"loss": 0.4099,
"step": 424
},
{
"epoch": 3.225806451612903,
"grad_norm": 0.1703754043113873,
"learning_rate": 2.650871333820491e-05,
"loss": 0.411,
"step": 425
},
{
"epoch": 3.2333965844402277,
"grad_norm": 0.16840742937643677,
"learning_rate": 2.6308056850658038e-05,
"loss": 0.4114,
"step": 426
},
{
"epoch": 3.2409867172675524,
"grad_norm": 0.15285852823906035,
"learning_rate": 2.6107789886207195e-05,
"loss": 0.4064,
"step": 427
},
{
"epoch": 3.2485768500948766,
"grad_norm": 0.1713250127448791,
"learning_rate": 2.5907918142261944e-05,
"loss": 0.4167,
"step": 428
},
{
"epoch": 3.2561669829222013,
"grad_norm": 0.1734237035758403,
"learning_rate": 2.5708447304988227e-05,
"loss": 0.4053,
"step": 429
},
{
"epoch": 3.2637571157495255,
"grad_norm": 0.17043360973692148,
"learning_rate": 2.5509383049146532e-05,
"loss": 0.4037,
"step": 430
},
{
"epoch": 3.27134724857685,
"grad_norm": 0.16479095536499094,
"learning_rate": 2.5310731037930474e-05,
"loss": 0.4071,
"step": 431
},
{
"epoch": 3.2789373814041745,
"grad_norm": 0.17168397871604932,
"learning_rate": 2.5112496922805712e-05,
"loss": 0.4141,
"step": 432
},
{
"epoch": 3.286527514231499,
"grad_norm": 0.15615068257123285,
"learning_rate": 2.4914686343349158e-05,
"loss": 0.4051,
"step": 433
},
{
"epoch": 3.2941176470588234,
"grad_norm": 0.16211177369118324,
"learning_rate": 2.4717304927088493e-05,
"loss": 0.4091,
"step": 434
},
{
"epoch": 3.301707779886148,
"grad_norm": 0.17500505329691834,
"learning_rate": 2.4520358289342143e-05,
"loss": 0.4157,
"step": 435
},
{
"epoch": 3.3092979127134727,
"grad_norm": 0.16178628883403032,
"learning_rate": 2.4323852033059447e-05,
"loss": 0.4108,
"step": 436
},
{
"epoch": 3.316888045540797,
"grad_norm": 0.15428648205322795,
"learning_rate": 2.412779174866134e-05,
"loss": 0.4133,
"step": 437
},
{
"epoch": 3.324478178368121,
"grad_norm": 0.1648054065245008,
"learning_rate": 2.393218301388123e-05,
"loss": 0.4083,
"step": 438
},
{
"epoch": 3.332068311195446,
"grad_norm": 0.14344944433387424,
"learning_rate": 2.3737031393606376e-05,
"loss": 0.4115,
"step": 439
},
{
"epoch": 3.3396584440227706,
"grad_norm": 0.16677658490660136,
"learning_rate": 2.3542342439719565e-05,
"loss": 0.4101,
"step": 440
},
{
"epoch": 3.347248576850095,
"grad_norm": 0.16428323673049788,
"learning_rate": 2.3348121690941125e-05,
"loss": 0.4033,
"step": 441
},
{
"epoch": 3.3548387096774195,
"grad_norm": 0.14998088085310035,
"learning_rate": 2.3154374672671417e-05,
"loss": 0.4116,
"step": 442
},
{
"epoch": 3.3624288425047437,
"grad_norm": 0.1646349150252062,
"learning_rate": 2.2961106896833588e-05,
"loss": 0.4053,
"step": 443
},
{
"epoch": 3.3700189753320684,
"grad_norm": 0.1463235531505828,
"learning_rate": 2.2768323861716778e-05,
"loss": 0.4045,
"step": 444
},
{
"epoch": 3.3776091081593926,
"grad_norm": 0.18600338855222787,
"learning_rate": 2.2576031051819704e-05,
"loss": 0.4145,
"step": 445
},
{
"epoch": 3.3851992409867173,
"grad_norm": 0.15182364989779723,
"learning_rate": 2.2384233937694626e-05,
"loss": 0.412,
"step": 446
},
{
"epoch": 3.3927893738140416,
"grad_norm": 0.1891499854862187,
"learning_rate": 2.2192937975791757e-05,
"loss": 0.4039,
"step": 447
},
{
"epoch": 3.4003795066413662,
"grad_norm": 0.1524505414622732,
"learning_rate": 2.2002148608303947e-05,
"loss": 0.4059,
"step": 448
},
{
"epoch": 3.407969639468691,
"grad_norm": 0.1570451938127235,
"learning_rate": 2.1811871263011924e-05,
"loss": 0.4063,
"step": 449
},
{
"epoch": 3.415559772296015,
"grad_norm": 0.14780715290004184,
"learning_rate": 2.1622111353129832e-05,
"loss": 0.4137,
"step": 450
},
{
"epoch": 3.42314990512334,
"grad_norm": 0.154720916330876,
"learning_rate": 2.1432874277151337e-05,
"loss": 0.4072,
"step": 451
},
{
"epoch": 3.430740037950664,
"grad_norm": 0.14741003990298276,
"learning_rate": 2.124416541869586e-05,
"loss": 0.4106,
"step": 452
},
{
"epoch": 3.4383301707779887,
"grad_norm": 0.13756307467876858,
"learning_rate": 2.1055990146355566e-05,
"loss": 0.4176,
"step": 453
},
{
"epoch": 3.445920303605313,
"grad_norm": 0.1478071810974749,
"learning_rate": 2.0868353813542633e-05,
"loss": 0.4068,
"step": 454
},
{
"epoch": 3.4535104364326377,
"grad_norm": 0.14471344451828674,
"learning_rate": 2.068126175833685e-05,
"loss": 0.4118,
"step": 455
},
{
"epoch": 3.461100569259962,
"grad_norm": 0.1488219736052461,
"learning_rate": 2.0494719303333836e-05,
"loss": 0.412,
"step": 456
},
{
"epoch": 3.4686907020872866,
"grad_norm": 0.7113380874758816,
"learning_rate": 2.0308731755493577e-05,
"loss": 0.4155,
"step": 457
},
{
"epoch": 3.476280834914611,
"grad_norm": 0.13854452311181958,
"learning_rate": 2.012330440598952e-05,
"loss": 0.4058,
"step": 458
},
{
"epoch": 3.4838709677419355,
"grad_norm": 0.17245669157249005,
"learning_rate": 1.9938442530057904e-05,
"loss": 0.4158,
"step": 459
},
{
"epoch": 3.4914611005692597,
"grad_norm": 0.13334621900310906,
"learning_rate": 1.975415138684781e-05,
"loss": 0.4064,
"step": 460
},
{
"epoch": 3.4990512333965844,
"grad_norm": 0.1704714868106143,
"learning_rate": 1.9570436219271534e-05,
"loss": 0.4053,
"step": 461
},
{
"epoch": 3.506641366223909,
"grad_norm": 0.13841108978778513,
"learning_rate": 1.9387302253855353e-05,
"loss": 0.4084,
"step": 462
},
{
"epoch": 3.5142314990512333,
"grad_norm": 0.14973879725428832,
"learning_rate": 1.9204754700590878e-05,
"loss": 0.412,
"step": 463
},
{
"epoch": 3.521821631878558,
"grad_norm": 0.14960927710961097,
"learning_rate": 1.9022798752786896e-05,
"loss": 0.4118,
"step": 464
},
{
"epoch": 3.5294117647058822,
"grad_norm": 0.14735649088289546,
"learning_rate": 1.8841439586921515e-05,
"loss": 0.4066,
"step": 465
},
{
"epoch": 3.537001897533207,
"grad_norm": 0.15821300680395564,
"learning_rate": 1.8660682362494926e-05,
"loss": 0.416,
"step": 466
},
{
"epoch": 3.544592030360531,
"grad_norm": 0.14559017597813168,
"learning_rate": 1.848053222188271e-05,
"loss": 0.4095,
"step": 467
},
{
"epoch": 3.552182163187856,
"grad_norm": 0.15314476683793304,
"learning_rate": 1.8300994290189452e-05,
"loss": 0.4094,
"step": 468
},
{
"epoch": 3.55977229601518,
"grad_norm": 0.169795874861623,
"learning_rate": 1.8122073675102935e-05,
"loss": 0.418,
"step": 469
},
{
"epoch": 3.5673624288425048,
"grad_norm": 0.1443798018236273,
"learning_rate": 1.7943775466748867e-05,
"loss": 0.4086,
"step": 470
},
{
"epoch": 3.5749525616698294,
"grad_norm": 0.16667220421287227,
"learning_rate": 1.7766104737546102e-05,
"loss": 0.4079,
"step": 471
},
{
"epoch": 3.5825426944971537,
"grad_norm": 0.1346641081419542,
"learning_rate": 1.7589066542062253e-05,
"loss": 0.4076,
"step": 472
},
{
"epoch": 3.590132827324478,
"grad_norm": 0.15846447495361166,
"learning_rate": 1.741266591686991e-05,
"loss": 0.4059,
"step": 473
},
{
"epoch": 3.5977229601518026,
"grad_norm": 0.1366720287265696,
"learning_rate": 1.7236907880403447e-05,
"loss": 0.4078,
"step": 474
},
{
"epoch": 3.6053130929791273,
"grad_norm": 0.16252820939929033,
"learning_rate": 1.7061797432816138e-05,
"loss": 0.4073,
"step": 475
},
{
"epoch": 3.6129032258064515,
"grad_norm": 0.1494339978487333,
"learning_rate": 1.6887339555837948e-05,
"loss": 0.4081,
"step": 476
},
{
"epoch": 3.620493358633776,
"grad_norm": 0.14531376606247468,
"learning_rate": 1.671353921263386e-05,
"loss": 0.4072,
"step": 477
},
{
"epoch": 3.6280834914611004,
"grad_norm": 0.1388375063144433,
"learning_rate": 1.654040134766259e-05,
"loss": 0.4075,
"step": 478
},
{
"epoch": 3.635673624288425,
"grad_norm": 0.13505123860416815,
"learning_rate": 1.6367930886535957e-05,
"loss": 0.4145,
"step": 479
},
{
"epoch": 3.64326375711575,
"grad_norm": 0.13194605801302411,
"learning_rate": 1.619613273587879e-05,
"loss": 0.4177,
"step": 480
},
{
"epoch": 3.650853889943074,
"grad_norm": 0.14166452387033596,
"learning_rate": 1.602501178318928e-05,
"loss": 0.4161,
"step": 481
},
{
"epoch": 3.6584440227703983,
"grad_norm": 0.13583541432935878,
"learning_rate": 1.5854572896699977e-05,
"loss": 0.4105,
"step": 482
},
{
"epoch": 3.666034155597723,
"grad_norm": 0.15005698486131297,
"learning_rate": 1.5684820925239273e-05,
"loss": 0.398,
"step": 483
},
{
"epoch": 3.6736242884250476,
"grad_norm": 0.13590326204495468,
"learning_rate": 1.5515760698093485e-05,
"loss": 0.408,
"step": 484
},
{
"epoch": 3.681214421252372,
"grad_norm": 0.14280292743110926,
"learning_rate": 1.5347397024869423e-05,
"loss": 0.4102,
"step": 485
},
{
"epoch": 3.6888045540796965,
"grad_norm": 0.14960004062226398,
"learning_rate": 1.5179734695357584e-05,
"loss": 0.4048,
"step": 486
},
{
"epoch": 3.6963946869070208,
"grad_norm": 0.14308871822409527,
"learning_rate": 1.5012778479395892e-05,
"loss": 0.41,
"step": 487
},
{
"epoch": 3.7039848197343455,
"grad_norm": 0.17743355555341997,
"learning_rate": 1.4846533126733999e-05,
"loss": 0.4066,
"step": 488
},
{
"epoch": 3.7115749525616697,
"grad_norm": 0.13804122945298328,
"learning_rate": 1.4681003366898132e-05,
"loss": 0.4108,
"step": 489
},
{
"epoch": 3.7191650853889944,
"grad_norm": 0.13970618164447296,
"learning_rate": 1.4516193909056609e-05,
"loss": 0.4029,
"step": 490
},
{
"epoch": 3.7267552182163186,
"grad_norm": 0.16177493370388654,
"learning_rate": 1.4352109441885786e-05,
"loss": 0.4083,
"step": 491
},
{
"epoch": 3.7343453510436433,
"grad_norm": 0.12082905813373115,
"learning_rate": 1.4188754633436718e-05,
"loss": 0.4013,
"step": 492
},
{
"epoch": 3.741935483870968,
"grad_norm": 0.14795325276605165,
"learning_rate": 1.4026134131002347e-05,
"loss": 0.4101,
"step": 493
},
{
"epoch": 3.749525616698292,
"grad_norm": 0.14127927316169223,
"learning_rate": 1.3864252560985283e-05,
"loss": 0.414,
"step": 494
},
{
"epoch": 3.7571157495256164,
"grad_norm": 0.13865730984205682,
"learning_rate": 1.3703114528766203e-05,
"loss": 0.4029,
"step": 495
},
{
"epoch": 3.764705882352941,
"grad_norm": 0.14552488808913097,
"learning_rate": 1.35427246185728e-05,
"loss": 0.4073,
"step": 496
},
{
"epoch": 3.772296015180266,
"grad_norm": 0.1357309007526371,
"learning_rate": 1.3383087393349436e-05,
"loss": 0.4091,
"step": 497
},
{
"epoch": 3.77988614800759,
"grad_norm": 0.1324930748382244,
"learning_rate": 1.3224207394627241e-05,
"loss": 0.4122,
"step": 498
},
{
"epoch": 3.7874762808349147,
"grad_norm": 0.1362271179146437,
"learning_rate": 1.306608914239496e-05,
"loss": 0.4041,
"step": 499
},
{
"epoch": 3.795066413662239,
"grad_norm": 0.12409927061704383,
"learning_rate": 1.2908737134970367e-05,
"loss": 0.4056,
"step": 500
},
{
"epoch": 3.8026565464895636,
"grad_norm": 0.13343164571477334,
"learning_rate": 1.2752155848872266e-05,
"loss": 0.4096,
"step": 501
},
{
"epoch": 3.8102466793168883,
"grad_norm": 0.11787497363701406,
"learning_rate": 1.2596349738693162e-05,
"loss": 0.3975,
"step": 502
},
{
"epoch": 3.8178368121442126,
"grad_norm": 0.12436712059884664,
"learning_rate": 1.2441323236972536e-05,
"loss": 0.4103,
"step": 503
},
{
"epoch": 3.825426944971537,
"grad_norm": 0.1217813385845422,
"learning_rate": 1.2287080754070719e-05,
"loss": 0.407,
"step": 504
},
{
"epoch": 3.8330170777988615,
"grad_norm": 0.11341001168506011,
"learning_rate": 1.2133626678043426e-05,
"loss": 0.4113,
"step": 505
},
{
"epoch": 3.840607210626186,
"grad_norm": 0.10943902346608907,
"learning_rate": 1.1980965374516922e-05,
"loss": 0.4042,
"step": 506
},
{
"epoch": 3.8481973434535104,
"grad_norm": 0.13586692787728996,
"learning_rate": 1.1829101186563876e-05,
"loss": 0.4149,
"step": 507
},
{
"epoch": 3.855787476280835,
"grad_norm": 0.11690469403831676,
"learning_rate": 1.167803843457969e-05,
"loss": 0.4174,
"step": 508
},
{
"epoch": 3.8633776091081593,
"grad_norm": 0.11141063470580559,
"learning_rate": 1.1527781416159684e-05,
"loss": 0.4064,
"step": 509
},
{
"epoch": 3.870967741935484,
"grad_norm": 0.14761043009465893,
"learning_rate": 1.1378334405976829e-05,
"loss": 0.4095,
"step": 510
},
{
"epoch": 3.878557874762808,
"grad_norm": 0.12802936815914415,
"learning_rate": 1.122970165566009e-05,
"loss": 0.4126,
"step": 511
},
{
"epoch": 3.886148007590133,
"grad_norm": 0.11715478782287307,
"learning_rate": 1.1081887393673481e-05,
"loss": 0.4039,
"step": 512
},
{
"epoch": 3.893738140417457,
"grad_norm": 0.12113439474595457,
"learning_rate": 1.0934895825195807e-05,
"loss": 0.4039,
"step": 513
},
{
"epoch": 3.901328273244782,
"grad_norm": 0.12045212518695263,
"learning_rate": 1.0788731132000985e-05,
"loss": 0.4157,
"step": 514
},
{
"epoch": 3.9089184060721065,
"grad_norm": 0.11325751581490795,
"learning_rate": 1.0643397472339103e-05,
"loss": 0.4058,
"step": 515
},
{
"epoch": 3.9165085388994307,
"grad_norm": 0.11669592480574363,
"learning_rate": 1.0498898980818115e-05,
"loss": 0.4082,
"step": 516
},
{
"epoch": 3.924098671726755,
"grad_norm": 0.11461527608695707,
"learning_rate": 1.035523976828623e-05,
"loss": 0.419,
"step": 517
},
{
"epoch": 3.9316888045540797,
"grad_norm": 0.11809514587490393,
"learning_rate": 1.0212423921714923e-05,
"loss": 0.4158,
"step": 518
},
{
"epoch": 3.9392789373814043,
"grad_norm": 0.11115980306830088,
"learning_rate": 1.0070455504082695e-05,
"loss": 0.4095,
"step": 519
},
{
"epoch": 3.9468690702087286,
"grad_norm": 0.114832121716292,
"learning_rate": 9.92933855425951e-06,
"loss": 0.4154,
"step": 520
},
{
"epoch": 3.9544592030360532,
"grad_norm": 0.11130484634692327,
"learning_rate": 9.789077086891802e-06,
"loss": 0.4137,
"step": 521
},
{
"epoch": 3.9620493358633775,
"grad_norm": 0.10695561553946319,
"learning_rate": 9.649675092288366e-06,
"loss": 0.4006,
"step": 522
},
{
"epoch": 3.969639468690702,
"grad_norm": 0.12052592151934423,
"learning_rate": 9.511136536306793e-06,
"loss": 0.4082,
"step": 523
},
{
"epoch": 3.9772296015180264,
"grad_norm": 0.10913274770762135,
"learning_rate": 9.373465360240627e-06,
"loss": 0.4134,
"step": 524
},
{
"epoch": 3.984819734345351,
"grad_norm": 0.11482531714881333,
"learning_rate": 9.236665480707266e-06,
"loss": 0.405,
"step": 525
},
{
"epoch": 3.9924098671726753,
"grad_norm": 0.11550707751512918,
"learning_rate": 9.100740789536515e-06,
"loss": 0.4061,
"step": 526
},
{
"epoch": 4.0,
"grad_norm": 0.14100318690739433,
"learning_rate": 8.96569515365993e-06,
"loss": 0.5074,
"step": 527
},
{
"epoch": 4.007590132827325,
"grad_norm": 0.19514222390213506,
"learning_rate": 8.831532415000685e-06,
"loss": 0.3785,
"step": 528
},
{
"epoch": 4.015180265654649,
"grad_norm": 0.14576099759098526,
"learning_rate": 8.698256390364386e-06,
"loss": 0.373,
"step": 529
},
{
"epoch": 4.022770398481973,
"grad_norm": 0.13042501540336135,
"learning_rate": 8.565870871330463e-06,
"loss": 0.3799,
"step": 530
},
{
"epoch": 4.030360531309298,
"grad_norm": 0.1565052782903353,
"learning_rate": 8.434379624144261e-06,
"loss": 0.3881,
"step": 531
},
{
"epoch": 4.0379506641366225,
"grad_norm": 0.16603797889226304,
"learning_rate": 8.303786389609914e-06,
"loss": 0.386,
"step": 532
},
{
"epoch": 4.045540796963947,
"grad_norm": 0.16546797622934456,
"learning_rate": 8.17409488298396e-06,
"loss": 0.3847,
"step": 533
},
{
"epoch": 4.053130929791271,
"grad_norm": 0.14996763626729143,
"learning_rate": 8.0453087938696e-06,
"loss": 0.3816,
"step": 534
},
{
"epoch": 4.060721062618596,
"grad_norm": 0.14469013701568176,
"learning_rate": 7.917431786111698e-06,
"loss": 0.3814,
"step": 535
},
{
"epoch": 4.06831119544592,
"grad_norm": 0.14926356991103681,
"learning_rate": 7.790467497692678e-06,
"loss": 0.3779,
"step": 536
},
{
"epoch": 4.075901328273245,
"grad_norm": 0.1655187733066776,
"learning_rate": 7.664419540628886e-06,
"loss": 0.3884,
"step": 537
},
{
"epoch": 4.08349146110057,
"grad_norm": 0.16471106758966367,
"learning_rate": 7.539291500867918e-06,
"loss": 0.3823,
"step": 538
},
{
"epoch": 4.0910815939278935,
"grad_norm": 0.146447604920726,
"learning_rate": 7.415086938186542e-06,
"loss": 0.392,
"step": 539
},
{
"epoch": 4.098671726755218,
"grad_norm": 0.14674866169218687,
"learning_rate": 7.291809386089515e-06,
"loss": 0.3807,
"step": 540
},
{
"epoch": 4.106261859582543,
"grad_norm": 0.1654058535020922,
"learning_rate": 7.169462351708958e-06,
"loss": 0.3852,
"step": 541
},
{
"epoch": 4.1138519924098675,
"grad_norm": 0.1447542059683869,
"learning_rate": 7.048049315704611e-06,
"loss": 0.3831,
"step": 542
},
{
"epoch": 4.121442125237191,
"grad_norm": 0.1261716394650113,
"learning_rate": 6.927573732164879e-06,
"loss": 0.3831,
"step": 543
},
{
"epoch": 4.129032258064516,
"grad_norm": 0.15942710883242464,
"learning_rate": 6.808039028508475e-06,
"loss": 0.3835,
"step": 544
},
{
"epoch": 4.136622390891841,
"grad_norm": 0.13949828863354824,
"learning_rate": 6.6894486053869525e-06,
"loss": 0.3811,
"step": 545
},
{
"epoch": 4.144212523719165,
"grad_norm": 0.1298354723268211,
"learning_rate": 6.571805836587981e-06,
"loss": 0.3771,
"step": 546
},
{
"epoch": 4.151802656546489,
"grad_norm": 0.11985046208878425,
"learning_rate": 6.455114068939323e-06,
"loss": 0.3865,
"step": 547
},
{
"epoch": 4.159392789373814,
"grad_norm": 0.1444577266660429,
"learning_rate": 6.3393766222136445e-06,
"loss": 0.3826,
"step": 548
},
{
"epoch": 4.1669829222011385,
"grad_norm": 0.13012757211149162,
"learning_rate": 6.224596789034061e-06,
"loss": 0.3809,
"step": 549
},
{
"epoch": 4.174573055028463,
"grad_norm": 0.11192946871125323,
"learning_rate": 6.1107778347804814e-06,
"loss": 0.3826,
"step": 550
},
{
"epoch": 4.182163187855788,
"grad_norm": 0.11903117545520656,
"learning_rate": 5.99792299749669e-06,
"loss": 0.3768,
"step": 551
},
{
"epoch": 4.189753320683112,
"grad_norm": 0.12263935198172611,
"learning_rate": 5.886035487798229e-06,
"loss": 0.3807,
"step": 552
},
{
"epoch": 4.197343453510436,
"grad_norm": 0.11730507620931735,
"learning_rate": 5.775118488781099e-06,
"loss": 0.3822,
"step": 553
},
{
"epoch": 4.204933586337761,
"grad_norm": 0.11587381154598554,
"learning_rate": 5.665175155931133e-06,
"loss": 0.3827,
"step": 554
},
{
"epoch": 4.212523719165086,
"grad_norm": 0.1134319753947347,
"learning_rate": 5.556208617034289e-06,
"loss": 0.3766,
"step": 555
},
{
"epoch": 4.2201138519924095,
"grad_norm": 0.1050634846149617,
"learning_rate": 5.448221972087631e-06,
"loss": 0.3792,
"step": 556
},
{
"epoch": 4.227703984819734,
"grad_norm": 0.10861492093915034,
"learning_rate": 5.341218293211143e-06,
"loss": 0.3857,
"step": 557
},
{
"epoch": 4.235294117647059,
"grad_norm": 0.10851653304619344,
"learning_rate": 5.235200624560341e-06,
"loss": 0.3795,
"step": 558
},
{
"epoch": 4.242884250474384,
"grad_norm": 0.10296399140659968,
"learning_rate": 5.130171982239685e-06,
"loss": 0.3846,
"step": 559
},
{
"epoch": 4.250474383301707,
"grad_norm": 0.10426146312115164,
"learning_rate": 5.026135354216717e-06,
"loss": 0.383,
"step": 560
},
{
"epoch": 4.258064516129032,
"grad_norm": 0.10232852820462886,
"learning_rate": 4.923093700237109e-06,
"loss": 0.3868,
"step": 561
},
{
"epoch": 4.265654648956357,
"grad_norm": 0.10102795561292162,
"learning_rate": 4.821049951740442e-06,
"loss": 0.3781,
"step": 562
},
{
"epoch": 4.273244781783681,
"grad_norm": 0.10261093996817437,
"learning_rate": 4.720007011776808e-06,
"loss": 0.3802,
"step": 563
},
{
"epoch": 4.280834914611006,
"grad_norm": 0.09919927002366413,
"learning_rate": 4.6199677549242285e-06,
"loss": 0.3837,
"step": 564
},
{
"epoch": 4.28842504743833,
"grad_norm": 0.10836098588585662,
"learning_rate": 4.520935027206857e-06,
"loss": 0.3869,
"step": 565
},
{
"epoch": 4.2960151802656545,
"grad_norm": 0.10526098051103763,
"learning_rate": 4.4229116460140495e-06,
"loss": 0.377,
"step": 566
},
{
"epoch": 4.303605313092979,
"grad_norm": 0.09914351892145006,
"learning_rate": 4.325900400020176e-06,
"loss": 0.3786,
"step": 567
},
{
"epoch": 4.311195445920304,
"grad_norm": 0.10303085522084827,
"learning_rate": 4.229904049105287e-06,
"loss": 0.3799,
"step": 568
},
{
"epoch": 4.318785578747628,
"grad_norm": 0.09870659666807487,
"learning_rate": 4.1349253242766265e-06,
"loss": 0.3723,
"step": 569
},
{
"epoch": 4.326375711574952,
"grad_norm": 0.09910026166356478,
"learning_rate": 4.040966927590901e-06,
"loss": 0.3839,
"step": 570
},
{
"epoch": 4.333965844402277,
"grad_norm": 0.10308676223772309,
"learning_rate": 3.9480315320774524e-06,
"loss": 0.3819,
"step": 571
},
{
"epoch": 4.341555977229602,
"grad_norm": 0.11412486094736646,
"learning_rate": 3.856121781662148e-06,
"loss": 0.3886,
"step": 572
},
{
"epoch": 4.349146110056926,
"grad_norm": 0.09763532306888358,
"learning_rate": 3.7652402910922513e-06,
"loss": 0.3798,
"step": 573
},
{
"epoch": 4.35673624288425,
"grad_norm": 0.09695937184022163,
"learning_rate": 3.675389645861951e-06,
"loss": 0.3855,
"step": 574
},
{
"epoch": 4.364326375711575,
"grad_norm": 0.10172178456609236,
"learning_rate": 3.5865724021388437e-06,
"loss": 0.3893,
"step": 575
},
{
"epoch": 4.3719165085389,
"grad_norm": 0.10300828152047768,
"learning_rate": 3.4987910866912402e-06,
"loss": 0.3873,
"step": 576
},
{
"epoch": 4.379506641366224,
"grad_norm": 0.09672665094607047,
"learning_rate": 3.4120481968162022e-06,
"loss": 0.3875,
"step": 577
},
{
"epoch": 4.387096774193548,
"grad_norm": 0.09743576074956742,
"learning_rate": 3.32634620026858e-06,
"loss": 0.3792,
"step": 578
},
{
"epoch": 4.394686907020873,
"grad_norm": 0.09625146137130541,
"learning_rate": 3.241687535190776e-06,
"loss": 0.3867,
"step": 579
},
{
"epoch": 4.402277039848197,
"grad_norm": 0.0945950664728936,
"learning_rate": 3.1580746100433646e-06,
"loss": 0.3824,
"step": 580
},
{
"epoch": 4.409867172675522,
"grad_norm": 0.0980784024646366,
"learning_rate": 3.0755098035365917e-06,
"loss": 0.3839,
"step": 581
},
{
"epoch": 4.417457305502847,
"grad_norm": 0.10142930578038363,
"learning_rate": 2.9939954645626934e-06,
"loss": 0.3831,
"step": 582
},
{
"epoch": 4.425047438330171,
"grad_norm": 0.1024718070994225,
"learning_rate": 2.913533912129105e-06,
"loss": 0.3838,
"step": 583
},
{
"epoch": 4.432637571157495,
"grad_norm": 0.09460467739691439,
"learning_rate": 2.8341274352924197e-06,
"loss": 0.3856,
"step": 584
},
{
"epoch": 4.44022770398482,
"grad_norm": 0.09628762445931284,
"learning_rate": 2.7557782930933298e-06,
"loss": 0.3813,
"step": 585
},
{
"epoch": 4.447817836812145,
"grad_norm": 0.09194772578210779,
"learning_rate": 2.6784887144923445e-06,
"loss": 0.3817,
"step": 586
},
{
"epoch": 4.455407969639468,
"grad_norm": 0.08931743384621908,
"learning_rate": 2.6022608983063522e-06,
"loss": 0.3788,
"step": 587
},
{
"epoch": 4.462998102466793,
"grad_norm": 0.09150589167868585,
"learning_rate": 2.5270970131460937e-06,
"loss": 0.3866,
"step": 588
},
{
"epoch": 4.470588235294118,
"grad_norm": 0.09812252230877141,
"learning_rate": 2.4529991973544664e-06,
"loss": 0.3903,
"step": 589
},
{
"epoch": 4.478178368121442,
"grad_norm": 0.09412962970253295,
"learning_rate": 2.3799695589456695e-06,
"loss": 0.3812,
"step": 590
},
{
"epoch": 4.485768500948766,
"grad_norm": 0.09424880620907046,
"learning_rate": 2.308010175545232e-06,
"loss": 0.3838,
"step": 591
},
{
"epoch": 4.493358633776091,
"grad_norm": 0.09122643056285845,
"learning_rate": 2.2371230943309598e-06,
"loss": 0.3896,
"step": 592
},
{
"epoch": 4.500948766603416,
"grad_norm": 0.09096615886328271,
"learning_rate": 2.1673103319746146e-06,
"loss": 0.3785,
"step": 593
},
{
"epoch": 4.50853889943074,
"grad_norm": 0.0997668049582305,
"learning_rate": 2.0985738745846086e-06,
"loss": 0.3873,
"step": 594
},
{
"epoch": 4.516129032258064,
"grad_norm": 0.09598005147735333,
"learning_rate": 2.0309156776494497e-06,
"loss": 0.3755,
"step": 595
},
{
"epoch": 4.523719165085389,
"grad_norm": 0.09405869487813164,
"learning_rate": 1.964337665982172e-06,
"loss": 0.3923,
"step": 596
},
{
"epoch": 4.531309297912713,
"grad_norm": 0.0916297167969929,
"learning_rate": 1.898841733665515e-06,
"loss": 0.3836,
"step": 597
},
{
"epoch": 4.538899430740038,
"grad_norm": 0.09844277128999404,
"learning_rate": 1.8344297439980475e-06,
"loss": 0.3814,
"step": 598
},
{
"epoch": 4.546489563567363,
"grad_norm": 0.09263986116298609,
"learning_rate": 1.7711035294412094e-06,
"loss": 0.3874,
"step": 599
},
{
"epoch": 4.554079696394687,
"grad_norm": 0.09337538395396143,
"learning_rate": 1.7088648915671236e-06,
"loss": 0.3819,
"step": 600
},
{
"epoch": 4.561669829222011,
"grad_norm": 0.09644194634203436,
"learning_rate": 1.6477156010073693e-06,
"loss": 0.3859,
"step": 601
},
{
"epoch": 4.569259962049336,
"grad_norm": 0.09104976943289746,
"learning_rate": 1.5876573974026043e-06,
"loss": 0.3859,
"step": 602
},
{
"epoch": 4.576850094876661,
"grad_norm": 0.1036240608596365,
"learning_rate": 1.5286919893530727e-06,
"loss": 0.378,
"step": 603
},
{
"epoch": 4.584440227703984,
"grad_norm": 0.10319393509093422,
"learning_rate": 1.4708210543700019e-06,
"loss": 0.3821,
"step": 604
},
{
"epoch": 4.592030360531309,
"grad_norm": 0.09419955371127982,
"learning_rate": 1.4140462388278641e-06,
"loss": 0.382,
"step": 605
},
{
"epoch": 4.599620493358634,
"grad_norm": 0.09325440545672938,
"learning_rate": 1.3583691579175563e-06,
"loss": 0.3796,
"step": 606
},
{
"epoch": 4.6072106261859584,
"grad_norm": 0.0948062866747191,
"learning_rate": 1.3037913956004444e-06,
"loss": 0.3802,
"step": 607
},
{
"epoch": 4.614800759013283,
"grad_norm": 0.08880917402719704,
"learning_rate": 1.2503145045632903e-06,
"loss": 0.3837,
"step": 608
},
{
"epoch": 4.622390891840607,
"grad_norm": 0.10097582002452218,
"learning_rate": 1.1979400061741075e-06,
"loss": 0.3771,
"step": 609
},
{
"epoch": 4.629981024667932,
"grad_norm": 0.09101806397046262,
"learning_rate": 1.146669390438837e-06,
"loss": 0.3806,
"step": 610
},
{
"epoch": 4.637571157495256,
"grad_norm": 0.08881115056650345,
"learning_rate": 1.0965041159589806e-06,
"loss": 0.3891,
"step": 611
},
{
"epoch": 4.645161290322581,
"grad_norm": 0.09427363876334546,
"learning_rate": 1.047445609890132e-06,
"loss": 0.3889,
"step": 612
},
{
"epoch": 4.652751423149905,
"grad_norm": 0.09134492739832982,
"learning_rate": 9.994952679013292e-07,
"loss": 0.3805,
"step": 613
},
{
"epoch": 4.660341555977229,
"grad_norm": 0.08733457555820553,
"learning_rate": 9.526544541353622e-07,
"loss": 0.3721,
"step": 614
},
{
"epoch": 4.667931688804554,
"grad_norm": 0.09375394173236,
"learning_rate": 9.069245011699901e-07,
"loss": 0.3809,
"step": 615
},
{
"epoch": 4.675521821631879,
"grad_norm": 0.09910629089900622,
"learning_rate": 8.623067099800076e-07,
"loss": 0.3781,
"step": 616
},
{
"epoch": 4.6831119544592035,
"grad_norm": 0.08948539795632664,
"learning_rate": 8.188023499002206e-07,
"loss": 0.3852,
"step": 617
},
{
"epoch": 4.690702087286527,
"grad_norm": 0.09627861222029245,
"learning_rate": 7.764126585893694e-07,
"loss": 0.3781,
"step": 618
},
{
"epoch": 4.698292220113852,
"grad_norm": 0.08783688983476867,
"learning_rate": 7.351388419948979e-07,
"loss": 0.3837,
"step": 619
},
{
"epoch": 4.705882352941177,
"grad_norm": 0.08787642986660921,
"learning_rate": 6.949820743186353e-07,
"loss": 0.3932,
"step": 620
},
{
"epoch": 4.713472485768501,
"grad_norm": 0.08788042105203867,
"learning_rate": 6.559434979834223e-07,
"loss": 0.3821,
"step": 621
},
{
"epoch": 4.721062618595825,
"grad_norm": 0.08638210961076086,
"learning_rate": 6.180242236005818e-07,
"loss": 0.385,
"step": 622
},
{
"epoch": 4.72865275142315,
"grad_norm": 0.08376769284093517,
"learning_rate": 5.812253299383308e-07,
"loss": 0.3764,
"step": 623
},
{
"epoch": 4.7362428842504745,
"grad_norm": 0.08758279977677409,
"learning_rate": 5.455478638911071e-07,
"loss": 0.3784,
"step": 624
},
{
"epoch": 4.743833017077799,
"grad_norm": 0.08749897373635039,
"learning_rate": 5.109928404497532e-07,
"loss": 0.3864,
"step": 625
},
{
"epoch": 4.751423149905124,
"grad_norm": 0.08812251812037451,
"learning_rate": 4.775612426726684e-07,
"loss": 0.3832,
"step": 626
},
{
"epoch": 4.759013282732448,
"grad_norm": 0.08938212786163188,
"learning_rate": 4.452540216578349e-07,
"loss": 0.3778,
"step": 627
},
{
"epoch": 4.766603415559772,
"grad_norm": 0.09686068566031868,
"learning_rate": 4.140720965157519e-07,
"loss": 0.3882,
"step": 628
},
{
"epoch": 4.774193548387097,
"grad_norm": 0.0845764980027318,
"learning_rate": 3.840163543433084e-07,
"loss": 0.3778,
"step": 629
},
{
"epoch": 4.781783681214421,
"grad_norm": 0.08252440850107318,
"learning_rate": 3.550876501985112e-07,
"loss": 0.3758,
"step": 630
},
{
"epoch": 4.7893738140417454,
"grad_norm": 0.08576998372456936,
"learning_rate": 3.272868070761881e-07,
"loss": 0.3857,
"step": 631
},
{
"epoch": 4.79696394686907,
"grad_norm": 0.08653227658708267,
"learning_rate": 3.006146158845713e-07,
"loss": 0.387,
"step": 632
},
{
"epoch": 4.804554079696395,
"grad_norm": 0.0868336765504615,
"learning_rate": 2.750718354227822e-07,
"loss": 0.3918,
"step": 633
},
{
"epoch": 4.8121442125237195,
"grad_norm": 0.08524574086673203,
"learning_rate": 2.506591923592572e-07,
"loss": 0.3879,
"step": 634
},
{
"epoch": 4.819734345351043,
"grad_norm": 0.0854074630262595,
"learning_rate": 2.273773812110802e-07,
"loss": 0.3822,
"step": 635
},
{
"epoch": 4.827324478178368,
"grad_norm": 0.08782569347415704,
"learning_rate": 2.0522706432419382e-07,
"loss": 0.389,
"step": 636
},
{
"epoch": 4.834914611005693,
"grad_norm": 0.08728478399615792,
"learning_rate": 1.842088718546009e-07,
"loss": 0.3792,
"step": 637
},
{
"epoch": 4.842504743833017,
"grad_norm": 0.08691066239057142,
"learning_rate": 1.6432340175039253e-07,
"loss": 0.3784,
"step": 638
},
{
"epoch": 4.850094876660341,
"grad_norm": 0.09126251564727415,
"learning_rate": 1.4557121973477472e-07,
"loss": 0.3861,
"step": 639
},
{
"epoch": 4.857685009487666,
"grad_norm": 0.08741494339348937,
"learning_rate": 1.2795285928994372e-07,
"loss": 0.3864,
"step": 640
},
{
"epoch": 4.8652751423149905,
"grad_norm": 0.0876181788276896,
"learning_rate": 1.1146882164193795e-07,
"loss": 0.3847,
"step": 641
},
{
"epoch": 4.872865275142315,
"grad_norm": 0.08754414570978997,
"learning_rate": 9.611957574634734e-08,
"loss": 0.3804,
"step": 642
},
{
"epoch": 4.88045540796964,
"grad_norm": 0.08306030029527668,
"learning_rate": 8.190555827499947e-08,
"loss": 0.3815,
"step": 643
},
{
"epoch": 4.888045540796964,
"grad_norm": 0.09068276050158985,
"learning_rate": 6.882717360352065e-08,
"loss": 0.3833,
"step": 644
},
{
"epoch": 4.895635673624288,
"grad_norm": 0.08477830736645849,
"learning_rate": 5.688479379984291e-08,
"loss": 0.3865,
"step": 645
},
{
"epoch": 4.903225806451613,
"grad_norm": 0.08365266522816545,
"learning_rate": 4.607875861359024e-08,
"loss": 0.3733,
"step": 646
},
{
"epoch": 4.910815939278938,
"grad_norm": 0.08509382362942203,
"learning_rate": 3.640937546646406e-08,
"loss": 0.3801,
"step": 647
},
{
"epoch": 4.9184060721062615,
"grad_norm": 0.08832189058708398,
"learning_rate": 2.787691944345472e-08,
"loss": 0.382,
"step": 648
},
{
"epoch": 4.925996204933586,
"grad_norm": 0.08727763140855628,
"learning_rate": 2.0481633285025505e-08,
"loss": 0.3799,
"step": 649
},
{
"epoch": 4.933586337760911,
"grad_norm": 0.08685568551948444,
"learning_rate": 1.4223727380215935e-08,
"loss": 0.3812,
"step": 650
},
{
"epoch": 4.9411764705882355,
"grad_norm": 0.08827489813293968,
"learning_rate": 9.103379760655451e-09,
"loss": 0.3897,
"step": 651
},
{
"epoch": 4.94876660341556,
"grad_norm": 0.08969524756207933,
"learning_rate": 5.120736095483026e-09,
"loss": 0.3803,
"step": 652
},
{
"epoch": 4.956356736242884,
"grad_norm": 0.0849566492970198,
"learning_rate": 2.2759096872260187e-09,
"loss": 0.3774,
"step": 653
},
{
"epoch": 4.963946869070209,
"grad_norm": 0.08816588035901379,
"learning_rate": 5.689814685538863e-10,
"loss": 0.3843,
"step": 654
},
{
"epoch": 4.971537001897533,
"grad_norm": 0.09034351763433966,
"learning_rate": 0.0,
"loss": 0.3759,
"step": 655
},
{
"epoch": 4.971537001897533,
"step": 655,
"total_flos": 1.573215812367627e+19,
"train_loss": 0.4743207697649948,
"train_runtime": 64212.961,
"train_samples_per_second": 5.249,
"train_steps_per_second": 0.01
}
],
"logging_steps": 1.0,
"max_steps": 655,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.573215812367627e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}