r1_annotated_finqa_OT7B / trainer_state.json
neginr's picture
End of training
a47e8f2 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.955414012738854,
"eval_steps": 500,
"global_step": 364,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01910828025477707,
"grad_norm": 2.6096313072602495,
"learning_rate": 5.405405405405406e-09,
"loss": 0.4868,
"step": 1
},
{
"epoch": 0.03821656050955414,
"grad_norm": 2.485833239472616,
"learning_rate": 1.0810810810810811e-08,
"loss": 0.4952,
"step": 2
},
{
"epoch": 0.05732484076433121,
"grad_norm": 2.4354566981342787,
"learning_rate": 1.6216216216216218e-08,
"loss": 0.5004,
"step": 3
},
{
"epoch": 0.07643312101910828,
"grad_norm": 2.69399408718244,
"learning_rate": 2.1621621621621623e-08,
"loss": 0.4742,
"step": 4
},
{
"epoch": 0.09554140127388536,
"grad_norm": 2.668365174634573,
"learning_rate": 2.7027027027027028e-08,
"loss": 0.4986,
"step": 5
},
{
"epoch": 0.11464968152866242,
"grad_norm": 2.5930151821263263,
"learning_rate": 3.2432432432432436e-08,
"loss": 0.4814,
"step": 6
},
{
"epoch": 0.1337579617834395,
"grad_norm": 2.501772570078749,
"learning_rate": 3.783783783783784e-08,
"loss": 0.4915,
"step": 7
},
{
"epoch": 0.15286624203821655,
"grad_norm": 2.397400915307242,
"learning_rate": 4.3243243243243246e-08,
"loss": 0.4875,
"step": 8
},
{
"epoch": 0.17197452229299362,
"grad_norm": 2.490822236098687,
"learning_rate": 4.864864864864865e-08,
"loss": 0.491,
"step": 9
},
{
"epoch": 0.1910828025477707,
"grad_norm": 2.5012006390851873,
"learning_rate": 5.4054054054054056e-08,
"loss": 0.5067,
"step": 10
},
{
"epoch": 0.21019108280254778,
"grad_norm": 2.6416774026317325,
"learning_rate": 5.945945945945946e-08,
"loss": 0.4942,
"step": 11
},
{
"epoch": 0.22929936305732485,
"grad_norm": 2.7487556432425997,
"learning_rate": 6.486486486486487e-08,
"loss": 0.5075,
"step": 12
},
{
"epoch": 0.2484076433121019,
"grad_norm": 2.4970308931627256,
"learning_rate": 7.027027027027027e-08,
"loss": 0.4895,
"step": 13
},
{
"epoch": 0.267515923566879,
"grad_norm": 2.5273192931380897,
"learning_rate": 7.567567567567568e-08,
"loss": 0.4813,
"step": 14
},
{
"epoch": 0.28662420382165604,
"grad_norm": 2.6313225516871395,
"learning_rate": 8.108108108108108e-08,
"loss": 0.4867,
"step": 15
},
{
"epoch": 0.3057324840764331,
"grad_norm": 2.3941944376719113,
"learning_rate": 8.648648648648649e-08,
"loss": 0.4816,
"step": 16
},
{
"epoch": 0.3248407643312102,
"grad_norm": 2.396927453705762,
"learning_rate": 9.189189189189189e-08,
"loss": 0.487,
"step": 17
},
{
"epoch": 0.34394904458598724,
"grad_norm": 2.4427598740118777,
"learning_rate": 9.72972972972973e-08,
"loss": 0.483,
"step": 18
},
{
"epoch": 0.3630573248407643,
"grad_norm": 2.3754544947268035,
"learning_rate": 1.027027027027027e-07,
"loss": 0.4837,
"step": 19
},
{
"epoch": 0.3821656050955414,
"grad_norm": 2.564444044871758,
"learning_rate": 1.0810810810810811e-07,
"loss": 0.5065,
"step": 20
},
{
"epoch": 0.4012738853503185,
"grad_norm": 2.4115957977957274,
"learning_rate": 1.135135135135135e-07,
"loss": 0.4962,
"step": 21
},
{
"epoch": 0.42038216560509556,
"grad_norm": 2.5329095962208665,
"learning_rate": 1.1891891891891891e-07,
"loss": 0.4996,
"step": 22
},
{
"epoch": 0.4394904458598726,
"grad_norm": 2.4093237985701506,
"learning_rate": 1.2432432432432432e-07,
"loss": 0.4972,
"step": 23
},
{
"epoch": 0.4585987261146497,
"grad_norm": 2.389370360307483,
"learning_rate": 1.2972972972972974e-07,
"loss": 0.4831,
"step": 24
},
{
"epoch": 0.47770700636942676,
"grad_norm": 2.4000353855448058,
"learning_rate": 1.3513513513513512e-07,
"loss": 0.5065,
"step": 25
},
{
"epoch": 0.4968152866242038,
"grad_norm": 2.489575248228843,
"learning_rate": 1.4054054054054055e-07,
"loss": 0.4716,
"step": 26
},
{
"epoch": 0.5159235668789809,
"grad_norm": 2.5917592445480784,
"learning_rate": 1.4594594594594595e-07,
"loss": 0.5162,
"step": 27
},
{
"epoch": 0.535031847133758,
"grad_norm": 2.681299634238704,
"learning_rate": 1.5135135135135135e-07,
"loss": 0.4704,
"step": 28
},
{
"epoch": 0.554140127388535,
"grad_norm": 2.608824979278355,
"learning_rate": 1.5675675675675675e-07,
"loss": 0.5204,
"step": 29
},
{
"epoch": 0.5732484076433121,
"grad_norm": 2.395493492503365,
"learning_rate": 1.6216216216216215e-07,
"loss": 0.4752,
"step": 30
},
{
"epoch": 0.5923566878980892,
"grad_norm": 2.3454050765128804,
"learning_rate": 1.6756756756756755e-07,
"loss": 0.4821,
"step": 31
},
{
"epoch": 0.6114649681528662,
"grad_norm": 2.500652429228352,
"learning_rate": 1.7297297297297298e-07,
"loss": 0.4933,
"step": 32
},
{
"epoch": 0.6305732484076433,
"grad_norm": 2.287679137272454,
"learning_rate": 1.7837837837837836e-07,
"loss": 0.4781,
"step": 33
},
{
"epoch": 0.6496815286624203,
"grad_norm": 2.3784928759155934,
"learning_rate": 1.8378378378378379e-07,
"loss": 0.4969,
"step": 34
},
{
"epoch": 0.6687898089171974,
"grad_norm": 2.44085570936594,
"learning_rate": 1.891891891891892e-07,
"loss": 0.523,
"step": 35
},
{
"epoch": 0.6878980891719745,
"grad_norm": 2.31145296336897,
"learning_rate": 1.945945945945946e-07,
"loss": 0.4863,
"step": 36
},
{
"epoch": 0.7070063694267515,
"grad_norm": 2.3645970243471623,
"learning_rate": 2e-07,
"loss": 0.4936,
"step": 37
},
{
"epoch": 0.7261146496815286,
"grad_norm": 2.3453615625368553,
"learning_rate": 1.999953850085163e-07,
"loss": 0.4848,
"step": 38
},
{
"epoch": 0.7452229299363057,
"grad_norm": 2.3293217503132806,
"learning_rate": 1.999815404600282e-07,
"loss": 0.4893,
"step": 39
},
{
"epoch": 0.7643312101910829,
"grad_norm": 2.1222113857893237,
"learning_rate": 1.999584676323851e-07,
"loss": 0.4644,
"step": 40
},
{
"epoch": 0.7834394904458599,
"grad_norm": 2.255275631085622,
"learning_rate": 1.9992616865520512e-07,
"loss": 0.4541,
"step": 41
},
{
"epoch": 0.802547770700637,
"grad_norm": 2.012214823415202,
"learning_rate": 1.998846465096783e-07,
"loss": 0.466,
"step": 42
},
{
"epoch": 0.821656050955414,
"grad_norm": 2.189296418289646,
"learning_rate": 1.9983390502829166e-07,
"loss": 0.4717,
"step": 43
},
{
"epoch": 0.8407643312101911,
"grad_norm": 2.099897798957949,
"learning_rate": 1.9977394889447523e-07,
"loss": 0.4574,
"step": 44
},
{
"epoch": 0.8598726114649682,
"grad_norm": 2.188648446058437,
"learning_rate": 1.9970478364216996e-07,
"loss": 0.4674,
"step": 45
},
{
"epoch": 0.8789808917197452,
"grad_norm": 1.9210110621178644,
"learning_rate": 1.996264156553169e-07,
"loss": 0.4529,
"step": 46
},
{
"epoch": 0.8980891719745223,
"grad_norm": 1.9817997941615975,
"learning_rate": 1.9953885216726785e-07,
"loss": 0.4677,
"step": 47
},
{
"epoch": 0.9171974522292994,
"grad_norm": 2.1547734650281276,
"learning_rate": 1.9944210126011788e-07,
"loss": 0.4752,
"step": 48
},
{
"epoch": 0.9363057324840764,
"grad_norm": 1.8645798339123572,
"learning_rate": 1.9933617186395914e-07,
"loss": 0.4428,
"step": 49
},
{
"epoch": 0.9554140127388535,
"grad_norm": 2.016567397630417,
"learning_rate": 1.9922107375605698e-07,
"loss": 0.4816,
"step": 50
},
{
"epoch": 0.9745222929936306,
"grad_norm": 1.786631190203663,
"learning_rate": 1.990968175599471e-07,
"loss": 0.4674,
"step": 51
},
{
"epoch": 0.9936305732484076,
"grad_norm": 2.04989081734429,
"learning_rate": 1.9896341474445524e-07,
"loss": 0.4748,
"step": 52
},
{
"epoch": 1.0127388535031847,
"grad_norm": 1.9769794080531382,
"learning_rate": 1.9882087762263852e-07,
"loss": 0.4728,
"step": 53
},
{
"epoch": 1.0318471337579618,
"grad_norm": 1.9938931161500741,
"learning_rate": 1.9866921935064905e-07,
"loss": 0.453,
"step": 54
},
{
"epoch": 1.0509554140127388,
"grad_norm": 1.7014310774732082,
"learning_rate": 1.9850845392651947e-07,
"loss": 0.4541,
"step": 55
},
{
"epoch": 1.070063694267516,
"grad_norm": 1.7639902909661307,
"learning_rate": 1.983385961888711e-07,
"loss": 0.4484,
"step": 56
},
{
"epoch": 1.089171974522293,
"grad_norm": 1.872307114043005,
"learning_rate": 1.981596618155441e-07,
"loss": 0.4779,
"step": 57
},
{
"epoch": 1.10828025477707,
"grad_norm": 1.9070131132592882,
"learning_rate": 1.9797166732215075e-07,
"loss": 0.4376,
"step": 58
},
{
"epoch": 1.127388535031847,
"grad_norm": 1.8165653538892719,
"learning_rate": 1.977746300605507e-07,
"loss": 0.4424,
"step": 59
},
{
"epoch": 1.1464968152866242,
"grad_norm": 1.7003231286369576,
"learning_rate": 1.9756856821724967e-07,
"loss": 0.4485,
"step": 60
},
{
"epoch": 1.1656050955414012,
"grad_norm": 1.676286306752662,
"learning_rate": 1.9735350081172067e-07,
"loss": 0.449,
"step": 61
},
{
"epoch": 1.1847133757961783,
"grad_norm": 1.7123905470024705,
"learning_rate": 1.9712944769464862e-07,
"loss": 0.4622,
"step": 62
},
{
"epoch": 1.2038216560509554,
"grad_norm": 1.4763448653935922,
"learning_rate": 1.9689642954609806e-07,
"loss": 0.4396,
"step": 63
},
{
"epoch": 1.2229299363057324,
"grad_norm": 1.6374280386081919,
"learning_rate": 1.966544678736044e-07,
"loss": 0.4535,
"step": 64
},
{
"epoch": 1.2420382165605095,
"grad_norm": 1.4620089914563061,
"learning_rate": 1.9640358501018882e-07,
"loss": 0.4296,
"step": 65
},
{
"epoch": 1.2611464968152866,
"grad_norm": 1.5335907875182524,
"learning_rate": 1.961438041122969e-07,
"loss": 0.4386,
"step": 66
},
{
"epoch": 1.2802547770700636,
"grad_norm": 1.5655814455212236,
"learning_rate": 1.9587514915766122e-07,
"loss": 0.4462,
"step": 67
},
{
"epoch": 1.2993630573248407,
"grad_norm": 1.4879500094292435,
"learning_rate": 1.9559764494308834e-07,
"loss": 0.436,
"step": 68
},
{
"epoch": 1.3184713375796178,
"grad_norm": 1.4464449790937988,
"learning_rate": 1.9531131708217004e-07,
"loss": 0.458,
"step": 69
},
{
"epoch": 1.3375796178343948,
"grad_norm": 1.4077038156465773,
"learning_rate": 1.9501619200291905e-07,
"loss": 0.4233,
"step": 70
},
{
"epoch": 1.356687898089172,
"grad_norm": 1.4596682102109033,
"learning_rate": 1.9471229694533e-07,
"loss": 0.4322,
"step": 71
},
{
"epoch": 1.3757961783439492,
"grad_norm": 1.299363763403682,
"learning_rate": 1.9439965995886488e-07,
"loss": 0.4383,
"step": 72
},
{
"epoch": 1.394904458598726,
"grad_norm": 1.4211694941800017,
"learning_rate": 1.9407830989986428e-07,
"loss": 0.4404,
"step": 73
},
{
"epoch": 1.4140127388535033,
"grad_norm": 1.4172304910905302,
"learning_rate": 1.9374827642888395e-07,
"loss": 0.4323,
"step": 74
},
{
"epoch": 1.4331210191082802,
"grad_norm": 1.3076536857533756,
"learning_rate": 1.9340959000795706e-07,
"loss": 0.4009,
"step": 75
},
{
"epoch": 1.4522292993630574,
"grad_norm": 1.3024081707340522,
"learning_rate": 1.9306228189778253e-07,
"loss": 0.4499,
"step": 76
},
{
"epoch": 1.4713375796178343,
"grad_norm": 1.3118587098446126,
"learning_rate": 1.927063841548398e-07,
"loss": 0.4322,
"step": 77
},
{
"epoch": 1.4904458598726116,
"grad_norm": 1.350632356042114,
"learning_rate": 1.923419296284299e-07,
"loss": 0.4321,
"step": 78
},
{
"epoch": 1.5095541401273884,
"grad_norm": 1.3513117365592737,
"learning_rate": 1.919689519576436e-07,
"loss": 0.4334,
"step": 79
},
{
"epoch": 1.5286624203821657,
"grad_norm": 1.302417278861503,
"learning_rate": 1.9158748556825634e-07,
"loss": 0.4316,
"step": 80
},
{
"epoch": 1.5477707006369426,
"grad_norm": 1.367016731118727,
"learning_rate": 1.911975656695509e-07,
"loss": 0.4387,
"step": 81
},
{
"epoch": 1.5668789808917198,
"grad_norm": 1.34754516501708,
"learning_rate": 1.907992282510675e-07,
"loss": 0.4372,
"step": 82
},
{
"epoch": 1.5859872611464967,
"grad_norm": 1.2443808808257364,
"learning_rate": 1.90392510079282e-07,
"loss": 0.4299,
"step": 83
},
{
"epoch": 1.605095541401274,
"grad_norm": 1.389465502979021,
"learning_rate": 1.8997744869421245e-07,
"loss": 0.4319,
"step": 84
},
{
"epoch": 1.6242038216560508,
"grad_norm": 1.3395108076243718,
"learning_rate": 1.8955408240595392e-07,
"loss": 0.4154,
"step": 85
},
{
"epoch": 1.643312101910828,
"grad_norm": 1.399829650509336,
"learning_rate": 1.8912245029114278e-07,
"loss": 0.4227,
"step": 86
},
{
"epoch": 1.662420382165605,
"grad_norm": 1.2821155822840626,
"learning_rate": 1.8868259218934966e-07,
"loss": 0.4424,
"step": 87
},
{
"epoch": 1.6815286624203822,
"grad_norm": 1.228514092123298,
"learning_rate": 1.882345486994024e-07,
"loss": 0.4211,
"step": 88
},
{
"epoch": 1.700636942675159,
"grad_norm": 1.199257741732802,
"learning_rate": 1.877783611756389e-07,
"loss": 0.4271,
"step": 89
},
{
"epoch": 1.7197452229299364,
"grad_norm": 1.2278711006510985,
"learning_rate": 1.8731407172408987e-07,
"loss": 0.4309,
"step": 90
},
{
"epoch": 1.7388535031847132,
"grad_norm": 1.2446640718011999,
"learning_rate": 1.8684172319859257e-07,
"loss": 0.4203,
"step": 91
},
{
"epoch": 1.7579617834394905,
"grad_norm": 1.2657944122598679,
"learning_rate": 1.863613591968355e-07,
"loss": 0.4431,
"step": 92
},
{
"epoch": 1.7770700636942676,
"grad_norm": 1.212712280704066,
"learning_rate": 1.8587302405633417e-07,
"loss": 0.4304,
"step": 93
},
{
"epoch": 1.7961783439490446,
"grad_norm": 1.1895034383874779,
"learning_rate": 1.8537676285033885e-07,
"loss": 0.4083,
"step": 94
},
{
"epoch": 1.8152866242038217,
"grad_norm": 1.1891183671713428,
"learning_rate": 1.848726213836744e-07,
"loss": 0.4244,
"step": 95
},
{
"epoch": 1.8343949044585988,
"grad_norm": 1.181853541101605,
"learning_rate": 1.8436064618851224e-07,
"loss": 0.434,
"step": 96
},
{
"epoch": 1.8535031847133758,
"grad_norm": 1.1581194250533744,
"learning_rate": 1.8384088452007576e-07,
"loss": 0.4225,
"step": 97
},
{
"epoch": 1.872611464968153,
"grad_norm": 1.0795760687778084,
"learning_rate": 1.8331338435227837e-07,
"loss": 0.4103,
"step": 98
},
{
"epoch": 1.89171974522293,
"grad_norm": 1.111129745427032,
"learning_rate": 1.8277819437329574e-07,
"loss": 0.4279,
"step": 99
},
{
"epoch": 1.910828025477707,
"grad_norm": 1.1105627528286264,
"learning_rate": 1.8223536398107174e-07,
"loss": 0.4129,
"step": 100
},
{
"epoch": 1.929936305732484,
"grad_norm": 1.0293128045168645,
"learning_rate": 1.8168494327875916e-07,
"loss": 0.4042,
"step": 101
},
{
"epoch": 1.9490445859872612,
"grad_norm": 1.029497275251704,
"learning_rate": 1.8112698307009504e-07,
"loss": 0.4157,
"step": 102
},
{
"epoch": 1.9681528662420382,
"grad_norm": 1.05771473532387,
"learning_rate": 1.8056153485471165e-07,
"loss": 0.4163,
"step": 103
},
{
"epoch": 1.9872611464968153,
"grad_norm": 1.044232706779553,
"learning_rate": 1.7998865082338287e-07,
"loss": 0.411,
"step": 104
},
{
"epoch": 2.0063694267515926,
"grad_norm": 1.0728042673445248,
"learning_rate": 1.7940838385320732e-07,
"loss": 0.4282,
"step": 105
},
{
"epoch": 2.0254777070063694,
"grad_norm": 1.0124517008376481,
"learning_rate": 1.788207875027274e-07,
"loss": 0.3981,
"step": 106
},
{
"epoch": 2.0445859872611467,
"grad_norm": 0.9892098063897707,
"learning_rate": 1.7822591600698629e-07,
"loss": 0.4033,
"step": 107
},
{
"epoch": 2.0636942675159236,
"grad_norm": 0.9882249812487839,
"learning_rate": 1.7762382427252165e-07,
"loss": 0.4124,
"step": 108
},
{
"epoch": 2.082802547770701,
"grad_norm": 1.0014484680247562,
"learning_rate": 1.7701456787229803e-07,
"loss": 0.4155,
"step": 109
},
{
"epoch": 2.1019108280254777,
"grad_norm": 0.9692056423689174,
"learning_rate": 1.7639820304057742e-07,
"loss": 0.4005,
"step": 110
},
{
"epoch": 2.121019108280255,
"grad_norm": 0.9672739874787071,
"learning_rate": 1.7577478666772882e-07,
"loss": 0.4239,
"step": 111
},
{
"epoch": 2.140127388535032,
"grad_norm": 0.9718929588628977,
"learning_rate": 1.7514437629497717e-07,
"loss": 0.3962,
"step": 112
},
{
"epoch": 2.159235668789809,
"grad_norm": 0.986945591632025,
"learning_rate": 1.7450703010909262e-07,
"loss": 0.4134,
"step": 113
},
{
"epoch": 2.178343949044586,
"grad_norm": 0.9390145485362039,
"learning_rate": 1.738628069370195e-07,
"loss": 0.3845,
"step": 114
},
{
"epoch": 2.1974522292993632,
"grad_norm": 1.0524118338702655,
"learning_rate": 1.7321176624044687e-07,
"loss": 0.4186,
"step": 115
},
{
"epoch": 2.21656050955414,
"grad_norm": 1.0444330849016288,
"learning_rate": 1.7255396811032013e-07,
"loss": 0.4024,
"step": 116
},
{
"epoch": 2.2356687898089174,
"grad_norm": 0.951189410440023,
"learning_rate": 1.718894732612947e-07,
"loss": 0.4007,
"step": 117
},
{
"epoch": 2.254777070063694,
"grad_norm": 0.9707318478778935,
"learning_rate": 1.7121834302613186e-07,
"loss": 0.4081,
"step": 118
},
{
"epoch": 2.2738853503184715,
"grad_norm": 0.9577436750510042,
"learning_rate": 1.7054063935003812e-07,
"loss": 0.407,
"step": 119
},
{
"epoch": 2.2929936305732483,
"grad_norm": 0.9668619767258039,
"learning_rate": 1.6985642478494727e-07,
"loss": 0.4095,
"step": 120
},
{
"epoch": 2.3121019108280256,
"grad_norm": 1.0171913879297245,
"learning_rate": 1.6916576248374716e-07,
"loss": 0.4069,
"step": 121
},
{
"epoch": 2.3312101910828025,
"grad_norm": 1.0128147920459283,
"learning_rate": 1.684687161944506e-07,
"loss": 0.3945,
"step": 122
},
{
"epoch": 2.3503184713375798,
"grad_norm": 0.9324167222212655,
"learning_rate": 1.6776535025431129e-07,
"loss": 0.3979,
"step": 123
},
{
"epoch": 2.3694267515923566,
"grad_norm": 0.9284341391341622,
"learning_rate": 1.6705572958388573e-07,
"loss": 0.3799,
"step": 124
},
{
"epoch": 2.388535031847134,
"grad_norm": 0.9419873392238237,
"learning_rate": 1.6633991968104092e-07,
"loss": 0.4152,
"step": 125
},
{
"epoch": 2.4076433121019107,
"grad_norm": 0.9461512919509681,
"learning_rate": 1.6561798661490902e-07,
"loss": 0.42,
"step": 126
},
{
"epoch": 2.426751592356688,
"grad_norm": 0.8953846015326995,
"learning_rate": 1.6488999701978902e-07,
"loss": 0.3988,
"step": 127
},
{
"epoch": 2.445859872611465,
"grad_norm": 0.8817601115413886,
"learning_rate": 1.6415601808899658e-07,
"loss": 0.3941,
"step": 128
},
{
"epoch": 2.464968152866242,
"grad_norm": 0.9329382880500549,
"learning_rate": 1.63416117568662e-07,
"loss": 0.4108,
"step": 129
},
{
"epoch": 2.484076433121019,
"grad_norm": 0.9466218469022187,
"learning_rate": 1.6267036375147723e-07,
"loss": 0.3977,
"step": 130
},
{
"epoch": 2.5031847133757963,
"grad_norm": 0.9073527766440519,
"learning_rate": 1.6191882547039266e-07,
"loss": 0.3973,
"step": 131
},
{
"epoch": 2.522292993630573,
"grad_norm": 0.9277154102961712,
"learning_rate": 1.6116157209226352e-07,
"loss": 0.3842,
"step": 132
},
{
"epoch": 2.5414012738853504,
"grad_norm": 0.9132161554797611,
"learning_rate": 1.6039867351144777e-07,
"loss": 0.39,
"step": 133
},
{
"epoch": 2.5605095541401273,
"grad_norm": 0.9506909219497836,
"learning_rate": 1.5963020014335436e-07,
"loss": 0.3836,
"step": 134
},
{
"epoch": 2.5796178343949046,
"grad_norm": 0.9031771763957327,
"learning_rate": 1.5885622291794428e-07,
"loss": 0.4173,
"step": 135
},
{
"epoch": 2.5987261146496814,
"grad_norm": 0.9589857428653794,
"learning_rate": 1.580768132731837e-07,
"loss": 0.3959,
"step": 136
},
{
"epoch": 2.6178343949044587,
"grad_norm": 0.9063396256825602,
"learning_rate": 1.5729204314845e-07,
"loss": 0.41,
"step": 137
},
{
"epoch": 2.6369426751592355,
"grad_norm": 0.8875281986575585,
"learning_rate": 1.56501984977892e-07,
"loss": 0.4012,
"step": 138
},
{
"epoch": 2.656050955414013,
"grad_norm": 0.8990703005870186,
"learning_rate": 1.5570671168374436e-07,
"loss": 0.4024,
"step": 139
},
{
"epoch": 2.6751592356687897,
"grad_norm": 0.8977323600712754,
"learning_rate": 1.5490629666959666e-07,
"loss": 0.3899,
"step": 140
},
{
"epoch": 2.694267515923567,
"grad_norm": 0.9571943294092002,
"learning_rate": 1.5410081381361829e-07,
"loss": 0.401,
"step": 141
},
{
"epoch": 2.713375796178344,
"grad_norm": 0.857629614904157,
"learning_rate": 1.5329033746173973e-07,
"loss": 0.3886,
"step": 142
},
{
"epoch": 2.732484076433121,
"grad_norm": 0.9442460364317703,
"learning_rate": 1.5247494242079021e-07,
"loss": 0.4211,
"step": 143
},
{
"epoch": 2.7515923566878984,
"grad_norm": 0.9057891101965285,
"learning_rate": 1.5165470395159313e-07,
"loss": 0.3977,
"step": 144
},
{
"epoch": 2.770700636942675,
"grad_norm": 0.862105132626373,
"learning_rate": 1.5082969776201945e-07,
"loss": 0.3916,
"step": 145
},
{
"epoch": 2.789808917197452,
"grad_norm": 0.8863875640101582,
"learning_rate": 1.5e-07,
"loss": 0.375,
"step": 146
},
{
"epoch": 2.8089171974522293,
"grad_norm": 0.9095481357284252,
"learning_rate": 1.4916568724649686e-07,
"loss": 0.3965,
"step": 147
},
{
"epoch": 2.8280254777070066,
"grad_norm": 0.8835243071238383,
"learning_rate": 1.4832683650843506e-07,
"loss": 0.3857,
"step": 148
},
{
"epoch": 2.8471337579617835,
"grad_norm": 0.8444620499975368,
"learning_rate": 1.4748352521159491e-07,
"loss": 0.3868,
"step": 149
},
{
"epoch": 2.8662420382165603,
"grad_norm": 0.9320821828556558,
"learning_rate": 1.4663583119346538e-07,
"loss": 0.4109,
"step": 150
},
{
"epoch": 2.8853503184713376,
"grad_norm": 0.9093581394378585,
"learning_rate": 1.4578383269606002e-07,
"loss": 0.3965,
"step": 151
},
{
"epoch": 2.904458598726115,
"grad_norm": 0.8671575603335869,
"learning_rate": 1.4492760835869502e-07,
"loss": 0.3765,
"step": 152
},
{
"epoch": 2.9235668789808917,
"grad_norm": 0.8770313068851825,
"learning_rate": 1.4406723721073087e-07,
"loss": 0.4112,
"step": 153
},
{
"epoch": 2.9426751592356686,
"grad_norm": 0.8731275332990308,
"learning_rate": 1.4320279866427796e-07,
"loss": 0.3931,
"step": 154
},
{
"epoch": 2.961783439490446,
"grad_norm": 0.929306686854739,
"learning_rate": 1.4233437250686693e-07,
"loss": 0.4045,
"step": 155
},
{
"epoch": 2.980891719745223,
"grad_norm": 0.920864970116778,
"learning_rate": 1.4146203889408418e-07,
"loss": 0.4011,
"step": 156
},
{
"epoch": 3.0,
"grad_norm": 0.9256172158294934,
"learning_rate": 1.4058587834217354e-07,
"loss": 0.4051,
"step": 157
},
{
"epoch": 3.0191082802547773,
"grad_norm": 0.8701795137759962,
"learning_rate": 1.397059717206048e-07,
"loss": 0.3837,
"step": 158
},
{
"epoch": 3.038216560509554,
"grad_norm": 0.8658948728093971,
"learning_rate": 1.3882240024460924e-07,
"loss": 0.3993,
"step": 159
},
{
"epoch": 3.0573248407643314,
"grad_norm": 0.8550607152625319,
"learning_rate": 1.3793524546768356e-07,
"loss": 0.4131,
"step": 160
},
{
"epoch": 3.0764331210191083,
"grad_norm": 0.8522647937778317,
"learning_rate": 1.370445892740626e-07,
"loss": 0.3922,
"step": 161
},
{
"epoch": 3.0955414012738856,
"grad_norm": 0.8942298522208958,
"learning_rate": 1.361505138711613e-07,
"loss": 0.3886,
"step": 162
},
{
"epoch": 3.1146496815286624,
"grad_norm": 0.8173283412874084,
"learning_rate": 1.3525310178198706e-07,
"loss": 0.3795,
"step": 163
},
{
"epoch": 3.1337579617834397,
"grad_norm": 0.8975022792003201,
"learning_rate": 1.343524358375229e-07,
"loss": 0.3788,
"step": 164
},
{
"epoch": 3.1528662420382165,
"grad_norm": 0.8121287355981773,
"learning_rate": 1.3344859916908204e-07,
"loss": 0.3714,
"step": 165
},
{
"epoch": 3.171974522292994,
"grad_norm": 0.8780441128171443,
"learning_rate": 1.325416752006351e-07,
"loss": 0.384,
"step": 166
},
{
"epoch": 3.1910828025477707,
"grad_norm": 0.8709703387717114,
"learning_rate": 1.3163174764110982e-07,
"loss": 0.3937,
"step": 167
},
{
"epoch": 3.210191082802548,
"grad_norm": 0.8352312687881929,
"learning_rate": 1.3071890047666496e-07,
"loss": 0.4,
"step": 168
},
{
"epoch": 3.229299363057325,
"grad_norm": 0.8370238487705333,
"learning_rate": 1.2980321796293835e-07,
"loss": 0.3929,
"step": 169
},
{
"epoch": 3.248407643312102,
"grad_norm": 0.8754516601719645,
"learning_rate": 1.288847846172701e-07,
"loss": 0.3858,
"step": 170
},
{
"epoch": 3.267515923566879,
"grad_norm": 0.8493944760687623,
"learning_rate": 1.2796368521090143e-07,
"loss": 0.3753,
"step": 171
},
{
"epoch": 3.286624203821656,
"grad_norm": 0.8672706457828706,
"learning_rate": 1.270400047611508e-07,
"loss": 0.3889,
"step": 172
},
{
"epoch": 3.305732484076433,
"grad_norm": 0.8578114494949349,
"learning_rate": 1.261138285235663e-07,
"loss": 0.3909,
"step": 173
},
{
"epoch": 3.3248407643312103,
"grad_norm": 0.88573195675492,
"learning_rate": 1.2518524198405698e-07,
"loss": 0.4025,
"step": 174
},
{
"epoch": 3.343949044585987,
"grad_norm": 0.8388605708511595,
"learning_rate": 1.2425433085100222e-07,
"loss": 0.3965,
"step": 175
},
{
"epoch": 3.3630573248407645,
"grad_norm": 0.8254754751848654,
"learning_rate": 1.2332118104734109e-07,
"loss": 0.3962,
"step": 176
},
{
"epoch": 3.3821656050955413,
"grad_norm": 0.8404360589007722,
"learning_rate": 1.223858787026415e-07,
"loss": 0.3836,
"step": 177
},
{
"epoch": 3.4012738853503186,
"grad_norm": 0.8797609449074615,
"learning_rate": 1.2144851014515054e-07,
"loss": 0.3936,
"step": 178
},
{
"epoch": 3.4203821656050954,
"grad_norm": 0.8426178024076959,
"learning_rate": 1.2050916189382645e-07,
"loss": 0.3931,
"step": 179
},
{
"epoch": 3.4394904458598727,
"grad_norm": 0.8675648308269749,
"learning_rate": 1.195679206503528e-07,
"loss": 0.3867,
"step": 180
},
{
"epoch": 3.4585987261146496,
"grad_norm": 0.870049660418513,
"learning_rate": 1.1862487329113604e-07,
"loss": 0.3943,
"step": 181
},
{
"epoch": 3.477707006369427,
"grad_norm": 0.8394567213573604,
"learning_rate": 1.1768010685928685e-07,
"loss": 0.3856,
"step": 182
},
{
"epoch": 3.4968152866242037,
"grad_norm": 0.8307302061680673,
"learning_rate": 1.1673370855658591e-07,
"loss": 0.392,
"step": 183
},
{
"epoch": 3.515923566878981,
"grad_norm": 0.8963343005230818,
"learning_rate": 1.1578576573543539e-07,
"loss": 0.3962,
"step": 184
},
{
"epoch": 3.535031847133758,
"grad_norm": 0.8672782496610111,
"learning_rate": 1.1483636589079626e-07,
"loss": 0.3898,
"step": 185
},
{
"epoch": 3.554140127388535,
"grad_norm": 0.8164723155704265,
"learning_rate": 1.138855966521124e-07,
"loss": 0.3876,
"step": 186
},
{
"epoch": 3.573248407643312,
"grad_norm": 0.8104375092397201,
"learning_rate": 1.1293354577522263e-07,
"loss": 0.3972,
"step": 187
},
{
"epoch": 3.5923566878980893,
"grad_norm": 0.8800104752834295,
"learning_rate": 1.1198030113426074e-07,
"loss": 0.3887,
"step": 188
},
{
"epoch": 3.611464968152866,
"grad_norm": 0.8243201770417823,
"learning_rate": 1.110259507135447e-07,
"loss": 0.4074,
"step": 189
},
{
"epoch": 3.6305732484076434,
"grad_norm": 0.818166857201434,
"learning_rate": 1.1007058259945583e-07,
"loss": 0.3903,
"step": 190
},
{
"epoch": 3.6496815286624202,
"grad_norm": 0.8265082024151587,
"learning_rate": 1.0911428497230832e-07,
"loss": 0.3961,
"step": 191
},
{
"epoch": 3.6687898089171975,
"grad_norm": 0.885762495423559,
"learning_rate": 1.0815714609821025e-07,
"loss": 0.3728,
"step": 192
},
{
"epoch": 3.6878980891719744,
"grad_norm": 0.8413664352472379,
"learning_rate": 1.071992543209167e-07,
"loss": 0.4015,
"step": 193
},
{
"epoch": 3.7070063694267517,
"grad_norm": 0.8676923040006305,
"learning_rate": 1.0624069805367557e-07,
"loss": 0.3792,
"step": 194
},
{
"epoch": 3.7261146496815285,
"grad_norm": 0.8027532451126252,
"learning_rate": 1.0528156577106702e-07,
"loss": 0.3695,
"step": 195
},
{
"epoch": 3.745222929936306,
"grad_norm": 0.8005652974605109,
"learning_rate": 1.0432194600083739e-07,
"loss": 0.3844,
"step": 196
},
{
"epoch": 3.7643312101910826,
"grad_norm": 0.8538263494199514,
"learning_rate": 1.0336192731572803e-07,
"loss": 0.3728,
"step": 197
},
{
"epoch": 3.78343949044586,
"grad_norm": 0.858813703248373,
"learning_rate": 1.0240159832530007e-07,
"loss": 0.3982,
"step": 198
},
{
"epoch": 3.802547770700637,
"grad_norm": 0.8995549935364188,
"learning_rate": 1.0144104766775572e-07,
"loss": 0.4082,
"step": 199
},
{
"epoch": 3.821656050955414,
"grad_norm": 0.8240235309228603,
"learning_rate": 1.0048036400175708e-07,
"loss": 0.3817,
"step": 200
},
{
"epoch": 3.840764331210191,
"grad_norm": 0.7944949119815911,
"learning_rate": 9.951963599824293e-08,
"loss": 0.4014,
"step": 201
},
{
"epoch": 3.859872611464968,
"grad_norm": 0.8080274716468211,
"learning_rate": 9.855895233224429e-08,
"loss": 0.3874,
"step": 202
},
{
"epoch": 3.8789808917197455,
"grad_norm": 0.8704734628918741,
"learning_rate": 9.759840167469994e-08,
"loss": 0.3776,
"step": 203
},
{
"epoch": 3.8980891719745223,
"grad_norm": 0.7760511777395571,
"learning_rate": 9.663807268427197e-08,
"loss": 0.3834,
"step": 204
},
{
"epoch": 3.917197452229299,
"grad_norm": 0.8332364115746148,
"learning_rate": 9.567805399916259e-08,
"loss": 0.374,
"step": 205
},
{
"epoch": 3.9363057324840764,
"grad_norm": 0.8277765731528189,
"learning_rate": 9.471843422893297e-08,
"loss": 0.3868,
"step": 206
},
{
"epoch": 3.9554140127388537,
"grad_norm": 0.8608849591995096,
"learning_rate": 9.375930194632446e-08,
"loss": 0.385,
"step": 207
},
{
"epoch": 3.9745222929936306,
"grad_norm": 0.8787510622754615,
"learning_rate": 9.28007456790833e-08,
"loss": 0.3726,
"step": 208
},
{
"epoch": 3.9936305732484074,
"grad_norm": 0.7972052654644051,
"learning_rate": 9.184285390178977e-08,
"loss": 0.3775,
"step": 209
},
{
"epoch": 4.012738853503185,
"grad_norm": 0.8068665514966669,
"learning_rate": 9.088571502769167e-08,
"loss": 0.3872,
"step": 210
},
{
"epoch": 4.031847133757962,
"grad_norm": 0.8146646388290683,
"learning_rate": 8.992941740054417e-08,
"loss": 0.3878,
"step": 211
},
{
"epoch": 4.050955414012739,
"grad_norm": 0.8655592462822442,
"learning_rate": 8.897404928645527e-08,
"loss": 0.3886,
"step": 212
},
{
"epoch": 4.070063694267516,
"grad_norm": 0.7920711667344885,
"learning_rate": 8.801969886573929e-08,
"loss": 0.3854,
"step": 213
},
{
"epoch": 4.089171974522293,
"grad_norm": 0.8379714883887506,
"learning_rate": 8.706645422477737e-08,
"loss": 0.3691,
"step": 214
},
{
"epoch": 4.10828025477707,
"grad_norm": 0.8437233766568482,
"learning_rate": 8.611440334788762e-08,
"loss": 0.3744,
"step": 215
},
{
"epoch": 4.127388535031847,
"grad_norm": 0.8909392753897909,
"learning_rate": 8.516363410920375e-08,
"loss": 0.3962,
"step": 216
},
{
"epoch": 4.146496815286624,
"grad_norm": 0.8203045296400541,
"learning_rate": 8.42142342645646e-08,
"loss": 0.3862,
"step": 217
},
{
"epoch": 4.165605095541402,
"grad_norm": 0.7744540478897702,
"learning_rate": 8.326629144341405e-08,
"loss": 0.3642,
"step": 218
},
{
"epoch": 4.1847133757961785,
"grad_norm": 0.863804040299928,
"learning_rate": 8.231989314071316e-08,
"loss": 0.3964,
"step": 219
},
{
"epoch": 4.203821656050955,
"grad_norm": 0.801440996806109,
"learning_rate": 8.137512670886396e-08,
"loss": 0.3837,
"step": 220
},
{
"epoch": 4.222929936305732,
"grad_norm": 0.813319837107103,
"learning_rate": 8.04320793496472e-08,
"loss": 0.4017,
"step": 221
},
{
"epoch": 4.24203821656051,
"grad_norm": 0.8601056640259767,
"learning_rate": 7.949083810617357e-08,
"loss": 0.3857,
"step": 222
},
{
"epoch": 4.261146496815287,
"grad_norm": 0.8181368232094955,
"learning_rate": 7.855148985484945e-08,
"loss": 0.3812,
"step": 223
},
{
"epoch": 4.280254777070064,
"grad_norm": 0.8346128486570273,
"learning_rate": 7.761412129735851e-08,
"loss": 0.3852,
"step": 224
},
{
"epoch": 4.2993630573248405,
"grad_norm": 0.8108319089562595,
"learning_rate": 7.667881895265893e-08,
"loss": 0.3732,
"step": 225
},
{
"epoch": 4.318471337579618,
"grad_norm": 0.8157847645705257,
"learning_rate": 7.574566914899778e-08,
"loss": 0.3724,
"step": 226
},
{
"epoch": 4.337579617834395,
"grad_norm": 0.8304647677067434,
"learning_rate": 7.481475801594301e-08,
"loss": 0.3727,
"step": 227
},
{
"epoch": 4.356687898089172,
"grad_norm": 0.7881108642715712,
"learning_rate": 7.38861714764337e-08,
"loss": 0.3878,
"step": 228
},
{
"epoch": 4.375796178343949,
"grad_norm": 0.8099514584757647,
"learning_rate": 7.29599952388492e-08,
"loss": 0.3782,
"step": 229
},
{
"epoch": 4.3949044585987265,
"grad_norm": 0.8755861192813177,
"learning_rate": 7.203631478909857e-08,
"loss": 0.3689,
"step": 230
},
{
"epoch": 4.414012738853503,
"grad_norm": 0.7788977257515683,
"learning_rate": 7.111521538272996e-08,
"loss": 0.3685,
"step": 231
},
{
"epoch": 4.43312101910828,
"grad_norm": 0.8404128728999694,
"learning_rate": 7.019678203706163e-08,
"loss": 0.3904,
"step": 232
},
{
"epoch": 4.452229299363057,
"grad_norm": 0.8342544645155691,
"learning_rate": 6.928109952333506e-08,
"loss": 0.3962,
"step": 233
},
{
"epoch": 4.471337579617835,
"grad_norm": 0.7989245810139801,
"learning_rate": 6.836825235889018e-08,
"loss": 0.3964,
"step": 234
},
{
"epoch": 4.490445859872612,
"grad_norm": 0.8134468790899694,
"learning_rate": 6.74583247993649e-08,
"loss": 0.4079,
"step": 235
},
{
"epoch": 4.509554140127388,
"grad_norm": 0.8029199573915637,
"learning_rate": 6.655140083091793e-08,
"loss": 0.3887,
"step": 236
},
{
"epoch": 4.528662420382165,
"grad_norm": 0.7735464879684218,
"learning_rate": 6.56475641624771e-08,
"loss": 0.3738,
"step": 237
},
{
"epoch": 4.547770700636943,
"grad_norm": 0.8847195049321371,
"learning_rate": 6.474689821801294e-08,
"loss": 0.3777,
"step": 238
},
{
"epoch": 4.56687898089172,
"grad_norm": 0.846199572978138,
"learning_rate": 6.384948612883871e-08,
"loss": 0.3851,
"step": 239
},
{
"epoch": 4.585987261146497,
"grad_norm": 0.8240989805661456,
"learning_rate": 6.29554107259374e-08,
"loss": 0.3928,
"step": 240
},
{
"epoch": 4.6050955414012735,
"grad_norm": 0.835310477492176,
"learning_rate": 6.206475453231643e-08,
"loss": 0.3839,
"step": 241
},
{
"epoch": 4.624203821656051,
"grad_norm": 0.8319583235045928,
"learning_rate": 6.117759975539074e-08,
"loss": 0.3698,
"step": 242
},
{
"epoch": 4.643312101910828,
"grad_norm": 0.8181812452470683,
"learning_rate": 6.029402827939519e-08,
"loss": 0.3683,
"step": 243
},
{
"epoch": 4.662420382165605,
"grad_norm": 0.8783636137594268,
"learning_rate": 5.941412165782644e-08,
"loss": 0.3785,
"step": 244
},
{
"epoch": 4.681528662420382,
"grad_norm": 0.8165663248513478,
"learning_rate": 5.853796110591582e-08,
"loss": 0.3838,
"step": 245
},
{
"epoch": 4.7006369426751595,
"grad_norm": 0.8484264910287741,
"learning_rate": 5.7665627493133084e-08,
"loss": 0.3759,
"step": 246
},
{
"epoch": 4.719745222929936,
"grad_norm": 0.8285714683962552,
"learning_rate": 5.6797201335722055e-08,
"loss": 0.3824,
"step": 247
},
{
"epoch": 4.738853503184713,
"grad_norm": 0.8112577497500143,
"learning_rate": 5.593276278926912e-08,
"loss": 0.3645,
"step": 248
},
{
"epoch": 4.757961783439491,
"grad_norm": 0.8374711715413657,
"learning_rate": 5.5072391641305003e-08,
"loss": 0.3961,
"step": 249
},
{
"epoch": 4.777070063694268,
"grad_norm": 0.8019560325800055,
"learning_rate": 5.4216167303939996e-08,
"loss": 0.3838,
"step": 250
},
{
"epoch": 4.796178343949045,
"grad_norm": 0.8122463812859406,
"learning_rate": 5.33641688065346e-08,
"loss": 0.3731,
"step": 251
},
{
"epoch": 4.8152866242038215,
"grad_norm": 0.812582559765451,
"learning_rate": 5.251647478840511e-08,
"loss": 0.3858,
"step": 252
},
{
"epoch": 4.834394904458598,
"grad_norm": 0.8306986525054737,
"learning_rate": 5.167316349156494e-08,
"loss": 0.3934,
"step": 253
},
{
"epoch": 4.853503184713376,
"grad_norm": 0.8503018239839694,
"learning_rate": 5.0834312753503117e-08,
"loss": 0.3998,
"step": 254
},
{
"epoch": 4.872611464968153,
"grad_norm": 0.7713185185461976,
"learning_rate": 5.000000000000002e-08,
"loss": 0.397,
"step": 255
},
{
"epoch": 4.89171974522293,
"grad_norm": 0.8282244186999879,
"learning_rate": 4.9170302237980564e-08,
"loss": 0.3874,
"step": 256
},
{
"epoch": 4.9108280254777075,
"grad_norm": 0.8193062680508688,
"learning_rate": 4.8345296048406856e-08,
"loss": 0.3856,
"step": 257
},
{
"epoch": 4.929936305732484,
"grad_norm": 0.7737494059301828,
"learning_rate": 4.752505757920977e-08,
"loss": 0.3679,
"step": 258
},
{
"epoch": 4.949044585987261,
"grad_norm": 0.8625873751464171,
"learning_rate": 4.6709662538260266e-08,
"loss": 0.3743,
"step": 259
},
{
"epoch": 4.968152866242038,
"grad_norm": 0.8362733962777252,
"learning_rate": 4.5899186186381725e-08,
"loss": 0.4043,
"step": 260
},
{
"epoch": 4.987261146496815,
"grad_norm": 0.8032355034719745,
"learning_rate": 4.5093703330403374e-08,
"loss": 0.377,
"step": 261
},
{
"epoch": 5.006369426751593,
"grad_norm": 0.7748662472235436,
"learning_rate": 4.429328831625565e-08,
"loss": 0.386,
"step": 262
},
{
"epoch": 5.025477707006369,
"grad_norm": 0.8066132016507707,
"learning_rate": 4.3498015022108e-08,
"loss": 0.3887,
"step": 263
},
{
"epoch": 5.044585987261146,
"grad_norm": 0.7758364698198985,
"learning_rate": 4.270795685155001e-08,
"loss": 0.3826,
"step": 264
},
{
"epoch": 5.063694267515924,
"grad_norm": 0.7661247386701426,
"learning_rate": 4.1923186726816305e-08,
"loss": 0.3622,
"step": 265
},
{
"epoch": 5.082802547770701,
"grad_norm": 0.905331062267788,
"learning_rate": 4.114377708205571e-08,
"loss": 0.3933,
"step": 266
},
{
"epoch": 5.101910828025478,
"grad_norm": 0.8011338273436825,
"learning_rate": 4.036979985664566e-08,
"loss": 0.3928,
"step": 267
},
{
"epoch": 5.1210191082802545,
"grad_norm": 0.8194712847479001,
"learning_rate": 3.9601326488552255e-08,
"loss": 0.3817,
"step": 268
},
{
"epoch": 5.140127388535032,
"grad_norm": 0.7802367995341835,
"learning_rate": 3.883842790773647e-08,
"loss": 0.351,
"step": 269
},
{
"epoch": 5.159235668789809,
"grad_norm": 0.8239153884425812,
"learning_rate": 3.808117452960734e-08,
"loss": 0.3937,
"step": 270
},
{
"epoch": 5.178343949044586,
"grad_norm": 0.859267053614819,
"learning_rate": 3.732963624852274e-08,
"loss": 0.388,
"step": 271
},
{
"epoch": 5.197452229299363,
"grad_norm": 0.7725510162309273,
"learning_rate": 3.658388243133804e-08,
"loss": 0.3867,
"step": 272
},
{
"epoch": 5.2165605095541405,
"grad_norm": 0.8062649067737858,
"learning_rate": 3.584398191100341e-08,
"loss": 0.3778,
"step": 273
},
{
"epoch": 5.235668789808917,
"grad_norm": 0.8136961652490752,
"learning_rate": 3.5110002980210973e-08,
"loss": 0.3856,
"step": 274
},
{
"epoch": 5.254777070063694,
"grad_norm": 0.8069370792436726,
"learning_rate": 3.438201338509098e-08,
"loss": 0.381,
"step": 275
},
{
"epoch": 5.273885350318471,
"grad_norm": 0.7912165067167394,
"learning_rate": 3.366008031895904e-08,
"loss": 0.3947,
"step": 276
},
{
"epoch": 5.292993630573249,
"grad_norm": 0.7925916759275095,
"learning_rate": 3.294427041611425e-08,
"loss": 0.3663,
"step": 277
},
{
"epoch": 5.312101910828026,
"grad_norm": 0.8380350829763229,
"learning_rate": 3.223464974568874e-08,
"loss": 0.3998,
"step": 278
},
{
"epoch": 5.3312101910828025,
"grad_norm": 0.8059508700292909,
"learning_rate": 3.15312838055494e-08,
"loss": 0.3811,
"step": 279
},
{
"epoch": 5.350318471337579,
"grad_norm": 0.8537272020739671,
"learning_rate": 3.083423751625281e-08,
"loss": 0.3908,
"step": 280
},
{
"epoch": 5.369426751592357,
"grad_norm": 0.7769880588570798,
"learning_rate": 3.014357521505273e-08,
"loss": 0.3876,
"step": 281
},
{
"epoch": 5.388535031847134,
"grad_norm": 0.7970916009843863,
"learning_rate": 2.9459360649961896e-08,
"loss": 0.3915,
"step": 282
},
{
"epoch": 5.407643312101911,
"grad_norm": 0.7822357636738501,
"learning_rate": 2.878165697386812e-08,
"loss": 0.3925,
"step": 283
},
{
"epoch": 5.426751592356688,
"grad_norm": 0.8155762137747297,
"learning_rate": 2.811052673870534e-08,
"loss": 0.3804,
"step": 284
},
{
"epoch": 5.445859872611465,
"grad_norm": 0.8167709665350011,
"learning_rate": 2.7446031889679888e-08,
"loss": 0.341,
"step": 285
},
{
"epoch": 5.464968152866242,
"grad_norm": 0.8378049495290578,
"learning_rate": 2.6788233759553138e-08,
"loss": 0.383,
"step": 286
},
{
"epoch": 5.484076433121019,
"grad_norm": 0.852274854933172,
"learning_rate": 2.61371930629805e-08,
"loss": 0.3752,
"step": 287
},
{
"epoch": 5.503184713375796,
"grad_norm": 0.8005697568656431,
"learning_rate": 2.549296989090738e-08,
"loss": 0.3817,
"step": 288
},
{
"epoch": 5.522292993630574,
"grad_norm": 0.8111352520735085,
"learning_rate": 2.4855623705022788e-08,
"loss": 0.3924,
"step": 289
},
{
"epoch": 5.54140127388535,
"grad_norm": 0.8502144978449777,
"learning_rate": 2.4225213332271198e-08,
"loss": 0.3982,
"step": 290
},
{
"epoch": 5.560509554140127,
"grad_norm": 0.8175467759233411,
"learning_rate": 2.3601796959422582e-08,
"loss": 0.3713,
"step": 291
},
{
"epoch": 5.579617834394904,
"grad_norm": 0.8048011556362245,
"learning_rate": 2.2985432127701942e-08,
"loss": 0.3716,
"step": 292
},
{
"epoch": 5.598726114649682,
"grad_norm": 0.8222153363537893,
"learning_rate": 2.237617572747834e-08,
"loss": 0.3644,
"step": 293
},
{
"epoch": 5.617834394904459,
"grad_norm": 0.7973734267283885,
"learning_rate": 2.1774083993013716e-08,
"loss": 0.3768,
"step": 294
},
{
"epoch": 5.6369426751592355,
"grad_norm": 0.7885706909780547,
"learning_rate": 2.117921249727258e-08,
"loss": 0.3772,
"step": 295
},
{
"epoch": 5.656050955414012,
"grad_norm": 0.8460370600218118,
"learning_rate": 2.0591616146792702e-08,
"loss": 0.3722,
"step": 296
},
{
"epoch": 5.67515923566879,
"grad_norm": 0.7822794422028582,
"learning_rate": 2.001134917661713e-08,
"loss": 0.3833,
"step": 297
},
{
"epoch": 5.694267515923567,
"grad_norm": 0.8417179595540075,
"learning_rate": 1.9438465145288373e-08,
"loss": 0.3852,
"step": 298
},
{
"epoch": 5.713375796178344,
"grad_norm": 0.8260325891913302,
"learning_rate": 1.8873016929904938e-08,
"loss": 0.3822,
"step": 299
},
{
"epoch": 5.732484076433121,
"grad_norm": 0.8235232563783705,
"learning_rate": 1.831505672124083e-08,
"loss": 0.3909,
"step": 300
},
{
"epoch": 5.751592356687898,
"grad_norm": 0.82251787065623,
"learning_rate": 1.776463601892825e-08,
"loss": 0.3806,
"step": 301
},
{
"epoch": 5.770700636942675,
"grad_norm": 0.8219732366109578,
"learning_rate": 1.7221805626704277e-08,
"loss": 0.3932,
"step": 302
},
{
"epoch": 5.789808917197452,
"grad_norm": 0.8500419948581102,
"learning_rate": 1.6686615647721637e-08,
"loss": 0.3969,
"step": 303
},
{
"epoch": 5.80891719745223,
"grad_norm": 0.8195669552486191,
"learning_rate": 1.615911547992426e-08,
"loss": 0.3777,
"step": 304
},
{
"epoch": 5.828025477707007,
"grad_norm": 0.8203695492022786,
"learning_rate": 1.5639353811487744e-08,
"loss": 0.3683,
"step": 305
},
{
"epoch": 5.8471337579617835,
"grad_norm": 0.8228092581987145,
"learning_rate": 1.5127378616325602e-08,
"loss": 0.3779,
"step": 306
},
{
"epoch": 5.86624203821656,
"grad_norm": 0.7965205823215074,
"learning_rate": 1.4623237149661139e-08,
"loss": 0.3753,
"step": 307
},
{
"epoch": 5.885350318471337,
"grad_norm": 0.8408800872828248,
"learning_rate": 1.4126975943665842e-08,
"loss": 0.377,
"step": 308
},
{
"epoch": 5.904458598726115,
"grad_norm": 0.8319451588726945,
"learning_rate": 1.3638640803164514e-08,
"loss": 0.3891,
"step": 309
},
{
"epoch": 5.923566878980892,
"grad_norm": 0.8007835686361502,
"learning_rate": 1.3158276801407431e-08,
"loss": 0.3858,
"step": 310
},
{
"epoch": 5.942675159235669,
"grad_norm": 0.8095992909101725,
"learning_rate": 1.268592827591014e-08,
"loss": 0.3629,
"step": 311
},
{
"epoch": 5.961783439490446,
"grad_norm": 0.7963300257867078,
"learning_rate": 1.2221638824361069e-08,
"loss": 0.3782,
"step": 312
},
{
"epoch": 5.980891719745223,
"grad_norm": 0.7726630979855738,
"learning_rate": 1.1765451300597573e-08,
"loss": 0.3834,
"step": 313
},
{
"epoch": 6.0,
"grad_norm": 0.7479398982894709,
"learning_rate": 1.131740781065037e-08,
"loss": 0.3633,
"step": 314
},
{
"epoch": 6.019108280254777,
"grad_norm": 0.7967021599978159,
"learning_rate": 1.0877549708857225e-08,
"loss": 0.3777,
"step": 315
},
{
"epoch": 6.038216560509555,
"grad_norm": 0.7649932643254016,
"learning_rate": 1.0445917594046071e-08,
"loss": 0.3533,
"step": 316
},
{
"epoch": 6.057324840764331,
"grad_norm": 0.7847115578358245,
"learning_rate": 1.0022551305787563e-08,
"loss": 0.3847,
"step": 317
},
{
"epoch": 6.076433121019108,
"grad_norm": 0.8113396532764215,
"learning_rate": 9.607489920717981e-09,
"loss": 0.4053,
"step": 318
},
{
"epoch": 6.095541401273885,
"grad_norm": 0.7933481355886415,
"learning_rate": 9.200771748932512e-09,
"loss": 0.3712,
"step": 319
},
{
"epoch": 6.114649681528663,
"grad_norm": 0.8031446051590815,
"learning_rate": 8.802434330449127e-09,
"loss": 0.3706,
"step": 320
},
{
"epoch": 6.13375796178344,
"grad_norm": 0.7899975029998316,
"learning_rate": 8.412514431743656e-09,
"loss": 0.3926,
"step": 321
},
{
"epoch": 6.1528662420382165,
"grad_norm": 0.83711210181786,
"learning_rate": 8.031048042356392e-09,
"loss": 0.3828,
"step": 322
},
{
"epoch": 6.171974522292993,
"grad_norm": 0.8108835667977151,
"learning_rate": 7.65807037157007e-09,
"loss": 0.3953,
"step": 323
},
{
"epoch": 6.191082802547771,
"grad_norm": 0.7940213310862922,
"learning_rate": 7.293615845160195e-09,
"loss": 0.3798,
"step": 324
},
{
"epoch": 6.210191082802548,
"grad_norm": 0.8090923493531231,
"learning_rate": 6.9377181022174604e-09,
"loss": 0.3489,
"step": 325
},
{
"epoch": 6.229299363057325,
"grad_norm": 0.8150664351790909,
"learning_rate": 6.590409992042956e-09,
"loss": 0.3652,
"step": 326
},
{
"epoch": 6.248407643312102,
"grad_norm": 0.8247491005855746,
"learning_rate": 6.25172357111603e-09,
"loss": 0.3793,
"step": 327
},
{
"epoch": 6.267515923566879,
"grad_norm": 0.8107418363338103,
"learning_rate": 5.921690100135712e-09,
"loss": 0.3737,
"step": 328
},
{
"epoch": 6.286624203821656,
"grad_norm": 0.7755128906973803,
"learning_rate": 5.600340041135132e-09,
"loss": 0.3662,
"step": 329
},
{
"epoch": 6.305732484076433,
"grad_norm": 0.7795624367146271,
"learning_rate": 5.2877030546700115e-09,
"loss": 0.3737,
"step": 330
},
{
"epoch": 6.32484076433121,
"grad_norm": 0.7803109557900149,
"learning_rate": 4.9838079970809245e-09,
"loss": 0.3644,
"step": 331
},
{
"epoch": 6.343949044585988,
"grad_norm": 0.8101411487647653,
"learning_rate": 4.688682917829967e-09,
"loss": 0.3822,
"step": 332
},
{
"epoch": 6.3630573248407645,
"grad_norm": 0.7817124326904151,
"learning_rate": 4.402355056911655e-09,
"loss": 0.3877,
"step": 333
},
{
"epoch": 6.382165605095541,
"grad_norm": 0.7937959337403729,
"learning_rate": 4.124850842338778e-09,
"loss": 0.3831,
"step": 334
},
{
"epoch": 6.401273885350318,
"grad_norm": 0.8137404165669514,
"learning_rate": 3.856195887703095e-09,
"loss": 0.3774,
"step": 335
},
{
"epoch": 6.420382165605096,
"grad_norm": 0.7855307116650134,
"learning_rate": 3.5964149898111585e-09,
"loss": 0.3837,
"step": 336
},
{
"epoch": 6.439490445859873,
"grad_norm": 0.8358282343705817,
"learning_rate": 3.345532126395578e-09,
"loss": 0.3764,
"step": 337
},
{
"epoch": 6.45859872611465,
"grad_norm": 0.8278966939308083,
"learning_rate": 3.103570453901938e-09,
"loss": 0.3764,
"step": 338
},
{
"epoch": 6.477707006369426,
"grad_norm": 0.8027521273066138,
"learning_rate": 2.8705523053513814e-09,
"loss": 0.3924,
"step": 339
},
{
"epoch": 6.496815286624204,
"grad_norm": 0.8392704792372261,
"learning_rate": 2.6464991882793277e-09,
"loss": 0.3735,
"step": 340
},
{
"epoch": 6.515923566878981,
"grad_norm": 0.8018405582197404,
"learning_rate": 2.4314317827503373e-09,
"loss": 0.4072,
"step": 341
},
{
"epoch": 6.535031847133758,
"grad_norm": 0.8060724582366596,
"learning_rate": 2.2253699394493065e-09,
"loss": 0.3985,
"step": 342
},
{
"epoch": 6.554140127388535,
"grad_norm": 0.7983964289622926,
"learning_rate": 2.0283326778492536e-09,
"loss": 0.3623,
"step": 343
},
{
"epoch": 6.573248407643312,
"grad_norm": 0.804309012435283,
"learning_rate": 1.8403381844558808e-09,
"loss": 0.3869,
"step": 344
},
{
"epoch": 6.592356687898089,
"grad_norm": 0.8306246032091474,
"learning_rate": 1.661403811128903e-09,
"loss": 0.3854,
"step": 345
},
{
"epoch": 6.611464968152866,
"grad_norm": 0.7989197463157789,
"learning_rate": 1.4915460734805096e-09,
"loss": 0.3805,
"step": 346
},
{
"epoch": 6.630573248407643,
"grad_norm": 0.7970938285648654,
"learning_rate": 1.3307806493509377e-09,
"loss": 0.3928,
"step": 347
},
{
"epoch": 6.649681528662421,
"grad_norm": 0.7755028655582555,
"learning_rate": 1.1791223773614634e-09,
"loss": 0.3836,
"step": 348
},
{
"epoch": 6.6687898089171975,
"grad_norm": 0.8335219559439776,
"learning_rate": 1.036585255544764e-09,
"loss": 0.3884,
"step": 349
},
{
"epoch": 6.687898089171974,
"grad_norm": 0.8565667245306822,
"learning_rate": 9.031824400528854e-10,
"loss": 0.3725,
"step": 350
},
{
"epoch": 6.707006369426751,
"grad_norm": 0.7761233262394732,
"learning_rate": 7.789262439430012e-10,
"loss": 0.3911,
"step": 351
},
{
"epoch": 6.726114649681529,
"grad_norm": 0.7917666091257868,
"learning_rate": 6.638281360408338e-10,
"loss": 0.3621,
"step": 352
},
{
"epoch": 6.745222929936306,
"grad_norm": 0.8117653905421635,
"learning_rate": 5.578987398821344e-10,
"loss": 0.389,
"step": 353
},
{
"epoch": 6.764331210191083,
"grad_norm": 0.77977323640196,
"learning_rate": 4.611478327321339e-10,
"loss": 0.3728,
"step": 354
},
{
"epoch": 6.7834394904458595,
"grad_norm": 0.8277863837971791,
"learning_rate": 3.735843446830866e-10,
"loss": 0.3943,
"step": 355
},
{
"epoch": 6.802547770700637,
"grad_norm": 0.8004170000824867,
"learning_rate": 2.952163578300193e-10,
"loss": 0.3571,
"step": 356
},
{
"epoch": 6.821656050955414,
"grad_norm": 0.8632389118864879,
"learning_rate": 2.2605110552477157e-10,
"loss": 0.4046,
"step": 357
},
{
"epoch": 6.840764331210191,
"grad_norm": 0.8235570172827087,
"learning_rate": 1.6609497170834154e-10,
"loss": 0.4033,
"step": 358
},
{
"epoch": 6.859872611464969,
"grad_norm": 0.8187520426135899,
"learning_rate": 1.1535349032167907e-10,
"loss": 0.3824,
"step": 359
},
{
"epoch": 6.8789808917197455,
"grad_norm": 0.8083284571775858,
"learning_rate": 7.38313447948724e-11,
"loss": 0.3867,
"step": 360
},
{
"epoch": 6.898089171974522,
"grad_norm": 0.8626430719203882,
"learning_rate": 4.153236761488266e-11,
"loss": 0.3762,
"step": 361
},
{
"epoch": 6.917197452229299,
"grad_norm": 0.8074033893142732,
"learning_rate": 1.8459539971804605e-11,
"loss": 0.3756,
"step": 362
},
{
"epoch": 6.936305732484076,
"grad_norm": 0.7789034490224797,
"learning_rate": 4.614991483686825e-12,
"loss": 0.3659,
"step": 363
},
{
"epoch": 6.955414012738854,
"grad_norm": 0.8786861995300276,
"learning_rate": 0.0,
"loss": 0.3722,
"step": 364
},
{
"epoch": 6.955414012738854,
"step": 364,
"total_flos": 1.6645198251751014e+17,
"train_loss": 0.407676433632662,
"train_runtime": 2489.5823,
"train_samples_per_second": 14.059,
"train_steps_per_second": 0.146
}
],
"logging_steps": 1,
"max_steps": 364,
"num_input_tokens_seen": 0,
"num_train_epochs": 7,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.6645198251751014e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}