{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 0,
"global_step": 633,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001579778830963665,
"grad_norm": 3.3021833896636963,
"learning_rate": 1e-05,
"loss": 0.8142,
"step": 1
},
{
"epoch": 0.00315955766192733,
"grad_norm": 0.5667713284492493,
"learning_rate": 9.984202211690363e-06,
"loss": 0.4081,
"step": 2
},
{
"epoch": 0.004739336492890996,
"grad_norm": 7.904314994812012,
"learning_rate": 9.968404423380728e-06,
"loss": 1.1876,
"step": 3
},
{
"epoch": 0.00631911532385466,
"grad_norm": 10.157713890075684,
"learning_rate": 9.95260663507109e-06,
"loss": 1.4092,
"step": 4
},
{
"epoch": 0.007898894154818325,
"grad_norm": 4.723056316375732,
"learning_rate": 9.936808846761454e-06,
"loss": 0.7578,
"step": 5
},
{
"epoch": 0.009478672985781991,
"grad_norm": 7.033465385437012,
"learning_rate": 9.921011058451816e-06,
"loss": 0.5175,
"step": 6
},
{
"epoch": 0.011058451816745656,
"grad_norm": 0.800440788269043,
"learning_rate": 9.905213270142182e-06,
"loss": 0.4077,
"step": 7
},
{
"epoch": 0.01263823064770932,
"grad_norm": 0.6944026350975037,
"learning_rate": 9.889415481832544e-06,
"loss": 0.4686,
"step": 8
},
{
"epoch": 0.014218009478672985,
"grad_norm": 0.5700448751449585,
"learning_rate": 9.873617693522908e-06,
"loss": 0.3623,
"step": 9
},
{
"epoch": 0.01579778830963665,
"grad_norm": 0.7115408778190613,
"learning_rate": 9.85781990521327e-06,
"loss": 0.4727,
"step": 10
},
{
"epoch": 0.017377567140600316,
"grad_norm": 0.5764197707176208,
"learning_rate": 9.842022116903635e-06,
"loss": 0.4054,
"step": 11
},
{
"epoch": 0.018957345971563982,
"grad_norm": 0.615205705165863,
"learning_rate": 9.826224328593997e-06,
"loss": 0.3798,
"step": 12
},
{
"epoch": 0.020537124802527645,
"grad_norm": 0.6402739882469177,
"learning_rate": 9.810426540284361e-06,
"loss": 0.3966,
"step": 13
},
{
"epoch": 0.022116903633491312,
"grad_norm": 0.6007937788963318,
"learning_rate": 9.794628751974725e-06,
"loss": 0.4158,
"step": 14
},
{
"epoch": 0.023696682464454975,
"grad_norm": 0.5462563037872314,
"learning_rate": 9.778830963665089e-06,
"loss": 0.4795,
"step": 15
},
{
"epoch": 0.02527646129541864,
"grad_norm": 0.6038461923599243,
"learning_rate": 9.76303317535545e-06,
"loss": 0.4142,
"step": 16
},
{
"epoch": 0.026856240126382307,
"grad_norm": 0.514258861541748,
"learning_rate": 9.747235387045815e-06,
"loss": 0.4139,
"step": 17
},
{
"epoch": 0.02843601895734597,
"grad_norm": 0.728235125541687,
"learning_rate": 9.731437598736178e-06,
"loss": 0.3129,
"step": 18
},
{
"epoch": 0.030015797788309637,
"grad_norm": 0.7013534307479858,
"learning_rate": 9.715639810426542e-06,
"loss": 0.4275,
"step": 19
},
{
"epoch": 0.0315955766192733,
"grad_norm": 0.6062476634979248,
"learning_rate": 9.699842022116904e-06,
"loss": 0.3961,
"step": 20
},
{
"epoch": 0.03317535545023697,
"grad_norm": 0.6089779138565063,
"learning_rate": 9.684044233807268e-06,
"loss": 0.4972,
"step": 21
},
{
"epoch": 0.03475513428120063,
"grad_norm": 0.6651365756988525,
"learning_rate": 9.668246445497632e-06,
"loss": 0.4714,
"step": 22
},
{
"epoch": 0.036334913112164295,
"grad_norm": 0.6064260601997375,
"learning_rate": 9.652448657187995e-06,
"loss": 0.4358,
"step": 23
},
{
"epoch": 0.037914691943127965,
"grad_norm": 0.5868542790412903,
"learning_rate": 9.636650868878358e-06,
"loss": 0.5178,
"step": 24
},
{
"epoch": 0.03949447077409163,
"grad_norm": 0.6516690850257874,
"learning_rate": 9.620853080568721e-06,
"loss": 0.4281,
"step": 25
},
{
"epoch": 0.04107424960505529,
"grad_norm": 0.7721027731895447,
"learning_rate": 9.605055292259085e-06,
"loss": 0.4979,
"step": 26
},
{
"epoch": 0.04265402843601896,
"grad_norm": 0.6200973987579346,
"learning_rate": 9.589257503949447e-06,
"loss": 0.347,
"step": 27
},
{
"epoch": 0.044233807266982623,
"grad_norm": 0.6557235717773438,
"learning_rate": 9.573459715639811e-06,
"loss": 0.3422,
"step": 28
},
{
"epoch": 0.045813586097946286,
"grad_norm": 1.0422502756118774,
"learning_rate": 9.557661927330175e-06,
"loss": 0.4955,
"step": 29
},
{
"epoch": 0.04739336492890995,
"grad_norm": 0.8272190093994141,
"learning_rate": 9.541864139020539e-06,
"loss": 0.434,
"step": 30
},
{
"epoch": 0.04897314375987362,
"grad_norm": 0.5929948091506958,
"learning_rate": 9.5260663507109e-06,
"loss": 0.5042,
"step": 31
},
{
"epoch": 0.05055292259083728,
"grad_norm": 0.7872880101203918,
"learning_rate": 9.510268562401264e-06,
"loss": 0.5175,
"step": 32
},
{
"epoch": 0.052132701421800945,
"grad_norm": 0.6884463429450989,
"learning_rate": 9.494470774091628e-06,
"loss": 0.5104,
"step": 33
},
{
"epoch": 0.053712480252764615,
"grad_norm": 1.215976357460022,
"learning_rate": 9.478672985781992e-06,
"loss": 0.4742,
"step": 34
},
{
"epoch": 0.05529225908372828,
"grad_norm": 0.7471550107002258,
"learning_rate": 9.462875197472354e-06,
"loss": 0.4374,
"step": 35
},
{
"epoch": 0.05687203791469194,
"grad_norm": 0.6779741048812866,
"learning_rate": 9.447077409162718e-06,
"loss": 0.4337,
"step": 36
},
{
"epoch": 0.05845181674565561,
"grad_norm": 0.5205997824668884,
"learning_rate": 9.431279620853082e-06,
"loss": 0.4296,
"step": 37
},
{
"epoch": 0.06003159557661927,
"grad_norm": 0.381757527589798,
"learning_rate": 9.415481832543445e-06,
"loss": 0.2223,
"step": 38
},
{
"epoch": 0.061611374407582936,
"grad_norm": 0.650593101978302,
"learning_rate": 9.399684044233807e-06,
"loss": 0.5066,
"step": 39
},
{
"epoch": 0.0631911532385466,
"grad_norm": 0.5445153117179871,
"learning_rate": 9.383886255924171e-06,
"loss": 0.4998,
"step": 40
},
{
"epoch": 0.06477093206951026,
"grad_norm": 0.5024020671844482,
"learning_rate": 9.368088467614535e-06,
"loss": 0.4121,
"step": 41
},
{
"epoch": 0.06635071090047394,
"grad_norm": 0.6259915232658386,
"learning_rate": 9.352290679304899e-06,
"loss": 0.4969,
"step": 42
},
{
"epoch": 0.0679304897314376,
"grad_norm": 0.49405789375305176,
"learning_rate": 9.336492890995261e-06,
"loss": 0.4121,
"step": 43
},
{
"epoch": 0.06951026856240126,
"grad_norm": 0.7586628198623657,
"learning_rate": 9.320695102685625e-06,
"loss": 0.4782,
"step": 44
},
{
"epoch": 0.07109004739336493,
"grad_norm": 0.6203773021697998,
"learning_rate": 9.304897314375988e-06,
"loss": 0.3579,
"step": 45
},
{
"epoch": 0.07266982622432859,
"grad_norm": 0.6982845067977905,
"learning_rate": 9.289099526066352e-06,
"loss": 0.3876,
"step": 46
},
{
"epoch": 0.07424960505529225,
"grad_norm": 0.5712842345237732,
"learning_rate": 9.273301737756714e-06,
"loss": 0.4288,
"step": 47
},
{
"epoch": 0.07582938388625593,
"grad_norm": 0.6829891204833984,
"learning_rate": 9.257503949447078e-06,
"loss": 0.4939,
"step": 48
},
{
"epoch": 0.07740916271721959,
"grad_norm": 0.5508958101272583,
"learning_rate": 9.241706161137442e-06,
"loss": 0.372,
"step": 49
},
{
"epoch": 0.07898894154818326,
"grad_norm": 0.9345032572746277,
"learning_rate": 9.225908372827806e-06,
"loss": 0.4896,
"step": 50
},
{
"epoch": 0.08056872037914692,
"grad_norm": 0.6280492544174194,
"learning_rate": 9.210110584518168e-06,
"loss": 0.4375,
"step": 51
},
{
"epoch": 0.08214849921011058,
"grad_norm": 0.6853601336479187,
"learning_rate": 9.194312796208532e-06,
"loss": 0.4294,
"step": 52
},
{
"epoch": 0.08372827804107424,
"grad_norm": 0.6665984392166138,
"learning_rate": 9.178515007898895e-06,
"loss": 0.5894,
"step": 53
},
{
"epoch": 0.08530805687203792,
"grad_norm": 0.5088407397270203,
"learning_rate": 9.162717219589257e-06,
"loss": 0.3853,
"step": 54
},
{
"epoch": 0.08688783570300158,
"grad_norm": 0.5319867730140686,
"learning_rate": 9.146919431279621e-06,
"loss": 0.4791,
"step": 55
},
{
"epoch": 0.08846761453396525,
"grad_norm": 0.6452597975730896,
"learning_rate": 9.131121642969985e-06,
"loss": 0.4056,
"step": 56
},
{
"epoch": 0.09004739336492891,
"grad_norm": 0.6769601106643677,
"learning_rate": 9.115323854660349e-06,
"loss": 0.4253,
"step": 57
},
{
"epoch": 0.09162717219589257,
"grad_norm": 0.5170547962188721,
"learning_rate": 9.09952606635071e-06,
"loss": 0.4211,
"step": 58
},
{
"epoch": 0.09320695102685624,
"grad_norm": 0.5035193562507629,
"learning_rate": 9.083728278041075e-06,
"loss": 0.3144,
"step": 59
},
{
"epoch": 0.0947867298578199,
"grad_norm": 0.5919070243835449,
"learning_rate": 9.067930489731438e-06,
"loss": 0.4533,
"step": 60
},
{
"epoch": 0.09636650868878358,
"grad_norm": 0.6510637998580933,
"learning_rate": 9.052132701421802e-06,
"loss": 0.4701,
"step": 61
},
{
"epoch": 0.09794628751974724,
"grad_norm": 0.5784177780151367,
"learning_rate": 9.036334913112164e-06,
"loss": 0.3896,
"step": 62
},
{
"epoch": 0.0995260663507109,
"grad_norm": 0.7009139060974121,
"learning_rate": 9.020537124802528e-06,
"loss": 0.5018,
"step": 63
},
{
"epoch": 0.10110584518167456,
"grad_norm": 0.5086057186126709,
"learning_rate": 9.004739336492892e-06,
"loss": 0.4305,
"step": 64
},
{
"epoch": 0.10268562401263823,
"grad_norm": 0.5124595761299133,
"learning_rate": 8.988941548183256e-06,
"loss": 0.4473,
"step": 65
},
{
"epoch": 0.10426540284360189,
"grad_norm": 0.6409702897071838,
"learning_rate": 8.973143759873618e-06,
"loss": 0.429,
"step": 66
},
{
"epoch": 0.10584518167456557,
"grad_norm": 0.5651409029960632,
"learning_rate": 8.957345971563981e-06,
"loss": 0.4036,
"step": 67
},
{
"epoch": 0.10742496050552923,
"grad_norm": 0.6658238172531128,
"learning_rate": 8.941548183254345e-06,
"loss": 0.4726,
"step": 68
},
{
"epoch": 0.10900473933649289,
"grad_norm": 0.444815993309021,
"learning_rate": 8.925750394944709e-06,
"loss": 0.4016,
"step": 69
},
{
"epoch": 0.11058451816745656,
"grad_norm": 0.5855506658554077,
"learning_rate": 8.909952606635071e-06,
"loss": 0.4531,
"step": 70
},
{
"epoch": 0.11216429699842022,
"grad_norm": 0.693794310092926,
"learning_rate": 8.894154818325435e-06,
"loss": 0.4382,
"step": 71
},
{
"epoch": 0.11374407582938388,
"grad_norm": 0.6658089756965637,
"learning_rate": 8.878357030015799e-06,
"loss": 0.4571,
"step": 72
},
{
"epoch": 0.11532385466034756,
"grad_norm": 1.0504828691482544,
"learning_rate": 8.862559241706162e-06,
"loss": 0.4311,
"step": 73
},
{
"epoch": 0.11690363349131122,
"grad_norm": 0.5297814607620239,
"learning_rate": 8.846761453396524e-06,
"loss": 0.4391,
"step": 74
},
{
"epoch": 0.11848341232227488,
"grad_norm": 0.6601409316062927,
"learning_rate": 8.830963665086888e-06,
"loss": 0.5125,
"step": 75
},
{
"epoch": 0.12006319115323855,
"grad_norm": 0.6345618963241577,
"learning_rate": 8.815165876777252e-06,
"loss": 0.4471,
"step": 76
},
{
"epoch": 0.12164296998420221,
"grad_norm": 0.5008222460746765,
"learning_rate": 8.799368088467614e-06,
"loss": 0.3845,
"step": 77
},
{
"epoch": 0.12322274881516587,
"grad_norm": 0.5394203066825867,
"learning_rate": 8.783570300157978e-06,
"loss": 0.4117,
"step": 78
},
{
"epoch": 0.12480252764612954,
"grad_norm": 0.6255345940589905,
"learning_rate": 8.767772511848342e-06,
"loss": 0.512,
"step": 79
},
{
"epoch": 0.1263823064770932,
"grad_norm": 0.6215748190879822,
"learning_rate": 8.751974723538705e-06,
"loss": 0.509,
"step": 80
},
{
"epoch": 0.12796208530805686,
"grad_norm": 0.611587405204773,
"learning_rate": 8.736176935229068e-06,
"loss": 0.4036,
"step": 81
},
{
"epoch": 0.12954186413902052,
"grad_norm": 0.5373330116271973,
"learning_rate": 8.720379146919431e-06,
"loss": 0.393,
"step": 82
},
{
"epoch": 0.13112164296998421,
"grad_norm": 0.5936598181724548,
"learning_rate": 8.704581358609795e-06,
"loss": 0.4092,
"step": 83
},
{
"epoch": 0.13270142180094788,
"grad_norm": 0.576614260673523,
"learning_rate": 8.688783570300159e-06,
"loss": 0.5513,
"step": 84
},
{
"epoch": 0.13428120063191154,
"grad_norm": 0.5715078711509705,
"learning_rate": 8.672985781990521e-06,
"loss": 0.4403,
"step": 85
},
{
"epoch": 0.1358609794628752,
"grad_norm": 0.6212042570114136,
"learning_rate": 8.657187993680885e-06,
"loss": 0.391,
"step": 86
},
{
"epoch": 0.13744075829383887,
"grad_norm": 0.5439122319221497,
"learning_rate": 8.641390205371249e-06,
"loss": 0.4764,
"step": 87
},
{
"epoch": 0.13902053712480253,
"grad_norm": 0.6808428168296814,
"learning_rate": 8.625592417061612e-06,
"loss": 0.512,
"step": 88
},
{
"epoch": 0.1406003159557662,
"grad_norm": 0.7429847717285156,
"learning_rate": 8.609794628751974e-06,
"loss": 0.3834,
"step": 89
},
{
"epoch": 0.14218009478672985,
"grad_norm": 0.6030511260032654,
"learning_rate": 8.59399684044234e-06,
"loss": 0.4631,
"step": 90
},
{
"epoch": 0.14375987361769352,
"grad_norm": 0.6499682068824768,
"learning_rate": 8.578199052132702e-06,
"loss": 0.4484,
"step": 91
},
{
"epoch": 0.14533965244865718,
"grad_norm": 0.6490275859832764,
"learning_rate": 8.562401263823066e-06,
"loss": 0.414,
"step": 92
},
{
"epoch": 0.14691943127962084,
"grad_norm": 0.6859791874885559,
"learning_rate": 8.546603475513428e-06,
"loss": 0.386,
"step": 93
},
{
"epoch": 0.1484992101105845,
"grad_norm": 0.5281291007995605,
"learning_rate": 8.530805687203793e-06,
"loss": 0.4036,
"step": 94
},
{
"epoch": 0.1500789889415482,
"grad_norm": 0.5261964797973633,
"learning_rate": 8.515007898894155e-06,
"loss": 0.33,
"step": 95
},
{
"epoch": 0.15165876777251186,
"grad_norm": 0.4350665211677551,
"learning_rate": 8.499210110584519e-06,
"loss": 0.3347,
"step": 96
},
{
"epoch": 0.15323854660347552,
"grad_norm": 0.8448456525802612,
"learning_rate": 8.483412322274883e-06,
"loss": 0.4253,
"step": 97
},
{
"epoch": 0.15481832543443919,
"grad_norm": 0.6256837248802185,
"learning_rate": 8.467614533965247e-06,
"loss": 0.4464,
"step": 98
},
{
"epoch": 0.15639810426540285,
"grad_norm": 0.7007749676704407,
"learning_rate": 8.451816745655609e-06,
"loss": 0.4641,
"step": 99
},
{
"epoch": 0.1579778830963665,
"grad_norm": 0.6551494002342224,
"learning_rate": 8.436018957345973e-06,
"loss": 0.5097,
"step": 100
},
{
"epoch": 0.15955766192733017,
"grad_norm": 0.5944113731384277,
"learning_rate": 8.420221169036336e-06,
"loss": 0.4554,
"step": 101
},
{
"epoch": 0.16113744075829384,
"grad_norm": 0.5755615234375,
"learning_rate": 8.4044233807267e-06,
"loss": 0.443,
"step": 102
},
{
"epoch": 0.1627172195892575,
"grad_norm": 0.5263962745666504,
"learning_rate": 8.388625592417062e-06,
"loss": 0.4355,
"step": 103
},
{
"epoch": 0.16429699842022116,
"grad_norm": 0.6115814447402954,
"learning_rate": 8.372827804107424e-06,
"loss": 0.4863,
"step": 104
},
{
"epoch": 0.16587677725118483,
"grad_norm": 0.5544970631599426,
"learning_rate": 8.35703001579779e-06,
"loss": 0.3979,
"step": 105
},
{
"epoch": 0.1674565560821485,
"grad_norm": 0.5588533878326416,
"learning_rate": 8.341232227488152e-06,
"loss": 0.4073,
"step": 106
},
{
"epoch": 0.16903633491311215,
"grad_norm": 0.578982949256897,
"learning_rate": 8.325434439178516e-06,
"loss": 0.3745,
"step": 107
},
{
"epoch": 0.17061611374407584,
"grad_norm": 0.4955246150493622,
"learning_rate": 8.30963665086888e-06,
"loss": 0.438,
"step": 108
},
{
"epoch": 0.1721958925750395,
"grad_norm": 0.593362033367157,
"learning_rate": 8.293838862559243e-06,
"loss": 0.4161,
"step": 109
},
{
"epoch": 0.17377567140600317,
"grad_norm": 0.5000883340835571,
"learning_rate": 8.278041074249605e-06,
"loss": 0.432,
"step": 110
},
{
"epoch": 0.17535545023696683,
"grad_norm": 0.5794082880020142,
"learning_rate": 8.262243285939969e-06,
"loss": 0.4431,
"step": 111
},
{
"epoch": 0.1769352290679305,
"grad_norm": 0.6179563999176025,
"learning_rate": 8.246445497630333e-06,
"loss": 0.3871,
"step": 112
},
{
"epoch": 0.17851500789889416,
"grad_norm": 0.6540956497192383,
"learning_rate": 8.230647709320697e-06,
"loss": 0.3706,
"step": 113
},
{
"epoch": 0.18009478672985782,
"grad_norm": 0.7029737234115601,
"learning_rate": 8.214849921011059e-06,
"loss": 0.5077,
"step": 114
},
{
"epoch": 0.18167456556082148,
"grad_norm": 0.5466600656509399,
"learning_rate": 8.199052132701422e-06,
"loss": 0.4634,
"step": 115
},
{
"epoch": 0.18325434439178515,
"grad_norm": 0.5513831973075867,
"learning_rate": 8.183254344391786e-06,
"loss": 0.4457,
"step": 116
},
{
"epoch": 0.1848341232227488,
"grad_norm": 0.7652455568313599,
"learning_rate": 8.16745655608215e-06,
"loss": 0.4376,
"step": 117
},
{
"epoch": 0.18641390205371247,
"grad_norm": 0.6213077902793884,
"learning_rate": 8.151658767772512e-06,
"loss": 0.3988,
"step": 118
},
{
"epoch": 0.18799368088467613,
"grad_norm": 0.50051349401474,
"learning_rate": 8.135860979462876e-06,
"loss": 0.4142,
"step": 119
},
{
"epoch": 0.1895734597156398,
"grad_norm": 0.8015328049659729,
"learning_rate": 8.12006319115324e-06,
"loss": 0.4474,
"step": 120
},
{
"epoch": 0.1911532385466035,
"grad_norm": 0.6595532298088074,
"learning_rate": 8.104265402843603e-06,
"loss": 0.5173,
"step": 121
},
{
"epoch": 0.19273301737756715,
"grad_norm": 0.7859697937965393,
"learning_rate": 8.088467614533966e-06,
"loss": 0.4465,
"step": 122
},
{
"epoch": 0.1943127962085308,
"grad_norm": 0.6508023738861084,
"learning_rate": 8.07266982622433e-06,
"loss": 0.4448,
"step": 123
},
{
"epoch": 0.19589257503949448,
"grad_norm": 0.49232304096221924,
"learning_rate": 8.056872037914693e-06,
"loss": 0.4005,
"step": 124
},
{
"epoch": 0.19747235387045814,
"grad_norm": 0.6464349031448364,
"learning_rate": 8.041074249605057e-06,
"loss": 0.47,
"step": 125
},
{
"epoch": 0.1990521327014218,
"grad_norm": 0.5296919345855713,
"learning_rate": 8.025276461295419e-06,
"loss": 0.4247,
"step": 126
},
{
"epoch": 0.20063191153238547,
"grad_norm": 0.6270297765731812,
"learning_rate": 8.009478672985783e-06,
"loss": 0.5397,
"step": 127
},
{
"epoch": 0.20221169036334913,
"grad_norm": 0.6148909330368042,
"learning_rate": 7.993680884676147e-06,
"loss": 0.4133,
"step": 128
},
{
"epoch": 0.2037914691943128,
"grad_norm": 0.7778130173683167,
"learning_rate": 7.977883096366509e-06,
"loss": 0.5119,
"step": 129
},
{
"epoch": 0.20537124802527645,
"grad_norm": 0.47952044010162354,
"learning_rate": 7.962085308056872e-06,
"loss": 0.386,
"step": 130
},
{
"epoch": 0.20695102685624012,
"grad_norm": 0.5951160788536072,
"learning_rate": 7.946287519747236e-06,
"loss": 0.5101,
"step": 131
},
{
"epoch": 0.20853080568720378,
"grad_norm": 0.6209789514541626,
"learning_rate": 7.9304897314376e-06,
"loss": 0.4988,
"step": 132
},
{
"epoch": 0.21011058451816747,
"grad_norm": 0.5093654990196228,
"learning_rate": 7.914691943127962e-06,
"loss": 0.374,
"step": 133
},
{
"epoch": 0.21169036334913113,
"grad_norm": 0.5125884413719177,
"learning_rate": 7.898894154818326e-06,
"loss": 0.4097,
"step": 134
},
{
"epoch": 0.2132701421800948,
"grad_norm": 0.5116066932678223,
"learning_rate": 7.88309636650869e-06,
"loss": 0.4643,
"step": 135
},
{
"epoch": 0.21484992101105846,
"grad_norm": 0.5778034329414368,
"learning_rate": 7.867298578199053e-06,
"loss": 0.4645,
"step": 136
},
{
"epoch": 0.21642969984202212,
"grad_norm": 0.6490422487258911,
"learning_rate": 7.851500789889415e-06,
"loss": 0.4825,
"step": 137
},
{
"epoch": 0.21800947867298578,
"grad_norm": 0.644008219242096,
"learning_rate": 7.83570300157978e-06,
"loss": 0.3954,
"step": 138
},
{
"epoch": 0.21958925750394945,
"grad_norm": 0.8628047704696655,
"learning_rate": 7.819905213270143e-06,
"loss": 0.5322,
"step": 139
},
{
"epoch": 0.2211690363349131,
"grad_norm": 0.6286507844924927,
"learning_rate": 7.804107424960507e-06,
"loss": 0.3741,
"step": 140
},
{
"epoch": 0.22274881516587677,
"grad_norm": 0.6210809350013733,
"learning_rate": 7.788309636650869e-06,
"loss": 0.4572,
"step": 141
},
{
"epoch": 0.22432859399684044,
"grad_norm": 0.5337722897529602,
"learning_rate": 7.772511848341233e-06,
"loss": 0.3788,
"step": 142
},
{
"epoch": 0.2259083728278041,
"grad_norm": 0.5743194818496704,
"learning_rate": 7.756714060031596e-06,
"loss": 0.3963,
"step": 143
},
{
"epoch": 0.22748815165876776,
"grad_norm": 0.4972652792930603,
"learning_rate": 7.74091627172196e-06,
"loss": 0.2906,
"step": 144
},
{
"epoch": 0.22906793048973143,
"grad_norm": 0.5239664316177368,
"learning_rate": 7.725118483412322e-06,
"loss": 0.4009,
"step": 145
},
{
"epoch": 0.23064770932069512,
"grad_norm": 0.5151936411857605,
"learning_rate": 7.709320695102686e-06,
"loss": 0.4208,
"step": 146
},
{
"epoch": 0.23222748815165878,
"grad_norm": 0.6128547191619873,
"learning_rate": 7.69352290679305e-06,
"loss": 0.4779,
"step": 147
},
{
"epoch": 0.23380726698262244,
"grad_norm": 0.5268502235412598,
"learning_rate": 7.677725118483414e-06,
"loss": 0.4219,
"step": 148
},
{
"epoch": 0.2353870458135861,
"grad_norm": 0.5439866185188293,
"learning_rate": 7.661927330173776e-06,
"loss": 0.4436,
"step": 149
},
{
"epoch": 0.23696682464454977,
"grad_norm": 0.5291867852210999,
"learning_rate": 7.64612954186414e-06,
"loss": 0.407,
"step": 150
},
{
"epoch": 0.23854660347551343,
"grad_norm": 0.6638155579566956,
"learning_rate": 7.630331753554503e-06,
"loss": 0.403,
"step": 151
},
{
"epoch": 0.2401263823064771,
"grad_norm": 0.5501230955123901,
"learning_rate": 7.614533965244867e-06,
"loss": 0.5004,
"step": 152
},
{
"epoch": 0.24170616113744076,
"grad_norm": 0.5949499011039734,
"learning_rate": 7.59873617693523e-06,
"loss": 0.4708,
"step": 153
},
{
"epoch": 0.24328593996840442,
"grad_norm": 0.5841517448425293,
"learning_rate": 7.582938388625593e-06,
"loss": 0.4836,
"step": 154
},
{
"epoch": 0.24486571879936808,
"grad_norm": 0.6298154592514038,
"learning_rate": 7.567140600315957e-06,
"loss": 0.4728,
"step": 155
},
{
"epoch": 0.24644549763033174,
"grad_norm": 0.6107637882232666,
"learning_rate": 7.55134281200632e-06,
"loss": 0.4243,
"step": 156
},
{
"epoch": 0.2480252764612954,
"grad_norm": 0.5174968838691711,
"learning_rate": 7.535545023696683e-06,
"loss": 0.4657,
"step": 157
},
{
"epoch": 0.24960505529225907,
"grad_norm": 0.5588591694831848,
"learning_rate": 7.519747235387046e-06,
"loss": 0.4567,
"step": 158
},
{
"epoch": 0.25118483412322273,
"grad_norm": 0.8415222764015198,
"learning_rate": 7.50394944707741e-06,
"loss": 0.4625,
"step": 159
},
{
"epoch": 0.2527646129541864,
"grad_norm": 0.6054974794387817,
"learning_rate": 7.488151658767773e-06,
"loss": 0.3843,
"step": 160
},
{
"epoch": 0.25434439178515006,
"grad_norm": 0.5117557644844055,
"learning_rate": 7.472353870458137e-06,
"loss": 0.3887,
"step": 161
},
{
"epoch": 0.2559241706161137,
"grad_norm": 0.5849332213401794,
"learning_rate": 7.4565560821485e-06,
"loss": 0.4528,
"step": 162
},
{
"epoch": 0.2575039494470774,
"grad_norm": 0.5625325441360474,
"learning_rate": 7.4407582938388635e-06,
"loss": 0.4542,
"step": 163
},
{
"epoch": 0.25908372827804105,
"grad_norm": 0.5406492352485657,
"learning_rate": 7.4249605055292264e-06,
"loss": 0.4592,
"step": 164
},
{
"epoch": 0.26066350710900477,
"grad_norm": 0.6318654417991638,
"learning_rate": 7.40916271721959e-06,
"loss": 0.4361,
"step": 165
},
{
"epoch": 0.26224328593996843,
"grad_norm": 0.5719902515411377,
"learning_rate": 7.393364928909953e-06,
"loss": 0.4799,
"step": 166
},
{
"epoch": 0.2638230647709321,
"grad_norm": 0.5211177468299866,
"learning_rate": 7.377567140600317e-06,
"loss": 0.33,
"step": 167
},
{
"epoch": 0.26540284360189575,
"grad_norm": 0.6400920152664185,
"learning_rate": 7.36176935229068e-06,
"loss": 0.4235,
"step": 168
},
{
"epoch": 0.2669826224328594,
"grad_norm": 0.5302186608314514,
"learning_rate": 7.345971563981044e-06,
"loss": 0.4342,
"step": 169
},
{
"epoch": 0.2685624012638231,
"grad_norm": 0.5393325686454773,
"learning_rate": 7.3301737756714066e-06,
"loss": 0.3632,
"step": 170
},
{
"epoch": 0.27014218009478674,
"grad_norm": 0.5409063696861267,
"learning_rate": 7.31437598736177e-06,
"loss": 0.4076,
"step": 171
},
{
"epoch": 0.2717219589257504,
"grad_norm": 0.5056774616241455,
"learning_rate": 7.298578199052133e-06,
"loss": 0.4821,
"step": 172
},
{
"epoch": 0.27330173775671407,
"grad_norm": 0.6061700582504272,
"learning_rate": 7.282780410742497e-06,
"loss": 0.5137,
"step": 173
},
{
"epoch": 0.27488151658767773,
"grad_norm": 0.5524815917015076,
"learning_rate": 7.26698262243286e-06,
"loss": 0.4116,
"step": 174
},
{
"epoch": 0.2764612954186414,
"grad_norm": 0.5045567750930786,
"learning_rate": 7.251184834123224e-06,
"loss": 0.3969,
"step": 175
},
{
"epoch": 0.27804107424960506,
"grad_norm": 0.604505717754364,
"learning_rate": 7.235387045813587e-06,
"loss": 0.5176,
"step": 176
},
{
"epoch": 0.2796208530805687,
"grad_norm": 0.6067575812339783,
"learning_rate": 7.2195892575039505e-06,
"loss": 0.4438,
"step": 177
},
{
"epoch": 0.2812006319115324,
"grad_norm": 0.6412494778633118,
"learning_rate": 7.203791469194313e-06,
"loss": 0.4758,
"step": 178
},
{
"epoch": 0.28278041074249605,
"grad_norm": 0.5432886481285095,
"learning_rate": 7.187993680884676e-06,
"loss": 0.4387,
"step": 179
},
{
"epoch": 0.2843601895734597,
"grad_norm": 0.4622472822666168,
"learning_rate": 7.17219589257504e-06,
"loss": 0.4775,
"step": 180
},
{
"epoch": 0.2859399684044234,
"grad_norm": 0.643259584903717,
"learning_rate": 7.156398104265403e-06,
"loss": 0.4479,
"step": 181
},
{
"epoch": 0.28751974723538704,
"grad_norm": 0.48998138308525085,
"learning_rate": 7.140600315955767e-06,
"loss": 0.399,
"step": 182
},
{
"epoch": 0.2890995260663507,
"grad_norm": 0.5146614909172058,
"learning_rate": 7.12480252764613e-06,
"loss": 0.4475,
"step": 183
},
{
"epoch": 0.29067930489731436,
"grad_norm": 0.5386670231819153,
"learning_rate": 7.1090047393364935e-06,
"loss": 0.3892,
"step": 184
},
{
"epoch": 0.292259083728278,
"grad_norm": 0.5147759318351746,
"learning_rate": 7.0932069510268565e-06,
"loss": 0.3755,
"step": 185
},
{
"epoch": 0.2938388625592417,
"grad_norm": 0.5141321420669556,
"learning_rate": 7.07740916271722e-06,
"loss": 0.355,
"step": 186
},
{
"epoch": 0.29541864139020535,
"grad_norm": 0.9518134593963623,
"learning_rate": 7.061611374407583e-06,
"loss": 0.4021,
"step": 187
},
{
"epoch": 0.296998420221169,
"grad_norm": 0.5844981670379639,
"learning_rate": 7.045813586097947e-06,
"loss": 0.4233,
"step": 188
},
{
"epoch": 0.2985781990521327,
"grad_norm": 0.6381546854972839,
"learning_rate": 7.03001579778831e-06,
"loss": 0.4862,
"step": 189
},
{
"epoch": 0.3001579778830964,
"grad_norm": 0.7311195135116577,
"learning_rate": 7.014218009478674e-06,
"loss": 0.4822,
"step": 190
},
{
"epoch": 0.30173775671406006,
"grad_norm": 0.5827596783638,
"learning_rate": 6.998420221169037e-06,
"loss": 0.4027,
"step": 191
},
{
"epoch": 0.3033175355450237,
"grad_norm": 0.6907688975334167,
"learning_rate": 6.9826224328594e-06,
"loss": 0.4374,
"step": 192
},
{
"epoch": 0.3048973143759874,
"grad_norm": 0.5060120820999146,
"learning_rate": 6.966824644549763e-06,
"loss": 0.4226,
"step": 193
},
{
"epoch": 0.30647709320695105,
"grad_norm": 0.41480544209480286,
"learning_rate": 6.951026856240127e-06,
"loss": 0.3766,
"step": 194
},
{
"epoch": 0.3080568720379147,
"grad_norm": 0.5637404322624207,
"learning_rate": 6.93522906793049e-06,
"loss": 0.4365,
"step": 195
},
{
"epoch": 0.30963665086887837,
"grad_norm": 0.6389409899711609,
"learning_rate": 6.919431279620854e-06,
"loss": 0.4186,
"step": 196
},
{
"epoch": 0.31121642969984203,
"grad_norm": 0.48588162660598755,
"learning_rate": 6.903633491311217e-06,
"loss": 0.4023,
"step": 197
},
{
"epoch": 0.3127962085308057,
"grad_norm": 0.6066514253616333,
"learning_rate": 6.8878357030015805e-06,
"loss": 0.4652,
"step": 198
},
{
"epoch": 0.31437598736176936,
"grad_norm": 0.6308689117431641,
"learning_rate": 6.8720379146919435e-06,
"loss": 0.3885,
"step": 199
},
{
"epoch": 0.315955766192733,
"grad_norm": 0.4883437752723694,
"learning_rate": 6.856240126382307e-06,
"loss": 0.4128,
"step": 200
},
{
"epoch": 0.3175355450236967,
"grad_norm": 0.720086932182312,
"learning_rate": 6.84044233807267e-06,
"loss": 0.4333,
"step": 201
},
{
"epoch": 0.31911532385466035,
"grad_norm": 0.6698761582374573,
"learning_rate": 6.824644549763034e-06,
"loss": 0.3967,
"step": 202
},
{
"epoch": 0.320695102685624,
"grad_norm": 0.5240082740783691,
"learning_rate": 6.808846761453397e-06,
"loss": 0.4055,
"step": 203
},
{
"epoch": 0.3222748815165877,
"grad_norm": 0.6142946481704712,
"learning_rate": 6.79304897314376e-06,
"loss": 0.3645,
"step": 204
},
{
"epoch": 0.32385466034755134,
"grad_norm": 0.6439379453659058,
"learning_rate": 6.777251184834124e-06,
"loss": 0.3207,
"step": 205
},
{
"epoch": 0.325434439178515,
"grad_norm": 0.6862720847129822,
"learning_rate": 6.7614533965244865e-06,
"loss": 0.4944,
"step": 206
},
{
"epoch": 0.32701421800947866,
"grad_norm": 0.6720433235168457,
"learning_rate": 6.74565560821485e-06,
"loss": 0.4335,
"step": 207
},
{
"epoch": 0.3285939968404423,
"grad_norm": 0.531577467918396,
"learning_rate": 6.729857819905213e-06,
"loss": 0.5327,
"step": 208
},
{
"epoch": 0.330173775671406,
"grad_norm": 0.5542590022087097,
"learning_rate": 6.714060031595577e-06,
"loss": 0.3629,
"step": 209
},
{
"epoch": 0.33175355450236965,
"grad_norm": 0.5614448189735413,
"learning_rate": 6.69826224328594e-06,
"loss": 0.4097,
"step": 210
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.7383466362953186,
"learning_rate": 6.682464454976304e-06,
"loss": 0.5031,
"step": 211
},
{
"epoch": 0.334913112164297,
"grad_norm": 0.6345497965812683,
"learning_rate": 6.666666666666667e-06,
"loss": 0.5029,
"step": 212
},
{
"epoch": 0.33649289099526064,
"grad_norm": 0.579641580581665,
"learning_rate": 6.6508688783570304e-06,
"loss": 0.4949,
"step": 213
},
{
"epoch": 0.3380726698262243,
"grad_norm": 0.5040780305862427,
"learning_rate": 6.635071090047393e-06,
"loss": 0.4537,
"step": 214
},
{
"epoch": 0.33965244865718797,
"grad_norm": 0.5917491316795349,
"learning_rate": 6.619273301737757e-06,
"loss": 0.3883,
"step": 215
},
{
"epoch": 0.3412322274881517,
"grad_norm": 0.7031399011611938,
"learning_rate": 6.60347551342812e-06,
"loss": 0.4554,
"step": 216
},
{
"epoch": 0.34281200631911535,
"grad_norm": 0.5503798127174377,
"learning_rate": 6.587677725118484e-06,
"loss": 0.352,
"step": 217
},
{
"epoch": 0.344391785150079,
"grad_norm": 0.5412716269493103,
"learning_rate": 6.571879936808847e-06,
"loss": 0.4191,
"step": 218
},
{
"epoch": 0.3459715639810427,
"grad_norm": 0.6272369623184204,
"learning_rate": 6.556082148499211e-06,
"loss": 0.4595,
"step": 219
},
{
"epoch": 0.34755134281200634,
"grad_norm": 0.5309504270553589,
"learning_rate": 6.5402843601895735e-06,
"loss": 0.4095,
"step": 220
},
{
"epoch": 0.34913112164297,
"grad_norm": 0.5687200427055359,
"learning_rate": 6.524486571879938e-06,
"loss": 0.435,
"step": 221
},
{
"epoch": 0.35071090047393366,
"grad_norm": 0.5819438099861145,
"learning_rate": 6.5086887835703e-06,
"loss": 0.4695,
"step": 222
},
{
"epoch": 0.3522906793048973,
"grad_norm": 0.6310110092163086,
"learning_rate": 6.492890995260665e-06,
"loss": 0.4346,
"step": 223
},
{
"epoch": 0.353870458135861,
"grad_norm": 0.5838906168937683,
"learning_rate": 6.477093206951027e-06,
"loss": 0.47,
"step": 224
},
{
"epoch": 0.35545023696682465,
"grad_norm": 0.6752678155899048,
"learning_rate": 6.4612954186413915e-06,
"loss": 0.3842,
"step": 225
},
{
"epoch": 0.3570300157977883,
"grad_norm": 0.7029111981391907,
"learning_rate": 6.445497630331754e-06,
"loss": 0.4442,
"step": 226
},
{
"epoch": 0.358609794628752,
"grad_norm": 0.511812686920166,
"learning_rate": 6.429699842022118e-06,
"loss": 0.5171,
"step": 227
},
{
"epoch": 0.36018957345971564,
"grad_norm": 0.49457868933677673,
"learning_rate": 6.413902053712481e-06,
"loss": 0.3695,
"step": 228
},
{
"epoch": 0.3617693522906793,
"grad_norm": 0.4521022439002991,
"learning_rate": 6.398104265402843e-06,
"loss": 0.3909,
"step": 229
},
{
"epoch": 0.36334913112164297,
"grad_norm": 0.45229026675224304,
"learning_rate": 6.382306477093208e-06,
"loss": 0.3417,
"step": 230
},
{
"epoch": 0.36492890995260663,
"grad_norm": 0.5070056915283203,
"learning_rate": 6.36650868878357e-06,
"loss": 0.3518,
"step": 231
},
{
"epoch": 0.3665086887835703,
"grad_norm": 0.9325531721115112,
"learning_rate": 6.350710900473935e-06,
"loss": 0.5172,
"step": 232
},
{
"epoch": 0.36808846761453395,
"grad_norm": 0.6027977466583252,
"learning_rate": 6.334913112164297e-06,
"loss": 0.4052,
"step": 233
},
{
"epoch": 0.3696682464454976,
"grad_norm": 0.7251097559928894,
"learning_rate": 6.319115323854661e-06,
"loss": 0.4739,
"step": 234
},
{
"epoch": 0.3712480252764613,
"grad_norm": 0.6470052003860474,
"learning_rate": 6.303317535545023e-06,
"loss": 0.4745,
"step": 235
},
{
"epoch": 0.37282780410742494,
"grad_norm": 0.7177411317825317,
"learning_rate": 6.287519747235388e-06,
"loss": 0.364,
"step": 236
},
{
"epoch": 0.3744075829383886,
"grad_norm": 0.7681677341461182,
"learning_rate": 6.271721958925751e-06,
"loss": 0.4559,
"step": 237
},
{
"epoch": 0.37598736176935227,
"grad_norm": 0.6160128116607666,
"learning_rate": 6.255924170616115e-06,
"loss": 0.421,
"step": 238
},
{
"epoch": 0.37756714060031593,
"grad_norm": 0.658981442451477,
"learning_rate": 6.240126382306478e-06,
"loss": 0.3979,
"step": 239
},
{
"epoch": 0.3791469194312796,
"grad_norm": 0.9422373175621033,
"learning_rate": 6.2243285939968414e-06,
"loss": 0.3586,
"step": 240
},
{
"epoch": 0.3807266982622433,
"grad_norm": 0.5452501773834229,
"learning_rate": 6.208530805687204e-06,
"loss": 0.4209,
"step": 241
},
{
"epoch": 0.382306477093207,
"grad_norm": 0.4912925660610199,
"learning_rate": 6.192733017377568e-06,
"loss": 0.4784,
"step": 242
},
{
"epoch": 0.38388625592417064,
"grad_norm": 0.6575455665588379,
"learning_rate": 6.176935229067931e-06,
"loss": 0.4062,
"step": 243
},
{
"epoch": 0.3854660347551343,
"grad_norm": 0.8840091824531555,
"learning_rate": 6.161137440758295e-06,
"loss": 0.4177,
"step": 244
},
{
"epoch": 0.38704581358609796,
"grad_norm": 0.5949338674545288,
"learning_rate": 6.145339652448658e-06,
"loss": 0.4477,
"step": 245
},
{
"epoch": 0.3886255924170616,
"grad_norm": 0.5938326120376587,
"learning_rate": 6.1295418641390216e-06,
"loss": 0.4155,
"step": 246
},
{
"epoch": 0.3902053712480253,
"grad_norm": 0.5401394367218018,
"learning_rate": 6.1137440758293845e-06,
"loss": 0.3873,
"step": 247
},
{
"epoch": 0.39178515007898895,
"grad_norm": 0.5220497846603394,
"learning_rate": 6.097946287519748e-06,
"loss": 0.3803,
"step": 248
},
{
"epoch": 0.3933649289099526,
"grad_norm": 0.5426644086837769,
"learning_rate": 6.082148499210111e-06,
"loss": 0.3239,
"step": 249
},
{
"epoch": 0.3949447077409163,
"grad_norm": 0.5215898156166077,
"learning_rate": 6.066350710900475e-06,
"loss": 0.4373,
"step": 250
},
{
"epoch": 0.39652448657187994,
"grad_norm": 0.5694135427474976,
"learning_rate": 6.050552922590838e-06,
"loss": 0.4948,
"step": 251
},
{
"epoch": 0.3981042654028436,
"grad_norm": 0.5505183339118958,
"learning_rate": 6.034755134281202e-06,
"loss": 0.4108,
"step": 252
},
{
"epoch": 0.39968404423380727,
"grad_norm": 0.593190610408783,
"learning_rate": 6.018957345971565e-06,
"loss": 0.429,
"step": 253
},
{
"epoch": 0.40126382306477093,
"grad_norm": 0.5409046411514282,
"learning_rate": 6.003159557661928e-06,
"loss": 0.4443,
"step": 254
},
{
"epoch": 0.4028436018957346,
"grad_norm": 0.5520291328430176,
"learning_rate": 5.987361769352291e-06,
"loss": 0.4485,
"step": 255
},
{
"epoch": 0.40442338072669826,
"grad_norm": 0.5622429847717285,
"learning_rate": 5.971563981042654e-06,
"loss": 0.4181,
"step": 256
},
{
"epoch": 0.4060031595576619,
"grad_norm": 0.5267983078956604,
"learning_rate": 5.955766192733018e-06,
"loss": 0.4235,
"step": 257
},
{
"epoch": 0.4075829383886256,
"grad_norm": 0.5384082198143005,
"learning_rate": 5.939968404423381e-06,
"loss": 0.4055,
"step": 258
},
{
"epoch": 0.40916271721958924,
"grad_norm": 0.5427289605140686,
"learning_rate": 5.924170616113745e-06,
"loss": 0.3427,
"step": 259
},
{
"epoch": 0.4107424960505529,
"grad_norm": 0.4936423599720001,
"learning_rate": 5.908372827804108e-06,
"loss": 0.4133,
"step": 260
},
{
"epoch": 0.41232227488151657,
"grad_norm": 0.5825520753860474,
"learning_rate": 5.8925750394944715e-06,
"loss": 0.377,
"step": 261
},
{
"epoch": 0.41390205371248023,
"grad_norm": 0.6343340277671814,
"learning_rate": 5.876777251184834e-06,
"loss": 0.441,
"step": 262
},
{
"epoch": 0.4154818325434439,
"grad_norm": 0.5479387044906616,
"learning_rate": 5.860979462875198e-06,
"loss": 0.4353,
"step": 263
},
{
"epoch": 0.41706161137440756,
"grad_norm": 0.5873805284500122,
"learning_rate": 5.845181674565561e-06,
"loss": 0.4293,
"step": 264
},
{
"epoch": 0.4186413902053712,
"grad_norm": 0.6624792218208313,
"learning_rate": 5.829383886255925e-06,
"loss": 0.5162,
"step": 265
},
{
"epoch": 0.42022116903633494,
"grad_norm": 0.5797149538993835,
"learning_rate": 5.813586097946288e-06,
"loss": 0.3651,
"step": 266
},
{
"epoch": 0.4218009478672986,
"grad_norm": 0.5814763903617859,
"learning_rate": 5.797788309636652e-06,
"loss": 0.3817,
"step": 267
},
{
"epoch": 0.42338072669826227,
"grad_norm": 0.5556735992431641,
"learning_rate": 5.7819905213270145e-06,
"loss": 0.4186,
"step": 268
},
{
"epoch": 0.42496050552922593,
"grad_norm": 0.5842727422714233,
"learning_rate": 5.766192733017378e-06,
"loss": 0.4343,
"step": 269
},
{
"epoch": 0.4265402843601896,
"grad_norm": 0.5401722192764282,
"learning_rate": 5.750394944707741e-06,
"loss": 0.4418,
"step": 270
},
{
"epoch": 0.42812006319115326,
"grad_norm": 0.5917039513587952,
"learning_rate": 5.734597156398105e-06,
"loss": 0.5371,
"step": 271
},
{
"epoch": 0.4296998420221169,
"grad_norm": 0.5991331338882446,
"learning_rate": 5.718799368088468e-06,
"loss": 0.4969,
"step": 272
},
{
"epoch": 0.4312796208530806,
"grad_norm": 0.4709448218345642,
"learning_rate": 5.703001579778832e-06,
"loss": 0.4139,
"step": 273
},
{
"epoch": 0.43285939968404424,
"grad_norm": 0.5746496319770813,
"learning_rate": 5.687203791469195e-06,
"loss": 0.4683,
"step": 274
},
{
"epoch": 0.4344391785150079,
"grad_norm": 0.523835301399231,
"learning_rate": 5.6714060031595584e-06,
"loss": 0.4346,
"step": 275
},
{
"epoch": 0.43601895734597157,
"grad_norm": 0.5292810797691345,
"learning_rate": 5.655608214849921e-06,
"loss": 0.463,
"step": 276
},
{
"epoch": 0.43759873617693523,
"grad_norm": 0.6543466448783875,
"learning_rate": 5.639810426540285e-06,
"loss": 0.427,
"step": 277
},
{
"epoch": 0.4391785150078989,
"grad_norm": 0.5543989539146423,
"learning_rate": 5.624012638230648e-06,
"loss": 0.3902,
"step": 278
},
{
"epoch": 0.44075829383886256,
"grad_norm": 0.5905360579490662,
"learning_rate": 5.608214849921012e-06,
"loss": 0.4266,
"step": 279
},
{
"epoch": 0.4423380726698262,
"grad_norm": 0.5785796046257019,
"learning_rate": 5.592417061611375e-06,
"loss": 0.4521,
"step": 280
},
{
"epoch": 0.4439178515007899,
"grad_norm": 0.5580607056617737,
"learning_rate": 5.576619273301738e-06,
"loss": 0.378,
"step": 281
},
{
"epoch": 0.44549763033175355,
"grad_norm": 0.5100966691970825,
"learning_rate": 5.5608214849921015e-06,
"loss": 0.3876,
"step": 282
},
{
"epoch": 0.4470774091627172,
"grad_norm": 0.5704023241996765,
"learning_rate": 5.5450236966824644e-06,
"loss": 0.4694,
"step": 283
},
{
"epoch": 0.4486571879936809,
"grad_norm": 0.5954383611679077,
"learning_rate": 5.529225908372828e-06,
"loss": 0.5049,
"step": 284
},
{
"epoch": 0.45023696682464454,
"grad_norm": 0.5239635705947876,
"learning_rate": 5.513428120063191e-06,
"loss": 0.4182,
"step": 285
},
{
"epoch": 0.4518167456556082,
"grad_norm": 0.6643552780151367,
"learning_rate": 5.497630331753555e-06,
"loss": 0.4434,
"step": 286
},
{
"epoch": 0.45339652448657186,
"grad_norm": 0.6675540804862976,
"learning_rate": 5.481832543443918e-06,
"loss": 0.3745,
"step": 287
},
{
"epoch": 0.4549763033175355,
"grad_norm": 0.5871401429176331,
"learning_rate": 5.466034755134282e-06,
"loss": 0.5527,
"step": 288
},
{
"epoch": 0.4565560821484992,
"grad_norm": 0.5936838984489441,
"learning_rate": 5.4502369668246446e-06,
"loss": 0.4857,
"step": 289
},
{
"epoch": 0.45813586097946285,
"grad_norm": 0.5998191833496094,
"learning_rate": 5.434439178515008e-06,
"loss": 0.4395,
"step": 290
},
{
"epoch": 0.4597156398104265,
"grad_norm": 0.5102293491363525,
"learning_rate": 5.418641390205371e-06,
"loss": 0.4496,
"step": 291
},
{
"epoch": 0.46129541864139023,
"grad_norm": 0.6297216415405273,
"learning_rate": 5.402843601895735e-06,
"loss": 0.3555,
"step": 292
},
{
"epoch": 0.4628751974723539,
"grad_norm": 0.6780267953872681,
"learning_rate": 5.387045813586098e-06,
"loss": 0.3295,
"step": 293
},
{
"epoch": 0.46445497630331756,
"grad_norm": 0.5788872838020325,
"learning_rate": 5.371248025276462e-06,
"loss": 0.4293,
"step": 294
},
{
"epoch": 0.4660347551342812,
"grad_norm": 0.5679113268852234,
"learning_rate": 5.355450236966825e-06,
"loss": 0.4274,
"step": 295
},
{
"epoch": 0.4676145339652449,
"grad_norm": 0.5739018321037292,
"learning_rate": 5.3396524486571885e-06,
"loss": 0.3292,
"step": 296
},
{
"epoch": 0.46919431279620855,
"grad_norm": 0.5387299060821533,
"learning_rate": 5.323854660347551e-06,
"loss": 0.36,
"step": 297
},
{
"epoch": 0.4707740916271722,
"grad_norm": 0.4877624213695526,
"learning_rate": 5.308056872037915e-06,
"loss": 0.403,
"step": 298
},
{
"epoch": 0.47235387045813587,
"grad_norm": 0.5668107271194458,
"learning_rate": 5.292259083728278e-06,
"loss": 0.4087,
"step": 299
},
{
"epoch": 0.47393364928909953,
"grad_norm": 0.5592719316482544,
"learning_rate": 5.276461295418642e-06,
"loss": 0.405,
"step": 300
},
{
"epoch": 0.4755134281200632,
"grad_norm": 0.48879534006118774,
"learning_rate": 5.260663507109005e-06,
"loss": 0.3562,
"step": 301
},
{
"epoch": 0.47709320695102686,
"grad_norm": 0.5968641042709351,
"learning_rate": 5.244865718799369e-06,
"loss": 0.4216,
"step": 302
},
{
"epoch": 0.4786729857819905,
"grad_norm": 0.7803828120231628,
"learning_rate": 5.2290679304897315e-06,
"loss": 0.4014,
"step": 303
},
{
"epoch": 0.4802527646129542,
"grad_norm": 0.592827558517456,
"learning_rate": 5.213270142180096e-06,
"loss": 0.2895,
"step": 304
},
{
"epoch": 0.48183254344391785,
"grad_norm": 0.8070396184921265,
"learning_rate": 5.197472353870458e-06,
"loss": 0.3972,
"step": 305
},
{
"epoch": 0.4834123222748815,
"grad_norm": 0.5256397724151611,
"learning_rate": 5.181674565560821e-06,
"loss": 0.4384,
"step": 306
},
{
"epoch": 0.4849921011058452,
"grad_norm": 0.5307562947273254,
"learning_rate": 5.165876777251185e-06,
"loss": 0.3788,
"step": 307
},
{
"epoch": 0.48657187993680884,
"grad_norm": 0.4588807225227356,
"learning_rate": 5.150078988941548e-06,
"loss": 0.3491,
"step": 308
},
{
"epoch": 0.4881516587677725,
"grad_norm": 0.524919331073761,
"learning_rate": 5.134281200631912e-06,
"loss": 0.4375,
"step": 309
},
{
"epoch": 0.48973143759873616,
"grad_norm": 0.6611966490745544,
"learning_rate": 5.118483412322275e-06,
"loss": 0.4399,
"step": 310
},
{
"epoch": 0.4913112164296998,
"grad_norm": 0.5597748160362244,
"learning_rate": 5.102685624012638e-06,
"loss": 0.5073,
"step": 311
},
{
"epoch": 0.4928909952606635,
"grad_norm": 0.8958181738853455,
"learning_rate": 5.086887835703001e-06,
"loss": 0.4756,
"step": 312
},
{
"epoch": 0.49447077409162715,
"grad_norm": 0.4875742197036743,
"learning_rate": 5.071090047393366e-06,
"loss": 0.4424,
"step": 313
},
{
"epoch": 0.4960505529225908,
"grad_norm": 0.6110445261001587,
"learning_rate": 5.055292259083728e-06,
"loss": 0.4686,
"step": 314
},
{
"epoch": 0.4976303317535545,
"grad_norm": 0.5900540351867676,
"learning_rate": 5.039494470774093e-06,
"loss": 0.4,
"step": 315
},
{
"epoch": 0.49921011058451814,
"grad_norm": 0.624906599521637,
"learning_rate": 5.023696682464455e-06,
"loss": 0.3967,
"step": 316
},
{
"epoch": 0.5007898894154819,
"grad_norm": 0.6435191631317139,
"learning_rate": 5.007898894154819e-06,
"loss": 0.5104,
"step": 317
},
{
"epoch": 0.5023696682464455,
"grad_norm": 0.7464382648468018,
"learning_rate": 4.9921011058451815e-06,
"loss": 0.4621,
"step": 318
},
{
"epoch": 0.5039494470774092,
"grad_norm": 0.7912509441375732,
"learning_rate": 4.976303317535545e-06,
"loss": 0.4186,
"step": 319
},
{
"epoch": 0.5055292259083728,
"grad_norm": 0.6150445938110352,
"learning_rate": 4.960505529225908e-06,
"loss": 0.469,
"step": 320
},
{
"epoch": 0.5071090047393365,
"grad_norm": 0.5445781946182251,
"learning_rate": 4.944707740916272e-06,
"loss": 0.4111,
"step": 321
},
{
"epoch": 0.5086887835703001,
"grad_norm": 0.5628255605697632,
"learning_rate": 4.928909952606635e-06,
"loss": 0.4884,
"step": 322
},
{
"epoch": 0.5102685624012638,
"grad_norm": 0.5007054805755615,
"learning_rate": 4.913112164296999e-06,
"loss": 0.4315,
"step": 323
},
{
"epoch": 0.5118483412322274,
"grad_norm": 0.6346699595451355,
"learning_rate": 4.8973143759873624e-06,
"loss": 0.4033,
"step": 324
},
{
"epoch": 0.5134281200631912,
"grad_norm": 0.639045774936676,
"learning_rate": 4.881516587677725e-06,
"loss": 0.3748,
"step": 325
},
{
"epoch": 0.5150078988941548,
"grad_norm": 0.5578002333641052,
"learning_rate": 4.865718799368089e-06,
"loss": 0.5055,
"step": 326
},
{
"epoch": 0.5165876777251185,
"grad_norm": 0.5281325578689575,
"learning_rate": 4.849921011058452e-06,
"loss": 0.4307,
"step": 327
},
{
"epoch": 0.5181674565560821,
"grad_norm": 0.6557057499885559,
"learning_rate": 4.834123222748816e-06,
"loss": 0.4085,
"step": 328
},
{
"epoch": 0.5197472353870458,
"grad_norm": 0.5667731761932373,
"learning_rate": 4.818325434439179e-06,
"loss": 0.4774,
"step": 329
},
{
"epoch": 0.5213270142180095,
"grad_norm": 0.5362856984138489,
"learning_rate": 4.8025276461295426e-06,
"loss": 0.4316,
"step": 330
},
{
"epoch": 0.5229067930489731,
"grad_norm": 0.5326763391494751,
"learning_rate": 4.7867298578199055e-06,
"loss": 0.389,
"step": 331
},
{
"epoch": 0.5244865718799369,
"grad_norm": 0.4922950565814972,
"learning_rate": 4.770932069510269e-06,
"loss": 0.3756,
"step": 332
},
{
"epoch": 0.5260663507109005,
"grad_norm": 0.4961477518081665,
"learning_rate": 4.755134281200632e-06,
"loss": 0.4336,
"step": 333
},
{
"epoch": 0.5276461295418642,
"grad_norm": 0.5258511304855347,
"learning_rate": 4.739336492890996e-06,
"loss": 0.404,
"step": 334
},
{
"epoch": 0.5292259083728278,
"grad_norm": 0.5479301810264587,
"learning_rate": 4.723538704581359e-06,
"loss": 0.3578,
"step": 335
},
{
"epoch": 0.5308056872037915,
"grad_norm": 0.49883902072906494,
"learning_rate": 4.707740916271723e-06,
"loss": 0.3809,
"step": 336
},
{
"epoch": 0.5323854660347551,
"grad_norm": 0.5133053660392761,
"learning_rate": 4.691943127962086e-06,
"loss": 0.4091,
"step": 337
},
{
"epoch": 0.5339652448657188,
"grad_norm": 0.6334301829338074,
"learning_rate": 4.676145339652449e-06,
"loss": 0.4432,
"step": 338
},
{
"epoch": 0.5355450236966824,
"grad_norm": 0.5124396085739136,
"learning_rate": 4.660347551342812e-06,
"loss": 0.3557,
"step": 339
},
{
"epoch": 0.5371248025276462,
"grad_norm": 0.5863746404647827,
"learning_rate": 4.644549763033176e-06,
"loss": 0.4288,
"step": 340
},
{
"epoch": 0.5387045813586098,
"grad_norm": 0.6599943041801453,
"learning_rate": 4.628751974723539e-06,
"loss": 0.398,
"step": 341
},
{
"epoch": 0.5402843601895735,
"grad_norm": 0.480027437210083,
"learning_rate": 4.612954186413903e-06,
"loss": 0.4706,
"step": 342
},
{
"epoch": 0.5418641390205371,
"grad_norm": 0.6601845026016235,
"learning_rate": 4.597156398104266e-06,
"loss": 0.4092,
"step": 343
},
{
"epoch": 0.5434439178515008,
"grad_norm": 0.5557224154472351,
"learning_rate": 4.581358609794629e-06,
"loss": 0.389,
"step": 344
},
{
"epoch": 0.5450236966824644,
"grad_norm": 0.49160709977149963,
"learning_rate": 4.5655608214849925e-06,
"loss": 0.4338,
"step": 345
},
{
"epoch": 0.5466034755134281,
"grad_norm": 0.5284649133682251,
"learning_rate": 4.549763033175355e-06,
"loss": 0.403,
"step": 346
},
{
"epoch": 0.5481832543443917,
"grad_norm": 0.5501908659934998,
"learning_rate": 4.533965244865719e-06,
"loss": 0.4983,
"step": 347
},
{
"epoch": 0.5497630331753555,
"grad_norm": 0.5585077404975891,
"learning_rate": 4.518167456556082e-06,
"loss": 0.4219,
"step": 348
},
{
"epoch": 0.5513428120063191,
"grad_norm": 0.4565962255001068,
"learning_rate": 4.502369668246446e-06,
"loss": 0.3591,
"step": 349
},
{
"epoch": 0.5529225908372828,
"grad_norm": 0.5507949590682983,
"learning_rate": 4.486571879936809e-06,
"loss": 0.4752,
"step": 350
},
{
"epoch": 0.5545023696682464,
"grad_norm": 0.5490357875823975,
"learning_rate": 4.470774091627173e-06,
"loss": 0.4291,
"step": 351
},
{
"epoch": 0.5560821484992101,
"grad_norm": 0.5804268717765808,
"learning_rate": 4.4549763033175355e-06,
"loss": 0.3113,
"step": 352
},
{
"epoch": 0.5576619273301737,
"grad_norm": 0.4745613634586334,
"learning_rate": 4.439178515007899e-06,
"loss": 0.4196,
"step": 353
},
{
"epoch": 0.5592417061611374,
"grad_norm": 0.6223664283752441,
"learning_rate": 4.423380726698262e-06,
"loss": 0.4592,
"step": 354
},
{
"epoch": 0.5608214849921012,
"grad_norm": 0.8797832727432251,
"learning_rate": 4.407582938388626e-06,
"loss": 0.4448,
"step": 355
},
{
"epoch": 0.5624012638230648,
"grad_norm": 0.5569826364517212,
"learning_rate": 4.391785150078989e-06,
"loss": 0.3873,
"step": 356
},
{
"epoch": 0.5639810426540285,
"grad_norm": 0.4294510781764984,
"learning_rate": 4.375987361769353e-06,
"loss": 0.3407,
"step": 357
},
{
"epoch": 0.5655608214849921,
"grad_norm": 0.5657434463500977,
"learning_rate": 4.360189573459716e-06,
"loss": 0.3345,
"step": 358
},
{
"epoch": 0.5671406003159558,
"grad_norm": 0.5589077472686768,
"learning_rate": 4.3443917851500794e-06,
"loss": 0.5237,
"step": 359
},
{
"epoch": 0.5687203791469194,
"grad_norm": 0.6107128858566284,
"learning_rate": 4.328593996840442e-06,
"loss": 0.4354,
"step": 360
},
{
"epoch": 0.5703001579778831,
"grad_norm": 0.5671380758285522,
"learning_rate": 4.312796208530806e-06,
"loss": 0.3712,
"step": 361
},
{
"epoch": 0.5718799368088467,
"grad_norm": 0.508173406124115,
"learning_rate": 4.29699842022117e-06,
"loss": 0.4097,
"step": 362
},
{
"epoch": 0.5734597156398105,
"grad_norm": 0.6139382719993591,
"learning_rate": 4.281200631911533e-06,
"loss": 0.2646,
"step": 363
},
{
"epoch": 0.5750394944707741,
"grad_norm": 0.5677220821380615,
"learning_rate": 4.265402843601897e-06,
"loss": 0.3748,
"step": 364
},
{
"epoch": 0.5766192733017378,
"grad_norm": 0.530708372592926,
"learning_rate": 4.2496050552922596e-06,
"loss": 0.3857,
"step": 365
},
{
"epoch": 0.5781990521327014,
"grad_norm": 1.176272988319397,
"learning_rate": 4.233807266982623e-06,
"loss": 0.436,
"step": 366
},
{
"epoch": 0.5797788309636651,
"grad_norm": 0.6165753602981567,
"learning_rate": 4.218009478672986e-06,
"loss": 0.3898,
"step": 367
},
{
"epoch": 0.5813586097946287,
"grad_norm": 0.47574201226234436,
"learning_rate": 4.20221169036335e-06,
"loss": 0.3685,
"step": 368
},
{
"epoch": 0.5829383886255924,
"grad_norm": 0.5995083451271057,
"learning_rate": 4.186413902053712e-06,
"loss": 0.4686,
"step": 369
},
{
"epoch": 0.584518167456556,
"grad_norm": 0.5809090733528137,
"learning_rate": 4.170616113744076e-06,
"loss": 0.4514,
"step": 370
},
{
"epoch": 0.5860979462875198,
"grad_norm": 0.6154018044471741,
"learning_rate": 4.15481832543444e-06,
"loss": 0.3737,
"step": 371
},
{
"epoch": 0.5876777251184834,
"grad_norm": 0.5799654126167297,
"learning_rate": 4.139020537124803e-06,
"loss": 0.4285,
"step": 372
},
{
"epoch": 0.5892575039494471,
"grad_norm": 0.4476354420185089,
"learning_rate": 4.123222748815166e-06,
"loss": 0.4362,
"step": 373
},
{
"epoch": 0.5908372827804107,
"grad_norm": 0.6266714334487915,
"learning_rate": 4.107424960505529e-06,
"loss": 0.4943,
"step": 374
},
{
"epoch": 0.5924170616113744,
"grad_norm": 0.5103732347488403,
"learning_rate": 4.091627172195893e-06,
"loss": 0.4585,
"step": 375
},
{
"epoch": 0.593996840442338,
"grad_norm": 0.49011877179145813,
"learning_rate": 4.075829383886256e-06,
"loss": 0.4489,
"step": 376
},
{
"epoch": 0.5955766192733017,
"grad_norm": 0.5286844372749329,
"learning_rate": 4.06003159557662e-06,
"loss": 0.4114,
"step": 377
},
{
"epoch": 0.5971563981042654,
"grad_norm": 0.494807630777359,
"learning_rate": 4.044233807266983e-06,
"loss": 0.3514,
"step": 378
},
{
"epoch": 0.5987361769352291,
"grad_norm": 0.46120524406433105,
"learning_rate": 4.0284360189573465e-06,
"loss": 0.4452,
"step": 379
},
{
"epoch": 0.6003159557661928,
"grad_norm": 0.6024404764175415,
"learning_rate": 4.0126382306477095e-06,
"loss": 0.4368,
"step": 380
},
{
"epoch": 0.6018957345971564,
"grad_norm": 0.8292664885520935,
"learning_rate": 3.996840442338073e-06,
"loss": 0.4495,
"step": 381
},
{
"epoch": 0.6034755134281201,
"grad_norm": 0.5312369465827942,
"learning_rate": 3.981042654028436e-06,
"loss": 0.3642,
"step": 382
},
{
"epoch": 0.6050552922590837,
"grad_norm": 0.6373758316040039,
"learning_rate": 3.9652448657188e-06,
"loss": 0.3884,
"step": 383
},
{
"epoch": 0.6066350710900474,
"grad_norm": 0.5623313188552856,
"learning_rate": 3.949447077409163e-06,
"loss": 0.3489,
"step": 384
},
{
"epoch": 0.608214849921011,
"grad_norm": 0.5703821778297424,
"learning_rate": 3.933649289099527e-06,
"loss": 0.5309,
"step": 385
},
{
"epoch": 0.6097946287519748,
"grad_norm": 0.5930938720703125,
"learning_rate": 3.91785150078989e-06,
"loss": 0.4072,
"step": 386
},
{
"epoch": 0.6113744075829384,
"grad_norm": 0.5636332631111145,
"learning_rate": 3.902053712480253e-06,
"loss": 0.3938,
"step": 387
},
{
"epoch": 0.6129541864139021,
"grad_norm": 0.45709583163261414,
"learning_rate": 3.886255924170616e-06,
"loss": 0.4436,
"step": 388
},
{
"epoch": 0.6145339652448657,
"grad_norm": 0.5924400687217712,
"learning_rate": 3.87045813586098e-06,
"loss": 0.2939,
"step": 389
},
{
"epoch": 0.6161137440758294,
"grad_norm": 0.6232696175575256,
"learning_rate": 3.854660347551343e-06,
"loss": 0.4183,
"step": 390
},
{
"epoch": 0.617693522906793,
"grad_norm": 0.5407995581626892,
"learning_rate": 3.838862559241707e-06,
"loss": 0.3925,
"step": 391
},
{
"epoch": 0.6192733017377567,
"grad_norm": 0.524691104888916,
"learning_rate": 3.82306477093207e-06,
"loss": 0.4327,
"step": 392
},
{
"epoch": 0.6208530805687204,
"grad_norm": 0.5206206440925598,
"learning_rate": 3.8072669826224335e-06,
"loss": 0.4203,
"step": 393
},
{
"epoch": 0.6224328593996841,
"grad_norm": 0.6244251132011414,
"learning_rate": 3.7914691943127964e-06,
"loss": 0.4546,
"step": 394
},
{
"epoch": 0.6240126382306477,
"grad_norm": 0.707058846950531,
"learning_rate": 3.77567140600316e-06,
"loss": 0.4015,
"step": 395
},
{
"epoch": 0.6255924170616114,
"grad_norm": 0.5457757115364075,
"learning_rate": 3.759873617693523e-06,
"loss": 0.3962,
"step": 396
},
{
"epoch": 0.627172195892575,
"grad_norm": 0.5757611989974976,
"learning_rate": 3.7440758293838865e-06,
"loss": 0.4299,
"step": 397
},
{
"epoch": 0.6287519747235387,
"grad_norm": 0.5844476819038391,
"learning_rate": 3.72827804107425e-06,
"loss": 0.4674,
"step": 398
},
{
"epoch": 0.6303317535545023,
"grad_norm": 0.6859634518623352,
"learning_rate": 3.7124802527646132e-06,
"loss": 0.4253,
"step": 399
},
{
"epoch": 0.631911532385466,
"grad_norm": 0.5247636437416077,
"learning_rate": 3.6966824644549766e-06,
"loss": 0.4318,
"step": 400
},
{
"epoch": 0.6334913112164297,
"grad_norm": 0.6206024885177612,
"learning_rate": 3.68088467614534e-06,
"loss": 0.3759,
"step": 401
},
{
"epoch": 0.6350710900473934,
"grad_norm": 0.6237459182739258,
"learning_rate": 3.6650868878357033e-06,
"loss": 0.3642,
"step": 402
},
{
"epoch": 0.636650868878357,
"grad_norm": 0.8048799633979797,
"learning_rate": 3.6492890995260666e-06,
"loss": 0.514,
"step": 403
},
{
"epoch": 0.6382306477093207,
"grad_norm": 0.4662720561027527,
"learning_rate": 3.63349131121643e-06,
"loss": 0.3654,
"step": 404
},
{
"epoch": 0.6398104265402843,
"grad_norm": 0.5561702251434326,
"learning_rate": 3.6176935229067934e-06,
"loss": 0.3823,
"step": 405
},
{
"epoch": 0.641390205371248,
"grad_norm": 0.6143206357955933,
"learning_rate": 3.6018957345971567e-06,
"loss": 0.3938,
"step": 406
},
{
"epoch": 0.6429699842022117,
"grad_norm": 0.6854034662246704,
"learning_rate": 3.58609794628752e-06,
"loss": 0.4625,
"step": 407
},
{
"epoch": 0.6445497630331753,
"grad_norm": 0.5590549111366272,
"learning_rate": 3.5703001579778834e-06,
"loss": 0.4199,
"step": 408
},
{
"epoch": 0.6461295418641391,
"grad_norm": 0.642573356628418,
"learning_rate": 3.5545023696682468e-06,
"loss": 0.4366,
"step": 409
},
{
"epoch": 0.6477093206951027,
"grad_norm": 0.5898130536079407,
"learning_rate": 3.53870458135861e-06,
"loss": 0.4691,
"step": 410
},
{
"epoch": 0.6492890995260664,
"grad_norm": 0.5370688438415527,
"learning_rate": 3.5229067930489735e-06,
"loss": 0.45,
"step": 411
},
{
"epoch": 0.65086887835703,
"grad_norm": 0.6769170165061951,
"learning_rate": 3.507109004739337e-06,
"loss": 0.3962,
"step": 412
},
{
"epoch": 0.6524486571879937,
"grad_norm": 0.5891703367233276,
"learning_rate": 3.4913112164297e-06,
"loss": 0.4542,
"step": 413
},
{
"epoch": 0.6540284360189573,
"grad_norm": 0.42204615473747253,
"learning_rate": 3.4755134281200636e-06,
"loss": 0.3368,
"step": 414
},
{
"epoch": 0.655608214849921,
"grad_norm": 0.46033787727355957,
"learning_rate": 3.459715639810427e-06,
"loss": 0.4357,
"step": 415
},
{
"epoch": 0.6571879936808847,
"grad_norm": 0.5509577393531799,
"learning_rate": 3.4439178515007903e-06,
"loss": 0.3939,
"step": 416
},
{
"epoch": 0.6587677725118484,
"grad_norm": 0.5802867412567139,
"learning_rate": 3.4281200631911536e-06,
"loss": 0.4073,
"step": 417
},
{
"epoch": 0.660347551342812,
"grad_norm": 0.6130402684211731,
"learning_rate": 3.412322274881517e-06,
"loss": 0.3452,
"step": 418
},
{
"epoch": 0.6619273301737757,
"grad_norm": 0.6854075789451599,
"learning_rate": 3.39652448657188e-06,
"loss": 0.3551,
"step": 419
},
{
"epoch": 0.6635071090047393,
"grad_norm": 0.5365926027297974,
"learning_rate": 3.3807266982622433e-06,
"loss": 0.4011,
"step": 420
},
{
"epoch": 0.665086887835703,
"grad_norm": 1.0338938236236572,
"learning_rate": 3.3649289099526066e-06,
"loss": 0.4623,
"step": 421
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.5612855553627014,
"learning_rate": 3.34913112164297e-06,
"loss": 0.3738,
"step": 422
},
{
"epoch": 0.6682464454976303,
"grad_norm": 0.5113286375999451,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.3865,
"step": 423
},
{
"epoch": 0.669826224328594,
"grad_norm": 0.5509905815124512,
"learning_rate": 3.3175355450236967e-06,
"loss": 0.4093,
"step": 424
},
{
"epoch": 0.6714060031595577,
"grad_norm": 0.5425525903701782,
"learning_rate": 3.30173775671406e-06,
"loss": 0.383,
"step": 425
},
{
"epoch": 0.6729857819905213,
"grad_norm": 0.5866172909736633,
"learning_rate": 3.2859399684044234e-06,
"loss": 0.4843,
"step": 426
},
{
"epoch": 0.674565560821485,
"grad_norm": 1.0777703523635864,
"learning_rate": 3.2701421800947867e-06,
"loss": 0.3748,
"step": 427
},
{
"epoch": 0.6761453396524486,
"grad_norm": 0.49126845598220825,
"learning_rate": 3.25434439178515e-06,
"loss": 0.3505,
"step": 428
},
{
"epoch": 0.6777251184834123,
"grad_norm": 0.5471718311309814,
"learning_rate": 3.2385466034755135e-06,
"loss": 0.4755,
"step": 429
},
{
"epoch": 0.6793048973143759,
"grad_norm": 0.5689931511878967,
"learning_rate": 3.222748815165877e-06,
"loss": 0.3956,
"step": 430
},
{
"epoch": 0.6808846761453397,
"grad_norm": 0.6496183276176453,
"learning_rate": 3.2069510268562406e-06,
"loss": 0.4598,
"step": 431
},
{
"epoch": 0.6824644549763034,
"grad_norm": 0.47042712569236755,
"learning_rate": 3.191153238546604e-06,
"loss": 0.3756,
"step": 432
},
{
"epoch": 0.684044233807267,
"grad_norm": 0.5819857120513916,
"learning_rate": 3.1753554502369673e-06,
"loss": 0.4803,
"step": 433
},
{
"epoch": 0.6856240126382307,
"grad_norm": 0.5752127766609192,
"learning_rate": 3.1595576619273307e-06,
"loss": 0.3916,
"step": 434
},
{
"epoch": 0.6872037914691943,
"grad_norm": 0.6483988761901855,
"learning_rate": 3.143759873617694e-06,
"loss": 0.4338,
"step": 435
},
{
"epoch": 0.688783570300158,
"grad_norm": 0.7817516326904297,
"learning_rate": 3.1279620853080574e-06,
"loss": 0.3645,
"step": 436
},
{
"epoch": 0.6903633491311216,
"grad_norm": 0.4980696737766266,
"learning_rate": 3.1121642969984207e-06,
"loss": 0.3962,
"step": 437
},
{
"epoch": 0.6919431279620853,
"grad_norm": 0.5592882037162781,
"learning_rate": 3.096366508688784e-06,
"loss": 0.3645,
"step": 438
},
{
"epoch": 0.693522906793049,
"grad_norm": 0.6228163242340088,
"learning_rate": 3.0805687203791474e-06,
"loss": 0.3696,
"step": 439
},
{
"epoch": 0.6951026856240127,
"grad_norm": 0.6718009114265442,
"learning_rate": 3.0647709320695108e-06,
"loss": 0.4926,
"step": 440
},
{
"epoch": 0.6966824644549763,
"grad_norm": 0.6085376143455505,
"learning_rate": 3.048973143759874e-06,
"loss": 0.418,
"step": 441
},
{
"epoch": 0.69826224328594,
"grad_norm": 0.7716324925422668,
"learning_rate": 3.0331753554502375e-06,
"loss": 0.4038,
"step": 442
},
{
"epoch": 0.6998420221169036,
"grad_norm": 0.7239758968353271,
"learning_rate": 3.017377567140601e-06,
"loss": 0.4596,
"step": 443
},
{
"epoch": 0.7014218009478673,
"grad_norm": 0.6308011412620544,
"learning_rate": 3.001579778830964e-06,
"loss": 0.4082,
"step": 444
},
{
"epoch": 0.7030015797788309,
"grad_norm": 0.515626072883606,
"learning_rate": 2.985781990521327e-06,
"loss": 0.4688,
"step": 445
},
{
"epoch": 0.7045813586097947,
"grad_norm": 0.5395441651344299,
"learning_rate": 2.9699842022116905e-06,
"loss": 0.3448,
"step": 446
},
{
"epoch": 0.7061611374407583,
"grad_norm": 0.5883680582046509,
"learning_rate": 2.954186413902054e-06,
"loss": 0.4546,
"step": 447
},
{
"epoch": 0.707740916271722,
"grad_norm": 0.7300311326980591,
"learning_rate": 2.938388625592417e-06,
"loss": 0.368,
"step": 448
},
{
"epoch": 0.7093206951026856,
"grad_norm": 0.5901307463645935,
"learning_rate": 2.9225908372827806e-06,
"loss": 0.3688,
"step": 449
},
{
"epoch": 0.7109004739336493,
"grad_norm": 0.6521854996681213,
"learning_rate": 2.906793048973144e-06,
"loss": 0.3876,
"step": 450
},
{
"epoch": 0.7124802527646129,
"grad_norm": 0.688450038433075,
"learning_rate": 2.8909952606635073e-06,
"loss": 0.4298,
"step": 451
},
{
"epoch": 0.7140600315955766,
"grad_norm": 0.6533556580543518,
"learning_rate": 2.8751974723538706e-06,
"loss": 0.3589,
"step": 452
},
{
"epoch": 0.7156398104265402,
"grad_norm": 0.5261491537094116,
"learning_rate": 2.859399684044234e-06,
"loss": 0.3886,
"step": 453
},
{
"epoch": 0.717219589257504,
"grad_norm": 0.5488421320915222,
"learning_rate": 2.8436018957345973e-06,
"loss": 0.411,
"step": 454
},
{
"epoch": 0.7187993680884676,
"grad_norm": 0.6415657997131348,
"learning_rate": 2.8278041074249607e-06,
"loss": 0.4581,
"step": 455
},
{
"epoch": 0.7203791469194313,
"grad_norm": 0.5058445334434509,
"learning_rate": 2.812006319115324e-06,
"loss": 0.4325,
"step": 456
},
{
"epoch": 0.721958925750395,
"grad_norm": 0.6409322619438171,
"learning_rate": 2.7962085308056874e-06,
"loss": 0.3759,
"step": 457
},
{
"epoch": 0.7235387045813586,
"grad_norm": 0.5578014850616455,
"learning_rate": 2.7804107424960508e-06,
"loss": 0.3947,
"step": 458
},
{
"epoch": 0.7251184834123223,
"grad_norm": 0.6064183115959167,
"learning_rate": 2.764612954186414e-06,
"loss": 0.4766,
"step": 459
},
{
"epoch": 0.7266982622432859,
"grad_norm": 0.6067904233932495,
"learning_rate": 2.7488151658767775e-06,
"loss": 0.4698,
"step": 460
},
{
"epoch": 0.7282780410742496,
"grad_norm": 0.526088297367096,
"learning_rate": 2.733017377567141e-06,
"loss": 0.3997,
"step": 461
},
{
"epoch": 0.7298578199052133,
"grad_norm": 0.6290006637573242,
"learning_rate": 2.717219589257504e-06,
"loss": 0.4393,
"step": 462
},
{
"epoch": 0.731437598736177,
"grad_norm": 0.5822445154190063,
"learning_rate": 2.7014218009478675e-06,
"loss": 0.4767,
"step": 463
},
{
"epoch": 0.7330173775671406,
"grad_norm": 0.5798205733299255,
"learning_rate": 2.685624012638231e-06,
"loss": 0.4163,
"step": 464
},
{
"epoch": 0.7345971563981043,
"grad_norm": 0.6234124898910522,
"learning_rate": 2.6698262243285942e-06,
"loss": 0.387,
"step": 465
},
{
"epoch": 0.7361769352290679,
"grad_norm": 0.5226984620094299,
"learning_rate": 2.6540284360189576e-06,
"loss": 0.4144,
"step": 466
},
{
"epoch": 0.7377567140600316,
"grad_norm": 0.529303789138794,
"learning_rate": 2.638230647709321e-06,
"loss": 0.4689,
"step": 467
},
{
"epoch": 0.7393364928909952,
"grad_norm": 0.6620000004768372,
"learning_rate": 2.6224328593996843e-06,
"loss": 0.4358,
"step": 468
},
{
"epoch": 0.740916271721959,
"grad_norm": 0.8560294508934021,
"learning_rate": 2.606635071090048e-06,
"loss": 0.422,
"step": 469
},
{
"epoch": 0.7424960505529226,
"grad_norm": 0.47033989429473877,
"learning_rate": 2.5908372827804106e-06,
"loss": 0.4462,
"step": 470
},
{
"epoch": 0.7440758293838863,
"grad_norm": 0.5476656556129456,
"learning_rate": 2.575039494470774e-06,
"loss": 0.3818,
"step": 471
},
{
"epoch": 0.7456556082148499,
"grad_norm": 0.5771902203559875,
"learning_rate": 2.5592417061611373e-06,
"loss": 0.3835,
"step": 472
},
{
"epoch": 0.7472353870458136,
"grad_norm": 0.6452733278274536,
"learning_rate": 2.5434439178515007e-06,
"loss": 0.4224,
"step": 473
},
{
"epoch": 0.7488151658767772,
"grad_norm": 0.5318686962127686,
"learning_rate": 2.527646129541864e-06,
"loss": 0.4812,
"step": 474
},
{
"epoch": 0.7503949447077409,
"grad_norm": 0.6591460108757019,
"learning_rate": 2.5118483412322274e-06,
"loss": 0.4546,
"step": 475
},
{
"epoch": 0.7519747235387045,
"grad_norm": 0.5857440829277039,
"learning_rate": 2.4960505529225907e-06,
"loss": 0.4008,
"step": 476
},
{
"epoch": 0.7535545023696683,
"grad_norm": 0.6430768370628357,
"learning_rate": 2.480252764612954e-06,
"loss": 0.3191,
"step": 477
},
{
"epoch": 0.7551342812006319,
"grad_norm": 0.7442892789840698,
"learning_rate": 2.4644549763033174e-06,
"loss": 0.4171,
"step": 478
},
{
"epoch": 0.7567140600315956,
"grad_norm": 0.6390454173088074,
"learning_rate": 2.4486571879936812e-06,
"loss": 0.5381,
"step": 479
},
{
"epoch": 0.7582938388625592,
"grad_norm": 0.6277416348457336,
"learning_rate": 2.4328593996840446e-06,
"loss": 0.4824,
"step": 480
},
{
"epoch": 0.7598736176935229,
"grad_norm": 0.6043097972869873,
"learning_rate": 2.417061611374408e-06,
"loss": 0.4266,
"step": 481
},
{
"epoch": 0.7614533965244866,
"grad_norm": 0.6095964312553406,
"learning_rate": 2.4012638230647713e-06,
"loss": 0.4258,
"step": 482
},
{
"epoch": 0.7630331753554502,
"grad_norm": 0.5433639287948608,
"learning_rate": 2.3854660347551346e-06,
"loss": 0.4873,
"step": 483
},
{
"epoch": 0.764612954186414,
"grad_norm": 0.49287649989128113,
"learning_rate": 2.369668246445498e-06,
"loss": 0.4814,
"step": 484
},
{
"epoch": 0.7661927330173776,
"grad_norm": 0.5905902981758118,
"learning_rate": 2.3538704581358613e-06,
"loss": 0.4519,
"step": 485
},
{
"epoch": 0.7677725118483413,
"grad_norm": 0.6697285771369934,
"learning_rate": 2.3380726698262247e-06,
"loss": 0.4686,
"step": 486
},
{
"epoch": 0.7693522906793049,
"grad_norm": 0.5338664650917053,
"learning_rate": 2.322274881516588e-06,
"loss": 0.401,
"step": 487
},
{
"epoch": 0.7709320695102686,
"grad_norm": 0.5338428616523743,
"learning_rate": 2.3064770932069514e-06,
"loss": 0.4045,
"step": 488
},
{
"epoch": 0.7725118483412322,
"grad_norm": 0.6102830171585083,
"learning_rate": 2.2906793048973143e-06,
"loss": 0.3785,
"step": 489
},
{
"epoch": 0.7740916271721959,
"grad_norm": 0.5787335634231567,
"learning_rate": 2.2748815165876777e-06,
"loss": 0.42,
"step": 490
},
{
"epoch": 0.7756714060031595,
"grad_norm": 0.7426438331604004,
"learning_rate": 2.259083728278041e-06,
"loss": 0.4676,
"step": 491
},
{
"epoch": 0.7772511848341233,
"grad_norm": 0.5988475680351257,
"learning_rate": 2.2432859399684044e-06,
"loss": 0.5404,
"step": 492
},
{
"epoch": 0.7788309636650869,
"grad_norm": 0.6289830803871155,
"learning_rate": 2.2274881516587678e-06,
"loss": 0.396,
"step": 493
},
{
"epoch": 0.7804107424960506,
"grad_norm": 0.6077900528907776,
"learning_rate": 2.211690363349131e-06,
"loss": 0.4016,
"step": 494
},
{
"epoch": 0.7819905213270142,
"grad_norm": 0.8171889781951904,
"learning_rate": 2.1958925750394945e-06,
"loss": 0.3638,
"step": 495
},
{
"epoch": 0.7835703001579779,
"grad_norm": 0.6225026845932007,
"learning_rate": 2.180094786729858e-06,
"loss": 0.4088,
"step": 496
},
{
"epoch": 0.7851500789889415,
"grad_norm": 0.6262929439544678,
"learning_rate": 2.164296998420221e-06,
"loss": 0.3311,
"step": 497
},
{
"epoch": 0.7867298578199052,
"grad_norm": 0.662129282951355,
"learning_rate": 2.148499210110585e-06,
"loss": 0.4434,
"step": 498
},
{
"epoch": 0.7883096366508688,
"grad_norm": 0.5046777725219727,
"learning_rate": 2.1327014218009483e-06,
"loss": 0.5042,
"step": 499
},
{
"epoch": 0.7898894154818326,
"grad_norm": 0.6273382306098938,
"learning_rate": 2.1169036334913117e-06,
"loss": 0.345,
"step": 500
},
{
"epoch": 0.7914691943127962,
"grad_norm": 0.5484871864318848,
"learning_rate": 2.101105845181675e-06,
"loss": 0.3476,
"step": 501
},
{
"epoch": 0.7930489731437599,
"grad_norm": 0.6779518723487854,
"learning_rate": 2.085308056872038e-06,
"loss": 0.4062,
"step": 502
},
{
"epoch": 0.7946287519747235,
"grad_norm": 0.4969736635684967,
"learning_rate": 2.0695102685624013e-06,
"loss": 0.3615,
"step": 503
},
{
"epoch": 0.7962085308056872,
"grad_norm": 0.5542388558387756,
"learning_rate": 2.0537124802527647e-06,
"loss": 0.39,
"step": 504
},
{
"epoch": 0.7977883096366508,
"grad_norm": 0.8587651252746582,
"learning_rate": 2.037914691943128e-06,
"loss": 0.423,
"step": 505
},
{
"epoch": 0.7993680884676145,
"grad_norm": 0.6399357318878174,
"learning_rate": 2.0221169036334914e-06,
"loss": 0.4645,
"step": 506
},
{
"epoch": 0.8009478672985783,
"grad_norm": 0.5677849650382996,
"learning_rate": 2.0063191153238547e-06,
"loss": 0.3749,
"step": 507
},
{
"epoch": 0.8025276461295419,
"grad_norm": 0.5609621405601501,
"learning_rate": 1.990521327014218e-06,
"loss": 0.4727,
"step": 508
},
{
"epoch": 0.8041074249605056,
"grad_norm": 0.615185558795929,
"learning_rate": 1.9747235387045814e-06,
"loss": 0.4349,
"step": 509
},
{
"epoch": 0.8056872037914692,
"grad_norm": 0.5093739032745361,
"learning_rate": 1.958925750394945e-06,
"loss": 0.3502,
"step": 510
},
{
"epoch": 0.8072669826224329,
"grad_norm": 0.8513323068618774,
"learning_rate": 1.943127962085308e-06,
"loss": 0.3902,
"step": 511
},
{
"epoch": 0.8088467614533965,
"grad_norm": 0.6797610521316528,
"learning_rate": 1.9273301737756715e-06,
"loss": 0.4987,
"step": 512
},
{
"epoch": 0.8104265402843602,
"grad_norm": 0.5715585947036743,
"learning_rate": 1.911532385466035e-06,
"loss": 0.3965,
"step": 513
},
{
"epoch": 0.8120063191153238,
"grad_norm": 0.5537532567977905,
"learning_rate": 1.8957345971563982e-06,
"loss": 0.3832,
"step": 514
},
{
"epoch": 0.8135860979462876,
"grad_norm": 0.5337470173835754,
"learning_rate": 1.8799368088467616e-06,
"loss": 0.4136,
"step": 515
},
{
"epoch": 0.8151658767772512,
"grad_norm": 0.5929555892944336,
"learning_rate": 1.864139020537125e-06,
"loss": 0.3901,
"step": 516
},
{
"epoch": 0.8167456556082149,
"grad_norm": 0.6738921403884888,
"learning_rate": 1.8483412322274883e-06,
"loss": 0.4128,
"step": 517
},
{
"epoch": 0.8183254344391785,
"grad_norm": 0.598659098148346,
"learning_rate": 1.8325434439178516e-06,
"loss": 0.3707,
"step": 518
},
{
"epoch": 0.8199052132701422,
"grad_norm": 0.5679790377616882,
"learning_rate": 1.816745655608215e-06,
"loss": 0.457,
"step": 519
},
{
"epoch": 0.8214849921011058,
"grad_norm": 0.5459115505218506,
"learning_rate": 1.8009478672985784e-06,
"loss": 0.3613,
"step": 520
},
{
"epoch": 0.8230647709320695,
"grad_norm": 0.5752125978469849,
"learning_rate": 1.7851500789889417e-06,
"loss": 0.479,
"step": 521
},
{
"epoch": 0.8246445497630331,
"grad_norm": 0.5184637904167175,
"learning_rate": 1.769352290679305e-06,
"loss": 0.4126,
"step": 522
},
{
"epoch": 0.8262243285939969,
"grad_norm": 0.6329041123390198,
"learning_rate": 1.7535545023696684e-06,
"loss": 0.4221,
"step": 523
},
{
"epoch": 0.8278041074249605,
"grad_norm": 0.5233784317970276,
"learning_rate": 1.7377567140600318e-06,
"loss": 0.4375,
"step": 524
},
{
"epoch": 0.8293838862559242,
"grad_norm": 0.5424541234970093,
"learning_rate": 1.7219589257503951e-06,
"loss": 0.4447,
"step": 525
},
{
"epoch": 0.8309636650868878,
"grad_norm": 0.5534167885780334,
"learning_rate": 1.7061611374407585e-06,
"loss": 0.3672,
"step": 526
},
{
"epoch": 0.8325434439178515,
"grad_norm": 0.605102002620697,
"learning_rate": 1.6903633491311216e-06,
"loss": 0.4319,
"step": 527
},
{
"epoch": 0.8341232227488151,
"grad_norm": 0.5609396696090698,
"learning_rate": 1.674565560821485e-06,
"loss": 0.3984,
"step": 528
},
{
"epoch": 0.8357030015797788,
"grad_norm": 0.7964479923248291,
"learning_rate": 1.6587677725118483e-06,
"loss": 0.407,
"step": 529
},
{
"epoch": 0.8372827804107424,
"grad_norm": 0.4886048436164856,
"learning_rate": 1.6429699842022117e-06,
"loss": 0.4506,
"step": 530
},
{
"epoch": 0.8388625592417062,
"grad_norm": 0.543812096118927,
"learning_rate": 1.627172195892575e-06,
"loss": 0.3141,
"step": 531
},
{
"epoch": 0.8404423380726699,
"grad_norm": 0.5370059609413147,
"learning_rate": 1.6113744075829384e-06,
"loss": 0.3712,
"step": 532
},
{
"epoch": 0.8420221169036335,
"grad_norm": 0.7402203679084778,
"learning_rate": 1.595576619273302e-06,
"loss": 0.4136,
"step": 533
},
{
"epoch": 0.8436018957345972,
"grad_norm": 0.6814244985580444,
"learning_rate": 1.5797788309636653e-06,
"loss": 0.4634,
"step": 534
},
{
"epoch": 0.8451816745655608,
"grad_norm": 0.5919080972671509,
"learning_rate": 1.5639810426540287e-06,
"loss": 0.4238,
"step": 535
},
{
"epoch": 0.8467614533965245,
"grad_norm": 0.617522120475769,
"learning_rate": 1.548183254344392e-06,
"loss": 0.3431,
"step": 536
},
{
"epoch": 0.8483412322274881,
"grad_norm": 0.49482643604278564,
"learning_rate": 1.5323854660347554e-06,
"loss": 0.3882,
"step": 537
},
{
"epoch": 0.8499210110584519,
"grad_norm": 0.5525531768798828,
"learning_rate": 1.5165876777251187e-06,
"loss": 0.4053,
"step": 538
},
{
"epoch": 0.8515007898894155,
"grad_norm": 0.6634103655815125,
"learning_rate": 1.500789889415482e-06,
"loss": 0.4624,
"step": 539
},
{
"epoch": 0.8530805687203792,
"grad_norm": 0.45309382677078247,
"learning_rate": 1.4849921011058452e-06,
"loss": 0.3486,
"step": 540
},
{
"epoch": 0.8546603475513428,
"grad_norm": 0.778338611125946,
"learning_rate": 1.4691943127962086e-06,
"loss": 0.3984,
"step": 541
},
{
"epoch": 0.8562401263823065,
"grad_norm": 0.6093356609344482,
"learning_rate": 1.453396524486572e-06,
"loss": 0.333,
"step": 542
},
{
"epoch": 0.8578199052132701,
"grad_norm": 0.49551188945770264,
"learning_rate": 1.4375987361769353e-06,
"loss": 0.3915,
"step": 543
},
{
"epoch": 0.8593996840442338,
"grad_norm": 0.5423188209533691,
"learning_rate": 1.4218009478672987e-06,
"loss": 0.4192,
"step": 544
},
{
"epoch": 0.8609794628751974,
"grad_norm": 0.8111097812652588,
"learning_rate": 1.406003159557662e-06,
"loss": 0.473,
"step": 545
},
{
"epoch": 0.8625592417061612,
"grad_norm": 0.6064862012863159,
"learning_rate": 1.3902053712480254e-06,
"loss": 0.4164,
"step": 546
},
{
"epoch": 0.8641390205371248,
"grad_norm": 0.6180470585823059,
"learning_rate": 1.3744075829383887e-06,
"loss": 0.4351,
"step": 547
},
{
"epoch": 0.8657187993680885,
"grad_norm": 0.5101069808006287,
"learning_rate": 1.358609794628752e-06,
"loss": 0.3806,
"step": 548
},
{
"epoch": 0.8672985781990521,
"grad_norm": 0.6269749402999878,
"learning_rate": 1.3428120063191154e-06,
"loss": 0.4028,
"step": 549
},
{
"epoch": 0.8688783570300158,
"grad_norm": 0.6344918608665466,
"learning_rate": 1.3270142180094788e-06,
"loss": 0.3206,
"step": 550
},
{
"epoch": 0.8704581358609794,
"grad_norm": 0.7053835988044739,
"learning_rate": 1.3112164296998422e-06,
"loss": 0.4404,
"step": 551
},
{
"epoch": 0.8720379146919431,
"grad_norm": 0.4780917465686798,
"learning_rate": 1.2954186413902053e-06,
"loss": 0.4089,
"step": 552
},
{
"epoch": 0.8736176935229067,
"grad_norm": 0.5235942006111145,
"learning_rate": 1.2796208530805687e-06,
"loss": 0.3992,
"step": 553
},
{
"epoch": 0.8751974723538705,
"grad_norm": 0.5037370324134827,
"learning_rate": 1.263823064770932e-06,
"loss": 0.3727,
"step": 554
},
{
"epoch": 0.8767772511848341,
"grad_norm": 0.5422868132591248,
"learning_rate": 1.2480252764612954e-06,
"loss": 0.4524,
"step": 555
},
{
"epoch": 0.8783570300157978,
"grad_norm": 0.5287191271781921,
"learning_rate": 1.2322274881516587e-06,
"loss": 0.3445,
"step": 556
},
{
"epoch": 0.8799368088467614,
"grad_norm": 0.49679964780807495,
"learning_rate": 1.2164296998420223e-06,
"loss": 0.3357,
"step": 557
},
{
"epoch": 0.8815165876777251,
"grad_norm": 0.5391539931297302,
"learning_rate": 1.2006319115323856e-06,
"loss": 0.4645,
"step": 558
},
{
"epoch": 0.8830963665086888,
"grad_norm": 0.5474575757980347,
"learning_rate": 1.184834123222749e-06,
"loss": 0.4109,
"step": 559
},
{
"epoch": 0.8846761453396524,
"grad_norm": 0.5920886993408203,
"learning_rate": 1.1690363349131124e-06,
"loss": 0.4034,
"step": 560
},
{
"epoch": 0.8862559241706162,
"grad_norm": 0.5637263655662537,
"learning_rate": 1.1532385466034757e-06,
"loss": 0.392,
"step": 561
},
{
"epoch": 0.8878357030015798,
"grad_norm": 0.6719076037406921,
"learning_rate": 1.1374407582938388e-06,
"loss": 0.3798,
"step": 562
},
{
"epoch": 0.8894154818325435,
"grad_norm": 0.5554001927375793,
"learning_rate": 1.1216429699842022e-06,
"loss": 0.3901,
"step": 563
},
{
"epoch": 0.8909952606635071,
"grad_norm": 0.6078475713729858,
"learning_rate": 1.1058451816745656e-06,
"loss": 0.3574,
"step": 564
},
{
"epoch": 0.8925750394944708,
"grad_norm": 0.9478325843811035,
"learning_rate": 1.090047393364929e-06,
"loss": 0.3831,
"step": 565
},
{
"epoch": 0.8941548183254344,
"grad_norm": 0.5259877443313599,
"learning_rate": 1.0742496050552925e-06,
"loss": 0.4003,
"step": 566
},
{
"epoch": 0.8957345971563981,
"grad_norm": 0.5395880937576294,
"learning_rate": 1.0584518167456558e-06,
"loss": 0.3513,
"step": 567
},
{
"epoch": 0.8973143759873617,
"grad_norm": 0.5458592772483826,
"learning_rate": 1.042654028436019e-06,
"loss": 0.49,
"step": 568
},
{
"epoch": 0.8988941548183255,
"grad_norm": 0.5552616715431213,
"learning_rate": 1.0268562401263823e-06,
"loss": 0.3905,
"step": 569
},
{
"epoch": 0.9004739336492891,
"grad_norm": 0.551466166973114,
"learning_rate": 1.0110584518167457e-06,
"loss": 0.4241,
"step": 570
},
{
"epoch": 0.9020537124802528,
"grad_norm": 0.7195900082588196,
"learning_rate": 9.95260663507109e-07,
"loss": 0.3912,
"step": 571
},
{
"epoch": 0.9036334913112164,
"grad_norm": 0.5951517820358276,
"learning_rate": 9.794628751974724e-07,
"loss": 0.4267,
"step": 572
},
{
"epoch": 0.9052132701421801,
"grad_norm": 0.7582541108131409,
"learning_rate": 9.636650868878358e-07,
"loss": 0.4024,
"step": 573
},
{
"epoch": 0.9067930489731437,
"grad_norm": 0.6346389651298523,
"learning_rate": 9.478672985781991e-07,
"loss": 0.4677,
"step": 574
},
{
"epoch": 0.9083728278041074,
"grad_norm": 0.7323048710823059,
"learning_rate": 9.320695102685625e-07,
"loss": 0.4332,
"step": 575
},
{
"epoch": 0.909952606635071,
"grad_norm": 0.5796726942062378,
"learning_rate": 9.162717219589258e-07,
"loss": 0.3514,
"step": 576
},
{
"epoch": 0.9115323854660348,
"grad_norm": 0.7424004673957825,
"learning_rate": 9.004739336492892e-07,
"loss": 0.4178,
"step": 577
},
{
"epoch": 0.9131121642969984,
"grad_norm": 0.525142252445221,
"learning_rate": 8.846761453396525e-07,
"loss": 0.4498,
"step": 578
},
{
"epoch": 0.9146919431279621,
"grad_norm": 0.5565955638885498,
"learning_rate": 8.688783570300159e-07,
"loss": 0.4532,
"step": 579
},
{
"epoch": 0.9162717219589257,
"grad_norm": 0.540267288684845,
"learning_rate": 8.530805687203792e-07,
"loss": 0.4828,
"step": 580
},
{
"epoch": 0.9178515007898894,
"grad_norm": 0.5061677694320679,
"learning_rate": 8.372827804107425e-07,
"loss": 0.3505,
"step": 581
},
{
"epoch": 0.919431279620853,
"grad_norm": 0.5490908622741699,
"learning_rate": 8.214849921011058e-07,
"loss": 0.4402,
"step": 582
},
{
"epoch": 0.9210110584518167,
"grad_norm": 0.5788997411727905,
"learning_rate": 8.056872037914692e-07,
"loss": 0.3256,
"step": 583
},
{
"epoch": 0.9225908372827805,
"grad_norm": 0.5741492509841919,
"learning_rate": 7.898894154818327e-07,
"loss": 0.451,
"step": 584
},
{
"epoch": 0.9241706161137441,
"grad_norm": 0.5012090802192688,
"learning_rate": 7.74091627172196e-07,
"loss": 0.3513,
"step": 585
},
{
"epoch": 0.9257503949447078,
"grad_norm": 0.5613192915916443,
"learning_rate": 7.582938388625594e-07,
"loss": 0.3499,
"step": 586
},
{
"epoch": 0.9273301737756714,
"grad_norm": 0.5941815376281738,
"learning_rate": 7.424960505529226e-07,
"loss": 0.4133,
"step": 587
},
{
"epoch": 0.9289099526066351,
"grad_norm": 0.7772453427314758,
"learning_rate": 7.26698262243286e-07,
"loss": 0.3818,
"step": 588
},
{
"epoch": 0.9304897314375987,
"grad_norm": 0.5977700352668762,
"learning_rate": 7.109004739336493e-07,
"loss": 0.4099,
"step": 589
},
{
"epoch": 0.9320695102685624,
"grad_norm": 0.7777069807052612,
"learning_rate": 6.951026856240127e-07,
"loss": 0.4341,
"step": 590
},
{
"epoch": 0.933649289099526,
"grad_norm": 0.5362728834152222,
"learning_rate": 6.79304897314376e-07,
"loss": 0.4431,
"step": 591
},
{
"epoch": 0.9352290679304898,
"grad_norm": 0.5126134157180786,
"learning_rate": 6.635071090047394e-07,
"loss": 0.3713,
"step": 592
},
{
"epoch": 0.9368088467614534,
"grad_norm": 0.5886785984039307,
"learning_rate": 6.477093206951026e-07,
"loss": 0.405,
"step": 593
},
{
"epoch": 0.9383886255924171,
"grad_norm": 0.5328089594841003,
"learning_rate": 6.31911532385466e-07,
"loss": 0.3952,
"step": 594
},
{
"epoch": 0.9399684044233807,
"grad_norm": 0.7170501351356506,
"learning_rate": 6.161137440758294e-07,
"loss": 0.3979,
"step": 595
},
{
"epoch": 0.9415481832543444,
"grad_norm": 0.6048548817634583,
"learning_rate": 6.003159557661928e-07,
"loss": 0.3425,
"step": 596
},
{
"epoch": 0.943127962085308,
"grad_norm": 0.5635291337966919,
"learning_rate": 5.845181674565562e-07,
"loss": 0.3008,
"step": 597
},
{
"epoch": 0.9447077409162717,
"grad_norm": 0.6890112161636353,
"learning_rate": 5.687203791469194e-07,
"loss": 0.4205,
"step": 598
},
{
"epoch": 0.9462875197472354,
"grad_norm": 0.5197014212608337,
"learning_rate": 5.529225908372828e-07,
"loss": 0.4589,
"step": 599
},
{
"epoch": 0.9478672985781991,
"grad_norm": 0.5197718143463135,
"learning_rate": 5.371248025276462e-07,
"loss": 0.2678,
"step": 600
},
{
"epoch": 0.9494470774091627,
"grad_norm": 0.44931474328041077,
"learning_rate": 5.213270142180095e-07,
"loss": 0.4351,
"step": 601
},
{
"epoch": 0.9510268562401264,
"grad_norm": 0.47795984148979187,
"learning_rate": 5.055292259083728e-07,
"loss": 0.4392,
"step": 602
},
{
"epoch": 0.95260663507109,
"grad_norm": 0.6027578115463257,
"learning_rate": 4.897314375987362e-07,
"loss": 0.4499,
"step": 603
},
{
"epoch": 0.9541864139020537,
"grad_norm": 0.6160722374916077,
"learning_rate": 4.7393364928909956e-07,
"loss": 0.434,
"step": 604
},
{
"epoch": 0.9557661927330173,
"grad_norm": 0.8371343612670898,
"learning_rate": 4.581358609794629e-07,
"loss": 0.3911,
"step": 605
},
{
"epoch": 0.957345971563981,
"grad_norm": 0.5282484292984009,
"learning_rate": 4.4233807266982627e-07,
"loss": 0.4445,
"step": 606
},
{
"epoch": 0.9589257503949447,
"grad_norm": 0.5557743310928345,
"learning_rate": 4.265402843601896e-07,
"loss": 0.4103,
"step": 607
},
{
"epoch": 0.9605055292259084,
"grad_norm": 0.6362637281417847,
"learning_rate": 4.107424960505529e-07,
"loss": 0.3856,
"step": 608
},
{
"epoch": 0.9620853080568721,
"grad_norm": 0.745617151260376,
"learning_rate": 3.9494470774091633e-07,
"loss": 0.4179,
"step": 609
},
{
"epoch": 0.9636650868878357,
"grad_norm": 0.659038782119751,
"learning_rate": 3.791469194312797e-07,
"loss": 0.4027,
"step": 610
},
{
"epoch": 0.9652448657187994,
"grad_norm": 0.645199716091156,
"learning_rate": 3.63349131121643e-07,
"loss": 0.3501,
"step": 611
},
{
"epoch": 0.966824644549763,
"grad_norm": 0.4868941605091095,
"learning_rate": 3.4755134281200634e-07,
"loss": 0.3385,
"step": 612
},
{
"epoch": 0.9684044233807267,
"grad_norm": 0.5993934273719788,
"learning_rate": 3.317535545023697e-07,
"loss": 0.369,
"step": 613
},
{
"epoch": 0.9699842022116903,
"grad_norm": 0.6094574928283691,
"learning_rate": 3.15955766192733e-07,
"loss": 0.4899,
"step": 614
},
{
"epoch": 0.9715639810426541,
"grad_norm": 0.6989656686782837,
"learning_rate": 3.001579778830964e-07,
"loss": 0.4346,
"step": 615
},
{
"epoch": 0.9731437598736177,
"grad_norm": 0.5412940382957458,
"learning_rate": 2.843601895734597e-07,
"loss": 0.4515,
"step": 616
},
{
"epoch": 0.9747235387045814,
"grad_norm": 0.507622241973877,
"learning_rate": 2.685624012638231e-07,
"loss": 0.4171,
"step": 617
},
{
"epoch": 0.976303317535545,
"grad_norm": 0.4564089775085449,
"learning_rate": 2.527646129541864e-07,
"loss": 0.3452,
"step": 618
},
{
"epoch": 0.9778830963665087,
"grad_norm": 0.48170286417007446,
"learning_rate": 2.3696682464454978e-07,
"loss": 0.3866,
"step": 619
},
{
"epoch": 0.9794628751974723,
"grad_norm": 0.47774481773376465,
"learning_rate": 2.2116903633491313e-07,
"loss": 0.4425,
"step": 620
},
{
"epoch": 0.981042654028436,
"grad_norm": 0.4460739493370056,
"learning_rate": 2.0537124802527646e-07,
"loss": 0.3991,
"step": 621
},
{
"epoch": 0.9826224328593997,
"grad_norm": 0.536359965801239,
"learning_rate": 1.8957345971563984e-07,
"loss": 0.327,
"step": 622
},
{
"epoch": 0.9842022116903634,
"grad_norm": 0.5439571738243103,
"learning_rate": 1.7377567140600317e-07,
"loss": 0.408,
"step": 623
},
{
"epoch": 0.985781990521327,
"grad_norm": 0.8827345967292786,
"learning_rate": 1.579778830963665e-07,
"loss": 0.4924,
"step": 624
},
{
"epoch": 0.9873617693522907,
"grad_norm": 0.4992835521697998,
"learning_rate": 1.4218009478672986e-07,
"loss": 0.3921,
"step": 625
},
{
"epoch": 0.9889415481832543,
"grad_norm": 0.7306237816810608,
"learning_rate": 1.263823064770932e-07,
"loss": 0.5063,
"step": 626
},
{
"epoch": 0.990521327014218,
"grad_norm": 0.5200903415679932,
"learning_rate": 1.1058451816745657e-07,
"loss": 0.358,
"step": 627
},
{
"epoch": 0.9921011058451816,
"grad_norm": 0.42708104848861694,
"learning_rate": 9.478672985781992e-08,
"loss": 0.3361,
"step": 628
},
{
"epoch": 0.9936808846761453,
"grad_norm": 0.5993225574493408,
"learning_rate": 7.898894154818325e-08,
"loss": 0.3625,
"step": 629
},
{
"epoch": 0.995260663507109,
"grad_norm": 0.49995774030685425,
"learning_rate": 6.31911532385466e-08,
"loss": 0.3746,
"step": 630
},
{
"epoch": 0.9968404423380727,
"grad_norm": 0.5806180238723755,
"learning_rate": 4.739336492890996e-08,
"loss": 0.3727,
"step": 631
},
{
"epoch": 0.9984202211690363,
"grad_norm": 0.5514349341392517,
"learning_rate": 3.15955766192733e-08,
"loss": 0.4634,
"step": 632
},
{
"epoch": 1.0,
"grad_norm": 0.4094119668006897,
"learning_rate": 1.579778830963665e-08,
"loss": 0.2044,
"step": 633
}
],
"logging_steps": 1.0,
"max_steps": 633,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.9805266972408545e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}