diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4465 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 0, + "global_step": 633, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001579778830963665, + "grad_norm": 3.3021833896636963, + "learning_rate": 1e-05, + "loss": 0.8142, + "step": 1 + }, + { + "epoch": 0.00315955766192733, + "grad_norm": 0.5667713284492493, + "learning_rate": 9.984202211690363e-06, + "loss": 0.4081, + "step": 2 + }, + { + "epoch": 0.004739336492890996, + "grad_norm": 7.904314994812012, + "learning_rate": 9.968404423380728e-06, + "loss": 1.1876, + "step": 3 + }, + { + "epoch": 0.00631911532385466, + "grad_norm": 10.157713890075684, + "learning_rate": 9.95260663507109e-06, + "loss": 1.4092, + "step": 4 + }, + { + "epoch": 0.007898894154818325, + "grad_norm": 4.723056316375732, + "learning_rate": 9.936808846761454e-06, + "loss": 0.7578, + "step": 5 + }, + { + "epoch": 0.009478672985781991, + "grad_norm": 7.033465385437012, + "learning_rate": 9.921011058451816e-06, + "loss": 0.5175, + "step": 6 + }, + { + "epoch": 0.011058451816745656, + "grad_norm": 0.800440788269043, + "learning_rate": 9.905213270142182e-06, + "loss": 0.4077, + "step": 7 + }, + { + "epoch": 0.01263823064770932, + "grad_norm": 0.6944026350975037, + "learning_rate": 9.889415481832544e-06, + "loss": 0.4686, + "step": 8 + }, + { + "epoch": 0.014218009478672985, + "grad_norm": 0.5700448751449585, + "learning_rate": 9.873617693522908e-06, + "loss": 0.3623, + "step": 9 + }, + { + "epoch": 0.01579778830963665, + "grad_norm": 0.7115408778190613, + "learning_rate": 9.85781990521327e-06, + "loss": 0.4727, + "step": 10 + }, + { + "epoch": 0.017377567140600316, + "grad_norm": 0.5764197707176208, + "learning_rate": 9.842022116903635e-06, + "loss": 0.4054, + "step": 11 + }, + { + "epoch": 0.018957345971563982, + "grad_norm": 0.615205705165863, + "learning_rate": 9.826224328593997e-06, + "loss": 0.3798, + "step": 12 + }, + { + "epoch": 0.020537124802527645, + "grad_norm": 0.6402739882469177, + "learning_rate": 9.810426540284361e-06, + "loss": 0.3966, + "step": 13 + }, + { + "epoch": 0.022116903633491312, + "grad_norm": 0.6007937788963318, + "learning_rate": 9.794628751974725e-06, + "loss": 0.4158, + "step": 14 + }, + { + "epoch": 0.023696682464454975, + "grad_norm": 0.5462563037872314, + "learning_rate": 9.778830963665089e-06, + "loss": 0.4795, + "step": 15 + }, + { + "epoch": 0.02527646129541864, + "grad_norm": 0.6038461923599243, + "learning_rate": 9.76303317535545e-06, + "loss": 0.4142, + "step": 16 + }, + { + "epoch": 0.026856240126382307, + "grad_norm": 0.514258861541748, + "learning_rate": 9.747235387045815e-06, + "loss": 0.4139, + "step": 17 + }, + { + "epoch": 0.02843601895734597, + "grad_norm": 0.728235125541687, + "learning_rate": 9.731437598736178e-06, + "loss": 0.3129, + "step": 18 + }, + { + "epoch": 0.030015797788309637, + "grad_norm": 0.7013534307479858, + "learning_rate": 9.715639810426542e-06, + "loss": 0.4275, + "step": 19 + }, + { + "epoch": 0.0315955766192733, + "grad_norm": 0.6062476634979248, + "learning_rate": 9.699842022116904e-06, + "loss": 0.3961, + "step": 20 + }, + { + "epoch": 0.03317535545023697, + "grad_norm": 0.6089779138565063, + "learning_rate": 9.684044233807268e-06, + "loss": 0.4972, + "step": 21 + }, + { + "epoch": 0.03475513428120063, + "grad_norm": 0.6651365756988525, + "learning_rate": 9.668246445497632e-06, + "loss": 0.4714, + "step": 22 + }, + { + "epoch": 0.036334913112164295, + "grad_norm": 0.6064260601997375, + "learning_rate": 9.652448657187995e-06, + "loss": 0.4358, + "step": 23 + }, + { + "epoch": 0.037914691943127965, + "grad_norm": 0.5868542790412903, + "learning_rate": 9.636650868878358e-06, + "loss": 0.5178, + "step": 24 + }, + { + "epoch": 0.03949447077409163, + "grad_norm": 0.6516690850257874, + "learning_rate": 9.620853080568721e-06, + "loss": 0.4281, + "step": 25 + }, + { + "epoch": 0.04107424960505529, + "grad_norm": 0.7721027731895447, + "learning_rate": 9.605055292259085e-06, + "loss": 0.4979, + "step": 26 + }, + { + "epoch": 0.04265402843601896, + "grad_norm": 0.6200973987579346, + "learning_rate": 9.589257503949447e-06, + "loss": 0.347, + "step": 27 + }, + { + "epoch": 0.044233807266982623, + "grad_norm": 0.6557235717773438, + "learning_rate": 9.573459715639811e-06, + "loss": 0.3422, + "step": 28 + }, + { + "epoch": 0.045813586097946286, + "grad_norm": 1.0422502756118774, + "learning_rate": 9.557661927330175e-06, + "loss": 0.4955, + "step": 29 + }, + { + "epoch": 0.04739336492890995, + "grad_norm": 0.8272190093994141, + "learning_rate": 9.541864139020539e-06, + "loss": 0.434, + "step": 30 + }, + { + "epoch": 0.04897314375987362, + "grad_norm": 0.5929948091506958, + "learning_rate": 9.5260663507109e-06, + "loss": 0.5042, + "step": 31 + }, + { + "epoch": 0.05055292259083728, + "grad_norm": 0.7872880101203918, + "learning_rate": 9.510268562401264e-06, + "loss": 0.5175, + "step": 32 + }, + { + "epoch": 0.052132701421800945, + "grad_norm": 0.6884463429450989, + "learning_rate": 9.494470774091628e-06, + "loss": 0.5104, + "step": 33 + }, + { + "epoch": 0.053712480252764615, + "grad_norm": 1.215976357460022, + "learning_rate": 9.478672985781992e-06, + "loss": 0.4742, + "step": 34 + }, + { + "epoch": 0.05529225908372828, + "grad_norm": 0.7471550107002258, + "learning_rate": 9.462875197472354e-06, + "loss": 0.4374, + "step": 35 + }, + { + "epoch": 0.05687203791469194, + "grad_norm": 0.6779741048812866, + "learning_rate": 9.447077409162718e-06, + "loss": 0.4337, + "step": 36 + }, + { + "epoch": 0.05845181674565561, + "grad_norm": 0.5205997824668884, + "learning_rate": 9.431279620853082e-06, + "loss": 0.4296, + "step": 37 + }, + { + "epoch": 0.06003159557661927, + "grad_norm": 0.381757527589798, + "learning_rate": 9.415481832543445e-06, + "loss": 0.2223, + "step": 38 + }, + { + "epoch": 0.061611374407582936, + "grad_norm": 0.650593101978302, + "learning_rate": 9.399684044233807e-06, + "loss": 0.5066, + "step": 39 + }, + { + "epoch": 0.0631911532385466, + "grad_norm": 0.5445153117179871, + "learning_rate": 9.383886255924171e-06, + "loss": 0.4998, + "step": 40 + }, + { + "epoch": 0.06477093206951026, + "grad_norm": 0.5024020671844482, + "learning_rate": 9.368088467614535e-06, + "loss": 0.4121, + "step": 41 + }, + { + "epoch": 0.06635071090047394, + "grad_norm": 0.6259915232658386, + "learning_rate": 9.352290679304899e-06, + "loss": 0.4969, + "step": 42 + }, + { + "epoch": 0.0679304897314376, + "grad_norm": 0.49405789375305176, + "learning_rate": 9.336492890995261e-06, + "loss": 0.4121, + "step": 43 + }, + { + "epoch": 0.06951026856240126, + "grad_norm": 0.7586628198623657, + "learning_rate": 9.320695102685625e-06, + "loss": 0.4782, + "step": 44 + }, + { + "epoch": 0.07109004739336493, + "grad_norm": 0.6203773021697998, + "learning_rate": 9.304897314375988e-06, + "loss": 0.3579, + "step": 45 + }, + { + "epoch": 0.07266982622432859, + "grad_norm": 0.6982845067977905, + "learning_rate": 9.289099526066352e-06, + "loss": 0.3876, + "step": 46 + }, + { + "epoch": 0.07424960505529225, + "grad_norm": 0.5712842345237732, + "learning_rate": 9.273301737756714e-06, + "loss": 0.4288, + "step": 47 + }, + { + "epoch": 0.07582938388625593, + "grad_norm": 0.6829891204833984, + "learning_rate": 9.257503949447078e-06, + "loss": 0.4939, + "step": 48 + }, + { + "epoch": 0.07740916271721959, + "grad_norm": 0.5508958101272583, + "learning_rate": 9.241706161137442e-06, + "loss": 0.372, + "step": 49 + }, + { + "epoch": 0.07898894154818326, + "grad_norm": 0.9345032572746277, + "learning_rate": 9.225908372827806e-06, + "loss": 0.4896, + "step": 50 + }, + { + "epoch": 0.08056872037914692, + "grad_norm": 0.6280492544174194, + "learning_rate": 9.210110584518168e-06, + "loss": 0.4375, + "step": 51 + }, + { + "epoch": 0.08214849921011058, + "grad_norm": 0.6853601336479187, + "learning_rate": 9.194312796208532e-06, + "loss": 0.4294, + "step": 52 + }, + { + "epoch": 0.08372827804107424, + "grad_norm": 0.6665984392166138, + "learning_rate": 9.178515007898895e-06, + "loss": 0.5894, + "step": 53 + }, + { + "epoch": 0.08530805687203792, + "grad_norm": 0.5088407397270203, + "learning_rate": 9.162717219589257e-06, + "loss": 0.3853, + "step": 54 + }, + { + "epoch": 0.08688783570300158, + "grad_norm": 0.5319867730140686, + "learning_rate": 9.146919431279621e-06, + "loss": 0.4791, + "step": 55 + }, + { + "epoch": 0.08846761453396525, + "grad_norm": 0.6452597975730896, + "learning_rate": 9.131121642969985e-06, + "loss": 0.4056, + "step": 56 + }, + { + "epoch": 0.09004739336492891, + "grad_norm": 0.6769601106643677, + "learning_rate": 9.115323854660349e-06, + "loss": 0.4253, + "step": 57 + }, + { + "epoch": 0.09162717219589257, + "grad_norm": 0.5170547962188721, + "learning_rate": 9.09952606635071e-06, + "loss": 0.4211, + "step": 58 + }, + { + "epoch": 0.09320695102685624, + "grad_norm": 0.5035193562507629, + "learning_rate": 9.083728278041075e-06, + "loss": 0.3144, + "step": 59 + }, + { + "epoch": 0.0947867298578199, + "grad_norm": 0.5919070243835449, + "learning_rate": 9.067930489731438e-06, + "loss": 0.4533, + "step": 60 + }, + { + "epoch": 0.09636650868878358, + "grad_norm": 0.6510637998580933, + "learning_rate": 9.052132701421802e-06, + "loss": 0.4701, + "step": 61 + }, + { + "epoch": 0.09794628751974724, + "grad_norm": 0.5784177780151367, + "learning_rate": 9.036334913112164e-06, + "loss": 0.3896, + "step": 62 + }, + { + "epoch": 0.0995260663507109, + "grad_norm": 0.7009139060974121, + "learning_rate": 9.020537124802528e-06, + "loss": 0.5018, + "step": 63 + }, + { + "epoch": 0.10110584518167456, + "grad_norm": 0.5086057186126709, + "learning_rate": 9.004739336492892e-06, + "loss": 0.4305, + "step": 64 + }, + { + "epoch": 0.10268562401263823, + "grad_norm": 0.5124595761299133, + "learning_rate": 8.988941548183256e-06, + "loss": 0.4473, + "step": 65 + }, + { + "epoch": 0.10426540284360189, + "grad_norm": 0.6409702897071838, + "learning_rate": 8.973143759873618e-06, + "loss": 0.429, + "step": 66 + }, + { + "epoch": 0.10584518167456557, + "grad_norm": 0.5651409029960632, + "learning_rate": 8.957345971563981e-06, + "loss": 0.4036, + "step": 67 + }, + { + "epoch": 0.10742496050552923, + "grad_norm": 0.6658238172531128, + "learning_rate": 8.941548183254345e-06, + "loss": 0.4726, + "step": 68 + }, + { + "epoch": 0.10900473933649289, + "grad_norm": 0.444815993309021, + "learning_rate": 8.925750394944709e-06, + "loss": 0.4016, + "step": 69 + }, + { + "epoch": 0.11058451816745656, + "grad_norm": 0.5855506658554077, + "learning_rate": 8.909952606635071e-06, + "loss": 0.4531, + "step": 70 + }, + { + "epoch": 0.11216429699842022, + "grad_norm": 0.693794310092926, + "learning_rate": 8.894154818325435e-06, + "loss": 0.4382, + "step": 71 + }, + { + "epoch": 0.11374407582938388, + "grad_norm": 0.6658089756965637, + "learning_rate": 8.878357030015799e-06, + "loss": 0.4571, + "step": 72 + }, + { + "epoch": 0.11532385466034756, + "grad_norm": 1.0504828691482544, + "learning_rate": 8.862559241706162e-06, + "loss": 0.4311, + "step": 73 + }, + { + "epoch": 0.11690363349131122, + "grad_norm": 0.5297814607620239, + "learning_rate": 8.846761453396524e-06, + "loss": 0.4391, + "step": 74 + }, + { + "epoch": 0.11848341232227488, + "grad_norm": 0.6601409316062927, + "learning_rate": 8.830963665086888e-06, + "loss": 0.5125, + "step": 75 + }, + { + "epoch": 0.12006319115323855, + "grad_norm": 0.6345618963241577, + "learning_rate": 8.815165876777252e-06, + "loss": 0.4471, + "step": 76 + }, + { + "epoch": 0.12164296998420221, + "grad_norm": 0.5008222460746765, + "learning_rate": 8.799368088467614e-06, + "loss": 0.3845, + "step": 77 + }, + { + "epoch": 0.12322274881516587, + "grad_norm": 0.5394203066825867, + "learning_rate": 8.783570300157978e-06, + "loss": 0.4117, + "step": 78 + }, + { + "epoch": 0.12480252764612954, + "grad_norm": 0.6255345940589905, + "learning_rate": 8.767772511848342e-06, + "loss": 0.512, + "step": 79 + }, + { + "epoch": 0.1263823064770932, + "grad_norm": 0.6215748190879822, + "learning_rate": 8.751974723538705e-06, + "loss": 0.509, + "step": 80 + }, + { + "epoch": 0.12796208530805686, + "grad_norm": 0.611587405204773, + "learning_rate": 8.736176935229068e-06, + "loss": 0.4036, + "step": 81 + }, + { + "epoch": 0.12954186413902052, + "grad_norm": 0.5373330116271973, + "learning_rate": 8.720379146919431e-06, + "loss": 0.393, + "step": 82 + }, + { + "epoch": 0.13112164296998421, + "grad_norm": 0.5936598181724548, + "learning_rate": 8.704581358609795e-06, + "loss": 0.4092, + "step": 83 + }, + { + "epoch": 0.13270142180094788, + "grad_norm": 0.576614260673523, + "learning_rate": 8.688783570300159e-06, + "loss": 0.5513, + "step": 84 + }, + { + "epoch": 0.13428120063191154, + "grad_norm": 0.5715078711509705, + "learning_rate": 8.672985781990521e-06, + "loss": 0.4403, + "step": 85 + }, + { + "epoch": 0.1358609794628752, + "grad_norm": 0.6212042570114136, + "learning_rate": 8.657187993680885e-06, + "loss": 0.391, + "step": 86 + }, + { + "epoch": 0.13744075829383887, + "grad_norm": 0.5439122319221497, + "learning_rate": 8.641390205371249e-06, + "loss": 0.4764, + "step": 87 + }, + { + "epoch": 0.13902053712480253, + "grad_norm": 0.6808428168296814, + "learning_rate": 8.625592417061612e-06, + "loss": 0.512, + "step": 88 + }, + { + "epoch": 0.1406003159557662, + "grad_norm": 0.7429847717285156, + "learning_rate": 8.609794628751974e-06, + "loss": 0.3834, + "step": 89 + }, + { + "epoch": 0.14218009478672985, + "grad_norm": 0.6030511260032654, + "learning_rate": 8.59399684044234e-06, + "loss": 0.4631, + "step": 90 + }, + { + "epoch": 0.14375987361769352, + "grad_norm": 0.6499682068824768, + "learning_rate": 8.578199052132702e-06, + "loss": 0.4484, + "step": 91 + }, + { + "epoch": 0.14533965244865718, + "grad_norm": 0.6490275859832764, + "learning_rate": 8.562401263823066e-06, + "loss": 0.414, + "step": 92 + }, + { + "epoch": 0.14691943127962084, + "grad_norm": 0.6859791874885559, + "learning_rate": 8.546603475513428e-06, + "loss": 0.386, + "step": 93 + }, + { + "epoch": 0.1484992101105845, + "grad_norm": 0.5281291007995605, + "learning_rate": 8.530805687203793e-06, + "loss": 0.4036, + "step": 94 + }, + { + "epoch": 0.1500789889415482, + "grad_norm": 0.5261964797973633, + "learning_rate": 8.515007898894155e-06, + "loss": 0.33, + "step": 95 + }, + { + "epoch": 0.15165876777251186, + "grad_norm": 0.4350665211677551, + "learning_rate": 8.499210110584519e-06, + "loss": 0.3347, + "step": 96 + }, + { + "epoch": 0.15323854660347552, + "grad_norm": 0.8448456525802612, + "learning_rate": 8.483412322274883e-06, + "loss": 0.4253, + "step": 97 + }, + { + "epoch": 0.15481832543443919, + "grad_norm": 0.6256837248802185, + "learning_rate": 8.467614533965247e-06, + "loss": 0.4464, + "step": 98 + }, + { + "epoch": 0.15639810426540285, + "grad_norm": 0.7007749676704407, + "learning_rate": 8.451816745655609e-06, + "loss": 0.4641, + "step": 99 + }, + { + "epoch": 0.1579778830963665, + "grad_norm": 0.6551494002342224, + "learning_rate": 8.436018957345973e-06, + "loss": 0.5097, + "step": 100 + }, + { + "epoch": 0.15955766192733017, + "grad_norm": 0.5944113731384277, + "learning_rate": 8.420221169036336e-06, + "loss": 0.4554, + "step": 101 + }, + { + "epoch": 0.16113744075829384, + "grad_norm": 0.5755615234375, + "learning_rate": 8.4044233807267e-06, + "loss": 0.443, + "step": 102 + }, + { + "epoch": 0.1627172195892575, + "grad_norm": 0.5263962745666504, + "learning_rate": 8.388625592417062e-06, + "loss": 0.4355, + "step": 103 + }, + { + "epoch": 0.16429699842022116, + "grad_norm": 0.6115814447402954, + "learning_rate": 8.372827804107424e-06, + "loss": 0.4863, + "step": 104 + }, + { + "epoch": 0.16587677725118483, + "grad_norm": 0.5544970631599426, + "learning_rate": 8.35703001579779e-06, + "loss": 0.3979, + "step": 105 + }, + { + "epoch": 0.1674565560821485, + "grad_norm": 0.5588533878326416, + "learning_rate": 8.341232227488152e-06, + "loss": 0.4073, + "step": 106 + }, + { + "epoch": 0.16903633491311215, + "grad_norm": 0.578982949256897, + "learning_rate": 8.325434439178516e-06, + "loss": 0.3745, + "step": 107 + }, + { + "epoch": 0.17061611374407584, + "grad_norm": 0.4955246150493622, + "learning_rate": 8.30963665086888e-06, + "loss": 0.438, + "step": 108 + }, + { + "epoch": 0.1721958925750395, + "grad_norm": 0.593362033367157, + "learning_rate": 8.293838862559243e-06, + "loss": 0.4161, + "step": 109 + }, + { + "epoch": 0.17377567140600317, + "grad_norm": 0.5000883340835571, + "learning_rate": 8.278041074249605e-06, + "loss": 0.432, + "step": 110 + }, + { + "epoch": 0.17535545023696683, + "grad_norm": 0.5794082880020142, + "learning_rate": 8.262243285939969e-06, + "loss": 0.4431, + "step": 111 + }, + { + "epoch": 0.1769352290679305, + "grad_norm": 0.6179563999176025, + "learning_rate": 8.246445497630333e-06, + "loss": 0.3871, + "step": 112 + }, + { + "epoch": 0.17851500789889416, + "grad_norm": 0.6540956497192383, + "learning_rate": 8.230647709320697e-06, + "loss": 0.3706, + "step": 113 + }, + { + "epoch": 0.18009478672985782, + "grad_norm": 0.7029737234115601, + "learning_rate": 8.214849921011059e-06, + "loss": 0.5077, + "step": 114 + }, + { + "epoch": 0.18167456556082148, + "grad_norm": 0.5466600656509399, + "learning_rate": 8.199052132701422e-06, + "loss": 0.4634, + "step": 115 + }, + { + "epoch": 0.18325434439178515, + "grad_norm": 0.5513831973075867, + "learning_rate": 8.183254344391786e-06, + "loss": 0.4457, + "step": 116 + }, + { + "epoch": 0.1848341232227488, + "grad_norm": 0.7652455568313599, + "learning_rate": 8.16745655608215e-06, + "loss": 0.4376, + "step": 117 + }, + { + "epoch": 0.18641390205371247, + "grad_norm": 0.6213077902793884, + "learning_rate": 8.151658767772512e-06, + "loss": 0.3988, + "step": 118 + }, + { + "epoch": 0.18799368088467613, + "grad_norm": 0.50051349401474, + "learning_rate": 8.135860979462876e-06, + "loss": 0.4142, + "step": 119 + }, + { + "epoch": 0.1895734597156398, + "grad_norm": 0.8015328049659729, + "learning_rate": 8.12006319115324e-06, + "loss": 0.4474, + "step": 120 + }, + { + "epoch": 0.1911532385466035, + "grad_norm": 0.6595532298088074, + "learning_rate": 8.104265402843603e-06, + "loss": 0.5173, + "step": 121 + }, + { + "epoch": 0.19273301737756715, + "grad_norm": 0.7859697937965393, + "learning_rate": 8.088467614533966e-06, + "loss": 0.4465, + "step": 122 + }, + { + "epoch": 0.1943127962085308, + "grad_norm": 0.6508023738861084, + "learning_rate": 8.07266982622433e-06, + "loss": 0.4448, + "step": 123 + }, + { + "epoch": 0.19589257503949448, + "grad_norm": 0.49232304096221924, + "learning_rate": 8.056872037914693e-06, + "loss": 0.4005, + "step": 124 + }, + { + "epoch": 0.19747235387045814, + "grad_norm": 0.6464349031448364, + "learning_rate": 8.041074249605057e-06, + "loss": 0.47, + "step": 125 + }, + { + "epoch": 0.1990521327014218, + "grad_norm": 0.5296919345855713, + "learning_rate": 8.025276461295419e-06, + "loss": 0.4247, + "step": 126 + }, + { + "epoch": 0.20063191153238547, + "grad_norm": 0.6270297765731812, + "learning_rate": 8.009478672985783e-06, + "loss": 0.5397, + "step": 127 + }, + { + "epoch": 0.20221169036334913, + "grad_norm": 0.6148909330368042, + "learning_rate": 7.993680884676147e-06, + "loss": 0.4133, + "step": 128 + }, + { + "epoch": 0.2037914691943128, + "grad_norm": 0.7778130173683167, + "learning_rate": 7.977883096366509e-06, + "loss": 0.5119, + "step": 129 + }, + { + "epoch": 0.20537124802527645, + "grad_norm": 0.47952044010162354, + "learning_rate": 7.962085308056872e-06, + "loss": 0.386, + "step": 130 + }, + { + "epoch": 0.20695102685624012, + "grad_norm": 0.5951160788536072, + "learning_rate": 7.946287519747236e-06, + "loss": 0.5101, + "step": 131 + }, + { + "epoch": 0.20853080568720378, + "grad_norm": 0.6209789514541626, + "learning_rate": 7.9304897314376e-06, + "loss": 0.4988, + "step": 132 + }, + { + "epoch": 0.21011058451816747, + "grad_norm": 0.5093654990196228, + "learning_rate": 7.914691943127962e-06, + "loss": 0.374, + "step": 133 + }, + { + "epoch": 0.21169036334913113, + "grad_norm": 0.5125884413719177, + "learning_rate": 7.898894154818326e-06, + "loss": 0.4097, + "step": 134 + }, + { + "epoch": 0.2132701421800948, + "grad_norm": 0.5116066932678223, + "learning_rate": 7.88309636650869e-06, + "loss": 0.4643, + "step": 135 + }, + { + "epoch": 0.21484992101105846, + "grad_norm": 0.5778034329414368, + "learning_rate": 7.867298578199053e-06, + "loss": 0.4645, + "step": 136 + }, + { + "epoch": 0.21642969984202212, + "grad_norm": 0.6490422487258911, + "learning_rate": 7.851500789889415e-06, + "loss": 0.4825, + "step": 137 + }, + { + "epoch": 0.21800947867298578, + "grad_norm": 0.644008219242096, + "learning_rate": 7.83570300157978e-06, + "loss": 0.3954, + "step": 138 + }, + { + "epoch": 0.21958925750394945, + "grad_norm": 0.8628047704696655, + "learning_rate": 7.819905213270143e-06, + "loss": 0.5322, + "step": 139 + }, + { + "epoch": 0.2211690363349131, + "grad_norm": 0.6286507844924927, + "learning_rate": 7.804107424960507e-06, + "loss": 0.3741, + "step": 140 + }, + { + "epoch": 0.22274881516587677, + "grad_norm": 0.6210809350013733, + "learning_rate": 7.788309636650869e-06, + "loss": 0.4572, + "step": 141 + }, + { + "epoch": 0.22432859399684044, + "grad_norm": 0.5337722897529602, + "learning_rate": 7.772511848341233e-06, + "loss": 0.3788, + "step": 142 + }, + { + "epoch": 0.2259083728278041, + "grad_norm": 0.5743194818496704, + "learning_rate": 7.756714060031596e-06, + "loss": 0.3963, + "step": 143 + }, + { + "epoch": 0.22748815165876776, + "grad_norm": 0.4972652792930603, + "learning_rate": 7.74091627172196e-06, + "loss": 0.2906, + "step": 144 + }, + { + "epoch": 0.22906793048973143, + "grad_norm": 0.5239664316177368, + "learning_rate": 7.725118483412322e-06, + "loss": 0.4009, + "step": 145 + }, + { + "epoch": 0.23064770932069512, + "grad_norm": 0.5151936411857605, + "learning_rate": 7.709320695102686e-06, + "loss": 0.4208, + "step": 146 + }, + { + "epoch": 0.23222748815165878, + "grad_norm": 0.6128547191619873, + "learning_rate": 7.69352290679305e-06, + "loss": 0.4779, + "step": 147 + }, + { + "epoch": 0.23380726698262244, + "grad_norm": 0.5268502235412598, + "learning_rate": 7.677725118483414e-06, + "loss": 0.4219, + "step": 148 + }, + { + "epoch": 0.2353870458135861, + "grad_norm": 0.5439866185188293, + "learning_rate": 7.661927330173776e-06, + "loss": 0.4436, + "step": 149 + }, + { + "epoch": 0.23696682464454977, + "grad_norm": 0.5291867852210999, + "learning_rate": 7.64612954186414e-06, + "loss": 0.407, + "step": 150 + }, + { + "epoch": 0.23854660347551343, + "grad_norm": 0.6638155579566956, + "learning_rate": 7.630331753554503e-06, + "loss": 0.403, + "step": 151 + }, + { + "epoch": 0.2401263823064771, + "grad_norm": 0.5501230955123901, + "learning_rate": 7.614533965244867e-06, + "loss": 0.5004, + "step": 152 + }, + { + "epoch": 0.24170616113744076, + "grad_norm": 0.5949499011039734, + "learning_rate": 7.59873617693523e-06, + "loss": 0.4708, + "step": 153 + }, + { + "epoch": 0.24328593996840442, + "grad_norm": 0.5841517448425293, + "learning_rate": 7.582938388625593e-06, + "loss": 0.4836, + "step": 154 + }, + { + "epoch": 0.24486571879936808, + "grad_norm": 0.6298154592514038, + "learning_rate": 7.567140600315957e-06, + "loss": 0.4728, + "step": 155 + }, + { + "epoch": 0.24644549763033174, + "grad_norm": 0.6107637882232666, + "learning_rate": 7.55134281200632e-06, + "loss": 0.4243, + "step": 156 + }, + { + "epoch": 0.2480252764612954, + "grad_norm": 0.5174968838691711, + "learning_rate": 7.535545023696683e-06, + "loss": 0.4657, + "step": 157 + }, + { + "epoch": 0.24960505529225907, + "grad_norm": 0.5588591694831848, + "learning_rate": 7.519747235387046e-06, + "loss": 0.4567, + "step": 158 + }, + { + "epoch": 0.25118483412322273, + "grad_norm": 0.8415222764015198, + "learning_rate": 7.50394944707741e-06, + "loss": 0.4625, + "step": 159 + }, + { + "epoch": 0.2527646129541864, + "grad_norm": 0.6054974794387817, + "learning_rate": 7.488151658767773e-06, + "loss": 0.3843, + "step": 160 + }, + { + "epoch": 0.25434439178515006, + "grad_norm": 0.5117557644844055, + "learning_rate": 7.472353870458137e-06, + "loss": 0.3887, + "step": 161 + }, + { + "epoch": 0.2559241706161137, + "grad_norm": 0.5849332213401794, + "learning_rate": 7.4565560821485e-06, + "loss": 0.4528, + "step": 162 + }, + { + "epoch": 0.2575039494470774, + "grad_norm": 0.5625325441360474, + "learning_rate": 7.4407582938388635e-06, + "loss": 0.4542, + "step": 163 + }, + { + "epoch": 0.25908372827804105, + "grad_norm": 0.5406492352485657, + "learning_rate": 7.4249605055292264e-06, + "loss": 0.4592, + "step": 164 + }, + { + "epoch": 0.26066350710900477, + "grad_norm": 0.6318654417991638, + "learning_rate": 7.40916271721959e-06, + "loss": 0.4361, + "step": 165 + }, + { + "epoch": 0.26224328593996843, + "grad_norm": 0.5719902515411377, + "learning_rate": 7.393364928909953e-06, + "loss": 0.4799, + "step": 166 + }, + { + "epoch": 0.2638230647709321, + "grad_norm": 0.5211177468299866, + "learning_rate": 7.377567140600317e-06, + "loss": 0.33, + "step": 167 + }, + { + "epoch": 0.26540284360189575, + "grad_norm": 0.6400920152664185, + "learning_rate": 7.36176935229068e-06, + "loss": 0.4235, + "step": 168 + }, + { + "epoch": 0.2669826224328594, + "grad_norm": 0.5302186608314514, + "learning_rate": 7.345971563981044e-06, + "loss": 0.4342, + "step": 169 + }, + { + "epoch": 0.2685624012638231, + "grad_norm": 0.5393325686454773, + "learning_rate": 7.3301737756714066e-06, + "loss": 0.3632, + "step": 170 + }, + { + "epoch": 0.27014218009478674, + "grad_norm": 0.5409063696861267, + "learning_rate": 7.31437598736177e-06, + "loss": 0.4076, + "step": 171 + }, + { + "epoch": 0.2717219589257504, + "grad_norm": 0.5056774616241455, + "learning_rate": 7.298578199052133e-06, + "loss": 0.4821, + "step": 172 + }, + { + "epoch": 0.27330173775671407, + "grad_norm": 0.6061700582504272, + "learning_rate": 7.282780410742497e-06, + "loss": 0.5137, + "step": 173 + }, + { + "epoch": 0.27488151658767773, + "grad_norm": 0.5524815917015076, + "learning_rate": 7.26698262243286e-06, + "loss": 0.4116, + "step": 174 + }, + { + "epoch": 0.2764612954186414, + "grad_norm": 0.5045567750930786, + "learning_rate": 7.251184834123224e-06, + "loss": 0.3969, + "step": 175 + }, + { + "epoch": 0.27804107424960506, + "grad_norm": 0.604505717754364, + "learning_rate": 7.235387045813587e-06, + "loss": 0.5176, + "step": 176 + }, + { + "epoch": 0.2796208530805687, + "grad_norm": 0.6067575812339783, + "learning_rate": 7.2195892575039505e-06, + "loss": 0.4438, + "step": 177 + }, + { + "epoch": 0.2812006319115324, + "grad_norm": 0.6412494778633118, + "learning_rate": 7.203791469194313e-06, + "loss": 0.4758, + "step": 178 + }, + { + "epoch": 0.28278041074249605, + "grad_norm": 0.5432886481285095, + "learning_rate": 7.187993680884676e-06, + "loss": 0.4387, + "step": 179 + }, + { + "epoch": 0.2843601895734597, + "grad_norm": 0.4622472822666168, + "learning_rate": 7.17219589257504e-06, + "loss": 0.4775, + "step": 180 + }, + { + "epoch": 0.2859399684044234, + "grad_norm": 0.643259584903717, + "learning_rate": 7.156398104265403e-06, + "loss": 0.4479, + "step": 181 + }, + { + "epoch": 0.28751974723538704, + "grad_norm": 0.48998138308525085, + "learning_rate": 7.140600315955767e-06, + "loss": 0.399, + "step": 182 + }, + { + "epoch": 0.2890995260663507, + "grad_norm": 0.5146614909172058, + "learning_rate": 7.12480252764613e-06, + "loss": 0.4475, + "step": 183 + }, + { + "epoch": 0.29067930489731436, + "grad_norm": 0.5386670231819153, + "learning_rate": 7.1090047393364935e-06, + "loss": 0.3892, + "step": 184 + }, + { + "epoch": 0.292259083728278, + "grad_norm": 0.5147759318351746, + "learning_rate": 7.0932069510268565e-06, + "loss": 0.3755, + "step": 185 + }, + { + "epoch": 0.2938388625592417, + "grad_norm": 0.5141321420669556, + "learning_rate": 7.07740916271722e-06, + "loss": 0.355, + "step": 186 + }, + { + "epoch": 0.29541864139020535, + "grad_norm": 0.9518134593963623, + "learning_rate": 7.061611374407583e-06, + "loss": 0.4021, + "step": 187 + }, + { + "epoch": 0.296998420221169, + "grad_norm": 0.5844981670379639, + "learning_rate": 7.045813586097947e-06, + "loss": 0.4233, + "step": 188 + }, + { + "epoch": 0.2985781990521327, + "grad_norm": 0.6381546854972839, + "learning_rate": 7.03001579778831e-06, + "loss": 0.4862, + "step": 189 + }, + { + "epoch": 0.3001579778830964, + "grad_norm": 0.7311195135116577, + "learning_rate": 7.014218009478674e-06, + "loss": 0.4822, + "step": 190 + }, + { + "epoch": 0.30173775671406006, + "grad_norm": 0.5827596783638, + "learning_rate": 6.998420221169037e-06, + "loss": 0.4027, + "step": 191 + }, + { + "epoch": 0.3033175355450237, + "grad_norm": 0.6907688975334167, + "learning_rate": 6.9826224328594e-06, + "loss": 0.4374, + "step": 192 + }, + { + "epoch": 0.3048973143759874, + "grad_norm": 0.5060120820999146, + "learning_rate": 6.966824644549763e-06, + "loss": 0.4226, + "step": 193 + }, + { + "epoch": 0.30647709320695105, + "grad_norm": 0.41480544209480286, + "learning_rate": 6.951026856240127e-06, + "loss": 0.3766, + "step": 194 + }, + { + "epoch": 0.3080568720379147, + "grad_norm": 0.5637404322624207, + "learning_rate": 6.93522906793049e-06, + "loss": 0.4365, + "step": 195 + }, + { + "epoch": 0.30963665086887837, + "grad_norm": 0.6389409899711609, + "learning_rate": 6.919431279620854e-06, + "loss": 0.4186, + "step": 196 + }, + { + "epoch": 0.31121642969984203, + "grad_norm": 0.48588162660598755, + "learning_rate": 6.903633491311217e-06, + "loss": 0.4023, + "step": 197 + }, + { + "epoch": 0.3127962085308057, + "grad_norm": 0.6066514253616333, + "learning_rate": 6.8878357030015805e-06, + "loss": 0.4652, + "step": 198 + }, + { + "epoch": 0.31437598736176936, + "grad_norm": 0.6308689117431641, + "learning_rate": 6.8720379146919435e-06, + "loss": 0.3885, + "step": 199 + }, + { + "epoch": 0.315955766192733, + "grad_norm": 0.4883437752723694, + "learning_rate": 6.856240126382307e-06, + "loss": 0.4128, + "step": 200 + }, + { + "epoch": 0.3175355450236967, + "grad_norm": 0.720086932182312, + "learning_rate": 6.84044233807267e-06, + "loss": 0.4333, + "step": 201 + }, + { + "epoch": 0.31911532385466035, + "grad_norm": 0.6698761582374573, + "learning_rate": 6.824644549763034e-06, + "loss": 0.3967, + "step": 202 + }, + { + "epoch": 0.320695102685624, + "grad_norm": 0.5240082740783691, + "learning_rate": 6.808846761453397e-06, + "loss": 0.4055, + "step": 203 + }, + { + "epoch": 0.3222748815165877, + "grad_norm": 0.6142946481704712, + "learning_rate": 6.79304897314376e-06, + "loss": 0.3645, + "step": 204 + }, + { + "epoch": 0.32385466034755134, + "grad_norm": 0.6439379453659058, + "learning_rate": 6.777251184834124e-06, + "loss": 0.3207, + "step": 205 + }, + { + "epoch": 0.325434439178515, + "grad_norm": 0.6862720847129822, + "learning_rate": 6.7614533965244865e-06, + "loss": 0.4944, + "step": 206 + }, + { + "epoch": 0.32701421800947866, + "grad_norm": 0.6720433235168457, + "learning_rate": 6.74565560821485e-06, + "loss": 0.4335, + "step": 207 + }, + { + "epoch": 0.3285939968404423, + "grad_norm": 0.531577467918396, + "learning_rate": 6.729857819905213e-06, + "loss": 0.5327, + "step": 208 + }, + { + "epoch": 0.330173775671406, + "grad_norm": 0.5542590022087097, + "learning_rate": 6.714060031595577e-06, + "loss": 0.3629, + "step": 209 + }, + { + "epoch": 0.33175355450236965, + "grad_norm": 0.5614448189735413, + "learning_rate": 6.69826224328594e-06, + "loss": 0.4097, + "step": 210 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.7383466362953186, + "learning_rate": 6.682464454976304e-06, + "loss": 0.5031, + "step": 211 + }, + { + "epoch": 0.334913112164297, + "grad_norm": 0.6345497965812683, + "learning_rate": 6.666666666666667e-06, + "loss": 0.5029, + "step": 212 + }, + { + "epoch": 0.33649289099526064, + "grad_norm": 0.579641580581665, + "learning_rate": 6.6508688783570304e-06, + "loss": 0.4949, + "step": 213 + }, + { + "epoch": 0.3380726698262243, + "grad_norm": 0.5040780305862427, + "learning_rate": 6.635071090047393e-06, + "loss": 0.4537, + "step": 214 + }, + { + "epoch": 0.33965244865718797, + "grad_norm": 0.5917491316795349, + "learning_rate": 6.619273301737757e-06, + "loss": 0.3883, + "step": 215 + }, + { + "epoch": 0.3412322274881517, + "grad_norm": 0.7031399011611938, + "learning_rate": 6.60347551342812e-06, + "loss": 0.4554, + "step": 216 + }, + { + "epoch": 0.34281200631911535, + "grad_norm": 0.5503798127174377, + "learning_rate": 6.587677725118484e-06, + "loss": 0.352, + "step": 217 + }, + { + "epoch": 0.344391785150079, + "grad_norm": 0.5412716269493103, + "learning_rate": 6.571879936808847e-06, + "loss": 0.4191, + "step": 218 + }, + { + "epoch": 0.3459715639810427, + "grad_norm": 0.6272369623184204, + "learning_rate": 6.556082148499211e-06, + "loss": 0.4595, + "step": 219 + }, + { + "epoch": 0.34755134281200634, + "grad_norm": 0.5309504270553589, + "learning_rate": 6.5402843601895735e-06, + "loss": 0.4095, + "step": 220 + }, + { + "epoch": 0.34913112164297, + "grad_norm": 0.5687200427055359, + "learning_rate": 6.524486571879938e-06, + "loss": 0.435, + "step": 221 + }, + { + "epoch": 0.35071090047393366, + "grad_norm": 0.5819438099861145, + "learning_rate": 6.5086887835703e-06, + "loss": 0.4695, + "step": 222 + }, + { + "epoch": 0.3522906793048973, + "grad_norm": 0.6310110092163086, + "learning_rate": 6.492890995260665e-06, + "loss": 0.4346, + "step": 223 + }, + { + "epoch": 0.353870458135861, + "grad_norm": 0.5838906168937683, + "learning_rate": 6.477093206951027e-06, + "loss": 0.47, + "step": 224 + }, + { + "epoch": 0.35545023696682465, + "grad_norm": 0.6752678155899048, + "learning_rate": 6.4612954186413915e-06, + "loss": 0.3842, + "step": 225 + }, + { + "epoch": 0.3570300157977883, + "grad_norm": 0.7029111981391907, + "learning_rate": 6.445497630331754e-06, + "loss": 0.4442, + "step": 226 + }, + { + "epoch": 0.358609794628752, + "grad_norm": 0.511812686920166, + "learning_rate": 6.429699842022118e-06, + "loss": 0.5171, + "step": 227 + }, + { + "epoch": 0.36018957345971564, + "grad_norm": 0.49457868933677673, + "learning_rate": 6.413902053712481e-06, + "loss": 0.3695, + "step": 228 + }, + { + "epoch": 0.3617693522906793, + "grad_norm": 0.4521022439002991, + "learning_rate": 6.398104265402843e-06, + "loss": 0.3909, + "step": 229 + }, + { + "epoch": 0.36334913112164297, + "grad_norm": 0.45229026675224304, + "learning_rate": 6.382306477093208e-06, + "loss": 0.3417, + "step": 230 + }, + { + "epoch": 0.36492890995260663, + "grad_norm": 0.5070056915283203, + "learning_rate": 6.36650868878357e-06, + "loss": 0.3518, + "step": 231 + }, + { + "epoch": 0.3665086887835703, + "grad_norm": 0.9325531721115112, + "learning_rate": 6.350710900473935e-06, + "loss": 0.5172, + "step": 232 + }, + { + "epoch": 0.36808846761453395, + "grad_norm": 0.6027977466583252, + "learning_rate": 6.334913112164297e-06, + "loss": 0.4052, + "step": 233 + }, + { + "epoch": 0.3696682464454976, + "grad_norm": 0.7251097559928894, + "learning_rate": 6.319115323854661e-06, + "loss": 0.4739, + "step": 234 + }, + { + "epoch": 0.3712480252764613, + "grad_norm": 0.6470052003860474, + "learning_rate": 6.303317535545023e-06, + "loss": 0.4745, + "step": 235 + }, + { + "epoch": 0.37282780410742494, + "grad_norm": 0.7177411317825317, + "learning_rate": 6.287519747235388e-06, + "loss": 0.364, + "step": 236 + }, + { + "epoch": 0.3744075829383886, + "grad_norm": 0.7681677341461182, + "learning_rate": 6.271721958925751e-06, + "loss": 0.4559, + "step": 237 + }, + { + "epoch": 0.37598736176935227, + "grad_norm": 0.6160128116607666, + "learning_rate": 6.255924170616115e-06, + "loss": 0.421, + "step": 238 + }, + { + "epoch": 0.37756714060031593, + "grad_norm": 0.658981442451477, + "learning_rate": 6.240126382306478e-06, + "loss": 0.3979, + "step": 239 + }, + { + "epoch": 0.3791469194312796, + "grad_norm": 0.9422373175621033, + "learning_rate": 6.2243285939968414e-06, + "loss": 0.3586, + "step": 240 + }, + { + "epoch": 0.3807266982622433, + "grad_norm": 0.5452501773834229, + "learning_rate": 6.208530805687204e-06, + "loss": 0.4209, + "step": 241 + }, + { + "epoch": 0.382306477093207, + "grad_norm": 0.4912925660610199, + "learning_rate": 6.192733017377568e-06, + "loss": 0.4784, + "step": 242 + }, + { + "epoch": 0.38388625592417064, + "grad_norm": 0.6575455665588379, + "learning_rate": 6.176935229067931e-06, + "loss": 0.4062, + "step": 243 + }, + { + "epoch": 0.3854660347551343, + "grad_norm": 0.8840091824531555, + "learning_rate": 6.161137440758295e-06, + "loss": 0.4177, + "step": 244 + }, + { + "epoch": 0.38704581358609796, + "grad_norm": 0.5949338674545288, + "learning_rate": 6.145339652448658e-06, + "loss": 0.4477, + "step": 245 + }, + { + "epoch": 0.3886255924170616, + "grad_norm": 0.5938326120376587, + "learning_rate": 6.1295418641390216e-06, + "loss": 0.4155, + "step": 246 + }, + { + "epoch": 0.3902053712480253, + "grad_norm": 0.5401394367218018, + "learning_rate": 6.1137440758293845e-06, + "loss": 0.3873, + "step": 247 + }, + { + "epoch": 0.39178515007898895, + "grad_norm": 0.5220497846603394, + "learning_rate": 6.097946287519748e-06, + "loss": 0.3803, + "step": 248 + }, + { + "epoch": 0.3933649289099526, + "grad_norm": 0.5426644086837769, + "learning_rate": 6.082148499210111e-06, + "loss": 0.3239, + "step": 249 + }, + { + "epoch": 0.3949447077409163, + "grad_norm": 0.5215898156166077, + "learning_rate": 6.066350710900475e-06, + "loss": 0.4373, + "step": 250 + }, + { + "epoch": 0.39652448657187994, + "grad_norm": 0.5694135427474976, + "learning_rate": 6.050552922590838e-06, + "loss": 0.4948, + "step": 251 + }, + { + "epoch": 0.3981042654028436, + "grad_norm": 0.5505183339118958, + "learning_rate": 6.034755134281202e-06, + "loss": 0.4108, + "step": 252 + }, + { + "epoch": 0.39968404423380727, + "grad_norm": 0.593190610408783, + "learning_rate": 6.018957345971565e-06, + "loss": 0.429, + "step": 253 + }, + { + "epoch": 0.40126382306477093, + "grad_norm": 0.5409046411514282, + "learning_rate": 6.003159557661928e-06, + "loss": 0.4443, + "step": 254 + }, + { + "epoch": 0.4028436018957346, + "grad_norm": 0.5520291328430176, + "learning_rate": 5.987361769352291e-06, + "loss": 0.4485, + "step": 255 + }, + { + "epoch": 0.40442338072669826, + "grad_norm": 0.5622429847717285, + "learning_rate": 5.971563981042654e-06, + "loss": 0.4181, + "step": 256 + }, + { + "epoch": 0.4060031595576619, + "grad_norm": 0.5267983078956604, + "learning_rate": 5.955766192733018e-06, + "loss": 0.4235, + "step": 257 + }, + { + "epoch": 0.4075829383886256, + "grad_norm": 0.5384082198143005, + "learning_rate": 5.939968404423381e-06, + "loss": 0.4055, + "step": 258 + }, + { + "epoch": 0.40916271721958924, + "grad_norm": 0.5427289605140686, + "learning_rate": 5.924170616113745e-06, + "loss": 0.3427, + "step": 259 + }, + { + "epoch": 0.4107424960505529, + "grad_norm": 0.4936423599720001, + "learning_rate": 5.908372827804108e-06, + "loss": 0.4133, + "step": 260 + }, + { + "epoch": 0.41232227488151657, + "grad_norm": 0.5825520753860474, + "learning_rate": 5.8925750394944715e-06, + "loss": 0.377, + "step": 261 + }, + { + "epoch": 0.41390205371248023, + "grad_norm": 0.6343340277671814, + "learning_rate": 5.876777251184834e-06, + "loss": 0.441, + "step": 262 + }, + { + "epoch": 0.4154818325434439, + "grad_norm": 0.5479387044906616, + "learning_rate": 5.860979462875198e-06, + "loss": 0.4353, + "step": 263 + }, + { + "epoch": 0.41706161137440756, + "grad_norm": 0.5873805284500122, + "learning_rate": 5.845181674565561e-06, + "loss": 0.4293, + "step": 264 + }, + { + "epoch": 0.4186413902053712, + "grad_norm": 0.6624792218208313, + "learning_rate": 5.829383886255925e-06, + "loss": 0.5162, + "step": 265 + }, + { + "epoch": 0.42022116903633494, + "grad_norm": 0.5797149538993835, + "learning_rate": 5.813586097946288e-06, + "loss": 0.3651, + "step": 266 + }, + { + "epoch": 0.4218009478672986, + "grad_norm": 0.5814763903617859, + "learning_rate": 5.797788309636652e-06, + "loss": 0.3817, + "step": 267 + }, + { + "epoch": 0.42338072669826227, + "grad_norm": 0.5556735992431641, + "learning_rate": 5.7819905213270145e-06, + "loss": 0.4186, + "step": 268 + }, + { + "epoch": 0.42496050552922593, + "grad_norm": 0.5842727422714233, + "learning_rate": 5.766192733017378e-06, + "loss": 0.4343, + "step": 269 + }, + { + "epoch": 0.4265402843601896, + "grad_norm": 0.5401722192764282, + "learning_rate": 5.750394944707741e-06, + "loss": 0.4418, + "step": 270 + }, + { + "epoch": 0.42812006319115326, + "grad_norm": 0.5917039513587952, + "learning_rate": 5.734597156398105e-06, + "loss": 0.5371, + "step": 271 + }, + { + "epoch": 0.4296998420221169, + "grad_norm": 0.5991331338882446, + "learning_rate": 5.718799368088468e-06, + "loss": 0.4969, + "step": 272 + }, + { + "epoch": 0.4312796208530806, + "grad_norm": 0.4709448218345642, + "learning_rate": 5.703001579778832e-06, + "loss": 0.4139, + "step": 273 + }, + { + "epoch": 0.43285939968404424, + "grad_norm": 0.5746496319770813, + "learning_rate": 5.687203791469195e-06, + "loss": 0.4683, + "step": 274 + }, + { + "epoch": 0.4344391785150079, + "grad_norm": 0.523835301399231, + "learning_rate": 5.6714060031595584e-06, + "loss": 0.4346, + "step": 275 + }, + { + "epoch": 0.43601895734597157, + "grad_norm": 0.5292810797691345, + "learning_rate": 5.655608214849921e-06, + "loss": 0.463, + "step": 276 + }, + { + "epoch": 0.43759873617693523, + "grad_norm": 0.6543466448783875, + "learning_rate": 5.639810426540285e-06, + "loss": 0.427, + "step": 277 + }, + { + "epoch": 0.4391785150078989, + "grad_norm": 0.5543989539146423, + "learning_rate": 5.624012638230648e-06, + "loss": 0.3902, + "step": 278 + }, + { + "epoch": 0.44075829383886256, + "grad_norm": 0.5905360579490662, + "learning_rate": 5.608214849921012e-06, + "loss": 0.4266, + "step": 279 + }, + { + "epoch": 0.4423380726698262, + "grad_norm": 0.5785796046257019, + "learning_rate": 5.592417061611375e-06, + "loss": 0.4521, + "step": 280 + }, + { + "epoch": 0.4439178515007899, + "grad_norm": 0.5580607056617737, + "learning_rate": 5.576619273301738e-06, + "loss": 0.378, + "step": 281 + }, + { + "epoch": 0.44549763033175355, + "grad_norm": 0.5100966691970825, + "learning_rate": 5.5608214849921015e-06, + "loss": 0.3876, + "step": 282 + }, + { + "epoch": 0.4470774091627172, + "grad_norm": 0.5704023241996765, + "learning_rate": 5.5450236966824644e-06, + "loss": 0.4694, + "step": 283 + }, + { + "epoch": 0.4486571879936809, + "grad_norm": 0.5954383611679077, + "learning_rate": 5.529225908372828e-06, + "loss": 0.5049, + "step": 284 + }, + { + "epoch": 0.45023696682464454, + "grad_norm": 0.5239635705947876, + "learning_rate": 5.513428120063191e-06, + "loss": 0.4182, + "step": 285 + }, + { + "epoch": 0.4518167456556082, + "grad_norm": 0.6643552780151367, + "learning_rate": 5.497630331753555e-06, + "loss": 0.4434, + "step": 286 + }, + { + "epoch": 0.45339652448657186, + "grad_norm": 0.6675540804862976, + "learning_rate": 5.481832543443918e-06, + "loss": 0.3745, + "step": 287 + }, + { + "epoch": 0.4549763033175355, + "grad_norm": 0.5871401429176331, + "learning_rate": 5.466034755134282e-06, + "loss": 0.5527, + "step": 288 + }, + { + "epoch": 0.4565560821484992, + "grad_norm": 0.5936838984489441, + "learning_rate": 5.4502369668246446e-06, + "loss": 0.4857, + "step": 289 + }, + { + "epoch": 0.45813586097946285, + "grad_norm": 0.5998191833496094, + "learning_rate": 5.434439178515008e-06, + "loss": 0.4395, + "step": 290 + }, + { + "epoch": 0.4597156398104265, + "grad_norm": 0.5102293491363525, + "learning_rate": 5.418641390205371e-06, + "loss": 0.4496, + "step": 291 + }, + { + "epoch": 0.46129541864139023, + "grad_norm": 0.6297216415405273, + "learning_rate": 5.402843601895735e-06, + "loss": 0.3555, + "step": 292 + }, + { + "epoch": 0.4628751974723539, + "grad_norm": 0.6780267953872681, + "learning_rate": 5.387045813586098e-06, + "loss": 0.3295, + "step": 293 + }, + { + "epoch": 0.46445497630331756, + "grad_norm": 0.5788872838020325, + "learning_rate": 5.371248025276462e-06, + "loss": 0.4293, + "step": 294 + }, + { + "epoch": 0.4660347551342812, + "grad_norm": 0.5679113268852234, + "learning_rate": 5.355450236966825e-06, + "loss": 0.4274, + "step": 295 + }, + { + "epoch": 0.4676145339652449, + "grad_norm": 0.5739018321037292, + "learning_rate": 5.3396524486571885e-06, + "loss": 0.3292, + "step": 296 + }, + { + "epoch": 0.46919431279620855, + "grad_norm": 0.5387299060821533, + "learning_rate": 5.323854660347551e-06, + "loss": 0.36, + "step": 297 + }, + { + "epoch": 0.4707740916271722, + "grad_norm": 0.4877624213695526, + "learning_rate": 5.308056872037915e-06, + "loss": 0.403, + "step": 298 + }, + { + "epoch": 0.47235387045813587, + "grad_norm": 0.5668107271194458, + "learning_rate": 5.292259083728278e-06, + "loss": 0.4087, + "step": 299 + }, + { + "epoch": 0.47393364928909953, + "grad_norm": 0.5592719316482544, + "learning_rate": 5.276461295418642e-06, + "loss": 0.405, + "step": 300 + }, + { + "epoch": 0.4755134281200632, + "grad_norm": 0.48879534006118774, + "learning_rate": 5.260663507109005e-06, + "loss": 0.3562, + "step": 301 + }, + { + "epoch": 0.47709320695102686, + "grad_norm": 0.5968641042709351, + "learning_rate": 5.244865718799369e-06, + "loss": 0.4216, + "step": 302 + }, + { + "epoch": 0.4786729857819905, + "grad_norm": 0.7803828120231628, + "learning_rate": 5.2290679304897315e-06, + "loss": 0.4014, + "step": 303 + }, + { + "epoch": 0.4802527646129542, + "grad_norm": 0.592827558517456, + "learning_rate": 5.213270142180096e-06, + "loss": 0.2895, + "step": 304 + }, + { + "epoch": 0.48183254344391785, + "grad_norm": 0.8070396184921265, + "learning_rate": 5.197472353870458e-06, + "loss": 0.3972, + "step": 305 + }, + { + "epoch": 0.4834123222748815, + "grad_norm": 0.5256397724151611, + "learning_rate": 5.181674565560821e-06, + "loss": 0.4384, + "step": 306 + }, + { + "epoch": 0.4849921011058452, + "grad_norm": 0.5307562947273254, + "learning_rate": 5.165876777251185e-06, + "loss": 0.3788, + "step": 307 + }, + { + "epoch": 0.48657187993680884, + "grad_norm": 0.4588807225227356, + "learning_rate": 5.150078988941548e-06, + "loss": 0.3491, + "step": 308 + }, + { + "epoch": 0.4881516587677725, + "grad_norm": 0.524919331073761, + "learning_rate": 5.134281200631912e-06, + "loss": 0.4375, + "step": 309 + }, + { + "epoch": 0.48973143759873616, + "grad_norm": 0.6611966490745544, + "learning_rate": 5.118483412322275e-06, + "loss": 0.4399, + "step": 310 + }, + { + "epoch": 0.4913112164296998, + "grad_norm": 0.5597748160362244, + "learning_rate": 5.102685624012638e-06, + "loss": 0.5073, + "step": 311 + }, + { + "epoch": 0.4928909952606635, + "grad_norm": 0.8958181738853455, + "learning_rate": 5.086887835703001e-06, + "loss": 0.4756, + "step": 312 + }, + { + "epoch": 0.49447077409162715, + "grad_norm": 0.4875742197036743, + "learning_rate": 5.071090047393366e-06, + "loss": 0.4424, + "step": 313 + }, + { + "epoch": 0.4960505529225908, + "grad_norm": 0.6110445261001587, + "learning_rate": 5.055292259083728e-06, + "loss": 0.4686, + "step": 314 + }, + { + "epoch": 0.4976303317535545, + "grad_norm": 0.5900540351867676, + "learning_rate": 5.039494470774093e-06, + "loss": 0.4, + "step": 315 + }, + { + "epoch": 0.49921011058451814, + "grad_norm": 0.624906599521637, + "learning_rate": 5.023696682464455e-06, + "loss": 0.3967, + "step": 316 + }, + { + "epoch": 0.5007898894154819, + "grad_norm": 0.6435191631317139, + "learning_rate": 5.007898894154819e-06, + "loss": 0.5104, + "step": 317 + }, + { + "epoch": 0.5023696682464455, + "grad_norm": 0.7464382648468018, + "learning_rate": 4.9921011058451815e-06, + "loss": 0.4621, + "step": 318 + }, + { + "epoch": 0.5039494470774092, + "grad_norm": 0.7912509441375732, + "learning_rate": 4.976303317535545e-06, + "loss": 0.4186, + "step": 319 + }, + { + "epoch": 0.5055292259083728, + "grad_norm": 0.6150445938110352, + "learning_rate": 4.960505529225908e-06, + "loss": 0.469, + "step": 320 + }, + { + "epoch": 0.5071090047393365, + "grad_norm": 0.5445781946182251, + "learning_rate": 4.944707740916272e-06, + "loss": 0.4111, + "step": 321 + }, + { + "epoch": 0.5086887835703001, + "grad_norm": 0.5628255605697632, + "learning_rate": 4.928909952606635e-06, + "loss": 0.4884, + "step": 322 + }, + { + "epoch": 0.5102685624012638, + "grad_norm": 0.5007054805755615, + "learning_rate": 4.913112164296999e-06, + "loss": 0.4315, + "step": 323 + }, + { + "epoch": 0.5118483412322274, + "grad_norm": 0.6346699595451355, + "learning_rate": 4.8973143759873624e-06, + "loss": 0.4033, + "step": 324 + }, + { + "epoch": 0.5134281200631912, + "grad_norm": 0.639045774936676, + "learning_rate": 4.881516587677725e-06, + "loss": 0.3748, + "step": 325 + }, + { + "epoch": 0.5150078988941548, + "grad_norm": 0.5578002333641052, + "learning_rate": 4.865718799368089e-06, + "loss": 0.5055, + "step": 326 + }, + { + "epoch": 0.5165876777251185, + "grad_norm": 0.5281325578689575, + "learning_rate": 4.849921011058452e-06, + "loss": 0.4307, + "step": 327 + }, + { + "epoch": 0.5181674565560821, + "grad_norm": 0.6557057499885559, + "learning_rate": 4.834123222748816e-06, + "loss": 0.4085, + "step": 328 + }, + { + "epoch": 0.5197472353870458, + "grad_norm": 0.5667731761932373, + "learning_rate": 4.818325434439179e-06, + "loss": 0.4774, + "step": 329 + }, + { + "epoch": 0.5213270142180095, + "grad_norm": 0.5362856984138489, + "learning_rate": 4.8025276461295426e-06, + "loss": 0.4316, + "step": 330 + }, + { + "epoch": 0.5229067930489731, + "grad_norm": 0.5326763391494751, + "learning_rate": 4.7867298578199055e-06, + "loss": 0.389, + "step": 331 + }, + { + "epoch": 0.5244865718799369, + "grad_norm": 0.4922950565814972, + "learning_rate": 4.770932069510269e-06, + "loss": 0.3756, + "step": 332 + }, + { + "epoch": 0.5260663507109005, + "grad_norm": 0.4961477518081665, + "learning_rate": 4.755134281200632e-06, + "loss": 0.4336, + "step": 333 + }, + { + "epoch": 0.5276461295418642, + "grad_norm": 0.5258511304855347, + "learning_rate": 4.739336492890996e-06, + "loss": 0.404, + "step": 334 + }, + { + "epoch": 0.5292259083728278, + "grad_norm": 0.5479301810264587, + "learning_rate": 4.723538704581359e-06, + "loss": 0.3578, + "step": 335 + }, + { + "epoch": 0.5308056872037915, + "grad_norm": 0.49883902072906494, + "learning_rate": 4.707740916271723e-06, + "loss": 0.3809, + "step": 336 + }, + { + "epoch": 0.5323854660347551, + "grad_norm": 0.5133053660392761, + "learning_rate": 4.691943127962086e-06, + "loss": 0.4091, + "step": 337 + }, + { + "epoch": 0.5339652448657188, + "grad_norm": 0.6334301829338074, + "learning_rate": 4.676145339652449e-06, + "loss": 0.4432, + "step": 338 + }, + { + "epoch": 0.5355450236966824, + "grad_norm": 0.5124396085739136, + "learning_rate": 4.660347551342812e-06, + "loss": 0.3557, + "step": 339 + }, + { + "epoch": 0.5371248025276462, + "grad_norm": 0.5863746404647827, + "learning_rate": 4.644549763033176e-06, + "loss": 0.4288, + "step": 340 + }, + { + "epoch": 0.5387045813586098, + "grad_norm": 0.6599943041801453, + "learning_rate": 4.628751974723539e-06, + "loss": 0.398, + "step": 341 + }, + { + "epoch": 0.5402843601895735, + "grad_norm": 0.480027437210083, + "learning_rate": 4.612954186413903e-06, + "loss": 0.4706, + "step": 342 + }, + { + "epoch": 0.5418641390205371, + "grad_norm": 0.6601845026016235, + "learning_rate": 4.597156398104266e-06, + "loss": 0.4092, + "step": 343 + }, + { + "epoch": 0.5434439178515008, + "grad_norm": 0.5557224154472351, + "learning_rate": 4.581358609794629e-06, + "loss": 0.389, + "step": 344 + }, + { + "epoch": 0.5450236966824644, + "grad_norm": 0.49160709977149963, + "learning_rate": 4.5655608214849925e-06, + "loss": 0.4338, + "step": 345 + }, + { + "epoch": 0.5466034755134281, + "grad_norm": 0.5284649133682251, + "learning_rate": 4.549763033175355e-06, + "loss": 0.403, + "step": 346 + }, + { + "epoch": 0.5481832543443917, + "grad_norm": 0.5501908659934998, + "learning_rate": 4.533965244865719e-06, + "loss": 0.4983, + "step": 347 + }, + { + "epoch": 0.5497630331753555, + "grad_norm": 0.5585077404975891, + "learning_rate": 4.518167456556082e-06, + "loss": 0.4219, + "step": 348 + }, + { + "epoch": 0.5513428120063191, + "grad_norm": 0.4565962255001068, + "learning_rate": 4.502369668246446e-06, + "loss": 0.3591, + "step": 349 + }, + { + "epoch": 0.5529225908372828, + "grad_norm": 0.5507949590682983, + "learning_rate": 4.486571879936809e-06, + "loss": 0.4752, + "step": 350 + }, + { + "epoch": 0.5545023696682464, + "grad_norm": 0.5490357875823975, + "learning_rate": 4.470774091627173e-06, + "loss": 0.4291, + "step": 351 + }, + { + "epoch": 0.5560821484992101, + "grad_norm": 0.5804268717765808, + "learning_rate": 4.4549763033175355e-06, + "loss": 0.3113, + "step": 352 + }, + { + "epoch": 0.5576619273301737, + "grad_norm": 0.4745613634586334, + "learning_rate": 4.439178515007899e-06, + "loss": 0.4196, + "step": 353 + }, + { + "epoch": 0.5592417061611374, + "grad_norm": 0.6223664283752441, + "learning_rate": 4.423380726698262e-06, + "loss": 0.4592, + "step": 354 + }, + { + "epoch": 0.5608214849921012, + "grad_norm": 0.8797832727432251, + "learning_rate": 4.407582938388626e-06, + "loss": 0.4448, + "step": 355 + }, + { + "epoch": 0.5624012638230648, + "grad_norm": 0.5569826364517212, + "learning_rate": 4.391785150078989e-06, + "loss": 0.3873, + "step": 356 + }, + { + "epoch": 0.5639810426540285, + "grad_norm": 0.4294510781764984, + "learning_rate": 4.375987361769353e-06, + "loss": 0.3407, + "step": 357 + }, + { + "epoch": 0.5655608214849921, + "grad_norm": 0.5657434463500977, + "learning_rate": 4.360189573459716e-06, + "loss": 0.3345, + "step": 358 + }, + { + "epoch": 0.5671406003159558, + "grad_norm": 0.5589077472686768, + "learning_rate": 4.3443917851500794e-06, + "loss": 0.5237, + "step": 359 + }, + { + "epoch": 0.5687203791469194, + "grad_norm": 0.6107128858566284, + "learning_rate": 4.328593996840442e-06, + "loss": 0.4354, + "step": 360 + }, + { + "epoch": 0.5703001579778831, + "grad_norm": 0.5671380758285522, + "learning_rate": 4.312796208530806e-06, + "loss": 0.3712, + "step": 361 + }, + { + "epoch": 0.5718799368088467, + "grad_norm": 0.508173406124115, + "learning_rate": 4.29699842022117e-06, + "loss": 0.4097, + "step": 362 + }, + { + "epoch": 0.5734597156398105, + "grad_norm": 0.6139382719993591, + "learning_rate": 4.281200631911533e-06, + "loss": 0.2646, + "step": 363 + }, + { + "epoch": 0.5750394944707741, + "grad_norm": 0.5677220821380615, + "learning_rate": 4.265402843601897e-06, + "loss": 0.3748, + "step": 364 + }, + { + "epoch": 0.5766192733017378, + "grad_norm": 0.530708372592926, + "learning_rate": 4.2496050552922596e-06, + "loss": 0.3857, + "step": 365 + }, + { + "epoch": 0.5781990521327014, + "grad_norm": 1.176272988319397, + "learning_rate": 4.233807266982623e-06, + "loss": 0.436, + "step": 366 + }, + { + "epoch": 0.5797788309636651, + "grad_norm": 0.6165753602981567, + "learning_rate": 4.218009478672986e-06, + "loss": 0.3898, + "step": 367 + }, + { + "epoch": 0.5813586097946287, + "grad_norm": 0.47574201226234436, + "learning_rate": 4.20221169036335e-06, + "loss": 0.3685, + "step": 368 + }, + { + "epoch": 0.5829383886255924, + "grad_norm": 0.5995083451271057, + "learning_rate": 4.186413902053712e-06, + "loss": 0.4686, + "step": 369 + }, + { + "epoch": 0.584518167456556, + "grad_norm": 0.5809090733528137, + "learning_rate": 4.170616113744076e-06, + "loss": 0.4514, + "step": 370 + }, + { + "epoch": 0.5860979462875198, + "grad_norm": 0.6154018044471741, + "learning_rate": 4.15481832543444e-06, + "loss": 0.3737, + "step": 371 + }, + { + "epoch": 0.5876777251184834, + "grad_norm": 0.5799654126167297, + "learning_rate": 4.139020537124803e-06, + "loss": 0.4285, + "step": 372 + }, + { + "epoch": 0.5892575039494471, + "grad_norm": 0.4476354420185089, + "learning_rate": 4.123222748815166e-06, + "loss": 0.4362, + "step": 373 + }, + { + "epoch": 0.5908372827804107, + "grad_norm": 0.6266714334487915, + "learning_rate": 4.107424960505529e-06, + "loss": 0.4943, + "step": 374 + }, + { + "epoch": 0.5924170616113744, + "grad_norm": 0.5103732347488403, + "learning_rate": 4.091627172195893e-06, + "loss": 0.4585, + "step": 375 + }, + { + "epoch": 0.593996840442338, + "grad_norm": 0.49011877179145813, + "learning_rate": 4.075829383886256e-06, + "loss": 0.4489, + "step": 376 + }, + { + "epoch": 0.5955766192733017, + "grad_norm": 0.5286844372749329, + "learning_rate": 4.06003159557662e-06, + "loss": 0.4114, + "step": 377 + }, + { + "epoch": 0.5971563981042654, + "grad_norm": 0.494807630777359, + "learning_rate": 4.044233807266983e-06, + "loss": 0.3514, + "step": 378 + }, + { + "epoch": 0.5987361769352291, + "grad_norm": 0.46120524406433105, + "learning_rate": 4.0284360189573465e-06, + "loss": 0.4452, + "step": 379 + }, + { + "epoch": 0.6003159557661928, + "grad_norm": 0.6024404764175415, + "learning_rate": 4.0126382306477095e-06, + "loss": 0.4368, + "step": 380 + }, + { + "epoch": 0.6018957345971564, + "grad_norm": 0.8292664885520935, + "learning_rate": 3.996840442338073e-06, + "loss": 0.4495, + "step": 381 + }, + { + "epoch": 0.6034755134281201, + "grad_norm": 0.5312369465827942, + "learning_rate": 3.981042654028436e-06, + "loss": 0.3642, + "step": 382 + }, + { + "epoch": 0.6050552922590837, + "grad_norm": 0.6373758316040039, + "learning_rate": 3.9652448657188e-06, + "loss": 0.3884, + "step": 383 + }, + { + "epoch": 0.6066350710900474, + "grad_norm": 0.5623313188552856, + "learning_rate": 3.949447077409163e-06, + "loss": 0.3489, + "step": 384 + }, + { + "epoch": 0.608214849921011, + "grad_norm": 0.5703821778297424, + "learning_rate": 3.933649289099527e-06, + "loss": 0.5309, + "step": 385 + }, + { + "epoch": 0.6097946287519748, + "grad_norm": 0.5930938720703125, + "learning_rate": 3.91785150078989e-06, + "loss": 0.4072, + "step": 386 + }, + { + "epoch": 0.6113744075829384, + "grad_norm": 0.5636332631111145, + "learning_rate": 3.902053712480253e-06, + "loss": 0.3938, + "step": 387 + }, + { + "epoch": 0.6129541864139021, + "grad_norm": 0.45709583163261414, + "learning_rate": 3.886255924170616e-06, + "loss": 0.4436, + "step": 388 + }, + { + "epoch": 0.6145339652448657, + "grad_norm": 0.5924400687217712, + "learning_rate": 3.87045813586098e-06, + "loss": 0.2939, + "step": 389 + }, + { + "epoch": 0.6161137440758294, + "grad_norm": 0.6232696175575256, + "learning_rate": 3.854660347551343e-06, + "loss": 0.4183, + "step": 390 + }, + { + "epoch": 0.617693522906793, + "grad_norm": 0.5407995581626892, + "learning_rate": 3.838862559241707e-06, + "loss": 0.3925, + "step": 391 + }, + { + "epoch": 0.6192733017377567, + "grad_norm": 0.524691104888916, + "learning_rate": 3.82306477093207e-06, + "loss": 0.4327, + "step": 392 + }, + { + "epoch": 0.6208530805687204, + "grad_norm": 0.5206206440925598, + "learning_rate": 3.8072669826224335e-06, + "loss": 0.4203, + "step": 393 + }, + { + "epoch": 0.6224328593996841, + "grad_norm": 0.6244251132011414, + "learning_rate": 3.7914691943127964e-06, + "loss": 0.4546, + "step": 394 + }, + { + "epoch": 0.6240126382306477, + "grad_norm": 0.707058846950531, + "learning_rate": 3.77567140600316e-06, + "loss": 0.4015, + "step": 395 + }, + { + "epoch": 0.6255924170616114, + "grad_norm": 0.5457757115364075, + "learning_rate": 3.759873617693523e-06, + "loss": 0.3962, + "step": 396 + }, + { + "epoch": 0.627172195892575, + "grad_norm": 0.5757611989974976, + "learning_rate": 3.7440758293838865e-06, + "loss": 0.4299, + "step": 397 + }, + { + "epoch": 0.6287519747235387, + "grad_norm": 0.5844476819038391, + "learning_rate": 3.72827804107425e-06, + "loss": 0.4674, + "step": 398 + }, + { + "epoch": 0.6303317535545023, + "grad_norm": 0.6859634518623352, + "learning_rate": 3.7124802527646132e-06, + "loss": 0.4253, + "step": 399 + }, + { + "epoch": 0.631911532385466, + "grad_norm": 0.5247636437416077, + "learning_rate": 3.6966824644549766e-06, + "loss": 0.4318, + "step": 400 + }, + { + "epoch": 0.6334913112164297, + "grad_norm": 0.6206024885177612, + "learning_rate": 3.68088467614534e-06, + "loss": 0.3759, + "step": 401 + }, + { + "epoch": 0.6350710900473934, + "grad_norm": 0.6237459182739258, + "learning_rate": 3.6650868878357033e-06, + "loss": 0.3642, + "step": 402 + }, + { + "epoch": 0.636650868878357, + "grad_norm": 0.8048799633979797, + "learning_rate": 3.6492890995260666e-06, + "loss": 0.514, + "step": 403 + }, + { + "epoch": 0.6382306477093207, + "grad_norm": 0.4662720561027527, + "learning_rate": 3.63349131121643e-06, + "loss": 0.3654, + "step": 404 + }, + { + "epoch": 0.6398104265402843, + "grad_norm": 0.5561702251434326, + "learning_rate": 3.6176935229067934e-06, + "loss": 0.3823, + "step": 405 + }, + { + "epoch": 0.641390205371248, + "grad_norm": 0.6143206357955933, + "learning_rate": 3.6018957345971567e-06, + "loss": 0.3938, + "step": 406 + }, + { + "epoch": 0.6429699842022117, + "grad_norm": 0.6854034662246704, + "learning_rate": 3.58609794628752e-06, + "loss": 0.4625, + "step": 407 + }, + { + "epoch": 0.6445497630331753, + "grad_norm": 0.5590549111366272, + "learning_rate": 3.5703001579778834e-06, + "loss": 0.4199, + "step": 408 + }, + { + "epoch": 0.6461295418641391, + "grad_norm": 0.642573356628418, + "learning_rate": 3.5545023696682468e-06, + "loss": 0.4366, + "step": 409 + }, + { + "epoch": 0.6477093206951027, + "grad_norm": 0.5898130536079407, + "learning_rate": 3.53870458135861e-06, + "loss": 0.4691, + "step": 410 + }, + { + "epoch": 0.6492890995260664, + "grad_norm": 0.5370688438415527, + "learning_rate": 3.5229067930489735e-06, + "loss": 0.45, + "step": 411 + }, + { + "epoch": 0.65086887835703, + "grad_norm": 0.6769170165061951, + "learning_rate": 3.507109004739337e-06, + "loss": 0.3962, + "step": 412 + }, + { + "epoch": 0.6524486571879937, + "grad_norm": 0.5891703367233276, + "learning_rate": 3.4913112164297e-06, + "loss": 0.4542, + "step": 413 + }, + { + "epoch": 0.6540284360189573, + "grad_norm": 0.42204615473747253, + "learning_rate": 3.4755134281200636e-06, + "loss": 0.3368, + "step": 414 + }, + { + "epoch": 0.655608214849921, + "grad_norm": 0.46033787727355957, + "learning_rate": 3.459715639810427e-06, + "loss": 0.4357, + "step": 415 + }, + { + "epoch": 0.6571879936808847, + "grad_norm": 0.5509577393531799, + "learning_rate": 3.4439178515007903e-06, + "loss": 0.3939, + "step": 416 + }, + { + "epoch": 0.6587677725118484, + "grad_norm": 0.5802867412567139, + "learning_rate": 3.4281200631911536e-06, + "loss": 0.4073, + "step": 417 + }, + { + "epoch": 0.660347551342812, + "grad_norm": 0.6130402684211731, + "learning_rate": 3.412322274881517e-06, + "loss": 0.3452, + "step": 418 + }, + { + "epoch": 0.6619273301737757, + "grad_norm": 0.6854075789451599, + "learning_rate": 3.39652448657188e-06, + "loss": 0.3551, + "step": 419 + }, + { + "epoch": 0.6635071090047393, + "grad_norm": 0.5365926027297974, + "learning_rate": 3.3807266982622433e-06, + "loss": 0.4011, + "step": 420 + }, + { + "epoch": 0.665086887835703, + "grad_norm": 1.0338938236236572, + "learning_rate": 3.3649289099526066e-06, + "loss": 0.4623, + "step": 421 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.5612855553627014, + "learning_rate": 3.34913112164297e-06, + "loss": 0.3738, + "step": 422 + }, + { + "epoch": 0.6682464454976303, + "grad_norm": 0.5113286375999451, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.3865, + "step": 423 + }, + { + "epoch": 0.669826224328594, + "grad_norm": 0.5509905815124512, + "learning_rate": 3.3175355450236967e-06, + "loss": 0.4093, + "step": 424 + }, + { + "epoch": 0.6714060031595577, + "grad_norm": 0.5425525903701782, + "learning_rate": 3.30173775671406e-06, + "loss": 0.383, + "step": 425 + }, + { + "epoch": 0.6729857819905213, + "grad_norm": 0.5866172909736633, + "learning_rate": 3.2859399684044234e-06, + "loss": 0.4843, + "step": 426 + }, + { + "epoch": 0.674565560821485, + "grad_norm": 1.0777703523635864, + "learning_rate": 3.2701421800947867e-06, + "loss": 0.3748, + "step": 427 + }, + { + "epoch": 0.6761453396524486, + "grad_norm": 0.49126845598220825, + "learning_rate": 3.25434439178515e-06, + "loss": 0.3505, + "step": 428 + }, + { + "epoch": 0.6777251184834123, + "grad_norm": 0.5471718311309814, + "learning_rate": 3.2385466034755135e-06, + "loss": 0.4755, + "step": 429 + }, + { + "epoch": 0.6793048973143759, + "grad_norm": 0.5689931511878967, + "learning_rate": 3.222748815165877e-06, + "loss": 0.3956, + "step": 430 + }, + { + "epoch": 0.6808846761453397, + "grad_norm": 0.6496183276176453, + "learning_rate": 3.2069510268562406e-06, + "loss": 0.4598, + "step": 431 + }, + { + "epoch": 0.6824644549763034, + "grad_norm": 0.47042712569236755, + "learning_rate": 3.191153238546604e-06, + "loss": 0.3756, + "step": 432 + }, + { + "epoch": 0.684044233807267, + "grad_norm": 0.5819857120513916, + "learning_rate": 3.1753554502369673e-06, + "loss": 0.4803, + "step": 433 + }, + { + "epoch": 0.6856240126382307, + "grad_norm": 0.5752127766609192, + "learning_rate": 3.1595576619273307e-06, + "loss": 0.3916, + "step": 434 + }, + { + "epoch": 0.6872037914691943, + "grad_norm": 0.6483988761901855, + "learning_rate": 3.143759873617694e-06, + "loss": 0.4338, + "step": 435 + }, + { + "epoch": 0.688783570300158, + "grad_norm": 0.7817516326904297, + "learning_rate": 3.1279620853080574e-06, + "loss": 0.3645, + "step": 436 + }, + { + "epoch": 0.6903633491311216, + "grad_norm": 0.4980696737766266, + "learning_rate": 3.1121642969984207e-06, + "loss": 0.3962, + "step": 437 + }, + { + "epoch": 0.6919431279620853, + "grad_norm": 0.5592882037162781, + "learning_rate": 3.096366508688784e-06, + "loss": 0.3645, + "step": 438 + }, + { + "epoch": 0.693522906793049, + "grad_norm": 0.6228163242340088, + "learning_rate": 3.0805687203791474e-06, + "loss": 0.3696, + "step": 439 + }, + { + "epoch": 0.6951026856240127, + "grad_norm": 0.6718009114265442, + "learning_rate": 3.0647709320695108e-06, + "loss": 0.4926, + "step": 440 + }, + { + "epoch": 0.6966824644549763, + "grad_norm": 0.6085376143455505, + "learning_rate": 3.048973143759874e-06, + "loss": 0.418, + "step": 441 + }, + { + "epoch": 0.69826224328594, + "grad_norm": 0.7716324925422668, + "learning_rate": 3.0331753554502375e-06, + "loss": 0.4038, + "step": 442 + }, + { + "epoch": 0.6998420221169036, + "grad_norm": 0.7239758968353271, + "learning_rate": 3.017377567140601e-06, + "loss": 0.4596, + "step": 443 + }, + { + "epoch": 0.7014218009478673, + "grad_norm": 0.6308011412620544, + "learning_rate": 3.001579778830964e-06, + "loss": 0.4082, + "step": 444 + }, + { + "epoch": 0.7030015797788309, + "grad_norm": 0.515626072883606, + "learning_rate": 2.985781990521327e-06, + "loss": 0.4688, + "step": 445 + }, + { + "epoch": 0.7045813586097947, + "grad_norm": 0.5395441651344299, + "learning_rate": 2.9699842022116905e-06, + "loss": 0.3448, + "step": 446 + }, + { + "epoch": 0.7061611374407583, + "grad_norm": 0.5883680582046509, + "learning_rate": 2.954186413902054e-06, + "loss": 0.4546, + "step": 447 + }, + { + "epoch": 0.707740916271722, + "grad_norm": 0.7300311326980591, + "learning_rate": 2.938388625592417e-06, + "loss": 0.368, + "step": 448 + }, + { + "epoch": 0.7093206951026856, + "grad_norm": 0.5901307463645935, + "learning_rate": 2.9225908372827806e-06, + "loss": 0.3688, + "step": 449 + }, + { + "epoch": 0.7109004739336493, + "grad_norm": 0.6521854996681213, + "learning_rate": 2.906793048973144e-06, + "loss": 0.3876, + "step": 450 + }, + { + "epoch": 0.7124802527646129, + "grad_norm": 0.688450038433075, + "learning_rate": 2.8909952606635073e-06, + "loss": 0.4298, + "step": 451 + }, + { + "epoch": 0.7140600315955766, + "grad_norm": 0.6533556580543518, + "learning_rate": 2.8751974723538706e-06, + "loss": 0.3589, + "step": 452 + }, + { + "epoch": 0.7156398104265402, + "grad_norm": 0.5261491537094116, + "learning_rate": 2.859399684044234e-06, + "loss": 0.3886, + "step": 453 + }, + { + "epoch": 0.717219589257504, + "grad_norm": 0.5488421320915222, + "learning_rate": 2.8436018957345973e-06, + "loss": 0.411, + "step": 454 + }, + { + "epoch": 0.7187993680884676, + "grad_norm": 0.6415657997131348, + "learning_rate": 2.8278041074249607e-06, + "loss": 0.4581, + "step": 455 + }, + { + "epoch": 0.7203791469194313, + "grad_norm": 0.5058445334434509, + "learning_rate": 2.812006319115324e-06, + "loss": 0.4325, + "step": 456 + }, + { + "epoch": 0.721958925750395, + "grad_norm": 0.6409322619438171, + "learning_rate": 2.7962085308056874e-06, + "loss": 0.3759, + "step": 457 + }, + { + "epoch": 0.7235387045813586, + "grad_norm": 0.5578014850616455, + "learning_rate": 2.7804107424960508e-06, + "loss": 0.3947, + "step": 458 + }, + { + "epoch": 0.7251184834123223, + "grad_norm": 0.6064183115959167, + "learning_rate": 2.764612954186414e-06, + "loss": 0.4766, + "step": 459 + }, + { + "epoch": 0.7266982622432859, + "grad_norm": 0.6067904233932495, + "learning_rate": 2.7488151658767775e-06, + "loss": 0.4698, + "step": 460 + }, + { + "epoch": 0.7282780410742496, + "grad_norm": 0.526088297367096, + "learning_rate": 2.733017377567141e-06, + "loss": 0.3997, + "step": 461 + }, + { + "epoch": 0.7298578199052133, + "grad_norm": 0.6290006637573242, + "learning_rate": 2.717219589257504e-06, + "loss": 0.4393, + "step": 462 + }, + { + "epoch": 0.731437598736177, + "grad_norm": 0.5822445154190063, + "learning_rate": 2.7014218009478675e-06, + "loss": 0.4767, + "step": 463 + }, + { + "epoch": 0.7330173775671406, + "grad_norm": 0.5798205733299255, + "learning_rate": 2.685624012638231e-06, + "loss": 0.4163, + "step": 464 + }, + { + "epoch": 0.7345971563981043, + "grad_norm": 0.6234124898910522, + "learning_rate": 2.6698262243285942e-06, + "loss": 0.387, + "step": 465 + }, + { + "epoch": 0.7361769352290679, + "grad_norm": 0.5226984620094299, + "learning_rate": 2.6540284360189576e-06, + "loss": 0.4144, + "step": 466 + }, + { + "epoch": 0.7377567140600316, + "grad_norm": 0.529303789138794, + "learning_rate": 2.638230647709321e-06, + "loss": 0.4689, + "step": 467 + }, + { + "epoch": 0.7393364928909952, + "grad_norm": 0.6620000004768372, + "learning_rate": 2.6224328593996843e-06, + "loss": 0.4358, + "step": 468 + }, + { + "epoch": 0.740916271721959, + "grad_norm": 0.8560294508934021, + "learning_rate": 2.606635071090048e-06, + "loss": 0.422, + "step": 469 + }, + { + "epoch": 0.7424960505529226, + "grad_norm": 0.47033989429473877, + "learning_rate": 2.5908372827804106e-06, + "loss": 0.4462, + "step": 470 + }, + { + "epoch": 0.7440758293838863, + "grad_norm": 0.5476656556129456, + "learning_rate": 2.575039494470774e-06, + "loss": 0.3818, + "step": 471 + }, + { + "epoch": 0.7456556082148499, + "grad_norm": 0.5771902203559875, + "learning_rate": 2.5592417061611373e-06, + "loss": 0.3835, + "step": 472 + }, + { + "epoch": 0.7472353870458136, + "grad_norm": 0.6452733278274536, + "learning_rate": 2.5434439178515007e-06, + "loss": 0.4224, + "step": 473 + }, + { + "epoch": 0.7488151658767772, + "grad_norm": 0.5318686962127686, + "learning_rate": 2.527646129541864e-06, + "loss": 0.4812, + "step": 474 + }, + { + "epoch": 0.7503949447077409, + "grad_norm": 0.6591460108757019, + "learning_rate": 2.5118483412322274e-06, + "loss": 0.4546, + "step": 475 + }, + { + "epoch": 0.7519747235387045, + "grad_norm": 0.5857440829277039, + "learning_rate": 2.4960505529225907e-06, + "loss": 0.4008, + "step": 476 + }, + { + "epoch": 0.7535545023696683, + "grad_norm": 0.6430768370628357, + "learning_rate": 2.480252764612954e-06, + "loss": 0.3191, + "step": 477 + }, + { + "epoch": 0.7551342812006319, + "grad_norm": 0.7442892789840698, + "learning_rate": 2.4644549763033174e-06, + "loss": 0.4171, + "step": 478 + }, + { + "epoch": 0.7567140600315956, + "grad_norm": 0.6390454173088074, + "learning_rate": 2.4486571879936812e-06, + "loss": 0.5381, + "step": 479 + }, + { + "epoch": 0.7582938388625592, + "grad_norm": 0.6277416348457336, + "learning_rate": 2.4328593996840446e-06, + "loss": 0.4824, + "step": 480 + }, + { + "epoch": 0.7598736176935229, + "grad_norm": 0.6043097972869873, + "learning_rate": 2.417061611374408e-06, + "loss": 0.4266, + "step": 481 + }, + { + "epoch": 0.7614533965244866, + "grad_norm": 0.6095964312553406, + "learning_rate": 2.4012638230647713e-06, + "loss": 0.4258, + "step": 482 + }, + { + "epoch": 0.7630331753554502, + "grad_norm": 0.5433639287948608, + "learning_rate": 2.3854660347551346e-06, + "loss": 0.4873, + "step": 483 + }, + { + "epoch": 0.764612954186414, + "grad_norm": 0.49287649989128113, + "learning_rate": 2.369668246445498e-06, + "loss": 0.4814, + "step": 484 + }, + { + "epoch": 0.7661927330173776, + "grad_norm": 0.5905902981758118, + "learning_rate": 2.3538704581358613e-06, + "loss": 0.4519, + "step": 485 + }, + { + "epoch": 0.7677725118483413, + "grad_norm": 0.6697285771369934, + "learning_rate": 2.3380726698262247e-06, + "loss": 0.4686, + "step": 486 + }, + { + "epoch": 0.7693522906793049, + "grad_norm": 0.5338664650917053, + "learning_rate": 2.322274881516588e-06, + "loss": 0.401, + "step": 487 + }, + { + "epoch": 0.7709320695102686, + "grad_norm": 0.5338428616523743, + "learning_rate": 2.3064770932069514e-06, + "loss": 0.4045, + "step": 488 + }, + { + "epoch": 0.7725118483412322, + "grad_norm": 0.6102830171585083, + "learning_rate": 2.2906793048973143e-06, + "loss": 0.3785, + "step": 489 + }, + { + "epoch": 0.7740916271721959, + "grad_norm": 0.5787335634231567, + "learning_rate": 2.2748815165876777e-06, + "loss": 0.42, + "step": 490 + }, + { + "epoch": 0.7756714060031595, + "grad_norm": 0.7426438331604004, + "learning_rate": 2.259083728278041e-06, + "loss": 0.4676, + "step": 491 + }, + { + "epoch": 0.7772511848341233, + "grad_norm": 0.5988475680351257, + "learning_rate": 2.2432859399684044e-06, + "loss": 0.5404, + "step": 492 + }, + { + "epoch": 0.7788309636650869, + "grad_norm": 0.6289830803871155, + "learning_rate": 2.2274881516587678e-06, + "loss": 0.396, + "step": 493 + }, + { + "epoch": 0.7804107424960506, + "grad_norm": 0.6077900528907776, + "learning_rate": 2.211690363349131e-06, + "loss": 0.4016, + "step": 494 + }, + { + "epoch": 0.7819905213270142, + "grad_norm": 0.8171889781951904, + "learning_rate": 2.1958925750394945e-06, + "loss": 0.3638, + "step": 495 + }, + { + "epoch": 0.7835703001579779, + "grad_norm": 0.6225026845932007, + "learning_rate": 2.180094786729858e-06, + "loss": 0.4088, + "step": 496 + }, + { + "epoch": 0.7851500789889415, + "grad_norm": 0.6262929439544678, + "learning_rate": 2.164296998420221e-06, + "loss": 0.3311, + "step": 497 + }, + { + "epoch": 0.7867298578199052, + "grad_norm": 0.662129282951355, + "learning_rate": 2.148499210110585e-06, + "loss": 0.4434, + "step": 498 + }, + { + "epoch": 0.7883096366508688, + "grad_norm": 0.5046777725219727, + "learning_rate": 2.1327014218009483e-06, + "loss": 0.5042, + "step": 499 + }, + { + "epoch": 0.7898894154818326, + "grad_norm": 0.6273382306098938, + "learning_rate": 2.1169036334913117e-06, + "loss": 0.345, + "step": 500 + }, + { + "epoch": 0.7914691943127962, + "grad_norm": 0.5484871864318848, + "learning_rate": 2.101105845181675e-06, + "loss": 0.3476, + "step": 501 + }, + { + "epoch": 0.7930489731437599, + "grad_norm": 0.6779518723487854, + "learning_rate": 2.085308056872038e-06, + "loss": 0.4062, + "step": 502 + }, + { + "epoch": 0.7946287519747235, + "grad_norm": 0.4969736635684967, + "learning_rate": 2.0695102685624013e-06, + "loss": 0.3615, + "step": 503 + }, + { + "epoch": 0.7962085308056872, + "grad_norm": 0.5542388558387756, + "learning_rate": 2.0537124802527647e-06, + "loss": 0.39, + "step": 504 + }, + { + "epoch": 0.7977883096366508, + "grad_norm": 0.8587651252746582, + "learning_rate": 2.037914691943128e-06, + "loss": 0.423, + "step": 505 + }, + { + "epoch": 0.7993680884676145, + "grad_norm": 0.6399357318878174, + "learning_rate": 2.0221169036334914e-06, + "loss": 0.4645, + "step": 506 + }, + { + "epoch": 0.8009478672985783, + "grad_norm": 0.5677849650382996, + "learning_rate": 2.0063191153238547e-06, + "loss": 0.3749, + "step": 507 + }, + { + "epoch": 0.8025276461295419, + "grad_norm": 0.5609621405601501, + "learning_rate": 1.990521327014218e-06, + "loss": 0.4727, + "step": 508 + }, + { + "epoch": 0.8041074249605056, + "grad_norm": 0.615185558795929, + "learning_rate": 1.9747235387045814e-06, + "loss": 0.4349, + "step": 509 + }, + { + "epoch": 0.8056872037914692, + "grad_norm": 0.5093739032745361, + "learning_rate": 1.958925750394945e-06, + "loss": 0.3502, + "step": 510 + }, + { + "epoch": 0.8072669826224329, + "grad_norm": 0.8513323068618774, + "learning_rate": 1.943127962085308e-06, + "loss": 0.3902, + "step": 511 + }, + { + "epoch": 0.8088467614533965, + "grad_norm": 0.6797610521316528, + "learning_rate": 1.9273301737756715e-06, + "loss": 0.4987, + "step": 512 + }, + { + "epoch": 0.8104265402843602, + "grad_norm": 0.5715585947036743, + "learning_rate": 1.911532385466035e-06, + "loss": 0.3965, + "step": 513 + }, + { + "epoch": 0.8120063191153238, + "grad_norm": 0.5537532567977905, + "learning_rate": 1.8957345971563982e-06, + "loss": 0.3832, + "step": 514 + }, + { + "epoch": 0.8135860979462876, + "grad_norm": 0.5337470173835754, + "learning_rate": 1.8799368088467616e-06, + "loss": 0.4136, + "step": 515 + }, + { + "epoch": 0.8151658767772512, + "grad_norm": 0.5929555892944336, + "learning_rate": 1.864139020537125e-06, + "loss": 0.3901, + "step": 516 + }, + { + "epoch": 0.8167456556082149, + "grad_norm": 0.6738921403884888, + "learning_rate": 1.8483412322274883e-06, + "loss": 0.4128, + "step": 517 + }, + { + "epoch": 0.8183254344391785, + "grad_norm": 0.598659098148346, + "learning_rate": 1.8325434439178516e-06, + "loss": 0.3707, + "step": 518 + }, + { + "epoch": 0.8199052132701422, + "grad_norm": 0.5679790377616882, + "learning_rate": 1.816745655608215e-06, + "loss": 0.457, + "step": 519 + }, + { + "epoch": 0.8214849921011058, + "grad_norm": 0.5459115505218506, + "learning_rate": 1.8009478672985784e-06, + "loss": 0.3613, + "step": 520 + }, + { + "epoch": 0.8230647709320695, + "grad_norm": 0.5752125978469849, + "learning_rate": 1.7851500789889417e-06, + "loss": 0.479, + "step": 521 + }, + { + "epoch": 0.8246445497630331, + "grad_norm": 0.5184637904167175, + "learning_rate": 1.769352290679305e-06, + "loss": 0.4126, + "step": 522 + }, + { + "epoch": 0.8262243285939969, + "grad_norm": 0.6329041123390198, + "learning_rate": 1.7535545023696684e-06, + "loss": 0.4221, + "step": 523 + }, + { + "epoch": 0.8278041074249605, + "grad_norm": 0.5233784317970276, + "learning_rate": 1.7377567140600318e-06, + "loss": 0.4375, + "step": 524 + }, + { + "epoch": 0.8293838862559242, + "grad_norm": 0.5424541234970093, + "learning_rate": 1.7219589257503951e-06, + "loss": 0.4447, + "step": 525 + }, + { + "epoch": 0.8309636650868878, + "grad_norm": 0.5534167885780334, + "learning_rate": 1.7061611374407585e-06, + "loss": 0.3672, + "step": 526 + }, + { + "epoch": 0.8325434439178515, + "grad_norm": 0.605102002620697, + "learning_rate": 1.6903633491311216e-06, + "loss": 0.4319, + "step": 527 + }, + { + "epoch": 0.8341232227488151, + "grad_norm": 0.5609396696090698, + "learning_rate": 1.674565560821485e-06, + "loss": 0.3984, + "step": 528 + }, + { + "epoch": 0.8357030015797788, + "grad_norm": 0.7964479923248291, + "learning_rate": 1.6587677725118483e-06, + "loss": 0.407, + "step": 529 + }, + { + "epoch": 0.8372827804107424, + "grad_norm": 0.4886048436164856, + "learning_rate": 1.6429699842022117e-06, + "loss": 0.4506, + "step": 530 + }, + { + "epoch": 0.8388625592417062, + "grad_norm": 0.543812096118927, + "learning_rate": 1.627172195892575e-06, + "loss": 0.3141, + "step": 531 + }, + { + "epoch": 0.8404423380726699, + "grad_norm": 0.5370059609413147, + "learning_rate": 1.6113744075829384e-06, + "loss": 0.3712, + "step": 532 + }, + { + "epoch": 0.8420221169036335, + "grad_norm": 0.7402203679084778, + "learning_rate": 1.595576619273302e-06, + "loss": 0.4136, + "step": 533 + }, + { + "epoch": 0.8436018957345972, + "grad_norm": 0.6814244985580444, + "learning_rate": 1.5797788309636653e-06, + "loss": 0.4634, + "step": 534 + }, + { + "epoch": 0.8451816745655608, + "grad_norm": 0.5919080972671509, + "learning_rate": 1.5639810426540287e-06, + "loss": 0.4238, + "step": 535 + }, + { + "epoch": 0.8467614533965245, + "grad_norm": 0.617522120475769, + "learning_rate": 1.548183254344392e-06, + "loss": 0.3431, + "step": 536 + }, + { + "epoch": 0.8483412322274881, + "grad_norm": 0.49482643604278564, + "learning_rate": 1.5323854660347554e-06, + "loss": 0.3882, + "step": 537 + }, + { + "epoch": 0.8499210110584519, + "grad_norm": 0.5525531768798828, + "learning_rate": 1.5165876777251187e-06, + "loss": 0.4053, + "step": 538 + }, + { + "epoch": 0.8515007898894155, + "grad_norm": 0.6634103655815125, + "learning_rate": 1.500789889415482e-06, + "loss": 0.4624, + "step": 539 + }, + { + "epoch": 0.8530805687203792, + "grad_norm": 0.45309382677078247, + "learning_rate": 1.4849921011058452e-06, + "loss": 0.3486, + "step": 540 + }, + { + "epoch": 0.8546603475513428, + "grad_norm": 0.778338611125946, + "learning_rate": 1.4691943127962086e-06, + "loss": 0.3984, + "step": 541 + }, + { + "epoch": 0.8562401263823065, + "grad_norm": 0.6093356609344482, + "learning_rate": 1.453396524486572e-06, + "loss": 0.333, + "step": 542 + }, + { + "epoch": 0.8578199052132701, + "grad_norm": 0.49551188945770264, + "learning_rate": 1.4375987361769353e-06, + "loss": 0.3915, + "step": 543 + }, + { + "epoch": 0.8593996840442338, + "grad_norm": 0.5423188209533691, + "learning_rate": 1.4218009478672987e-06, + "loss": 0.4192, + "step": 544 + }, + { + "epoch": 0.8609794628751974, + "grad_norm": 0.8111097812652588, + "learning_rate": 1.406003159557662e-06, + "loss": 0.473, + "step": 545 + }, + { + "epoch": 0.8625592417061612, + "grad_norm": 0.6064862012863159, + "learning_rate": 1.3902053712480254e-06, + "loss": 0.4164, + "step": 546 + }, + { + "epoch": 0.8641390205371248, + "grad_norm": 0.6180470585823059, + "learning_rate": 1.3744075829383887e-06, + "loss": 0.4351, + "step": 547 + }, + { + "epoch": 0.8657187993680885, + "grad_norm": 0.5101069808006287, + "learning_rate": 1.358609794628752e-06, + "loss": 0.3806, + "step": 548 + }, + { + "epoch": 0.8672985781990521, + "grad_norm": 0.6269749402999878, + "learning_rate": 1.3428120063191154e-06, + "loss": 0.4028, + "step": 549 + }, + { + "epoch": 0.8688783570300158, + "grad_norm": 0.6344918608665466, + "learning_rate": 1.3270142180094788e-06, + "loss": 0.3206, + "step": 550 + }, + { + "epoch": 0.8704581358609794, + "grad_norm": 0.7053835988044739, + "learning_rate": 1.3112164296998422e-06, + "loss": 0.4404, + "step": 551 + }, + { + "epoch": 0.8720379146919431, + "grad_norm": 0.4780917465686798, + "learning_rate": 1.2954186413902053e-06, + "loss": 0.4089, + "step": 552 + }, + { + "epoch": 0.8736176935229067, + "grad_norm": 0.5235942006111145, + "learning_rate": 1.2796208530805687e-06, + "loss": 0.3992, + "step": 553 + }, + { + "epoch": 0.8751974723538705, + "grad_norm": 0.5037370324134827, + "learning_rate": 1.263823064770932e-06, + "loss": 0.3727, + "step": 554 + }, + { + "epoch": 0.8767772511848341, + "grad_norm": 0.5422868132591248, + "learning_rate": 1.2480252764612954e-06, + "loss": 0.4524, + "step": 555 + }, + { + "epoch": 0.8783570300157978, + "grad_norm": 0.5287191271781921, + "learning_rate": 1.2322274881516587e-06, + "loss": 0.3445, + "step": 556 + }, + { + "epoch": 0.8799368088467614, + "grad_norm": 0.49679964780807495, + "learning_rate": 1.2164296998420223e-06, + "loss": 0.3357, + "step": 557 + }, + { + "epoch": 0.8815165876777251, + "grad_norm": 0.5391539931297302, + "learning_rate": 1.2006319115323856e-06, + "loss": 0.4645, + "step": 558 + }, + { + "epoch": 0.8830963665086888, + "grad_norm": 0.5474575757980347, + "learning_rate": 1.184834123222749e-06, + "loss": 0.4109, + "step": 559 + }, + { + "epoch": 0.8846761453396524, + "grad_norm": 0.5920886993408203, + "learning_rate": 1.1690363349131124e-06, + "loss": 0.4034, + "step": 560 + }, + { + "epoch": 0.8862559241706162, + "grad_norm": 0.5637263655662537, + "learning_rate": 1.1532385466034757e-06, + "loss": 0.392, + "step": 561 + }, + { + "epoch": 0.8878357030015798, + "grad_norm": 0.6719076037406921, + "learning_rate": 1.1374407582938388e-06, + "loss": 0.3798, + "step": 562 + }, + { + "epoch": 0.8894154818325435, + "grad_norm": 0.5554001927375793, + "learning_rate": 1.1216429699842022e-06, + "loss": 0.3901, + "step": 563 + }, + { + "epoch": 0.8909952606635071, + "grad_norm": 0.6078475713729858, + "learning_rate": 1.1058451816745656e-06, + "loss": 0.3574, + "step": 564 + }, + { + "epoch": 0.8925750394944708, + "grad_norm": 0.9478325843811035, + "learning_rate": 1.090047393364929e-06, + "loss": 0.3831, + "step": 565 + }, + { + "epoch": 0.8941548183254344, + "grad_norm": 0.5259877443313599, + "learning_rate": 1.0742496050552925e-06, + "loss": 0.4003, + "step": 566 + }, + { + "epoch": 0.8957345971563981, + "grad_norm": 0.5395880937576294, + "learning_rate": 1.0584518167456558e-06, + "loss": 0.3513, + "step": 567 + }, + { + "epoch": 0.8973143759873617, + "grad_norm": 0.5458592772483826, + "learning_rate": 1.042654028436019e-06, + "loss": 0.49, + "step": 568 + }, + { + "epoch": 0.8988941548183255, + "grad_norm": 0.5552616715431213, + "learning_rate": 1.0268562401263823e-06, + "loss": 0.3905, + "step": 569 + }, + { + "epoch": 0.9004739336492891, + "grad_norm": 0.551466166973114, + "learning_rate": 1.0110584518167457e-06, + "loss": 0.4241, + "step": 570 + }, + { + "epoch": 0.9020537124802528, + "grad_norm": 0.7195900082588196, + "learning_rate": 9.95260663507109e-07, + "loss": 0.3912, + "step": 571 + }, + { + "epoch": 0.9036334913112164, + "grad_norm": 0.5951517820358276, + "learning_rate": 9.794628751974724e-07, + "loss": 0.4267, + "step": 572 + }, + { + "epoch": 0.9052132701421801, + "grad_norm": 0.7582541108131409, + "learning_rate": 9.636650868878358e-07, + "loss": 0.4024, + "step": 573 + }, + { + "epoch": 0.9067930489731437, + "grad_norm": 0.6346389651298523, + "learning_rate": 9.478672985781991e-07, + "loss": 0.4677, + "step": 574 + }, + { + "epoch": 0.9083728278041074, + "grad_norm": 0.7323048710823059, + "learning_rate": 9.320695102685625e-07, + "loss": 0.4332, + "step": 575 + }, + { + "epoch": 0.909952606635071, + "grad_norm": 0.5796726942062378, + "learning_rate": 9.162717219589258e-07, + "loss": 0.3514, + "step": 576 + }, + { + "epoch": 0.9115323854660348, + "grad_norm": 0.7424004673957825, + "learning_rate": 9.004739336492892e-07, + "loss": 0.4178, + "step": 577 + }, + { + "epoch": 0.9131121642969984, + "grad_norm": 0.525142252445221, + "learning_rate": 8.846761453396525e-07, + "loss": 0.4498, + "step": 578 + }, + { + "epoch": 0.9146919431279621, + "grad_norm": 0.5565955638885498, + "learning_rate": 8.688783570300159e-07, + "loss": 0.4532, + "step": 579 + }, + { + "epoch": 0.9162717219589257, + "grad_norm": 0.540267288684845, + "learning_rate": 8.530805687203792e-07, + "loss": 0.4828, + "step": 580 + }, + { + "epoch": 0.9178515007898894, + "grad_norm": 0.5061677694320679, + "learning_rate": 8.372827804107425e-07, + "loss": 0.3505, + "step": 581 + }, + { + "epoch": 0.919431279620853, + "grad_norm": 0.5490908622741699, + "learning_rate": 8.214849921011058e-07, + "loss": 0.4402, + "step": 582 + }, + { + "epoch": 0.9210110584518167, + "grad_norm": 0.5788997411727905, + "learning_rate": 8.056872037914692e-07, + "loss": 0.3256, + "step": 583 + }, + { + "epoch": 0.9225908372827805, + "grad_norm": 0.5741492509841919, + "learning_rate": 7.898894154818327e-07, + "loss": 0.451, + "step": 584 + }, + { + "epoch": 0.9241706161137441, + "grad_norm": 0.5012090802192688, + "learning_rate": 7.74091627172196e-07, + "loss": 0.3513, + "step": 585 + }, + { + "epoch": 0.9257503949447078, + "grad_norm": 0.5613192915916443, + "learning_rate": 7.582938388625594e-07, + "loss": 0.3499, + "step": 586 + }, + { + "epoch": 0.9273301737756714, + "grad_norm": 0.5941815376281738, + "learning_rate": 7.424960505529226e-07, + "loss": 0.4133, + "step": 587 + }, + { + "epoch": 0.9289099526066351, + "grad_norm": 0.7772453427314758, + "learning_rate": 7.26698262243286e-07, + "loss": 0.3818, + "step": 588 + }, + { + "epoch": 0.9304897314375987, + "grad_norm": 0.5977700352668762, + "learning_rate": 7.109004739336493e-07, + "loss": 0.4099, + "step": 589 + }, + { + "epoch": 0.9320695102685624, + "grad_norm": 0.7777069807052612, + "learning_rate": 6.951026856240127e-07, + "loss": 0.4341, + "step": 590 + }, + { + "epoch": 0.933649289099526, + "grad_norm": 0.5362728834152222, + "learning_rate": 6.79304897314376e-07, + "loss": 0.4431, + "step": 591 + }, + { + "epoch": 0.9352290679304898, + "grad_norm": 0.5126134157180786, + "learning_rate": 6.635071090047394e-07, + "loss": 0.3713, + "step": 592 + }, + { + "epoch": 0.9368088467614534, + "grad_norm": 0.5886785984039307, + "learning_rate": 6.477093206951026e-07, + "loss": 0.405, + "step": 593 + }, + { + "epoch": 0.9383886255924171, + "grad_norm": 0.5328089594841003, + "learning_rate": 6.31911532385466e-07, + "loss": 0.3952, + "step": 594 + }, + { + "epoch": 0.9399684044233807, + "grad_norm": 0.7170501351356506, + "learning_rate": 6.161137440758294e-07, + "loss": 0.3979, + "step": 595 + }, + { + "epoch": 0.9415481832543444, + "grad_norm": 0.6048548817634583, + "learning_rate": 6.003159557661928e-07, + "loss": 0.3425, + "step": 596 + }, + { + "epoch": 0.943127962085308, + "grad_norm": 0.5635291337966919, + "learning_rate": 5.845181674565562e-07, + "loss": 0.3008, + "step": 597 + }, + { + "epoch": 0.9447077409162717, + "grad_norm": 0.6890112161636353, + "learning_rate": 5.687203791469194e-07, + "loss": 0.4205, + "step": 598 + }, + { + "epoch": 0.9462875197472354, + "grad_norm": 0.5197014212608337, + "learning_rate": 5.529225908372828e-07, + "loss": 0.4589, + "step": 599 + }, + { + "epoch": 0.9478672985781991, + "grad_norm": 0.5197718143463135, + "learning_rate": 5.371248025276462e-07, + "loss": 0.2678, + "step": 600 + }, + { + "epoch": 0.9494470774091627, + "grad_norm": 0.44931474328041077, + "learning_rate": 5.213270142180095e-07, + "loss": 0.4351, + "step": 601 + }, + { + "epoch": 0.9510268562401264, + "grad_norm": 0.47795984148979187, + "learning_rate": 5.055292259083728e-07, + "loss": 0.4392, + "step": 602 + }, + { + "epoch": 0.95260663507109, + "grad_norm": 0.6027578115463257, + "learning_rate": 4.897314375987362e-07, + "loss": 0.4499, + "step": 603 + }, + { + "epoch": 0.9541864139020537, + "grad_norm": 0.6160722374916077, + "learning_rate": 4.7393364928909956e-07, + "loss": 0.434, + "step": 604 + }, + { + "epoch": 0.9557661927330173, + "grad_norm": 0.8371343612670898, + "learning_rate": 4.581358609794629e-07, + "loss": 0.3911, + "step": 605 + }, + { + "epoch": 0.957345971563981, + "grad_norm": 0.5282484292984009, + "learning_rate": 4.4233807266982627e-07, + "loss": 0.4445, + "step": 606 + }, + { + "epoch": 0.9589257503949447, + "grad_norm": 0.5557743310928345, + "learning_rate": 4.265402843601896e-07, + "loss": 0.4103, + "step": 607 + }, + { + "epoch": 0.9605055292259084, + "grad_norm": 0.6362637281417847, + "learning_rate": 4.107424960505529e-07, + "loss": 0.3856, + "step": 608 + }, + { + "epoch": 0.9620853080568721, + "grad_norm": 0.745617151260376, + "learning_rate": 3.9494470774091633e-07, + "loss": 0.4179, + "step": 609 + }, + { + "epoch": 0.9636650868878357, + "grad_norm": 0.659038782119751, + "learning_rate": 3.791469194312797e-07, + "loss": 0.4027, + "step": 610 + }, + { + "epoch": 0.9652448657187994, + "grad_norm": 0.645199716091156, + "learning_rate": 3.63349131121643e-07, + "loss": 0.3501, + "step": 611 + }, + { + "epoch": 0.966824644549763, + "grad_norm": 0.4868941605091095, + "learning_rate": 3.4755134281200634e-07, + "loss": 0.3385, + "step": 612 + }, + { + "epoch": 0.9684044233807267, + "grad_norm": 0.5993934273719788, + "learning_rate": 3.317535545023697e-07, + "loss": 0.369, + "step": 613 + }, + { + "epoch": 0.9699842022116903, + "grad_norm": 0.6094574928283691, + "learning_rate": 3.15955766192733e-07, + "loss": 0.4899, + "step": 614 + }, + { + "epoch": 0.9715639810426541, + "grad_norm": 0.6989656686782837, + "learning_rate": 3.001579778830964e-07, + "loss": 0.4346, + "step": 615 + }, + { + "epoch": 0.9731437598736177, + "grad_norm": 0.5412940382957458, + "learning_rate": 2.843601895734597e-07, + "loss": 0.4515, + "step": 616 + }, + { + "epoch": 0.9747235387045814, + "grad_norm": 0.507622241973877, + "learning_rate": 2.685624012638231e-07, + "loss": 0.4171, + "step": 617 + }, + { + "epoch": 0.976303317535545, + "grad_norm": 0.4564089775085449, + "learning_rate": 2.527646129541864e-07, + "loss": 0.3452, + "step": 618 + }, + { + "epoch": 0.9778830963665087, + "grad_norm": 0.48170286417007446, + "learning_rate": 2.3696682464454978e-07, + "loss": 0.3866, + "step": 619 + }, + { + "epoch": 0.9794628751974723, + "grad_norm": 0.47774481773376465, + "learning_rate": 2.2116903633491313e-07, + "loss": 0.4425, + "step": 620 + }, + { + "epoch": 0.981042654028436, + "grad_norm": 0.4460739493370056, + "learning_rate": 2.0537124802527646e-07, + "loss": 0.3991, + "step": 621 + }, + { + "epoch": 0.9826224328593997, + "grad_norm": 0.536359965801239, + "learning_rate": 1.8957345971563984e-07, + "loss": 0.327, + "step": 622 + }, + { + "epoch": 0.9842022116903634, + "grad_norm": 0.5439571738243103, + "learning_rate": 1.7377567140600317e-07, + "loss": 0.408, + "step": 623 + }, + { + "epoch": 0.985781990521327, + "grad_norm": 0.8827345967292786, + "learning_rate": 1.579778830963665e-07, + "loss": 0.4924, + "step": 624 + }, + { + "epoch": 0.9873617693522907, + "grad_norm": 0.4992835521697998, + "learning_rate": 1.4218009478672986e-07, + "loss": 0.3921, + "step": 625 + }, + { + "epoch": 0.9889415481832543, + "grad_norm": 0.7306237816810608, + "learning_rate": 1.263823064770932e-07, + "loss": 0.5063, + "step": 626 + }, + { + "epoch": 0.990521327014218, + "grad_norm": 0.5200903415679932, + "learning_rate": 1.1058451816745657e-07, + "loss": 0.358, + "step": 627 + }, + { + "epoch": 0.9921011058451816, + "grad_norm": 0.42708104848861694, + "learning_rate": 9.478672985781992e-08, + "loss": 0.3361, + "step": 628 + }, + { + "epoch": 0.9936808846761453, + "grad_norm": 0.5993225574493408, + "learning_rate": 7.898894154818325e-08, + "loss": 0.3625, + "step": 629 + }, + { + "epoch": 0.995260663507109, + "grad_norm": 0.49995774030685425, + "learning_rate": 6.31911532385466e-08, + "loss": 0.3746, + "step": 630 + }, + { + "epoch": 0.9968404423380727, + "grad_norm": 0.5806180238723755, + "learning_rate": 4.739336492890996e-08, + "loss": 0.3727, + "step": 631 + }, + { + "epoch": 0.9984202211690363, + "grad_norm": 0.5514349341392517, + "learning_rate": 3.15955766192733e-08, + "loss": 0.4634, + "step": 632 + }, + { + "epoch": 1.0, + "grad_norm": 0.4094119668006897, + "learning_rate": 1.579778830963665e-08, + "loss": 0.2044, + "step": 633 + } + ], + "logging_steps": 1.0, + "max_steps": 633, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 0, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.9805266972408545e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}