{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 0, "global_step": 633, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001579778830963665, "grad_norm": 3.3021833896636963, "learning_rate": 1e-05, "loss": 0.8142, "step": 1 }, { "epoch": 0.00315955766192733, "grad_norm": 0.5667713284492493, "learning_rate": 9.984202211690363e-06, "loss": 0.4081, "step": 2 }, { "epoch": 0.004739336492890996, "grad_norm": 7.904314994812012, "learning_rate": 9.968404423380728e-06, "loss": 1.1876, "step": 3 }, { "epoch": 0.00631911532385466, "grad_norm": 10.157713890075684, "learning_rate": 9.95260663507109e-06, "loss": 1.4092, "step": 4 }, { "epoch": 0.007898894154818325, "grad_norm": 4.723056316375732, "learning_rate": 9.936808846761454e-06, "loss": 0.7578, "step": 5 }, { "epoch": 0.009478672985781991, "grad_norm": 7.033465385437012, "learning_rate": 9.921011058451816e-06, "loss": 0.5175, "step": 6 }, { "epoch": 0.011058451816745656, "grad_norm": 0.800440788269043, "learning_rate": 9.905213270142182e-06, "loss": 0.4077, "step": 7 }, { "epoch": 0.01263823064770932, "grad_norm": 0.6944026350975037, "learning_rate": 9.889415481832544e-06, "loss": 0.4686, "step": 8 }, { "epoch": 0.014218009478672985, "grad_norm": 0.5700448751449585, "learning_rate": 9.873617693522908e-06, "loss": 0.3623, "step": 9 }, { "epoch": 0.01579778830963665, "grad_norm": 0.7115408778190613, "learning_rate": 9.85781990521327e-06, "loss": 0.4727, "step": 10 }, { "epoch": 0.017377567140600316, "grad_norm": 0.5764197707176208, "learning_rate": 9.842022116903635e-06, "loss": 0.4054, "step": 11 }, { "epoch": 0.018957345971563982, "grad_norm": 0.615205705165863, "learning_rate": 9.826224328593997e-06, "loss": 0.3798, "step": 12 }, { "epoch": 0.020537124802527645, "grad_norm": 0.6402739882469177, "learning_rate": 9.810426540284361e-06, "loss": 0.3966, "step": 13 }, { "epoch": 0.022116903633491312, "grad_norm": 0.6007937788963318, "learning_rate": 9.794628751974725e-06, "loss": 0.4158, "step": 14 }, { "epoch": 0.023696682464454975, "grad_norm": 0.5462563037872314, "learning_rate": 9.778830963665089e-06, "loss": 0.4795, "step": 15 }, { "epoch": 0.02527646129541864, "grad_norm": 0.6038461923599243, "learning_rate": 9.76303317535545e-06, "loss": 0.4142, "step": 16 }, { "epoch": 0.026856240126382307, "grad_norm": 0.514258861541748, "learning_rate": 9.747235387045815e-06, "loss": 0.4139, "step": 17 }, { "epoch": 0.02843601895734597, "grad_norm": 0.728235125541687, "learning_rate": 9.731437598736178e-06, "loss": 0.3129, "step": 18 }, { "epoch": 0.030015797788309637, "grad_norm": 0.7013534307479858, "learning_rate": 9.715639810426542e-06, "loss": 0.4275, "step": 19 }, { "epoch": 0.0315955766192733, "grad_norm": 0.6062476634979248, "learning_rate": 9.699842022116904e-06, "loss": 0.3961, "step": 20 }, { "epoch": 0.03317535545023697, "grad_norm": 0.6089779138565063, "learning_rate": 9.684044233807268e-06, "loss": 0.4972, "step": 21 }, { "epoch": 0.03475513428120063, "grad_norm": 0.6651365756988525, "learning_rate": 9.668246445497632e-06, "loss": 0.4714, "step": 22 }, { "epoch": 0.036334913112164295, "grad_norm": 0.6064260601997375, "learning_rate": 9.652448657187995e-06, "loss": 0.4358, "step": 23 }, { "epoch": 0.037914691943127965, "grad_norm": 0.5868542790412903, "learning_rate": 9.636650868878358e-06, "loss": 0.5178, "step": 24 }, { "epoch": 0.03949447077409163, "grad_norm": 0.6516690850257874, "learning_rate": 9.620853080568721e-06, "loss": 0.4281, "step": 25 }, { "epoch": 0.04107424960505529, "grad_norm": 0.7721027731895447, "learning_rate": 9.605055292259085e-06, "loss": 0.4979, "step": 26 }, { "epoch": 0.04265402843601896, "grad_norm": 0.6200973987579346, "learning_rate": 9.589257503949447e-06, "loss": 0.347, "step": 27 }, { "epoch": 0.044233807266982623, "grad_norm": 0.6557235717773438, "learning_rate": 9.573459715639811e-06, "loss": 0.3422, "step": 28 }, { "epoch": 0.045813586097946286, "grad_norm": 1.0422502756118774, "learning_rate": 9.557661927330175e-06, "loss": 0.4955, "step": 29 }, { "epoch": 0.04739336492890995, "grad_norm": 0.8272190093994141, "learning_rate": 9.541864139020539e-06, "loss": 0.434, "step": 30 }, { "epoch": 0.04897314375987362, "grad_norm": 0.5929948091506958, "learning_rate": 9.5260663507109e-06, "loss": 0.5042, "step": 31 }, { "epoch": 0.05055292259083728, "grad_norm": 0.7872880101203918, "learning_rate": 9.510268562401264e-06, "loss": 0.5175, "step": 32 }, { "epoch": 0.052132701421800945, "grad_norm": 0.6884463429450989, "learning_rate": 9.494470774091628e-06, "loss": 0.5104, "step": 33 }, { "epoch": 0.053712480252764615, "grad_norm": 1.215976357460022, "learning_rate": 9.478672985781992e-06, "loss": 0.4742, "step": 34 }, { "epoch": 0.05529225908372828, "grad_norm": 0.7471550107002258, "learning_rate": 9.462875197472354e-06, "loss": 0.4374, "step": 35 }, { "epoch": 0.05687203791469194, "grad_norm": 0.6779741048812866, "learning_rate": 9.447077409162718e-06, "loss": 0.4337, "step": 36 }, { "epoch": 0.05845181674565561, "grad_norm": 0.5205997824668884, "learning_rate": 9.431279620853082e-06, "loss": 0.4296, "step": 37 }, { "epoch": 0.06003159557661927, "grad_norm": 0.381757527589798, "learning_rate": 9.415481832543445e-06, "loss": 0.2223, "step": 38 }, { "epoch": 0.061611374407582936, "grad_norm": 0.650593101978302, "learning_rate": 9.399684044233807e-06, "loss": 0.5066, "step": 39 }, { "epoch": 0.0631911532385466, "grad_norm": 0.5445153117179871, "learning_rate": 9.383886255924171e-06, "loss": 0.4998, "step": 40 }, { "epoch": 0.06477093206951026, "grad_norm": 0.5024020671844482, "learning_rate": 9.368088467614535e-06, "loss": 0.4121, "step": 41 }, { "epoch": 0.06635071090047394, "grad_norm": 0.6259915232658386, "learning_rate": 9.352290679304899e-06, "loss": 0.4969, "step": 42 }, { "epoch": 0.0679304897314376, "grad_norm": 0.49405789375305176, "learning_rate": 9.336492890995261e-06, "loss": 0.4121, "step": 43 }, { "epoch": 0.06951026856240126, "grad_norm": 0.7586628198623657, "learning_rate": 9.320695102685625e-06, "loss": 0.4782, "step": 44 }, { "epoch": 0.07109004739336493, "grad_norm": 0.6203773021697998, "learning_rate": 9.304897314375988e-06, "loss": 0.3579, "step": 45 }, { "epoch": 0.07266982622432859, "grad_norm": 0.6982845067977905, "learning_rate": 9.289099526066352e-06, "loss": 0.3876, "step": 46 }, { "epoch": 0.07424960505529225, "grad_norm": 0.5712842345237732, "learning_rate": 9.273301737756714e-06, "loss": 0.4288, "step": 47 }, { "epoch": 0.07582938388625593, "grad_norm": 0.6829891204833984, "learning_rate": 9.257503949447078e-06, "loss": 0.4939, "step": 48 }, { "epoch": 0.07740916271721959, "grad_norm": 0.5508958101272583, "learning_rate": 9.241706161137442e-06, "loss": 0.372, "step": 49 }, { "epoch": 0.07898894154818326, "grad_norm": 0.9345032572746277, "learning_rate": 9.225908372827806e-06, "loss": 0.4896, "step": 50 }, { "epoch": 0.08056872037914692, "grad_norm": 0.6280492544174194, "learning_rate": 9.210110584518168e-06, "loss": 0.4375, "step": 51 }, { "epoch": 0.08214849921011058, "grad_norm": 0.6853601336479187, "learning_rate": 9.194312796208532e-06, "loss": 0.4294, "step": 52 }, { "epoch": 0.08372827804107424, "grad_norm": 0.6665984392166138, "learning_rate": 9.178515007898895e-06, "loss": 0.5894, "step": 53 }, { "epoch": 0.08530805687203792, "grad_norm": 0.5088407397270203, "learning_rate": 9.162717219589257e-06, "loss": 0.3853, "step": 54 }, { "epoch": 0.08688783570300158, "grad_norm": 0.5319867730140686, "learning_rate": 9.146919431279621e-06, "loss": 0.4791, "step": 55 }, { "epoch": 0.08846761453396525, "grad_norm": 0.6452597975730896, "learning_rate": 9.131121642969985e-06, "loss": 0.4056, "step": 56 }, { "epoch": 0.09004739336492891, "grad_norm": 0.6769601106643677, "learning_rate": 9.115323854660349e-06, "loss": 0.4253, "step": 57 }, { "epoch": 0.09162717219589257, "grad_norm": 0.5170547962188721, "learning_rate": 9.09952606635071e-06, "loss": 0.4211, "step": 58 }, { "epoch": 0.09320695102685624, "grad_norm": 0.5035193562507629, "learning_rate": 9.083728278041075e-06, "loss": 0.3144, "step": 59 }, { "epoch": 0.0947867298578199, "grad_norm": 0.5919070243835449, "learning_rate": 9.067930489731438e-06, "loss": 0.4533, "step": 60 }, { "epoch": 0.09636650868878358, "grad_norm": 0.6510637998580933, "learning_rate": 9.052132701421802e-06, "loss": 0.4701, "step": 61 }, { "epoch": 0.09794628751974724, "grad_norm": 0.5784177780151367, "learning_rate": 9.036334913112164e-06, "loss": 0.3896, "step": 62 }, { "epoch": 0.0995260663507109, "grad_norm": 0.7009139060974121, "learning_rate": 9.020537124802528e-06, "loss": 0.5018, "step": 63 }, { "epoch": 0.10110584518167456, "grad_norm": 0.5086057186126709, "learning_rate": 9.004739336492892e-06, "loss": 0.4305, "step": 64 }, { "epoch": 0.10268562401263823, "grad_norm": 0.5124595761299133, "learning_rate": 8.988941548183256e-06, "loss": 0.4473, "step": 65 }, { "epoch": 0.10426540284360189, "grad_norm": 0.6409702897071838, "learning_rate": 8.973143759873618e-06, "loss": 0.429, "step": 66 }, { "epoch": 0.10584518167456557, "grad_norm": 0.5651409029960632, "learning_rate": 8.957345971563981e-06, "loss": 0.4036, "step": 67 }, { "epoch": 0.10742496050552923, "grad_norm": 0.6658238172531128, "learning_rate": 8.941548183254345e-06, "loss": 0.4726, "step": 68 }, { "epoch": 0.10900473933649289, "grad_norm": 0.444815993309021, "learning_rate": 8.925750394944709e-06, "loss": 0.4016, "step": 69 }, { "epoch": 0.11058451816745656, "grad_norm": 0.5855506658554077, "learning_rate": 8.909952606635071e-06, "loss": 0.4531, "step": 70 }, { "epoch": 0.11216429699842022, "grad_norm": 0.693794310092926, "learning_rate": 8.894154818325435e-06, "loss": 0.4382, "step": 71 }, { "epoch": 0.11374407582938388, "grad_norm": 0.6658089756965637, "learning_rate": 8.878357030015799e-06, "loss": 0.4571, "step": 72 }, { "epoch": 0.11532385466034756, "grad_norm": 1.0504828691482544, "learning_rate": 8.862559241706162e-06, "loss": 0.4311, "step": 73 }, { "epoch": 0.11690363349131122, "grad_norm": 0.5297814607620239, "learning_rate": 8.846761453396524e-06, "loss": 0.4391, "step": 74 }, { "epoch": 0.11848341232227488, "grad_norm": 0.6601409316062927, "learning_rate": 8.830963665086888e-06, "loss": 0.5125, "step": 75 }, { "epoch": 0.12006319115323855, "grad_norm": 0.6345618963241577, "learning_rate": 8.815165876777252e-06, "loss": 0.4471, "step": 76 }, { "epoch": 0.12164296998420221, "grad_norm": 0.5008222460746765, "learning_rate": 8.799368088467614e-06, "loss": 0.3845, "step": 77 }, { "epoch": 0.12322274881516587, "grad_norm": 0.5394203066825867, "learning_rate": 8.783570300157978e-06, "loss": 0.4117, "step": 78 }, { "epoch": 0.12480252764612954, "grad_norm": 0.6255345940589905, "learning_rate": 8.767772511848342e-06, "loss": 0.512, "step": 79 }, { "epoch": 0.1263823064770932, "grad_norm": 0.6215748190879822, "learning_rate": 8.751974723538705e-06, "loss": 0.509, "step": 80 }, { "epoch": 0.12796208530805686, "grad_norm": 0.611587405204773, "learning_rate": 8.736176935229068e-06, "loss": 0.4036, "step": 81 }, { "epoch": 0.12954186413902052, "grad_norm": 0.5373330116271973, "learning_rate": 8.720379146919431e-06, "loss": 0.393, "step": 82 }, { "epoch": 0.13112164296998421, "grad_norm": 0.5936598181724548, "learning_rate": 8.704581358609795e-06, "loss": 0.4092, "step": 83 }, { "epoch": 0.13270142180094788, "grad_norm": 0.576614260673523, "learning_rate": 8.688783570300159e-06, "loss": 0.5513, "step": 84 }, { "epoch": 0.13428120063191154, "grad_norm": 0.5715078711509705, "learning_rate": 8.672985781990521e-06, "loss": 0.4403, "step": 85 }, { "epoch": 0.1358609794628752, "grad_norm": 0.6212042570114136, "learning_rate": 8.657187993680885e-06, "loss": 0.391, "step": 86 }, { "epoch": 0.13744075829383887, "grad_norm": 0.5439122319221497, "learning_rate": 8.641390205371249e-06, "loss": 0.4764, "step": 87 }, { "epoch": 0.13902053712480253, "grad_norm": 0.6808428168296814, "learning_rate": 8.625592417061612e-06, "loss": 0.512, "step": 88 }, { "epoch": 0.1406003159557662, "grad_norm": 0.7429847717285156, "learning_rate": 8.609794628751974e-06, "loss": 0.3834, "step": 89 }, { "epoch": 0.14218009478672985, "grad_norm": 0.6030511260032654, "learning_rate": 8.59399684044234e-06, "loss": 0.4631, "step": 90 }, { "epoch": 0.14375987361769352, "grad_norm": 0.6499682068824768, "learning_rate": 8.578199052132702e-06, "loss": 0.4484, "step": 91 }, { "epoch": 0.14533965244865718, "grad_norm": 0.6490275859832764, "learning_rate": 8.562401263823066e-06, "loss": 0.414, "step": 92 }, { "epoch": 0.14691943127962084, "grad_norm": 0.6859791874885559, "learning_rate": 8.546603475513428e-06, "loss": 0.386, "step": 93 }, { "epoch": 0.1484992101105845, "grad_norm": 0.5281291007995605, "learning_rate": 8.530805687203793e-06, "loss": 0.4036, "step": 94 }, { "epoch": 0.1500789889415482, "grad_norm": 0.5261964797973633, "learning_rate": 8.515007898894155e-06, "loss": 0.33, "step": 95 }, { "epoch": 0.15165876777251186, "grad_norm": 0.4350665211677551, "learning_rate": 8.499210110584519e-06, "loss": 0.3347, "step": 96 }, { "epoch": 0.15323854660347552, "grad_norm": 0.8448456525802612, "learning_rate": 8.483412322274883e-06, "loss": 0.4253, "step": 97 }, { "epoch": 0.15481832543443919, "grad_norm": 0.6256837248802185, "learning_rate": 8.467614533965247e-06, "loss": 0.4464, "step": 98 }, { "epoch": 0.15639810426540285, "grad_norm": 0.7007749676704407, "learning_rate": 8.451816745655609e-06, "loss": 0.4641, "step": 99 }, { "epoch": 0.1579778830963665, "grad_norm": 0.6551494002342224, "learning_rate": 8.436018957345973e-06, "loss": 0.5097, "step": 100 }, { "epoch": 0.15955766192733017, "grad_norm": 0.5944113731384277, "learning_rate": 8.420221169036336e-06, "loss": 0.4554, "step": 101 }, { "epoch": 0.16113744075829384, "grad_norm": 0.5755615234375, "learning_rate": 8.4044233807267e-06, "loss": 0.443, "step": 102 }, { "epoch": 0.1627172195892575, "grad_norm": 0.5263962745666504, "learning_rate": 8.388625592417062e-06, "loss": 0.4355, "step": 103 }, { "epoch": 0.16429699842022116, "grad_norm": 0.6115814447402954, "learning_rate": 8.372827804107424e-06, "loss": 0.4863, "step": 104 }, { "epoch": 0.16587677725118483, "grad_norm": 0.5544970631599426, "learning_rate": 8.35703001579779e-06, "loss": 0.3979, "step": 105 }, { "epoch": 0.1674565560821485, "grad_norm": 0.5588533878326416, "learning_rate": 8.341232227488152e-06, "loss": 0.4073, "step": 106 }, { "epoch": 0.16903633491311215, "grad_norm": 0.578982949256897, "learning_rate": 8.325434439178516e-06, "loss": 0.3745, "step": 107 }, { "epoch": 0.17061611374407584, "grad_norm": 0.4955246150493622, "learning_rate": 8.30963665086888e-06, "loss": 0.438, "step": 108 }, { "epoch": 0.1721958925750395, "grad_norm": 0.593362033367157, "learning_rate": 8.293838862559243e-06, "loss": 0.4161, "step": 109 }, { "epoch": 0.17377567140600317, "grad_norm": 0.5000883340835571, "learning_rate": 8.278041074249605e-06, "loss": 0.432, "step": 110 }, { "epoch": 0.17535545023696683, "grad_norm": 0.5794082880020142, "learning_rate": 8.262243285939969e-06, "loss": 0.4431, "step": 111 }, { "epoch": 0.1769352290679305, "grad_norm": 0.6179563999176025, "learning_rate": 8.246445497630333e-06, "loss": 0.3871, "step": 112 }, { "epoch": 0.17851500789889416, "grad_norm": 0.6540956497192383, "learning_rate": 8.230647709320697e-06, "loss": 0.3706, "step": 113 }, { "epoch": 0.18009478672985782, "grad_norm": 0.7029737234115601, "learning_rate": 8.214849921011059e-06, "loss": 0.5077, "step": 114 }, { "epoch": 0.18167456556082148, "grad_norm": 0.5466600656509399, "learning_rate": 8.199052132701422e-06, "loss": 0.4634, "step": 115 }, { "epoch": 0.18325434439178515, "grad_norm": 0.5513831973075867, "learning_rate": 8.183254344391786e-06, "loss": 0.4457, "step": 116 }, { "epoch": 0.1848341232227488, "grad_norm": 0.7652455568313599, "learning_rate": 8.16745655608215e-06, "loss": 0.4376, "step": 117 }, { "epoch": 0.18641390205371247, "grad_norm": 0.6213077902793884, "learning_rate": 8.151658767772512e-06, "loss": 0.3988, "step": 118 }, { "epoch": 0.18799368088467613, "grad_norm": 0.50051349401474, "learning_rate": 8.135860979462876e-06, "loss": 0.4142, "step": 119 }, { "epoch": 0.1895734597156398, "grad_norm": 0.8015328049659729, "learning_rate": 8.12006319115324e-06, "loss": 0.4474, "step": 120 }, { "epoch": 0.1911532385466035, "grad_norm": 0.6595532298088074, "learning_rate": 8.104265402843603e-06, "loss": 0.5173, "step": 121 }, { "epoch": 0.19273301737756715, "grad_norm": 0.7859697937965393, "learning_rate": 8.088467614533966e-06, "loss": 0.4465, "step": 122 }, { "epoch": 0.1943127962085308, "grad_norm": 0.6508023738861084, "learning_rate": 8.07266982622433e-06, "loss": 0.4448, "step": 123 }, { "epoch": 0.19589257503949448, "grad_norm": 0.49232304096221924, "learning_rate": 8.056872037914693e-06, "loss": 0.4005, "step": 124 }, { "epoch": 0.19747235387045814, "grad_norm": 0.6464349031448364, "learning_rate": 8.041074249605057e-06, "loss": 0.47, "step": 125 }, { "epoch": 0.1990521327014218, "grad_norm": 0.5296919345855713, "learning_rate": 8.025276461295419e-06, "loss": 0.4247, "step": 126 }, { "epoch": 0.20063191153238547, "grad_norm": 0.6270297765731812, "learning_rate": 8.009478672985783e-06, "loss": 0.5397, "step": 127 }, { "epoch": 0.20221169036334913, "grad_norm": 0.6148909330368042, "learning_rate": 7.993680884676147e-06, "loss": 0.4133, "step": 128 }, { "epoch": 0.2037914691943128, "grad_norm": 0.7778130173683167, "learning_rate": 7.977883096366509e-06, "loss": 0.5119, "step": 129 }, { "epoch": 0.20537124802527645, "grad_norm": 0.47952044010162354, "learning_rate": 7.962085308056872e-06, "loss": 0.386, "step": 130 }, { "epoch": 0.20695102685624012, "grad_norm": 0.5951160788536072, "learning_rate": 7.946287519747236e-06, "loss": 0.5101, "step": 131 }, { "epoch": 0.20853080568720378, "grad_norm": 0.6209789514541626, "learning_rate": 7.9304897314376e-06, "loss": 0.4988, "step": 132 }, { "epoch": 0.21011058451816747, "grad_norm": 0.5093654990196228, "learning_rate": 7.914691943127962e-06, "loss": 0.374, "step": 133 }, { "epoch": 0.21169036334913113, "grad_norm": 0.5125884413719177, "learning_rate": 7.898894154818326e-06, "loss": 0.4097, "step": 134 }, { "epoch": 0.2132701421800948, "grad_norm": 0.5116066932678223, "learning_rate": 7.88309636650869e-06, "loss": 0.4643, "step": 135 }, { "epoch": 0.21484992101105846, "grad_norm": 0.5778034329414368, "learning_rate": 7.867298578199053e-06, "loss": 0.4645, "step": 136 }, { "epoch": 0.21642969984202212, "grad_norm": 0.6490422487258911, "learning_rate": 7.851500789889415e-06, "loss": 0.4825, "step": 137 }, { "epoch": 0.21800947867298578, "grad_norm": 0.644008219242096, "learning_rate": 7.83570300157978e-06, "loss": 0.3954, "step": 138 }, { "epoch": 0.21958925750394945, "grad_norm": 0.8628047704696655, "learning_rate": 7.819905213270143e-06, "loss": 0.5322, "step": 139 }, { "epoch": 0.2211690363349131, "grad_norm": 0.6286507844924927, "learning_rate": 7.804107424960507e-06, "loss": 0.3741, "step": 140 }, { "epoch": 0.22274881516587677, "grad_norm": 0.6210809350013733, "learning_rate": 7.788309636650869e-06, "loss": 0.4572, "step": 141 }, { "epoch": 0.22432859399684044, "grad_norm": 0.5337722897529602, "learning_rate": 7.772511848341233e-06, "loss": 0.3788, "step": 142 }, { "epoch": 0.2259083728278041, "grad_norm": 0.5743194818496704, "learning_rate": 7.756714060031596e-06, "loss": 0.3963, "step": 143 }, { "epoch": 0.22748815165876776, "grad_norm": 0.4972652792930603, "learning_rate": 7.74091627172196e-06, "loss": 0.2906, "step": 144 }, { "epoch": 0.22906793048973143, "grad_norm": 0.5239664316177368, "learning_rate": 7.725118483412322e-06, "loss": 0.4009, "step": 145 }, { "epoch": 0.23064770932069512, "grad_norm": 0.5151936411857605, "learning_rate": 7.709320695102686e-06, "loss": 0.4208, "step": 146 }, { "epoch": 0.23222748815165878, "grad_norm": 0.6128547191619873, "learning_rate": 7.69352290679305e-06, "loss": 0.4779, "step": 147 }, { "epoch": 0.23380726698262244, "grad_norm": 0.5268502235412598, "learning_rate": 7.677725118483414e-06, "loss": 0.4219, "step": 148 }, { "epoch": 0.2353870458135861, "grad_norm": 0.5439866185188293, "learning_rate": 7.661927330173776e-06, "loss": 0.4436, "step": 149 }, { "epoch": 0.23696682464454977, "grad_norm": 0.5291867852210999, "learning_rate": 7.64612954186414e-06, "loss": 0.407, "step": 150 }, { "epoch": 0.23854660347551343, "grad_norm": 0.6638155579566956, "learning_rate": 7.630331753554503e-06, "loss": 0.403, "step": 151 }, { "epoch": 0.2401263823064771, "grad_norm": 0.5501230955123901, "learning_rate": 7.614533965244867e-06, "loss": 0.5004, "step": 152 }, { "epoch": 0.24170616113744076, "grad_norm": 0.5949499011039734, "learning_rate": 7.59873617693523e-06, "loss": 0.4708, "step": 153 }, { "epoch": 0.24328593996840442, "grad_norm": 0.5841517448425293, "learning_rate": 7.582938388625593e-06, "loss": 0.4836, "step": 154 }, { "epoch": 0.24486571879936808, "grad_norm": 0.6298154592514038, "learning_rate": 7.567140600315957e-06, "loss": 0.4728, "step": 155 }, { "epoch": 0.24644549763033174, "grad_norm": 0.6107637882232666, "learning_rate": 7.55134281200632e-06, "loss": 0.4243, "step": 156 }, { "epoch": 0.2480252764612954, "grad_norm": 0.5174968838691711, "learning_rate": 7.535545023696683e-06, "loss": 0.4657, "step": 157 }, { "epoch": 0.24960505529225907, "grad_norm": 0.5588591694831848, "learning_rate": 7.519747235387046e-06, "loss": 0.4567, "step": 158 }, { "epoch": 0.25118483412322273, "grad_norm": 0.8415222764015198, "learning_rate": 7.50394944707741e-06, "loss": 0.4625, "step": 159 }, { "epoch": 0.2527646129541864, "grad_norm": 0.6054974794387817, "learning_rate": 7.488151658767773e-06, "loss": 0.3843, "step": 160 }, { "epoch": 0.25434439178515006, "grad_norm": 0.5117557644844055, "learning_rate": 7.472353870458137e-06, "loss": 0.3887, "step": 161 }, { "epoch": 0.2559241706161137, "grad_norm": 0.5849332213401794, "learning_rate": 7.4565560821485e-06, "loss": 0.4528, "step": 162 }, { "epoch": 0.2575039494470774, "grad_norm": 0.5625325441360474, "learning_rate": 7.4407582938388635e-06, "loss": 0.4542, "step": 163 }, { "epoch": 0.25908372827804105, "grad_norm": 0.5406492352485657, "learning_rate": 7.4249605055292264e-06, "loss": 0.4592, "step": 164 }, { "epoch": 0.26066350710900477, "grad_norm": 0.6318654417991638, "learning_rate": 7.40916271721959e-06, "loss": 0.4361, "step": 165 }, { "epoch": 0.26224328593996843, "grad_norm": 0.5719902515411377, "learning_rate": 7.393364928909953e-06, "loss": 0.4799, "step": 166 }, { "epoch": 0.2638230647709321, "grad_norm": 0.5211177468299866, "learning_rate": 7.377567140600317e-06, "loss": 0.33, "step": 167 }, { "epoch": 0.26540284360189575, "grad_norm": 0.6400920152664185, "learning_rate": 7.36176935229068e-06, "loss": 0.4235, "step": 168 }, { "epoch": 0.2669826224328594, "grad_norm": 0.5302186608314514, "learning_rate": 7.345971563981044e-06, "loss": 0.4342, "step": 169 }, { "epoch": 0.2685624012638231, "grad_norm": 0.5393325686454773, "learning_rate": 7.3301737756714066e-06, "loss": 0.3632, "step": 170 }, { "epoch": 0.27014218009478674, "grad_norm": 0.5409063696861267, "learning_rate": 7.31437598736177e-06, "loss": 0.4076, "step": 171 }, { "epoch": 0.2717219589257504, "grad_norm": 0.5056774616241455, "learning_rate": 7.298578199052133e-06, "loss": 0.4821, "step": 172 }, { "epoch": 0.27330173775671407, "grad_norm": 0.6061700582504272, "learning_rate": 7.282780410742497e-06, "loss": 0.5137, "step": 173 }, { "epoch": 0.27488151658767773, "grad_norm": 0.5524815917015076, "learning_rate": 7.26698262243286e-06, "loss": 0.4116, "step": 174 }, { "epoch": 0.2764612954186414, "grad_norm": 0.5045567750930786, "learning_rate": 7.251184834123224e-06, "loss": 0.3969, "step": 175 }, { "epoch": 0.27804107424960506, "grad_norm": 0.604505717754364, "learning_rate": 7.235387045813587e-06, "loss": 0.5176, "step": 176 }, { "epoch": 0.2796208530805687, "grad_norm": 0.6067575812339783, "learning_rate": 7.2195892575039505e-06, "loss": 0.4438, "step": 177 }, { "epoch": 0.2812006319115324, "grad_norm": 0.6412494778633118, "learning_rate": 7.203791469194313e-06, "loss": 0.4758, "step": 178 }, { "epoch": 0.28278041074249605, "grad_norm": 0.5432886481285095, "learning_rate": 7.187993680884676e-06, "loss": 0.4387, "step": 179 }, { "epoch": 0.2843601895734597, "grad_norm": 0.4622472822666168, "learning_rate": 7.17219589257504e-06, "loss": 0.4775, "step": 180 }, { "epoch": 0.2859399684044234, "grad_norm": 0.643259584903717, "learning_rate": 7.156398104265403e-06, "loss": 0.4479, "step": 181 }, { "epoch": 0.28751974723538704, "grad_norm": 0.48998138308525085, "learning_rate": 7.140600315955767e-06, "loss": 0.399, "step": 182 }, { "epoch": 0.2890995260663507, "grad_norm": 0.5146614909172058, "learning_rate": 7.12480252764613e-06, "loss": 0.4475, "step": 183 }, { "epoch": 0.29067930489731436, "grad_norm": 0.5386670231819153, "learning_rate": 7.1090047393364935e-06, "loss": 0.3892, "step": 184 }, { "epoch": 0.292259083728278, "grad_norm": 0.5147759318351746, "learning_rate": 7.0932069510268565e-06, "loss": 0.3755, "step": 185 }, { "epoch": 0.2938388625592417, "grad_norm": 0.5141321420669556, "learning_rate": 7.07740916271722e-06, "loss": 0.355, "step": 186 }, { "epoch": 0.29541864139020535, "grad_norm": 0.9518134593963623, "learning_rate": 7.061611374407583e-06, "loss": 0.4021, "step": 187 }, { "epoch": 0.296998420221169, "grad_norm": 0.5844981670379639, "learning_rate": 7.045813586097947e-06, "loss": 0.4233, "step": 188 }, { "epoch": 0.2985781990521327, "grad_norm": 0.6381546854972839, "learning_rate": 7.03001579778831e-06, "loss": 0.4862, "step": 189 }, { "epoch": 0.3001579778830964, "grad_norm": 0.7311195135116577, "learning_rate": 7.014218009478674e-06, "loss": 0.4822, "step": 190 }, { "epoch": 0.30173775671406006, "grad_norm": 0.5827596783638, "learning_rate": 6.998420221169037e-06, "loss": 0.4027, "step": 191 }, { "epoch": 0.3033175355450237, "grad_norm": 0.6907688975334167, "learning_rate": 6.9826224328594e-06, "loss": 0.4374, "step": 192 }, { "epoch": 0.3048973143759874, "grad_norm": 0.5060120820999146, "learning_rate": 6.966824644549763e-06, "loss": 0.4226, "step": 193 }, { "epoch": 0.30647709320695105, "grad_norm": 0.41480544209480286, "learning_rate": 6.951026856240127e-06, "loss": 0.3766, "step": 194 }, { "epoch": 0.3080568720379147, "grad_norm": 0.5637404322624207, "learning_rate": 6.93522906793049e-06, "loss": 0.4365, "step": 195 }, { "epoch": 0.30963665086887837, "grad_norm": 0.6389409899711609, "learning_rate": 6.919431279620854e-06, "loss": 0.4186, "step": 196 }, { "epoch": 0.31121642969984203, "grad_norm": 0.48588162660598755, "learning_rate": 6.903633491311217e-06, "loss": 0.4023, "step": 197 }, { "epoch": 0.3127962085308057, "grad_norm": 0.6066514253616333, "learning_rate": 6.8878357030015805e-06, "loss": 0.4652, "step": 198 }, { "epoch": 0.31437598736176936, "grad_norm": 0.6308689117431641, "learning_rate": 6.8720379146919435e-06, "loss": 0.3885, "step": 199 }, { "epoch": 0.315955766192733, "grad_norm": 0.4883437752723694, "learning_rate": 6.856240126382307e-06, "loss": 0.4128, "step": 200 }, { "epoch": 0.3175355450236967, "grad_norm": 0.720086932182312, "learning_rate": 6.84044233807267e-06, "loss": 0.4333, "step": 201 }, { "epoch": 0.31911532385466035, "grad_norm": 0.6698761582374573, "learning_rate": 6.824644549763034e-06, "loss": 0.3967, "step": 202 }, { "epoch": 0.320695102685624, "grad_norm": 0.5240082740783691, "learning_rate": 6.808846761453397e-06, "loss": 0.4055, "step": 203 }, { "epoch": 0.3222748815165877, "grad_norm": 0.6142946481704712, "learning_rate": 6.79304897314376e-06, "loss": 0.3645, "step": 204 }, { "epoch": 0.32385466034755134, "grad_norm": 0.6439379453659058, "learning_rate": 6.777251184834124e-06, "loss": 0.3207, "step": 205 }, { "epoch": 0.325434439178515, "grad_norm": 0.6862720847129822, "learning_rate": 6.7614533965244865e-06, "loss": 0.4944, "step": 206 }, { "epoch": 0.32701421800947866, "grad_norm": 0.6720433235168457, "learning_rate": 6.74565560821485e-06, "loss": 0.4335, "step": 207 }, { "epoch": 0.3285939968404423, "grad_norm": 0.531577467918396, "learning_rate": 6.729857819905213e-06, "loss": 0.5327, "step": 208 }, { "epoch": 0.330173775671406, "grad_norm": 0.5542590022087097, "learning_rate": 6.714060031595577e-06, "loss": 0.3629, "step": 209 }, { "epoch": 0.33175355450236965, "grad_norm": 0.5614448189735413, "learning_rate": 6.69826224328594e-06, "loss": 0.4097, "step": 210 }, { "epoch": 0.3333333333333333, "grad_norm": 0.7383466362953186, "learning_rate": 6.682464454976304e-06, "loss": 0.5031, "step": 211 }, { "epoch": 0.334913112164297, "grad_norm": 0.6345497965812683, "learning_rate": 6.666666666666667e-06, "loss": 0.5029, "step": 212 }, { "epoch": 0.33649289099526064, "grad_norm": 0.579641580581665, "learning_rate": 6.6508688783570304e-06, "loss": 0.4949, "step": 213 }, { "epoch": 0.3380726698262243, "grad_norm": 0.5040780305862427, "learning_rate": 6.635071090047393e-06, "loss": 0.4537, "step": 214 }, { "epoch": 0.33965244865718797, "grad_norm": 0.5917491316795349, "learning_rate": 6.619273301737757e-06, "loss": 0.3883, "step": 215 }, { "epoch": 0.3412322274881517, "grad_norm": 0.7031399011611938, "learning_rate": 6.60347551342812e-06, "loss": 0.4554, "step": 216 }, { "epoch": 0.34281200631911535, "grad_norm": 0.5503798127174377, "learning_rate": 6.587677725118484e-06, "loss": 0.352, "step": 217 }, { "epoch": 0.344391785150079, "grad_norm": 0.5412716269493103, "learning_rate": 6.571879936808847e-06, "loss": 0.4191, "step": 218 }, { "epoch": 0.3459715639810427, "grad_norm": 0.6272369623184204, "learning_rate": 6.556082148499211e-06, "loss": 0.4595, "step": 219 }, { "epoch": 0.34755134281200634, "grad_norm": 0.5309504270553589, "learning_rate": 6.5402843601895735e-06, "loss": 0.4095, "step": 220 }, { "epoch": 0.34913112164297, "grad_norm": 0.5687200427055359, "learning_rate": 6.524486571879938e-06, "loss": 0.435, "step": 221 }, { "epoch": 0.35071090047393366, "grad_norm": 0.5819438099861145, "learning_rate": 6.5086887835703e-06, "loss": 0.4695, "step": 222 }, { "epoch": 0.3522906793048973, "grad_norm": 0.6310110092163086, "learning_rate": 6.492890995260665e-06, "loss": 0.4346, "step": 223 }, { "epoch": 0.353870458135861, "grad_norm": 0.5838906168937683, "learning_rate": 6.477093206951027e-06, "loss": 0.47, "step": 224 }, { "epoch": 0.35545023696682465, "grad_norm": 0.6752678155899048, "learning_rate": 6.4612954186413915e-06, "loss": 0.3842, "step": 225 }, { "epoch": 0.3570300157977883, "grad_norm": 0.7029111981391907, "learning_rate": 6.445497630331754e-06, "loss": 0.4442, "step": 226 }, { "epoch": 0.358609794628752, "grad_norm": 0.511812686920166, "learning_rate": 6.429699842022118e-06, "loss": 0.5171, "step": 227 }, { "epoch": 0.36018957345971564, "grad_norm": 0.49457868933677673, "learning_rate": 6.413902053712481e-06, "loss": 0.3695, "step": 228 }, { "epoch": 0.3617693522906793, "grad_norm": 0.4521022439002991, "learning_rate": 6.398104265402843e-06, "loss": 0.3909, "step": 229 }, { "epoch": 0.36334913112164297, "grad_norm": 0.45229026675224304, "learning_rate": 6.382306477093208e-06, "loss": 0.3417, "step": 230 }, { "epoch": 0.36492890995260663, "grad_norm": 0.5070056915283203, "learning_rate": 6.36650868878357e-06, "loss": 0.3518, "step": 231 }, { "epoch": 0.3665086887835703, "grad_norm": 0.9325531721115112, "learning_rate": 6.350710900473935e-06, "loss": 0.5172, "step": 232 }, { "epoch": 0.36808846761453395, "grad_norm": 0.6027977466583252, "learning_rate": 6.334913112164297e-06, "loss": 0.4052, "step": 233 }, { "epoch": 0.3696682464454976, "grad_norm": 0.7251097559928894, "learning_rate": 6.319115323854661e-06, "loss": 0.4739, "step": 234 }, { "epoch": 0.3712480252764613, "grad_norm": 0.6470052003860474, "learning_rate": 6.303317535545023e-06, "loss": 0.4745, "step": 235 }, { "epoch": 0.37282780410742494, "grad_norm": 0.7177411317825317, "learning_rate": 6.287519747235388e-06, "loss": 0.364, "step": 236 }, { "epoch": 0.3744075829383886, "grad_norm": 0.7681677341461182, "learning_rate": 6.271721958925751e-06, "loss": 0.4559, "step": 237 }, { "epoch": 0.37598736176935227, "grad_norm": 0.6160128116607666, "learning_rate": 6.255924170616115e-06, "loss": 0.421, "step": 238 }, { "epoch": 0.37756714060031593, "grad_norm": 0.658981442451477, "learning_rate": 6.240126382306478e-06, "loss": 0.3979, "step": 239 }, { "epoch": 0.3791469194312796, "grad_norm": 0.9422373175621033, "learning_rate": 6.2243285939968414e-06, "loss": 0.3586, "step": 240 }, { "epoch": 0.3807266982622433, "grad_norm": 0.5452501773834229, "learning_rate": 6.208530805687204e-06, "loss": 0.4209, "step": 241 }, { "epoch": 0.382306477093207, "grad_norm": 0.4912925660610199, "learning_rate": 6.192733017377568e-06, "loss": 0.4784, "step": 242 }, { "epoch": 0.38388625592417064, "grad_norm": 0.6575455665588379, "learning_rate": 6.176935229067931e-06, "loss": 0.4062, "step": 243 }, { "epoch": 0.3854660347551343, "grad_norm": 0.8840091824531555, "learning_rate": 6.161137440758295e-06, "loss": 0.4177, "step": 244 }, { "epoch": 0.38704581358609796, "grad_norm": 0.5949338674545288, "learning_rate": 6.145339652448658e-06, "loss": 0.4477, "step": 245 }, { "epoch": 0.3886255924170616, "grad_norm": 0.5938326120376587, "learning_rate": 6.1295418641390216e-06, "loss": 0.4155, "step": 246 }, { "epoch": 0.3902053712480253, "grad_norm": 0.5401394367218018, "learning_rate": 6.1137440758293845e-06, "loss": 0.3873, "step": 247 }, { "epoch": 0.39178515007898895, "grad_norm": 0.5220497846603394, "learning_rate": 6.097946287519748e-06, "loss": 0.3803, "step": 248 }, { "epoch": 0.3933649289099526, "grad_norm": 0.5426644086837769, "learning_rate": 6.082148499210111e-06, "loss": 0.3239, "step": 249 }, { "epoch": 0.3949447077409163, "grad_norm": 0.5215898156166077, "learning_rate": 6.066350710900475e-06, "loss": 0.4373, "step": 250 }, { "epoch": 0.39652448657187994, "grad_norm": 0.5694135427474976, "learning_rate": 6.050552922590838e-06, "loss": 0.4948, "step": 251 }, { "epoch": 0.3981042654028436, "grad_norm": 0.5505183339118958, "learning_rate": 6.034755134281202e-06, "loss": 0.4108, "step": 252 }, { "epoch": 0.39968404423380727, "grad_norm": 0.593190610408783, "learning_rate": 6.018957345971565e-06, "loss": 0.429, "step": 253 }, { "epoch": 0.40126382306477093, "grad_norm": 0.5409046411514282, "learning_rate": 6.003159557661928e-06, "loss": 0.4443, "step": 254 }, { "epoch": 0.4028436018957346, "grad_norm": 0.5520291328430176, "learning_rate": 5.987361769352291e-06, "loss": 0.4485, "step": 255 }, { "epoch": 0.40442338072669826, "grad_norm": 0.5622429847717285, "learning_rate": 5.971563981042654e-06, "loss": 0.4181, "step": 256 }, { "epoch": 0.4060031595576619, "grad_norm": 0.5267983078956604, "learning_rate": 5.955766192733018e-06, "loss": 0.4235, "step": 257 }, { "epoch": 0.4075829383886256, "grad_norm": 0.5384082198143005, "learning_rate": 5.939968404423381e-06, "loss": 0.4055, "step": 258 }, { "epoch": 0.40916271721958924, "grad_norm": 0.5427289605140686, "learning_rate": 5.924170616113745e-06, "loss": 0.3427, "step": 259 }, { "epoch": 0.4107424960505529, "grad_norm": 0.4936423599720001, "learning_rate": 5.908372827804108e-06, "loss": 0.4133, "step": 260 }, { "epoch": 0.41232227488151657, "grad_norm": 0.5825520753860474, "learning_rate": 5.8925750394944715e-06, "loss": 0.377, "step": 261 }, { "epoch": 0.41390205371248023, "grad_norm": 0.6343340277671814, "learning_rate": 5.876777251184834e-06, "loss": 0.441, "step": 262 }, { "epoch": 0.4154818325434439, "grad_norm": 0.5479387044906616, "learning_rate": 5.860979462875198e-06, "loss": 0.4353, "step": 263 }, { "epoch": 0.41706161137440756, "grad_norm": 0.5873805284500122, "learning_rate": 5.845181674565561e-06, "loss": 0.4293, "step": 264 }, { "epoch": 0.4186413902053712, "grad_norm": 0.6624792218208313, "learning_rate": 5.829383886255925e-06, "loss": 0.5162, "step": 265 }, { "epoch": 0.42022116903633494, "grad_norm": 0.5797149538993835, "learning_rate": 5.813586097946288e-06, "loss": 0.3651, "step": 266 }, { "epoch": 0.4218009478672986, "grad_norm": 0.5814763903617859, "learning_rate": 5.797788309636652e-06, "loss": 0.3817, "step": 267 }, { "epoch": 0.42338072669826227, "grad_norm": 0.5556735992431641, "learning_rate": 5.7819905213270145e-06, "loss": 0.4186, "step": 268 }, { "epoch": 0.42496050552922593, "grad_norm": 0.5842727422714233, "learning_rate": 5.766192733017378e-06, "loss": 0.4343, "step": 269 }, { "epoch": 0.4265402843601896, "grad_norm": 0.5401722192764282, "learning_rate": 5.750394944707741e-06, "loss": 0.4418, "step": 270 }, { "epoch": 0.42812006319115326, "grad_norm": 0.5917039513587952, "learning_rate": 5.734597156398105e-06, "loss": 0.5371, "step": 271 }, { "epoch": 0.4296998420221169, "grad_norm": 0.5991331338882446, "learning_rate": 5.718799368088468e-06, "loss": 0.4969, "step": 272 }, { "epoch": 0.4312796208530806, "grad_norm": 0.4709448218345642, "learning_rate": 5.703001579778832e-06, "loss": 0.4139, "step": 273 }, { "epoch": 0.43285939968404424, "grad_norm": 0.5746496319770813, "learning_rate": 5.687203791469195e-06, "loss": 0.4683, "step": 274 }, { "epoch": 0.4344391785150079, "grad_norm": 0.523835301399231, "learning_rate": 5.6714060031595584e-06, "loss": 0.4346, "step": 275 }, { "epoch": 0.43601895734597157, "grad_norm": 0.5292810797691345, "learning_rate": 5.655608214849921e-06, "loss": 0.463, "step": 276 }, { "epoch": 0.43759873617693523, "grad_norm": 0.6543466448783875, "learning_rate": 5.639810426540285e-06, "loss": 0.427, "step": 277 }, { "epoch": 0.4391785150078989, "grad_norm": 0.5543989539146423, "learning_rate": 5.624012638230648e-06, "loss": 0.3902, "step": 278 }, { "epoch": 0.44075829383886256, "grad_norm": 0.5905360579490662, "learning_rate": 5.608214849921012e-06, "loss": 0.4266, "step": 279 }, { "epoch": 0.4423380726698262, "grad_norm": 0.5785796046257019, "learning_rate": 5.592417061611375e-06, "loss": 0.4521, "step": 280 }, { "epoch": 0.4439178515007899, "grad_norm": 0.5580607056617737, "learning_rate": 5.576619273301738e-06, "loss": 0.378, "step": 281 }, { "epoch": 0.44549763033175355, "grad_norm": 0.5100966691970825, "learning_rate": 5.5608214849921015e-06, "loss": 0.3876, "step": 282 }, { "epoch": 0.4470774091627172, "grad_norm": 0.5704023241996765, "learning_rate": 5.5450236966824644e-06, "loss": 0.4694, "step": 283 }, { "epoch": 0.4486571879936809, "grad_norm": 0.5954383611679077, "learning_rate": 5.529225908372828e-06, "loss": 0.5049, "step": 284 }, { "epoch": 0.45023696682464454, "grad_norm": 0.5239635705947876, "learning_rate": 5.513428120063191e-06, "loss": 0.4182, "step": 285 }, { "epoch": 0.4518167456556082, "grad_norm": 0.6643552780151367, "learning_rate": 5.497630331753555e-06, "loss": 0.4434, "step": 286 }, { "epoch": 0.45339652448657186, "grad_norm": 0.6675540804862976, "learning_rate": 5.481832543443918e-06, "loss": 0.3745, "step": 287 }, { "epoch": 0.4549763033175355, "grad_norm": 0.5871401429176331, "learning_rate": 5.466034755134282e-06, "loss": 0.5527, "step": 288 }, { "epoch": 0.4565560821484992, "grad_norm": 0.5936838984489441, "learning_rate": 5.4502369668246446e-06, "loss": 0.4857, "step": 289 }, { "epoch": 0.45813586097946285, "grad_norm": 0.5998191833496094, "learning_rate": 5.434439178515008e-06, "loss": 0.4395, "step": 290 }, { "epoch": 0.4597156398104265, "grad_norm": 0.5102293491363525, "learning_rate": 5.418641390205371e-06, "loss": 0.4496, "step": 291 }, { "epoch": 0.46129541864139023, "grad_norm": 0.6297216415405273, "learning_rate": 5.402843601895735e-06, "loss": 0.3555, "step": 292 }, { "epoch": 0.4628751974723539, "grad_norm": 0.6780267953872681, "learning_rate": 5.387045813586098e-06, "loss": 0.3295, "step": 293 }, { "epoch": 0.46445497630331756, "grad_norm": 0.5788872838020325, "learning_rate": 5.371248025276462e-06, "loss": 0.4293, "step": 294 }, { "epoch": 0.4660347551342812, "grad_norm": 0.5679113268852234, "learning_rate": 5.355450236966825e-06, "loss": 0.4274, "step": 295 }, { "epoch": 0.4676145339652449, "grad_norm": 0.5739018321037292, "learning_rate": 5.3396524486571885e-06, "loss": 0.3292, "step": 296 }, { "epoch": 0.46919431279620855, "grad_norm": 0.5387299060821533, "learning_rate": 5.323854660347551e-06, "loss": 0.36, "step": 297 }, { "epoch": 0.4707740916271722, "grad_norm": 0.4877624213695526, "learning_rate": 5.308056872037915e-06, "loss": 0.403, "step": 298 }, { "epoch": 0.47235387045813587, "grad_norm": 0.5668107271194458, "learning_rate": 5.292259083728278e-06, "loss": 0.4087, "step": 299 }, { "epoch": 0.47393364928909953, "grad_norm": 0.5592719316482544, "learning_rate": 5.276461295418642e-06, "loss": 0.405, "step": 300 }, { "epoch": 0.4755134281200632, "grad_norm": 0.48879534006118774, "learning_rate": 5.260663507109005e-06, "loss": 0.3562, "step": 301 }, { "epoch": 0.47709320695102686, "grad_norm": 0.5968641042709351, "learning_rate": 5.244865718799369e-06, "loss": 0.4216, "step": 302 }, { "epoch": 0.4786729857819905, "grad_norm": 0.7803828120231628, "learning_rate": 5.2290679304897315e-06, "loss": 0.4014, "step": 303 }, { "epoch": 0.4802527646129542, "grad_norm": 0.592827558517456, "learning_rate": 5.213270142180096e-06, "loss": 0.2895, "step": 304 }, { "epoch": 0.48183254344391785, "grad_norm": 0.8070396184921265, "learning_rate": 5.197472353870458e-06, "loss": 0.3972, "step": 305 }, { "epoch": 0.4834123222748815, "grad_norm": 0.5256397724151611, "learning_rate": 5.181674565560821e-06, "loss": 0.4384, "step": 306 }, { "epoch": 0.4849921011058452, "grad_norm": 0.5307562947273254, "learning_rate": 5.165876777251185e-06, "loss": 0.3788, "step": 307 }, { "epoch": 0.48657187993680884, "grad_norm": 0.4588807225227356, "learning_rate": 5.150078988941548e-06, "loss": 0.3491, "step": 308 }, { "epoch": 0.4881516587677725, "grad_norm": 0.524919331073761, "learning_rate": 5.134281200631912e-06, "loss": 0.4375, "step": 309 }, { "epoch": 0.48973143759873616, "grad_norm": 0.6611966490745544, "learning_rate": 5.118483412322275e-06, "loss": 0.4399, "step": 310 }, { "epoch": 0.4913112164296998, "grad_norm": 0.5597748160362244, "learning_rate": 5.102685624012638e-06, "loss": 0.5073, "step": 311 }, { "epoch": 0.4928909952606635, "grad_norm": 0.8958181738853455, "learning_rate": 5.086887835703001e-06, "loss": 0.4756, "step": 312 }, { "epoch": 0.49447077409162715, "grad_norm": 0.4875742197036743, "learning_rate": 5.071090047393366e-06, "loss": 0.4424, "step": 313 }, { "epoch": 0.4960505529225908, "grad_norm": 0.6110445261001587, "learning_rate": 5.055292259083728e-06, "loss": 0.4686, "step": 314 }, { "epoch": 0.4976303317535545, "grad_norm": 0.5900540351867676, "learning_rate": 5.039494470774093e-06, "loss": 0.4, "step": 315 }, { "epoch": 0.49921011058451814, "grad_norm": 0.624906599521637, "learning_rate": 5.023696682464455e-06, "loss": 0.3967, "step": 316 }, { "epoch": 0.5007898894154819, "grad_norm": 0.6435191631317139, "learning_rate": 5.007898894154819e-06, "loss": 0.5104, "step": 317 }, { "epoch": 0.5023696682464455, "grad_norm": 0.7464382648468018, "learning_rate": 4.9921011058451815e-06, "loss": 0.4621, "step": 318 }, { "epoch": 0.5039494470774092, "grad_norm": 0.7912509441375732, "learning_rate": 4.976303317535545e-06, "loss": 0.4186, "step": 319 }, { "epoch": 0.5055292259083728, "grad_norm": 0.6150445938110352, "learning_rate": 4.960505529225908e-06, "loss": 0.469, "step": 320 }, { "epoch": 0.5071090047393365, "grad_norm": 0.5445781946182251, "learning_rate": 4.944707740916272e-06, "loss": 0.4111, "step": 321 }, { "epoch": 0.5086887835703001, "grad_norm": 0.5628255605697632, "learning_rate": 4.928909952606635e-06, "loss": 0.4884, "step": 322 }, { "epoch": 0.5102685624012638, "grad_norm": 0.5007054805755615, "learning_rate": 4.913112164296999e-06, "loss": 0.4315, "step": 323 }, { "epoch": 0.5118483412322274, "grad_norm": 0.6346699595451355, "learning_rate": 4.8973143759873624e-06, "loss": 0.4033, "step": 324 }, { "epoch": 0.5134281200631912, "grad_norm": 0.639045774936676, "learning_rate": 4.881516587677725e-06, "loss": 0.3748, "step": 325 }, { "epoch": 0.5150078988941548, "grad_norm": 0.5578002333641052, "learning_rate": 4.865718799368089e-06, "loss": 0.5055, "step": 326 }, { "epoch": 0.5165876777251185, "grad_norm": 0.5281325578689575, "learning_rate": 4.849921011058452e-06, "loss": 0.4307, "step": 327 }, { "epoch": 0.5181674565560821, "grad_norm": 0.6557057499885559, "learning_rate": 4.834123222748816e-06, "loss": 0.4085, "step": 328 }, { "epoch": 0.5197472353870458, "grad_norm": 0.5667731761932373, "learning_rate": 4.818325434439179e-06, "loss": 0.4774, "step": 329 }, { "epoch": 0.5213270142180095, "grad_norm": 0.5362856984138489, "learning_rate": 4.8025276461295426e-06, "loss": 0.4316, "step": 330 }, { "epoch": 0.5229067930489731, "grad_norm": 0.5326763391494751, "learning_rate": 4.7867298578199055e-06, "loss": 0.389, "step": 331 }, { "epoch": 0.5244865718799369, "grad_norm": 0.4922950565814972, "learning_rate": 4.770932069510269e-06, "loss": 0.3756, "step": 332 }, { "epoch": 0.5260663507109005, "grad_norm": 0.4961477518081665, "learning_rate": 4.755134281200632e-06, "loss": 0.4336, "step": 333 }, { "epoch": 0.5276461295418642, "grad_norm": 0.5258511304855347, "learning_rate": 4.739336492890996e-06, "loss": 0.404, "step": 334 }, { "epoch": 0.5292259083728278, "grad_norm": 0.5479301810264587, "learning_rate": 4.723538704581359e-06, "loss": 0.3578, "step": 335 }, { "epoch": 0.5308056872037915, "grad_norm": 0.49883902072906494, "learning_rate": 4.707740916271723e-06, "loss": 0.3809, "step": 336 }, { "epoch": 0.5323854660347551, "grad_norm": 0.5133053660392761, "learning_rate": 4.691943127962086e-06, "loss": 0.4091, "step": 337 }, { "epoch": 0.5339652448657188, "grad_norm": 0.6334301829338074, "learning_rate": 4.676145339652449e-06, "loss": 0.4432, "step": 338 }, { "epoch": 0.5355450236966824, "grad_norm": 0.5124396085739136, "learning_rate": 4.660347551342812e-06, "loss": 0.3557, "step": 339 }, { "epoch": 0.5371248025276462, "grad_norm": 0.5863746404647827, "learning_rate": 4.644549763033176e-06, "loss": 0.4288, "step": 340 }, { "epoch": 0.5387045813586098, "grad_norm": 0.6599943041801453, "learning_rate": 4.628751974723539e-06, "loss": 0.398, "step": 341 }, { "epoch": 0.5402843601895735, "grad_norm": 0.480027437210083, "learning_rate": 4.612954186413903e-06, "loss": 0.4706, "step": 342 }, { "epoch": 0.5418641390205371, "grad_norm": 0.6601845026016235, "learning_rate": 4.597156398104266e-06, "loss": 0.4092, "step": 343 }, { "epoch": 0.5434439178515008, "grad_norm": 0.5557224154472351, "learning_rate": 4.581358609794629e-06, "loss": 0.389, "step": 344 }, { "epoch": 0.5450236966824644, "grad_norm": 0.49160709977149963, "learning_rate": 4.5655608214849925e-06, "loss": 0.4338, "step": 345 }, { "epoch": 0.5466034755134281, "grad_norm": 0.5284649133682251, "learning_rate": 4.549763033175355e-06, "loss": 0.403, "step": 346 }, { "epoch": 0.5481832543443917, "grad_norm": 0.5501908659934998, "learning_rate": 4.533965244865719e-06, "loss": 0.4983, "step": 347 }, { "epoch": 0.5497630331753555, "grad_norm": 0.5585077404975891, "learning_rate": 4.518167456556082e-06, "loss": 0.4219, "step": 348 }, { "epoch": 0.5513428120063191, "grad_norm": 0.4565962255001068, "learning_rate": 4.502369668246446e-06, "loss": 0.3591, "step": 349 }, { "epoch": 0.5529225908372828, "grad_norm": 0.5507949590682983, "learning_rate": 4.486571879936809e-06, "loss": 0.4752, "step": 350 }, { "epoch": 0.5545023696682464, "grad_norm": 0.5490357875823975, "learning_rate": 4.470774091627173e-06, "loss": 0.4291, "step": 351 }, { "epoch": 0.5560821484992101, "grad_norm": 0.5804268717765808, "learning_rate": 4.4549763033175355e-06, "loss": 0.3113, "step": 352 }, { "epoch": 0.5576619273301737, "grad_norm": 0.4745613634586334, "learning_rate": 4.439178515007899e-06, "loss": 0.4196, "step": 353 }, { "epoch": 0.5592417061611374, "grad_norm": 0.6223664283752441, "learning_rate": 4.423380726698262e-06, "loss": 0.4592, "step": 354 }, { "epoch": 0.5608214849921012, "grad_norm": 0.8797832727432251, "learning_rate": 4.407582938388626e-06, "loss": 0.4448, "step": 355 }, { "epoch": 0.5624012638230648, "grad_norm": 0.5569826364517212, "learning_rate": 4.391785150078989e-06, "loss": 0.3873, "step": 356 }, { "epoch": 0.5639810426540285, "grad_norm": 0.4294510781764984, "learning_rate": 4.375987361769353e-06, "loss": 0.3407, "step": 357 }, { "epoch": 0.5655608214849921, "grad_norm": 0.5657434463500977, "learning_rate": 4.360189573459716e-06, "loss": 0.3345, "step": 358 }, { "epoch": 0.5671406003159558, "grad_norm": 0.5589077472686768, "learning_rate": 4.3443917851500794e-06, "loss": 0.5237, "step": 359 }, { "epoch": 0.5687203791469194, "grad_norm": 0.6107128858566284, "learning_rate": 4.328593996840442e-06, "loss": 0.4354, "step": 360 }, { "epoch": 0.5703001579778831, "grad_norm": 0.5671380758285522, "learning_rate": 4.312796208530806e-06, "loss": 0.3712, "step": 361 }, { "epoch": 0.5718799368088467, "grad_norm": 0.508173406124115, "learning_rate": 4.29699842022117e-06, "loss": 0.4097, "step": 362 }, { "epoch": 0.5734597156398105, "grad_norm": 0.6139382719993591, "learning_rate": 4.281200631911533e-06, "loss": 0.2646, "step": 363 }, { "epoch": 0.5750394944707741, "grad_norm": 0.5677220821380615, "learning_rate": 4.265402843601897e-06, "loss": 0.3748, "step": 364 }, { "epoch": 0.5766192733017378, "grad_norm": 0.530708372592926, "learning_rate": 4.2496050552922596e-06, "loss": 0.3857, "step": 365 }, { "epoch": 0.5781990521327014, "grad_norm": 1.176272988319397, "learning_rate": 4.233807266982623e-06, "loss": 0.436, "step": 366 }, { "epoch": 0.5797788309636651, "grad_norm": 0.6165753602981567, "learning_rate": 4.218009478672986e-06, "loss": 0.3898, "step": 367 }, { "epoch": 0.5813586097946287, "grad_norm": 0.47574201226234436, "learning_rate": 4.20221169036335e-06, "loss": 0.3685, "step": 368 }, { "epoch": 0.5829383886255924, "grad_norm": 0.5995083451271057, "learning_rate": 4.186413902053712e-06, "loss": 0.4686, "step": 369 }, { "epoch": 0.584518167456556, "grad_norm": 0.5809090733528137, "learning_rate": 4.170616113744076e-06, "loss": 0.4514, "step": 370 }, { "epoch": 0.5860979462875198, "grad_norm": 0.6154018044471741, "learning_rate": 4.15481832543444e-06, "loss": 0.3737, "step": 371 }, { "epoch": 0.5876777251184834, "grad_norm": 0.5799654126167297, "learning_rate": 4.139020537124803e-06, "loss": 0.4285, "step": 372 }, { "epoch": 0.5892575039494471, "grad_norm": 0.4476354420185089, "learning_rate": 4.123222748815166e-06, "loss": 0.4362, "step": 373 }, { "epoch": 0.5908372827804107, "grad_norm": 0.6266714334487915, "learning_rate": 4.107424960505529e-06, "loss": 0.4943, "step": 374 }, { "epoch": 0.5924170616113744, "grad_norm": 0.5103732347488403, "learning_rate": 4.091627172195893e-06, "loss": 0.4585, "step": 375 }, { "epoch": 0.593996840442338, "grad_norm": 0.49011877179145813, "learning_rate": 4.075829383886256e-06, "loss": 0.4489, "step": 376 }, { "epoch": 0.5955766192733017, "grad_norm": 0.5286844372749329, "learning_rate": 4.06003159557662e-06, "loss": 0.4114, "step": 377 }, { "epoch": 0.5971563981042654, "grad_norm": 0.494807630777359, "learning_rate": 4.044233807266983e-06, "loss": 0.3514, "step": 378 }, { "epoch": 0.5987361769352291, "grad_norm": 0.46120524406433105, "learning_rate": 4.0284360189573465e-06, "loss": 0.4452, "step": 379 }, { "epoch": 0.6003159557661928, "grad_norm": 0.6024404764175415, "learning_rate": 4.0126382306477095e-06, "loss": 0.4368, "step": 380 }, { "epoch": 0.6018957345971564, "grad_norm": 0.8292664885520935, "learning_rate": 3.996840442338073e-06, "loss": 0.4495, "step": 381 }, { "epoch": 0.6034755134281201, "grad_norm": 0.5312369465827942, "learning_rate": 3.981042654028436e-06, "loss": 0.3642, "step": 382 }, { "epoch": 0.6050552922590837, "grad_norm": 0.6373758316040039, "learning_rate": 3.9652448657188e-06, "loss": 0.3884, "step": 383 }, { "epoch": 0.6066350710900474, "grad_norm": 0.5623313188552856, "learning_rate": 3.949447077409163e-06, "loss": 0.3489, "step": 384 }, { "epoch": 0.608214849921011, "grad_norm": 0.5703821778297424, "learning_rate": 3.933649289099527e-06, "loss": 0.5309, "step": 385 }, { "epoch": 0.6097946287519748, "grad_norm": 0.5930938720703125, "learning_rate": 3.91785150078989e-06, "loss": 0.4072, "step": 386 }, { "epoch": 0.6113744075829384, "grad_norm": 0.5636332631111145, "learning_rate": 3.902053712480253e-06, "loss": 0.3938, "step": 387 }, { "epoch": 0.6129541864139021, "grad_norm": 0.45709583163261414, "learning_rate": 3.886255924170616e-06, "loss": 0.4436, "step": 388 }, { "epoch": 0.6145339652448657, "grad_norm": 0.5924400687217712, "learning_rate": 3.87045813586098e-06, "loss": 0.2939, "step": 389 }, { "epoch": 0.6161137440758294, "grad_norm": 0.6232696175575256, "learning_rate": 3.854660347551343e-06, "loss": 0.4183, "step": 390 }, { "epoch": 0.617693522906793, "grad_norm": 0.5407995581626892, "learning_rate": 3.838862559241707e-06, "loss": 0.3925, "step": 391 }, { "epoch": 0.6192733017377567, "grad_norm": 0.524691104888916, "learning_rate": 3.82306477093207e-06, "loss": 0.4327, "step": 392 }, { "epoch": 0.6208530805687204, "grad_norm": 0.5206206440925598, "learning_rate": 3.8072669826224335e-06, "loss": 0.4203, "step": 393 }, { "epoch": 0.6224328593996841, "grad_norm": 0.6244251132011414, "learning_rate": 3.7914691943127964e-06, "loss": 0.4546, "step": 394 }, { "epoch": 0.6240126382306477, "grad_norm": 0.707058846950531, "learning_rate": 3.77567140600316e-06, "loss": 0.4015, "step": 395 }, { "epoch": 0.6255924170616114, "grad_norm": 0.5457757115364075, "learning_rate": 3.759873617693523e-06, "loss": 0.3962, "step": 396 }, { "epoch": 0.627172195892575, "grad_norm": 0.5757611989974976, "learning_rate": 3.7440758293838865e-06, "loss": 0.4299, "step": 397 }, { "epoch": 0.6287519747235387, "grad_norm": 0.5844476819038391, "learning_rate": 3.72827804107425e-06, "loss": 0.4674, "step": 398 }, { "epoch": 0.6303317535545023, "grad_norm": 0.6859634518623352, "learning_rate": 3.7124802527646132e-06, "loss": 0.4253, "step": 399 }, { "epoch": 0.631911532385466, "grad_norm": 0.5247636437416077, "learning_rate": 3.6966824644549766e-06, "loss": 0.4318, "step": 400 }, { "epoch": 0.6334913112164297, "grad_norm": 0.6206024885177612, "learning_rate": 3.68088467614534e-06, "loss": 0.3759, "step": 401 }, { "epoch": 0.6350710900473934, "grad_norm": 0.6237459182739258, "learning_rate": 3.6650868878357033e-06, "loss": 0.3642, "step": 402 }, { "epoch": 0.636650868878357, "grad_norm": 0.8048799633979797, "learning_rate": 3.6492890995260666e-06, "loss": 0.514, "step": 403 }, { "epoch": 0.6382306477093207, "grad_norm": 0.4662720561027527, "learning_rate": 3.63349131121643e-06, "loss": 0.3654, "step": 404 }, { "epoch": 0.6398104265402843, "grad_norm": 0.5561702251434326, "learning_rate": 3.6176935229067934e-06, "loss": 0.3823, "step": 405 }, { "epoch": 0.641390205371248, "grad_norm": 0.6143206357955933, "learning_rate": 3.6018957345971567e-06, "loss": 0.3938, "step": 406 }, { "epoch": 0.6429699842022117, "grad_norm": 0.6854034662246704, "learning_rate": 3.58609794628752e-06, "loss": 0.4625, "step": 407 }, { "epoch": 0.6445497630331753, "grad_norm": 0.5590549111366272, "learning_rate": 3.5703001579778834e-06, "loss": 0.4199, "step": 408 }, { "epoch": 0.6461295418641391, "grad_norm": 0.642573356628418, "learning_rate": 3.5545023696682468e-06, "loss": 0.4366, "step": 409 }, { "epoch": 0.6477093206951027, "grad_norm": 0.5898130536079407, "learning_rate": 3.53870458135861e-06, "loss": 0.4691, "step": 410 }, { "epoch": 0.6492890995260664, "grad_norm": 0.5370688438415527, "learning_rate": 3.5229067930489735e-06, "loss": 0.45, "step": 411 }, { "epoch": 0.65086887835703, "grad_norm": 0.6769170165061951, "learning_rate": 3.507109004739337e-06, "loss": 0.3962, "step": 412 }, { "epoch": 0.6524486571879937, "grad_norm": 0.5891703367233276, "learning_rate": 3.4913112164297e-06, "loss": 0.4542, "step": 413 }, { "epoch": 0.6540284360189573, "grad_norm": 0.42204615473747253, "learning_rate": 3.4755134281200636e-06, "loss": 0.3368, "step": 414 }, { "epoch": 0.655608214849921, "grad_norm": 0.46033787727355957, "learning_rate": 3.459715639810427e-06, "loss": 0.4357, "step": 415 }, { "epoch": 0.6571879936808847, "grad_norm": 0.5509577393531799, "learning_rate": 3.4439178515007903e-06, "loss": 0.3939, "step": 416 }, { "epoch": 0.6587677725118484, "grad_norm": 0.5802867412567139, "learning_rate": 3.4281200631911536e-06, "loss": 0.4073, "step": 417 }, { "epoch": 0.660347551342812, "grad_norm": 0.6130402684211731, "learning_rate": 3.412322274881517e-06, "loss": 0.3452, "step": 418 }, { "epoch": 0.6619273301737757, "grad_norm": 0.6854075789451599, "learning_rate": 3.39652448657188e-06, "loss": 0.3551, "step": 419 }, { "epoch": 0.6635071090047393, "grad_norm": 0.5365926027297974, "learning_rate": 3.3807266982622433e-06, "loss": 0.4011, "step": 420 }, { "epoch": 0.665086887835703, "grad_norm": 1.0338938236236572, "learning_rate": 3.3649289099526066e-06, "loss": 0.4623, "step": 421 }, { "epoch": 0.6666666666666666, "grad_norm": 0.5612855553627014, "learning_rate": 3.34913112164297e-06, "loss": 0.3738, "step": 422 }, { "epoch": 0.6682464454976303, "grad_norm": 0.5113286375999451, "learning_rate": 3.3333333333333333e-06, "loss": 0.3865, "step": 423 }, { "epoch": 0.669826224328594, "grad_norm": 0.5509905815124512, "learning_rate": 3.3175355450236967e-06, "loss": 0.4093, "step": 424 }, { "epoch": 0.6714060031595577, "grad_norm": 0.5425525903701782, "learning_rate": 3.30173775671406e-06, "loss": 0.383, "step": 425 }, { "epoch": 0.6729857819905213, "grad_norm": 0.5866172909736633, "learning_rate": 3.2859399684044234e-06, "loss": 0.4843, "step": 426 }, { "epoch": 0.674565560821485, "grad_norm": 1.0777703523635864, "learning_rate": 3.2701421800947867e-06, "loss": 0.3748, "step": 427 }, { "epoch": 0.6761453396524486, "grad_norm": 0.49126845598220825, "learning_rate": 3.25434439178515e-06, "loss": 0.3505, "step": 428 }, { "epoch": 0.6777251184834123, "grad_norm": 0.5471718311309814, "learning_rate": 3.2385466034755135e-06, "loss": 0.4755, "step": 429 }, { "epoch": 0.6793048973143759, "grad_norm": 0.5689931511878967, "learning_rate": 3.222748815165877e-06, "loss": 0.3956, "step": 430 }, { "epoch": 0.6808846761453397, "grad_norm": 0.6496183276176453, "learning_rate": 3.2069510268562406e-06, "loss": 0.4598, "step": 431 }, { "epoch": 0.6824644549763034, "grad_norm": 0.47042712569236755, "learning_rate": 3.191153238546604e-06, "loss": 0.3756, "step": 432 }, { "epoch": 0.684044233807267, "grad_norm": 0.5819857120513916, "learning_rate": 3.1753554502369673e-06, "loss": 0.4803, "step": 433 }, { "epoch": 0.6856240126382307, "grad_norm": 0.5752127766609192, "learning_rate": 3.1595576619273307e-06, "loss": 0.3916, "step": 434 }, { "epoch": 0.6872037914691943, "grad_norm": 0.6483988761901855, "learning_rate": 3.143759873617694e-06, "loss": 0.4338, "step": 435 }, { "epoch": 0.688783570300158, "grad_norm": 0.7817516326904297, "learning_rate": 3.1279620853080574e-06, "loss": 0.3645, "step": 436 }, { "epoch": 0.6903633491311216, "grad_norm": 0.4980696737766266, "learning_rate": 3.1121642969984207e-06, "loss": 0.3962, "step": 437 }, { "epoch": 0.6919431279620853, "grad_norm": 0.5592882037162781, "learning_rate": 3.096366508688784e-06, "loss": 0.3645, "step": 438 }, { "epoch": 0.693522906793049, "grad_norm": 0.6228163242340088, "learning_rate": 3.0805687203791474e-06, "loss": 0.3696, "step": 439 }, { "epoch": 0.6951026856240127, "grad_norm": 0.6718009114265442, "learning_rate": 3.0647709320695108e-06, "loss": 0.4926, "step": 440 }, { "epoch": 0.6966824644549763, "grad_norm": 0.6085376143455505, "learning_rate": 3.048973143759874e-06, "loss": 0.418, "step": 441 }, { "epoch": 0.69826224328594, "grad_norm": 0.7716324925422668, "learning_rate": 3.0331753554502375e-06, "loss": 0.4038, "step": 442 }, { "epoch": 0.6998420221169036, "grad_norm": 0.7239758968353271, "learning_rate": 3.017377567140601e-06, "loss": 0.4596, "step": 443 }, { "epoch": 0.7014218009478673, "grad_norm": 0.6308011412620544, "learning_rate": 3.001579778830964e-06, "loss": 0.4082, "step": 444 }, { "epoch": 0.7030015797788309, "grad_norm": 0.515626072883606, "learning_rate": 2.985781990521327e-06, "loss": 0.4688, "step": 445 }, { "epoch": 0.7045813586097947, "grad_norm": 0.5395441651344299, "learning_rate": 2.9699842022116905e-06, "loss": 0.3448, "step": 446 }, { "epoch": 0.7061611374407583, "grad_norm": 0.5883680582046509, "learning_rate": 2.954186413902054e-06, "loss": 0.4546, "step": 447 }, { "epoch": 0.707740916271722, "grad_norm": 0.7300311326980591, "learning_rate": 2.938388625592417e-06, "loss": 0.368, "step": 448 }, { "epoch": 0.7093206951026856, "grad_norm": 0.5901307463645935, "learning_rate": 2.9225908372827806e-06, "loss": 0.3688, "step": 449 }, { "epoch": 0.7109004739336493, "grad_norm": 0.6521854996681213, "learning_rate": 2.906793048973144e-06, "loss": 0.3876, "step": 450 }, { "epoch": 0.7124802527646129, "grad_norm": 0.688450038433075, "learning_rate": 2.8909952606635073e-06, "loss": 0.4298, "step": 451 }, { "epoch": 0.7140600315955766, "grad_norm": 0.6533556580543518, "learning_rate": 2.8751974723538706e-06, "loss": 0.3589, "step": 452 }, { "epoch": 0.7156398104265402, "grad_norm": 0.5261491537094116, "learning_rate": 2.859399684044234e-06, "loss": 0.3886, "step": 453 }, { "epoch": 0.717219589257504, "grad_norm": 0.5488421320915222, "learning_rate": 2.8436018957345973e-06, "loss": 0.411, "step": 454 }, { "epoch": 0.7187993680884676, "grad_norm": 0.6415657997131348, "learning_rate": 2.8278041074249607e-06, "loss": 0.4581, "step": 455 }, { "epoch": 0.7203791469194313, "grad_norm": 0.5058445334434509, "learning_rate": 2.812006319115324e-06, "loss": 0.4325, "step": 456 }, { "epoch": 0.721958925750395, "grad_norm": 0.6409322619438171, "learning_rate": 2.7962085308056874e-06, "loss": 0.3759, "step": 457 }, { "epoch": 0.7235387045813586, "grad_norm": 0.5578014850616455, "learning_rate": 2.7804107424960508e-06, "loss": 0.3947, "step": 458 }, { "epoch": 0.7251184834123223, "grad_norm": 0.6064183115959167, "learning_rate": 2.764612954186414e-06, "loss": 0.4766, "step": 459 }, { "epoch": 0.7266982622432859, "grad_norm": 0.6067904233932495, "learning_rate": 2.7488151658767775e-06, "loss": 0.4698, "step": 460 }, { "epoch": 0.7282780410742496, "grad_norm": 0.526088297367096, "learning_rate": 2.733017377567141e-06, "loss": 0.3997, "step": 461 }, { "epoch": 0.7298578199052133, "grad_norm": 0.6290006637573242, "learning_rate": 2.717219589257504e-06, "loss": 0.4393, "step": 462 }, { "epoch": 0.731437598736177, "grad_norm": 0.5822445154190063, "learning_rate": 2.7014218009478675e-06, "loss": 0.4767, "step": 463 }, { "epoch": 0.7330173775671406, "grad_norm": 0.5798205733299255, "learning_rate": 2.685624012638231e-06, "loss": 0.4163, "step": 464 }, { "epoch": 0.7345971563981043, "grad_norm": 0.6234124898910522, "learning_rate": 2.6698262243285942e-06, "loss": 0.387, "step": 465 }, { "epoch": 0.7361769352290679, "grad_norm": 0.5226984620094299, "learning_rate": 2.6540284360189576e-06, "loss": 0.4144, "step": 466 }, { "epoch": 0.7377567140600316, "grad_norm": 0.529303789138794, "learning_rate": 2.638230647709321e-06, "loss": 0.4689, "step": 467 }, { "epoch": 0.7393364928909952, "grad_norm": 0.6620000004768372, "learning_rate": 2.6224328593996843e-06, "loss": 0.4358, "step": 468 }, { "epoch": 0.740916271721959, "grad_norm": 0.8560294508934021, "learning_rate": 2.606635071090048e-06, "loss": 0.422, "step": 469 }, { "epoch": 0.7424960505529226, "grad_norm": 0.47033989429473877, "learning_rate": 2.5908372827804106e-06, "loss": 0.4462, "step": 470 }, { "epoch": 0.7440758293838863, "grad_norm": 0.5476656556129456, "learning_rate": 2.575039494470774e-06, "loss": 0.3818, "step": 471 }, { "epoch": 0.7456556082148499, "grad_norm": 0.5771902203559875, "learning_rate": 2.5592417061611373e-06, "loss": 0.3835, "step": 472 }, { "epoch": 0.7472353870458136, "grad_norm": 0.6452733278274536, "learning_rate": 2.5434439178515007e-06, "loss": 0.4224, "step": 473 }, { "epoch": 0.7488151658767772, "grad_norm": 0.5318686962127686, "learning_rate": 2.527646129541864e-06, "loss": 0.4812, "step": 474 }, { "epoch": 0.7503949447077409, "grad_norm": 0.6591460108757019, "learning_rate": 2.5118483412322274e-06, "loss": 0.4546, "step": 475 }, { "epoch": 0.7519747235387045, "grad_norm": 0.5857440829277039, "learning_rate": 2.4960505529225907e-06, "loss": 0.4008, "step": 476 }, { "epoch": 0.7535545023696683, "grad_norm": 0.6430768370628357, "learning_rate": 2.480252764612954e-06, "loss": 0.3191, "step": 477 }, { "epoch": 0.7551342812006319, "grad_norm": 0.7442892789840698, "learning_rate": 2.4644549763033174e-06, "loss": 0.4171, "step": 478 }, { "epoch": 0.7567140600315956, "grad_norm": 0.6390454173088074, "learning_rate": 2.4486571879936812e-06, "loss": 0.5381, "step": 479 }, { "epoch": 0.7582938388625592, "grad_norm": 0.6277416348457336, "learning_rate": 2.4328593996840446e-06, "loss": 0.4824, "step": 480 }, { "epoch": 0.7598736176935229, "grad_norm": 0.6043097972869873, "learning_rate": 2.417061611374408e-06, "loss": 0.4266, "step": 481 }, { "epoch": 0.7614533965244866, "grad_norm": 0.6095964312553406, "learning_rate": 2.4012638230647713e-06, "loss": 0.4258, "step": 482 }, { "epoch": 0.7630331753554502, "grad_norm": 0.5433639287948608, "learning_rate": 2.3854660347551346e-06, "loss": 0.4873, "step": 483 }, { "epoch": 0.764612954186414, "grad_norm": 0.49287649989128113, "learning_rate": 2.369668246445498e-06, "loss": 0.4814, "step": 484 }, { "epoch": 0.7661927330173776, "grad_norm": 0.5905902981758118, "learning_rate": 2.3538704581358613e-06, "loss": 0.4519, "step": 485 }, { "epoch": 0.7677725118483413, "grad_norm": 0.6697285771369934, "learning_rate": 2.3380726698262247e-06, "loss": 0.4686, "step": 486 }, { "epoch": 0.7693522906793049, "grad_norm": 0.5338664650917053, "learning_rate": 2.322274881516588e-06, "loss": 0.401, "step": 487 }, { "epoch": 0.7709320695102686, "grad_norm": 0.5338428616523743, "learning_rate": 2.3064770932069514e-06, "loss": 0.4045, "step": 488 }, { "epoch": 0.7725118483412322, "grad_norm": 0.6102830171585083, "learning_rate": 2.2906793048973143e-06, "loss": 0.3785, "step": 489 }, { "epoch": 0.7740916271721959, "grad_norm": 0.5787335634231567, "learning_rate": 2.2748815165876777e-06, "loss": 0.42, "step": 490 }, { "epoch": 0.7756714060031595, "grad_norm": 0.7426438331604004, "learning_rate": 2.259083728278041e-06, "loss": 0.4676, "step": 491 }, { "epoch": 0.7772511848341233, "grad_norm": 0.5988475680351257, "learning_rate": 2.2432859399684044e-06, "loss": 0.5404, "step": 492 }, { "epoch": 0.7788309636650869, "grad_norm": 0.6289830803871155, "learning_rate": 2.2274881516587678e-06, "loss": 0.396, "step": 493 }, { "epoch": 0.7804107424960506, "grad_norm": 0.6077900528907776, "learning_rate": 2.211690363349131e-06, "loss": 0.4016, "step": 494 }, { "epoch": 0.7819905213270142, "grad_norm": 0.8171889781951904, "learning_rate": 2.1958925750394945e-06, "loss": 0.3638, "step": 495 }, { "epoch": 0.7835703001579779, "grad_norm": 0.6225026845932007, "learning_rate": 2.180094786729858e-06, "loss": 0.4088, "step": 496 }, { "epoch": 0.7851500789889415, "grad_norm": 0.6262929439544678, "learning_rate": 2.164296998420221e-06, "loss": 0.3311, "step": 497 }, { "epoch": 0.7867298578199052, "grad_norm": 0.662129282951355, "learning_rate": 2.148499210110585e-06, "loss": 0.4434, "step": 498 }, { "epoch": 0.7883096366508688, "grad_norm": 0.5046777725219727, "learning_rate": 2.1327014218009483e-06, "loss": 0.5042, "step": 499 }, { "epoch": 0.7898894154818326, "grad_norm": 0.6273382306098938, "learning_rate": 2.1169036334913117e-06, "loss": 0.345, "step": 500 }, { "epoch": 0.7914691943127962, "grad_norm": 0.5484871864318848, "learning_rate": 2.101105845181675e-06, "loss": 0.3476, "step": 501 }, { "epoch": 0.7930489731437599, "grad_norm": 0.6779518723487854, "learning_rate": 2.085308056872038e-06, "loss": 0.4062, "step": 502 }, { "epoch": 0.7946287519747235, "grad_norm": 0.4969736635684967, "learning_rate": 2.0695102685624013e-06, "loss": 0.3615, "step": 503 }, { "epoch": 0.7962085308056872, "grad_norm": 0.5542388558387756, "learning_rate": 2.0537124802527647e-06, "loss": 0.39, "step": 504 }, { "epoch": 0.7977883096366508, "grad_norm": 0.8587651252746582, "learning_rate": 2.037914691943128e-06, "loss": 0.423, "step": 505 }, { "epoch": 0.7993680884676145, "grad_norm": 0.6399357318878174, "learning_rate": 2.0221169036334914e-06, "loss": 0.4645, "step": 506 }, { "epoch": 0.8009478672985783, "grad_norm": 0.5677849650382996, "learning_rate": 2.0063191153238547e-06, "loss": 0.3749, "step": 507 }, { "epoch": 0.8025276461295419, "grad_norm": 0.5609621405601501, "learning_rate": 1.990521327014218e-06, "loss": 0.4727, "step": 508 }, { "epoch": 0.8041074249605056, "grad_norm": 0.615185558795929, "learning_rate": 1.9747235387045814e-06, "loss": 0.4349, "step": 509 }, { "epoch": 0.8056872037914692, "grad_norm": 0.5093739032745361, "learning_rate": 1.958925750394945e-06, "loss": 0.3502, "step": 510 }, { "epoch": 0.8072669826224329, "grad_norm": 0.8513323068618774, "learning_rate": 1.943127962085308e-06, "loss": 0.3902, "step": 511 }, { "epoch": 0.8088467614533965, "grad_norm": 0.6797610521316528, "learning_rate": 1.9273301737756715e-06, "loss": 0.4987, "step": 512 }, { "epoch": 0.8104265402843602, "grad_norm": 0.5715585947036743, "learning_rate": 1.911532385466035e-06, "loss": 0.3965, "step": 513 }, { "epoch": 0.8120063191153238, "grad_norm": 0.5537532567977905, "learning_rate": 1.8957345971563982e-06, "loss": 0.3832, "step": 514 }, { "epoch": 0.8135860979462876, "grad_norm": 0.5337470173835754, "learning_rate": 1.8799368088467616e-06, "loss": 0.4136, "step": 515 }, { "epoch": 0.8151658767772512, "grad_norm": 0.5929555892944336, "learning_rate": 1.864139020537125e-06, "loss": 0.3901, "step": 516 }, { "epoch": 0.8167456556082149, "grad_norm": 0.6738921403884888, "learning_rate": 1.8483412322274883e-06, "loss": 0.4128, "step": 517 }, { "epoch": 0.8183254344391785, "grad_norm": 0.598659098148346, "learning_rate": 1.8325434439178516e-06, "loss": 0.3707, "step": 518 }, { "epoch": 0.8199052132701422, "grad_norm": 0.5679790377616882, "learning_rate": 1.816745655608215e-06, "loss": 0.457, "step": 519 }, { "epoch": 0.8214849921011058, "grad_norm": 0.5459115505218506, "learning_rate": 1.8009478672985784e-06, "loss": 0.3613, "step": 520 }, { "epoch": 0.8230647709320695, "grad_norm": 0.5752125978469849, "learning_rate": 1.7851500789889417e-06, "loss": 0.479, "step": 521 }, { "epoch": 0.8246445497630331, "grad_norm": 0.5184637904167175, "learning_rate": 1.769352290679305e-06, "loss": 0.4126, "step": 522 }, { "epoch": 0.8262243285939969, "grad_norm": 0.6329041123390198, "learning_rate": 1.7535545023696684e-06, "loss": 0.4221, "step": 523 }, { "epoch": 0.8278041074249605, "grad_norm": 0.5233784317970276, "learning_rate": 1.7377567140600318e-06, "loss": 0.4375, "step": 524 }, { "epoch": 0.8293838862559242, "grad_norm": 0.5424541234970093, "learning_rate": 1.7219589257503951e-06, "loss": 0.4447, "step": 525 }, { "epoch": 0.8309636650868878, "grad_norm": 0.5534167885780334, "learning_rate": 1.7061611374407585e-06, "loss": 0.3672, "step": 526 }, { "epoch": 0.8325434439178515, "grad_norm": 0.605102002620697, "learning_rate": 1.6903633491311216e-06, "loss": 0.4319, "step": 527 }, { "epoch": 0.8341232227488151, "grad_norm": 0.5609396696090698, "learning_rate": 1.674565560821485e-06, "loss": 0.3984, "step": 528 }, { "epoch": 0.8357030015797788, "grad_norm": 0.7964479923248291, "learning_rate": 1.6587677725118483e-06, "loss": 0.407, "step": 529 }, { "epoch": 0.8372827804107424, "grad_norm": 0.4886048436164856, "learning_rate": 1.6429699842022117e-06, "loss": 0.4506, "step": 530 }, { "epoch": 0.8388625592417062, "grad_norm": 0.543812096118927, "learning_rate": 1.627172195892575e-06, "loss": 0.3141, "step": 531 }, { "epoch": 0.8404423380726699, "grad_norm": 0.5370059609413147, "learning_rate": 1.6113744075829384e-06, "loss": 0.3712, "step": 532 }, { "epoch": 0.8420221169036335, "grad_norm": 0.7402203679084778, "learning_rate": 1.595576619273302e-06, "loss": 0.4136, "step": 533 }, { "epoch": 0.8436018957345972, "grad_norm": 0.6814244985580444, "learning_rate": 1.5797788309636653e-06, "loss": 0.4634, "step": 534 }, { "epoch": 0.8451816745655608, "grad_norm": 0.5919080972671509, "learning_rate": 1.5639810426540287e-06, "loss": 0.4238, "step": 535 }, { "epoch": 0.8467614533965245, "grad_norm": 0.617522120475769, "learning_rate": 1.548183254344392e-06, "loss": 0.3431, "step": 536 }, { "epoch": 0.8483412322274881, "grad_norm": 0.49482643604278564, "learning_rate": 1.5323854660347554e-06, "loss": 0.3882, "step": 537 }, { "epoch": 0.8499210110584519, "grad_norm": 0.5525531768798828, "learning_rate": 1.5165876777251187e-06, "loss": 0.4053, "step": 538 }, { "epoch": 0.8515007898894155, "grad_norm": 0.6634103655815125, "learning_rate": 1.500789889415482e-06, "loss": 0.4624, "step": 539 }, { "epoch": 0.8530805687203792, "grad_norm": 0.45309382677078247, "learning_rate": 1.4849921011058452e-06, "loss": 0.3486, "step": 540 }, { "epoch": 0.8546603475513428, "grad_norm": 0.778338611125946, "learning_rate": 1.4691943127962086e-06, "loss": 0.3984, "step": 541 }, { "epoch": 0.8562401263823065, "grad_norm": 0.6093356609344482, "learning_rate": 1.453396524486572e-06, "loss": 0.333, "step": 542 }, { "epoch": 0.8578199052132701, "grad_norm": 0.49551188945770264, "learning_rate": 1.4375987361769353e-06, "loss": 0.3915, "step": 543 }, { "epoch": 0.8593996840442338, "grad_norm": 0.5423188209533691, "learning_rate": 1.4218009478672987e-06, "loss": 0.4192, "step": 544 }, { "epoch": 0.8609794628751974, "grad_norm": 0.8111097812652588, "learning_rate": 1.406003159557662e-06, "loss": 0.473, "step": 545 }, { "epoch": 0.8625592417061612, "grad_norm": 0.6064862012863159, "learning_rate": 1.3902053712480254e-06, "loss": 0.4164, "step": 546 }, { "epoch": 0.8641390205371248, "grad_norm": 0.6180470585823059, "learning_rate": 1.3744075829383887e-06, "loss": 0.4351, "step": 547 }, { "epoch": 0.8657187993680885, "grad_norm": 0.5101069808006287, "learning_rate": 1.358609794628752e-06, "loss": 0.3806, "step": 548 }, { "epoch": 0.8672985781990521, "grad_norm": 0.6269749402999878, "learning_rate": 1.3428120063191154e-06, "loss": 0.4028, "step": 549 }, { "epoch": 0.8688783570300158, "grad_norm": 0.6344918608665466, "learning_rate": 1.3270142180094788e-06, "loss": 0.3206, "step": 550 }, { "epoch": 0.8704581358609794, "grad_norm": 0.7053835988044739, "learning_rate": 1.3112164296998422e-06, "loss": 0.4404, "step": 551 }, { "epoch": 0.8720379146919431, "grad_norm": 0.4780917465686798, "learning_rate": 1.2954186413902053e-06, "loss": 0.4089, "step": 552 }, { "epoch": 0.8736176935229067, "grad_norm": 0.5235942006111145, "learning_rate": 1.2796208530805687e-06, "loss": 0.3992, "step": 553 }, { "epoch": 0.8751974723538705, "grad_norm": 0.5037370324134827, "learning_rate": 1.263823064770932e-06, "loss": 0.3727, "step": 554 }, { "epoch": 0.8767772511848341, "grad_norm": 0.5422868132591248, "learning_rate": 1.2480252764612954e-06, "loss": 0.4524, "step": 555 }, { "epoch": 0.8783570300157978, "grad_norm": 0.5287191271781921, "learning_rate": 1.2322274881516587e-06, "loss": 0.3445, "step": 556 }, { "epoch": 0.8799368088467614, "grad_norm": 0.49679964780807495, "learning_rate": 1.2164296998420223e-06, "loss": 0.3357, "step": 557 }, { "epoch": 0.8815165876777251, "grad_norm": 0.5391539931297302, "learning_rate": 1.2006319115323856e-06, "loss": 0.4645, "step": 558 }, { "epoch": 0.8830963665086888, "grad_norm": 0.5474575757980347, "learning_rate": 1.184834123222749e-06, "loss": 0.4109, "step": 559 }, { "epoch": 0.8846761453396524, "grad_norm": 0.5920886993408203, "learning_rate": 1.1690363349131124e-06, "loss": 0.4034, "step": 560 }, { "epoch": 0.8862559241706162, "grad_norm": 0.5637263655662537, "learning_rate": 1.1532385466034757e-06, "loss": 0.392, "step": 561 }, { "epoch": 0.8878357030015798, "grad_norm": 0.6719076037406921, "learning_rate": 1.1374407582938388e-06, "loss": 0.3798, "step": 562 }, { "epoch": 0.8894154818325435, "grad_norm": 0.5554001927375793, "learning_rate": 1.1216429699842022e-06, "loss": 0.3901, "step": 563 }, { "epoch": 0.8909952606635071, "grad_norm": 0.6078475713729858, "learning_rate": 1.1058451816745656e-06, "loss": 0.3574, "step": 564 }, { "epoch": 0.8925750394944708, "grad_norm": 0.9478325843811035, "learning_rate": 1.090047393364929e-06, "loss": 0.3831, "step": 565 }, { "epoch": 0.8941548183254344, "grad_norm": 0.5259877443313599, "learning_rate": 1.0742496050552925e-06, "loss": 0.4003, "step": 566 }, { "epoch": 0.8957345971563981, "grad_norm": 0.5395880937576294, "learning_rate": 1.0584518167456558e-06, "loss": 0.3513, "step": 567 }, { "epoch": 0.8973143759873617, "grad_norm": 0.5458592772483826, "learning_rate": 1.042654028436019e-06, "loss": 0.49, "step": 568 }, { "epoch": 0.8988941548183255, "grad_norm": 0.5552616715431213, "learning_rate": 1.0268562401263823e-06, "loss": 0.3905, "step": 569 }, { "epoch": 0.9004739336492891, "grad_norm": 0.551466166973114, "learning_rate": 1.0110584518167457e-06, "loss": 0.4241, "step": 570 }, { "epoch": 0.9020537124802528, "grad_norm": 0.7195900082588196, "learning_rate": 9.95260663507109e-07, "loss": 0.3912, "step": 571 }, { "epoch": 0.9036334913112164, "grad_norm": 0.5951517820358276, "learning_rate": 9.794628751974724e-07, "loss": 0.4267, "step": 572 }, { "epoch": 0.9052132701421801, "grad_norm": 0.7582541108131409, "learning_rate": 9.636650868878358e-07, "loss": 0.4024, "step": 573 }, { "epoch": 0.9067930489731437, "grad_norm": 0.6346389651298523, "learning_rate": 9.478672985781991e-07, "loss": 0.4677, "step": 574 }, { "epoch": 0.9083728278041074, "grad_norm": 0.7323048710823059, "learning_rate": 9.320695102685625e-07, "loss": 0.4332, "step": 575 }, { "epoch": 0.909952606635071, "grad_norm": 0.5796726942062378, "learning_rate": 9.162717219589258e-07, "loss": 0.3514, "step": 576 }, { "epoch": 0.9115323854660348, "grad_norm": 0.7424004673957825, "learning_rate": 9.004739336492892e-07, "loss": 0.4178, "step": 577 }, { "epoch": 0.9131121642969984, "grad_norm": 0.525142252445221, "learning_rate": 8.846761453396525e-07, "loss": 0.4498, "step": 578 }, { "epoch": 0.9146919431279621, "grad_norm": 0.5565955638885498, "learning_rate": 8.688783570300159e-07, "loss": 0.4532, "step": 579 }, { "epoch": 0.9162717219589257, "grad_norm": 0.540267288684845, "learning_rate": 8.530805687203792e-07, "loss": 0.4828, "step": 580 }, { "epoch": 0.9178515007898894, "grad_norm": 0.5061677694320679, "learning_rate": 8.372827804107425e-07, "loss": 0.3505, "step": 581 }, { "epoch": 0.919431279620853, "grad_norm": 0.5490908622741699, "learning_rate": 8.214849921011058e-07, "loss": 0.4402, "step": 582 }, { "epoch": 0.9210110584518167, "grad_norm": 0.5788997411727905, "learning_rate": 8.056872037914692e-07, "loss": 0.3256, "step": 583 }, { "epoch": 0.9225908372827805, "grad_norm": 0.5741492509841919, "learning_rate": 7.898894154818327e-07, "loss": 0.451, "step": 584 }, { "epoch": 0.9241706161137441, "grad_norm": 0.5012090802192688, "learning_rate": 7.74091627172196e-07, "loss": 0.3513, "step": 585 }, { "epoch": 0.9257503949447078, "grad_norm": 0.5613192915916443, "learning_rate": 7.582938388625594e-07, "loss": 0.3499, "step": 586 }, { "epoch": 0.9273301737756714, "grad_norm": 0.5941815376281738, "learning_rate": 7.424960505529226e-07, "loss": 0.4133, "step": 587 }, { "epoch": 0.9289099526066351, "grad_norm": 0.7772453427314758, "learning_rate": 7.26698262243286e-07, "loss": 0.3818, "step": 588 }, { "epoch": 0.9304897314375987, "grad_norm": 0.5977700352668762, "learning_rate": 7.109004739336493e-07, "loss": 0.4099, "step": 589 }, { "epoch": 0.9320695102685624, "grad_norm": 0.7777069807052612, "learning_rate": 6.951026856240127e-07, "loss": 0.4341, "step": 590 }, { "epoch": 0.933649289099526, "grad_norm": 0.5362728834152222, "learning_rate": 6.79304897314376e-07, "loss": 0.4431, "step": 591 }, { "epoch": 0.9352290679304898, "grad_norm": 0.5126134157180786, "learning_rate": 6.635071090047394e-07, "loss": 0.3713, "step": 592 }, { "epoch": 0.9368088467614534, "grad_norm": 0.5886785984039307, "learning_rate": 6.477093206951026e-07, "loss": 0.405, "step": 593 }, { "epoch": 0.9383886255924171, "grad_norm": 0.5328089594841003, "learning_rate": 6.31911532385466e-07, "loss": 0.3952, "step": 594 }, { "epoch": 0.9399684044233807, "grad_norm": 0.7170501351356506, "learning_rate": 6.161137440758294e-07, "loss": 0.3979, "step": 595 }, { "epoch": 0.9415481832543444, "grad_norm": 0.6048548817634583, "learning_rate": 6.003159557661928e-07, "loss": 0.3425, "step": 596 }, { "epoch": 0.943127962085308, "grad_norm": 0.5635291337966919, "learning_rate": 5.845181674565562e-07, "loss": 0.3008, "step": 597 }, { "epoch": 0.9447077409162717, "grad_norm": 0.6890112161636353, "learning_rate": 5.687203791469194e-07, "loss": 0.4205, "step": 598 }, { "epoch": 0.9462875197472354, "grad_norm": 0.5197014212608337, "learning_rate": 5.529225908372828e-07, "loss": 0.4589, "step": 599 }, { "epoch": 0.9478672985781991, "grad_norm": 0.5197718143463135, "learning_rate": 5.371248025276462e-07, "loss": 0.2678, "step": 600 }, { "epoch": 0.9494470774091627, "grad_norm": 0.44931474328041077, "learning_rate": 5.213270142180095e-07, "loss": 0.4351, "step": 601 }, { "epoch": 0.9510268562401264, "grad_norm": 0.47795984148979187, "learning_rate": 5.055292259083728e-07, "loss": 0.4392, "step": 602 }, { "epoch": 0.95260663507109, "grad_norm": 0.6027578115463257, "learning_rate": 4.897314375987362e-07, "loss": 0.4499, "step": 603 }, { "epoch": 0.9541864139020537, "grad_norm": 0.6160722374916077, "learning_rate": 4.7393364928909956e-07, "loss": 0.434, "step": 604 }, { "epoch": 0.9557661927330173, "grad_norm": 0.8371343612670898, "learning_rate": 4.581358609794629e-07, "loss": 0.3911, "step": 605 }, { "epoch": 0.957345971563981, "grad_norm": 0.5282484292984009, "learning_rate": 4.4233807266982627e-07, "loss": 0.4445, "step": 606 }, { "epoch": 0.9589257503949447, "grad_norm": 0.5557743310928345, "learning_rate": 4.265402843601896e-07, "loss": 0.4103, "step": 607 }, { "epoch": 0.9605055292259084, "grad_norm": 0.6362637281417847, "learning_rate": 4.107424960505529e-07, "loss": 0.3856, "step": 608 }, { "epoch": 0.9620853080568721, "grad_norm": 0.745617151260376, "learning_rate": 3.9494470774091633e-07, "loss": 0.4179, "step": 609 }, { "epoch": 0.9636650868878357, "grad_norm": 0.659038782119751, "learning_rate": 3.791469194312797e-07, "loss": 0.4027, "step": 610 }, { "epoch": 0.9652448657187994, "grad_norm": 0.645199716091156, "learning_rate": 3.63349131121643e-07, "loss": 0.3501, "step": 611 }, { "epoch": 0.966824644549763, "grad_norm": 0.4868941605091095, "learning_rate": 3.4755134281200634e-07, "loss": 0.3385, "step": 612 }, { "epoch": 0.9684044233807267, "grad_norm": 0.5993934273719788, "learning_rate": 3.317535545023697e-07, "loss": 0.369, "step": 613 }, { "epoch": 0.9699842022116903, "grad_norm": 0.6094574928283691, "learning_rate": 3.15955766192733e-07, "loss": 0.4899, "step": 614 }, { "epoch": 0.9715639810426541, "grad_norm": 0.6989656686782837, "learning_rate": 3.001579778830964e-07, "loss": 0.4346, "step": 615 }, { "epoch": 0.9731437598736177, "grad_norm": 0.5412940382957458, "learning_rate": 2.843601895734597e-07, "loss": 0.4515, "step": 616 }, { "epoch": 0.9747235387045814, "grad_norm": 0.507622241973877, "learning_rate": 2.685624012638231e-07, "loss": 0.4171, "step": 617 }, { "epoch": 0.976303317535545, "grad_norm": 0.4564089775085449, "learning_rate": 2.527646129541864e-07, "loss": 0.3452, "step": 618 }, { "epoch": 0.9778830963665087, "grad_norm": 0.48170286417007446, "learning_rate": 2.3696682464454978e-07, "loss": 0.3866, "step": 619 }, { "epoch": 0.9794628751974723, "grad_norm": 0.47774481773376465, "learning_rate": 2.2116903633491313e-07, "loss": 0.4425, "step": 620 }, { "epoch": 0.981042654028436, "grad_norm": 0.4460739493370056, "learning_rate": 2.0537124802527646e-07, "loss": 0.3991, "step": 621 }, { "epoch": 0.9826224328593997, "grad_norm": 0.536359965801239, "learning_rate": 1.8957345971563984e-07, "loss": 0.327, "step": 622 }, { "epoch": 0.9842022116903634, "grad_norm": 0.5439571738243103, "learning_rate": 1.7377567140600317e-07, "loss": 0.408, "step": 623 }, { "epoch": 0.985781990521327, "grad_norm": 0.8827345967292786, "learning_rate": 1.579778830963665e-07, "loss": 0.4924, "step": 624 }, { "epoch": 0.9873617693522907, "grad_norm": 0.4992835521697998, "learning_rate": 1.4218009478672986e-07, "loss": 0.3921, "step": 625 }, { "epoch": 0.9889415481832543, "grad_norm": 0.7306237816810608, "learning_rate": 1.263823064770932e-07, "loss": 0.5063, "step": 626 }, { "epoch": 0.990521327014218, "grad_norm": 0.5200903415679932, "learning_rate": 1.1058451816745657e-07, "loss": 0.358, "step": 627 }, { "epoch": 0.9921011058451816, "grad_norm": 0.42708104848861694, "learning_rate": 9.478672985781992e-08, "loss": 0.3361, "step": 628 }, { "epoch": 0.9936808846761453, "grad_norm": 0.5993225574493408, "learning_rate": 7.898894154818325e-08, "loss": 0.3625, "step": 629 }, { "epoch": 0.995260663507109, "grad_norm": 0.49995774030685425, "learning_rate": 6.31911532385466e-08, "loss": 0.3746, "step": 630 }, { "epoch": 0.9968404423380727, "grad_norm": 0.5806180238723755, "learning_rate": 4.739336492890996e-08, "loss": 0.3727, "step": 631 }, { "epoch": 0.9984202211690363, "grad_norm": 0.5514349341392517, "learning_rate": 3.15955766192733e-08, "loss": 0.4634, "step": 632 }, { "epoch": 1.0, "grad_norm": 0.4094119668006897, "learning_rate": 1.579778830963665e-08, "loss": 0.2044, "step": 633 } ], "logging_steps": 1.0, "max_steps": 633, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.9805266972408545e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }