diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,7007 +1,1924 @@ { - "best_metric": 0.038467586040496826, - "best_model_checkpoint": "./test_microsoft_dit/checkpoint-7924", - "epoch": 5.0, - "eval_steps": 500, - "global_step": 9905, + "best_metric": 0.6095153739086423, + "best_model_checkpoint": "./step_test_microsoft_dit/checkpoint-2000", + "epoch": 0.5427922241858116, + "eval_steps": 50, + "global_step": 2150, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.005047955577990914, - "grad_norm": 0.8398004174232483, - "learning_rate": 2.9969712266532054e-05, - "loss": 0.3087, + "epoch": 0.0025246149962130774, + "grad_norm": 1.0554239749908447, + "learning_rate": 2.997e-05, + "loss": 0.3197, "step": 10 }, { - "epoch": 0.010095911155981827, - "grad_norm": 1.147126317024231, - "learning_rate": 2.993942453306411e-05, - "loss": 0.202, + "epoch": 0.005049229992426155, + "grad_norm": 1.5600422620773315, + "learning_rate": 2.994e-05, + "loss": 0.2047, "step": 20 }, { - "epoch": 0.01514386673397274, - "grad_norm": 1.1376692056655884, - "learning_rate": 2.9909136799596164e-05, - "loss": 0.1375, + "epoch": 0.007573844988639233, + "grad_norm": 2.1541621685028076, + "learning_rate": 2.991e-05, + "loss": 0.1528, "step": 30 }, { - "epoch": 0.020191822311963654, - "grad_norm": 3.0222654342651367, - "learning_rate": 2.987884906612822e-05, - "loss": 0.1254, + "epoch": 0.01009845998485231, + "grad_norm": 1.805535078048706, + "learning_rate": 2.9880000000000002e-05, + "loss": 0.1252, "step": 40 }, { - "epoch": 0.02523977788995457, - "grad_norm": 1.3963178396224976, - "learning_rate": 2.9848561332660275e-05, - "loss": 0.1105, + "epoch": 0.012623074981065387, + "grad_norm": 1.1236392259597778, + "learning_rate": 2.985e-05, + "loss": 0.1165, "step": 50 }, { - "epoch": 0.03028773346794548, - "grad_norm": 0.741131067276001, - "learning_rate": 2.9818273599192328e-05, - "loss": 0.1022, + "epoch": 0.012623074981065387, + "eval_f1": 0.4177168854259269, + "eval_loss": 0.06423712521791458, + "eval_runtime": 1142.2038, + "eval_samples_per_second": 180.582, + "eval_steps_per_second": 2.822, + "step": 50 + }, + { + "epoch": 0.015147689977278465, + "grad_norm": 1.1924934387207031, + "learning_rate": 2.982e-05, + "loss": 0.1029, "step": 60 }, { - "epoch": 0.0353356890459364, - "grad_norm": 1.0705397129058838, - "learning_rate": 2.978798586572438e-05, - "loss": 0.1027, + "epoch": 0.017672304973491544, + "grad_norm": 1.225701928138733, + "learning_rate": 2.979e-05, + "loss": 0.117, "step": 70 }, { - "epoch": 0.04038364462392731, - "grad_norm": 1.127729892730713, - "learning_rate": 2.9757698132256435e-05, - "loss": 0.0979, + "epoch": 0.02019691996970462, + "grad_norm": 2.702486515045166, + "learning_rate": 2.976e-05, + "loss": 0.1103, "step": 80 }, { - "epoch": 0.04543160020191822, - "grad_norm": 0.888960063457489, - "learning_rate": 2.9727410398788492e-05, - "loss": 0.1024, + "epoch": 0.022721534965917698, + "grad_norm": 2.0278918743133545, + "learning_rate": 2.973e-05, + "loss": 0.1054, "step": 90 }, { - "epoch": 0.05047955577990914, - "grad_norm": 0.9185839295387268, - "learning_rate": 2.9697122665320545e-05, - "loss": 0.1142, + "epoch": 0.025246149962130773, + "grad_norm": 1.9288796186447144, + "learning_rate": 2.97e-05, + "loss": 0.0942, + "step": 100 + }, + { + "epoch": 0.025246149962130773, + "eval_f1": 0.4771817453963171, + "eval_loss": 0.048530641943216324, + "eval_runtime": 1007.4651, + "eval_samples_per_second": 204.734, + "eval_steps_per_second": 3.199, "step": 100 }, { - "epoch": 0.05552751135790005, - "grad_norm": 0.737047016620636, - "learning_rate": 2.96668349318526e-05, - "loss": 0.0956, + "epoch": 0.027770764958343852, + "grad_norm": 1.4688999652862549, + "learning_rate": 2.967e-05, + "loss": 0.0992, "step": 110 }, { - "epoch": 0.06057546693589096, - "grad_norm": 0.7749747037887573, - "learning_rate": 2.9636547198384656e-05, - "loss": 0.0978, + "epoch": 0.03029537995455693, + "grad_norm": 1.1097605228424072, + "learning_rate": 2.964e-05, + "loss": 0.1126, "step": 120 }, { - "epoch": 0.06562342251388188, - "grad_norm": 1.079695224761963, - "learning_rate": 2.960625946491671e-05, - "loss": 0.092, + "epoch": 0.03281999495077001, + "grad_norm": 1.0353784561157227, + "learning_rate": 2.961e-05, + "loss": 0.0949, "step": 130 }, { - "epoch": 0.0706713780918728, - "grad_norm": 0.8315634727478027, - "learning_rate": 2.9575971731448766e-05, - "loss": 0.0975, + "epoch": 0.03534460994698309, + "grad_norm": 1.7303999662399292, + "learning_rate": 2.958e-05, + "loss": 0.1025, "step": 140 }, { - "epoch": 0.0757193336698637, - "grad_norm": 0.7270865440368652, - "learning_rate": 2.954568399798082e-05, - "loss": 0.098, + "epoch": 0.03786922494319616, + "grad_norm": 1.1177138090133667, + "learning_rate": 2.955e-05, + "loss": 0.1076, "step": 150 }, { - "epoch": 0.08076728924785462, - "grad_norm": 0.5786823630332947, - "learning_rate": 2.9515396264512873e-05, - "loss": 0.0846, + "epoch": 0.03786922494319616, + "eval_f1": 0.46432628333423487, + "eval_loss": 0.05836363136768341, + "eval_runtime": 980.1012, + "eval_samples_per_second": 210.45, + "eval_steps_per_second": 3.288, + "step": 150 + }, + { + "epoch": 0.04039383993940924, + "grad_norm": 1.1965147256851196, + "learning_rate": 2.9520000000000002e-05, + "loss": 0.0961, "step": 160 }, { - "epoch": 0.08581524482584553, - "grad_norm": 0.7117003798484802, - "learning_rate": 2.948510853104493e-05, - "loss": 0.0905, + "epoch": 0.04291845493562232, + "grad_norm": 1.0545780658721924, + "learning_rate": 2.949e-05, + "loss": 0.104, "step": 170 }, { - "epoch": 0.09086320040383644, - "grad_norm": 0.6765159368515015, - "learning_rate": 2.9454820797576983e-05, - "loss": 0.0764, + "epoch": 0.045443069931835396, + "grad_norm": 1.8348199129104614, + "learning_rate": 2.946e-05, + "loss": 0.0932, "step": 180 }, { - "epoch": 0.09591115598182735, - "grad_norm": 1.1397738456726074, - "learning_rate": 2.9424533064109037e-05, - "loss": 0.0882, + "epoch": 0.047967684928048475, + "grad_norm": 1.8478541374206543, + "learning_rate": 2.943e-05, + "loss": 0.1069, "step": 190 }, { - "epoch": 0.10095911155981828, - "grad_norm": 0.6545870900154114, - "learning_rate": 2.939424533064109e-05, - "loss": 0.0991, + "epoch": 0.05049229992426155, + "grad_norm": 0.9377999305725098, + "learning_rate": 2.94e-05, + "loss": 0.1103, + "step": 200 + }, + { + "epoch": 0.05049229992426155, + "eval_f1": NaN, + "eval_loss": 0.044557176530361176, + "eval_runtime": 978.7525, + "eval_samples_per_second": 210.74, + "eval_steps_per_second": 3.293, "step": 200 }, { - "epoch": 0.10600706713780919, - "grad_norm": 0.8882391452789307, - "learning_rate": 2.9363957597173144e-05, - "loss": 0.0902, + "epoch": 0.053016914920474625, + "grad_norm": 1.6204830408096313, + "learning_rate": 2.9370000000000002e-05, + "loss": 0.1019, "step": 210 }, { - "epoch": 0.1110550227158001, - "grad_norm": 0.5973140001296997, - "learning_rate": 2.93336698637052e-05, - "loss": 0.0968, + "epoch": 0.055541529916687704, + "grad_norm": 1.1411000490188599, + "learning_rate": 2.934e-05, + "loss": 0.0969, "step": 220 }, { - "epoch": 0.11610297829379101, - "grad_norm": 1.3215384483337402, - "learning_rate": 2.9303382130237254e-05, - "loss": 0.0901, + "epoch": 0.05806614491290078, + "grad_norm": 1.1179866790771484, + "learning_rate": 2.931e-05, + "loss": 0.1031, "step": 230 }, { - "epoch": 0.12115093387178193, - "grad_norm": 0.6139042973518372, - "learning_rate": 2.9273094396769307e-05, - "loss": 0.0739, + "epoch": 0.06059075990911386, + "grad_norm": 1.2155176401138306, + "learning_rate": 2.928e-05, + "loss": 0.0851, "step": 240 }, { - "epoch": 0.12619888944977284, - "grad_norm": 0.9095037579536438, - "learning_rate": 2.9242806663301364e-05, - "loss": 0.0907, + "epoch": 0.06311537490532694, + "grad_norm": 1.4578701257705688, + "learning_rate": 2.925e-05, + "loss": 0.0873, + "step": 250 + }, + { + "epoch": 0.06311537490532694, + "eval_f1": 0.5313367950730588, + "eval_loss": 0.05184657499194145, + "eval_runtime": 982.8389, + "eval_samples_per_second": 209.863, + "eval_steps_per_second": 3.279, "step": 250 }, { - "epoch": 0.13124684502776376, - "grad_norm": 1.0266954898834229, - "learning_rate": 2.9212518929833418e-05, - "loss": 0.0726, + "epoch": 0.06563998990154002, + "grad_norm": 1.2894303798675537, + "learning_rate": 2.922e-05, + "loss": 0.0876, "step": 260 }, { - "epoch": 0.13629480060575466, - "grad_norm": 0.734716534614563, - "learning_rate": 2.9182231196365474e-05, - "loss": 0.0891, + "epoch": 0.0681646048977531, + "grad_norm": 0.8404099941253662, + "learning_rate": 2.919e-05, + "loss": 0.0904, "step": 270 }, { - "epoch": 0.1413427561837456, - "grad_norm": 0.7633081674575806, - "learning_rate": 2.9151943462897528e-05, - "loss": 0.0747, + "epoch": 0.07068921989396618, + "grad_norm": 2.0062506198883057, + "learning_rate": 2.916e-05, + "loss": 0.1009, "step": 280 }, { - "epoch": 0.1463907117617365, - "grad_norm": 0.8185615539550781, - "learning_rate": 2.912165572942958e-05, - "loss": 0.0815, + "epoch": 0.07321383489017924, + "grad_norm": 0.8900242447853088, + "learning_rate": 2.913e-05, + "loss": 0.0925, "step": 290 }, { - "epoch": 0.1514386673397274, - "grad_norm": 1.2503191232681274, - "learning_rate": 2.9091367995961638e-05, - "loss": 0.0844, + "epoch": 0.07573844988639232, + "grad_norm": 1.051013708114624, + "learning_rate": 2.91e-05, + "loss": 0.1053, "step": 300 }, { - "epoch": 0.15648662291771834, - "grad_norm": 0.52531898021698, - "learning_rate": 2.906108026249369e-05, - "loss": 0.0863, + "epoch": 0.07573844988639232, + "eval_f1": 0.532925682031985, + "eval_loss": 0.07359323650598526, + "eval_runtime": 980.9407, + "eval_samples_per_second": 210.27, + "eval_steps_per_second": 3.286, + "step": 300 + }, + { + "epoch": 0.0782630648826054, + "grad_norm": 0.7765111327171326, + "learning_rate": 2.907e-05, + "loss": 0.0848, "step": 310 }, { - "epoch": 0.16153457849570924, - "grad_norm": 0.8883135914802551, - "learning_rate": 2.9030792529025745e-05, - "loss": 0.0833, + "epoch": 0.08078767987881848, + "grad_norm": 0.9605777859687805, + "learning_rate": 2.904e-05, + "loss": 0.0746, "step": 320 }, { - "epoch": 0.16658253407370016, - "grad_norm": 0.5173369646072388, - "learning_rate": 2.90005047955578e-05, - "loss": 0.0882, + "epoch": 0.08331229487503156, + "grad_norm": 1.9086962938308716, + "learning_rate": 2.901e-05, + "loss": 0.1023, "step": 330 }, { - "epoch": 0.17163048965169106, - "grad_norm": 0.5770648717880249, - "learning_rate": 2.8970217062089852e-05, - "loss": 0.0814, + "epoch": 0.08583690987124463, + "grad_norm": 1.5782345533370972, + "learning_rate": 2.898e-05, + "loss": 0.0751, "step": 340 }, { - "epoch": 0.17667844522968199, - "grad_norm": 0.8828192949295044, - "learning_rate": 2.893992932862191e-05, - "loss": 0.0776, + "epoch": 0.08836152486745771, + "grad_norm": 1.2298818826675415, + "learning_rate": 2.895e-05, + "loss": 0.0797, "step": 350 }, { - "epoch": 0.18172640080767288, - "grad_norm": 0.756236732006073, - "learning_rate": 2.8909641595153962e-05, - "loss": 0.0736, + "epoch": 0.08836152486745771, + "eval_f1": 0.5325518588749066, + "eval_loss": 0.07257544994354248, + "eval_runtime": 979.2135, + "eval_samples_per_second": 210.64, + "eval_steps_per_second": 3.291, + "step": 350 + }, + { + "epoch": 0.09088613986367079, + "grad_norm": 1.1932893991470337, + "learning_rate": 2.892e-05, + "loss": 0.0803, "step": 360 }, { - "epoch": 0.1867743563856638, - "grad_norm": 0.47730007767677307, - "learning_rate": 2.887935386168602e-05, - "loss": 0.0856, + "epoch": 0.09341075485988387, + "grad_norm": 0.896007776260376, + "learning_rate": 2.889e-05, + "loss": 0.088, "step": 370 }, { - "epoch": 0.1918223119636547, - "grad_norm": 2.5338025093078613, - "learning_rate": 2.8849066128218072e-05, - "loss": 0.0879, + "epoch": 0.09593536985609695, + "grad_norm": 2.385890483856201, + "learning_rate": 2.8859999999999998e-05, + "loss": 0.0886, "step": 380 }, { - "epoch": 0.19687026754164563, - "grad_norm": 0.6218165159225464, - "learning_rate": 2.8818778394750126e-05, - "loss": 0.0724, + "epoch": 0.09845998485231003, + "grad_norm": 0.966077446937561, + "learning_rate": 2.883e-05, + "loss": 0.1038, "step": 390 }, { - "epoch": 0.20191822311963656, - "grad_norm": 1.1621041297912598, - "learning_rate": 2.8788490661282183e-05, - "loss": 0.0742, + "epoch": 0.1009845998485231, + "grad_norm": 0.969159722328186, + "learning_rate": 2.88e-05, + "loss": 0.0857, "step": 400 }, { - "epoch": 0.20696617869762746, - "grad_norm": 0.8511998653411865, - "learning_rate": 2.8758202927814236e-05, - "loss": 0.0798, + "epoch": 0.1009845998485231, + "eval_f1": 0.5497736226259776, + "eval_loss": 0.06929118931293488, + "eval_runtime": 978.0405, + "eval_samples_per_second": 210.893, + "eval_steps_per_second": 3.295, + "step": 400 + }, + { + "epoch": 0.10350921484473617, + "grad_norm": 0.8633397817611694, + "learning_rate": 2.877e-05, + "loss": 0.0895, "step": 410 }, { - "epoch": 0.21201413427561838, - "grad_norm": 0.5848472118377686, - "learning_rate": 2.8727915194346293e-05, - "loss": 0.0834, + "epoch": 0.10603382984094925, + "grad_norm": 1.163271188735962, + "learning_rate": 2.874e-05, + "loss": 0.0861, "step": 420 }, { - "epoch": 0.21706208985360928, - "grad_norm": 0.5747645497322083, - "learning_rate": 2.8697627460878346e-05, - "loss": 0.0745, + "epoch": 0.10855844483716233, + "grad_norm": 1.102964997291565, + "learning_rate": 2.871e-05, + "loss": 0.0962, "step": 430 }, { - "epoch": 0.2221100454316002, - "grad_norm": 1.058206558227539, - "learning_rate": 2.86673397274104e-05, - "loss": 0.0767, + "epoch": 0.11108305983337541, + "grad_norm": 1.520044207572937, + "learning_rate": 2.868e-05, + "loss": 0.0981, "step": 440 }, { - "epoch": 0.2271580010095911, - "grad_norm": 0.8267918825149536, - "learning_rate": 2.8637051993942453e-05, - "loss": 0.0893, + "epoch": 0.11360767482958849, + "grad_norm": 1.8637338876724243, + "learning_rate": 2.865e-05, + "loss": 0.0885, + "step": 450 + }, + { + "epoch": 0.11360767482958849, + "eval_f1": NaN, + "eval_loss": 0.09174469113349915, + "eval_runtime": 1032.8967, + "eval_samples_per_second": 199.693, + "eval_steps_per_second": 3.12, "step": 450 }, { - "epoch": 0.23220595658758203, - "grad_norm": 1.1392240524291992, - "learning_rate": 2.8606764260474507e-05, - "loss": 0.0833, + "epoch": 0.11613228982580157, + "grad_norm": 1.1974824666976929, + "learning_rate": 2.862e-05, + "loss": 0.0784, "step": 460 }, { - "epoch": 0.23725391216557296, - "grad_norm": 0.9474436044692993, - "learning_rate": 2.8576476527006564e-05, - "loss": 0.0896, + "epoch": 0.11865690482201464, + "grad_norm": 1.6933320760726929, + "learning_rate": 2.859e-05, + "loss": 0.078, "step": 470 }, { - "epoch": 0.24230186774356385, - "grad_norm": 1.2880048751831055, - "learning_rate": 2.8546188793538617e-05, - "loss": 0.0924, + "epoch": 0.12118151981822772, + "grad_norm": 1.7774609327316284, + "learning_rate": 2.856e-05, + "loss": 0.0715, "step": 480 }, { - "epoch": 0.24734982332155478, - "grad_norm": 0.6342403888702393, - "learning_rate": 2.851590106007067e-05, - "loss": 0.0799, + "epoch": 0.1237061348144408, + "grad_norm": 0.7675666213035583, + "learning_rate": 2.853e-05, + "loss": 0.0817, "step": 490 }, { - "epoch": 0.2523977788995457, - "grad_norm": 0.5780256986618042, - "learning_rate": 2.8485613326602727e-05, - "loss": 0.0798, + "epoch": 0.12623074981065388, + "grad_norm": 1.169325590133667, + "learning_rate": 2.8499999999999998e-05, + "loss": 0.102, "step": 500 }, { - "epoch": 0.2574457344775366, - "grad_norm": 0.7743504643440247, - "learning_rate": 2.845532559313478e-05, - "loss": 0.0681, + "epoch": 0.12623074981065388, + "eval_f1": 0.5648781658864481, + "eval_loss": 0.057994671165943146, + "eval_runtime": 967.2924, + "eval_samples_per_second": 213.236, + "eval_steps_per_second": 3.332, + "step": 500 + }, + { + "epoch": 0.12875536480686695, + "grad_norm": 0.9567933678627014, + "learning_rate": 2.847e-05, + "loss": 0.0762, "step": 510 }, { - "epoch": 0.26249369005552753, - "grad_norm": 0.5771861672401428, - "learning_rate": 2.8425037859666834e-05, - "loss": 0.0753, + "epoch": 0.13127997980308004, + "grad_norm": 0.7539889216423035, + "learning_rate": 2.844e-05, + "loss": 0.0655, "step": 520 }, { - "epoch": 0.2675416456335184, - "grad_norm": 0.6735575199127197, - "learning_rate": 2.839475012619889e-05, - "loss": 0.0773, + "epoch": 0.1338045947992931, + "grad_norm": 1.873833179473877, + "learning_rate": 2.841e-05, + "loss": 0.0747, "step": 530 }, { - "epoch": 0.2725896012115093, - "grad_norm": 0.7692667841911316, - "learning_rate": 2.8364462392730945e-05, - "loss": 0.0732, + "epoch": 0.1363292097955062, + "grad_norm": 0.7834559082984924, + "learning_rate": 2.838e-05, + "loss": 0.0923, "step": 540 }, { - "epoch": 0.27763755678950025, - "grad_norm": 0.5109196901321411, - "learning_rate": 2.8334174659263e-05, - "loss": 0.0859, + "epoch": 0.13885382479171926, + "grad_norm": 0.6193771362304688, + "learning_rate": 2.8349999999999998e-05, + "loss": 0.0716, "step": 550 }, { - "epoch": 0.2826855123674912, - "grad_norm": 0.726249098777771, - "learning_rate": 2.8303886925795055e-05, - "loss": 0.0801, + "epoch": 0.13885382479171926, + "eval_f1": 0.538135593220339, + "eval_loss": 0.07973095029592514, + "eval_runtime": 974.1593, + "eval_samples_per_second": 211.733, + "eval_steps_per_second": 3.308, + "step": 550 + }, + { + "epoch": 0.14137843978793235, + "grad_norm": 1.1256766319274902, + "learning_rate": 2.832e-05, + "loss": 0.0798, "step": 560 }, { - "epoch": 0.2877334679454821, - "grad_norm": 0.8817322254180908, - "learning_rate": 2.8273599192327108e-05, - "loss": 0.0739, + "epoch": 0.14390305478414542, + "grad_norm": 1.0669515132904053, + "learning_rate": 2.829e-05, + "loss": 0.0795, "step": 570 }, { - "epoch": 0.292781423523473, - "grad_norm": 0.5081413984298706, - "learning_rate": 2.8243311458859162e-05, - "loss": 0.0727, + "epoch": 0.14642766978035848, + "grad_norm": 1.018234133720398, + "learning_rate": 2.826e-05, + "loss": 0.073, "step": 580 }, { - "epoch": 0.2978293791014639, - "grad_norm": 0.9367203712463379, - "learning_rate": 2.8213023725391215e-05, - "loss": 0.0751, + "epoch": 0.14895228477657158, + "grad_norm": 1.2367616891860962, + "learning_rate": 2.823e-05, + "loss": 0.0879, "step": 590 }, { - "epoch": 0.3028773346794548, - "grad_norm": 0.5382592678070068, - "learning_rate": 2.8182735991923272e-05, - "loss": 0.0756, + "epoch": 0.15147689977278464, + "grad_norm": 1.5840317010879517, + "learning_rate": 2.8199999999999998e-05, + "loss": 0.0854, + "step": 600 + }, + { + "epoch": 0.15147689977278464, + "eval_f1": 0.571752762018513, + "eval_loss": 0.07439474016427994, + "eval_runtime": 970.5653, + "eval_samples_per_second": 212.517, + "eval_steps_per_second": 3.321, "step": 600 }, { - "epoch": 0.30792529025744575, - "grad_norm": 0.40977007150650024, - "learning_rate": 2.8152448258455325e-05, - "loss": 0.0714, + "epoch": 0.15400151476899773, + "grad_norm": 0.5361483097076416, + "learning_rate": 2.817e-05, + "loss": 0.0854, "step": 610 }, { - "epoch": 0.3129732458354367, - "grad_norm": 0.6829769015312195, - "learning_rate": 2.812216052498738e-05, - "loss": 0.0809, + "epoch": 0.1565261297652108, + "grad_norm": 0.9658698439598083, + "learning_rate": 2.8139999999999998e-05, + "loss": 0.095, "step": 620 }, { - "epoch": 0.31802120141342755, - "grad_norm": 0.4805002212524414, - "learning_rate": 2.8091872791519436e-05, - "loss": 0.0789, + "epoch": 0.1590507447614239, + "grad_norm": 0.820649266242981, + "learning_rate": 2.8110000000000004e-05, + "loss": 0.0921, "step": 630 }, { - "epoch": 0.32306915699141847, - "grad_norm": 0.6755364537239075, - "learning_rate": 2.806158505805149e-05, - "loss": 0.0819, + "epoch": 0.16157535975763695, + "grad_norm": 1.1583890914916992, + "learning_rate": 2.8080000000000002e-05, + "loss": 0.077, "step": 640 }, { - "epoch": 0.3281171125694094, - "grad_norm": 1.3035857677459717, - "learning_rate": 2.8031297324583546e-05, - "loss": 0.0861, + "epoch": 0.16409997475385005, + "grad_norm": 0.8755506277084351, + "learning_rate": 2.805e-05, + "loss": 0.089, "step": 650 }, { - "epoch": 0.3331650681474003, - "grad_norm": 0.7905831933021545, - "learning_rate": 2.80010095911156e-05, - "loss": 0.0739, + "epoch": 0.16409997475385005, + "eval_f1": 0.5789600675594161, + "eval_loss": 0.0503680482506752, + "eval_runtime": 976.7796, + "eval_samples_per_second": 211.165, + "eval_steps_per_second": 3.3, + "step": 650 + }, + { + "epoch": 0.1666245897500631, + "grad_norm": 0.5073147416114807, + "learning_rate": 2.8020000000000003e-05, + "loss": 0.0784, "step": 660 }, { - "epoch": 0.3382130237253912, - "grad_norm": 0.8810652494430542, - "learning_rate": 2.7970721857647653e-05, - "loss": 0.0678, + "epoch": 0.1691492047462762, + "grad_norm": 1.0332393646240234, + "learning_rate": 2.799e-05, + "loss": 0.0906, "step": 670 }, { - "epoch": 0.3432609793033821, - "grad_norm": 1.1220252513885498, - "learning_rate": 2.794043412417971e-05, - "loss": 0.07, + "epoch": 0.17167381974248927, + "grad_norm": 1.1538151502609253, + "learning_rate": 2.7960000000000003e-05, + "loss": 0.0799, "step": 680 }, { - "epoch": 0.34830893488137304, - "grad_norm": 0.8519473075866699, - "learning_rate": 2.7910146390711763e-05, - "loss": 0.076, + "epoch": 0.17419843473870233, + "grad_norm": 1.2075843811035156, + "learning_rate": 2.7930000000000002e-05, + "loss": 0.0795, "step": 690 }, { - "epoch": 0.35335689045936397, - "grad_norm": 0.49878937005996704, - "learning_rate": 2.787985865724382e-05, - "loss": 0.0787, + "epoch": 0.17672304973491543, + "grad_norm": 2.1169683933258057, + "learning_rate": 2.79e-05, + "loss": 0.0721, "step": 700 }, { - "epoch": 0.3584048460373549, - "grad_norm": 1.4854084253311157, - "learning_rate": 2.784957092377587e-05, - "loss": 0.0872, + "epoch": 0.17672304973491543, + "eval_f1": 0.5727175590644663, + "eval_loss": 0.0618172287940979, + "eval_runtime": 975.6558, + "eval_samples_per_second": 211.409, + "eval_steps_per_second": 3.303, + "step": 700 + }, + { + "epoch": 0.1792476647311285, + "grad_norm": 1.3094089031219482, + "learning_rate": 2.7870000000000003e-05, + "loss": 0.0723, "step": 710 }, { - "epoch": 0.36345280161534577, - "grad_norm": 0.787535548210144, - "learning_rate": 2.7819283190307924e-05, - "loss": 0.0805, + "epoch": 0.18177227972734158, + "grad_norm": 0.9937088489532471, + "learning_rate": 2.784e-05, + "loss": 0.0704, "step": 720 }, { - "epoch": 0.3685007571933367, - "grad_norm": 0.8322392106056213, - "learning_rate": 2.778899545683998e-05, - "loss": 0.0726, + "epoch": 0.18429689472355465, + "grad_norm": 0.6464220881462097, + "learning_rate": 2.7810000000000003e-05, + "loss": 0.0731, "step": 730 }, { - "epoch": 0.3735487127713276, - "grad_norm": 0.48470157384872437, - "learning_rate": 2.7758707723372034e-05, - "loss": 0.0673, + "epoch": 0.18682150971976774, + "grad_norm": 0.5544419288635254, + "learning_rate": 2.778e-05, + "loss": 0.0894, "step": 740 }, { - "epoch": 0.37859666834931854, - "grad_norm": 0.8375622034072876, - "learning_rate": 2.772841998990409e-05, - "loss": 0.0767, + "epoch": 0.1893461247159808, + "grad_norm": 0.6369556188583374, + "learning_rate": 2.7750000000000004e-05, + "loss": 0.0721, "step": 750 }, { - "epoch": 0.3836446239273094, - "grad_norm": 0.5212222337722778, - "learning_rate": 2.7698132256436144e-05, - "loss": 0.0737, + "epoch": 0.1893461247159808, + "eval_f1": 0.5904197411394702, + "eval_loss": 0.07033708691596985, + "eval_runtime": 967.5811, + "eval_samples_per_second": 213.173, + "eval_steps_per_second": 3.331, + "step": 750 + }, + { + "epoch": 0.1918707397121939, + "grad_norm": 2.0700013637542725, + "learning_rate": 2.7720000000000002e-05, + "loss": 0.0831, "step": 760 }, { - "epoch": 0.38869257950530034, - "grad_norm": 0.503209114074707, - "learning_rate": 2.7667844522968198e-05, - "loss": 0.0657, + "epoch": 0.19439535470840696, + "grad_norm": 0.765533983707428, + "learning_rate": 2.769e-05, + "loss": 0.0707, "step": 770 }, { - "epoch": 0.39374053508329127, - "grad_norm": 0.4290629029273987, - "learning_rate": 2.7637556789500254e-05, - "loss": 0.0745, + "epoch": 0.19691996970462006, + "grad_norm": 1.6104159355163574, + "learning_rate": 2.7660000000000003e-05, + "loss": 0.073, "step": 780 }, { - "epoch": 0.3987884906612822, - "grad_norm": 0.7535534501075745, - "learning_rate": 2.7607269056032308e-05, + "epoch": 0.19944458470083312, + "grad_norm": 1.1069729328155518, + "learning_rate": 2.763e-05, "loss": 0.0702, "step": 790 }, { - "epoch": 0.4038364462392731, - "grad_norm": 0.67135089635849, - "learning_rate": 2.757698132256436e-05, - "loss": 0.0754, + "epoch": 0.2019691996970462, + "grad_norm": 1.6577630043029785, + "learning_rate": 2.7600000000000003e-05, + "loss": 0.0865, + "step": 800 + }, + { + "epoch": 0.2019691996970462, + "eval_f1": 0.5952780441035476, + "eval_loss": 0.058820515871047974, + "eval_runtime": 917.4267, + "eval_samples_per_second": 224.827, + "eval_steps_per_second": 3.513, "step": 800 }, { - "epoch": 0.408884401817264, - "grad_norm": 0.5307912230491638, - "learning_rate": 2.7546693589096418e-05, - "loss": 0.0717, + "epoch": 0.20449381469325928, + "grad_norm": 1.5197840929031372, + "learning_rate": 2.7570000000000002e-05, + "loss": 0.0846, "step": 810 }, { - "epoch": 0.4139323573952549, - "grad_norm": 0.46130767464637756, - "learning_rate": 2.751640585562847e-05, - "loss": 0.065, + "epoch": 0.20701842968947234, + "grad_norm": 1.1758556365966797, + "learning_rate": 2.754e-05, + "loss": 0.0813, "step": 820 }, { - "epoch": 0.41898031297324584, - "grad_norm": 1.2904905080795288, - "learning_rate": 2.748611812216053e-05, - "loss": 0.0818, + "epoch": 0.20954304468568544, + "grad_norm": 0.5016022324562073, + "learning_rate": 2.7510000000000003e-05, + "loss": 0.0718, "step": 830 }, { - "epoch": 0.42402826855123676, - "grad_norm": 2.0480494499206543, - "learning_rate": 2.745583038869258e-05, - "loss": 0.085, + "epoch": 0.2120676596818985, + "grad_norm": 1.3600627183914185, + "learning_rate": 2.748e-05, + "loss": 0.0942, "step": 840 }, { - "epoch": 0.4290762241292277, - "grad_norm": 0.5108308792114258, - "learning_rate": 2.7425542655224632e-05, - "loss": 0.0729, + "epoch": 0.2145922746781116, + "grad_norm": 0.6990534067153931, + "learning_rate": 2.7450000000000003e-05, + "loss": 0.0767, "step": 850 }, { - "epoch": 0.43412417970721856, - "grad_norm": 0.6915296912193298, - "learning_rate": 2.739525492175669e-05, - "loss": 0.071, + "epoch": 0.2145922746781116, + "eval_f1": 0.5918155918155918, + "eval_loss": 0.04372716695070267, + "eval_runtime": 913.4291, + "eval_samples_per_second": 225.811, + "eval_steps_per_second": 3.528, + "step": 850 + }, + { + "epoch": 0.21711688967432466, + "grad_norm": 1.0468288660049438, + "learning_rate": 2.7420000000000002e-05, + "loss": 0.0805, "step": 860 }, { - "epoch": 0.4391721352852095, - "grad_norm": 0.8100910782814026, - "learning_rate": 2.7364967188288742e-05, - "loss": 0.0667, + "epoch": 0.21964150467053775, + "grad_norm": 1.2046771049499512, + "learning_rate": 2.739e-05, + "loss": 0.0879, "step": 870 }, { - "epoch": 0.4442200908632004, - "grad_norm": 0.626818835735321, - "learning_rate": 2.73346794548208e-05, - "loss": 0.0695, + "epoch": 0.22216611966675082, + "grad_norm": 0.9044977426528931, + "learning_rate": 2.7360000000000002e-05, + "loss": 0.0597, "step": 880 }, { - "epoch": 0.44926804644119134, - "grad_norm": 0.673156201839447, - "learning_rate": 2.7304391721352853e-05, - "loss": 0.0793, + "epoch": 0.2246907346629639, + "grad_norm": 1.145572304725647, + "learning_rate": 2.733e-05, + "loss": 0.1007, "step": 890 }, { - "epoch": 0.4543160020191822, - "grad_norm": 0.5740798711776733, - "learning_rate": 2.7274103987884906e-05, - "loss": 0.0731, + "epoch": 0.22721534965917697, + "grad_norm": 1.058166742324829, + "learning_rate": 2.7300000000000003e-05, + "loss": 0.0773, "step": 900 }, { - "epoch": 0.45936395759717313, - "grad_norm": 0.744429349899292, - "learning_rate": 2.7243816254416963e-05, - "loss": 0.0743, + "epoch": 0.22721534965917697, + "eval_f1": 0.5956852791878172, + "eval_loss": 0.05675825849175453, + "eval_runtime": 923.1927, + "eval_samples_per_second": 223.422, + "eval_steps_per_second": 3.491, + "step": 900 + }, + { + "epoch": 0.22973996465539007, + "grad_norm": 0.7665570974349976, + "learning_rate": 2.727e-05, + "loss": 0.084, "step": 910 }, { - "epoch": 0.46441191317516406, - "grad_norm": 0.5837222933769226, - "learning_rate": 2.7213528520949016e-05, - "loss": 0.0747, + "epoch": 0.23226457965160313, + "grad_norm": 0.8884145021438599, + "learning_rate": 2.724e-05, + "loss": 0.0748, "step": 920 }, { - "epoch": 0.469459868753155, - "grad_norm": 0.500978410243988, - "learning_rate": 2.7183240787481073e-05, - "loss": 0.0753, + "epoch": 0.2347891946478162, + "grad_norm": 0.7132917046546936, + "learning_rate": 2.7210000000000002e-05, + "loss": 0.0861, "step": 930 }, { - "epoch": 0.4745078243311459, - "grad_norm": 1.0817604064941406, - "learning_rate": 2.7152953054013127e-05, - "loss": 0.0748, + "epoch": 0.2373138096440293, + "grad_norm": 1.3353750705718994, + "learning_rate": 2.718e-05, + "loss": 0.091, "step": 940 }, { - "epoch": 0.4795557799091368, - "grad_norm": 0.5821205377578735, - "learning_rate": 2.712266532054518e-05, - "loss": 0.0766, + "epoch": 0.23983842464024235, + "grad_norm": 1.216691255569458, + "learning_rate": 2.7150000000000003e-05, + "loss": 0.0748, + "step": 950 + }, + { + "epoch": 0.23983842464024235, + "eval_f1": 0.5942299042601041, + "eval_loss": 0.04645048826932907, + "eval_runtime": 919.478, + "eval_samples_per_second": 224.325, + "eval_steps_per_second": 3.505, "step": 950 }, { - "epoch": 0.4846037354871277, - "grad_norm": 0.6120801568031311, - "learning_rate": 2.7092377587077233e-05, - "loss": 0.0827, + "epoch": 0.24236303963645545, + "grad_norm": 1.0420501232147217, + "learning_rate": 2.712e-05, + "loss": 0.0953, "step": 960 }, { - "epoch": 0.48965169106511863, - "grad_norm": 0.4379239082336426, - "learning_rate": 2.7062089853609287e-05, - "loss": 0.0664, + "epoch": 0.2448876546326685, + "grad_norm": 1.1488158702850342, + "learning_rate": 2.709e-05, + "loss": 0.0796, "step": 970 }, { - "epoch": 0.49469964664310956, - "grad_norm": 0.5472243428230286, - "learning_rate": 2.7031802120141344e-05, - "loss": 0.0767, + "epoch": 0.2474122696288816, + "grad_norm": 0.7872379422187805, + "learning_rate": 2.7060000000000002e-05, + "loss": 0.0844, "step": 980 }, { - "epoch": 0.49974760222110043, - "grad_norm": 1.0190905332565308, - "learning_rate": 2.7001514386673397e-05, - "loss": 0.0739, + "epoch": 0.24993688462509467, + "grad_norm": 0.9102885127067566, + "learning_rate": 2.703e-05, + "loss": 0.0792, "step": 990 }, { - "epoch": 0.5047955577990914, - "grad_norm": 0.7046610713005066, - "learning_rate": 2.697122665320545e-05, - "loss": 0.0685, + "epoch": 0.25246149962130776, + "grad_norm": 1.040650486946106, + "learning_rate": 2.7000000000000002e-05, + "loss": 0.0761, "step": 1000 }, { - "epoch": 0.5098435133770823, - "grad_norm": 0.5559498071670532, - "learning_rate": 2.6940938919737507e-05, - "loss": 0.0715, + "epoch": 0.25246149962130776, + "eval_f1": NaN, + "eval_loss": 0.06595388799905777, + "eval_runtime": 948.4123, + "eval_samples_per_second": 217.481, + "eval_steps_per_second": 3.398, + "step": 1000 + }, + { + "epoch": 0.25498611461752085, + "grad_norm": 1.0717836618423462, + "learning_rate": 2.697e-05, + "loss": 0.0569, "step": 1010 }, { - "epoch": 0.5148914689550732, - "grad_norm": 0.6298381686210632, - "learning_rate": 2.691065118626956e-05, - "loss": 0.0828, + "epoch": 0.2575107296137339, + "grad_norm": 0.7504699230194092, + "learning_rate": 2.6940000000000003e-05, + "loss": 0.072, "step": 1020 }, { - "epoch": 0.5199394245330641, - "grad_norm": 0.7023555636405945, - "learning_rate": 2.6880363452801618e-05, - "loss": 0.0809, + "epoch": 0.260035344609947, + "grad_norm": 0.9767778515815735, + "learning_rate": 2.691e-05, + "loss": 0.0658, "step": 1030 }, { - "epoch": 0.5249873801110551, - "grad_norm": 0.6804683804512024, - "learning_rate": 2.685007571933367e-05, - "loss": 0.0739, + "epoch": 0.2625599596061601, + "grad_norm": 0.5905674695968628, + "learning_rate": 2.688e-05, + "loss": 0.0775, "step": 1040 }, { - "epoch": 0.5300353356890459, - "grad_norm": 0.7743015885353088, - "learning_rate": 2.6819787985865725e-05, - "loss": 0.0658, + "epoch": 0.2650845746023731, + "grad_norm": 1.6352293491363525, + "learning_rate": 2.6850000000000002e-05, + "loss": 0.0855, "step": 1050 }, { - "epoch": 0.5350832912670368, - "grad_norm": 1.36810302734375, - "learning_rate": 2.678950025239778e-05, - "loss": 0.0747, + "epoch": 0.2650845746023731, + "eval_f1": 0.5963938973647711, + "eval_loss": 0.04910014942288399, + "eval_runtime": 986.8376, + "eval_samples_per_second": 209.013, + "eval_steps_per_second": 3.266, + "step": 1050 + }, + { + "epoch": 0.2676091895985862, + "grad_norm": 0.6634190082550049, + "learning_rate": 2.682e-05, + "loss": 0.0741, "step": 1060 }, { - "epoch": 0.5401312468450278, - "grad_norm": 0.47373896837234497, - "learning_rate": 2.6759212518929835e-05, - "loss": 0.0751, + "epoch": 0.2701338045947993, + "grad_norm": 0.5896914601325989, + "learning_rate": 2.6790000000000003e-05, + "loss": 0.0713, "step": 1070 }, { - "epoch": 0.5451792024230186, - "grad_norm": 0.6654021143913269, - "learning_rate": 2.6728924785461892e-05, - "loss": 0.0683, + "epoch": 0.2726584195910124, + "grad_norm": 1.3768564462661743, + "learning_rate": 2.676e-05, + "loss": 0.0684, "step": 1080 }, { - "epoch": 0.5502271580010096, - "grad_norm": 1.0054854154586792, - "learning_rate": 2.6698637051993942e-05, - "loss": 0.0676, + "epoch": 0.27518303458722543, + "grad_norm": 0.7323074340820312, + "learning_rate": 2.673e-05, + "loss": 0.084, "step": 1090 }, { - "epoch": 0.5552751135790005, - "grad_norm": 0.5544041395187378, - "learning_rate": 2.6668349318525995e-05, - "loss": 0.075, + "epoch": 0.2777076495834385, + "grad_norm": 0.6660707592964172, + "learning_rate": 2.6700000000000002e-05, + "loss": 0.0832, "step": 1100 }, { - "epoch": 0.5603230691569914, - "grad_norm": 0.6919006109237671, - "learning_rate": 2.6638061585058052e-05, - "loss": 0.0709, + "epoch": 0.2777076495834385, + "eval_f1": 0.6048397002825205, + "eval_loss": 0.049847185611724854, + "eval_runtime": 967.6797, + "eval_samples_per_second": 213.151, + "eval_steps_per_second": 3.331, + "step": 1100 + }, + { + "epoch": 0.2802322645796516, + "grad_norm": 1.425309419631958, + "learning_rate": 2.667e-05, + "loss": 0.0793, "step": 1110 }, { - "epoch": 0.5653710247349824, - "grad_norm": 0.5584747791290283, - "learning_rate": 2.6607773851590106e-05, - "loss": 0.0623, + "epoch": 0.2827568795758647, + "grad_norm": 1.3583918809890747, + "learning_rate": 2.6640000000000002e-05, + "loss": 0.0808, "step": 1120 }, { - "epoch": 0.5704189803129732, - "grad_norm": 0.47064319252967834, - "learning_rate": 2.657748611812216e-05, - "loss": 0.0744, + "epoch": 0.28528149457207774, + "grad_norm": 1.1851533651351929, + "learning_rate": 2.661e-05, + "loss": 0.0738, "step": 1130 }, { - "epoch": 0.5754669358909642, - "grad_norm": 0.5119986534118652, - "learning_rate": 2.6547198384654216e-05, - "loss": 0.0795, + "epoch": 0.28780610956829084, + "grad_norm": 1.4497005939483643, + "learning_rate": 2.658e-05, + "loss": 0.078, "step": 1140 }, { - "epoch": 0.5805148914689551, - "grad_norm": 0.9572923183441162, - "learning_rate": 2.651691065118627e-05, - "loss": 0.073, + "epoch": 0.29033072456450393, + "grad_norm": 1.4407027959823608, + "learning_rate": 2.655e-05, + "loss": 0.0821, + "step": 1150 + }, + { + "epoch": 0.29033072456450393, + "eval_f1": 0.6031633616619453, + "eval_loss": 0.059650588780641556, + "eval_runtime": 962.0892, + "eval_samples_per_second": 214.39, + "eval_steps_per_second": 3.35, "step": 1150 }, { - "epoch": 0.585562847046946, - "grad_norm": 0.5633489489555359, - "learning_rate": 2.6486622917718326e-05, - "loss": 0.0637, + "epoch": 0.29285533956071697, + "grad_norm": 1.0721668004989624, + "learning_rate": 2.652e-05, + "loss": 0.0706, "step": 1160 }, { - "epoch": 0.5906108026249369, - "grad_norm": 1.1218105554580688, - "learning_rate": 2.645633518425038e-05, - "loss": 0.0695, + "epoch": 0.29537995455693006, + "grad_norm": 1.1033729314804077, + "learning_rate": 2.6490000000000002e-05, + "loss": 0.0737, "step": 1170 }, { - "epoch": 0.5956587582029278, - "grad_norm": 0.6655285954475403, - "learning_rate": 2.6426047450782433e-05, - "loss": 0.0774, + "epoch": 0.29790456955314315, + "grad_norm": 0.9764577746391296, + "learning_rate": 2.646e-05, + "loss": 0.0743, "step": 1180 }, { - "epoch": 0.6007067137809188, - "grad_norm": 1.3088024854660034, - "learning_rate": 2.639575971731449e-05, - "loss": 0.0748, + "epoch": 0.30042918454935624, + "grad_norm": 1.2160297632217407, + "learning_rate": 2.643e-05, + "loss": 0.0768, "step": 1190 }, { - "epoch": 0.6057546693589096, - "grad_norm": 0.9868513941764832, - "learning_rate": 2.6365471983846543e-05, - "loss": 0.0695, + "epoch": 0.3029537995455693, + "grad_norm": 0.8387085795402527, + "learning_rate": 2.64e-05, + "loss": 0.0715, + "step": 1200 + }, + { + "epoch": 0.3029537995455693, + "eval_f1": NaN, + "eval_loss": 0.06428094953298569, + "eval_runtime": 961.1037, + "eval_samples_per_second": 214.61, + "eval_steps_per_second": 3.353, "step": 1200 }, { - "epoch": 0.6108026249369005, - "grad_norm": 0.5922626852989197, - "learning_rate": 2.63351842503786e-05, - "loss": 0.0678, + "epoch": 0.3054784145417824, + "grad_norm": 1.061087727546692, + "learning_rate": 2.637e-05, + "loss": 0.0672, "step": 1210 }, { - "epoch": 0.6158505805148915, - "grad_norm": 0.6839954257011414, - "learning_rate": 2.630489651691065e-05, - "loss": 0.0693, + "epoch": 0.30800302953799547, + "grad_norm": 0.6768150925636292, + "learning_rate": 2.6340000000000002e-05, + "loss": 0.0762, "step": 1220 }, { - "epoch": 0.6208985360928824, - "grad_norm": 0.6755519509315491, - "learning_rate": 2.6274608783442704e-05, - "loss": 0.0742, + "epoch": 0.31052764453420856, + "grad_norm": 0.7020296454429626, + "learning_rate": 2.631e-05, + "loss": 0.0838, "step": 1230 }, { - "epoch": 0.6259464916708734, - "grad_norm": 0.4968509078025818, - "learning_rate": 2.624432104997476e-05, - "loss": 0.0615, + "epoch": 0.3130522595304216, + "grad_norm": 0.9264736175537109, + "learning_rate": 2.628e-05, + "loss": 0.0769, "step": 1240 }, { - "epoch": 0.6309944472488642, - "grad_norm": 1.1036404371261597, - "learning_rate": 2.6214033316506814e-05, - "loss": 0.0727, + "epoch": 0.3155768745266347, + "grad_norm": 0.657778799533844, + "learning_rate": 2.625e-05, + "loss": 0.085, "step": 1250 }, { - "epoch": 0.6360424028268551, - "grad_norm": 0.810405969619751, - "learning_rate": 2.618374558303887e-05, - "loss": 0.072, + "epoch": 0.3155768745266347, + "eval_f1": 0.6054250016184373, + "eval_loss": 0.06593530625104904, + "eval_runtime": 970.0262, + "eval_samples_per_second": 212.635, + "eval_steps_per_second": 3.323, + "step": 1250 + }, + { + "epoch": 0.3181014895228478, + "grad_norm": 0.6904731392860413, + "learning_rate": 2.622e-05, + "loss": 0.0736, "step": 1260 }, { - "epoch": 0.6410903584048461, - "grad_norm": 0.730140209197998, - "learning_rate": 2.6153457849570924e-05, - "loss": 0.0652, + "epoch": 0.3206261045190608, + "grad_norm": 1.4745820760726929, + "learning_rate": 2.619e-05, + "loss": 0.0832, "step": 1270 }, { - "epoch": 0.6461383139828369, - "grad_norm": 1.1645480394363403, - "learning_rate": 2.6123170116102978e-05, - "loss": 0.0716, + "epoch": 0.3231507195152739, + "grad_norm": 1.0614553689956665, + "learning_rate": 2.616e-05, + "loss": 0.0781, "step": 1280 }, { - "epoch": 0.6511862695608278, - "grad_norm": 0.8481037020683289, - "learning_rate": 2.6092882382635034e-05, - "loss": 0.0737, + "epoch": 0.325675334511487, + "grad_norm": 1.2228913307189941, + "learning_rate": 2.6130000000000002e-05, + "loss": 0.0872, "step": 1290 }, { - "epoch": 0.6562342251388188, - "grad_norm": 0.5972946882247925, - "learning_rate": 2.6062594649167088e-05, - "loss": 0.0704, + "epoch": 0.3281999495077001, + "grad_norm": 0.9905760288238525, + "learning_rate": 2.61e-05, + "loss": 0.0826, "step": 1300 }, { - "epoch": 0.6612821807168097, - "grad_norm": 0.6405556201934814, - "learning_rate": 2.6032306915699145e-05, - "loss": 0.0628, + "epoch": 0.3281999495077001, + "eval_f1": 0.6011740745177908, + "eval_loss": 0.05560224503278732, + "eval_runtime": 964.1962, + "eval_samples_per_second": 213.921, + "eval_steps_per_second": 3.343, + "step": 1300 + }, + { + "epoch": 0.33072456450391313, + "grad_norm": 1.1195616722106934, + "learning_rate": 2.607e-05, + "loss": 0.0751, "step": 1310 }, { - "epoch": 0.6663301362948006, - "grad_norm": 0.8645715117454529, - "learning_rate": 2.6002019182231198e-05, - "loss": 0.0742, + "epoch": 0.3332491795001262, + "grad_norm": 0.9830445647239685, + "learning_rate": 2.604e-05, + "loss": 0.0694, "step": 1320 }, { - "epoch": 0.6713780918727915, - "grad_norm": 1.4211089611053467, - "learning_rate": 2.597173144876325e-05, - "loss": 0.0731, + "epoch": 0.3357737944963393, + "grad_norm": 1.7140698432922363, + "learning_rate": 2.601e-05, + "loss": 0.0694, "step": 1330 }, { - "epoch": 0.6764260474507824, - "grad_norm": 0.8079481720924377, - "learning_rate": 2.594144371529531e-05, - "loss": 0.0732, + "epoch": 0.3382984094925524, + "grad_norm": 0.9545607566833496, + "learning_rate": 2.5980000000000002e-05, + "loss": 0.0626, "step": 1340 }, { - "epoch": 0.6814740030287734, - "grad_norm": 0.6517273783683777, - "learning_rate": 2.591115598182736e-05, - "loss": 0.0688, + "epoch": 0.34082302448876545, + "grad_norm": 0.8236456513404846, + "learning_rate": 2.595e-05, + "loss": 0.064, "step": 1350 }, { - "epoch": 0.6865219586067642, - "grad_norm": 1.2093323469161987, - "learning_rate": 2.5880868248359415e-05, - "loss": 0.0729, + "epoch": 0.34082302448876545, + "eval_f1": NaN, + "eval_loss": 0.0564185306429863, + "eval_runtime": 1030.8018, + "eval_samples_per_second": 200.099, + "eval_steps_per_second": 3.127, + "step": 1350 + }, + { + "epoch": 0.34334763948497854, + "grad_norm": 1.0344712734222412, + "learning_rate": 2.592e-05, + "loss": 0.074, "step": 1360 }, { - "epoch": 0.6915699141847552, - "grad_norm": 0.6432307362556458, - "learning_rate": 2.585058051489147e-05, - "loss": 0.076, + "epoch": 0.34587225448119163, + "grad_norm": 1.647894024848938, + "learning_rate": 2.589e-05, + "loss": 0.0756, "step": 1370 }, { - "epoch": 0.6966178697627461, - "grad_norm": 0.5220794677734375, - "learning_rate": 2.5820292781423522e-05, - "loss": 0.0702, + "epoch": 0.34839686947740467, + "grad_norm": 1.0268642902374268, + "learning_rate": 2.586e-05, + "loss": 0.064, "step": 1380 }, { - "epoch": 0.701665825340737, - "grad_norm": 1.0983613729476929, - "learning_rate": 2.579000504795558e-05, - "loss": 0.0676, + "epoch": 0.35092148447361776, + "grad_norm": 0.6588199734687805, + "learning_rate": 2.5830000000000002e-05, + "loss": 0.0685, "step": 1390 }, { - "epoch": 0.7067137809187279, - "grad_norm": 0.859348475933075, - "learning_rate": 2.5759717314487633e-05, - "loss": 0.0615, + "epoch": 0.35344609946983085, + "grad_norm": 0.8278918862342834, + "learning_rate": 2.58e-05, + "loss": 0.0854, "step": 1400 }, { - "epoch": 0.7117617364967188, - "grad_norm": 0.7912864685058594, - "learning_rate": 2.572942958101969e-05, - "loss": 0.0681, + "epoch": 0.35344609946983085, + "eval_f1": NaN, + "eval_loss": 0.05516933649778366, + "eval_runtime": 1032.3177, + "eval_samples_per_second": 199.805, + "eval_steps_per_second": 3.122, + "step": 1400 + }, + { + "epoch": 0.35597071446604395, + "grad_norm": 0.4216013550758362, + "learning_rate": 2.577e-05, + "loss": 0.0785, "step": 1410 }, { - "epoch": 0.7168096920747098, - "grad_norm": 0.6189167499542236, - "learning_rate": 2.5699141847551743e-05, - "loss": 0.0682, + "epoch": 0.358495329462257, + "grad_norm": 0.9567118287086487, + "learning_rate": 2.574e-05, + "loss": 0.089, "step": 1420 }, { - "epoch": 0.7218576476527007, - "grad_norm": 0.5456287860870361, - "learning_rate": 2.5668854114083796e-05, - "loss": 0.0591, + "epoch": 0.3610199444584701, + "grad_norm": 1.3202637434005737, + "learning_rate": 2.571e-05, + "loss": 0.0884, "step": 1430 }, { - "epoch": 0.7269056032306915, - "grad_norm": 0.485055148601532, - "learning_rate": 2.5638566380615853e-05, - "loss": 0.0729, + "epoch": 0.36354455945468317, + "grad_norm": 1.3245704174041748, + "learning_rate": 2.568e-05, + "loss": 0.0739, "step": 1440 }, { - "epoch": 0.7319535588086825, - "grad_norm": 0.46423906087875366, - "learning_rate": 2.5608278647147907e-05, - "loss": 0.0646, + "epoch": 0.36606917445089626, + "grad_norm": 0.6416196823120117, + "learning_rate": 2.565e-05, + "loss": 0.0702, "step": 1450 }, { - "epoch": 0.7370015143866734, - "grad_norm": 0.5944865345954895, - "learning_rate": 2.557799091367996e-05, - "loss": 0.0696, + "epoch": 0.36606917445089626, + "eval_f1": 0.6061020319393525, + "eval_loss": 0.06748606264591217, + "eval_runtime": 999.3826, + "eval_samples_per_second": 206.389, + "eval_steps_per_second": 3.225, + "step": 1450 + }, + { + "epoch": 0.3685937894471093, + "grad_norm": 0.9312785267829895, + "learning_rate": 2.562e-05, + "loss": 0.0674, "step": 1460 }, { - "epoch": 0.7420494699646644, - "grad_norm": 0.794015645980835, - "learning_rate": 2.5547703180212014e-05, - "loss": 0.0671, + "epoch": 0.3711184044433224, + "grad_norm": 0.9092572927474976, + "learning_rate": 2.559e-05, + "loss": 0.0676, "step": 1470 }, { - "epoch": 0.7470974255426552, - "grad_norm": 0.6759900450706482, - "learning_rate": 2.5517415446744067e-05, - "loss": 0.074, + "epoch": 0.3736430194395355, + "grad_norm": 1.4935100078582764, + "learning_rate": 2.556e-05, + "loss": 0.0712, "step": 1480 }, { - "epoch": 0.7521453811206461, - "grad_norm": 0.6719480156898499, - "learning_rate": 2.5487127713276124e-05, - "loss": 0.0708, + "epoch": 0.3761676344357485, + "grad_norm": 0.9569060802459717, + "learning_rate": 2.553e-05, + "loss": 0.0747, "step": 1490 }, { - "epoch": 0.7571933366986371, - "grad_norm": 0.7934426665306091, - "learning_rate": 2.5456839979808177e-05, - "loss": 0.0664, + "epoch": 0.3786922494319616, + "grad_norm": 0.947384774684906, + "learning_rate": 2.55e-05, + "loss": 0.0771, + "step": 1500 + }, + { + "epoch": 0.3786922494319616, + "eval_f1": NaN, + "eval_loss": 0.057753585278987885, + "eval_runtime": 1000.0105, + "eval_samples_per_second": 206.26, + "eval_steps_per_second": 3.223, "step": 1500 }, { - "epoch": 0.762241292276628, - "grad_norm": 1.4169378280639648, - "learning_rate": 2.542655224634023e-05, - "loss": 0.0726, + "epoch": 0.3812168644281747, + "grad_norm": 0.6996080875396729, + "learning_rate": 2.547e-05, + "loss": 0.0696, "step": 1510 }, { - "epoch": 0.7672892478546188, - "grad_norm": 0.5849716067314148, - "learning_rate": 2.5396264512872288e-05, - "loss": 0.0709, + "epoch": 0.3837414794243878, + "grad_norm": 0.5415595173835754, + "learning_rate": 2.544e-05, + "loss": 0.0757, "step": 1520 }, { - "epoch": 0.7723372034326098, - "grad_norm": 0.8471559286117554, - "learning_rate": 2.536597677940434e-05, - "loss": 0.0764, + "epoch": 0.38626609442060084, + "grad_norm": 0.5137012600898743, + "learning_rate": 2.541e-05, + "loss": 0.0621, "step": 1530 }, { - "epoch": 0.7773851590106007, - "grad_norm": 0.7494149804115295, - "learning_rate": 2.5335689045936398e-05, - "loss": 0.0629, + "epoch": 0.38879070941681393, + "grad_norm": 0.9606865048408508, + "learning_rate": 2.538e-05, + "loss": 0.073, "step": 1540 }, { - "epoch": 0.7824331145885917, - "grad_norm": 0.7659397721290588, - "learning_rate": 2.530540131246845e-05, - "loss": 0.061, + "epoch": 0.391315324413027, + "grad_norm": 1.1751604080200195, + "learning_rate": 2.535e-05, + "loss": 0.08, + "step": 1550 + }, + { + "epoch": 0.391315324413027, + "eval_f1": NaN, + "eval_loss": 0.0491572804749012, + "eval_runtime": 963.3777, + "eval_samples_per_second": 214.103, + "eval_steps_per_second": 3.346, "step": 1550 }, { - "epoch": 0.7874810701665825, - "grad_norm": 0.8505954146385193, - "learning_rate": 2.5275113579000505e-05, - "loss": 0.0693, + "epoch": 0.3938399394092401, + "grad_norm": 0.935338020324707, + "learning_rate": 2.5319999999999998e-05, + "loss": 0.0729, "step": 1560 }, { - "epoch": 0.7925290257445734, - "grad_norm": 0.8126624226570129, - "learning_rate": 2.524482584553256e-05, - "loss": 0.0738, + "epoch": 0.39636455440545315, + "grad_norm": 0.7157814502716064, + "learning_rate": 2.529e-05, + "loss": 0.0719, "step": 1570 }, { - "epoch": 0.7975769813225644, - "grad_norm": 0.9350792765617371, - "learning_rate": 2.5214538112064615e-05, - "loss": 0.0821, + "epoch": 0.39888916940166624, + "grad_norm": 0.6739543676376343, + "learning_rate": 2.526e-05, + "loss": 0.0631, "step": 1580 }, { - "epoch": 0.8026249369005553, - "grad_norm": 1.075035810470581, - "learning_rate": 2.5184250378596672e-05, - "loss": 0.0758, + "epoch": 0.40141378439787934, + "grad_norm": 0.4896785318851471, + "learning_rate": 2.523e-05, + "loss": 0.0746, "step": 1590 }, { - "epoch": 0.8076728924785462, - "grad_norm": 0.6885321736335754, - "learning_rate": 2.5153962645128722e-05, - "loss": 0.0641, + "epoch": 0.4039383993940924, + "grad_norm": 0.7619987726211548, + "learning_rate": 2.52e-05, + "loss": 0.0804, + "step": 1600 + }, + { + "epoch": 0.4039383993940924, + "eval_f1": 0.6111605289687482, + "eval_loss": 0.05378127843141556, + "eval_runtime": 958.466, + "eval_samples_per_second": 215.2, + "eval_steps_per_second": 3.363, "step": 1600 }, { - "epoch": 0.8127208480565371, - "grad_norm": 0.7702226042747498, - "learning_rate": 2.5123674911660775e-05, - "loss": 0.0642, + "epoch": 0.40646301439030547, + "grad_norm": 0.7464210987091064, + "learning_rate": 2.517e-05, + "loss": 0.0707, "step": 1610 }, { - "epoch": 0.817768803634528, - "grad_norm": 0.9809953570365906, - "learning_rate": 2.5093387178192832e-05, - "loss": 0.0759, + "epoch": 0.40898762938651856, + "grad_norm": 0.6707102656364441, + "learning_rate": 2.514e-05, + "loss": 0.0671, "step": 1620 }, { - "epoch": 0.822816759212519, - "grad_norm": 0.5996444225311279, - "learning_rate": 2.5063099444724886e-05, - "loss": 0.0686, + "epoch": 0.41151224438273165, + "grad_norm": 1.246846079826355, + "learning_rate": 2.511e-05, + "loss": 0.0627, "step": 1630 }, { - "epoch": 0.8278647147905098, - "grad_norm": 0.5003983378410339, - "learning_rate": 2.5032811711256942e-05, - "loss": 0.0697, + "epoch": 0.4140368593789447, + "grad_norm": 0.9796457886695862, + "learning_rate": 2.508e-05, + "loss": 0.0677, "step": 1640 }, { - "epoch": 0.8329126703685008, - "grad_norm": 0.7024896740913391, - "learning_rate": 2.5002523977788996e-05, - "loss": 0.0699, + "epoch": 0.4165614743751578, + "grad_norm": 0.9717236161231995, + "learning_rate": 2.505e-05, + "loss": 0.083, "step": 1650 }, { - "epoch": 0.8379606259464917, - "grad_norm": 0.5384397506713867, - "learning_rate": 2.497223624432105e-05, - "loss": 0.0684, + "epoch": 0.4165614743751578, + "eval_f1": 0.6047686163965234, + "eval_loss": 0.057900335639715195, + "eval_runtime": 962.7843, + "eval_samples_per_second": 214.235, + "eval_steps_per_second": 3.348, + "step": 1650 + }, + { + "epoch": 0.4190860893713709, + "grad_norm": 1.1706446409225464, + "learning_rate": 2.502e-05, + "loss": 0.0764, "step": 1660 }, { - "epoch": 0.8430085815244825, - "grad_norm": 1.176849126815796, - "learning_rate": 2.4941948510853106e-05, - "loss": 0.065, + "epoch": 0.42161070436758397, + "grad_norm": 0.45280393958091736, + "learning_rate": 2.499e-05, + "loss": 0.0682, "step": 1670 }, { - "epoch": 0.8480565371024735, - "grad_norm": 0.7623859643936157, - "learning_rate": 2.491166077738516e-05, - "loss": 0.0676, + "epoch": 0.424135319363797, + "grad_norm": 1.0100760459899902, + "learning_rate": 2.4959999999999998e-05, + "loss": 0.0892, "step": 1680 }, { - "epoch": 0.8531044926804644, - "grad_norm": 0.8817411065101624, - "learning_rate": 2.4881373043917216e-05, - "loss": 0.0712, + "epoch": 0.4266599343600101, + "grad_norm": 1.0506736040115356, + "learning_rate": 2.493e-05, + "loss": 0.0666, "step": 1690 }, { - "epoch": 0.8581524482584554, - "grad_norm": 0.7471240162849426, - "learning_rate": 2.485108531044927e-05, - "loss": 0.0719, + "epoch": 0.4291845493562232, + "grad_norm": 0.7978639006614685, + "learning_rate": 2.49e-05, + "loss": 0.0701, + "step": 1700 + }, + { + "epoch": 0.4291845493562232, + "eval_f1": 0.6044656147662996, + "eval_loss": 0.06738731265068054, + "eval_runtime": 1102.7864, + "eval_samples_per_second": 187.037, + "eval_steps_per_second": 2.923, "step": 1700 }, { - "epoch": 0.8632004038364463, - "grad_norm": 0.9217013120651245, - "learning_rate": 2.4820797576981323e-05, - "loss": 0.0758, + "epoch": 0.4317091643524363, + "grad_norm": 1.121317982673645, + "learning_rate": 2.487e-05, + "loss": 0.0771, "step": 1710 }, { - "epoch": 0.8682483594144371, - "grad_norm": 0.4985320568084717, - "learning_rate": 2.479050984351338e-05, - "loss": 0.075, + "epoch": 0.4342337793486493, + "grad_norm": 1.0836131572723389, + "learning_rate": 2.484e-05, + "loss": 0.0719, "step": 1720 }, { - "epoch": 0.8732963149924281, - "grad_norm": 0.47823965549468994, - "learning_rate": 2.476022211004543e-05, - "loss": 0.0576, + "epoch": 0.4367583943448624, + "grad_norm": 0.61658775806427, + "learning_rate": 2.4809999999999998e-05, + "loss": 0.0681, "step": 1730 }, { - "epoch": 0.878344270570419, - "grad_norm": 0.5073914527893066, - "learning_rate": 2.4729934376577487e-05, - "loss": 0.0619, + "epoch": 0.4392830093410755, + "grad_norm": 0.647393524646759, + "learning_rate": 2.478e-05, + "loss": 0.0668, "step": 1740 }, { - "epoch": 0.8833922261484098, - "grad_norm": 0.6744971871376038, - "learning_rate": 2.469964664310954e-05, - "loss": 0.0674, + "epoch": 0.44180762433728854, + "grad_norm": 0.782483696937561, + "learning_rate": 2.475e-05, + "loss": 0.0721, "step": 1750 }, { - "epoch": 0.8884401817264008, - "grad_norm": 0.7287705540657043, - "learning_rate": 2.4669358909641594e-05, - "loss": 0.0705, - "step": 1760 + "epoch": 0.44180762433728854, + "eval_f1": 0.5979155238617663, + "eval_loss": 0.04912808537483215, + "eval_runtime": 1171.9033, + "eval_samples_per_second": 176.006, + "eval_steps_per_second": 2.75, + "step": 1750 }, { - "epoch": 0.8934881373043917, - "grad_norm": 0.6387834548950195, - "learning_rate": 2.463907117617365e-05, - "loss": 0.0736, + "epoch": 0.44433223933350163, + "grad_norm": 0.4538789987564087, + "learning_rate": 2.472e-05, + "loss": 0.0641, + "step": 1760 + }, + { + "epoch": 0.4468568543297147, + "grad_norm": 0.7954159379005432, + "learning_rate": 2.469e-05, + "loss": 0.079, "step": 1770 }, { - "epoch": 0.8985360928823827, - "grad_norm": 0.8428398370742798, - "learning_rate": 2.4608783442705704e-05, - "loss": 0.0741, + "epoch": 0.4493814693259278, + "grad_norm": 0.4370203912258148, + "learning_rate": 2.4659999999999998e-05, + "loss": 0.0769, "step": 1780 }, { - "epoch": 0.9035840484603735, - "grad_norm": 0.6455987691879272, - "learning_rate": 2.4578495709237758e-05, - "loss": 0.0639, + "epoch": 0.45190608432214086, + "grad_norm": 1.2641068696975708, + "learning_rate": 2.463e-05, + "loss": 0.0649, "step": 1790 }, { - "epoch": 0.9086320040383644, - "grad_norm": 0.6735292673110962, - "learning_rate": 2.4548207975769815e-05, - "loss": 0.0795, + "epoch": 0.45443069931835395, + "grad_norm": 1.262468695640564, + "learning_rate": 2.4599999999999998e-05, + "loss": 0.0765, + "step": 1800 + }, + { + "epoch": 0.45443069931835395, + "eval_f1": NaN, + "eval_loss": 0.04386861249804497, + "eval_runtime": 1166.6982, + "eval_samples_per_second": 176.791, + "eval_steps_per_second": 2.762, "step": 1800 }, { - "epoch": 0.9136799596163554, - "grad_norm": 0.6157563924789429, - "learning_rate": 2.4517920242301868e-05, - "loss": 0.0699, + "epoch": 0.45695531431456704, + "grad_norm": 1.0922938585281372, + "learning_rate": 2.457e-05, + "loss": 0.074, "step": 1810 }, { - "epoch": 0.9187279151943463, - "grad_norm": 0.7483514547348022, - "learning_rate": 2.4487632508833925e-05, - "loss": 0.0681, + "epoch": 0.45947992931078013, + "grad_norm": 0.8298421502113342, + "learning_rate": 2.454e-05, + "loss": 0.0778, "step": 1820 }, { - "epoch": 0.9237758707723372, - "grad_norm": 0.5686767101287842, - "learning_rate": 2.4457344775365978e-05, - "loss": 0.0713, + "epoch": 0.46200454430699317, + "grad_norm": 1.182712435722351, + "learning_rate": 2.4509999999999997e-05, + "loss": 0.0793, "step": 1830 }, { - "epoch": 0.9288238263503281, - "grad_norm": 0.352909654378891, - "learning_rate": 2.4427057041898032e-05, - "loss": 0.0641, + "epoch": 0.46452915930320626, + "grad_norm": 0.7366443276405334, + "learning_rate": 2.448e-05, + "loss": 0.0655, "step": 1840 }, { - "epoch": 0.933871781928319, - "grad_norm": 0.6095912456512451, - "learning_rate": 2.439676930843009e-05, - "loss": 0.0794, + "epoch": 0.46705377429941936, + "grad_norm": 0.9185643792152405, + "learning_rate": 2.4449999999999998e-05, + "loss": 0.0692, + "step": 1850 + }, + { + "epoch": 0.46705377429941936, + "eval_f1": 0.6057632592224568, + "eval_loss": 0.04681675508618355, + "eval_runtime": 1170.8075, + "eval_samples_per_second": 176.171, + "eval_steps_per_second": 2.753, "step": 1850 }, { - "epoch": 0.93891973750631, - "grad_norm": 0.3929665684700012, - "learning_rate": 2.436648157496214e-05, - "loss": 0.0672, + "epoch": 0.4695783892956324, + "grad_norm": 0.99897301197052, + "learning_rate": 2.442e-05, + "loss": 0.0685, "step": 1860 }, { - "epoch": 0.9439676930843008, - "grad_norm": 0.22026501595973969, - "learning_rate": 2.4336193841494195e-05, - "loss": 0.0699, + "epoch": 0.4721030042918455, + "grad_norm": 1.0028034448623657, + "learning_rate": 2.439e-05, + "loss": 0.0748, "step": 1870 }, { - "epoch": 0.9490156486622918, - "grad_norm": 0.5952547788619995, - "learning_rate": 2.430590610802625e-05, - "loss": 0.0733, + "epoch": 0.4746276192880586, + "grad_norm": 2.5226945877075195, + "learning_rate": 2.4360000000000004e-05, + "loss": 0.0715, "step": 1880 }, { - "epoch": 0.9540636042402827, - "grad_norm": 0.7297592163085938, - "learning_rate": 2.4275618374558302e-05, - "loss": 0.0725, + "epoch": 0.47715223428427167, + "grad_norm": 0.903256893157959, + "learning_rate": 2.4330000000000003e-05, + "loss": 0.0709, "step": 1890 }, { - "epoch": 0.9591115598182736, - "grad_norm": 0.35177797079086304, - "learning_rate": 2.424533064109036e-05, - "loss": 0.0651, + "epoch": 0.4796768492804847, + "grad_norm": 0.9269793629646301, + "learning_rate": 2.43e-05, + "loss": 0.0761, "step": 1900 }, { - "epoch": 0.9641595153962645, - "grad_norm": 0.6706666350364685, - "learning_rate": 2.4215042907622413e-05, - "loss": 0.0737, + "epoch": 0.4796768492804847, + "eval_f1": 0.6124984470120511, + "eval_loss": 0.05741230770945549, + "eval_runtime": 1159.3714, + "eval_samples_per_second": 177.908, + "eval_steps_per_second": 2.78, + "step": 1900 + }, + { + "epoch": 0.4822014642766978, + "grad_norm": 1.0651170015335083, + "learning_rate": 2.4270000000000003e-05, + "loss": 0.0751, "step": 1910 }, { - "epoch": 0.9692074709742554, - "grad_norm": 0.7155650854110718, - "learning_rate": 2.418475517415447e-05, - "loss": 0.074, + "epoch": 0.4847260792729109, + "grad_norm": 1.2628437280654907, + "learning_rate": 2.4240000000000002e-05, + "loss": 0.0852, "step": 1920 }, { - "epoch": 0.9742554265522464, - "grad_norm": 0.5200046300888062, - "learning_rate": 2.4154467440686523e-05, - "loss": 0.0706, + "epoch": 0.487250694269124, + "grad_norm": 1.3889621496200562, + "learning_rate": 2.4210000000000004e-05, + "loss": 0.073, "step": 1930 }, { - "epoch": 0.9793033821302373, - "grad_norm": 0.46796679496765137, - "learning_rate": 2.4124179707218576e-05, - "loss": 0.0592, + "epoch": 0.489775309265337, + "grad_norm": 1.028456687927246, + "learning_rate": 2.4180000000000002e-05, + "loss": 0.0644, "step": 1940 }, { - "epoch": 0.9843513377082281, - "grad_norm": 0.5713896751403809, - "learning_rate": 2.4093891973750633e-05, - "loss": 0.0586, + "epoch": 0.4922999242615501, + "grad_norm": 0.6997565627098083, + "learning_rate": 2.415e-05, + "loss": 0.0757, "step": 1950 }, { - "epoch": 0.9893992932862191, - "grad_norm": 0.9147453308105469, - "learning_rate": 2.4063604240282687e-05, - "loss": 0.0848, + "epoch": 0.4922999242615501, + "eval_f1": 0.6126181795711549, + "eval_loss": 0.05692484602332115, + "eval_runtime": 1161.9825, + "eval_samples_per_second": 177.509, + "eval_steps_per_second": 2.774, + "step": 1950 + }, + { + "epoch": 0.4948245392577632, + "grad_norm": 1.384186863899231, + "learning_rate": 2.4120000000000003e-05, + "loss": 0.0697, "step": 1960 }, { - "epoch": 0.99444724886421, - "grad_norm": 1.1067036390304565, - "learning_rate": 2.4033316506814744e-05, - "loss": 0.07, + "epoch": 0.49734915425397624, + "grad_norm": 0.8674394488334656, + "learning_rate": 2.409e-05, + "loss": 0.0739, "step": 1970 }, { - "epoch": 0.9994952044422009, - "grad_norm": 0.5658775568008423, - "learning_rate": 2.4003028773346797e-05, - "loss": 0.0594, + "epoch": 0.49987376925018934, + "grad_norm": 1.826121211051941, + "learning_rate": 2.4060000000000003e-05, + "loss": 0.0739, "step": 1980 }, { - "epoch": 1.0, - "eval_f1": 0.9705180789481339, - "eval_loss": 0.04397369921207428, - "eval_runtime": 594.1594, - "eval_samples_per_second": 347.149, - "eval_steps_per_second": 2.713, - "step": 1981 + "epoch": 0.5023983842464024, + "grad_norm": 0.6903666257858276, + "learning_rate": 2.4030000000000002e-05, + "loss": 0.0661, + "step": 1990 }, { - "epoch": 1.0045431600201917, - "grad_norm": 0.6783074736595154, - "learning_rate": 2.3972741039878847e-05, - "loss": 0.0783, - "step": 1990 + "epoch": 0.5049229992426155, + "grad_norm": 0.7339742183685303, + "learning_rate": 2.4e-05, + "loss": 0.0654, + "step": 2000 }, { - "epoch": 1.0095911155981827, - "grad_norm": 0.5741100311279297, - "learning_rate": 2.3942453306410904e-05, - "loss": 0.0612, + "epoch": 0.5049229992426155, + "eval_f1": 0.6095153739086423, + "eval_loss": 0.05489746853709221, + "eval_runtime": 1168.0449, + "eval_samples_per_second": 176.587, + "eval_steps_per_second": 2.759, "step": 2000 }, { - "epoch": 1.0146390711761737, - "grad_norm": 0.8516017198562622, - "learning_rate": 2.3912165572942957e-05, - "loss": 0.0654, + "epoch": 0.5074476142388286, + "grad_norm": 0.604178786277771, + "learning_rate": 2.3970000000000003e-05, + "loss": 0.0551, "step": 2010 }, { - "epoch": 1.0196870267541647, - "grad_norm": 0.48648303747177124, - "learning_rate": 2.3881877839475014e-05, - "loss": 0.0659, + "epoch": 0.5099722292350417, + "grad_norm": 0.6526350378990173, + "learning_rate": 2.394e-05, + "loss": 0.0734, "step": 2020 }, { - "epoch": 1.0247349823321554, - "grad_norm": 0.48170068860054016, - "learning_rate": 2.3851590106007068e-05, - "loss": 0.0687, + "epoch": 0.5124968442312547, + "grad_norm": 0.8096711039543152, + "learning_rate": 2.3910000000000003e-05, + "loss": 0.0724, "step": 2030 }, { - "epoch": 1.0297829379101464, - "grad_norm": 0.8060422539710999, - "learning_rate": 2.382130237253912e-05, - "loss": 0.0741, + "epoch": 0.5150214592274678, + "grad_norm": 1.262484073638916, + "learning_rate": 2.3880000000000002e-05, + "loss": 0.0949, "step": 2040 }, { - "epoch": 1.0348308934881374, - "grad_norm": 0.3721982538700104, - "learning_rate": 2.3791014639071178e-05, - "loss": 0.0643, + "epoch": 0.5175460742236809, + "grad_norm": 0.8815634846687317, + "learning_rate": 2.385e-05, + "loss": 0.0706, + "step": 2050 + }, + { + "epoch": 0.5175460742236809, + "eval_f1": 0.6041730781067275, + "eval_loss": 0.04226996377110481, + "eval_runtime": 1154.7019, + "eval_samples_per_second": 178.628, + "eval_steps_per_second": 2.791, "step": 2050 }, { - "epoch": 1.0398788490661282, - "grad_norm": 0.9289938807487488, - "learning_rate": 2.376072690560323e-05, - "loss": 0.0678, + "epoch": 0.520070689219894, + "grad_norm": 0.6676633954048157, + "learning_rate": 2.3820000000000002e-05, + "loss": 0.0831, "step": 2060 }, { - "epoch": 1.0449268046441191, - "grad_norm": 0.7339480519294739, - "learning_rate": 2.3730439172135288e-05, - "loss": 0.065, + "epoch": 0.522595304216107, + "grad_norm": 0.9431056976318359, + "learning_rate": 2.379e-05, + "loss": 0.0762, "step": 2070 }, { - "epoch": 1.0499747602221101, - "grad_norm": 0.5676091313362122, - "learning_rate": 2.370015143866734e-05, - "loss": 0.0665, + "epoch": 0.5251199192123202, + "grad_norm": 0.8600429892539978, + "learning_rate": 2.3760000000000003e-05, + "loss": 0.0674, "step": 2080 }, { - "epoch": 1.0550227158001009, - "grad_norm": 1.0972354412078857, - "learning_rate": 2.3669863705199395e-05, - "loss": 0.0664, + "epoch": 0.5276445342085332, + "grad_norm": 1.0786969661712646, + "learning_rate": 2.373e-05, + "loss": 0.0688, "step": 2090 }, { - "epoch": 1.0600706713780919, - "grad_norm": 1.11980402469635, - "learning_rate": 2.3639575971731452e-05, - "loss": 0.0742, + "epoch": 0.5301691492047462, + "grad_norm": 0.6463090181350708, + "learning_rate": 2.37e-05, + "loss": 0.0647, + "step": 2100 + }, + { + "epoch": 0.5301691492047462, + "eval_f1": 0.6056007895386134, + "eval_loss": 0.04632845148444176, + "eval_runtime": 1152.9435, + "eval_samples_per_second": 178.9, + "eval_steps_per_second": 2.795, "step": 2100 }, { - "epoch": 1.0651186269560828, - "grad_norm": 0.6586318016052246, - "learning_rate": 2.3609288238263502e-05, - "loss": 0.0755, + "epoch": 0.5326937642009594, + "grad_norm": 1.0313136577606201, + "learning_rate": 2.3670000000000002e-05, + "loss": 0.0673, "step": 2110 }, { - "epoch": 1.0701665825340738, - "grad_norm": 0.6912874579429626, - "learning_rate": 2.3579000504795555e-05, + "epoch": 0.5352183791971724, + "grad_norm": 1.19906485080719, + "learning_rate": 2.364e-05, "loss": 0.0722, "step": 2120 }, { - "epoch": 1.0752145381120646, - "grad_norm": 0.5603944659233093, - "learning_rate": 2.3548712771327612e-05, - "loss": 0.0636, + "epoch": 0.5377429941933856, + "grad_norm": 0.8951911926269531, + "learning_rate": 2.3610000000000003e-05, + "loss": 0.0801, "step": 2130 }, { - "epoch": 1.0802624936900556, - "grad_norm": 0.7324510216712952, - "learning_rate": 2.3518425037859666e-05, - "loss": 0.0697, + "epoch": 0.5402676091895986, + "grad_norm": 0.856438159942627, + "learning_rate": 2.358e-05, + "loss": 0.0664, "step": 2140 }, { - "epoch": 1.0853104492680465, - "grad_norm": 0.6833095550537109, - "learning_rate": 2.3488137304391723e-05, - "loss": 0.0678, + "epoch": 0.5427922241858116, + "grad_norm": 0.74139404296875, + "learning_rate": 2.3550000000000003e-05, + "loss": 0.0626, "step": 2150 }, { - "epoch": 1.0903584048460373, - "grad_norm": 0.49107661843299866, - "learning_rate": 2.3457849570923776e-05, - "loss": 0.0608, - "step": 2160 - }, - { - "epoch": 1.0954063604240283, - "grad_norm": 0.541980504989624, - "learning_rate": 2.342756183745583e-05, - "loss": 0.0645, - "step": 2170 - }, - { - "epoch": 1.1004543160020193, - "grad_norm": 0.487343966960907, - "learning_rate": 2.3397274103987886e-05, - "loss": 0.0573, - "step": 2180 - }, - { - "epoch": 1.10550227158001, - "grad_norm": 0.3503382205963135, - "learning_rate": 2.336698637051994e-05, - "loss": 0.0753, - "step": 2190 - }, - { - "epoch": 1.110550227158001, - "grad_norm": 0.750566840171814, - "learning_rate": 2.3336698637051997e-05, - "loss": 0.0703, - "step": 2200 - }, - { - "epoch": 1.115598182735992, - "grad_norm": 1.1437385082244873, - "learning_rate": 2.330641090358405e-05, - "loss": 0.0706, - "step": 2210 - }, - { - "epoch": 1.1206461383139827, - "grad_norm": 0.4508492648601532, - "learning_rate": 2.3276123170116103e-05, - "loss": 0.064, - "step": 2220 - }, - { - "epoch": 1.1256940938919737, - "grad_norm": 1.0053447484970093, - "learning_rate": 2.324583543664816e-05, - "loss": 0.0595, - "step": 2230 - }, - { - "epoch": 1.1307420494699647, - "grad_norm": 0.5974487662315369, - "learning_rate": 2.321554770318021e-05, - "loss": 0.0613, - "step": 2240 - }, - { - "epoch": 1.1357900050479555, - "grad_norm": 0.48302361369132996, - "learning_rate": 2.3185259969712267e-05, - "loss": 0.0553, - "step": 2250 - }, - { - "epoch": 1.1408379606259464, - "grad_norm": 0.7124462127685547, - "learning_rate": 2.315497223624432e-05, - "loss": 0.0628, - "step": 2260 - }, - { - "epoch": 1.1458859162039374, - "grad_norm": 0.8712441921234131, - "learning_rate": 2.3124684502776374e-05, - "loss": 0.066, - "step": 2270 - }, - { - "epoch": 1.1509338717819284, - "grad_norm": 0.7473580241203308, - "learning_rate": 2.309439676930843e-05, - "loss": 0.0687, - "step": 2280 - }, - { - "epoch": 1.1559818273599192, - "grad_norm": 0.8231186866760254, - "learning_rate": 2.3064109035840484e-05, - "loss": 0.0686, - "step": 2290 - }, - { - "epoch": 1.1610297829379101, - "grad_norm": 0.5205137729644775, - "learning_rate": 2.303382130237254e-05, - "loss": 0.0668, - "step": 2300 - }, - { - "epoch": 1.1660777385159011, - "grad_norm": 0.5173012614250183, - "learning_rate": 2.3003533568904595e-05, - "loss": 0.0664, - "step": 2310 - }, - { - "epoch": 1.171125694093892, - "grad_norm": 0.6976504325866699, - "learning_rate": 2.2973245835436648e-05, - "loss": 0.067, - "step": 2320 - }, - { - "epoch": 1.1761736496718829, - "grad_norm": 0.7795687317848206, - "learning_rate": 2.2942958101968705e-05, - "loss": 0.0591, - "step": 2330 - }, - { - "epoch": 1.1812216052498739, - "grad_norm": 0.35292479395866394, - "learning_rate": 2.291267036850076e-05, - "loss": 0.0721, - "step": 2340 - }, - { - "epoch": 1.1862695608278648, - "grad_norm": 1.548770546913147, - "learning_rate": 2.2882382635032815e-05, - "loss": 0.0608, - "step": 2350 - }, - { - "epoch": 1.1913175164058556, - "grad_norm": 0.521295964717865, - "learning_rate": 2.285209490156487e-05, - "loss": 0.0735, - "step": 2360 - }, - { - "epoch": 1.1963654719838466, - "grad_norm": 0.6001691818237305, - "learning_rate": 2.282180716809692e-05, - "loss": 0.0646, - "step": 2370 - }, - { - "epoch": 1.2014134275618376, - "grad_norm": 0.9061608910560608, - "learning_rate": 2.2791519434628976e-05, - "loss": 0.0598, - "step": 2380 - }, - { - "epoch": 1.2064613831398283, - "grad_norm": 0.6509453654289246, - "learning_rate": 2.276123170116103e-05, - "loss": 0.0591, - "step": 2390 - }, - { - "epoch": 1.2115093387178193, - "grad_norm": 0.4685826301574707, - "learning_rate": 2.2730943967693086e-05, - "loss": 0.0675, - "step": 2400 - }, - { - "epoch": 1.2165572942958103, - "grad_norm": 0.4527621865272522, - "learning_rate": 2.270065623422514e-05, - "loss": 0.0635, - "step": 2410 - }, - { - "epoch": 1.221605249873801, - "grad_norm": 0.46990010142326355, - "learning_rate": 2.2670368500757193e-05, - "loss": 0.0609, - "step": 2420 - }, - { - "epoch": 1.226653205451792, - "grad_norm": 0.7978981137275696, - "learning_rate": 2.264008076728925e-05, - "loss": 0.0682, - "step": 2430 - }, - { - "epoch": 1.231701161029783, - "grad_norm": 0.5001055598258972, - "learning_rate": 2.2609793033821303e-05, - "loss": 0.0657, - "step": 2440 - }, - { - "epoch": 1.2367491166077738, - "grad_norm": 0.7271714806556702, - "learning_rate": 2.2579505300353356e-05, - "loss": 0.0627, - "step": 2450 - }, - { - "epoch": 1.2417970721857647, - "grad_norm": 0.3601450026035309, - "learning_rate": 2.2549217566885413e-05, - "loss": 0.0649, - "step": 2460 - }, - { - "epoch": 1.2468450277637557, - "grad_norm": 0.6351629495620728, - "learning_rate": 2.2518929833417467e-05, - "loss": 0.0619, - "step": 2470 - }, - { - "epoch": 1.2518929833417465, - "grad_norm": 0.8523517847061157, - "learning_rate": 2.2488642099949524e-05, - "loss": 0.078, - "step": 2480 - }, - { - "epoch": 1.2569409389197375, - "grad_norm": 1.0878459215164185, - "learning_rate": 2.2458354366481577e-05, - "loss": 0.0636, - "step": 2490 - }, - { - "epoch": 1.2619888944977284, - "grad_norm": 0.6811727285385132, - "learning_rate": 2.2428066633013627e-05, - "loss": 0.0703, - "step": 2500 - }, - { - "epoch": 1.2670368500757192, - "grad_norm": 0.6043427586555481, - "learning_rate": 2.2397778899545684e-05, - "loss": 0.0587, - "step": 2510 - }, - { - "epoch": 1.2720848056537102, - "grad_norm": 0.6673144102096558, - "learning_rate": 2.2367491166077737e-05, - "loss": 0.0675, - "step": 2520 - }, - { - "epoch": 1.2771327612317012, - "grad_norm": 0.3510701358318329, - "learning_rate": 2.2337203432609794e-05, - "loss": 0.069, - "step": 2530 - }, - { - "epoch": 1.2821807168096921, - "grad_norm": 0.302438884973526, - "learning_rate": 2.2306915699141848e-05, - "loss": 0.0609, - "step": 2540 - }, - { - "epoch": 1.2872286723876831, - "grad_norm": 0.8073706030845642, - "learning_rate": 2.22766279656739e-05, - "loss": 0.076, - "step": 2550 - }, - { - "epoch": 1.2922766279656739, - "grad_norm": 0.7314086556434631, - "learning_rate": 2.2246340232205958e-05, - "loss": 0.0676, - "step": 2560 - }, - { - "epoch": 1.2973245835436649, - "grad_norm": 0.6998431086540222, - "learning_rate": 2.221605249873801e-05, - "loss": 0.0594, - "step": 2570 - }, - { - "epoch": 1.3023725391216558, - "grad_norm": 0.9340649843215942, - "learning_rate": 2.2185764765270068e-05, - "loss": 0.0601, - "step": 2580 - }, - { - "epoch": 1.3074204946996466, - "grad_norm": 0.5486651062965393, - "learning_rate": 2.215547703180212e-05, - "loss": 0.0752, - "step": 2590 - }, - { - "epoch": 1.3124684502776376, - "grad_norm": 0.3997117280960083, - "learning_rate": 2.2125189298334175e-05, - "loss": 0.0669, - "step": 2600 - }, - { - "epoch": 1.3175164058556286, - "grad_norm": 0.6159607172012329, - "learning_rate": 2.2094901564866232e-05, - "loss": 0.0646, - "step": 2610 - }, - { - "epoch": 1.3225643614336193, - "grad_norm": 1.0720511674880981, - "learning_rate": 2.2064613831398285e-05, - "loss": 0.0697, - "step": 2620 - }, - { - "epoch": 1.3276123170116103, - "grad_norm": 0.6496064066886902, - "learning_rate": 2.203432609793034e-05, - "loss": 0.0642, - "step": 2630 - }, - { - "epoch": 1.3326602725896013, - "grad_norm": 0.5649464726448059, - "learning_rate": 2.2004038364462392e-05, - "loss": 0.0596, - "step": 2640 - }, - { - "epoch": 1.337708228167592, - "grad_norm": 0.5532758235931396, - "learning_rate": 2.1973750630994446e-05, - "loss": 0.0651, - "step": 2650 - }, - { - "epoch": 1.342756183745583, - "grad_norm": 0.4955766797065735, - "learning_rate": 2.1943462897526503e-05, - "loss": 0.0661, - "step": 2660 - }, - { - "epoch": 1.347804139323574, - "grad_norm": 0.5403378009796143, - "learning_rate": 2.1913175164058556e-05, - "loss": 0.068, - "step": 2670 - }, - { - "epoch": 1.3528520949015648, - "grad_norm": 0.8987810015678406, - "learning_rate": 2.1882887430590613e-05, - "loss": 0.0551, - "step": 2680 - }, - { - "epoch": 1.3579000504795558, - "grad_norm": 0.5531570911407471, - "learning_rate": 2.1852599697122666e-05, - "loss": 0.0554, - "step": 2690 - }, - { - "epoch": 1.3629480060575467, - "grad_norm": 0.8810332417488098, - "learning_rate": 2.182231196365472e-05, - "loss": 0.0683, - "step": 2700 - }, - { - "epoch": 1.3679959616355375, - "grad_norm": 0.8977289199829102, - "learning_rate": 2.1792024230186777e-05, - "loss": 0.0682, - "step": 2710 - }, - { - "epoch": 1.3730439172135285, - "grad_norm": 0.6664491295814514, - "learning_rate": 2.176173649671883e-05, - "loss": 0.0652, - "step": 2720 - }, - { - "epoch": 1.3780918727915195, - "grad_norm": 0.7725427150726318, - "learning_rate": 2.1731448763250883e-05, - "loss": 0.0693, - "step": 2730 - }, - { - "epoch": 1.3831398283695102, - "grad_norm": 1.149824857711792, - "learning_rate": 2.170116102978294e-05, - "loss": 0.0697, - "step": 2740 - }, - { - "epoch": 1.3881877839475012, - "grad_norm": 0.8231659531593323, - "learning_rate": 2.167087329631499e-05, - "loss": 0.0586, - "step": 2750 - }, - { - "epoch": 1.3932357395254922, - "grad_norm": 0.5706813335418701, - "learning_rate": 2.1640585562847047e-05, - "loss": 0.0648, - "step": 2760 - }, - { - "epoch": 1.3982836951034832, - "grad_norm": 0.4602285623550415, - "learning_rate": 2.16102978293791e-05, - "loss": 0.0642, - "step": 2770 - }, - { - "epoch": 1.4033316506814741, - "grad_norm": 0.5022104978561401, - "learning_rate": 2.1580010095911154e-05, - "loss": 0.0582, - "step": 2780 - }, - { - "epoch": 1.408379606259465, - "grad_norm": 0.3675612211227417, - "learning_rate": 2.154972236244321e-05, - "loss": 0.0685, - "step": 2790 - }, - { - "epoch": 1.4134275618374559, - "grad_norm": 0.5692434906959534, - "learning_rate": 2.1519434628975264e-05, - "loss": 0.0625, - "step": 2800 - }, - { - "epoch": 1.4184755174154469, - "grad_norm": 0.44433364272117615, - "learning_rate": 2.148914689550732e-05, - "loss": 0.0683, - "step": 2810 - }, - { - "epoch": 1.4235234729934376, - "grad_norm": 0.5225184559822083, - "learning_rate": 2.1458859162039375e-05, - "loss": 0.0676, - "step": 2820 - }, - { - "epoch": 1.4285714285714286, - "grad_norm": 1.125475287437439, - "learning_rate": 2.1428571428571428e-05, - "loss": 0.0641, - "step": 2830 - }, - { - "epoch": 1.4336193841494196, - "grad_norm": 0.6783428192138672, - "learning_rate": 2.1398283695103485e-05, - "loss": 0.0735, - "step": 2840 - }, - { - "epoch": 1.4386673397274103, - "grad_norm": 0.6056823134422302, - "learning_rate": 2.136799596163554e-05, - "loss": 0.0607, - "step": 2850 - }, - { - "epoch": 1.4437152953054013, - "grad_norm": 0.7588714361190796, - "learning_rate": 2.1337708228167595e-05, - "loss": 0.0638, - "step": 2860 - }, - { - "epoch": 1.4487632508833923, - "grad_norm": 0.5353738069534302, - "learning_rate": 2.130742049469965e-05, - "loss": 0.0628, - "step": 2870 - }, - { - "epoch": 1.453811206461383, - "grad_norm": 0.3690322935581207, - "learning_rate": 2.12771327612317e-05, - "loss": 0.055, - "step": 2880 - }, - { - "epoch": 1.458859162039374, - "grad_norm": 0.5556847453117371, - "learning_rate": 2.1246845027763756e-05, - "loss": 0.0672, - "step": 2890 - }, - { - "epoch": 1.463907117617365, - "grad_norm": 0.5658410787582397, - "learning_rate": 2.121655729429581e-05, - "loss": 0.0634, - "step": 2900 - }, - { - "epoch": 1.4689550731953558, - "grad_norm": 1.1000596284866333, - "learning_rate": 2.1186269560827866e-05, - "loss": 0.0648, - "step": 2910 - }, - { - "epoch": 1.4740030287733468, - "grad_norm": 0.5739458799362183, - "learning_rate": 2.115598182735992e-05, - "loss": 0.0622, - "step": 2920 - }, - { - "epoch": 1.4790509843513377, - "grad_norm": 0.9371837377548218, - "learning_rate": 2.1125694093891973e-05, - "loss": 0.067, - "step": 2930 - }, - { - "epoch": 1.4840989399293285, - "grad_norm": 0.5997252464294434, - "learning_rate": 2.109540636042403e-05, - "loss": 0.0665, - "step": 2940 - }, - { - "epoch": 1.4891468955073195, - "grad_norm": 0.6729413866996765, - "learning_rate": 2.1065118626956083e-05, - "loss": 0.0576, - "step": 2950 - }, - { - "epoch": 1.4941948510853105, - "grad_norm": 0.796592652797699, - "learning_rate": 2.103483089348814e-05, - "loss": 0.0671, - "step": 2960 - }, - { - "epoch": 1.4992428066633012, - "grad_norm": 0.7947612404823303, - "learning_rate": 2.1004543160020193e-05, - "loss": 0.0701, - "step": 2970 - }, - { - "epoch": 1.5042907622412924, - "grad_norm": 0.7790849208831787, - "learning_rate": 2.0974255426552247e-05, - "loss": 0.065, - "step": 2980 - }, - { - "epoch": 1.5093387178192832, - "grad_norm": 0.5330706238746643, - "learning_rate": 2.0943967693084304e-05, - "loss": 0.0587, - "step": 2990 - }, - { - "epoch": 1.514386673397274, - "grad_norm": 1.0482598543167114, - "learning_rate": 2.0913679959616357e-05, - "loss": 0.0696, - "step": 3000 - }, - { - "epoch": 1.5194346289752652, - "grad_norm": 0.46928080916404724, - "learning_rate": 2.088339222614841e-05, - "loss": 0.0668, - "step": 3010 - }, - { - "epoch": 1.524482584553256, - "grad_norm": 1.0525529384613037, - "learning_rate": 2.0853104492680464e-05, - "loss": 0.0664, - "step": 3020 - }, - { - "epoch": 1.529530540131247, - "grad_norm": 0.43941500782966614, - "learning_rate": 2.0822816759212517e-05, - "loss": 0.0642, - "step": 3030 - }, - { - "epoch": 1.5345784957092379, - "grad_norm": 0.6985353231430054, - "learning_rate": 2.0792529025744574e-05, - "loss": 0.068, - "step": 3040 - }, - { - "epoch": 1.5396264512872286, - "grad_norm": 0.6110888123512268, - "learning_rate": 2.0762241292276628e-05, - "loss": 0.0639, - "step": 3050 - }, - { - "epoch": 1.5446744068652196, - "grad_norm": 0.8250141739845276, - "learning_rate": 2.073195355880868e-05, - "loss": 0.0614, - "step": 3060 - }, - { - "epoch": 1.5497223624432106, - "grad_norm": 0.4882888197898865, - "learning_rate": 2.0701665825340738e-05, - "loss": 0.066, - "step": 3070 - }, - { - "epoch": 1.5547703180212014, - "grad_norm": 0.38679155707359314, - "learning_rate": 2.067137809187279e-05, - "loss": 0.0684, - "step": 3080 - }, - { - "epoch": 1.5598182735991923, - "grad_norm": 0.6574121117591858, - "learning_rate": 2.0641090358404848e-05, - "loss": 0.0666, - "step": 3090 - }, - { - "epoch": 1.5648662291771833, - "grad_norm": 0.48571038246154785, - "learning_rate": 2.0610802624936902e-05, - "loss": 0.0646, - "step": 3100 - }, - { - "epoch": 1.569914184755174, - "grad_norm": 0.8285214304924011, - "learning_rate": 2.0580514891468955e-05, - "loss": 0.0634, - "step": 3110 - }, - { - "epoch": 1.574962140333165, - "grad_norm": 0.5619475245475769, - "learning_rate": 2.0550227158001012e-05, - "loss": 0.0665, - "step": 3120 - }, - { - "epoch": 1.580010095911156, - "grad_norm": 0.47569337487220764, - "learning_rate": 2.0519939424533065e-05, - "loss": 0.0661, - "step": 3130 - }, - { - "epoch": 1.5850580514891468, - "grad_norm": 0.8858407139778137, - "learning_rate": 2.048965169106512e-05, - "loss": 0.0696, - "step": 3140 - }, - { - "epoch": 1.5901060070671378, - "grad_norm": 0.5578007698059082, - "learning_rate": 2.0459363957597172e-05, - "loss": 0.0547, - "step": 3150 - }, - { - "epoch": 1.5951539626451288, - "grad_norm": 0.6875492334365845, - "learning_rate": 2.0429076224129226e-05, - "loss": 0.0608, - "step": 3160 - }, - { - "epoch": 1.6002019182231195, - "grad_norm": 0.5009766221046448, - "learning_rate": 2.0398788490661283e-05, - "loss": 0.0684, - "step": 3170 - }, - { - "epoch": 1.6052498738011105, - "grad_norm": 0.7467596530914307, - "learning_rate": 2.0368500757193336e-05, - "loss": 0.0654, - "step": 3180 - }, - { - "epoch": 1.6102978293791015, - "grad_norm": 0.5688017010688782, - "learning_rate": 2.0338213023725393e-05, - "loss": 0.0594, - "step": 3190 - }, - { - "epoch": 1.6153457849570922, - "grad_norm": 0.9353786110877991, - "learning_rate": 2.0307925290257446e-05, - "loss": 0.0685, - "step": 3200 - }, - { - "epoch": 1.6203937405350834, - "grad_norm": 0.5310063362121582, - "learning_rate": 2.02776375567895e-05, - "loss": 0.0597, - "step": 3210 - }, - { - "epoch": 1.6254416961130742, - "grad_norm": 1.107693076133728, - "learning_rate": 2.0247349823321557e-05, - "loss": 0.0722, - "step": 3220 - }, - { - "epoch": 1.630489651691065, - "grad_norm": 0.688391923904419, - "learning_rate": 2.021706208985361e-05, - "loss": 0.0719, - "step": 3230 - }, - { - "epoch": 1.6355376072690562, - "grad_norm": 0.4255257546901703, - "learning_rate": 2.0186774356385667e-05, - "loss": 0.0638, - "step": 3240 - }, - { - "epoch": 1.640585562847047, - "grad_norm": 0.6049216389656067, - "learning_rate": 2.015648662291772e-05, - "loss": 0.0555, - "step": 3250 - }, - { - "epoch": 1.645633518425038, - "grad_norm": 0.6898351311683655, - "learning_rate": 2.012619888944977e-05, - "loss": 0.0599, - "step": 3260 - }, - { - "epoch": 1.650681474003029, - "grad_norm": 0.6150475144386292, - "learning_rate": 2.0095911155981827e-05, - "loss": 0.0664, - "step": 3270 - }, - { - "epoch": 1.6557294295810197, - "grad_norm": 0.5084889531135559, - "learning_rate": 2.006562342251388e-05, - "loss": 0.0574, - "step": 3280 - }, - { - "epoch": 1.6607773851590106, - "grad_norm": 0.9478010535240173, - "learning_rate": 2.0035335689045938e-05, - "loss": 0.0619, - "step": 3290 - }, - { - "epoch": 1.6658253407370016, - "grad_norm": 1.1725986003875732, - "learning_rate": 2.000504795557799e-05, - "loss": 0.0672, - "step": 3300 - }, - { - "epoch": 1.6708732963149924, - "grad_norm": 0.8932427763938904, - "learning_rate": 1.9974760222110044e-05, - "loss": 0.0604, - "step": 3310 - }, - { - "epoch": 1.6759212518929834, - "grad_norm": 0.4670265316963196, - "learning_rate": 1.99444724886421e-05, - "loss": 0.0658, - "step": 3320 - }, - { - "epoch": 1.6809692074709743, - "grad_norm": 0.518844485282898, - "learning_rate": 1.9914184755174155e-05, - "loss": 0.068, - "step": 3330 - }, - { - "epoch": 1.686017163048965, - "grad_norm": 0.7717642784118652, - "learning_rate": 1.988389702170621e-05, - "loss": 0.0594, - "step": 3340 - }, - { - "epoch": 1.691065118626956, - "grad_norm": 0.9715004563331604, - "learning_rate": 1.9853609288238265e-05, - "loss": 0.0651, - "step": 3350 - }, - { - "epoch": 1.696113074204947, - "grad_norm": 0.7362111210823059, - "learning_rate": 1.982332155477032e-05, - "loss": 0.0664, - "step": 3360 - }, - { - "epoch": 1.7011610297829378, - "grad_norm": 0.480751633644104, - "learning_rate": 1.9793033821302375e-05, - "loss": 0.0609, - "step": 3370 - }, - { - "epoch": 1.7062089853609288, - "grad_norm": 0.31802135705947876, - "learning_rate": 1.976274608783443e-05, - "loss": 0.0658, - "step": 3380 - }, - { - "epoch": 1.7112569409389198, - "grad_norm": 0.5285906195640564, - "learning_rate": 1.973245835436648e-05, - "loss": 0.0606, - "step": 3390 - }, - { - "epoch": 1.7163048965169105, - "grad_norm": 0.7230745553970337, - "learning_rate": 1.9702170620898536e-05, - "loss": 0.0618, - "step": 3400 - }, - { - "epoch": 1.7213528520949015, - "grad_norm": 0.566842257976532, - "learning_rate": 1.967188288743059e-05, - "loss": 0.0623, - "step": 3410 - }, - { - "epoch": 1.7264008076728925, - "grad_norm": 0.9110565781593323, - "learning_rate": 1.9641595153962646e-05, - "loss": 0.0712, - "step": 3420 - }, - { - "epoch": 1.7314487632508833, - "grad_norm": 0.5621252059936523, - "learning_rate": 1.96113074204947e-05, - "loss": 0.0624, - "step": 3430 - }, - { - "epoch": 1.7364967188288745, - "grad_norm": 0.6153441667556763, - "learning_rate": 1.9581019687026753e-05, - "loss": 0.0679, - "step": 3440 - }, - { - "epoch": 1.7415446744068652, - "grad_norm": 0.7521117925643921, - "learning_rate": 1.955073195355881e-05, - "loss": 0.073, - "step": 3450 - }, - { - "epoch": 1.746592629984856, - "grad_norm": 0.7781336307525635, - "learning_rate": 1.9520444220090863e-05, - "loss": 0.0576, - "step": 3460 - }, - { - "epoch": 1.7516405855628472, - "grad_norm": 0.5981038808822632, - "learning_rate": 1.949015648662292e-05, - "loss": 0.0558, - "step": 3470 - }, - { - "epoch": 1.756688541140838, - "grad_norm": 0.5716273188591003, - "learning_rate": 1.9459868753154973e-05, - "loss": 0.0615, - "step": 3480 - }, - { - "epoch": 1.761736496718829, - "grad_norm": 1.0969016551971436, - "learning_rate": 1.9429581019687027e-05, - "loss": 0.0695, - "step": 3490 - }, - { - "epoch": 1.76678445229682, - "grad_norm": 0.4081050157546997, - "learning_rate": 1.9399293286219084e-05, - "loss": 0.0569, - "step": 3500 - }, - { - "epoch": 1.7718324078748107, - "grad_norm": 0.6996564269065857, - "learning_rate": 1.9369005552751137e-05, - "loss": 0.0615, - "step": 3510 - }, - { - "epoch": 1.7768803634528016, - "grad_norm": 0.7040839791297913, - "learning_rate": 1.933871781928319e-05, - "loss": 0.0609, - "step": 3520 - }, - { - "epoch": 1.7819283190307926, - "grad_norm": 0.6955099105834961, - "learning_rate": 1.9308430085815244e-05, - "loss": 0.0596, - "step": 3530 - }, - { - "epoch": 1.7869762746087834, - "grad_norm": 0.49400514364242554, - "learning_rate": 1.9278142352347298e-05, - "loss": 0.0531, - "step": 3540 - }, - { - "epoch": 1.7920242301867744, - "grad_norm": 0.6069557666778564, - "learning_rate": 1.9247854618879354e-05, - "loss": 0.0663, - "step": 3550 - }, - { - "epoch": 1.7970721857647654, - "grad_norm": 0.859195351600647, - "learning_rate": 1.9217566885411408e-05, - "loss": 0.0539, - "step": 3560 - }, - { - "epoch": 1.802120141342756, - "grad_norm": 0.8939780592918396, - "learning_rate": 1.9187279151943465e-05, - "loss": 0.0668, - "step": 3570 - }, - { - "epoch": 1.807168096920747, - "grad_norm": 0.7258803248405457, - "learning_rate": 1.9156991418475518e-05, - "loss": 0.0585, - "step": 3580 - }, - { - "epoch": 1.812216052498738, - "grad_norm": 0.38900288939476013, - "learning_rate": 1.912670368500757e-05, - "loss": 0.0686, - "step": 3590 - }, - { - "epoch": 1.8172640080767288, - "grad_norm": 0.38506415486335754, - "learning_rate": 1.909641595153963e-05, - "loss": 0.0625, - "step": 3600 - }, - { - "epoch": 1.8223119636547198, - "grad_norm": 0.5235381722450256, - "learning_rate": 1.9066128218071682e-05, - "loss": 0.0597, - "step": 3610 - }, - { - "epoch": 1.8273599192327108, - "grad_norm": 0.4835253357887268, - "learning_rate": 1.903584048460374e-05, - "loss": 0.0667, - "step": 3620 - }, - { - "epoch": 1.8324078748107016, - "grad_norm": 0.6338971257209778, - "learning_rate": 1.9005552751135792e-05, - "loss": 0.0635, - "step": 3630 - }, - { - "epoch": 1.8374558303886925, - "grad_norm": 1.0663739442825317, - "learning_rate": 1.8975265017667846e-05, - "loss": 0.0744, - "step": 3640 - }, - { - "epoch": 1.8425037859666835, - "grad_norm": 0.6655123829841614, - "learning_rate": 1.89449772841999e-05, - "loss": 0.0654, - "step": 3650 - }, - { - "epoch": 1.8475517415446743, - "grad_norm": 0.582611083984375, - "learning_rate": 1.8914689550731952e-05, - "loss": 0.0661, - "step": 3660 - }, - { - "epoch": 1.8525996971226655, - "grad_norm": 0.6533240079879761, - "learning_rate": 1.888440181726401e-05, - "loss": 0.0613, - "step": 3670 - }, - { - "epoch": 1.8576476527006562, - "grad_norm": 0.4978090524673462, - "learning_rate": 1.8854114083796063e-05, - "loss": 0.0627, - "step": 3680 - }, - { - "epoch": 1.862695608278647, - "grad_norm": 0.7043678164482117, - "learning_rate": 1.8823826350328116e-05, - "loss": 0.0578, - "step": 3690 - }, - { - "epoch": 1.8677435638566382, - "grad_norm": 0.7941015362739563, - "learning_rate": 1.8793538616860173e-05, - "loss": 0.0622, - "step": 3700 - }, - { - "epoch": 1.872791519434629, - "grad_norm": 0.4428146183490753, - "learning_rate": 1.8763250883392226e-05, - "loss": 0.0613, - "step": 3710 - }, - { - "epoch": 1.87783947501262, - "grad_norm": 0.6554248929023743, - "learning_rate": 1.873296314992428e-05, - "loss": 0.0643, - "step": 3720 - }, - { - "epoch": 1.882887430590611, - "grad_norm": 0.48168087005615234, - "learning_rate": 1.8702675416456337e-05, - "loss": 0.055, - "step": 3730 - }, - { - "epoch": 1.8879353861686017, - "grad_norm": 0.509777307510376, - "learning_rate": 1.867238768298839e-05, - "loss": 0.058, - "step": 3740 - }, - { - "epoch": 1.8929833417465927, - "grad_norm": 0.5132505893707275, - "learning_rate": 1.8642099949520447e-05, - "loss": 0.0623, - "step": 3750 - }, - { - "epoch": 1.8980312973245836, - "grad_norm": 0.7474920749664307, - "learning_rate": 1.86118122160525e-05, - "loss": 0.0489, - "step": 3760 - }, - { - "epoch": 1.9030792529025744, - "grad_norm": 1.0404279232025146, - "learning_rate": 1.8581524482584554e-05, - "loss": 0.0687, - "step": 3770 - }, - { - "epoch": 1.9081272084805654, - "grad_norm": 0.6796401143074036, - "learning_rate": 1.8551236749116607e-05, - "loss": 0.0679, - "step": 3780 - }, - { - "epoch": 1.9131751640585564, - "grad_norm": 0.9071604609489441, - "learning_rate": 1.852094901564866e-05, - "loss": 0.0725, - "step": 3790 - }, - { - "epoch": 1.9182231196365471, - "grad_norm": 0.7023878693580627, - "learning_rate": 1.8490661282180718e-05, - "loss": 0.0702, - "step": 3800 - }, - { - "epoch": 1.923271075214538, - "grad_norm": 0.7312602996826172, - "learning_rate": 1.846037354871277e-05, - "loss": 0.0532, - "step": 3810 - }, - { - "epoch": 1.928319030792529, - "grad_norm": 0.6224806904792786, - "learning_rate": 1.8430085815244825e-05, - "loss": 0.0638, - "step": 3820 - }, - { - "epoch": 1.9333669863705198, - "grad_norm": 0.7255429029464722, - "learning_rate": 1.839979808177688e-05, - "loss": 0.0641, - "step": 3830 - }, - { - "epoch": 1.9384149419485108, - "grad_norm": 0.584086000919342, - "learning_rate": 1.8369510348308935e-05, - "loss": 0.0692, - "step": 3840 - }, - { - "epoch": 1.9434628975265018, - "grad_norm": 0.4826408326625824, - "learning_rate": 1.833922261484099e-05, - "loss": 0.0627, - "step": 3850 - }, - { - "epoch": 1.9485108531044926, - "grad_norm": 0.5803766846656799, - "learning_rate": 1.8308934881373045e-05, - "loss": 0.0635, - "step": 3860 - }, - { - "epoch": 1.9535588086824835, - "grad_norm": 0.7855948209762573, - "learning_rate": 1.82786471479051e-05, - "loss": 0.0659, - "step": 3870 - }, - { - "epoch": 1.9586067642604745, - "grad_norm": 0.5980962514877319, - "learning_rate": 1.8248359414437155e-05, - "loss": 0.0651, - "step": 3880 - }, - { - "epoch": 1.9636547198384653, - "grad_norm": 0.6440220475196838, - "learning_rate": 1.821807168096921e-05, - "loss": 0.0639, - "step": 3890 - }, - { - "epoch": 1.9687026754164565, - "grad_norm": 0.7104585766792297, - "learning_rate": 1.8187783947501262e-05, - "loss": 0.056, - "step": 3900 - }, - { - "epoch": 1.9737506309944473, - "grad_norm": 0.7219833731651306, - "learning_rate": 1.8157496214033316e-05, - "loss": 0.0574, - "step": 3910 - }, - { - "epoch": 1.978798586572438, - "grad_norm": 0.5478711724281311, - "learning_rate": 1.812720848056537e-05, - "loss": 0.0657, - "step": 3920 - }, - { - "epoch": 1.9838465421504292, - "grad_norm": 0.6501402854919434, - "learning_rate": 1.8096920747097426e-05, - "loss": 0.0641, - "step": 3930 - }, - { - "epoch": 1.98889449772842, - "grad_norm": 0.7231020331382751, - "learning_rate": 1.806663301362948e-05, - "loss": 0.0692, - "step": 3940 - }, - { - "epoch": 1.993942453306411, - "grad_norm": 0.6480854749679565, - "learning_rate": 1.8036345280161536e-05, - "loss": 0.0632, - "step": 3950 - }, - { - "epoch": 1.998990408884402, - "grad_norm": 0.4803590774536133, - "learning_rate": 1.800605754669359e-05, - "loss": 0.0678, - "step": 3960 - }, - { - "epoch": 2.0, - "eval_f1": 0.9705180789481339, - "eval_loss": 0.0446692518889904, - "eval_runtime": 584.4017, - "eval_samples_per_second": 352.946, - "eval_steps_per_second": 2.758, - "step": 3962 - }, - { - "epoch": 2.0040383644623927, - "grad_norm": 0.680855393409729, - "learning_rate": 1.7975769813225643e-05, - "loss": 0.0567, - "step": 3970 - }, - { - "epoch": 2.0090863200403835, - "grad_norm": 0.47991836071014404, - "learning_rate": 1.79454820797577e-05, - "loss": 0.0562, - "step": 3980 - }, - { - "epoch": 2.0141342756183747, - "grad_norm": 0.8615912199020386, - "learning_rate": 1.7915194346289753e-05, - "loss": 0.0679, - "step": 3990 - }, - { - "epoch": 2.0191822311963654, - "grad_norm": 0.5970327258110046, - "learning_rate": 1.7884906612821807e-05, - "loss": 0.053, - "step": 4000 - }, - { - "epoch": 2.024230186774356, - "grad_norm": 0.5402255654335022, - "learning_rate": 1.7854618879353864e-05, - "loss": 0.0574, - "step": 4010 - }, - { - "epoch": 2.0292781423523474, - "grad_norm": 0.5014840364456177, - "learning_rate": 1.7824331145885917e-05, - "loss": 0.0649, - "step": 4020 - }, - { - "epoch": 2.034326097930338, - "grad_norm": 0.7147154808044434, - "learning_rate": 1.779404341241797e-05, - "loss": 0.0687, - "step": 4030 - }, - { - "epoch": 2.0393740535083293, - "grad_norm": 0.5346552729606628, - "learning_rate": 1.7763755678950024e-05, - "loss": 0.0638, - "step": 4040 - }, - { - "epoch": 2.04442200908632, - "grad_norm": 0.5596599578857422, - "learning_rate": 1.7733467945482078e-05, - "loss": 0.0669, - "step": 4050 - }, - { - "epoch": 2.049469964664311, - "grad_norm": 0.40591198205947876, - "learning_rate": 1.7703180212014134e-05, - "loss": 0.0564, - "step": 4060 - }, - { - "epoch": 2.054517920242302, - "grad_norm": 0.609337568283081, - "learning_rate": 1.7672892478546188e-05, - "loss": 0.0576, - "step": 4070 - }, - { - "epoch": 2.059565875820293, - "grad_norm": 0.5424002408981323, - "learning_rate": 1.7642604745078245e-05, - "loss": 0.0585, - "step": 4080 - }, - { - "epoch": 2.0646138313982836, - "grad_norm": 0.9868631362915039, - "learning_rate": 1.7612317011610298e-05, - "loss": 0.0684, - "step": 4090 - }, - { - "epoch": 2.069661786976275, - "grad_norm": 0.6492929458618164, - "learning_rate": 1.758202927814235e-05, - "loss": 0.0638, - "step": 4100 - }, - { - "epoch": 2.0747097425542655, - "grad_norm": 0.7837685346603394, - "learning_rate": 1.755174154467441e-05, - "loss": 0.0675, - "step": 4110 - }, - { - "epoch": 2.0797576981322563, - "grad_norm": 0.5961639881134033, - "learning_rate": 1.7521453811206462e-05, - "loss": 0.0575, - "step": 4120 - }, - { - "epoch": 2.0848056537102475, - "grad_norm": 0.4114825427532196, - "learning_rate": 1.749116607773852e-05, - "loss": 0.0659, - "step": 4130 - }, - { - "epoch": 2.0898536092882383, - "grad_norm": 0.4567316174507141, - "learning_rate": 1.7460878344270572e-05, - "loss": 0.0661, - "step": 4140 - }, - { - "epoch": 2.094901564866229, - "grad_norm": 0.6321776509284973, - "learning_rate": 1.7430590610802626e-05, - "loss": 0.066, - "step": 4150 - }, - { - "epoch": 2.0999495204442202, - "grad_norm": 0.8911116719245911, - "learning_rate": 1.740030287733468e-05, - "loss": 0.0585, - "step": 4160 - }, - { - "epoch": 2.104997476022211, - "grad_norm": 0.4896914064884186, - "learning_rate": 1.7370015143866733e-05, - "loss": 0.0612, - "step": 4170 - }, - { - "epoch": 2.1100454316002017, - "grad_norm": 0.7571251392364502, - "learning_rate": 1.733972741039879e-05, - "loss": 0.0563, - "step": 4180 - }, - { - "epoch": 2.115093387178193, - "grad_norm": 0.9115099310874939, - "learning_rate": 1.7309439676930843e-05, - "loss": 0.0698, - "step": 4190 - }, - { - "epoch": 2.1201413427561837, - "grad_norm": 0.5267325639724731, - "learning_rate": 1.7279151943462896e-05, - "loss": 0.0604, - "step": 4200 - }, - { - "epoch": 2.1251892983341745, - "grad_norm": 0.6659255623817444, - "learning_rate": 1.7248864209994953e-05, - "loss": 0.0627, - "step": 4210 - }, - { - "epoch": 2.1302372539121657, - "grad_norm": 0.89178466796875, - "learning_rate": 1.7218576476527007e-05, - "loss": 0.0552, - "step": 4220 - }, - { - "epoch": 2.1352852094901564, - "grad_norm": 0.4615127742290497, - "learning_rate": 1.7188288743059063e-05, - "loss": 0.0557, - "step": 4230 - }, - { - "epoch": 2.1403331650681476, - "grad_norm": 0.6602596044540405, - "learning_rate": 1.7158001009591117e-05, - "loss": 0.0548, - "step": 4240 - }, - { - "epoch": 2.1453811206461384, - "grad_norm": 0.7081389427185059, - "learning_rate": 1.712771327612317e-05, - "loss": 0.0606, - "step": 4250 - }, - { - "epoch": 2.150429076224129, - "grad_norm": 0.5817338824272156, - "learning_rate": 1.7097425542655227e-05, - "loss": 0.0606, - "step": 4260 - }, - { - "epoch": 2.1554770318021204, - "grad_norm": 0.4401390254497528, - "learning_rate": 1.706713780918728e-05, - "loss": 0.0607, - "step": 4270 - }, - { - "epoch": 2.160524987380111, - "grad_norm": 1.0127087831497192, - "learning_rate": 1.7036850075719337e-05, - "loss": 0.0615, - "step": 4280 - }, - { - "epoch": 2.165572942958102, - "grad_norm": 0.5774319171905518, - "learning_rate": 1.7006562342251387e-05, - "loss": 0.0525, - "step": 4290 - }, - { - "epoch": 2.170620898536093, - "grad_norm": 0.47623270750045776, - "learning_rate": 1.697627460878344e-05, - "loss": 0.0591, - "step": 4300 - }, - { - "epoch": 2.175668854114084, - "grad_norm": 0.7083358764648438, - "learning_rate": 1.6945986875315498e-05, - "loss": 0.0631, - "step": 4310 - }, - { - "epoch": 2.1807168096920746, - "grad_norm": 0.6057601571083069, - "learning_rate": 1.691569914184755e-05, - "loss": 0.0595, - "step": 4320 - }, - { - "epoch": 2.185764765270066, - "grad_norm": 0.8947880864143372, - "learning_rate": 1.6885411408379605e-05, - "loss": 0.0666, - "step": 4330 - }, - { - "epoch": 2.1908127208480566, - "grad_norm": 0.6460204720497131, - "learning_rate": 1.685512367491166e-05, - "loss": 0.0669, - "step": 4340 - }, - { - "epoch": 2.1958606764260473, - "grad_norm": 0.9029686450958252, - "learning_rate": 1.6824835941443715e-05, - "loss": 0.0607, - "step": 4350 - }, - { - "epoch": 2.2009086320040385, - "grad_norm": 0.5201438665390015, - "learning_rate": 1.6794548207975772e-05, - "loss": 0.0514, - "step": 4360 - }, - { - "epoch": 2.2059565875820293, - "grad_norm": 0.39414748549461365, - "learning_rate": 1.6764260474507825e-05, - "loss": 0.0581, - "step": 4370 - }, - { - "epoch": 2.21100454316002, - "grad_norm": 0.642257034778595, - "learning_rate": 1.673397274103988e-05, - "loss": 0.0611, - "step": 4380 - }, - { - "epoch": 2.2160524987380112, - "grad_norm": 0.7225739359855652, - "learning_rate": 1.6703685007571935e-05, - "loss": 0.0569, - "step": 4390 - }, - { - "epoch": 2.221100454316002, - "grad_norm": 0.6948502659797668, - "learning_rate": 1.667339727410399e-05, - "loss": 0.0652, - "step": 4400 - }, - { - "epoch": 2.2261484098939928, - "grad_norm": 0.5755937695503235, - "learning_rate": 1.6643109540636042e-05, - "loss": 0.0566, - "step": 4410 - }, - { - "epoch": 2.231196365471984, - "grad_norm": 0.4249815046787262, - "learning_rate": 1.6612821807168096e-05, - "loss": 0.0642, - "step": 4420 - }, - { - "epoch": 2.2362443210499747, - "grad_norm": 0.5442089438438416, - "learning_rate": 1.658253407370015e-05, - "loss": 0.0685, - "step": 4430 - }, - { - "epoch": 2.2412922766279655, - "grad_norm": 0.8074495792388916, - "learning_rate": 1.6552246340232206e-05, - "loss": 0.0558, - "step": 4440 - }, - { - "epoch": 2.2463402322059567, - "grad_norm": 0.8810071349143982, - "learning_rate": 1.652195860676426e-05, - "loss": 0.0685, - "step": 4450 - }, - { - "epoch": 2.2513881877839474, - "grad_norm": 0.5399377942085266, - "learning_rate": 1.6491670873296316e-05, - "loss": 0.0607, - "step": 4460 - }, - { - "epoch": 2.256436143361938, - "grad_norm": 0.7178535461425781, - "learning_rate": 1.646138313982837e-05, - "loss": 0.0504, - "step": 4470 - }, - { - "epoch": 2.2614840989399294, - "grad_norm": 0.4272046983242035, - "learning_rate": 1.6431095406360423e-05, - "loss": 0.0583, - "step": 4480 - }, - { - "epoch": 2.26653205451792, - "grad_norm": 0.6807524561882019, - "learning_rate": 1.640080767289248e-05, - "loss": 0.0639, - "step": 4490 - }, - { - "epoch": 2.271580010095911, - "grad_norm": 0.5895000100135803, - "learning_rate": 1.6370519939424534e-05, - "loss": 0.0675, - "step": 4500 - }, - { - "epoch": 2.276627965673902, - "grad_norm": 0.6640876531600952, - "learning_rate": 1.634023220595659e-05, - "loss": 0.0603, - "step": 4510 - }, - { - "epoch": 2.281675921251893, - "grad_norm": 0.4367890954017639, - "learning_rate": 1.6309944472488644e-05, - "loss": 0.0517, - "step": 4520 - }, - { - "epoch": 2.2867238768298837, - "grad_norm": 1.082713007926941, - "learning_rate": 1.6279656739020697e-05, - "loss": 0.0524, - "step": 4530 - }, - { - "epoch": 2.291771832407875, - "grad_norm": 0.5186300277709961, - "learning_rate": 1.624936900555275e-05, - "loss": 0.0566, - "step": 4540 - }, - { - "epoch": 2.2968197879858656, - "grad_norm": 1.2778280973434448, - "learning_rate": 1.6219081272084804e-05, - "loss": 0.0531, - "step": 4550 - }, - { - "epoch": 2.301867743563857, - "grad_norm": 0.46757417917251587, - "learning_rate": 1.618879353861686e-05, - "loss": 0.0637, - "step": 4560 - }, - { - "epoch": 2.3069156991418476, - "grad_norm": 0.6333388686180115, - "learning_rate": 1.6158505805148914e-05, - "loss": 0.0557, - "step": 4570 - }, - { - "epoch": 2.3119636547198383, - "grad_norm": 0.4005846381187439, - "learning_rate": 1.6128218071680968e-05, - "loss": 0.0512, - "step": 4580 - }, - { - "epoch": 2.3170116102978295, - "grad_norm": 1.0479962825775146, - "learning_rate": 1.6097930338213025e-05, - "loss": 0.0639, - "step": 4590 - }, - { - "epoch": 2.3220595658758203, - "grad_norm": 1.1324669122695923, - "learning_rate": 1.6067642604745078e-05, - "loss": 0.0642, - "step": 4600 - }, - { - "epoch": 2.327107521453811, - "grad_norm": 0.827215313911438, - "learning_rate": 1.6037354871277135e-05, - "loss": 0.0654, - "step": 4610 - }, - { - "epoch": 2.3321554770318023, - "grad_norm": 0.8228656649589539, - "learning_rate": 1.600706713780919e-05, - "loss": 0.0648, - "step": 4620 - }, - { - "epoch": 2.337203432609793, - "grad_norm": 0.5897762775421143, - "learning_rate": 1.5976779404341242e-05, - "loss": 0.0546, - "step": 4630 - }, - { - "epoch": 2.342251388187784, - "grad_norm": 0.6223641633987427, - "learning_rate": 1.59464916708733e-05, - "loss": 0.0712, - "step": 4640 - }, - { - "epoch": 2.347299343765775, - "grad_norm": 0.5593187808990479, - "learning_rate": 1.5916203937405352e-05, - "loss": 0.0707, - "step": 4650 - }, - { - "epoch": 2.3523472993437657, - "grad_norm": 0.9349427223205566, - "learning_rate": 1.5885916203937406e-05, - "loss": 0.0581, - "step": 4660 - }, - { - "epoch": 2.3573952549217565, - "grad_norm": 0.47101134061813354, - "learning_rate": 1.585562847046946e-05, - "loss": 0.0688, - "step": 4670 - }, - { - "epoch": 2.3624432104997477, - "grad_norm": 0.5073738098144531, - "learning_rate": 1.5825340737001513e-05, - "loss": 0.0678, - "step": 4680 - }, - { - "epoch": 2.3674911660777385, - "grad_norm": 0.5324171781539917, - "learning_rate": 1.579505300353357e-05, - "loss": 0.0614, - "step": 4690 - }, - { - "epoch": 2.3725391216557297, - "grad_norm": 0.662965714931488, - "learning_rate": 1.5764765270065623e-05, - "loss": 0.0507, - "step": 4700 - }, - { - "epoch": 2.3775870772337204, - "grad_norm": 0.6482782959938049, - "learning_rate": 1.5734477536597676e-05, - "loss": 0.0537, - "step": 4710 - }, - { - "epoch": 2.382635032811711, - "grad_norm": 1.0039052963256836, - "learning_rate": 1.5704189803129733e-05, - "loss": 0.059, - "step": 4720 - }, - { - "epoch": 2.3876829883897024, - "grad_norm": 0.8546132445335388, - "learning_rate": 1.5673902069661787e-05, - "loss": 0.0691, - "step": 4730 - }, - { - "epoch": 2.392730943967693, - "grad_norm": 0.4903261363506317, - "learning_rate": 1.5643614336193843e-05, - "loss": 0.0535, - "step": 4740 - }, - { - "epoch": 2.397778899545684, - "grad_norm": 0.8538033962249756, - "learning_rate": 1.5613326602725897e-05, - "loss": 0.0616, - "step": 4750 - }, - { - "epoch": 2.402826855123675, - "grad_norm": 0.7978336215019226, - "learning_rate": 1.558303886925795e-05, - "loss": 0.0613, - "step": 4760 - }, - { - "epoch": 2.407874810701666, - "grad_norm": 0.6981778740882874, - "learning_rate": 1.5552751135790007e-05, - "loss": 0.0646, - "step": 4770 - }, - { - "epoch": 2.4129227662796566, - "grad_norm": 0.8517895936965942, - "learning_rate": 1.552246340232206e-05, - "loss": 0.0705, - "step": 4780 - }, - { - "epoch": 2.417970721857648, - "grad_norm": 0.4087599813938141, - "learning_rate": 1.5492175668854117e-05, - "loss": 0.0638, - "step": 4790 - }, - { - "epoch": 2.4230186774356386, - "grad_norm": 0.3779948651790619, - "learning_rate": 1.5461887935386168e-05, - "loss": 0.0524, - "step": 4800 - }, - { - "epoch": 2.4280666330136293, - "grad_norm": 0.42263171076774597, - "learning_rate": 1.543160020191822e-05, - "loss": 0.0623, - "step": 4810 - }, - { - "epoch": 2.4331145885916206, - "grad_norm": 0.5812351107597351, - "learning_rate": 1.5401312468450278e-05, - "loss": 0.0573, - "step": 4820 - }, - { - "epoch": 2.4381625441696113, - "grad_norm": 0.6073315143585205, - "learning_rate": 1.537102473498233e-05, - "loss": 0.057, - "step": 4830 - }, - { - "epoch": 2.443210499747602, - "grad_norm": 0.8706870079040527, - "learning_rate": 1.5340737001514388e-05, - "loss": 0.0606, - "step": 4840 - }, - { - "epoch": 2.4482584553255933, - "grad_norm": 0.9355966448783875, - "learning_rate": 1.531044926804644e-05, - "loss": 0.0563, - "step": 4850 - }, - { - "epoch": 2.453306410903584, - "grad_norm": 0.6352431774139404, - "learning_rate": 1.5280161534578495e-05, - "loss": 0.0537, - "step": 4860 - }, - { - "epoch": 2.458354366481575, - "grad_norm": 0.5970965623855591, - "learning_rate": 1.524987380111055e-05, - "loss": 0.0663, - "step": 4870 - }, - { - "epoch": 2.463402322059566, - "grad_norm": 0.40907353162765503, - "learning_rate": 1.5219586067642605e-05, - "loss": 0.0502, - "step": 4880 - }, - { - "epoch": 2.4684502776375568, - "grad_norm": 0.5130166411399841, - "learning_rate": 1.518929833417466e-05, - "loss": 0.0538, - "step": 4890 - }, - { - "epoch": 2.4734982332155475, - "grad_norm": 0.9824861288070679, - "learning_rate": 1.5159010600706716e-05, - "loss": 0.0518, - "step": 4900 - }, - { - "epoch": 2.4785461887935387, - "grad_norm": 0.6424157023429871, - "learning_rate": 1.512872286723877e-05, - "loss": 0.0599, - "step": 4910 - }, - { - "epoch": 2.4835941443715295, - "grad_norm": 0.8797338008880615, - "learning_rate": 1.5098435133770824e-05, - "loss": 0.0534, - "step": 4920 - }, - { - "epoch": 2.4886420999495202, - "grad_norm": 1.0275185108184814, - "learning_rate": 1.5068147400302876e-05, - "loss": 0.063, - "step": 4930 - }, - { - "epoch": 2.4936900555275114, - "grad_norm": 0.6370276808738708, - "learning_rate": 1.5037859666834931e-05, - "loss": 0.0584, - "step": 4940 - }, - { - "epoch": 2.498738011105502, - "grad_norm": 0.5083595514297485, - "learning_rate": 1.5007571933366986e-05, - "loss": 0.0635, - "step": 4950 - }, - { - "epoch": 2.503785966683493, - "grad_norm": 0.8423396348953247, - "learning_rate": 1.4977284199899041e-05, - "loss": 0.0593, - "step": 4960 - }, - { - "epoch": 2.508833922261484, - "grad_norm": 0.6133778691291809, - "learning_rate": 1.4946996466431095e-05, - "loss": 0.0652, - "step": 4970 - }, - { - "epoch": 2.513881877839475, - "grad_norm": 0.5626839995384216, - "learning_rate": 1.491670873296315e-05, - "loss": 0.061, - "step": 4980 - }, - { - "epoch": 2.5189298334174657, - "grad_norm": 0.6379786729812622, - "learning_rate": 1.4886420999495205e-05, - "loss": 0.0583, - "step": 4990 - }, - { - "epoch": 2.523977788995457, - "grad_norm": 0.39859360456466675, - "learning_rate": 1.485613326602726e-05, - "loss": 0.057, - "step": 5000 - }, - { - "epoch": 2.5290257445734476, - "grad_norm": 0.4674101173877716, - "learning_rate": 1.4825845532559315e-05, - "loss": 0.0584, - "step": 5010 - }, - { - "epoch": 2.5340737001514384, - "grad_norm": 0.6018111705780029, - "learning_rate": 1.4795557799091367e-05, - "loss": 0.0606, - "step": 5020 - }, - { - "epoch": 2.5391216557294296, - "grad_norm": 0.4932622015476227, - "learning_rate": 1.4765270065623422e-05, - "loss": 0.0551, - "step": 5030 - }, - { - "epoch": 2.5441696113074204, - "grad_norm": 0.5576731562614441, - "learning_rate": 1.4734982332155477e-05, - "loss": 0.0562, - "step": 5040 - }, - { - "epoch": 2.5492175668854116, - "grad_norm": 0.5910426378250122, - "learning_rate": 1.4704694598687533e-05, - "loss": 0.0632, - "step": 5050 - }, - { - "epoch": 2.5542655224634023, - "grad_norm": 0.42830216884613037, - "learning_rate": 1.4674406865219586e-05, - "loss": 0.0589, - "step": 5060 - }, - { - "epoch": 2.559313478041393, - "grad_norm": 0.657305896282196, - "learning_rate": 1.4644119131751641e-05, - "loss": 0.0666, - "step": 5070 - }, - { - "epoch": 2.5643614336193843, - "grad_norm": 0.5498583912849426, - "learning_rate": 1.4613831398283696e-05, - "loss": 0.0677, - "step": 5080 - }, - { - "epoch": 2.569409389197375, - "grad_norm": 1.5641086101531982, - "learning_rate": 1.458354366481575e-05, - "loss": 0.0618, - "step": 5090 - }, - { - "epoch": 2.5744573447753663, - "grad_norm": 0.576878011226654, - "learning_rate": 1.4553255931347805e-05, - "loss": 0.0596, - "step": 5100 - }, - { - "epoch": 2.579505300353357, - "grad_norm": 0.6855084896087646, - "learning_rate": 1.4522968197879858e-05, - "loss": 0.0684, - "step": 5110 - }, - { - "epoch": 2.5845532559313478, - "grad_norm": 0.46760818362236023, - "learning_rate": 1.4492680464411913e-05, - "loss": 0.0628, - "step": 5120 - }, - { - "epoch": 2.589601211509339, - "grad_norm": 0.4708857834339142, - "learning_rate": 1.4462392730943969e-05, - "loss": 0.0656, - "step": 5130 - }, - { - "epoch": 2.5946491670873297, - "grad_norm": 0.957336962223053, - "learning_rate": 1.4432104997476024e-05, - "loss": 0.0527, - "step": 5140 - }, - { - "epoch": 2.5996971226653205, - "grad_norm": 0.6079381704330444, - "learning_rate": 1.4401817264008077e-05, - "loss": 0.0499, - "step": 5150 - }, - { - "epoch": 2.6047450782433117, - "grad_norm": 0.644965410232544, - "learning_rate": 1.437152953054013e-05, - "loss": 0.0567, - "step": 5160 - }, - { - "epoch": 2.6097930338213025, - "grad_norm": 0.9058682322502136, - "learning_rate": 1.4341241797072186e-05, - "loss": 0.059, - "step": 5170 - }, - { - "epoch": 2.614840989399293, - "grad_norm": 0.6784061789512634, - "learning_rate": 1.4310954063604241e-05, - "loss": 0.0577, - "step": 5180 - }, - { - "epoch": 2.6198889449772844, - "grad_norm": 0.7699759602546692, - "learning_rate": 1.4280666330136296e-05, - "loss": 0.056, - "step": 5190 - }, - { - "epoch": 2.624936900555275, - "grad_norm": 1.0204094648361206, - "learning_rate": 1.425037859666835e-05, - "loss": 0.0595, - "step": 5200 - }, - { - "epoch": 2.629984856133266, - "grad_norm": 0.3317660987377167, - "learning_rate": 1.4220090863200403e-05, - "loss": 0.0579, - "step": 5210 - }, - { - "epoch": 2.635032811711257, - "grad_norm": 0.7586853504180908, - "learning_rate": 1.4189803129732458e-05, - "loss": 0.0612, - "step": 5220 - }, - { - "epoch": 2.640080767289248, - "grad_norm": 0.43295013904571533, - "learning_rate": 1.4159515396264513e-05, - "loss": 0.0584, - "step": 5230 - }, - { - "epoch": 2.6451287228672387, - "grad_norm": 0.9083705544471741, - "learning_rate": 1.4129227662796568e-05, - "loss": 0.0698, - "step": 5240 - }, - { - "epoch": 2.65017667844523, - "grad_norm": 0.6299885511398315, - "learning_rate": 1.4098939929328622e-05, - "loss": 0.0602, - "step": 5250 - }, - { - "epoch": 2.6552246340232206, - "grad_norm": 0.538589358329773, - "learning_rate": 1.4068652195860677e-05, - "loss": 0.0634, - "step": 5260 - }, - { - "epoch": 2.6602725896012114, - "grad_norm": 0.5712538361549377, - "learning_rate": 1.4038364462392732e-05, - "loss": 0.0625, - "step": 5270 - }, - { - "epoch": 2.6653205451792026, - "grad_norm": 0.5739433765411377, - "learning_rate": 1.4008076728924786e-05, - "loss": 0.0647, - "step": 5280 - }, - { - "epoch": 2.6703685007571933, - "grad_norm": 0.5050386786460876, - "learning_rate": 1.397778899545684e-05, - "loss": 0.0592, - "step": 5290 - }, - { - "epoch": 2.675416456335184, - "grad_norm": 0.41851407289505005, - "learning_rate": 1.3947501261988894e-05, - "loss": 0.0581, - "step": 5300 - }, - { - "epoch": 2.6804644119131753, - "grad_norm": 0.5866436958312988, - "learning_rate": 1.391721352852095e-05, - "loss": 0.0656, - "step": 5310 - }, - { - "epoch": 2.685512367491166, - "grad_norm": 0.47498345375061035, - "learning_rate": 1.3886925795053004e-05, - "loss": 0.0657, - "step": 5320 - }, - { - "epoch": 2.690560323069157, - "grad_norm": 0.5748500227928162, - "learning_rate": 1.385663806158506e-05, - "loss": 0.0588, - "step": 5330 - }, - { - "epoch": 2.695608278647148, - "grad_norm": 0.685787558555603, - "learning_rate": 1.3826350328117113e-05, - "loss": 0.0621, - "step": 5340 - }, - { - "epoch": 2.700656234225139, - "grad_norm": 0.5321753025054932, - "learning_rate": 1.3796062594649166e-05, - "loss": 0.0665, - "step": 5350 - }, - { - "epoch": 2.7057041898031295, - "grad_norm": 0.4687628746032715, - "learning_rate": 1.3765774861181222e-05, - "loss": 0.0622, - "step": 5360 - }, - { - "epoch": 2.7107521453811207, - "grad_norm": 0.6931032538414001, - "learning_rate": 1.3735487127713277e-05, - "loss": 0.0542, - "step": 5370 - }, - { - "epoch": 2.7158001009591115, - "grad_norm": 0.6347541213035583, - "learning_rate": 1.3705199394245332e-05, - "loss": 0.0618, - "step": 5380 - }, - { - "epoch": 2.7208480565371023, - "grad_norm": 0.5090097188949585, - "learning_rate": 1.3674911660777385e-05, - "loss": 0.0577, - "step": 5390 - }, - { - "epoch": 2.7258960121150935, - "grad_norm": 0.557161808013916, - "learning_rate": 1.3644623927309439e-05, - "loss": 0.0485, - "step": 5400 - }, - { - "epoch": 2.7309439676930842, - "grad_norm": 0.7229135036468506, - "learning_rate": 1.3614336193841494e-05, - "loss": 0.0642, - "step": 5410 - }, - { - "epoch": 2.735991923271075, - "grad_norm": 0.7802084684371948, - "learning_rate": 1.3584048460373549e-05, - "loss": 0.0721, - "step": 5420 - }, - { - "epoch": 2.741039878849066, - "grad_norm": 0.8350520730018616, - "learning_rate": 1.3553760726905604e-05, - "loss": 0.05, - "step": 5430 - }, - { - "epoch": 2.746087834427057, - "grad_norm": 0.24809196591377258, - "learning_rate": 1.3523472993437658e-05, - "loss": 0.0577, - "step": 5440 - }, - { - "epoch": 2.7511357900050477, - "grad_norm": 0.5501554608345032, - "learning_rate": 1.3493185259969713e-05, - "loss": 0.0613, - "step": 5450 - }, - { - "epoch": 2.756183745583039, - "grad_norm": 0.6459994912147522, - "learning_rate": 1.3462897526501768e-05, - "loss": 0.0545, - "step": 5460 - }, - { - "epoch": 2.7612317011610297, - "grad_norm": 1.0892735719680786, - "learning_rate": 1.3432609793033821e-05, - "loss": 0.0517, - "step": 5470 - }, - { - "epoch": 2.7662796567390204, - "grad_norm": 0.8553361296653748, - "learning_rate": 1.3402322059565877e-05, - "loss": 0.055, - "step": 5480 - }, - { - "epoch": 2.7713276123170116, - "grad_norm": 0.5909534692764282, - "learning_rate": 1.337203432609793e-05, - "loss": 0.0583, - "step": 5490 - }, - { - "epoch": 2.7763755678950024, - "grad_norm": 0.3620651662349701, - "learning_rate": 1.3341746592629985e-05, - "loss": 0.053, - "step": 5500 - }, - { - "epoch": 2.7814235234729936, - "grad_norm": 0.6525430083274841, - "learning_rate": 1.331145885916204e-05, - "loss": 0.0667, - "step": 5510 - }, - { - "epoch": 2.7864714790509844, - "grad_norm": 0.6129066944122314, - "learning_rate": 1.3281171125694095e-05, - "loss": 0.0578, - "step": 5520 - }, - { - "epoch": 2.791519434628975, - "grad_norm": 0.6374188661575317, - "learning_rate": 1.3250883392226147e-05, - "loss": 0.0598, - "step": 5530 - }, - { - "epoch": 2.7965673902069663, - "grad_norm": 0.6404274702072144, - "learning_rate": 1.3220595658758202e-05, - "loss": 0.064, - "step": 5540 - }, - { - "epoch": 2.801615345784957, - "grad_norm": 0.3882500231266022, - "learning_rate": 1.3190307925290257e-05, - "loss": 0.0556, - "step": 5550 - }, - { - "epoch": 2.8066633013629483, - "grad_norm": 0.827498197555542, - "learning_rate": 1.3160020191822313e-05, - "loss": 0.056, - "step": 5560 - }, - { - "epoch": 2.811711256940939, - "grad_norm": 0.5474889874458313, - "learning_rate": 1.3129732458354368e-05, - "loss": 0.0559, - "step": 5570 - }, - { - "epoch": 2.81675921251893, - "grad_norm": 0.7505003809928894, - "learning_rate": 1.3099444724886421e-05, - "loss": 0.0562, - "step": 5580 - }, - { - "epoch": 2.821807168096921, - "grad_norm": 0.7723977565765381, - "learning_rate": 1.3069156991418476e-05, - "loss": 0.0711, - "step": 5590 - }, - { - "epoch": 2.8268551236749118, - "grad_norm": 0.5930567979812622, - "learning_rate": 1.303886925795053e-05, - "loss": 0.0666, - "step": 5600 - }, - { - "epoch": 2.8319030792529025, - "grad_norm": 0.9205801486968994, - "learning_rate": 1.3008581524482585e-05, - "loss": 0.0635, - "step": 5610 - }, - { - "epoch": 2.8369510348308937, - "grad_norm": 0.6520891189575195, - "learning_rate": 1.297829379101464e-05, - "loss": 0.0503, - "step": 5620 - }, - { - "epoch": 2.8419989904088845, - "grad_norm": 0.697742760181427, - "learning_rate": 1.2948006057546693e-05, - "loss": 0.0527, - "step": 5630 - }, - { - "epoch": 2.8470469459868752, - "grad_norm": 0.5600337386131287, - "learning_rate": 1.2917718324078749e-05, - "loss": 0.0658, - "step": 5640 - }, - { - "epoch": 2.8520949015648664, - "grad_norm": 0.7648780941963196, - "learning_rate": 1.2887430590610804e-05, - "loss": 0.0503, - "step": 5650 - }, - { - "epoch": 2.857142857142857, - "grad_norm": 0.44580090045928955, - "learning_rate": 1.2857142857142857e-05, - "loss": 0.0569, - "step": 5660 - }, - { - "epoch": 2.862190812720848, - "grad_norm": 0.6274628043174744, - "learning_rate": 1.2826855123674912e-05, - "loss": 0.0544, - "step": 5670 - }, - { - "epoch": 2.867238768298839, - "grad_norm": 0.5967713594436646, - "learning_rate": 1.2796567390206966e-05, - "loss": 0.049, - "step": 5680 - }, - { - "epoch": 2.87228672387683, - "grad_norm": 0.49563518166542053, - "learning_rate": 1.2766279656739021e-05, - "loss": 0.0637, - "step": 5690 - }, - { - "epoch": 2.8773346794548207, - "grad_norm": 0.5065841674804688, - "learning_rate": 1.2735991923271076e-05, - "loss": 0.0635, - "step": 5700 - }, - { - "epoch": 2.882382635032812, - "grad_norm": 0.4228837490081787, - "learning_rate": 1.2705704189803131e-05, - "loss": 0.0561, - "step": 5710 - }, - { - "epoch": 2.8874305906108026, - "grad_norm": 0.36254429817199707, - "learning_rate": 1.2675416456335183e-05, - "loss": 0.0564, - "step": 5720 - }, - { - "epoch": 2.8924785461887934, - "grad_norm": 0.6964749097824097, - "learning_rate": 1.2645128722867238e-05, - "loss": 0.0566, - "step": 5730 - }, - { - "epoch": 2.8975265017667846, - "grad_norm": 1.2399131059646606, - "learning_rate": 1.2614840989399293e-05, - "loss": 0.0528, - "step": 5740 - }, - { - "epoch": 2.9025744573447754, - "grad_norm": 0.45011046528816223, - "learning_rate": 1.2584553255931348e-05, - "loss": 0.0605, - "step": 5750 - }, - { - "epoch": 2.907622412922766, - "grad_norm": 0.6450422406196594, - "learning_rate": 1.2554265522463404e-05, - "loss": 0.0579, - "step": 5760 - }, - { - "epoch": 2.9126703685007573, - "grad_norm": 0.6685008406639099, - "learning_rate": 1.2523977788995457e-05, - "loss": 0.0596, - "step": 5770 - }, - { - "epoch": 2.917718324078748, - "grad_norm": 0.7710725665092468, - "learning_rate": 1.2493690055527512e-05, - "loss": 0.063, - "step": 5780 - }, - { - "epoch": 2.922766279656739, - "grad_norm": 0.6229269504547119, - "learning_rate": 1.2463402322059566e-05, - "loss": 0.0542, - "step": 5790 - }, - { - "epoch": 2.92781423523473, - "grad_norm": 0.41364407539367676, - "learning_rate": 1.243311458859162e-05, - "loss": 0.0588, - "step": 5800 - }, - { - "epoch": 2.932862190812721, - "grad_norm": 0.5546961426734924, - "learning_rate": 1.2402826855123676e-05, - "loss": 0.0607, - "step": 5810 - }, - { - "epoch": 2.9379101463907116, - "grad_norm": 0.6814476251602173, - "learning_rate": 1.237253912165573e-05, - "loss": 0.0587, - "step": 5820 - }, - { - "epoch": 2.9429581019687028, - "grad_norm": 0.7745892405509949, - "learning_rate": 1.2342251388187784e-05, - "loss": 0.0484, - "step": 5830 - }, - { - "epoch": 2.9480060575466935, - "grad_norm": 0.9947149157524109, - "learning_rate": 1.231196365471984e-05, - "loss": 0.056, - "step": 5840 - }, - { - "epoch": 2.9530540131246843, - "grad_norm": 0.599892258644104, - "learning_rate": 1.2281675921251893e-05, - "loss": 0.0603, - "step": 5850 - }, - { - "epoch": 2.9581019687026755, - "grad_norm": 0.4991750121116638, - "learning_rate": 1.2251388187783947e-05, - "loss": 0.0603, - "step": 5860 - }, - { - "epoch": 2.9631499242806663, - "grad_norm": 0.44697603583335876, - "learning_rate": 1.2221100454316002e-05, - "loss": 0.0614, - "step": 5870 - }, - { - "epoch": 2.968197879858657, - "grad_norm": 0.34608447551727295, - "learning_rate": 1.2190812720848057e-05, - "loss": 0.0633, - "step": 5880 - }, - { - "epoch": 2.973245835436648, - "grad_norm": 0.6991161108016968, - "learning_rate": 1.2160524987380112e-05, - "loss": 0.0713, - "step": 5890 - }, - { - "epoch": 2.978293791014639, - "grad_norm": 0.7053156495094299, - "learning_rate": 1.2130237253912167e-05, - "loss": 0.0642, - "step": 5900 - }, - { - "epoch": 2.9833417465926297, - "grad_norm": 0.4541454315185547, - "learning_rate": 1.209994952044422e-05, - "loss": 0.0583, - "step": 5910 - }, - { - "epoch": 2.988389702170621, - "grad_norm": 0.5963706970214844, - "learning_rate": 1.2069661786976274e-05, - "loss": 0.0551, - "step": 5920 - }, - { - "epoch": 2.9934376577486117, - "grad_norm": 0.37611526250839233, - "learning_rate": 1.2039374053508329e-05, - "loss": 0.0551, - "step": 5930 - }, - { - "epoch": 2.9984856133266025, - "grad_norm": 0.5949448943138123, - "learning_rate": 1.2009086320040384e-05, - "loss": 0.0615, - "step": 5940 - }, - { - "epoch": 3.0, - "eval_f1": 0.9705180789481339, - "eval_loss": 0.04155249148607254, - "eval_runtime": 582.0561, - "eval_samples_per_second": 354.368, - "eval_steps_per_second": 2.769, - "step": 5943 - }, - { - "epoch": 3.0035335689045937, - "grad_norm": 0.732612133026123, - "learning_rate": 1.197879858657244e-05, - "loss": 0.0473, - "step": 5950 - }, - { - "epoch": 3.0085815244825844, - "grad_norm": 0.8803137540817261, - "learning_rate": 1.1948510853104493e-05, - "loss": 0.0513, - "step": 5960 - }, - { - "epoch": 3.0136294800605756, - "grad_norm": 0.5578094720840454, - "learning_rate": 1.1918223119636548e-05, - "loss": 0.0603, - "step": 5970 - }, - { - "epoch": 3.0186774356385664, - "grad_norm": 0.9948665499687195, - "learning_rate": 1.1887935386168601e-05, - "loss": 0.0592, - "step": 5980 - }, - { - "epoch": 3.023725391216557, - "grad_norm": 0.6967259049415588, - "learning_rate": 1.1857647652700657e-05, - "loss": 0.0741, - "step": 5990 - }, - { - "epoch": 3.0287733467945483, - "grad_norm": 0.48011064529418945, - "learning_rate": 1.182735991923271e-05, - "loss": 0.055, - "step": 6000 - }, - { - "epoch": 3.033821302372539, - "grad_norm": 0.663847804069519, - "learning_rate": 1.1797072185764765e-05, - "loss": 0.0591, - "step": 6010 - }, - { - "epoch": 3.03886925795053, - "grad_norm": 0.589154839515686, - "learning_rate": 1.176678445229682e-05, - "loss": 0.0508, - "step": 6020 - }, - { - "epoch": 3.043917213528521, - "grad_norm": 0.7075181007385254, - "learning_rate": 1.1736496718828875e-05, - "loss": 0.0493, - "step": 6030 - }, - { - "epoch": 3.048965169106512, - "grad_norm": 0.6230030655860901, - "learning_rate": 1.1706208985360929e-05, - "loss": 0.0589, - "step": 6040 - }, - { - "epoch": 3.0540131246845026, - "grad_norm": 0.6204888820648193, - "learning_rate": 1.1675921251892982e-05, - "loss": 0.0602, - "step": 6050 - }, - { - "epoch": 3.059061080262494, - "grad_norm": 0.456939160823822, - "learning_rate": 1.1645633518425038e-05, - "loss": 0.059, - "step": 6060 - }, - { - "epoch": 3.0641090358404846, - "grad_norm": 0.7607660889625549, - "learning_rate": 1.1615345784957093e-05, - "loss": 0.0488, - "step": 6070 - }, - { - "epoch": 3.0691569914184753, - "grad_norm": 1.2064040899276733, - "learning_rate": 1.1585058051489148e-05, - "loss": 0.0695, - "step": 6080 - }, - { - "epoch": 3.0742049469964665, - "grad_norm": 0.5143324732780457, - "learning_rate": 1.1554770318021203e-05, - "loss": 0.0606, - "step": 6090 - }, - { - "epoch": 3.0792529025744573, - "grad_norm": 0.6567758917808533, - "learning_rate": 1.1524482584553256e-05, - "loss": 0.0581, - "step": 6100 - }, - { - "epoch": 3.0843008581524485, - "grad_norm": 0.7469787001609802, - "learning_rate": 1.149419485108531e-05, - "loss": 0.0535, - "step": 6110 - }, - { - "epoch": 3.0893488137304392, - "grad_norm": 0.40161028504371643, - "learning_rate": 1.1463907117617365e-05, - "loss": 0.056, - "step": 6120 - }, - { - "epoch": 3.09439676930843, - "grad_norm": 0.7404605150222778, - "learning_rate": 1.143361938414942e-05, - "loss": 0.0471, - "step": 6130 - }, - { - "epoch": 3.099444724886421, - "grad_norm": 0.8587531447410583, - "learning_rate": 1.1403331650681475e-05, - "loss": 0.0558, - "step": 6140 - }, - { - "epoch": 3.104492680464412, - "grad_norm": 0.424450159072876, - "learning_rate": 1.1373043917213529e-05, - "loss": 0.0558, - "step": 6150 - }, - { - "epoch": 3.1095406360424027, - "grad_norm": 0.9383788704872131, - "learning_rate": 1.1342756183745584e-05, - "loss": 0.0517, - "step": 6160 - }, - { - "epoch": 3.114588591620394, - "grad_norm": 0.8069589734077454, - "learning_rate": 1.1312468450277637e-05, - "loss": 0.0588, - "step": 6170 - }, - { - "epoch": 3.1196365471983847, - "grad_norm": 0.8677689433097839, - "learning_rate": 1.1282180716809692e-05, - "loss": 0.0611, - "step": 6180 - }, - { - "epoch": 3.1246845027763754, - "grad_norm": 0.7949932813644409, - "learning_rate": 1.1251892983341746e-05, - "loss": 0.0553, - "step": 6190 - }, - { - "epoch": 3.1297324583543666, - "grad_norm": 0.6563514471054077, - "learning_rate": 1.1221605249873801e-05, - "loss": 0.0549, - "step": 6200 - }, - { - "epoch": 3.1347804139323574, - "grad_norm": 0.5856168866157532, - "learning_rate": 1.1191317516405856e-05, - "loss": 0.0585, - "step": 6210 - }, - { - "epoch": 3.139828369510348, - "grad_norm": 0.6840217709541321, - "learning_rate": 1.1161029782937911e-05, - "loss": 0.0683, - "step": 6220 - }, - { - "epoch": 3.1448763250883394, - "grad_norm": 1.310652494430542, - "learning_rate": 1.1130742049469966e-05, - "loss": 0.057, - "step": 6230 - }, - { - "epoch": 3.14992428066633, - "grad_norm": 0.6700050830841064, - "learning_rate": 1.1100454316002018e-05, - "loss": 0.0562, - "step": 6240 - }, - { - "epoch": 3.154972236244321, - "grad_norm": 0.5210493803024292, - "learning_rate": 1.1070166582534073e-05, - "loss": 0.0545, - "step": 6250 - }, - { - "epoch": 3.160020191822312, - "grad_norm": 0.44693487882614136, - "learning_rate": 1.1039878849066128e-05, - "loss": 0.0614, - "step": 6260 - }, - { - "epoch": 3.165068147400303, - "grad_norm": 0.8827401995658875, - "learning_rate": 1.1009591115598184e-05, - "loss": 0.06, - "step": 6270 - }, - { - "epoch": 3.1701161029782936, - "grad_norm": 0.29074421525001526, - "learning_rate": 1.0979303382130239e-05, - "loss": 0.059, - "step": 6280 - }, - { - "epoch": 3.175164058556285, - "grad_norm": 0.8659618496894836, - "learning_rate": 1.0949015648662292e-05, - "loss": 0.0541, - "step": 6290 - }, - { - "epoch": 3.1802120141342756, - "grad_norm": 0.8624622821807861, - "learning_rate": 1.0918727915194346e-05, - "loss": 0.0661, - "step": 6300 - }, - { - "epoch": 3.1852599697122663, - "grad_norm": 0.6411763429641724, - "learning_rate": 1.08884401817264e-05, - "loss": 0.0642, - "step": 6310 - }, - { - "epoch": 3.1903079252902575, - "grad_norm": 0.5271298289299011, - "learning_rate": 1.0858152448258456e-05, - "loss": 0.0552, - "step": 6320 - }, - { - "epoch": 3.1953558808682483, - "grad_norm": 0.9701720476150513, - "learning_rate": 1.082786471479051e-05, - "loss": 0.0586, - "step": 6330 - }, - { - "epoch": 3.200403836446239, - "grad_norm": 0.5633390545845032, - "learning_rate": 1.0797576981322565e-05, - "loss": 0.0554, - "step": 6340 - }, - { - "epoch": 3.2054517920242303, - "grad_norm": 0.45846840739250183, - "learning_rate": 1.076728924785462e-05, - "loss": 0.0582, - "step": 6350 - }, - { - "epoch": 3.210499747602221, - "grad_norm": 0.43338650465011597, - "learning_rate": 1.0737001514386673e-05, - "loss": 0.0588, - "step": 6360 - }, - { - "epoch": 3.215547703180212, - "grad_norm": 0.8287716507911682, - "learning_rate": 1.0706713780918728e-05, - "loss": 0.053, - "step": 6370 - }, - { - "epoch": 3.220595658758203, - "grad_norm": 0.5174350142478943, - "learning_rate": 1.0676426047450782e-05, - "loss": 0.0587, - "step": 6380 - }, - { - "epoch": 3.2256436143361937, - "grad_norm": 0.47460228204727173, - "learning_rate": 1.0646138313982837e-05, - "loss": 0.0598, - "step": 6390 - }, - { - "epoch": 3.230691569914185, - "grad_norm": 0.49122539162635803, - "learning_rate": 1.0615850580514892e-05, - "loss": 0.0535, - "step": 6400 - }, - { - "epoch": 3.2357395254921757, - "grad_norm": 0.5462148189544678, - "learning_rate": 1.0585562847046947e-05, - "loss": 0.0518, - "step": 6410 - }, - { - "epoch": 3.2407874810701665, - "grad_norm": 0.7671846747398376, - "learning_rate": 1.0555275113579002e-05, - "loss": 0.0611, - "step": 6420 - }, - { - "epoch": 3.2458354366481577, - "grad_norm": 0.6748913526535034, - "learning_rate": 1.0524987380111054e-05, - "loss": 0.0561, - "step": 6430 - }, - { - "epoch": 3.2508833922261484, - "grad_norm": 0.5004613399505615, - "learning_rate": 1.049469964664311e-05, - "loss": 0.0534, - "step": 6440 - }, - { - "epoch": 3.255931347804139, - "grad_norm": 0.4895551800727844, - "learning_rate": 1.0464411913175164e-05, - "loss": 0.0459, - "step": 6450 - }, - { - "epoch": 3.2609793033821304, - "grad_norm": 0.47480469942092896, - "learning_rate": 1.043412417970722e-05, - "loss": 0.0601, - "step": 6460 - }, - { - "epoch": 3.266027258960121, - "grad_norm": 0.4885694086551666, - "learning_rate": 1.0403836446239273e-05, - "loss": 0.0598, - "step": 6470 - }, - { - "epoch": 3.271075214538112, - "grad_norm": 0.6375486254692078, - "learning_rate": 1.0373548712771328e-05, - "loss": 0.0602, - "step": 6480 - }, - { - "epoch": 3.276123170116103, - "grad_norm": 0.7264606356620789, - "learning_rate": 1.0343260979303382e-05, - "loss": 0.0579, - "step": 6490 - }, - { - "epoch": 3.281171125694094, - "grad_norm": 0.5704456567764282, - "learning_rate": 1.0312973245835437e-05, - "loss": 0.056, - "step": 6500 - }, - { - "epoch": 3.2862190812720846, - "grad_norm": 0.6324512362480164, - "learning_rate": 1.0282685512367492e-05, - "loss": 0.0515, - "step": 6510 - }, - { - "epoch": 3.291267036850076, - "grad_norm": 0.5736483931541443, - "learning_rate": 1.0252397778899545e-05, - "loss": 0.0538, - "step": 6520 - }, - { - "epoch": 3.2963149924280666, - "grad_norm": 0.48032522201538086, - "learning_rate": 1.02221100454316e-05, - "loss": 0.0568, - "step": 6530 - }, - { - "epoch": 3.301362948006058, - "grad_norm": 0.6696997880935669, - "learning_rate": 1.0191822311963656e-05, - "loss": 0.0537, - "step": 6540 - }, - { - "epoch": 3.3064109035840485, - "grad_norm": 0.44333356618881226, - "learning_rate": 1.016153457849571e-05, - "loss": 0.0514, - "step": 6550 - }, - { - "epoch": 3.3114588591620393, - "grad_norm": 0.6224443912506104, - "learning_rate": 1.0131246845027764e-05, - "loss": 0.0607, - "step": 6560 - }, - { - "epoch": 3.3165068147400305, - "grad_norm": 0.7066437602043152, - "learning_rate": 1.0100959111559818e-05, - "loss": 0.0563, - "step": 6570 - }, - { - "epoch": 3.3215547703180213, - "grad_norm": 0.6406083106994629, - "learning_rate": 1.0070671378091873e-05, - "loss": 0.0573, - "step": 6580 - }, - { - "epoch": 3.326602725896012, - "grad_norm": 0.44534462690353394, - "learning_rate": 1.0040383644623928e-05, - "loss": 0.059, - "step": 6590 - }, - { - "epoch": 3.3316506814740032, - "grad_norm": 0.7137624025344849, - "learning_rate": 1.0010095911155983e-05, - "loss": 0.0568, - "step": 6600 - }, - { - "epoch": 3.336698637051994, - "grad_norm": 0.6909269690513611, - "learning_rate": 9.979808177688038e-06, - "loss": 0.0493, - "step": 6610 - }, - { - "epoch": 3.3417465926299847, - "grad_norm": 0.6987153887748718, - "learning_rate": 9.94952044422009e-06, - "loss": 0.059, - "step": 6620 - }, - { - "epoch": 3.346794548207976, - "grad_norm": 0.538732647895813, - "learning_rate": 9.919232710752145e-06, - "loss": 0.0582, - "step": 6630 - }, - { - "epoch": 3.3518425037859667, - "grad_norm": 0.6330693960189819, - "learning_rate": 9.8889449772842e-06, - "loss": 0.0506, - "step": 6640 - }, - { - "epoch": 3.3568904593639575, - "grad_norm": 0.5216783881187439, - "learning_rate": 9.858657243816255e-06, - "loss": 0.0544, - "step": 6650 - }, - { - "epoch": 3.3619384149419487, - "grad_norm": 0.7052462697029114, - "learning_rate": 9.828369510348309e-06, - "loss": 0.0553, - "step": 6660 - }, - { - "epoch": 3.3669863705199394, - "grad_norm": 0.7679615616798401, - "learning_rate": 9.798081776880364e-06, - "loss": 0.061, - "step": 6670 - }, - { - "epoch": 3.37203432609793, - "grad_norm": 0.530564546585083, - "learning_rate": 9.767794043412417e-06, - "loss": 0.0567, - "step": 6680 - }, - { - "epoch": 3.3770822816759214, - "grad_norm": 0.6907301545143127, - "learning_rate": 9.737506309944473e-06, - "loss": 0.0561, - "step": 6690 - }, - { - "epoch": 3.382130237253912, - "grad_norm": 0.7837420105934143, - "learning_rate": 9.707218576476528e-06, - "loss": 0.0618, - "step": 6700 - }, - { - "epoch": 3.387178192831903, - "grad_norm": 0.6361984014511108, - "learning_rate": 9.676930843008581e-06, - "loss": 0.0533, - "step": 6710 - }, - { - "epoch": 3.392226148409894, - "grad_norm": 0.6775834560394287, - "learning_rate": 9.646643109540636e-06, - "loss": 0.0571, - "step": 6720 - }, - { - "epoch": 3.397274103987885, - "grad_norm": 0.4820801615715027, - "learning_rate": 9.616355376072691e-06, - "loss": 0.063, - "step": 6730 - }, - { - "epoch": 3.4023220595658756, - "grad_norm": 0.511091411113739, - "learning_rate": 9.586067642604747e-06, - "loss": 0.0621, - "step": 6740 - }, - { - "epoch": 3.407370015143867, - "grad_norm": 0.5163900852203369, - "learning_rate": 9.5557799091368e-06, - "loss": 0.0606, - "step": 6750 - }, - { - "epoch": 3.4124179707218576, - "grad_norm": 0.4652441740036011, - "learning_rate": 9.525492175668853e-06, - "loss": 0.0539, - "step": 6760 - }, - { - "epoch": 3.4174659262998484, - "grad_norm": 0.5968872904777527, - "learning_rate": 9.495204442200909e-06, - "loss": 0.0599, - "step": 6770 - }, - { - "epoch": 3.4225138818778396, - "grad_norm": 0.4634818732738495, - "learning_rate": 9.464916708732964e-06, - "loss": 0.0518, - "step": 6780 - }, - { - "epoch": 3.4275618374558303, - "grad_norm": 0.34169018268585205, - "learning_rate": 9.434628975265019e-06, - "loss": 0.0588, - "step": 6790 - }, - { - "epoch": 3.432609793033821, - "grad_norm": 0.719494640827179, - "learning_rate": 9.404341241797072e-06, - "loss": 0.0538, - "step": 6800 - }, - { - "epoch": 3.4376577486118123, - "grad_norm": 0.4465346336364746, - "learning_rate": 9.374053508329126e-06, - "loss": 0.0577, - "step": 6810 - }, - { - "epoch": 3.442705704189803, - "grad_norm": 0.6223052740097046, - "learning_rate": 9.343765774861181e-06, - "loss": 0.0598, - "step": 6820 - }, - { - "epoch": 3.447753659767794, - "grad_norm": 0.6854692697525024, - "learning_rate": 9.313478041393236e-06, - "loss": 0.0544, - "step": 6830 - }, - { - "epoch": 3.452801615345785, - "grad_norm": 1.0640225410461426, - "learning_rate": 9.283190307925291e-06, - "loss": 0.0569, - "step": 6840 - }, - { - "epoch": 3.4578495709237758, - "grad_norm": 0.5437650680541992, - "learning_rate": 9.252902574457345e-06, - "loss": 0.0612, - "step": 6850 - }, - { - "epoch": 3.462897526501767, - "grad_norm": 0.5767130255699158, - "learning_rate": 9.2226148409894e-06, - "loss": 0.0618, - "step": 6860 - }, - { - "epoch": 3.4679454820797577, - "grad_norm": 0.5814956426620483, - "learning_rate": 9.192327107521453e-06, - "loss": 0.0571, - "step": 6870 - }, - { - "epoch": 3.4729934376577485, - "grad_norm": 0.31469887495040894, - "learning_rate": 9.162039374053508e-06, - "loss": 0.0573, - "step": 6880 - }, - { - "epoch": 3.4780413932357397, - "grad_norm": 0.3987484872341156, - "learning_rate": 9.131751640585563e-06, - "loss": 0.0534, - "step": 6890 - }, - { - "epoch": 3.4830893488137304, - "grad_norm": 0.47312065958976746, - "learning_rate": 9.101463907117617e-06, - "loss": 0.0608, - "step": 6900 - }, - { - "epoch": 3.488137304391721, - "grad_norm": 0.4635220170021057, - "learning_rate": 9.071176173649672e-06, - "loss": 0.05, - "step": 6910 - }, - { - "epoch": 3.4931852599697124, - "grad_norm": 1.146721363067627, - "learning_rate": 9.040888440181727e-06, - "loss": 0.0548, - "step": 6920 - }, - { - "epoch": 3.498233215547703, - "grad_norm": 0.42057961225509644, - "learning_rate": 9.010600706713782e-06, - "loss": 0.0463, - "step": 6930 - }, - { - "epoch": 3.5032811711256944, - "grad_norm": 0.7835047841072083, - "learning_rate": 8.980312973245836e-06, - "loss": 0.0507, - "step": 6940 - }, - { - "epoch": 3.508329126703685, - "grad_norm": 0.6441161036491394, - "learning_rate": 8.95002523977789e-06, - "loss": 0.0571, - "step": 6950 - }, - { - "epoch": 3.513377082281676, - "grad_norm": 0.6828143000602722, - "learning_rate": 8.919737506309944e-06, - "loss": 0.0525, - "step": 6960 - }, - { - "epoch": 3.518425037859667, - "grad_norm": 0.8285954594612122, - "learning_rate": 8.889449772842e-06, - "loss": 0.0621, - "step": 6970 - }, - { - "epoch": 3.523472993437658, - "grad_norm": 0.4954177439212799, - "learning_rate": 8.859162039374055e-06, - "loss": 0.0625, - "step": 6980 - }, - { - "epoch": 3.5285209490156486, - "grad_norm": 0.7900820374488831, - "learning_rate": 8.828874305906108e-06, - "loss": 0.0603, - "step": 6990 - }, - { - "epoch": 3.53356890459364, - "grad_norm": 0.6767242550849915, - "learning_rate": 8.798586572438162e-06, - "loss": 0.0586, - "step": 7000 - }, - { - "epoch": 3.5386168601716306, - "grad_norm": 0.5408624410629272, - "learning_rate": 8.768298838970217e-06, - "loss": 0.0561, - "step": 7010 - }, - { - "epoch": 3.5436648157496213, - "grad_norm": 0.4577973484992981, - "learning_rate": 8.738011105502272e-06, - "loss": 0.057, - "step": 7020 - }, - { - "epoch": 3.5487127713276125, - "grad_norm": 0.7334242463111877, - "learning_rate": 8.707723372034327e-06, - "loss": 0.0602, - "step": 7030 - }, - { - "epoch": 3.5537607269056033, - "grad_norm": 0.5569146275520325, - "learning_rate": 8.67743563856638e-06, - "loss": 0.0564, - "step": 7040 - }, - { - "epoch": 3.558808682483594, - "grad_norm": 0.5739743709564209, - "learning_rate": 8.647147905098436e-06, - "loss": 0.0605, - "step": 7050 - }, - { - "epoch": 3.5638566380615853, - "grad_norm": 0.5553867816925049, - "learning_rate": 8.61686017163049e-06, - "loss": 0.0573, - "step": 7060 - }, - { - "epoch": 3.568904593639576, - "grad_norm": 0.7109550833702087, - "learning_rate": 8.586572438162544e-06, - "loss": 0.0634, - "step": 7070 - }, - { - "epoch": 3.5739525492175668, - "grad_norm": 0.46534502506256104, - "learning_rate": 8.5562847046946e-06, - "loss": 0.0494, - "step": 7080 - }, - { - "epoch": 3.579000504795558, - "grad_norm": 0.47850191593170166, - "learning_rate": 8.525996971226653e-06, - "loss": 0.0613, - "step": 7090 - }, - { - "epoch": 3.5840484603735487, - "grad_norm": 0.3749614953994751, - "learning_rate": 8.495709237758708e-06, - "loss": 0.0574, - "step": 7100 - }, - { - "epoch": 3.5890964159515395, - "grad_norm": 0.5852258801460266, - "learning_rate": 8.465421504290763e-06, - "loss": 0.064, - "step": 7110 - }, - { - "epoch": 3.5941443715295307, - "grad_norm": 0.3820860981941223, - "learning_rate": 8.435133770822818e-06, - "loss": 0.0559, - "step": 7120 - }, - { - "epoch": 3.5991923271075215, - "grad_norm": 0.5200080275535583, - "learning_rate": 8.40484603735487e-06, - "loss": 0.0556, - "step": 7130 - }, - { - "epoch": 3.604240282685512, - "grad_norm": 0.6472256183624268, - "learning_rate": 8.374558303886925e-06, - "loss": 0.0596, - "step": 7140 - }, - { - "epoch": 3.6092882382635034, - "grad_norm": 0.43182119727134705, - "learning_rate": 8.34427057041898e-06, - "loss": 0.0478, - "step": 7150 - }, - { - "epoch": 3.614336193841494, - "grad_norm": 0.6659020781517029, - "learning_rate": 8.313982836951035e-06, - "loss": 0.054, - "step": 7160 - }, - { - "epoch": 3.619384149419485, - "grad_norm": 0.6561934947967529, - "learning_rate": 8.28369510348309e-06, - "loss": 0.0583, - "step": 7170 - }, - { - "epoch": 3.624432104997476, - "grad_norm": 0.7083423733711243, - "learning_rate": 8.253407370015144e-06, - "loss": 0.0598, - "step": 7180 - }, - { - "epoch": 3.629480060575467, - "grad_norm": 0.6030146479606628, - "learning_rate": 8.223119636547197e-06, - "loss": 0.0569, - "step": 7190 - }, - { - "epoch": 3.6345280161534577, - "grad_norm": 0.4650856554508209, - "learning_rate": 8.192831903079253e-06, - "loss": 0.0593, - "step": 7200 - }, - { - "epoch": 3.639575971731449, - "grad_norm": 0.5656235814094543, - "learning_rate": 8.162544169611308e-06, - "loss": 0.058, - "step": 7210 - }, - { - "epoch": 3.6446239273094396, - "grad_norm": 0.5745735764503479, - "learning_rate": 8.132256436143363e-06, - "loss": 0.0582, - "step": 7220 - }, - { - "epoch": 3.6496718828874304, - "grad_norm": 0.7879515886306763, - "learning_rate": 8.101968702675416e-06, - "loss": 0.0593, - "step": 7230 - }, - { - "epoch": 3.6547198384654216, - "grad_norm": 0.7000477313995361, - "learning_rate": 8.071680969207471e-06, - "loss": 0.0517, - "step": 7240 - }, - { - "epoch": 3.6597677940434123, - "grad_norm": 0.44397464394569397, - "learning_rate": 8.041393235739527e-06, - "loss": 0.0569, - "step": 7250 - }, - { - "epoch": 3.664815749621403, - "grad_norm": 0.55961674451828, - "learning_rate": 8.01110550227158e-06, - "loss": 0.0529, - "step": 7260 - }, - { - "epoch": 3.6698637051993943, - "grad_norm": 0.5441805720329285, - "learning_rate": 7.980817768803635e-06, - "loss": 0.0537, - "step": 7270 - }, - { - "epoch": 3.674911660777385, - "grad_norm": 0.5779780149459839, - "learning_rate": 7.950530035335689e-06, - "loss": 0.0549, - "step": 7280 - }, - { - "epoch": 3.679959616355376, - "grad_norm": 0.4491129517555237, - "learning_rate": 7.920242301867744e-06, - "loss": 0.0527, - "step": 7290 - }, - { - "epoch": 3.685007571933367, - "grad_norm": 0.6601787209510803, - "learning_rate": 7.889954568399799e-06, - "loss": 0.0545, - "step": 7300 - }, - { - "epoch": 3.690055527511358, - "grad_norm": 0.7920609712600708, - "learning_rate": 7.859666834931854e-06, - "loss": 0.0607, - "step": 7310 - }, - { - "epoch": 3.6951034830893486, - "grad_norm": 0.6220458149909973, - "learning_rate": 7.829379101463906e-06, - "loss": 0.0574, - "step": 7320 - }, - { - "epoch": 3.7001514386673398, - "grad_norm": 0.6900739669799805, - "learning_rate": 7.799091367995961e-06, - "loss": 0.0549, - "step": 7330 - }, - { - "epoch": 3.7051993942453305, - "grad_norm": 1.071191430091858, - "learning_rate": 7.768803634528016e-06, - "loss": 0.0644, - "step": 7340 - }, - { - "epoch": 3.7102473498233217, - "grad_norm": 0.5342854261398315, - "learning_rate": 7.738515901060071e-06, - "loss": 0.0639, - "step": 7350 - }, - { - "epoch": 3.7152953054013125, - "grad_norm": 0.49695709347724915, - "learning_rate": 7.708228167592126e-06, - "loss": 0.0525, - "step": 7360 - }, - { - "epoch": 3.7203432609793032, - "grad_norm": 0.6041547060012817, - "learning_rate": 7.67794043412418e-06, - "loss": 0.0596, - "step": 7370 - }, - { - "epoch": 3.7253912165572944, - "grad_norm": 0.6425964832305908, - "learning_rate": 7.647652700656235e-06, - "loss": 0.0626, - "step": 7380 - }, - { - "epoch": 3.730439172135285, - "grad_norm": 0.5185597538948059, - "learning_rate": 7.617364967188288e-06, - "loss": 0.063, - "step": 7390 - }, - { - "epoch": 3.7354871277132764, - "grad_norm": 0.48031681776046753, - "learning_rate": 7.587077233720343e-06, - "loss": 0.0633, - "step": 7400 - }, - { - "epoch": 3.740535083291267, - "grad_norm": 0.46377626061439514, - "learning_rate": 7.556789500252398e-06, - "loss": 0.0581, - "step": 7410 - }, - { - "epoch": 3.745583038869258, - "grad_norm": 0.7336452007293701, - "learning_rate": 7.526501766784453e-06, - "loss": 0.0572, - "step": 7420 - }, - { - "epoch": 3.750630994447249, - "grad_norm": 0.8720684051513672, - "learning_rate": 7.4962140333165064e-06, - "loss": 0.0558, - "step": 7430 - }, - { - "epoch": 3.75567895002524, - "grad_norm": 0.372592031955719, - "learning_rate": 7.465926299848562e-06, - "loss": 0.0613, - "step": 7440 - }, - { - "epoch": 3.7607269056032306, - "grad_norm": 0.5049020648002625, - "learning_rate": 7.435638566380616e-06, - "loss": 0.058, - "step": 7450 - }, - { - "epoch": 3.765774861181222, - "grad_norm": 0.5402325391769409, - "learning_rate": 7.405350832912671e-06, - "loss": 0.0484, - "step": 7460 - }, - { - "epoch": 3.7708228167592126, - "grad_norm": 0.5662652850151062, - "learning_rate": 7.375063099444725e-06, - "loss": 0.0613, - "step": 7470 - }, - { - "epoch": 3.7758707723372034, - "grad_norm": 0.6431825160980225, - "learning_rate": 7.34477536597678e-06, - "loss": 0.0522, - "step": 7480 - }, - { - "epoch": 3.7809187279151946, - "grad_norm": 0.9309275150299072, - "learning_rate": 7.314487632508835e-06, - "loss": 0.0602, - "step": 7490 - }, - { - "epoch": 3.7859666834931853, - "grad_norm": 0.801145076751709, - "learning_rate": 7.284199899040888e-06, - "loss": 0.0581, - "step": 7500 - }, - { - "epoch": 3.791014639071176, - "grad_norm": 0.5122712850570679, - "learning_rate": 7.253912165572943e-06, - "loss": 0.0552, - "step": 7510 - }, - { - "epoch": 3.7960625946491673, - "grad_norm": 0.39402052760124207, - "learning_rate": 7.223624432104998e-06, - "loss": 0.0552, - "step": 7520 - }, - { - "epoch": 3.801110550227158, - "grad_norm": 0.5302004814147949, - "learning_rate": 7.193336698637052e-06, - "loss": 0.0626, - "step": 7530 - }, - { - "epoch": 3.806158505805149, - "grad_norm": 0.4123098850250244, - "learning_rate": 7.163048965169107e-06, - "loss": 0.0569, - "step": 7540 - }, - { - "epoch": 3.81120646138314, - "grad_norm": 0.8736279010772705, - "learning_rate": 7.132761231701161e-06, - "loss": 0.0537, - "step": 7550 - }, - { - "epoch": 3.8162544169611308, - "grad_norm": 0.4374080002307892, - "learning_rate": 7.102473498233216e-06, - "loss": 0.057, - "step": 7560 - }, - { - "epoch": 3.8213023725391215, - "grad_norm": 0.863776445388794, - "learning_rate": 7.07218576476527e-06, - "loss": 0.049, - "step": 7570 - }, - { - "epoch": 3.8263503281171127, - "grad_norm": 0.5356324315071106, - "learning_rate": 7.041898031297325e-06, - "loss": 0.0578, - "step": 7580 - }, - { - "epoch": 3.8313982836951035, - "grad_norm": 0.5422727465629578, - "learning_rate": 7.0116102978293786e-06, - "loss": 0.0577, - "step": 7590 - }, - { - "epoch": 3.8364462392730942, - "grad_norm": 0.6234108805656433, - "learning_rate": 6.981322564361434e-06, - "loss": 0.0573, - "step": 7600 - }, - { - "epoch": 3.8414941948510855, - "grad_norm": 0.9067860841751099, - "learning_rate": 6.951034830893489e-06, - "loss": 0.0471, - "step": 7610 - }, - { - "epoch": 3.846542150429076, - "grad_norm": 0.5522469878196716, - "learning_rate": 6.920747097425543e-06, - "loss": 0.053, - "step": 7620 - }, - { - "epoch": 3.851590106007067, - "grad_norm": 0.7358270287513733, - "learning_rate": 6.8904593639575974e-06, - "loss": 0.0561, - "step": 7630 - }, - { - "epoch": 3.856638061585058, - "grad_norm": 0.5285794138908386, - "learning_rate": 6.860171630489652e-06, - "loss": 0.0618, - "step": 7640 - }, - { - "epoch": 3.861686017163049, - "grad_norm": 0.6937068700790405, - "learning_rate": 6.829883897021707e-06, - "loss": 0.059, - "step": 7650 - }, - { - "epoch": 3.8667339727410397, - "grad_norm": 0.6941738724708557, - "learning_rate": 6.79959616355376e-06, - "loss": 0.0515, - "step": 7660 - }, - { - "epoch": 3.871781928319031, - "grad_norm": 0.8964054584503174, - "learning_rate": 6.7693084300858155e-06, - "loss": 0.0526, - "step": 7670 - }, - { - "epoch": 3.8768298838970217, - "grad_norm": 0.5919986367225647, - "learning_rate": 6.739020696617871e-06, - "loss": 0.0577, - "step": 7680 - }, - { - "epoch": 3.8818778394750124, - "grad_norm": 0.4616561532020569, - "learning_rate": 6.708732963149924e-06, - "loss": 0.0509, - "step": 7690 - }, - { - "epoch": 3.8869257950530036, - "grad_norm": 0.6349731087684631, - "learning_rate": 6.678445229681979e-06, - "loss": 0.0535, - "step": 7700 - }, - { - "epoch": 3.8919737506309944, - "grad_norm": 0.6474828720092773, - "learning_rate": 6.6481574962140335e-06, - "loss": 0.0552, - "step": 7710 - }, - { - "epoch": 3.897021706208985, - "grad_norm": 0.5433930158615112, - "learning_rate": 6.617869762746088e-06, - "loss": 0.062, - "step": 7720 - }, - { - "epoch": 3.9020696617869763, - "grad_norm": 0.6113614439964294, - "learning_rate": 6.587582029278142e-06, - "loss": 0.06, - "step": 7730 - }, - { - "epoch": 3.907117617364967, - "grad_norm": 0.8800488114356995, - "learning_rate": 6.557294295810197e-06, - "loss": 0.0578, - "step": 7740 - }, - { - "epoch": 3.912165572942958, - "grad_norm": 0.5158660411834717, - "learning_rate": 6.5270065623422515e-06, - "loss": 0.0524, - "step": 7750 - }, - { - "epoch": 3.917213528520949, - "grad_norm": 0.5676606297492981, - "learning_rate": 6.496718828874306e-06, - "loss": 0.0474, - "step": 7760 - }, - { - "epoch": 3.92226148409894, - "grad_norm": 0.6438203454017639, - "learning_rate": 6.466431095406361e-06, - "loss": 0.0587, - "step": 7770 - }, - { - "epoch": 3.9273094396769306, - "grad_norm": 0.6570119857788086, - "learning_rate": 6.436143361938415e-06, - "loss": 0.0489, - "step": 7780 - }, - { - "epoch": 3.932357395254922, - "grad_norm": 0.5620145201683044, - "learning_rate": 6.4058556284704695e-06, - "loss": 0.0559, - "step": 7790 - }, - { - "epoch": 3.9374053508329125, - "grad_norm": 0.6886317729949951, - "learning_rate": 6.375567895002524e-06, - "loss": 0.0581, - "step": 7800 - }, - { - "epoch": 3.9424533064109037, - "grad_norm": 0.7463077306747437, - "learning_rate": 6.345280161534579e-06, - "loss": 0.0477, - "step": 7810 - }, - { - "epoch": 3.9475012619888945, - "grad_norm": 0.5246394276618958, - "learning_rate": 6.314992428066633e-06, - "loss": 0.0501, - "step": 7820 - }, - { - "epoch": 3.9525492175668853, - "grad_norm": 0.5147930979728699, - "learning_rate": 6.2847046945986876e-06, - "loss": 0.0603, - "step": 7830 - }, - { - "epoch": 3.9575971731448765, - "grad_norm": 0.3963003158569336, - "learning_rate": 6.254416961130743e-06, - "loss": 0.059, - "step": 7840 - }, - { - "epoch": 3.9626451287228672, - "grad_norm": 0.7148598432540894, - "learning_rate": 6.224129227662796e-06, - "loss": 0.0524, - "step": 7850 - }, - { - "epoch": 3.967693084300858, - "grad_norm": 0.5985211133956909, - "learning_rate": 6.193841494194851e-06, - "loss": 0.0609, - "step": 7860 - }, - { - "epoch": 3.972741039878849, - "grad_norm": 0.6152123808860779, - "learning_rate": 6.163553760726906e-06, - "loss": 0.0622, - "step": 7870 - }, - { - "epoch": 3.97778899545684, - "grad_norm": 0.49580270051956177, - "learning_rate": 6.13326602725896e-06, - "loss": 0.056, - "step": 7880 - }, - { - "epoch": 3.982836951034831, - "grad_norm": 0.8874292373657227, - "learning_rate": 6.102978293791015e-06, - "loss": 0.0599, - "step": 7890 - }, - { - "epoch": 3.987884906612822, - "grad_norm": 0.6198350787162781, - "learning_rate": 6.072690560323069e-06, - "loss": 0.0546, - "step": 7900 - }, - { - "epoch": 3.9929328621908127, - "grad_norm": 0.39257192611694336, - "learning_rate": 6.042402826855124e-06, - "loss": 0.0523, - "step": 7910 - }, - { - "epoch": 3.997980817768804, - "grad_norm": 0.4612904191017151, - "learning_rate": 6.012115093387178e-06, - "loss": 0.0685, - "step": 7920 - }, - { - "epoch": 4.0, - "eval_f1": 0.9705180789481339, - "eval_loss": 0.038467586040496826, - "eval_runtime": 578.9562, - "eval_samples_per_second": 356.265, - "eval_steps_per_second": 2.784, - "step": 7924 - }, - { - "epoch": 4.003028773346794, - "grad_norm": 0.7146291732788086, - "learning_rate": 5.981827359919233e-06, - "loss": 0.0575, - "step": 7930 - }, - { - "epoch": 4.008076728924785, - "grad_norm": 0.6313480138778687, - "learning_rate": 5.951539626451287e-06, - "loss": 0.0581, - "step": 7940 - }, - { - "epoch": 4.013124684502777, - "grad_norm": 0.4977870583534241, - "learning_rate": 5.921251892983342e-06, - "loss": 0.0582, - "step": 7950 - }, - { - "epoch": 4.018172640080767, - "grad_norm": 0.4447147250175476, - "learning_rate": 5.890964159515397e-06, - "loss": 0.0544, - "step": 7960 - }, - { - "epoch": 4.023220595658758, - "grad_norm": 0.6496310234069824, - "learning_rate": 5.860676426047451e-06, - "loss": 0.0595, - "step": 7970 - }, - { - "epoch": 4.028268551236749, - "grad_norm": 0.4380001127719879, - "learning_rate": 5.830388692579505e-06, - "loss": 0.0549, - "step": 7980 - }, - { - "epoch": 4.03331650681474, - "grad_norm": 0.5718368887901306, - "learning_rate": 5.80010095911156e-06, - "loss": 0.0559, - "step": 7990 - }, - { - "epoch": 4.038364462392731, - "grad_norm": 0.5859358906745911, - "learning_rate": 5.769813225643615e-06, - "loss": 0.0572, - "step": 8000 - }, - { - "epoch": 4.043412417970722, - "grad_norm": 0.49378788471221924, - "learning_rate": 5.739525492175669e-06, - "loss": 0.054, - "step": 8010 - }, - { - "epoch": 4.048460373548712, - "grad_norm": 0.6780097484588623, - "learning_rate": 5.709237758707723e-06, - "loss": 0.0568, - "step": 8020 - }, - { - "epoch": 4.053508329126704, - "grad_norm": 0.8048389554023743, - "learning_rate": 5.6789500252397786e-06, - "loss": 0.0527, - "step": 8030 - }, - { - "epoch": 4.058556284704695, - "grad_norm": 0.4513346254825592, - "learning_rate": 5.648662291771832e-06, - "loss": 0.0597, - "step": 8040 - }, - { - "epoch": 4.063604240282685, - "grad_norm": 0.6877405643463135, - "learning_rate": 5.618374558303887e-06, - "loss": 0.0594, - "step": 8050 - }, - { - "epoch": 4.068652195860676, - "grad_norm": 0.41468387842178345, - "learning_rate": 5.5880868248359414e-06, - "loss": 0.0563, - "step": 8060 - }, - { - "epoch": 4.0737001514386675, - "grad_norm": 0.5062978267669678, - "learning_rate": 5.557799091367996e-06, - "loss": 0.0598, - "step": 8070 - }, - { - "epoch": 4.078748107016659, - "grad_norm": 0.6427041888237, - "learning_rate": 5.527511357900051e-06, - "loss": 0.057, - "step": 8080 - }, - { - "epoch": 4.083796062594649, - "grad_norm": 0.5508936643600464, - "learning_rate": 5.497223624432105e-06, - "loss": 0.0472, - "step": 8090 - }, - { - "epoch": 4.08884401817264, - "grad_norm": 0.39490872621536255, - "learning_rate": 5.4669358909641595e-06, - "loss": 0.0589, - "step": 8100 - }, - { - "epoch": 4.093891973750631, - "grad_norm": 0.5776220560073853, - "learning_rate": 5.436648157496214e-06, - "loss": 0.0602, - "step": 8110 - }, - { - "epoch": 4.098939929328622, - "grad_norm": 0.36714500188827515, - "learning_rate": 5.406360424028269e-06, - "loss": 0.0474, - "step": 8120 - }, - { - "epoch": 4.103987884906613, - "grad_norm": 0.7429747581481934, - "learning_rate": 5.376072690560323e-06, - "loss": 0.0516, - "step": 8130 - }, - { - "epoch": 4.109035840484604, - "grad_norm": 0.7167190909385681, - "learning_rate": 5.3457849570923775e-06, - "loss": 0.0559, - "step": 8140 - }, - { - "epoch": 4.1140837960625944, - "grad_norm": 0.5668296217918396, - "learning_rate": 5.315497223624433e-06, - "loss": 0.0558, - "step": 8150 - }, - { - "epoch": 4.119131751640586, - "grad_norm": 0.5577311515808105, - "learning_rate": 5.285209490156487e-06, - "loss": 0.0589, - "step": 8160 - }, - { - "epoch": 4.124179707218577, - "grad_norm": 0.611304759979248, - "learning_rate": 5.254921756688541e-06, - "loss": 0.0546, - "step": 8170 - }, - { - "epoch": 4.129227662796567, - "grad_norm": 0.5540894865989685, - "learning_rate": 5.2246340232205955e-06, - "loss": 0.0611, - "step": 8180 - }, - { - "epoch": 4.134275618374558, - "grad_norm": 0.5128312706947327, - "learning_rate": 5.194346289752651e-06, - "loss": 0.0552, - "step": 8190 - }, - { - "epoch": 4.13932357395255, - "grad_norm": 0.6017599105834961, - "learning_rate": 5.164058556284704e-06, - "loss": 0.0494, - "step": 8200 - }, - { - "epoch": 4.14437152953054, - "grad_norm": 0.42843466997146606, - "learning_rate": 5.133770822816759e-06, - "loss": 0.0534, - "step": 8210 - }, - { - "epoch": 4.149419485108531, - "grad_norm": 0.6050401926040649, - "learning_rate": 5.103483089348814e-06, - "loss": 0.0524, - "step": 8220 - }, - { - "epoch": 4.154467440686522, - "grad_norm": 0.512793242931366, - "learning_rate": 5.073195355880868e-06, - "loss": 0.0562, - "step": 8230 - }, - { - "epoch": 4.159515396264513, - "grad_norm": 0.5130860209465027, - "learning_rate": 5.042907622412923e-06, - "loss": 0.0413, - "step": 8240 - }, - { - "epoch": 4.164563351842504, - "grad_norm": 0.6443082690238953, - "learning_rate": 5.012619888944977e-06, - "loss": 0.0593, - "step": 8250 - }, - { - "epoch": 4.169611307420495, - "grad_norm": 0.6051344871520996, - "learning_rate": 4.982332155477032e-06, - "loss": 0.0542, - "step": 8260 - }, - { - "epoch": 4.174659262998485, - "grad_norm": 0.5795598030090332, - "learning_rate": 4.952044422009086e-06, - "loss": 0.0569, - "step": 8270 - }, - { - "epoch": 4.1797072185764765, - "grad_norm": 0.6054142117500305, - "learning_rate": 4.921756688541141e-06, - "loss": 0.0575, - "step": 8280 - }, - { - "epoch": 4.184755174154468, - "grad_norm": 0.6954050660133362, - "learning_rate": 4.891468955073196e-06, - "loss": 0.0609, - "step": 8290 - }, - { - "epoch": 4.189803129732458, - "grad_norm": 0.7217870354652405, - "learning_rate": 4.86118122160525e-06, - "loss": 0.0559, - "step": 8300 - }, - { - "epoch": 4.194851085310449, - "grad_norm": 0.49758586287498474, - "learning_rate": 4.830893488137305e-06, - "loss": 0.0506, - "step": 8310 - }, - { - "epoch": 4.1998990408884405, - "grad_norm": 0.4497081935405731, - "learning_rate": 4.800605754669359e-06, - "loss": 0.0581, - "step": 8320 - }, - { - "epoch": 4.204946996466431, - "grad_norm": 0.6054022312164307, - "learning_rate": 4.770318021201413e-06, - "loss": 0.0596, - "step": 8330 - }, - { - "epoch": 4.209994952044422, - "grad_norm": 0.7262012958526611, - "learning_rate": 4.7400302877334685e-06, - "loss": 0.0489, - "step": 8340 - }, - { - "epoch": 4.215042907622413, - "grad_norm": 0.6226342916488647, - "learning_rate": 4.709742554265523e-06, - "loss": 0.0596, - "step": 8350 - }, - { - "epoch": 4.2200908632004035, - "grad_norm": 0.8234953284263611, - "learning_rate": 4.679454820797577e-06, - "loss": 0.057, - "step": 8360 - }, - { - "epoch": 4.225138818778395, - "grad_norm": 0.8438859581947327, - "learning_rate": 4.649167087329631e-06, - "loss": 0.0516, - "step": 8370 - }, - { - "epoch": 4.230186774356386, - "grad_norm": 0.5095875263214111, - "learning_rate": 4.6188793538616865e-06, - "loss": 0.0646, - "step": 8380 - }, - { - "epoch": 4.235234729934376, - "grad_norm": 0.5543855428695679, - "learning_rate": 4.58859162039374e-06, - "loss": 0.0482, - "step": 8390 - }, - { - "epoch": 4.240282685512367, - "grad_norm": 0.7510880827903748, - "learning_rate": 4.558303886925795e-06, - "loss": 0.0595, - "step": 8400 - }, - { - "epoch": 4.245330641090359, - "grad_norm": 0.5140940546989441, - "learning_rate": 4.52801615345785e-06, - "loss": 0.0568, - "step": 8410 - }, - { - "epoch": 4.250378596668349, - "grad_norm": 0.43089789152145386, - "learning_rate": 4.497728419989904e-06, - "loss": 0.058, - "step": 8420 - }, - { - "epoch": 4.25542655224634, - "grad_norm": 0.6229716539382935, - "learning_rate": 4.467440686521959e-06, - "loss": 0.0538, - "step": 8430 - }, - { - "epoch": 4.260474507824331, - "grad_norm": 0.6465341448783875, - "learning_rate": 4.437152953054013e-06, - "loss": 0.0544, - "step": 8440 - }, - { - "epoch": 4.265522463402322, - "grad_norm": 0.42706695199012756, - "learning_rate": 4.406865219586068e-06, - "loss": 0.0562, - "step": 8450 - }, - { - "epoch": 4.270570418980313, - "grad_norm": 0.5305337309837341, - "learning_rate": 4.376577486118122e-06, - "loss": 0.0567, - "step": 8460 - }, - { - "epoch": 4.275618374558304, - "grad_norm": 0.7307097315788269, - "learning_rate": 4.346289752650177e-06, - "loss": 0.0486, - "step": 8470 - }, - { - "epoch": 4.280666330136295, - "grad_norm": 0.5940870046615601, - "learning_rate": 4.316002019182232e-06, - "loss": 0.0514, - "step": 8480 - }, - { - "epoch": 4.285714285714286, - "grad_norm": 0.4446733593940735, - "learning_rate": 4.2857142857142855e-06, - "loss": 0.0545, - "step": 8490 - }, - { - "epoch": 4.290762241292277, - "grad_norm": 0.9121294617652893, - "learning_rate": 4.255426552246341e-06, - "loss": 0.0557, - "step": 8500 - }, - { - "epoch": 4.295810196870267, - "grad_norm": 0.568056583404541, - "learning_rate": 4.225138818778395e-06, - "loss": 0.0522, - "step": 8510 - }, - { - "epoch": 4.300858152448258, - "grad_norm": 0.8788109421730042, - "learning_rate": 4.194851085310449e-06, - "loss": 0.0433, - "step": 8520 - }, - { - "epoch": 4.3059061080262495, - "grad_norm": 0.7445030808448792, - "learning_rate": 4.1645633518425035e-06, - "loss": 0.05, - "step": 8530 - }, - { - "epoch": 4.310954063604241, - "grad_norm": 0.8348667621612549, - "learning_rate": 4.134275618374559e-06, - "loss": 0.0584, - "step": 8540 - }, - { - "epoch": 4.316002019182231, - "grad_norm": 0.462342232465744, - "learning_rate": 4.103987884906613e-06, - "loss": 0.0555, - "step": 8550 - }, - { - "epoch": 4.321049974760222, - "grad_norm": 0.42785176634788513, - "learning_rate": 4.073700151438667e-06, - "loss": 0.0607, - "step": 8560 - }, - { - "epoch": 4.326097930338213, - "grad_norm": 0.7172122597694397, - "learning_rate": 4.043412417970722e-06, - "loss": 0.0675, - "step": 8570 - }, - { - "epoch": 4.331145885916204, - "grad_norm": 0.4495554566383362, - "learning_rate": 4.013124684502776e-06, - "loss": 0.0546, - "step": 8580 - }, - { - "epoch": 4.336193841494195, - "grad_norm": 0.5083460807800293, - "learning_rate": 3.982836951034831e-06, - "loss": 0.06, - "step": 8590 - }, - { - "epoch": 4.341241797072186, - "grad_norm": 0.4353145360946655, - "learning_rate": 3.952549217566885e-06, - "loss": 0.0535, - "step": 8600 - }, - { - "epoch": 4.3462897526501765, - "grad_norm": 0.6741386651992798, - "learning_rate": 3.92226148409894e-06, - "loss": 0.0581, - "step": 8610 - }, - { - "epoch": 4.351337708228168, - "grad_norm": 0.47798269987106323, - "learning_rate": 3.891973750630995e-06, - "loss": 0.0541, - "step": 8620 - }, - { - "epoch": 4.356385663806159, - "grad_norm": 0.49109166860580444, - "learning_rate": 3.861686017163049e-06, - "loss": 0.0608, - "step": 8630 - }, - { - "epoch": 4.361433619384149, - "grad_norm": 0.8310505747795105, - "learning_rate": 3.831398283695104e-06, - "loss": 0.0514, - "step": 8640 - }, - { - "epoch": 4.36648157496214, - "grad_norm": 0.4586045742034912, - "learning_rate": 3.801110550227158e-06, - "loss": 0.0538, - "step": 8650 - }, - { - "epoch": 4.371529530540132, - "grad_norm": 0.4350300133228302, - "learning_rate": 3.7708228167592127e-06, - "loss": 0.0526, - "step": 8660 - }, - { - "epoch": 4.376577486118122, - "grad_norm": 0.6310685276985168, - "learning_rate": 3.740535083291267e-06, - "loss": 0.0597, - "step": 8670 - }, - { - "epoch": 4.381625441696113, - "grad_norm": 0.6845548152923584, - "learning_rate": 3.7102473498233217e-06, - "loss": 0.0542, - "step": 8680 - }, - { - "epoch": 4.386673397274104, - "grad_norm": 1.085631012916565, - "learning_rate": 3.679959616355376e-06, - "loss": 0.0601, - "step": 8690 - }, - { - "epoch": 4.391721352852095, - "grad_norm": 0.6232538223266602, - "learning_rate": 3.6496718828874303e-06, - "loss": 0.0557, - "step": 8700 - }, - { - "epoch": 4.396769308430086, - "grad_norm": 0.4568091630935669, - "learning_rate": 3.6193841494194855e-06, - "loss": 0.0494, - "step": 8710 - }, - { - "epoch": 4.401817264008077, - "grad_norm": 0.7550612092018127, - "learning_rate": 3.5890964159515398e-06, - "loss": 0.0562, - "step": 8720 - }, - { - "epoch": 4.406865219586067, - "grad_norm": 0.5380585789680481, - "learning_rate": 3.5588086824835945e-06, - "loss": 0.0521, - "step": 8730 - }, - { - "epoch": 4.411913175164059, - "grad_norm": 0.42225027084350586, - "learning_rate": 3.5285209490156488e-06, - "loss": 0.0515, - "step": 8740 - }, - { - "epoch": 4.41696113074205, - "grad_norm": 0.5831999778747559, - "learning_rate": 3.498233215547703e-06, - "loss": 0.0465, - "step": 8750 - }, - { - "epoch": 4.42200908632004, - "grad_norm": 0.7943524718284607, - "learning_rate": 3.4679454820797578e-06, - "loss": 0.062, - "step": 8760 - }, - { - "epoch": 4.427057041898031, - "grad_norm": 0.634747326374054, - "learning_rate": 3.437657748611812e-06, - "loss": 0.0496, - "step": 8770 - }, - { - "epoch": 4.4321049974760225, - "grad_norm": 0.5734288692474365, - "learning_rate": 3.407370015143867e-06, - "loss": 0.0612, - "step": 8780 - }, - { - "epoch": 4.437152953054013, - "grad_norm": 0.7079018354415894, - "learning_rate": 3.3770822816759215e-06, - "loss": 0.0578, - "step": 8790 - }, - { - "epoch": 4.442200908632004, - "grad_norm": 0.44444698095321655, - "learning_rate": 3.346794548207976e-06, - "loss": 0.0559, - "step": 8800 - }, - { - "epoch": 4.447248864209995, - "grad_norm": 0.7473122477531433, - "learning_rate": 3.3165068147400305e-06, - "loss": 0.0544, - "step": 8810 - }, - { - "epoch": 4.4522968197879855, - "grad_norm": 0.6658338308334351, - "learning_rate": 3.286219081272085e-06, - "loss": 0.0552, - "step": 8820 - }, - { - "epoch": 4.457344775365977, - "grad_norm": 0.48870500922203064, - "learning_rate": 3.255931347804139e-06, - "loss": 0.0566, - "step": 8830 - }, - { - "epoch": 4.462392730943968, - "grad_norm": 0.6261917948722839, - "learning_rate": 3.2256436143361943e-06, - "loss": 0.0487, - "step": 8840 - }, - { - "epoch": 4.467440686521958, - "grad_norm": 0.6060011982917786, - "learning_rate": 3.1953558808682486e-06, - "loss": 0.0514, - "step": 8850 - }, - { - "epoch": 4.4724886420999495, - "grad_norm": 0.4858971834182739, - "learning_rate": 3.165068147400303e-06, - "loss": 0.05, - "step": 8860 - }, - { - "epoch": 4.477536597677941, - "grad_norm": 0.6394979357719421, - "learning_rate": 3.1347804139323576e-06, - "loss": 0.0604, - "step": 8870 - }, - { - "epoch": 4.482584553255931, - "grad_norm": 0.6840482950210571, - "learning_rate": 3.104492680464412e-06, - "loss": 0.0514, - "step": 8880 - }, - { - "epoch": 4.487632508833922, - "grad_norm": 0.388715535402298, - "learning_rate": 3.0742049469964666e-06, - "loss": 0.0479, - "step": 8890 - }, - { - "epoch": 4.492680464411913, - "grad_norm": 0.6516565084457397, - "learning_rate": 3.043917213528521e-06, - "loss": 0.0608, - "step": 8900 - }, - { - "epoch": 4.497728419989904, - "grad_norm": 0.76282799243927, - "learning_rate": 3.0136294800605756e-06, - "loss": 0.0572, - "step": 8910 - }, - { - "epoch": 4.502776375567895, - "grad_norm": 0.49448370933532715, - "learning_rate": 2.9833417465926303e-06, - "loss": 0.0575, - "step": 8920 - }, - { - "epoch": 4.507824331145886, - "grad_norm": 0.5593730807304382, - "learning_rate": 2.9530540131246846e-06, - "loss": 0.0486, - "step": 8930 - }, - { - "epoch": 4.512872286723876, - "grad_norm": 0.5773325562477112, - "learning_rate": 2.922766279656739e-06, - "loss": 0.0541, - "step": 8940 - }, - { - "epoch": 4.517920242301868, - "grad_norm": 0.34630000591278076, - "learning_rate": 2.8924785461887936e-06, - "loss": 0.0606, - "step": 8950 - }, - { - "epoch": 4.522968197879859, - "grad_norm": 0.5409483313560486, - "learning_rate": 2.862190812720848e-06, - "loss": 0.0589, - "step": 8960 - }, - { - "epoch": 4.52801615345785, - "grad_norm": 0.5004202127456665, - "learning_rate": 2.8319030792529026e-06, - "loss": 0.0621, - "step": 8970 - }, - { - "epoch": 4.53306410903584, - "grad_norm": 0.4979722797870636, - "learning_rate": 2.8016153457849574e-06, - "loss": 0.0537, - "step": 8980 - }, - { - "epoch": 4.5381120646138315, - "grad_norm": 0.6733251214027405, - "learning_rate": 2.7713276123170117e-06, - "loss": 0.069, - "step": 8990 - }, - { - "epoch": 4.543160020191822, - "grad_norm": 0.4152880609035492, - "learning_rate": 2.7410398788490664e-06, - "loss": 0.0565, - "step": 9000 - }, - { - "epoch": 4.548207975769813, - "grad_norm": 0.6170037984848022, - "learning_rate": 2.7107521453811207e-06, - "loss": 0.0589, - "step": 9010 - }, - { - "epoch": 4.553255931347804, - "grad_norm": 0.5258937478065491, - "learning_rate": 2.680464411913175e-06, - "loss": 0.0548, - "step": 9020 - }, - { - "epoch": 4.5583038869257955, - "grad_norm": 0.534015417098999, - "learning_rate": 2.6501766784452297e-06, - "loss": 0.0447, - "step": 9030 - }, - { - "epoch": 4.563351842503786, - "grad_norm": 0.86041259765625, - "learning_rate": 2.6198889449772844e-06, - "loss": 0.0578, - "step": 9040 - }, - { - "epoch": 4.568399798081777, - "grad_norm": 0.8807480335235596, - "learning_rate": 2.589601211509339e-06, - "loss": 0.0479, - "step": 9050 - }, - { - "epoch": 4.573447753659767, - "grad_norm": 0.6071127653121948, - "learning_rate": 2.5593134780413934e-06, - "loss": 0.0521, - "step": 9060 - }, - { - "epoch": 4.5784957092377585, - "grad_norm": 0.9106950759887695, - "learning_rate": 2.5290257445734477e-06, - "loss": 0.056, - "step": 9070 - }, - { - "epoch": 4.58354366481575, - "grad_norm": 0.6179044246673584, - "learning_rate": 2.4987380111055024e-06, - "loss": 0.0548, - "step": 9080 - }, - { - "epoch": 4.588591620393741, - "grad_norm": 0.9295970797538757, - "learning_rate": 2.4684502776375567e-06, - "loss": 0.0626, - "step": 9090 - }, - { - "epoch": 4.593639575971731, - "grad_norm": 0.4483726918697357, - "learning_rate": 2.438162544169611e-06, - "loss": 0.0531, - "step": 9100 - }, - { - "epoch": 4.598687531549722, - "grad_norm": 0.38749760389328003, - "learning_rate": 2.407874810701666e-06, - "loss": 0.0514, - "step": 9110 - }, - { - "epoch": 4.603735487127714, - "grad_norm": 0.7203320860862732, - "learning_rate": 2.3775870772337205e-06, - "loss": 0.0603, - "step": 9120 - }, - { - "epoch": 4.608783442705704, - "grad_norm": 0.8010473251342773, - "learning_rate": 2.347299343765775e-06, - "loss": 0.053, - "step": 9130 - }, - { - "epoch": 4.613831398283695, - "grad_norm": 0.7866964936256409, - "learning_rate": 2.3170116102978295e-06, - "loss": 0.0544, - "step": 9140 - }, - { - "epoch": 4.618879353861686, - "grad_norm": 0.9333378076553345, - "learning_rate": 2.2867238768298838e-06, - "loss": 0.0472, - "step": 9150 - }, - { - "epoch": 4.623927309439677, - "grad_norm": 0.5904621481895447, - "learning_rate": 2.2564361433619385e-06, - "loss": 0.0515, - "step": 9160 - }, - { - "epoch": 4.628975265017668, - "grad_norm": 0.6837446093559265, - "learning_rate": 2.2261484098939928e-06, - "loss": 0.0566, - "step": 9170 - }, - { - "epoch": 4.634023220595659, - "grad_norm": 0.5726220607757568, - "learning_rate": 2.1958606764260475e-06, - "loss": 0.0521, - "step": 9180 - }, - { - "epoch": 4.639071176173649, - "grad_norm": 0.5920945405960083, - "learning_rate": 2.1655729429581022e-06, - "loss": 0.0527, - "step": 9190 - }, - { - "epoch": 4.644119131751641, - "grad_norm": 0.5921088457107544, - "learning_rate": 2.1352852094901565e-06, - "loss": 0.0594, - "step": 9200 - }, - { - "epoch": 4.649167087329632, - "grad_norm": 0.8026402592658997, - "learning_rate": 2.1049974760222112e-06, - "loss": 0.058, - "step": 9210 - }, - { - "epoch": 4.654215042907622, - "grad_norm": 0.9913181066513062, - "learning_rate": 2.0747097425542655e-06, - "loss": 0.0591, - "step": 9220 - }, - { - "epoch": 4.659262998485613, - "grad_norm": 0.675123393535614, - "learning_rate": 2.04442200908632e-06, - "loss": 0.0561, - "step": 9230 - }, - { - "epoch": 4.6643109540636045, - "grad_norm": 0.5947641730308533, - "learning_rate": 2.014134275618375e-06, - "loss": 0.0486, - "step": 9240 - }, - { - "epoch": 4.669358909641595, - "grad_norm": 0.5389765501022339, - "learning_rate": 1.9838465421504293e-06, - "loss": 0.0586, - "step": 9250 - }, - { - "epoch": 4.674406865219586, - "grad_norm": 0.5905711054801941, - "learning_rate": 1.9535588086824836e-06, - "loss": 0.0523, - "step": 9260 - }, - { - "epoch": 4.679454820797577, - "grad_norm": 0.36754655838012695, - "learning_rate": 1.9232710752145383e-06, - "loss": 0.0518, - "step": 9270 - }, - { - "epoch": 4.684502776375568, - "grad_norm": 0.5583412647247314, - "learning_rate": 1.8929833417465926e-06, - "loss": 0.0536, - "step": 9280 - }, - { - "epoch": 4.689550731953559, - "grad_norm": 0.4586925506591797, - "learning_rate": 1.8626956082786473e-06, - "loss": 0.0482, - "step": 9290 - }, - { - "epoch": 4.69459868753155, - "grad_norm": 0.4932919442653656, - "learning_rate": 1.8324078748107018e-06, - "loss": 0.0484, - "step": 9300 - }, - { - "epoch": 4.69964664310954, - "grad_norm": 0.3211473524570465, - "learning_rate": 1.802120141342756e-06, - "loss": 0.0522, - "step": 9310 - }, - { - "epoch": 4.7046945986875315, - "grad_norm": 0.8603491187095642, - "learning_rate": 1.7718324078748106e-06, - "loss": 0.0585, - "step": 9320 - }, - { - "epoch": 4.709742554265523, - "grad_norm": 0.7181740999221802, - "learning_rate": 1.7415446744068653e-06, - "loss": 0.0522, - "step": 9330 - }, - { - "epoch": 4.714790509843513, - "grad_norm": 0.49415314197540283, - "learning_rate": 1.7112569409389198e-06, - "loss": 0.0417, - "step": 9340 - }, - { - "epoch": 4.719838465421504, - "grad_norm": 0.758638322353363, - "learning_rate": 1.6809692074709741e-06, - "loss": 0.0608, - "step": 9350 - }, - { - "epoch": 4.724886420999495, - "grad_norm": 0.6659887433052063, - "learning_rate": 1.6506814740030288e-06, - "loss": 0.0468, - "step": 9360 - }, - { - "epoch": 4.729934376577486, - "grad_norm": 0.3270837962627411, - "learning_rate": 1.6203937405350833e-06, - "loss": 0.0602, - "step": 9370 - }, - { - "epoch": 4.734982332155477, - "grad_norm": 0.6695159077644348, - "learning_rate": 1.5901060070671379e-06, - "loss": 0.0515, - "step": 9380 - }, - { - "epoch": 4.740030287733468, - "grad_norm": 0.8143603205680847, - "learning_rate": 1.5598182735991924e-06, - "loss": 0.0613, - "step": 9390 - }, - { - "epoch": 4.745078243311459, - "grad_norm": 0.6727936863899231, - "learning_rate": 1.5295305401312469e-06, - "loss": 0.0505, - "step": 9400 - }, - { - "epoch": 4.75012619888945, - "grad_norm": 0.5365564823150635, - "learning_rate": 1.4992428066633014e-06, - "loss": 0.0512, - "step": 9410 - }, - { - "epoch": 4.755174154467441, - "grad_norm": 0.5240725874900818, - "learning_rate": 1.4689550731953559e-06, - "loss": 0.0526, - "step": 9420 - }, - { - "epoch": 4.760222110045431, - "grad_norm": 0.6975441575050354, - "learning_rate": 1.4386673397274104e-06, - "loss": 0.0592, - "step": 9430 - }, - { - "epoch": 4.765270065623422, - "grad_norm": 0.44649407267570496, - "learning_rate": 1.408379606259465e-06, - "loss": 0.0597, - "step": 9440 - }, - { - "epoch": 4.770318021201414, - "grad_norm": 0.598850429058075, - "learning_rate": 1.3780918727915194e-06, - "loss": 0.0606, - "step": 9450 - }, - { - "epoch": 4.775365976779405, - "grad_norm": 0.57352614402771, - "learning_rate": 1.3478041393235741e-06, - "loss": 0.0502, - "step": 9460 - }, - { - "epoch": 4.780413932357395, - "grad_norm": 0.7437055706977844, - "learning_rate": 1.3175164058556284e-06, - "loss": 0.0521, - "step": 9470 - }, - { - "epoch": 4.785461887935386, - "grad_norm": 0.6993494629859924, - "learning_rate": 1.287228672387683e-06, - "loss": 0.0565, - "step": 9480 - }, - { - "epoch": 4.790509843513377, - "grad_norm": 0.8067084550857544, - "learning_rate": 1.2569409389197376e-06, - "loss": 0.0575, - "step": 9490 - }, - { - "epoch": 4.795557799091368, - "grad_norm": 0.5363942384719849, - "learning_rate": 1.2266532054517921e-06, - "loss": 0.058, - "step": 9500 - }, - { - "epoch": 4.800605754669359, - "grad_norm": 0.8145700693130493, - "learning_rate": 1.1963654719838464e-06, - "loss": 0.0488, - "step": 9510 - }, - { - "epoch": 4.80565371024735, - "grad_norm": 0.7701184153556824, - "learning_rate": 1.166077738515901e-06, - "loss": 0.0577, - "step": 9520 - }, - { - "epoch": 4.8107016658253405, - "grad_norm": 0.5177111625671387, - "learning_rate": 1.1357900050479557e-06, - "loss": 0.0605, - "step": 9530 - }, - { - "epoch": 4.815749621403332, - "grad_norm": 0.44751742482185364, - "learning_rate": 1.1055022715800102e-06, - "loss": 0.0565, - "step": 9540 - }, - { - "epoch": 4.820797576981323, - "grad_norm": 0.37919309735298157, - "learning_rate": 1.0752145381120645e-06, - "loss": 0.0454, - "step": 9550 - }, - { - "epoch": 4.825845532559313, - "grad_norm": 0.6037785410881042, - "learning_rate": 1.0449268046441192e-06, - "loss": 0.0606, - "step": 9560 - }, - { - "epoch": 4.8308934881373045, - "grad_norm": 0.3584793508052826, - "learning_rate": 1.0146390711761737e-06, - "loss": 0.0503, - "step": 9570 - }, - { - "epoch": 4.835941443715296, - "grad_norm": 0.49841853976249695, - "learning_rate": 9.843513377082282e-07, - "loss": 0.0434, - "step": 9580 - }, - { - "epoch": 4.840989399293286, - "grad_norm": 0.5114769339561462, - "learning_rate": 9.540636042402827e-07, - "loss": 0.0535, - "step": 9590 - }, - { - "epoch": 4.846037354871277, - "grad_norm": 0.5932824611663818, - "learning_rate": 9.237758707723372e-07, - "loss": 0.0547, - "step": 9600 - }, - { - "epoch": 4.851085310449268, - "grad_norm": 0.6020333766937256, - "learning_rate": 8.934881373043917e-07, - "loss": 0.0597, - "step": 9610 - }, - { - "epoch": 4.856133266027259, - "grad_norm": 0.721193790435791, - "learning_rate": 8.632004038364462e-07, - "loss": 0.0614, - "step": 9620 - }, - { - "epoch": 4.86118122160525, - "grad_norm": 0.4858354926109314, - "learning_rate": 8.329126703685008e-07, - "loss": 0.0555, - "step": 9630 - }, - { - "epoch": 4.866229177183241, - "grad_norm": 0.7863103747367859, - "learning_rate": 8.026249369005552e-07, - "loss": 0.0554, - "step": 9640 - }, - { - "epoch": 4.871277132761231, - "grad_norm": 0.8363025784492493, - "learning_rate": 7.723372034326099e-07, - "loss": 0.0565, - "step": 9650 - }, - { - "epoch": 4.876325088339223, - "grad_norm": 0.6137521266937256, - "learning_rate": 7.420494699646643e-07, - "loss": 0.0575, - "step": 9660 - }, - { - "epoch": 4.881373043917214, - "grad_norm": 0.4781091511249542, - "learning_rate": 7.117617364967189e-07, - "loss": 0.0478, - "step": 9670 - }, - { - "epoch": 4.886420999495204, - "grad_norm": 0.8294112086296082, - "learning_rate": 6.814740030287734e-07, - "loss": 0.0593, - "step": 9680 - }, - { - "epoch": 4.891468955073195, - "grad_norm": 0.5780894160270691, - "learning_rate": 6.511862695608279e-07, - "loss": 0.0518, - "step": 9690 - }, - { - "epoch": 4.8965169106511865, - "grad_norm": 0.4407060146331787, - "learning_rate": 6.208985360928824e-07, - "loss": 0.0522, - "step": 9700 - }, - { - "epoch": 4.901564866229177, - "grad_norm": 0.4369337558746338, - "learning_rate": 5.906108026249369e-07, - "loss": 0.0522, - "step": 9710 - }, - { - "epoch": 4.906612821807168, - "grad_norm": 0.8428089022636414, - "learning_rate": 5.603230691569914e-07, - "loss": 0.0468, - "step": 9720 - }, - { - "epoch": 4.911660777385159, - "grad_norm": 0.6303294897079468, - "learning_rate": 5.30035335689046e-07, - "loss": 0.0577, - "step": 9730 - }, - { - "epoch": 4.91670873296315, - "grad_norm": 0.4869242012500763, - "learning_rate": 4.997476022211004e-07, - "loss": 0.0472, - "step": 9740 - }, - { - "epoch": 4.921756688541141, - "grad_norm": 0.5907611846923828, - "learning_rate": 4.69459868753155e-07, - "loss": 0.0455, - "step": 9750 - }, - { - "epoch": 4.926804644119132, - "grad_norm": 0.6162139177322388, - "learning_rate": 4.3917213528520954e-07, - "loss": 0.0475, - "step": 9760 - }, - { - "epoch": 4.931852599697122, - "grad_norm": 0.5222154259681702, - "learning_rate": 4.0888440181726405e-07, - "loss": 0.0513, - "step": 9770 - }, - { - "epoch": 4.9369005552751135, - "grad_norm": 0.5132977366447449, - "learning_rate": 3.7859666834931856e-07, - "loss": 0.043, - "step": 9780 - }, - { - "epoch": 4.941948510853105, - "grad_norm": 0.6620015501976013, - "learning_rate": 3.4830893488137306e-07, - "loss": 0.0598, - "step": 9790 - }, - { - "epoch": 4.946996466431095, - "grad_norm": 0.7160341143608093, - "learning_rate": 3.1802120141342757e-07, - "loss": 0.0539, - "step": 9800 - }, - { - "epoch": 4.952044422009086, - "grad_norm": 0.5954631567001343, - "learning_rate": 2.8773346794548213e-07, - "loss": 0.0581, - "step": 9810 - }, - { - "epoch": 4.957092377587077, - "grad_norm": 1.0010461807250977, - "learning_rate": 2.5744573447753664e-07, - "loss": 0.0499, - "step": 9820 - }, - { - "epoch": 4.962140333165069, - "grad_norm": 0.5768128633499146, - "learning_rate": 2.2715800100959112e-07, - "loss": 0.0562, - "step": 9830 - }, - { - "epoch": 4.967188288743059, - "grad_norm": 0.6427052617073059, - "learning_rate": 1.9687026754164563e-07, - "loss": 0.0545, - "step": 9840 - }, - { - "epoch": 4.97223624432105, - "grad_norm": 0.6932212114334106, - "learning_rate": 1.6658253407370016e-07, - "loss": 0.0575, - "step": 9850 - }, - { - "epoch": 4.9772841998990405, - "grad_norm": 0.4219547510147095, - "learning_rate": 1.3629480060575467e-07, - "loss": 0.0491, - "step": 9860 - }, - { - "epoch": 4.982332155477032, - "grad_norm": 0.5215485692024231, - "learning_rate": 1.0600706713780919e-07, - "loss": 0.0438, - "step": 9870 - }, - { - "epoch": 4.987380111055023, - "grad_norm": 0.36851760745048523, - "learning_rate": 7.57193336698637e-08, - "loss": 0.052, - "step": 9880 - }, - { - "epoch": 4.992428066633014, - "grad_norm": 0.5213483572006226, - "learning_rate": 4.5431600201918226e-08, - "loss": 0.0472, - "step": 9890 - }, - { - "epoch": 4.997476022211004, - "grad_norm": 0.710657000541687, - "learning_rate": 1.514386673397274e-08, - "loss": 0.0582, - "step": 9900 - }, - { - "epoch": 5.0, - "eval_f1": 0.9705180789481339, - "eval_loss": 0.03909851238131523, - "eval_runtime": 579.4034, - "eval_samples_per_second": 355.99, - "eval_steps_per_second": 2.782, - "step": 9905 + "epoch": 0.5427922241858116, + "eval_f1": 0.6048198696667897, + "eval_loss": 0.047770071774721146, + "eval_runtime": 1156.5853, + "eval_samples_per_second": 178.337, + "eval_steps_per_second": 2.787, + "step": 2150 }, { - "epoch": 5.0, - "step": 9905, - "total_flos": 9.82152667464321e+19, - "train_loss": 0.0, - "train_runtime": 0.0648, - "train_samples_per_second": 19542495.273, - "train_steps_per_second": 152740.8 + "epoch": 0.5427922241858116, + "step": 2150, + "total_flos": 1.0663947529637069e+19, + "train_loss": 0.08201153971428095, + "train_runtime": 47443.3166, + "train_samples_per_second": 13.49, + "train_steps_per_second": 0.211 } ], "logging_steps": 10, - "max_steps": 9905, + "max_steps": 10000, "num_input_tokens_seen": 0, - "num_train_epochs": 5, + "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { - "early_stopping_patience": 2, + "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { @@ -7014,13 +1931,13 @@ "should_evaluate": false, "should_log": false, "should_save": true, - "should_training_stop": true + "should_training_stop": false }, "attributes": {} } }, - "total_flos": 9.82152667464321e+19, - "train_batch_size": 128, + "total_flos": 1.0663947529637069e+19, + "train_batch_size": 64, "trial_name": null, "trial_params": null }