diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,13943 +1,7013 @@ { - "best_metric": 0.04290741682052612, - "best_model_checkpoint": "./test_default_model/checkpoint-19805", + "best_metric": 0.038467586040496826, + "best_model_checkpoint": "./test_microsoft_dit/checkpoint-7924", "epoch": 5.0, "eval_steps": 500, - "global_step": 19805, + "global_step": 9905, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.00297000297000297, - "grad_norm": 1.7466098070144653, - "learning_rate": 2.9982179982179983e-05, - "loss": 0.2849, + "epoch": 0.005047955577990914, + "grad_norm": 0.8398004174232483, + "learning_rate": 2.9969712266532054e-05, + "loss": 0.3087, "step": 10 }, { - "epoch": 0.00594000594000594, - "grad_norm": 1.1830675601959229, - "learning_rate": 2.9964359964359965e-05, - "loss": 0.165, + "epoch": 0.010095911155981827, + "grad_norm": 1.147126317024231, + "learning_rate": 2.993942453306411e-05, + "loss": 0.202, "step": 20 }, { - "epoch": 0.00891000891000891, - "grad_norm": 1.1286729574203491, - "learning_rate": 2.9946539946539947e-05, - "loss": 0.1477, + "epoch": 0.01514386673397274, + "grad_norm": 1.1376692056655884, + "learning_rate": 2.9909136799596164e-05, + "loss": 0.1375, "step": 30 }, { - "epoch": 0.01188001188001188, - "grad_norm": 1.1905314922332764, - "learning_rate": 2.992871992871993e-05, - "loss": 0.1291, + "epoch": 0.020191822311963654, + "grad_norm": 3.0222654342651367, + "learning_rate": 2.987884906612822e-05, + "loss": 0.1254, "step": 40 }, { - "epoch": 0.01485001485001485, - "grad_norm": 1.1131917238235474, - "learning_rate": 2.991089991089991e-05, - "loss": 0.1261, + "epoch": 0.02523977788995457, + "grad_norm": 1.3963178396224976, + "learning_rate": 2.9848561332660275e-05, + "loss": 0.1105, "step": 50 }, { - "epoch": 0.01782001782001782, - "grad_norm": 1.267146348953247, - "learning_rate": 2.9893079893079894e-05, - "loss": 0.1202, + "epoch": 0.03028773346794548, + "grad_norm": 0.741131067276001, + "learning_rate": 2.9818273599192328e-05, + "loss": 0.1022, "step": 60 }, { - "epoch": 0.02079002079002079, - "grad_norm": 1.0151565074920654, - "learning_rate": 2.9875259875259876e-05, - "loss": 0.1209, + "epoch": 0.0353356890459364, + "grad_norm": 1.0705397129058838, + "learning_rate": 2.978798586572438e-05, + "loss": 0.1027, "step": 70 }, { - "epoch": 0.02376002376002376, - "grad_norm": 0.8143087029457092, - "learning_rate": 2.9857439857439858e-05, - "loss": 0.1142, + "epoch": 0.04038364462392731, + "grad_norm": 1.127729892730713, + "learning_rate": 2.9757698132256435e-05, + "loss": 0.0979, "step": 80 }, { - "epoch": 0.02673002673002673, - "grad_norm": 1.4103150367736816, - "learning_rate": 2.983961983961984e-05, - "loss": 0.1123, + "epoch": 0.04543160020191822, + "grad_norm": 0.888960063457489, + "learning_rate": 2.9727410398788492e-05, + "loss": 0.1024, "step": 90 }, { - "epoch": 0.0297000297000297, - "grad_norm": 1.6572177410125732, - "learning_rate": 2.9821799821799822e-05, - "loss": 0.1077, + "epoch": 0.05047955577990914, + "grad_norm": 0.9185839295387268, + "learning_rate": 2.9697122665320545e-05, + "loss": 0.1142, "step": 100 }, { - "epoch": 0.03267003267003267, - "grad_norm": 0.7510947585105896, - "learning_rate": 2.9803979803979805e-05, - "loss": 0.1209, + "epoch": 0.05552751135790005, + "grad_norm": 0.737047016620636, + "learning_rate": 2.96668349318526e-05, + "loss": 0.0956, "step": 110 }, { - "epoch": 0.03564003564003564, - "grad_norm": 1.0185679197311401, - "learning_rate": 2.9786159786159787e-05, - "loss": 0.1073, + "epoch": 0.06057546693589096, + "grad_norm": 0.7749747037887573, + "learning_rate": 2.9636547198384656e-05, + "loss": 0.0978, "step": 120 }, { - "epoch": 0.03861003861003861, - "grad_norm": 1.214509129524231, - "learning_rate": 2.976833976833977e-05, - "loss": 0.1243, + "epoch": 0.06562342251388188, + "grad_norm": 1.079695224761963, + "learning_rate": 2.960625946491671e-05, + "loss": 0.092, "step": 130 }, { - "epoch": 0.04158004158004158, - "grad_norm": 0.7015527486801147, - "learning_rate": 2.975051975051975e-05, - "loss": 0.1073, + "epoch": 0.0706713780918728, + "grad_norm": 0.8315634727478027, + "learning_rate": 2.9575971731448766e-05, + "loss": 0.0975, "step": 140 }, { - "epoch": 0.04455004455004455, - "grad_norm": 0.7993521690368652, - "learning_rate": 2.9732699732699733e-05, - "loss": 0.1162, + "epoch": 0.0757193336698637, + "grad_norm": 0.7270865440368652, + "learning_rate": 2.954568399798082e-05, + "loss": 0.098, "step": 150 }, { - "epoch": 0.04752004752004752, - "grad_norm": 1.4627108573913574, - "learning_rate": 2.9714879714879715e-05, - "loss": 0.1094, + "epoch": 0.08076728924785462, + "grad_norm": 0.5786823630332947, + "learning_rate": 2.9515396264512873e-05, + "loss": 0.0846, "step": 160 }, { - "epoch": 0.05049005049005049, - "grad_norm": 0.5512596368789673, - "learning_rate": 2.9697059697059698e-05, - "loss": 0.1105, + "epoch": 0.08581524482584553, + "grad_norm": 0.7117003798484802, + "learning_rate": 2.948510853104493e-05, + "loss": 0.0905, "step": 170 }, { - "epoch": 0.05346005346005346, - "grad_norm": 0.6437152028083801, - "learning_rate": 2.967923967923968e-05, - "loss": 0.1165, + "epoch": 0.09086320040383644, + "grad_norm": 0.6765159368515015, + "learning_rate": 2.9454820797576983e-05, + "loss": 0.0764, "step": 180 }, { - "epoch": 0.05643005643005643, - "grad_norm": 0.9050448536872864, - "learning_rate": 2.9661419661419662e-05, - "loss": 0.1158, + "epoch": 0.09591115598182735, + "grad_norm": 1.1397738456726074, + "learning_rate": 2.9424533064109037e-05, + "loss": 0.0882, "step": 190 }, { - "epoch": 0.0594000594000594, - "grad_norm": 0.7845998406410217, - "learning_rate": 2.9643599643599644e-05, - "loss": 0.1046, + "epoch": 0.10095911155981828, + "grad_norm": 0.6545870900154114, + "learning_rate": 2.939424533064109e-05, + "loss": 0.0991, "step": 200 }, { - "epoch": 0.062370062370062374, - "grad_norm": 1.2148687839508057, - "learning_rate": 2.9625779625779626e-05, - "loss": 0.1048, + "epoch": 0.10600706713780919, + "grad_norm": 0.8882391452789307, + "learning_rate": 2.9363957597173144e-05, + "loss": 0.0902, "step": 210 }, { - "epoch": 0.06534006534006534, - "grad_norm": 0.5540338754653931, - "learning_rate": 2.960795960795961e-05, - "loss": 0.0989, + "epoch": 0.1110550227158001, + "grad_norm": 0.5973140001296997, + "learning_rate": 2.93336698637052e-05, + "loss": 0.0968, "step": 220 }, { - "epoch": 0.0683100683100683, - "grad_norm": 1.147627830505371, - "learning_rate": 2.959013959013959e-05, - "loss": 0.0948, + "epoch": 0.11610297829379101, + "grad_norm": 1.3215384483337402, + "learning_rate": 2.9303382130237254e-05, + "loss": 0.0901, "step": 230 }, { - "epoch": 0.07128007128007129, - "grad_norm": 0.6427733898162842, - "learning_rate": 2.9572319572319573e-05, - "loss": 0.1132, + "epoch": 0.12115093387178193, + "grad_norm": 0.6139042973518372, + "learning_rate": 2.9273094396769307e-05, + "loss": 0.0739, "step": 240 }, { - "epoch": 0.07425007425007425, - "grad_norm": 0.48930391669273376, - "learning_rate": 2.9554499554499555e-05, - "loss": 0.1077, + "epoch": 0.12619888944977284, + "grad_norm": 0.9095037579536438, + "learning_rate": 2.9242806663301364e-05, + "loss": 0.0907, "step": 250 }, { - "epoch": 0.07722007722007722, - "grad_norm": 0.5915262699127197, - "learning_rate": 2.9536679536679537e-05, - "loss": 0.0988, + "epoch": 0.13124684502776376, + "grad_norm": 1.0266954898834229, + "learning_rate": 2.9212518929833418e-05, + "loss": 0.0726, "step": 260 }, { - "epoch": 0.08019008019008018, - "grad_norm": 0.9437302350997925, - "learning_rate": 2.951885951885952e-05, - "loss": 0.0943, + "epoch": 0.13629480060575466, + "grad_norm": 0.734716534614563, + "learning_rate": 2.9182231196365474e-05, + "loss": 0.0891, "step": 270 }, { - "epoch": 0.08316008316008316, - "grad_norm": 0.8880864381790161, - "learning_rate": 2.95010395010395e-05, - "loss": 0.0874, + "epoch": 0.1413427561837456, + "grad_norm": 0.7633081674575806, + "learning_rate": 2.9151943462897528e-05, + "loss": 0.0747, "step": 280 }, { - "epoch": 0.08613008613008613, - "grad_norm": 0.5144450068473816, - "learning_rate": 2.9483219483219484e-05, - "loss": 0.1055, + "epoch": 0.1463907117617365, + "grad_norm": 0.8185615539550781, + "learning_rate": 2.912165572942958e-05, + "loss": 0.0815, "step": 290 }, { - "epoch": 0.0891000891000891, - "grad_norm": 0.718809187412262, - "learning_rate": 2.9465399465399466e-05, - "loss": 0.1052, + "epoch": 0.1514386673397274, + "grad_norm": 1.2503191232681274, + "learning_rate": 2.9091367995961638e-05, + "loss": 0.0844, "step": 300 }, { - "epoch": 0.09207009207009206, - "grad_norm": 1.3571438789367676, - "learning_rate": 2.9447579447579448e-05, - "loss": 0.1063, + "epoch": 0.15648662291771834, + "grad_norm": 0.52531898021698, + "learning_rate": 2.906108026249369e-05, + "loss": 0.0863, "step": 310 }, { - "epoch": 0.09504009504009504, - "grad_norm": 1.6258764266967773, - "learning_rate": 2.942975942975943e-05, - "loss": 0.1162, + "epoch": 0.16153457849570924, + "grad_norm": 0.8883135914802551, + "learning_rate": 2.9030792529025745e-05, + "loss": 0.0833, "step": 320 }, { - "epoch": 0.09801009801009801, - "grad_norm": 0.46299999952316284, - "learning_rate": 2.9411939411939412e-05, - "loss": 0.0883, + "epoch": 0.16658253407370016, + "grad_norm": 0.5173369646072388, + "learning_rate": 2.90005047955578e-05, + "loss": 0.0882, "step": 330 }, { - "epoch": 0.10098010098010098, - "grad_norm": 0.6494696140289307, - "learning_rate": 2.9394119394119395e-05, - "loss": 0.112, + "epoch": 0.17163048965169106, + "grad_norm": 0.5770648717880249, + "learning_rate": 2.8970217062089852e-05, + "loss": 0.0814, "step": 340 }, { - "epoch": 0.10395010395010396, - "grad_norm": 0.6922862529754639, - "learning_rate": 2.9376299376299377e-05, - "loss": 0.099, + "epoch": 0.17667844522968199, + "grad_norm": 0.8828192949295044, + "learning_rate": 2.893992932862191e-05, + "loss": 0.0776, "step": 350 }, { - "epoch": 0.10692010692010692, - "grad_norm": 0.6345096230506897, - "learning_rate": 2.935847935847936e-05, - "loss": 0.0879, + "epoch": 0.18172640080767288, + "grad_norm": 0.756236732006073, + "learning_rate": 2.8909641595153962e-05, + "loss": 0.0736, "step": 360 }, { - "epoch": 0.10989010989010989, - "grad_norm": 1.3332024812698364, - "learning_rate": 2.934065934065934e-05, - "loss": 0.1001, + "epoch": 0.1867743563856638, + "grad_norm": 0.47730007767677307, + "learning_rate": 2.887935386168602e-05, + "loss": 0.0856, "step": 370 }, { - "epoch": 0.11286011286011285, - "grad_norm": 1.0127153396606445, - "learning_rate": 2.9322839322839323e-05, - "loss": 0.0997, + "epoch": 0.1918223119636547, + "grad_norm": 2.5338025093078613, + "learning_rate": 2.8849066128218072e-05, + "loss": 0.0879, "step": 380 }, { - "epoch": 0.11583011583011583, - "grad_norm": 0.48219984769821167, - "learning_rate": 2.930501930501931e-05, - "loss": 0.0875, + "epoch": 0.19687026754164563, + "grad_norm": 0.6218165159225464, + "learning_rate": 2.8818778394750126e-05, + "loss": 0.0724, "step": 390 }, { - "epoch": 0.1188001188001188, - "grad_norm": 0.8579444289207458, - "learning_rate": 2.9287199287199288e-05, - "loss": 0.1045, + "epoch": 0.20191822311963656, + "grad_norm": 1.1621041297912598, + "learning_rate": 2.8788490661282183e-05, + "loss": 0.0742, "step": 400 }, { - "epoch": 0.12177012177012177, - "grad_norm": 0.5488039255142212, - "learning_rate": 2.926937926937927e-05, - "loss": 0.1084, + "epoch": 0.20696617869762746, + "grad_norm": 0.8511998653411865, + "learning_rate": 2.8758202927814236e-05, + "loss": 0.0798, "step": 410 }, { - "epoch": 0.12474012474012475, - "grad_norm": 1.2597718238830566, - "learning_rate": 2.9251559251559252e-05, - "loss": 0.0853, + "epoch": 0.21201413427561838, + "grad_norm": 0.5848472118377686, + "learning_rate": 2.8727915194346293e-05, + "loss": 0.0834, "step": 420 }, { - "epoch": 0.1277101277101277, - "grad_norm": 1.077631950378418, - "learning_rate": 2.9233739233739234e-05, - "loss": 0.0962, + "epoch": 0.21706208985360928, + "grad_norm": 0.5747645497322083, + "learning_rate": 2.8697627460878346e-05, + "loss": 0.0745, "step": 430 }, { - "epoch": 0.13068013068013068, - "grad_norm": 0.5581513047218323, - "learning_rate": 2.9215919215919216e-05, - "loss": 0.0922, + "epoch": 0.2221100454316002, + "grad_norm": 1.058206558227539, + "learning_rate": 2.86673397274104e-05, + "loss": 0.0767, "step": 440 }, { - "epoch": 0.13365013365013365, - "grad_norm": 0.6805756092071533, - "learning_rate": 2.91980991980992e-05, - "loss": 0.1083, + "epoch": 0.2271580010095911, + "grad_norm": 0.8267918825149536, + "learning_rate": 2.8637051993942453e-05, + "loss": 0.0893, "step": 450 }, { - "epoch": 0.1366201366201366, - "grad_norm": 0.860261857509613, - "learning_rate": 2.9180279180279184e-05, - "loss": 0.0852, + "epoch": 0.23220595658758203, + "grad_norm": 1.1392240524291992, + "learning_rate": 2.8606764260474507e-05, + "loss": 0.0833, "step": 460 }, { - "epoch": 0.13959013959013958, - "grad_norm": 1.9232168197631836, - "learning_rate": 2.9162459162459163e-05, - "loss": 0.0933, + "epoch": 0.23725391216557296, + "grad_norm": 0.9474436044692993, + "learning_rate": 2.8576476527006564e-05, + "loss": 0.0896, "step": 470 }, { - "epoch": 0.14256014256014257, - "grad_norm": 0.8232311606407166, - "learning_rate": 2.9144639144639145e-05, - "loss": 0.0928, + "epoch": 0.24230186774356385, + "grad_norm": 1.2880048751831055, + "learning_rate": 2.8546188793538617e-05, + "loss": 0.0924, "step": 480 }, { - "epoch": 0.14553014553014554, - "grad_norm": 0.8007870316505432, - "learning_rate": 2.9126819126819127e-05, - "loss": 0.0906, + "epoch": 0.24734982332155478, + "grad_norm": 0.6342403888702393, + "learning_rate": 2.851590106007067e-05, + "loss": 0.0799, "step": 490 }, { - "epoch": 0.1485001485001485, - "grad_norm": 1.1848207712173462, - "learning_rate": 2.910899910899911e-05, - "loss": 0.1052, + "epoch": 0.2523977788995457, + "grad_norm": 0.5780256986618042, + "learning_rate": 2.8485613326602727e-05, + "loss": 0.0798, "step": 500 }, { - "epoch": 0.15147015147015147, - "grad_norm": 0.5605499744415283, - "learning_rate": 2.909117909117909e-05, - "loss": 0.093, + "epoch": 0.2574457344775366, + "grad_norm": 0.7743504643440247, + "learning_rate": 2.845532559313478e-05, + "loss": 0.0681, "step": 510 }, { - "epoch": 0.15444015444015444, - "grad_norm": 0.6382190585136414, - "learning_rate": 2.9073359073359074e-05, - "loss": 0.0997, + "epoch": 0.26249369005552753, + "grad_norm": 0.5771861672401428, + "learning_rate": 2.8425037859666834e-05, + "loss": 0.0753, "step": 520 }, { - "epoch": 0.1574101574101574, - "grad_norm": 0.5192627310752869, - "learning_rate": 2.905553905553906e-05, - "loss": 0.1098, + "epoch": 0.2675416456335184, + "grad_norm": 0.6735575199127197, + "learning_rate": 2.839475012619889e-05, + "loss": 0.0773, "step": 530 }, { - "epoch": 0.16038016038016037, - "grad_norm": 0.5898168683052063, - "learning_rate": 2.9037719037719038e-05, - "loss": 0.0951, + "epoch": 0.2725896012115093, + "grad_norm": 0.7692667841911316, + "learning_rate": 2.8364462392730945e-05, + "loss": 0.0732, "step": 540 }, { - "epoch": 0.16335016335016336, - "grad_norm": 0.465077668428421, - "learning_rate": 2.901989901989902e-05, - "loss": 0.0929, + "epoch": 0.27763755678950025, + "grad_norm": 0.5109196901321411, + "learning_rate": 2.8334174659263e-05, + "loss": 0.0859, "step": 550 }, { - "epoch": 0.16632016632016633, - "grad_norm": 0.6358753442764282, - "learning_rate": 2.9002079002079002e-05, - "loss": 0.0999, + "epoch": 0.2826855123674912, + "grad_norm": 0.726249098777771, + "learning_rate": 2.8303886925795055e-05, + "loss": 0.0801, "step": 560 }, { - "epoch": 0.1692901692901693, - "grad_norm": 0.7714558839797974, - "learning_rate": 2.8984258984258984e-05, - "loss": 0.1031, + "epoch": 0.2877334679454821, + "grad_norm": 0.8817322254180908, + "learning_rate": 2.8273599192327108e-05, + "loss": 0.0739, "step": 570 }, { - "epoch": 0.17226017226017226, - "grad_norm": 0.865616500377655, - "learning_rate": 2.8966438966438967e-05, - "loss": 0.0932, + "epoch": 0.292781423523473, + "grad_norm": 0.5081413984298706, + "learning_rate": 2.8243311458859162e-05, + "loss": 0.0727, "step": 580 }, { - "epoch": 0.17523017523017523, - "grad_norm": 0.621036171913147, - "learning_rate": 2.894861894861895e-05, - "loss": 0.094, + "epoch": 0.2978293791014639, + "grad_norm": 0.9367203712463379, + "learning_rate": 2.8213023725391215e-05, + "loss": 0.0751, "step": 590 }, { - "epoch": 0.1782001782001782, - "grad_norm": 0.5007760524749756, - "learning_rate": 2.8930798930798934e-05, - "loss": 0.099, + "epoch": 0.3028773346794548, + "grad_norm": 0.5382592678070068, + "learning_rate": 2.8182735991923272e-05, + "loss": 0.0756, "step": 600 }, { - "epoch": 0.18117018117018116, - "grad_norm": 0.47733157873153687, - "learning_rate": 2.8912978912978913e-05, - "loss": 0.0795, + "epoch": 0.30792529025744575, + "grad_norm": 0.40977007150650024, + "learning_rate": 2.8152448258455325e-05, + "loss": 0.0714, "step": 610 }, { - "epoch": 0.18414018414018413, - "grad_norm": 0.40642765164375305, - "learning_rate": 2.8895158895158895e-05, - "loss": 0.0829, + "epoch": 0.3129732458354367, + "grad_norm": 0.6829769015312195, + "learning_rate": 2.812216052498738e-05, + "loss": 0.0809, "step": 620 }, { - "epoch": 0.18711018711018712, - "grad_norm": 1.1361258029937744, - "learning_rate": 2.8877338877338877e-05, - "loss": 0.0893, + "epoch": 0.31802120141342755, + "grad_norm": 0.4805002212524414, + "learning_rate": 2.8091872791519436e-05, + "loss": 0.0789, "step": 630 }, { - "epoch": 0.1900801900801901, - "grad_norm": 0.7784861922264099, - "learning_rate": 2.885951885951886e-05, - "loss": 0.0888, + "epoch": 0.32306915699141847, + "grad_norm": 0.6755364537239075, + "learning_rate": 2.806158505805149e-05, + "loss": 0.0819, "step": 640 }, { - "epoch": 0.19305019305019305, - "grad_norm": 0.43066325783729553, - "learning_rate": 2.8841698841698842e-05, - "loss": 0.0966, + "epoch": 0.3281171125694094, + "grad_norm": 1.3035857677459717, + "learning_rate": 2.8031297324583546e-05, + "loss": 0.0861, "step": 650 }, { - "epoch": 0.19602019602019602, - "grad_norm": 0.36752209067344666, - "learning_rate": 2.8823878823878824e-05, - "loss": 0.0999, + "epoch": 0.3331650681474003, + "grad_norm": 0.7905831933021545, + "learning_rate": 2.80010095911156e-05, + "loss": 0.0739, "step": 660 }, { - "epoch": 0.19899019899019899, - "grad_norm": 0.9712108969688416, - "learning_rate": 2.880605880605881e-05, - "loss": 0.0906, + "epoch": 0.3382130237253912, + "grad_norm": 0.8810652494430542, + "learning_rate": 2.7970721857647653e-05, + "loss": 0.0678, "step": 670 }, { - "epoch": 0.20196020196020195, - "grad_norm": 0.714443564414978, - "learning_rate": 2.878823878823879e-05, - "loss": 0.1049, + "epoch": 0.3432609793033821, + "grad_norm": 1.1220252513885498, + "learning_rate": 2.794043412417971e-05, + "loss": 0.07, "step": 680 }, { - "epoch": 0.20493020493020492, - "grad_norm": 0.3934662640094757, - "learning_rate": 2.877041877041877e-05, - "loss": 0.09, + "epoch": 0.34830893488137304, + "grad_norm": 0.8519473075866699, + "learning_rate": 2.7910146390711763e-05, + "loss": 0.076, "step": 690 }, { - "epoch": 0.2079002079002079, - "grad_norm": 1.9262911081314087, - "learning_rate": 2.8752598752598753e-05, - "loss": 0.1051, + "epoch": 0.35335689045936397, + "grad_norm": 0.49878937005996704, + "learning_rate": 2.787985865724382e-05, + "loss": 0.0787, "step": 700 }, { - "epoch": 0.21087021087021088, - "grad_norm": 0.6336867809295654, - "learning_rate": 2.8734778734778735e-05, - "loss": 0.0852, + "epoch": 0.3584048460373549, + "grad_norm": 1.4854084253311157, + "learning_rate": 2.784957092377587e-05, + "loss": 0.0872, "step": 710 }, { - "epoch": 0.21384021384021384, - "grad_norm": 0.45155736804008484, - "learning_rate": 2.8716958716958717e-05, - "loss": 0.0928, + "epoch": 0.36345280161534577, + "grad_norm": 0.787535548210144, + "learning_rate": 2.7819283190307924e-05, + "loss": 0.0805, "step": 720 }, { - "epoch": 0.2168102168102168, - "grad_norm": 0.6008352041244507, - "learning_rate": 2.86991386991387e-05, - "loss": 0.0829, + "epoch": 0.3685007571933367, + "grad_norm": 0.8322392106056213, + "learning_rate": 2.778899545683998e-05, + "loss": 0.0726, "step": 730 }, { - "epoch": 0.21978021978021978, - "grad_norm": 0.4825937747955322, - "learning_rate": 2.8681318681318685e-05, - "loss": 0.0995, + "epoch": 0.3735487127713276, + "grad_norm": 0.48470157384872437, + "learning_rate": 2.7758707723372034e-05, + "loss": 0.0673, "step": 740 }, { - "epoch": 0.22275022275022274, - "grad_norm": 1.0774333477020264, - "learning_rate": 2.8663498663498664e-05, - "loss": 0.1031, + "epoch": 0.37859666834931854, + "grad_norm": 0.8375622034072876, + "learning_rate": 2.772841998990409e-05, + "loss": 0.0767, "step": 750 }, { - "epoch": 0.2257202257202257, - "grad_norm": 0.7147405743598938, - "learning_rate": 2.8645678645678646e-05, - "loss": 0.1018, + "epoch": 0.3836446239273094, + "grad_norm": 0.5212222337722778, + "learning_rate": 2.7698132256436144e-05, + "loss": 0.0737, "step": 760 }, { - "epoch": 0.2286902286902287, - "grad_norm": 0.6777707934379578, - "learning_rate": 2.8627858627858628e-05, - "loss": 0.0901, + "epoch": 0.38869257950530034, + "grad_norm": 0.503209114074707, + "learning_rate": 2.7667844522968198e-05, + "loss": 0.0657, "step": 770 }, { - "epoch": 0.23166023166023167, - "grad_norm": 0.4215840697288513, - "learning_rate": 2.861003861003861e-05, - "loss": 0.0862, + "epoch": 0.39374053508329127, + "grad_norm": 0.4290629029273987, + "learning_rate": 2.7637556789500254e-05, + "loss": 0.0745, "step": 780 }, { - "epoch": 0.23463023463023464, - "grad_norm": 0.4555210471153259, - "learning_rate": 2.8592218592218592e-05, - "loss": 0.0825, + "epoch": 0.3987884906612822, + "grad_norm": 0.7535534501075745, + "learning_rate": 2.7607269056032308e-05, + "loss": 0.0702, "step": 790 }, { - "epoch": 0.2376002376002376, - "grad_norm": 0.7088650465011597, - "learning_rate": 2.8574398574398574e-05, - "loss": 0.0932, + "epoch": 0.4038364462392731, + "grad_norm": 0.67135089635849, + "learning_rate": 2.757698132256436e-05, + "loss": 0.0754, "step": 800 }, { - "epoch": 0.24057024057024057, - "grad_norm": 0.6595791578292847, - "learning_rate": 2.855657855657856e-05, - "loss": 0.098, + "epoch": 0.408884401817264, + "grad_norm": 0.5307912230491638, + "learning_rate": 2.7546693589096418e-05, + "loss": 0.0717, "step": 810 }, { - "epoch": 0.24354024354024353, - "grad_norm": 0.5375499725341797, - "learning_rate": 2.853875853875854e-05, - "loss": 0.0875, + "epoch": 0.4139323573952549, + "grad_norm": 0.46130767464637756, + "learning_rate": 2.751640585562847e-05, + "loss": 0.065, "step": 820 }, { - "epoch": 0.2465102465102465, - "grad_norm": 0.4199369549751282, - "learning_rate": 2.852093852093852e-05, - "loss": 0.0807, + "epoch": 0.41898031297324584, + "grad_norm": 1.2904905080795288, + "learning_rate": 2.748611812216053e-05, + "loss": 0.0818, "step": 830 }, { - "epoch": 0.2494802494802495, - "grad_norm": 0.41728097200393677, - "learning_rate": 2.8503118503118503e-05, - "loss": 0.0925, + "epoch": 0.42402826855123676, + "grad_norm": 2.0480494499206543, + "learning_rate": 2.745583038869258e-05, + "loss": 0.085, "step": 840 }, { - "epoch": 0.25245025245025243, - "grad_norm": 0.6526634693145752, - "learning_rate": 2.8485298485298485e-05, - "loss": 0.093, + "epoch": 0.4290762241292277, + "grad_norm": 0.5108308792114258, + "learning_rate": 2.7425542655224632e-05, + "loss": 0.0729, "step": 850 }, { - "epoch": 0.2554202554202554, - "grad_norm": 0.6086540222167969, - "learning_rate": 2.8467478467478467e-05, - "loss": 0.1036, + "epoch": 0.43412417970721856, + "grad_norm": 0.6915296912193298, + "learning_rate": 2.739525492175669e-05, + "loss": 0.071, "step": 860 }, { - "epoch": 0.25839025839025836, - "grad_norm": 0.8363798260688782, - "learning_rate": 2.844965844965845e-05, - "loss": 0.0871, + "epoch": 0.4391721352852095, + "grad_norm": 0.8100910782814026, + "learning_rate": 2.7364967188288742e-05, + "loss": 0.0667, "step": 870 }, { - "epoch": 0.26136026136026136, - "grad_norm": 0.49175241589546204, - "learning_rate": 2.8431838431838435e-05, - "loss": 0.0953, + "epoch": 0.4442200908632004, + "grad_norm": 0.626818835735321, + "learning_rate": 2.73346794548208e-05, + "loss": 0.0695, "step": 880 }, { - "epoch": 0.26433026433026435, - "grad_norm": 0.6891732811927795, - "learning_rate": 2.8414018414018414e-05, - "loss": 0.0789, + "epoch": 0.44926804644119134, + "grad_norm": 0.673156201839447, + "learning_rate": 2.7304391721352853e-05, + "loss": 0.0793, "step": 890 }, { - "epoch": 0.2673002673002673, - "grad_norm": 0.7982739210128784, - "learning_rate": 2.8396198396198396e-05, - "loss": 0.1105, + "epoch": 0.4543160020191822, + "grad_norm": 0.5740798711776733, + "learning_rate": 2.7274103987884906e-05, + "loss": 0.0731, "step": 900 }, { - "epoch": 0.2702702702702703, - "grad_norm": 1.5775654315948486, - "learning_rate": 2.8378378378378378e-05, - "loss": 0.0902, + "epoch": 0.45936395759717313, + "grad_norm": 0.744429349899292, + "learning_rate": 2.7243816254416963e-05, + "loss": 0.0743, "step": 910 }, { - "epoch": 0.2732402732402732, - "grad_norm": 0.42223745584487915, - "learning_rate": 2.836055836055836e-05, - "loss": 0.0937, + "epoch": 0.46441191317516406, + "grad_norm": 0.5837222933769226, + "learning_rate": 2.7213528520949016e-05, + "loss": 0.0747, "step": 920 }, { - "epoch": 0.2762102762102762, - "grad_norm": 0.8138239979743958, - "learning_rate": 2.8342738342738343e-05, - "loss": 0.0989, + "epoch": 0.469459868753155, + "grad_norm": 0.500978410243988, + "learning_rate": 2.7183240787481073e-05, + "loss": 0.0753, "step": 930 }, { - "epoch": 0.27918027918027916, - "grad_norm": 0.5486122965812683, - "learning_rate": 2.8324918324918325e-05, - "loss": 0.0901, + "epoch": 0.4745078243311459, + "grad_norm": 1.0817604064941406, + "learning_rate": 2.7152953054013127e-05, + "loss": 0.0748, "step": 940 }, { - "epoch": 0.28215028215028215, - "grad_norm": 0.5096667408943176, - "learning_rate": 2.830709830709831e-05, - "loss": 0.0799, + "epoch": 0.4795557799091368, + "grad_norm": 0.5821205377578735, + "learning_rate": 2.712266532054518e-05, + "loss": 0.0766, "step": 950 }, { - "epoch": 0.28512028512028514, - "grad_norm": 0.5797027945518494, - "learning_rate": 2.8289278289278293e-05, - "loss": 0.0907, + "epoch": 0.4846037354871277, + "grad_norm": 0.6120801568031311, + "learning_rate": 2.7092377587077233e-05, + "loss": 0.0827, "step": 960 }, { - "epoch": 0.2880902880902881, - "grad_norm": 0.7815655469894409, - "learning_rate": 2.827145827145827e-05, - "loss": 0.0793, + "epoch": 0.48965169106511863, + "grad_norm": 0.4379239082336426, + "learning_rate": 2.7062089853609287e-05, + "loss": 0.0664, "step": 970 }, { - "epoch": 0.2910602910602911, - "grad_norm": 0.5682644248008728, - "learning_rate": 2.8253638253638253e-05, - "loss": 0.0799, + "epoch": 0.49469964664310956, + "grad_norm": 0.5472243428230286, + "learning_rate": 2.7031802120141344e-05, + "loss": 0.0767, "step": 980 }, { - "epoch": 0.294030294030294, - "grad_norm": 0.5554261207580566, - "learning_rate": 2.8235818235818236e-05, - "loss": 0.0949, + "epoch": 0.49974760222110043, + "grad_norm": 1.0190905332565308, + "learning_rate": 2.7001514386673397e-05, + "loss": 0.0739, "step": 990 }, { - "epoch": 0.297000297000297, - "grad_norm": 0.5728469491004944, - "learning_rate": 2.8217998217998218e-05, - "loss": 0.1111, + "epoch": 0.5047955577990914, + "grad_norm": 0.7046610713005066, + "learning_rate": 2.697122665320545e-05, + "loss": 0.0685, "step": 1000 }, { - "epoch": 0.29997029997029995, - "grad_norm": 0.5483665466308594, - "learning_rate": 2.82001782001782e-05, - "loss": 0.0901, + "epoch": 0.5098435133770823, + "grad_norm": 0.5559498071670532, + "learning_rate": 2.6940938919737507e-05, + "loss": 0.0715, "step": 1010 }, { - "epoch": 0.30294030294030294, - "grad_norm": 0.7061681151390076, - "learning_rate": 2.8182358182358186e-05, - "loss": 0.0942, + "epoch": 0.5148914689550732, + "grad_norm": 0.6298381686210632, + "learning_rate": 2.691065118626956e-05, + "loss": 0.0828, "step": 1020 }, { - "epoch": 0.30591030591030594, - "grad_norm": 0.4503157436847687, - "learning_rate": 2.8164538164538168e-05, - "loss": 0.0837, + "epoch": 0.5199394245330641, + "grad_norm": 0.7023555636405945, + "learning_rate": 2.6880363452801618e-05, + "loss": 0.0809, "step": 1030 }, { - "epoch": 0.3088803088803089, - "grad_norm": 1.187880277633667, - "learning_rate": 2.8146718146718146e-05, - "loss": 0.0866, + "epoch": 0.5249873801110551, + "grad_norm": 0.6804683804512024, + "learning_rate": 2.685007571933367e-05, + "loss": 0.0739, "step": 1040 }, { - "epoch": 0.31185031185031187, - "grad_norm": 1.120139718055725, - "learning_rate": 2.812889812889813e-05, - "loss": 0.1058, + "epoch": 0.5300353356890459, + "grad_norm": 0.7743015885353088, + "learning_rate": 2.6819787985865725e-05, + "loss": 0.0658, "step": 1050 }, { - "epoch": 0.3148203148203148, - "grad_norm": 0.7681704759597778, - "learning_rate": 2.811107811107811e-05, - "loss": 0.0864, + "epoch": 0.5350832912670368, + "grad_norm": 1.36810302734375, + "learning_rate": 2.678950025239778e-05, + "loss": 0.0747, "step": 1060 }, { - "epoch": 0.3177903177903178, - "grad_norm": 0.6372396349906921, - "learning_rate": 2.8093258093258093e-05, - "loss": 0.0886, + "epoch": 0.5401312468450278, + "grad_norm": 0.47373896837234497, + "learning_rate": 2.6759212518929835e-05, + "loss": 0.0751, "step": 1070 }, { - "epoch": 0.32076032076032074, - "grad_norm": 0.7018745541572571, - "learning_rate": 2.8075438075438075e-05, - "loss": 0.0786, + "epoch": 0.5451792024230186, + "grad_norm": 0.6654021143913269, + "learning_rate": 2.6728924785461892e-05, + "loss": 0.0683, "step": 1080 }, { - "epoch": 0.32373032373032373, - "grad_norm": 0.8289116621017456, - "learning_rate": 2.805761805761806e-05, - "loss": 0.0964, + "epoch": 0.5502271580010096, + "grad_norm": 1.0054854154586792, + "learning_rate": 2.6698637051993942e-05, + "loss": 0.0676, "step": 1090 }, { - "epoch": 0.3267003267003267, - "grad_norm": 0.7211658954620361, - "learning_rate": 2.8039798039798043e-05, - "loss": 0.1066, + "epoch": 0.5552751135790005, + "grad_norm": 0.5544041395187378, + "learning_rate": 2.6668349318525995e-05, + "loss": 0.075, "step": 1100 }, { - "epoch": 0.32967032967032966, - "grad_norm": 0.677126407623291, - "learning_rate": 2.802197802197802e-05, - "loss": 0.081, + "epoch": 0.5603230691569914, + "grad_norm": 0.6919006109237671, + "learning_rate": 2.6638061585058052e-05, + "loss": 0.0709, "step": 1110 }, { - "epoch": 0.33264033264033266, - "grad_norm": 0.3897887170314789, - "learning_rate": 2.8004158004158004e-05, - "loss": 0.0937, + "epoch": 0.5653710247349824, + "grad_norm": 0.5584747791290283, + "learning_rate": 2.6607773851590106e-05, + "loss": 0.0623, "step": 1120 }, { - "epoch": 0.3356103356103356, - "grad_norm": 0.5881434679031372, - "learning_rate": 2.7986337986337986e-05, - "loss": 0.0852, + "epoch": 0.5704189803129732, + "grad_norm": 0.47064319252967834, + "learning_rate": 2.657748611812216e-05, + "loss": 0.0744, "step": 1130 }, { - "epoch": 0.3385803385803386, - "grad_norm": 0.6897678971290588, - "learning_rate": 2.7968517968517968e-05, - "loss": 0.0873, + "epoch": 0.5754669358909642, + "grad_norm": 0.5119986534118652, + "learning_rate": 2.6547198384654216e-05, + "loss": 0.0795, "step": 1140 }, { - "epoch": 0.34155034155034153, - "grad_norm": 0.6038883328437805, - "learning_rate": 2.795069795069795e-05, - "loss": 0.0805, + "epoch": 0.5805148914689551, + "grad_norm": 0.9572923183441162, + "learning_rate": 2.651691065118627e-05, + "loss": 0.073, "step": 1150 }, { - "epoch": 0.3445203445203445, - "grad_norm": 0.4414396286010742, - "learning_rate": 2.7932877932877936e-05, - "loss": 0.0981, + "epoch": 0.585562847046946, + "grad_norm": 0.5633489489555359, + "learning_rate": 2.6486622917718326e-05, + "loss": 0.0637, "step": 1160 }, { - "epoch": 0.3474903474903475, - "grad_norm": 0.48170387744903564, - "learning_rate": 2.7915057915057918e-05, - "loss": 0.0938, + "epoch": 0.5906108026249369, + "grad_norm": 1.1218105554580688, + "learning_rate": 2.645633518425038e-05, + "loss": 0.0695, "step": 1170 }, { - "epoch": 0.35046035046035046, - "grad_norm": 0.5567618012428284, - "learning_rate": 2.7897237897237897e-05, - "loss": 0.0897, + "epoch": 0.5956587582029278, + "grad_norm": 0.6655285954475403, + "learning_rate": 2.6426047450782433e-05, + "loss": 0.0774, "step": 1180 }, { - "epoch": 0.35343035343035345, - "grad_norm": 0.6452346444129944, - "learning_rate": 2.787941787941788e-05, - "loss": 0.0948, + "epoch": 0.6007067137809188, + "grad_norm": 1.3088024854660034, + "learning_rate": 2.639575971731449e-05, + "loss": 0.0748, "step": 1190 }, { - "epoch": 0.3564003564003564, - "grad_norm": 0.4139314889907837, - "learning_rate": 2.786159786159786e-05, - "loss": 0.0849, + "epoch": 0.6057546693589096, + "grad_norm": 0.9868513941764832, + "learning_rate": 2.6365471983846543e-05, + "loss": 0.0695, "step": 1200 }, { - "epoch": 0.3593703593703594, - "grad_norm": 0.5524829030036926, - "learning_rate": 2.7843777843777843e-05, - "loss": 0.0742, + "epoch": 0.6108026249369005, + "grad_norm": 0.5922626852989197, + "learning_rate": 2.63351842503786e-05, + "loss": 0.0678, "step": 1210 }, { - "epoch": 0.3623403623403623, - "grad_norm": 1.0731943845748901, - "learning_rate": 2.7825957825957826e-05, - "loss": 0.0834, + "epoch": 0.6158505805148915, + "grad_norm": 0.6839954257011414, + "learning_rate": 2.630489651691065e-05, + "loss": 0.0693, "step": 1220 }, { - "epoch": 0.3653103653103653, - "grad_norm": 0.6787437796592712, - "learning_rate": 2.780813780813781e-05, - "loss": 0.0921, + "epoch": 0.6208985360928824, + "grad_norm": 0.6755519509315491, + "learning_rate": 2.6274608783442704e-05, + "loss": 0.0742, "step": 1230 }, { - "epoch": 0.36828036828036825, - "grad_norm": 0.536044716835022, - "learning_rate": 2.7790317790317793e-05, - "loss": 0.0837, + "epoch": 0.6259464916708734, + "grad_norm": 0.4968509078025818, + "learning_rate": 2.624432104997476e-05, + "loss": 0.0615, "step": 1240 }, { - "epoch": 0.37125037125037125, - "grad_norm": 0.4149301052093506, - "learning_rate": 2.7772497772497772e-05, - "loss": 0.0847, + "epoch": 0.6309944472488642, + "grad_norm": 1.1036404371261597, + "learning_rate": 2.6214033316506814e-05, + "loss": 0.0727, "step": 1250 }, { - "epoch": 0.37422037422037424, - "grad_norm": 0.6760357618331909, - "learning_rate": 2.7754677754677754e-05, - "loss": 0.0965, + "epoch": 0.6360424028268551, + "grad_norm": 0.810405969619751, + "learning_rate": 2.618374558303887e-05, + "loss": 0.072, "step": 1260 }, { - "epoch": 0.3771903771903772, - "grad_norm": 0.8695538640022278, - "learning_rate": 2.7736857736857736e-05, - "loss": 0.0862, + "epoch": 0.6410903584048461, + "grad_norm": 0.730140209197998, + "learning_rate": 2.6153457849570924e-05, + "loss": 0.0652, "step": 1270 }, { - "epoch": 0.3801603801603802, - "grad_norm": 1.1023316383361816, - "learning_rate": 2.771903771903772e-05, - "loss": 0.0818, + "epoch": 0.6461383139828369, + "grad_norm": 1.1645480394363403, + "learning_rate": 2.6123170116102978e-05, + "loss": 0.0716, "step": 1280 }, { - "epoch": 0.3831303831303831, - "grad_norm": 1.0046688318252563, - "learning_rate": 2.77012177012177e-05, - "loss": 0.0923, + "epoch": 0.6511862695608278, + "grad_norm": 0.8481037020683289, + "learning_rate": 2.6092882382635034e-05, + "loss": 0.0737, "step": 1290 }, { - "epoch": 0.3861003861003861, - "grad_norm": 0.4843716323375702, - "learning_rate": 2.7683397683397686e-05, - "loss": 0.0774, + "epoch": 0.6562342251388188, + "grad_norm": 0.5972946882247925, + "learning_rate": 2.6062594649167088e-05, + "loss": 0.0704, "step": 1300 }, { - "epoch": 0.38907038907038904, - "grad_norm": 0.6335024833679199, - "learning_rate": 2.766557766557767e-05, - "loss": 0.083, + "epoch": 0.6612821807168097, + "grad_norm": 0.6405556201934814, + "learning_rate": 2.6032306915699145e-05, + "loss": 0.0628, "step": 1310 }, { - "epoch": 0.39204039204039204, - "grad_norm": 0.5234698057174683, - "learning_rate": 2.7647757647757647e-05, - "loss": 0.0755, + "epoch": 0.6663301362948006, + "grad_norm": 0.8645715117454529, + "learning_rate": 2.6002019182231198e-05, + "loss": 0.0742, "step": 1320 }, { - "epoch": 0.39501039501039503, - "grad_norm": 0.477662056684494, - "learning_rate": 2.762993762993763e-05, - "loss": 0.0756, + "epoch": 0.6713780918727915, + "grad_norm": 1.4211089611053467, + "learning_rate": 2.597173144876325e-05, + "loss": 0.0731, "step": 1330 }, { - "epoch": 0.39798039798039797, - "grad_norm": 0.5107772350311279, - "learning_rate": 2.761211761211761e-05, - "loss": 0.0688, + "epoch": 0.6764260474507824, + "grad_norm": 0.8079481720924377, + "learning_rate": 2.594144371529531e-05, + "loss": 0.0732, "step": 1340 }, { - "epoch": 0.40095040095040096, - "grad_norm": 0.6898319125175476, - "learning_rate": 2.7594297594297594e-05, - "loss": 0.0856, + "epoch": 0.6814740030287734, + "grad_norm": 0.6517273783683777, + "learning_rate": 2.591115598182736e-05, + "loss": 0.0688, "step": 1350 }, { - "epoch": 0.4039204039204039, - "grad_norm": 0.5590442419052124, - "learning_rate": 2.7576477576477576e-05, - "loss": 0.0857, + "epoch": 0.6865219586067642, + "grad_norm": 1.2093323469161987, + "learning_rate": 2.5880868248359415e-05, + "loss": 0.0729, "step": 1360 }, { - "epoch": 0.4068904068904069, - "grad_norm": 0.6682615280151367, - "learning_rate": 2.755865755865756e-05, - "loss": 0.0828, + "epoch": 0.6915699141847552, + "grad_norm": 0.6432307362556458, + "learning_rate": 2.585058051489147e-05, + "loss": 0.076, "step": 1370 }, { - "epoch": 0.40986040986040984, - "grad_norm": 0.27072158455848694, - "learning_rate": 2.7540837540837544e-05, - "loss": 0.0775, + "epoch": 0.6966178697627461, + "grad_norm": 0.5220794677734375, + "learning_rate": 2.5820292781423522e-05, + "loss": 0.0702, "step": 1380 }, { - "epoch": 0.41283041283041283, - "grad_norm": 0.6918196082115173, - "learning_rate": 2.7523017523017522e-05, - "loss": 0.0734, + "epoch": 0.701665825340737, + "grad_norm": 1.0983613729476929, + "learning_rate": 2.579000504795558e-05, + "loss": 0.0676, "step": 1390 }, { - "epoch": 0.4158004158004158, - "grad_norm": 0.6403471827507019, - "learning_rate": 2.7505197505197505e-05, - "loss": 0.0814, + "epoch": 0.7067137809187279, + "grad_norm": 0.859348475933075, + "learning_rate": 2.5759717314487633e-05, + "loss": 0.0615, "step": 1400 }, { - "epoch": 0.41877041877041876, - "grad_norm": 0.7018643617630005, - "learning_rate": 2.7487377487377487e-05, - "loss": 0.0811, + "epoch": 0.7117617364967188, + "grad_norm": 0.7912864685058594, + "learning_rate": 2.572942958101969e-05, + "loss": 0.0681, "step": 1410 }, { - "epoch": 0.42174042174042176, - "grad_norm": 0.6571378111839294, - "learning_rate": 2.746955746955747e-05, - "loss": 0.0786, + "epoch": 0.7168096920747098, + "grad_norm": 0.6189167499542236, + "learning_rate": 2.5699141847551743e-05, + "loss": 0.0682, "step": 1420 }, { - "epoch": 0.4247104247104247, - "grad_norm": 0.7818433046340942, - "learning_rate": 2.745173745173745e-05, - "loss": 0.0743, + "epoch": 0.7218576476527007, + "grad_norm": 0.5456287860870361, + "learning_rate": 2.5668854114083796e-05, + "loss": 0.0591, "step": 1430 }, { - "epoch": 0.4276804276804277, - "grad_norm": 0.7524327635765076, - "learning_rate": 2.7433917433917437e-05, - "loss": 0.0757, + "epoch": 0.7269056032306915, + "grad_norm": 0.485055148601532, + "learning_rate": 2.5638566380615853e-05, + "loss": 0.0729, "step": 1440 }, { - "epoch": 0.4306504306504306, - "grad_norm": 0.8632511496543884, - "learning_rate": 2.741609741609742e-05, - "loss": 0.084, + "epoch": 0.7319535588086825, + "grad_norm": 0.46423906087875366, + "learning_rate": 2.5608278647147907e-05, + "loss": 0.0646, "step": 1450 }, { - "epoch": 0.4336204336204336, - "grad_norm": 0.6295231580734253, - "learning_rate": 2.7398277398277398e-05, - "loss": 0.0892, + "epoch": 0.7370015143866734, + "grad_norm": 0.5944865345954895, + "learning_rate": 2.557799091367996e-05, + "loss": 0.0696, "step": 1460 }, { - "epoch": 0.4365904365904366, - "grad_norm": 0.6907210946083069, - "learning_rate": 2.738045738045738e-05, - "loss": 0.1006, + "epoch": 0.7420494699646644, + "grad_norm": 0.794015645980835, + "learning_rate": 2.5547703180212014e-05, + "loss": 0.0671, "step": 1470 }, { - "epoch": 0.43956043956043955, - "grad_norm": 0.617152988910675, - "learning_rate": 2.7362637362637362e-05, - "loss": 0.097, + "epoch": 0.7470974255426552, + "grad_norm": 0.6759900450706482, + "learning_rate": 2.5517415446744067e-05, + "loss": 0.074, "step": 1480 }, { - "epoch": 0.44253044253044255, - "grad_norm": 0.6373753547668457, - "learning_rate": 2.7344817344817344e-05, - "loss": 0.0784, + "epoch": 0.7521453811206461, + "grad_norm": 0.6719480156898499, + "learning_rate": 2.5487127713276124e-05, + "loss": 0.0708, "step": 1490 }, { - "epoch": 0.4455004455004455, - "grad_norm": 0.7640069723129272, - "learning_rate": 2.7326997326997326e-05, - "loss": 0.0729, + "epoch": 0.7571933366986371, + "grad_norm": 0.7934426665306091, + "learning_rate": 2.5456839979808177e-05, + "loss": 0.0664, "step": 1500 }, { - "epoch": 0.4484704484704485, - "grad_norm": 0.5482354164123535, - "learning_rate": 2.7309177309177312e-05, - "loss": 0.0876, + "epoch": 0.762241292276628, + "grad_norm": 1.4169378280639648, + "learning_rate": 2.542655224634023e-05, + "loss": 0.0726, "step": 1510 }, { - "epoch": 0.4514404514404514, - "grad_norm": 0.7966523766517639, - "learning_rate": 2.7291357291357294e-05, - "loss": 0.0833, + "epoch": 0.7672892478546188, + "grad_norm": 0.5849716067314148, + "learning_rate": 2.5396264512872288e-05, + "loss": 0.0709, "step": 1520 }, { - "epoch": 0.4544104544104544, - "grad_norm": 0.6484697461128235, - "learning_rate": 2.7273537273537276e-05, - "loss": 0.0854, + "epoch": 0.7723372034326098, + "grad_norm": 0.8471559286117554, + "learning_rate": 2.536597677940434e-05, + "loss": 0.0764, "step": 1530 }, { - "epoch": 0.4573804573804574, - "grad_norm": 0.43090665340423584, - "learning_rate": 2.7255717255717255e-05, - "loss": 0.0914, + "epoch": 0.7773851590106007, + "grad_norm": 0.7494149804115295, + "learning_rate": 2.5335689045936398e-05, + "loss": 0.0629, "step": 1540 }, { - "epoch": 0.46035046035046034, - "grad_norm": 0.5118837356567383, - "learning_rate": 2.7237897237897237e-05, - "loss": 0.0819, + "epoch": 0.7824331145885917, + "grad_norm": 0.7659397721290588, + "learning_rate": 2.530540131246845e-05, + "loss": 0.061, "step": 1550 }, { - "epoch": 0.46332046332046334, - "grad_norm": 0.9723702669143677, - "learning_rate": 2.722007722007722e-05, - "loss": 0.088, + "epoch": 0.7874810701665825, + "grad_norm": 0.8505954146385193, + "learning_rate": 2.5275113579000505e-05, + "loss": 0.0693, "step": 1560 }, { - "epoch": 0.4662904662904663, - "grad_norm": 1.0589011907577515, - "learning_rate": 2.72022572022572e-05, - "loss": 0.0963, + "epoch": 0.7925290257445734, + "grad_norm": 0.8126624226570129, + "learning_rate": 2.524482584553256e-05, + "loss": 0.0738, "step": 1570 }, { - "epoch": 0.46926046926046927, - "grad_norm": 0.6201198697090149, - "learning_rate": 2.7184437184437187e-05, - "loss": 0.0702, + "epoch": 0.7975769813225644, + "grad_norm": 0.9350792765617371, + "learning_rate": 2.5214538112064615e-05, + "loss": 0.0821, "step": 1580 }, { - "epoch": 0.4722304722304722, - "grad_norm": 0.40020257234573364, - "learning_rate": 2.716661716661717e-05, - "loss": 0.0752, + "epoch": 0.8026249369005553, + "grad_norm": 1.075035810470581, + "learning_rate": 2.5184250378596672e-05, + "loss": 0.0758, "step": 1590 }, { - "epoch": 0.4752004752004752, - "grad_norm": 0.8229923844337463, - "learning_rate": 2.714879714879715e-05, - "loss": 0.1031, + "epoch": 0.8076728924785462, + "grad_norm": 0.6885321736335754, + "learning_rate": 2.5153962645128722e-05, + "loss": 0.0641, "step": 1600 }, { - "epoch": 0.4781704781704782, - "grad_norm": 0.5380883812904358, - "learning_rate": 2.713097713097713e-05, - "loss": 0.0911, + "epoch": 0.8127208480565371, + "grad_norm": 0.7702226042747498, + "learning_rate": 2.5123674911660775e-05, + "loss": 0.0642, "step": 1610 }, { - "epoch": 0.48114048114048114, - "grad_norm": 0.507243812084198, - "learning_rate": 2.7113157113157112e-05, - "loss": 0.079, + "epoch": 0.817768803634528, + "grad_norm": 0.9809953570365906, + "learning_rate": 2.5093387178192832e-05, + "loss": 0.0759, "step": 1620 }, { - "epoch": 0.48411048411048413, - "grad_norm": 0.6244765520095825, - "learning_rate": 2.7095337095337095e-05, - "loss": 0.0643, + "epoch": 0.822816759212519, + "grad_norm": 0.5996444225311279, + "learning_rate": 2.5063099444724886e-05, + "loss": 0.0686, "step": 1630 }, { - "epoch": 0.48708048708048707, - "grad_norm": 1.1058402061462402, - "learning_rate": 2.7077517077517077e-05, - "loss": 0.0928, + "epoch": 0.8278647147905098, + "grad_norm": 0.5003983378410339, + "learning_rate": 2.5032811711256942e-05, + "loss": 0.0697, "step": 1640 }, { - "epoch": 0.49005049005049006, - "grad_norm": 0.8316872715950012, - "learning_rate": 2.7059697059697062e-05, - "loss": 0.0729, + "epoch": 0.8329126703685008, + "grad_norm": 0.7024896740913391, + "learning_rate": 2.5002523977788996e-05, + "loss": 0.0699, "step": 1650 }, { - "epoch": 0.493020493020493, - "grad_norm": 0.6039434671401978, - "learning_rate": 2.7041877041877044e-05, - "loss": 0.0907, + "epoch": 0.8379606259464917, + "grad_norm": 0.5384397506713867, + "learning_rate": 2.497223624432105e-05, + "loss": 0.0684, "step": 1660 }, { - "epoch": 0.495990495990496, - "grad_norm": 0.47073495388031006, - "learning_rate": 2.7024057024057027e-05, - "loss": 0.0806, + "epoch": 0.8430085815244825, + "grad_norm": 1.176849126815796, + "learning_rate": 2.4941948510853106e-05, + "loss": 0.065, "step": 1670 }, { - "epoch": 0.498960498960499, - "grad_norm": 0.4234858453273773, - "learning_rate": 2.7006237006237005e-05, - "loss": 0.0796, + "epoch": 0.8480565371024735, + "grad_norm": 0.7623859643936157, + "learning_rate": 2.491166077738516e-05, + "loss": 0.0676, "step": 1680 }, { - "epoch": 0.5019305019305019, - "grad_norm": 0.7585604190826416, - "learning_rate": 2.6988416988416988e-05, - "loss": 0.0922, + "epoch": 0.8531044926804644, + "grad_norm": 0.8817411065101624, + "learning_rate": 2.4881373043917216e-05, + "loss": 0.0712, "step": 1690 }, { - "epoch": 0.5049005049005049, - "grad_norm": 0.5006585717201233, - "learning_rate": 2.697059697059697e-05, - "loss": 0.0826, + "epoch": 0.8581524482584554, + "grad_norm": 0.7471240162849426, + "learning_rate": 2.485108531044927e-05, + "loss": 0.0719, "step": 1700 }, { - "epoch": 0.5078705078705079, - "grad_norm": 0.6841594576835632, - "learning_rate": 2.6952776952776952e-05, - "loss": 0.0879, + "epoch": 0.8632004038364463, + "grad_norm": 0.9217013120651245, + "learning_rate": 2.4820797576981323e-05, + "loss": 0.0758, "step": 1710 }, { - "epoch": 0.5108405108405109, - "grad_norm": 0.6505159139633179, - "learning_rate": 2.6934956934956937e-05, - "loss": 0.097, + "epoch": 0.8682483594144371, + "grad_norm": 0.4985320568084717, + "learning_rate": 2.479050984351338e-05, + "loss": 0.075, "step": 1720 }, { - "epoch": 0.5138105138105138, - "grad_norm": 0.48233747482299805, - "learning_rate": 2.691713691713692e-05, - "loss": 0.079, + "epoch": 0.8732963149924281, + "grad_norm": 0.47823965549468994, + "learning_rate": 2.476022211004543e-05, + "loss": 0.0576, "step": 1730 }, { - "epoch": 0.5167805167805167, - "grad_norm": 0.5792484879493713, - "learning_rate": 2.6899316899316902e-05, - "loss": 0.0847, + "epoch": 0.878344270570419, + "grad_norm": 0.5073914527893066, + "learning_rate": 2.4729934376577487e-05, + "loss": 0.0619, "step": 1740 }, { - "epoch": 0.5197505197505198, - "grad_norm": 0.6649707555770874, - "learning_rate": 2.688149688149688e-05, - "loss": 0.0584, + "epoch": 0.8833922261484098, + "grad_norm": 0.6744971871376038, + "learning_rate": 2.469964664310954e-05, + "loss": 0.0674, "step": 1750 }, { - "epoch": 0.5227205227205227, - "grad_norm": 0.6543247699737549, - "learning_rate": 2.6863676863676863e-05, - "loss": 0.0558, + "epoch": 0.8884401817264008, + "grad_norm": 0.7287705540657043, + "learning_rate": 2.4669358909641594e-05, + "loss": 0.0705, "step": 1760 }, { - "epoch": 0.5256905256905257, - "grad_norm": 0.6927476525306702, - "learning_rate": 2.6845856845856845e-05, - "loss": 0.0828, + "epoch": 0.8934881373043917, + "grad_norm": 0.6387834548950195, + "learning_rate": 2.463907117617365e-05, + "loss": 0.0736, "step": 1770 }, { - "epoch": 0.5286605286605287, - "grad_norm": 0.9066148996353149, - "learning_rate": 2.6828036828036827e-05, - "loss": 0.0827, + "epoch": 0.8985360928823827, + "grad_norm": 0.8428398370742798, + "learning_rate": 2.4608783442705704e-05, + "loss": 0.0741, "step": 1780 }, { - "epoch": 0.5316305316305316, - "grad_norm": 0.6122345924377441, - "learning_rate": 2.6810216810216813e-05, - "loss": 0.0831, + "epoch": 0.9035840484603735, + "grad_norm": 0.6455987691879272, + "learning_rate": 2.4578495709237758e-05, + "loss": 0.0639, "step": 1790 }, { - "epoch": 0.5346005346005346, - "grad_norm": 0.5523887872695923, - "learning_rate": 2.6792396792396795e-05, - "loss": 0.0925, + "epoch": 0.9086320040383644, + "grad_norm": 0.6735292673110962, + "learning_rate": 2.4548207975769815e-05, + "loss": 0.0795, "step": 1800 }, { - "epoch": 0.5375705375705375, - "grad_norm": 0.9167420268058777, - "learning_rate": 2.6774576774576777e-05, - "loss": 0.0756, + "epoch": 0.9136799596163554, + "grad_norm": 0.6157563924789429, + "learning_rate": 2.4517920242301868e-05, + "loss": 0.0699, "step": 1810 }, { - "epoch": 0.5405405405405406, - "grad_norm": 0.4656206965446472, - "learning_rate": 2.6756756756756756e-05, - "loss": 0.0827, + "epoch": 0.9187279151943463, + "grad_norm": 0.7483514547348022, + "learning_rate": 2.4487632508833925e-05, + "loss": 0.0681, "step": 1820 }, { - "epoch": 0.5435105435105435, - "grad_norm": 0.49738115072250366, - "learning_rate": 2.6738936738936738e-05, - "loss": 0.0996, + "epoch": 0.9237758707723372, + "grad_norm": 0.5686767101287842, + "learning_rate": 2.4457344775365978e-05, + "loss": 0.0713, "step": 1830 }, { - "epoch": 0.5464805464805464, - "grad_norm": 0.7212559580802917, - "learning_rate": 2.672111672111672e-05, - "loss": 0.0791, + "epoch": 0.9288238263503281, + "grad_norm": 0.352909654378891, + "learning_rate": 2.4427057041898032e-05, + "loss": 0.0641, "step": 1840 }, { - "epoch": 0.5494505494505495, - "grad_norm": 0.6626265645027161, - "learning_rate": 2.6703296703296702e-05, - "loss": 0.091, + "epoch": 0.933871781928319, + "grad_norm": 0.6095912456512451, + "learning_rate": 2.439676930843009e-05, + "loss": 0.0794, "step": 1850 }, { - "epoch": 0.5524205524205524, - "grad_norm": 0.38933899998664856, - "learning_rate": 2.6685476685476688e-05, - "loss": 0.0788, + "epoch": 0.93891973750631, + "grad_norm": 0.3929665684700012, + "learning_rate": 2.436648157496214e-05, + "loss": 0.0672, "step": 1860 }, { - "epoch": 0.5553905553905554, - "grad_norm": 0.41860514879226685, - "learning_rate": 2.666765666765667e-05, - "loss": 0.1031, + "epoch": 0.9439676930843008, + "grad_norm": 0.22026501595973969, + "learning_rate": 2.4336193841494195e-05, + "loss": 0.0699, "step": 1870 }, { - "epoch": 0.5583605583605583, - "grad_norm": 0.5364987850189209, - "learning_rate": 2.6649836649836652e-05, - "loss": 0.0891, + "epoch": 0.9490156486622918, + "grad_norm": 0.5952547788619995, + "learning_rate": 2.430590610802625e-05, + "loss": 0.0733, "step": 1880 }, { - "epoch": 0.5613305613305614, - "grad_norm": 0.28089386224746704, - "learning_rate": 2.663201663201663e-05, - "loss": 0.0921, + "epoch": 0.9540636042402827, + "grad_norm": 0.7297592163085938, + "learning_rate": 2.4275618374558302e-05, + "loss": 0.0725, "step": 1890 }, { - "epoch": 0.5643005643005643, - "grad_norm": 0.6708937287330627, - "learning_rate": 2.6614196614196613e-05, - "loss": 0.0876, + "epoch": 0.9591115598182736, + "grad_norm": 0.35177797079086304, + "learning_rate": 2.424533064109036e-05, + "loss": 0.0651, "step": 1900 }, { - "epoch": 0.5672705672705672, - "grad_norm": 0.49499982595443726, - "learning_rate": 2.6596376596376595e-05, - "loss": 0.0889, + "epoch": 0.9641595153962645, + "grad_norm": 0.6706666350364685, + "learning_rate": 2.4215042907622413e-05, + "loss": 0.0737, "step": 1910 }, { - "epoch": 0.5702405702405703, - "grad_norm": 0.5181038975715637, - "learning_rate": 2.6578556578556577e-05, - "loss": 0.0687, + "epoch": 0.9692074709742554, + "grad_norm": 0.7155650854110718, + "learning_rate": 2.418475517415447e-05, + "loss": 0.074, "step": 1920 }, { - "epoch": 0.5732105732105732, - "grad_norm": 0.4590006172657013, - "learning_rate": 2.6560736560736563e-05, - "loss": 0.0843, + "epoch": 0.9742554265522464, + "grad_norm": 0.5200046300888062, + "learning_rate": 2.4154467440686523e-05, + "loss": 0.0706, "step": 1930 }, { - "epoch": 0.5761805761805762, - "grad_norm": 0.542353630065918, - "learning_rate": 2.6542916542916545e-05, - "loss": 0.0807, + "epoch": 0.9793033821302373, + "grad_norm": 0.46796679496765137, + "learning_rate": 2.4124179707218576e-05, + "loss": 0.0592, "step": 1940 }, { - "epoch": 0.5791505791505791, - "grad_norm": 0.4152495861053467, - "learning_rate": 2.6525096525096527e-05, - "loss": 0.0833, + "epoch": 0.9843513377082281, + "grad_norm": 0.5713896751403809, + "learning_rate": 2.4093891973750633e-05, + "loss": 0.0586, "step": 1950 }, { - "epoch": 0.5821205821205822, - "grad_norm": 0.4847126603126526, - "learning_rate": 2.6507276507276506e-05, - "loss": 0.0844, + "epoch": 0.9893992932862191, + "grad_norm": 0.9147453308105469, + "learning_rate": 2.4063604240282687e-05, + "loss": 0.0848, "step": 1960 }, { - "epoch": 0.5850905850905851, - "grad_norm": 0.5619663596153259, - "learning_rate": 2.648945648945649e-05, - "loss": 0.0768, + "epoch": 0.99444724886421, + "grad_norm": 1.1067036390304565, + "learning_rate": 2.4033316506814744e-05, + "loss": 0.07, "step": 1970 }, { - "epoch": 0.588060588060588, - "grad_norm": 0.6558105945587158, - "learning_rate": 2.647163647163647e-05, - "loss": 0.0754, + "epoch": 0.9994952044422009, + "grad_norm": 0.5658775568008423, + "learning_rate": 2.4003028773346797e-05, + "loss": 0.0594, "step": 1980 }, { - "epoch": 0.5910305910305911, - "grad_norm": 0.9754857420921326, - "learning_rate": 2.6453816453816453e-05, - "loss": 0.0868, + "epoch": 1.0, + "eval_f1": 0.9705180789481339, + "eval_loss": 0.04397369921207428, + "eval_runtime": 594.1594, + "eval_samples_per_second": 347.149, + "eval_steps_per_second": 2.713, + "step": 1981 + }, + { + "epoch": 1.0045431600201917, + "grad_norm": 0.6783074736595154, + "learning_rate": 2.3972741039878847e-05, + "loss": 0.0783, "step": 1990 }, { - "epoch": 0.594000594000594, - "grad_norm": 0.4641966223716736, - "learning_rate": 2.6435996435996438e-05, - "loss": 0.0929, + "epoch": 1.0095911155981827, + "grad_norm": 0.5741100311279297, + "learning_rate": 2.3942453306410904e-05, + "loss": 0.0612, "step": 2000 }, { - "epoch": 0.596970596970597, - "grad_norm": 0.46997398138046265, - "learning_rate": 2.641817641817642e-05, - "loss": 0.0822, + "epoch": 1.0146390711761737, + "grad_norm": 0.8516017198562622, + "learning_rate": 2.3912165572942957e-05, + "loss": 0.0654, "step": 2010 }, { - "epoch": 0.5999405999405999, - "grad_norm": 0.6096898913383484, - "learning_rate": 2.6400356400356403e-05, - "loss": 0.0871, + "epoch": 1.0196870267541647, + "grad_norm": 0.48648303747177124, + "learning_rate": 2.3881877839475014e-05, + "loss": 0.0659, "step": 2020 }, { - "epoch": 0.6029106029106029, - "grad_norm": 0.4723495543003082, - "learning_rate": 2.638253638253638e-05, - "loss": 0.0767, + "epoch": 1.0247349823321554, + "grad_norm": 0.48170068860054016, + "learning_rate": 2.3851590106007068e-05, + "loss": 0.0687, "step": 2030 }, { - "epoch": 0.6058806058806059, - "grad_norm": 0.5081328749656677, - "learning_rate": 2.6364716364716364e-05, - "loss": 0.0728, + "epoch": 1.0297829379101464, + "grad_norm": 0.8060422539710999, + "learning_rate": 2.382130237253912e-05, + "loss": 0.0741, "step": 2040 }, { - "epoch": 0.6088506088506088, - "grad_norm": 0.5929988026618958, - "learning_rate": 2.6346896346896346e-05, - "loss": 0.0777, + "epoch": 1.0348308934881374, + "grad_norm": 0.3721982538700104, + "learning_rate": 2.3791014639071178e-05, + "loss": 0.0643, "step": 2050 }, { - "epoch": 0.6118206118206119, - "grad_norm": 0.5095152854919434, - "learning_rate": 2.6329076329076328e-05, - "loss": 0.084, + "epoch": 1.0398788490661282, + "grad_norm": 0.9289938807487488, + "learning_rate": 2.376072690560323e-05, + "loss": 0.0678, "step": 2060 }, { - "epoch": 0.6147906147906148, - "grad_norm": 0.47717463970184326, - "learning_rate": 2.6311256311256313e-05, - "loss": 0.0724, + "epoch": 1.0449268046441191, + "grad_norm": 0.7339480519294739, + "learning_rate": 2.3730439172135288e-05, + "loss": 0.065, "step": 2070 }, { - "epoch": 0.6177606177606177, - "grad_norm": 0.3432537615299225, - "learning_rate": 2.6293436293436296e-05, - "loss": 0.0727, + "epoch": 1.0499747602221101, + "grad_norm": 0.5676091313362122, + "learning_rate": 2.370015143866734e-05, + "loss": 0.0665, "step": 2080 }, { - "epoch": 0.6207306207306207, - "grad_norm": 0.6386498212814331, - "learning_rate": 2.6275616275616278e-05, - "loss": 0.0814, + "epoch": 1.0550227158001009, + "grad_norm": 1.0972354412078857, + "learning_rate": 2.3669863705199395e-05, + "loss": 0.0664, "step": 2090 }, { - "epoch": 0.6237006237006237, - "grad_norm": 0.5590204000473022, - "learning_rate": 2.625779625779626e-05, - "loss": 0.0803, + "epoch": 1.0600706713780919, + "grad_norm": 1.11980402469635, + "learning_rate": 2.3639575971731452e-05, + "loss": 0.0742, "step": 2100 }, { - "epoch": 0.6266706266706267, - "grad_norm": 0.3727136552333832, - "learning_rate": 2.623997623997624e-05, - "loss": 0.0784, + "epoch": 1.0651186269560828, + "grad_norm": 0.6586318016052246, + "learning_rate": 2.3609288238263502e-05, + "loss": 0.0755, "step": 2110 }, { - "epoch": 0.6296406296406296, - "grad_norm": 0.9345456957817078, - "learning_rate": 2.622215622215622e-05, - "loss": 0.102, + "epoch": 1.0701665825340738, + "grad_norm": 0.6912874579429626, + "learning_rate": 2.3579000504795555e-05, + "loss": 0.0722, "step": 2120 }, { - "epoch": 0.6326106326106327, - "grad_norm": 0.6383994221687317, - "learning_rate": 2.6204336204336203e-05, - "loss": 0.0799, + "epoch": 1.0752145381120646, + "grad_norm": 0.5603944659233093, + "learning_rate": 2.3548712771327612e-05, + "loss": 0.0636, "step": 2130 }, { - "epoch": 0.6355806355806356, - "grad_norm": 0.6339811682701111, - "learning_rate": 2.618651618651619e-05, + "epoch": 1.0802624936900556, + "grad_norm": 0.7324510216712952, + "learning_rate": 2.3518425037859666e-05, "loss": 0.0697, "step": 2140 }, { - "epoch": 0.6385506385506385, - "grad_norm": 0.6489042639732361, - "learning_rate": 2.616869616869617e-05, - "loss": 0.0762, + "epoch": 1.0853104492680465, + "grad_norm": 0.6833095550537109, + "learning_rate": 2.3488137304391723e-05, + "loss": 0.0678, "step": 2150 }, { - "epoch": 0.6415206415206415, - "grad_norm": 0.43688729405403137, - "learning_rate": 2.6150876150876153e-05, - "loss": 0.0856, + "epoch": 1.0903584048460373, + "grad_norm": 0.49107661843299866, + "learning_rate": 2.3457849570923776e-05, + "loss": 0.0608, "step": 2160 }, { - "epoch": 0.6444906444906445, - "grad_norm": 0.5854159593582153, - "learning_rate": 2.6133056133056135e-05, - "loss": 0.0923, + "epoch": 1.0954063604240283, + "grad_norm": 0.541980504989624, + "learning_rate": 2.342756183745583e-05, + "loss": 0.0645, "step": 2170 }, { - "epoch": 0.6474606474606475, - "grad_norm": 0.4497719407081604, - "learning_rate": 2.6115236115236114e-05, - "loss": 0.0863, + "epoch": 1.1004543160020193, + "grad_norm": 0.487343966960907, + "learning_rate": 2.3397274103987886e-05, + "loss": 0.0573, "step": 2180 }, { - "epoch": 0.6504306504306504, - "grad_norm": 0.39971357583999634, - "learning_rate": 2.6097416097416096e-05, - "loss": 0.0693, + "epoch": 1.10550227158001, + "grad_norm": 0.3503382205963135, + "learning_rate": 2.336698637051994e-05, + "loss": 0.0753, "step": 2190 }, { - "epoch": 0.6534006534006535, - "grad_norm": 0.6880261301994324, - "learning_rate": 2.6079596079596078e-05, - "loss": 0.0861, + "epoch": 1.110550227158001, + "grad_norm": 0.750566840171814, + "learning_rate": 2.3336698637051997e-05, + "loss": 0.0703, "step": 2200 }, { - "epoch": 0.6563706563706564, - "grad_norm": 0.39452025294303894, - "learning_rate": 2.6061776061776064e-05, - "loss": 0.0666, + "epoch": 1.115598182735992, + "grad_norm": 1.1437385082244873, + "learning_rate": 2.330641090358405e-05, + "loss": 0.0706, "step": 2210 }, { - "epoch": 0.6593406593406593, - "grad_norm": 0.4145357310771942, - "learning_rate": 2.6043956043956046e-05, - "loss": 0.07, + "epoch": 1.1206461383139827, + "grad_norm": 0.4508492648601532, + "learning_rate": 2.3276123170116103e-05, + "loss": 0.064, "step": 2220 }, { - "epoch": 0.6623106623106623, - "grad_norm": 0.6330484747886658, - "learning_rate": 2.6026136026136028e-05, - "loss": 0.084, + "epoch": 1.1256940938919737, + "grad_norm": 1.0053447484970093, + "learning_rate": 2.324583543664816e-05, + "loss": 0.0595, "step": 2230 }, { - "epoch": 0.6652806652806653, - "grad_norm": 0.5894971489906311, - "learning_rate": 2.600831600831601e-05, - "loss": 0.0925, + "epoch": 1.1307420494699647, + "grad_norm": 0.5974487662315369, + "learning_rate": 2.321554770318021e-05, + "loss": 0.0613, "step": 2240 }, { - "epoch": 0.6682506682506683, - "grad_norm": 0.3733588457107544, - "learning_rate": 2.599049599049599e-05, - "loss": 0.082, + "epoch": 1.1357900050479555, + "grad_norm": 0.48302361369132996, + "learning_rate": 2.3185259969712267e-05, + "loss": 0.0553, "step": 2250 }, { - "epoch": 0.6712206712206712, - "grad_norm": 0.45527949929237366, - "learning_rate": 2.597267597267597e-05, - "loss": 0.0769, + "epoch": 1.1408379606259464, + "grad_norm": 0.7124462127685547, + "learning_rate": 2.315497223624432e-05, + "loss": 0.0628, "step": 2260 }, { - "epoch": 0.6741906741906742, - "grad_norm": 0.6295212507247925, - "learning_rate": 2.5954855954855953e-05, - "loss": 0.0798, + "epoch": 1.1458859162039374, + "grad_norm": 0.8712441921234131, + "learning_rate": 2.3124684502776374e-05, + "loss": 0.066, "step": 2270 }, { - "epoch": 0.6771606771606772, - "grad_norm": 0.4148741066455841, - "learning_rate": 2.593703593703594e-05, - "loss": 0.0702, + "epoch": 1.1509338717819284, + "grad_norm": 0.7473580241203308, + "learning_rate": 2.309439676930843e-05, + "loss": 0.0687, "step": 2280 }, { - "epoch": 0.6801306801306801, - "grad_norm": 0.4446201026439667, - "learning_rate": 2.591921591921592e-05, - "loss": 0.081, + "epoch": 1.1559818273599192, + "grad_norm": 0.8231186866760254, + "learning_rate": 2.3064109035840484e-05, + "loss": 0.0686, "step": 2290 }, { - "epoch": 0.6831006831006831, - "grad_norm": 0.5348713397979736, - "learning_rate": 2.5901395901395903e-05, - "loss": 0.0804, + "epoch": 1.1610297829379101, + "grad_norm": 0.5205137729644775, + "learning_rate": 2.303382130237254e-05, + "loss": 0.0668, "step": 2300 }, { - "epoch": 0.6860706860706861, - "grad_norm": 0.7064197659492493, - "learning_rate": 2.5883575883575886e-05, - "loss": 0.0766, + "epoch": 1.1660777385159011, + "grad_norm": 0.5173012614250183, + "learning_rate": 2.3003533568904595e-05, + "loss": 0.0664, "step": 2310 }, { - "epoch": 0.689040689040689, - "grad_norm": 0.5868175029754639, - "learning_rate": 2.5865755865755864e-05, - "loss": 0.0797, + "epoch": 1.171125694093892, + "grad_norm": 0.6976504325866699, + "learning_rate": 2.2973245835436648e-05, + "loss": 0.067, "step": 2320 }, { - "epoch": 0.692010692010692, - "grad_norm": 0.6839095950126648, - "learning_rate": 2.5847935847935846e-05, - "loss": 0.0794, + "epoch": 1.1761736496718829, + "grad_norm": 0.7795687317848206, + "learning_rate": 2.2942958101968705e-05, + "loss": 0.0591, "step": 2330 }, { - "epoch": 0.694980694980695, - "grad_norm": 0.41192343831062317, - "learning_rate": 2.583011583011583e-05, - "loss": 0.0706, + "epoch": 1.1812216052498739, + "grad_norm": 0.35292479395866394, + "learning_rate": 2.291267036850076e-05, + "loss": 0.0721, "step": 2340 }, { - "epoch": 0.697950697950698, - "grad_norm": 0.7668315768241882, - "learning_rate": 2.5812295812295814e-05, - "loss": 0.0785, + "epoch": 1.1862695608278648, + "grad_norm": 1.548770546913147, + "learning_rate": 2.2882382635032815e-05, + "loss": 0.0608, "step": 2350 }, { - "epoch": 0.7009207009207009, - "grad_norm": 0.43974947929382324, - "learning_rate": 2.5794475794475796e-05, - "loss": 0.0712, + "epoch": 1.1913175164058556, + "grad_norm": 0.521295964717865, + "learning_rate": 2.285209490156487e-05, + "loss": 0.0735, "step": 2360 }, { - "epoch": 0.7038907038907039, - "grad_norm": 0.3848420977592468, - "learning_rate": 2.577665577665578e-05, - "loss": 0.077, + "epoch": 1.1963654719838466, + "grad_norm": 0.6001691818237305, + "learning_rate": 2.282180716809692e-05, + "loss": 0.0646, "step": 2370 }, { - "epoch": 0.7068607068607069, - "grad_norm": 0.6403735280036926, - "learning_rate": 2.575883575883576e-05, - "loss": 0.0729, + "epoch": 1.2014134275618376, + "grad_norm": 0.9061608910560608, + "learning_rate": 2.2791519434628976e-05, + "loss": 0.0598, "step": 2380 }, { - "epoch": 0.7098307098307098, - "grad_norm": 0.5417028665542603, - "learning_rate": 2.574101574101574e-05, - "loss": 0.0834, + "epoch": 1.2064613831398283, + "grad_norm": 0.6509453654289246, + "learning_rate": 2.276123170116103e-05, + "loss": 0.0591, "step": 2390 }, { - "epoch": 0.7128007128007128, - "grad_norm": 0.9361075162887573, - "learning_rate": 2.572319572319572e-05, - "loss": 0.077, + "epoch": 1.2115093387178193, + "grad_norm": 0.4685826301574707, + "learning_rate": 2.2730943967693086e-05, + "loss": 0.0675, "step": 2400 }, { - "epoch": 0.7157707157707157, - "grad_norm": 0.483093798160553, - "learning_rate": 2.5705375705375707e-05, - "loss": 0.088, + "epoch": 1.2165572942958103, + "grad_norm": 0.4527621865272522, + "learning_rate": 2.270065623422514e-05, + "loss": 0.0635, "step": 2410 }, { - "epoch": 0.7187407187407188, - "grad_norm": 0.4506361782550812, - "learning_rate": 2.568755568755569e-05, - "loss": 0.0919, + "epoch": 1.221605249873801, + "grad_norm": 0.46990010142326355, + "learning_rate": 2.2670368500757193e-05, + "loss": 0.0609, "step": 2420 }, { - "epoch": 0.7217107217107217, - "grad_norm": 0.6593904495239258, - "learning_rate": 2.566973566973567e-05, - "loss": 0.087, + "epoch": 1.226653205451792, + "grad_norm": 0.7978981137275696, + "learning_rate": 2.264008076728925e-05, + "loss": 0.0682, "step": 2430 }, { - "epoch": 0.7246807246807246, - "grad_norm": 0.5274522304534912, - "learning_rate": 2.5651915651915654e-05, - "loss": 0.0768, + "epoch": 1.231701161029783, + "grad_norm": 0.5001055598258972, + "learning_rate": 2.2609793033821303e-05, + "loss": 0.0657, "step": 2440 }, { - "epoch": 0.7276507276507277, - "grad_norm": 0.5065791606903076, - "learning_rate": 2.5634095634095636e-05, - "loss": 0.0828, + "epoch": 1.2367491166077738, + "grad_norm": 0.7271714806556702, + "learning_rate": 2.2579505300353356e-05, + "loss": 0.0627, "step": 2450 }, { - "epoch": 0.7306207306207306, - "grad_norm": 0.6130974888801575, - "learning_rate": 2.5616275616275615e-05, - "loss": 0.0742, + "epoch": 1.2417970721857647, + "grad_norm": 0.3601450026035309, + "learning_rate": 2.2549217566885413e-05, + "loss": 0.0649, "step": 2460 }, { - "epoch": 0.7335907335907336, - "grad_norm": 0.6379355192184448, - "learning_rate": 2.5598455598455597e-05, - "loss": 0.0847, + "epoch": 1.2468450277637557, + "grad_norm": 0.6351629495620728, + "learning_rate": 2.2518929833417467e-05, + "loss": 0.0619, "step": 2470 }, { - "epoch": 0.7365607365607365, - "grad_norm": 0.6738227009773254, - "learning_rate": 2.5580635580635582e-05, - "loss": 0.0793, + "epoch": 1.2518929833417465, + "grad_norm": 0.8523517847061157, + "learning_rate": 2.2488642099949524e-05, + "loss": 0.078, "step": 2480 }, { - "epoch": 0.7395307395307396, - "grad_norm": 0.6309618949890137, - "learning_rate": 2.5562815562815565e-05, - "loss": 0.0871, + "epoch": 1.2569409389197375, + "grad_norm": 1.0878459215164185, + "learning_rate": 2.2458354366481577e-05, + "loss": 0.0636, "step": 2490 }, { - "epoch": 0.7425007425007425, - "grad_norm": 0.2825660705566406, - "learning_rate": 2.5544995544995547e-05, - "loss": 0.074, + "epoch": 1.2619888944977284, + "grad_norm": 0.6811727285385132, + "learning_rate": 2.2428066633013627e-05, + "loss": 0.0703, "step": 2500 }, { - "epoch": 0.7454707454707454, - "grad_norm": 0.43583425879478455, - "learning_rate": 2.552717552717553e-05, - "loss": 0.0858, + "epoch": 1.2670368500757192, + "grad_norm": 0.6043427586555481, + "learning_rate": 2.2397778899545684e-05, + "loss": 0.0587, "step": 2510 }, { - "epoch": 0.7484407484407485, - "grad_norm": 0.7557492256164551, - "learning_rate": 2.550935550935551e-05, - "loss": 0.0691, + "epoch": 1.2720848056537102, + "grad_norm": 0.6673144102096558, + "learning_rate": 2.2367491166077737e-05, + "loss": 0.0675, "step": 2520 }, { - "epoch": 0.7514107514107514, - "grad_norm": 0.44126811623573303, - "learning_rate": 2.549153549153549e-05, - "loss": 0.0664, + "epoch": 1.2771327612317012, + "grad_norm": 0.3510701358318329, + "learning_rate": 2.2337203432609794e-05, + "loss": 0.069, "step": 2530 }, { - "epoch": 0.7543807543807544, - "grad_norm": 0.5966764092445374, - "learning_rate": 2.5473715473715472e-05, - "loss": 0.0766, + "epoch": 1.2821807168096921, + "grad_norm": 0.302438884973526, + "learning_rate": 2.2306915699141848e-05, + "loss": 0.0609, "step": 2540 }, { - "epoch": 0.7573507573507573, - "grad_norm": 0.4621107578277588, - "learning_rate": 2.5455895455895458e-05, - "loss": 0.0834, + "epoch": 1.2872286723876831, + "grad_norm": 0.8073706030845642, + "learning_rate": 2.22766279656739e-05, + "loss": 0.076, "step": 2550 }, { - "epoch": 0.7603207603207603, - "grad_norm": 0.593605637550354, - "learning_rate": 2.543807543807544e-05, - "loss": 0.0812, + "epoch": 1.2922766279656739, + "grad_norm": 0.7314086556434631, + "learning_rate": 2.2246340232205958e-05, + "loss": 0.0676, "step": 2560 }, { - "epoch": 0.7632907632907633, - "grad_norm": 0.8139130473136902, - "learning_rate": 2.5420255420255422e-05, - "loss": 0.0663, + "epoch": 1.2973245835436649, + "grad_norm": 0.6998431086540222, + "learning_rate": 2.221605249873801e-05, + "loss": 0.0594, "step": 2570 }, { - "epoch": 0.7662607662607662, - "grad_norm": 0.4853007197380066, - "learning_rate": 2.5402435402435404e-05, - "loss": 0.0789, + "epoch": 1.3023725391216558, + "grad_norm": 0.9340649843215942, + "learning_rate": 2.2185764765270068e-05, + "loss": 0.0601, "step": 2580 }, { - "epoch": 0.7692307692307693, - "grad_norm": 0.4105505645275116, - "learning_rate": 2.5384615384615386e-05, - "loss": 0.0702, + "epoch": 1.3074204946996466, + "grad_norm": 0.5486651062965393, + "learning_rate": 2.215547703180212e-05, + "loss": 0.0752, "step": 2590 }, { - "epoch": 0.7722007722007722, - "grad_norm": 0.5971934795379639, - "learning_rate": 2.5366795366795365e-05, - "loss": 0.0847, + "epoch": 1.3124684502776376, + "grad_norm": 0.3997117280960083, + "learning_rate": 2.2125189298334175e-05, + "loss": 0.0669, "step": 2600 }, { - "epoch": 0.7751707751707752, - "grad_norm": 0.34833744168281555, - "learning_rate": 2.5348975348975347e-05, - "loss": 0.064, + "epoch": 1.3175164058556286, + "grad_norm": 0.6159607172012329, + "learning_rate": 2.2094901564866232e-05, + "loss": 0.0646, "step": 2610 }, { - "epoch": 0.7781407781407781, - "grad_norm": 0.35726526379585266, - "learning_rate": 2.5331155331155333e-05, - "loss": 0.0758, + "epoch": 1.3225643614336193, + "grad_norm": 1.0720511674880981, + "learning_rate": 2.2064613831398285e-05, + "loss": 0.0697, "step": 2620 }, { - "epoch": 0.7811107811107811, - "grad_norm": 0.4475048780441284, - "learning_rate": 2.5313335313335315e-05, - "loss": 0.0768, + "epoch": 1.3276123170116103, + "grad_norm": 0.6496064066886902, + "learning_rate": 2.203432609793034e-05, + "loss": 0.0642, "step": 2630 }, { - "epoch": 0.7840807840807841, - "grad_norm": 0.48018935322761536, - "learning_rate": 2.5295515295515297e-05, - "loss": 0.0723, + "epoch": 1.3326602725896013, + "grad_norm": 0.5649464726448059, + "learning_rate": 2.2004038364462392e-05, + "loss": 0.0596, "step": 2640 }, { - "epoch": 0.787050787050787, - "grad_norm": 0.47765350341796875, - "learning_rate": 2.527769527769528e-05, - "loss": 0.0765, + "epoch": 1.337708228167592, + "grad_norm": 0.5532758235931396, + "learning_rate": 2.1973750630994446e-05, + "loss": 0.0651, "step": 2650 }, { - "epoch": 0.7900207900207901, - "grad_norm": 0.6376664638519287, - "learning_rate": 2.525987525987526e-05, - "loss": 0.0777, + "epoch": 1.342756183745583, + "grad_norm": 0.4955766797065735, + "learning_rate": 2.1943462897526503e-05, + "loss": 0.0661, "step": 2660 }, { - "epoch": 0.792990792990793, - "grad_norm": 0.7332932353019714, - "learning_rate": 2.524205524205524e-05, - "loss": 0.0963, + "epoch": 1.347804139323574, + "grad_norm": 0.5403378009796143, + "learning_rate": 2.1913175164058556e-05, + "loss": 0.068, "step": 2670 }, { - "epoch": 0.7959607959607959, - "grad_norm": 0.6165478825569153, - "learning_rate": 2.5224235224235222e-05, - "loss": 0.0827, + "epoch": 1.3528520949015648, + "grad_norm": 0.8987810015678406, + "learning_rate": 2.1882887430590613e-05, + "loss": 0.0551, "step": 2680 }, { - "epoch": 0.7989307989307989, - "grad_norm": 0.693350613117218, - "learning_rate": 2.5206415206415208e-05, - "loss": 0.0752, + "epoch": 1.3579000504795558, + "grad_norm": 0.5531570911407471, + "learning_rate": 2.1852599697122666e-05, + "loss": 0.0554, "step": 2690 }, { - "epoch": 0.8019008019008019, - "grad_norm": 0.5711894035339355, - "learning_rate": 2.518859518859519e-05, - "loss": 0.0699, + "epoch": 1.3629480060575467, + "grad_norm": 0.8810332417488098, + "learning_rate": 2.182231196365472e-05, + "loss": 0.0683, "step": 2700 }, { - "epoch": 0.8048708048708049, - "grad_norm": 0.6042230725288391, - "learning_rate": 2.5170775170775172e-05, - "loss": 0.0681, + "epoch": 1.3679959616355375, + "grad_norm": 0.8977289199829102, + "learning_rate": 2.1792024230186777e-05, + "loss": 0.0682, "step": 2710 }, { - "epoch": 0.8078408078408078, - "grad_norm": 0.43989643454551697, - "learning_rate": 2.5152955152955155e-05, - "loss": 0.0684, + "epoch": 1.3730439172135285, + "grad_norm": 0.6664491295814514, + "learning_rate": 2.176173649671883e-05, + "loss": 0.0652, "step": 2720 }, { - "epoch": 0.8108108108108109, - "grad_norm": 0.3606058359146118, - "learning_rate": 2.5135135135135137e-05, - "loss": 0.0793, + "epoch": 1.3780918727915195, + "grad_norm": 0.7725427150726318, + "learning_rate": 2.1731448763250883e-05, + "loss": 0.0693, "step": 2730 }, { - "epoch": 0.8137808137808138, - "grad_norm": 0.578762412071228, - "learning_rate": 2.511731511731512e-05, - "loss": 0.0703, + "epoch": 1.3831398283695102, + "grad_norm": 1.149824857711792, + "learning_rate": 2.170116102978294e-05, + "loss": 0.0697, "step": 2740 }, { - "epoch": 0.8167508167508167, - "grad_norm": 0.5686031579971313, - "learning_rate": 2.5099495099495098e-05, - "loss": 0.0851, + "epoch": 1.3881877839475012, + "grad_norm": 0.8231659531593323, + "learning_rate": 2.167087329631499e-05, + "loss": 0.0586, "step": 2750 }, { - "epoch": 0.8197208197208197, - "grad_norm": 0.5423585772514343, - "learning_rate": 2.5081675081675083e-05, - "loss": 0.0744, + "epoch": 1.3932357395254922, + "grad_norm": 0.5706813335418701, + "learning_rate": 2.1640585562847047e-05, + "loss": 0.0648, "step": 2760 }, { - "epoch": 0.8226908226908227, - "grad_norm": 0.6459795236587524, - "learning_rate": 2.5063855063855065e-05, - "loss": 0.0749, + "epoch": 1.3982836951034832, + "grad_norm": 0.4602285623550415, + "learning_rate": 2.16102978293791e-05, + "loss": 0.0642, "step": 2770 }, { - "epoch": 0.8256608256608257, - "grad_norm": 0.5151922106742859, - "learning_rate": 2.5046035046035048e-05, - "loss": 0.0838, + "epoch": 1.4033316506814741, + "grad_norm": 0.5022104978561401, + "learning_rate": 2.1580010095911154e-05, + "loss": 0.0582, "step": 2780 }, { - "epoch": 0.8286308286308286, - "grad_norm": 0.49044474959373474, - "learning_rate": 2.502821502821503e-05, - "loss": 0.081, + "epoch": 1.408379606259465, + "grad_norm": 0.3675612211227417, + "learning_rate": 2.154972236244321e-05, + "loss": 0.0685, "step": 2790 }, { - "epoch": 0.8316008316008316, - "grad_norm": 0.6159443855285645, - "learning_rate": 2.5010395010395012e-05, - "loss": 0.0814, + "epoch": 1.4134275618374559, + "grad_norm": 0.5692434906959534, + "learning_rate": 2.1519434628975264e-05, + "loss": 0.0625, "step": 2800 }, { - "epoch": 0.8345708345708346, - "grad_norm": 0.6860203146934509, - "learning_rate": 2.4992574992574994e-05, - "loss": 0.0731, + "epoch": 1.4184755174154469, + "grad_norm": 0.44433364272117615, + "learning_rate": 2.148914689550732e-05, + "loss": 0.0683, "step": 2810 }, { - "epoch": 0.8375408375408375, - "grad_norm": 0.43102753162384033, - "learning_rate": 2.4974754974754973e-05, - "loss": 0.0867, + "epoch": 1.4235234729934376, + "grad_norm": 0.5225184559822083, + "learning_rate": 2.1458859162039375e-05, + "loss": 0.0676, "step": 2820 }, { - "epoch": 0.8405108405108405, - "grad_norm": 0.6863781809806824, - "learning_rate": 2.495693495693496e-05, - "loss": 0.0681, + "epoch": 1.4285714285714286, + "grad_norm": 1.125475287437439, + "learning_rate": 2.1428571428571428e-05, + "loss": 0.0641, "step": 2830 }, { - "epoch": 0.8434808434808435, - "grad_norm": 0.6627882122993469, - "learning_rate": 2.493911493911494e-05, - "loss": 0.0692, + "epoch": 1.4336193841494196, + "grad_norm": 0.6783428192138672, + "learning_rate": 2.1398283695103485e-05, + "loss": 0.0735, "step": 2840 }, { - "epoch": 0.8464508464508465, - "grad_norm": 0.556719183921814, - "learning_rate": 2.4921294921294923e-05, - "loss": 0.0942, + "epoch": 1.4386673397274103, + "grad_norm": 0.6056823134422302, + "learning_rate": 2.136799596163554e-05, + "loss": 0.0607, "step": 2850 }, { - "epoch": 0.8494208494208494, - "grad_norm": 0.6097808480262756, - "learning_rate": 2.4903474903474905e-05, - "loss": 0.0788, + "epoch": 1.4437152953054013, + "grad_norm": 0.7588714361190796, + "learning_rate": 2.1337708228167595e-05, + "loss": 0.0638, "step": 2860 }, { - "epoch": 0.8523908523908524, - "grad_norm": 0.3771260976791382, - "learning_rate": 2.4885654885654887e-05, - "loss": 0.0872, + "epoch": 1.4487632508833923, + "grad_norm": 0.5353738069534302, + "learning_rate": 2.130742049469965e-05, + "loss": 0.0628, "step": 2870 }, { - "epoch": 0.8553608553608554, - "grad_norm": 0.2577713131904602, - "learning_rate": 2.486783486783487e-05, - "loss": 0.0849, + "epoch": 1.453811206461383, + "grad_norm": 0.3690322935581207, + "learning_rate": 2.12771327612317e-05, + "loss": 0.055, "step": 2880 }, { - "epoch": 0.8583308583308583, - "grad_norm": 0.6618907451629639, - "learning_rate": 2.4850014850014848e-05, - "loss": 0.0794, + "epoch": 1.458859162039374, + "grad_norm": 0.5556847453117371, + "learning_rate": 2.1246845027763756e-05, + "loss": 0.0672, "step": 2890 }, { - "epoch": 0.8613008613008613, - "grad_norm": 0.33715909719467163, - "learning_rate": 2.4832194832194834e-05, - "loss": 0.0689, + "epoch": 1.463907117617365, + "grad_norm": 0.5658410787582397, + "learning_rate": 2.121655729429581e-05, + "loss": 0.0634, "step": 2900 }, { - "epoch": 0.8642708642708643, - "grad_norm": 0.5500791072845459, - "learning_rate": 2.4814374814374816e-05, - "loss": 0.0862, + "epoch": 1.4689550731953558, + "grad_norm": 1.1000596284866333, + "learning_rate": 2.1186269560827866e-05, + "loss": 0.0648, "step": 2910 }, { - "epoch": 0.8672408672408672, - "grad_norm": 0.6228634119033813, - "learning_rate": 2.4796554796554798e-05, - "loss": 0.0769, + "epoch": 1.4740030287733468, + "grad_norm": 0.5739458799362183, + "learning_rate": 2.115598182735992e-05, + "loss": 0.0622, "step": 2920 }, { - "epoch": 0.8702108702108702, - "grad_norm": 0.8019270896911621, - "learning_rate": 2.477873477873478e-05, - "loss": 0.071, + "epoch": 1.4790509843513377, + "grad_norm": 0.9371837377548218, + "learning_rate": 2.1125694093891973e-05, + "loss": 0.067, "step": 2930 }, { - "epoch": 0.8731808731808732, - "grad_norm": 0.47143444418907166, - "learning_rate": 2.4760914760914762e-05, - "loss": 0.0812, + "epoch": 1.4840989399293285, + "grad_norm": 0.5997252464294434, + "learning_rate": 2.109540636042403e-05, + "loss": 0.0665, "step": 2940 }, { - "epoch": 0.8761508761508762, - "grad_norm": 0.47617822885513306, - "learning_rate": 2.4743094743094744e-05, - "loss": 0.0769, + "epoch": 1.4891468955073195, + "grad_norm": 0.6729413866996765, + "learning_rate": 2.1065118626956083e-05, + "loss": 0.0576, "step": 2950 }, { - "epoch": 0.8791208791208791, - "grad_norm": 0.6791771054267883, - "learning_rate": 2.4725274725274723e-05, - "loss": 0.0693, + "epoch": 1.4941948510853105, + "grad_norm": 0.796592652797699, + "learning_rate": 2.103483089348814e-05, + "loss": 0.0671, "step": 2960 }, { - "epoch": 0.882090882090882, - "grad_norm": 0.4986003339290619, - "learning_rate": 2.470745470745471e-05, - "loss": 0.0778, + "epoch": 1.4992428066633012, + "grad_norm": 0.7947612404823303, + "learning_rate": 2.1004543160020193e-05, + "loss": 0.0701, "step": 2970 }, { - "epoch": 0.8850608850608851, - "grad_norm": 0.351012647151947, - "learning_rate": 2.468963468963469e-05, - "loss": 0.073, + "epoch": 1.5042907622412924, + "grad_norm": 0.7790849208831787, + "learning_rate": 2.0974255426552247e-05, + "loss": 0.065, "step": 2980 }, { - "epoch": 0.888030888030888, - "grad_norm": 0.6079609394073486, - "learning_rate": 2.4671814671814673e-05, - "loss": 0.075, + "epoch": 1.5093387178192832, + "grad_norm": 0.5330706238746643, + "learning_rate": 2.0943967693084304e-05, + "loss": 0.0587, "step": 2990 }, { - "epoch": 0.891000891000891, - "grad_norm": 0.49167245626449585, - "learning_rate": 2.4653994653994655e-05, - "loss": 0.0745, + "epoch": 1.514386673397274, + "grad_norm": 1.0482598543167114, + "learning_rate": 2.0913679959616357e-05, + "loss": 0.0696, "step": 3000 }, { - "epoch": 0.893970893970894, - "grad_norm": 0.49965718388557434, - "learning_rate": 2.4636174636174637e-05, - "loss": 0.0861, + "epoch": 1.5194346289752652, + "grad_norm": 0.46928080916404724, + "learning_rate": 2.088339222614841e-05, + "loss": 0.0668, "step": 3010 }, { - "epoch": 0.896940896940897, - "grad_norm": 0.5942029356956482, - "learning_rate": 2.461835461835462e-05, - "loss": 0.0775, + "epoch": 1.524482584553256, + "grad_norm": 1.0525529384613037, + "learning_rate": 2.0853104492680464e-05, + "loss": 0.0664, "step": 3020 }, { - "epoch": 0.8999108999108999, - "grad_norm": 0.5431137084960938, - "learning_rate": 2.46005346005346e-05, - "loss": 0.0732, + "epoch": 1.529530540131247, + "grad_norm": 0.43941500782966614, + "learning_rate": 2.0822816759212517e-05, + "loss": 0.0642, "step": 3030 }, { - "epoch": 0.9028809028809028, - "grad_norm": 0.4982147514820099, - "learning_rate": 2.4582714582714584e-05, - "loss": 0.0749, + "epoch": 1.5345784957092379, + "grad_norm": 0.6985353231430054, + "learning_rate": 2.0792529025744574e-05, + "loss": 0.068, "step": 3040 }, { - "epoch": 0.9058509058509059, - "grad_norm": 0.6718347072601318, - "learning_rate": 2.4564894564894566e-05, - "loss": 0.0843, + "epoch": 1.5396264512872286, + "grad_norm": 0.6110888123512268, + "learning_rate": 2.0762241292276628e-05, + "loss": 0.0639, "step": 3050 }, { - "epoch": 0.9088209088209088, - "grad_norm": 0.7574843168258667, - "learning_rate": 2.454707454707455e-05, - "loss": 0.0769, + "epoch": 1.5446744068652196, + "grad_norm": 0.8250141739845276, + "learning_rate": 2.073195355880868e-05, + "loss": 0.0614, "step": 3060 }, { - "epoch": 0.9117909117909118, - "grad_norm": 0.5467488169670105, - "learning_rate": 2.452925452925453e-05, - "loss": 0.0802, + "epoch": 1.5497223624432106, + "grad_norm": 0.4882888197898865, + "learning_rate": 2.0701665825340738e-05, + "loss": 0.066, "step": 3070 }, { - "epoch": 0.9147609147609148, - "grad_norm": 0.4699064791202545, - "learning_rate": 2.4511434511434513e-05, - "loss": 0.0813, + "epoch": 1.5547703180212014, + "grad_norm": 0.38679155707359314, + "learning_rate": 2.067137809187279e-05, + "loss": 0.0684, "step": 3080 }, { - "epoch": 0.9177309177309178, - "grad_norm": 0.4939485788345337, - "learning_rate": 2.4493614493614495e-05, - "loss": 0.0763, + "epoch": 1.5598182735991923, + "grad_norm": 0.6574121117591858, + "learning_rate": 2.0641090358404848e-05, + "loss": 0.0666, "step": 3090 }, { - "epoch": 0.9207009207009207, - "grad_norm": 0.4790801405906677, - "learning_rate": 2.4475794475794474e-05, - "loss": 0.0765, + "epoch": 1.5648662291771833, + "grad_norm": 0.48571038246154785, + "learning_rate": 2.0610802624936902e-05, + "loss": 0.0646, "step": 3100 }, { - "epoch": 0.9236709236709236, - "grad_norm": 0.3700208365917206, - "learning_rate": 2.445797445797446e-05, - "loss": 0.0862, + "epoch": 1.569914184755174, + "grad_norm": 0.8285214304924011, + "learning_rate": 2.0580514891468955e-05, + "loss": 0.0634, "step": 3110 }, { - "epoch": 0.9266409266409267, - "grad_norm": 0.5105488300323486, - "learning_rate": 2.444015444015444e-05, - "loss": 0.0773, + "epoch": 1.574962140333165, + "grad_norm": 0.5619475245475769, + "learning_rate": 2.0550227158001012e-05, + "loss": 0.0665, "step": 3120 }, { - "epoch": 0.9296109296109296, - "grad_norm": 0.3455560803413391, - "learning_rate": 2.4422334422334424e-05, - "loss": 0.0716, + "epoch": 1.580010095911156, + "grad_norm": 0.47569337487220764, + "learning_rate": 2.0519939424533065e-05, + "loss": 0.0661, "step": 3130 }, { - "epoch": 0.9325809325809326, - "grad_norm": 0.5318461656570435, - "learning_rate": 2.4404514404514406e-05, - "loss": 0.079, + "epoch": 1.5850580514891468, + "grad_norm": 0.8858407139778137, + "learning_rate": 2.048965169106512e-05, + "loss": 0.0696, "step": 3140 }, { - "epoch": 0.9355509355509356, - "grad_norm": 0.42595726251602173, - "learning_rate": 2.4386694386694388e-05, - "loss": 0.0892, + "epoch": 1.5901060070671378, + "grad_norm": 0.5578007698059082, + "learning_rate": 2.0459363957597172e-05, + "loss": 0.0547, "step": 3150 }, { - "epoch": 0.9385209385209385, - "grad_norm": 0.651802659034729, - "learning_rate": 2.436887436887437e-05, - "loss": 0.0793, + "epoch": 1.5951539626451288, + "grad_norm": 0.6875492334365845, + "learning_rate": 2.0429076224129226e-05, + "loss": 0.0608, "step": 3160 }, { - "epoch": 0.9414909414909415, - "grad_norm": 0.6579793095588684, - "learning_rate": 2.435105435105435e-05, - "loss": 0.0661, + "epoch": 1.6002019182231195, + "grad_norm": 0.5009766221046448, + "learning_rate": 2.0398788490661283e-05, + "loss": 0.0684, "step": 3170 }, { - "epoch": 0.9444609444609444, - "grad_norm": 0.5980479717254639, - "learning_rate": 2.4333234333234334e-05, - "loss": 0.0757, + "epoch": 1.6052498738011105, + "grad_norm": 0.7467596530914307, + "learning_rate": 2.0368500757193336e-05, + "loss": 0.0654, "step": 3180 }, { - "epoch": 0.9474309474309475, - "grad_norm": 0.5788313746452332, - "learning_rate": 2.4315414315414317e-05, - "loss": 0.0921, + "epoch": 1.6102978293791015, + "grad_norm": 0.5688017010688782, + "learning_rate": 2.0338213023725393e-05, + "loss": 0.0594, "step": 3190 }, { - "epoch": 0.9504009504009504, - "grad_norm": 0.47703874111175537, - "learning_rate": 2.42975942975943e-05, - "loss": 0.0661, + "epoch": 1.6153457849570922, + "grad_norm": 0.9353786110877991, + "learning_rate": 2.0307925290257446e-05, + "loss": 0.0685, "step": 3200 }, { - "epoch": 0.9533709533709533, - "grad_norm": 0.5644926428794861, - "learning_rate": 2.427977427977428e-05, - "loss": 0.0706, + "epoch": 1.6203937405350834, + "grad_norm": 0.5310063362121582, + "learning_rate": 2.02776375567895e-05, + "loss": 0.0597, "step": 3210 }, { - "epoch": 0.9563409563409564, - "grad_norm": 0.6008754372596741, - "learning_rate": 2.4261954261954263e-05, - "loss": 0.0757, + "epoch": 1.6254416961130742, + "grad_norm": 1.107693076133728, + "learning_rate": 2.0247349823321557e-05, + "loss": 0.0722, "step": 3220 }, { - "epoch": 0.9593109593109593, - "grad_norm": 0.5607688426971436, - "learning_rate": 2.4244134244134245e-05, - "loss": 0.0718, + "epoch": 1.630489651691065, + "grad_norm": 0.688391923904419, + "learning_rate": 2.021706208985361e-05, + "loss": 0.0719, "step": 3230 }, { - "epoch": 0.9622809622809623, - "grad_norm": 0.7359547019004822, - "learning_rate": 2.4226314226314224e-05, - "loss": 0.083, + "epoch": 1.6355376072690562, + "grad_norm": 0.4255257546901703, + "learning_rate": 2.0186774356385667e-05, + "loss": 0.0638, "step": 3240 }, { - "epoch": 0.9652509652509652, - "grad_norm": 0.596994161605835, - "learning_rate": 2.420849420849421e-05, - "loss": 0.0717, + "epoch": 1.640585562847047, + "grad_norm": 0.6049216389656067, + "learning_rate": 2.015648662291772e-05, + "loss": 0.0555, "step": 3250 }, { - "epoch": 0.9682209682209683, - "grad_norm": 0.7237496972084045, - "learning_rate": 2.4190674190674192e-05, - "loss": 0.0767, + "epoch": 1.645633518425038, + "grad_norm": 0.6898351311683655, + "learning_rate": 2.012619888944977e-05, + "loss": 0.0599, "step": 3260 }, { - "epoch": 0.9711909711909712, - "grad_norm": 0.6103200316429138, - "learning_rate": 2.4172854172854174e-05, - "loss": 0.0738, + "epoch": 1.650681474003029, + "grad_norm": 0.6150475144386292, + "learning_rate": 2.0095911155981827e-05, + "loss": 0.0664, "step": 3270 }, { - "epoch": 0.9741609741609741, - "grad_norm": 0.7314611077308655, - "learning_rate": 2.4155034155034156e-05, - "loss": 0.0851, + "epoch": 1.6557294295810197, + "grad_norm": 0.5084889531135559, + "learning_rate": 2.006562342251388e-05, + "loss": 0.0574, "step": 3280 }, { - "epoch": 0.9771309771309772, - "grad_norm": 0.496187299489975, - "learning_rate": 2.4137214137214138e-05, - "loss": 0.0757, + "epoch": 1.6607773851590106, + "grad_norm": 0.9478010535240173, + "learning_rate": 2.0035335689045938e-05, + "loss": 0.0619, "step": 3290 }, { - "epoch": 0.9801009801009801, - "grad_norm": 0.5102724432945251, - "learning_rate": 2.411939411939412e-05, - "loss": 0.0705, + "epoch": 1.6658253407370016, + "grad_norm": 1.1725986003875732, + "learning_rate": 2.000504795557799e-05, + "loss": 0.0672, "step": 3300 }, { - "epoch": 0.9830709830709831, - "grad_norm": 0.43364787101745605, - "learning_rate": 2.4101574101574103e-05, - "loss": 0.0594, + "epoch": 1.6708732963149924, + "grad_norm": 0.8932427763938904, + "learning_rate": 1.9974760222110044e-05, + "loss": 0.0604, "step": 3310 }, { - "epoch": 0.986040986040986, - "grad_norm": 0.5329870581626892, - "learning_rate": 2.4083754083754085e-05, - "loss": 0.0757, + "epoch": 1.6759212518929834, + "grad_norm": 0.4670265316963196, + "learning_rate": 1.99444724886421e-05, + "loss": 0.0658, "step": 3320 }, { - "epoch": 0.989010989010989, - "grad_norm": 0.5290941596031189, - "learning_rate": 2.4065934065934067e-05, - "loss": 0.0798, + "epoch": 1.6809692074709743, + "grad_norm": 0.518844485282898, + "learning_rate": 1.9914184755174155e-05, + "loss": 0.068, "step": 3330 }, { - "epoch": 0.991980991980992, - "grad_norm": 0.5744608044624329, - "learning_rate": 2.404811404811405e-05, - "loss": 0.072, + "epoch": 1.686017163048965, + "grad_norm": 0.7717642784118652, + "learning_rate": 1.988389702170621e-05, + "loss": 0.0594, "step": 3340 }, { - "epoch": 0.9949509949509949, - "grad_norm": 0.5449424386024475, - "learning_rate": 2.403029403029403e-05, - "loss": 0.0827, + "epoch": 1.691065118626956, + "grad_norm": 0.9715004563331604, + "learning_rate": 1.9853609288238265e-05, + "loss": 0.0651, "step": 3350 }, { - "epoch": 0.997920997920998, - "grad_norm": 0.5638298392295837, - "learning_rate": 2.4012474012474013e-05, - "loss": 0.0796, + "epoch": 1.696113074204947, + "grad_norm": 0.7362111210823059, + "learning_rate": 1.982332155477032e-05, + "loss": 0.0664, "step": 3360 }, { - "epoch": 1.0, - "eval_f1": 0.49727767695099817, - "eval_loss": 0.0686563029885292, - "eval_runtime": 821.5096, - "eval_samples_per_second": 46.279, - "eval_steps_per_second": 0.724, - "step": 3367 - }, - { - "epoch": 1.0008910008910008, - "grad_norm": 0.5497238039970398, - "learning_rate": 2.3994653994653996e-05, - "loss": 0.0703, + "epoch": 1.7011610297829378, + "grad_norm": 0.480751633644104, + "learning_rate": 1.9793033821302375e-05, + "loss": 0.0609, "step": 3370 }, { - "epoch": 1.0038610038610039, - "grad_norm": 0.3895362913608551, - "learning_rate": 2.3976833976833978e-05, - "loss": 0.0714, + "epoch": 1.7062089853609288, + "grad_norm": 0.31802135705947876, + "learning_rate": 1.976274608783443e-05, + "loss": 0.0658, "step": 3380 }, { - "epoch": 1.006831006831007, - "grad_norm": 0.5208247900009155, - "learning_rate": 2.395901395901396e-05, - "loss": 0.0882, + "epoch": 1.7112569409389198, + "grad_norm": 0.5285906195640564, + "learning_rate": 1.973245835436648e-05, + "loss": 0.0606, "step": 3390 }, { - "epoch": 1.0098010098010097, - "grad_norm": 0.4272199273109436, - "learning_rate": 2.3941193941193942e-05, - "loss": 0.0735, + "epoch": 1.7163048965169105, + "grad_norm": 0.7230745553970337, + "learning_rate": 1.9702170620898536e-05, + "loss": 0.0618, "step": 3400 }, { - "epoch": 1.0127710127710128, - "grad_norm": 0.5025156140327454, - "learning_rate": 2.3923373923373924e-05, - "loss": 0.0706, + "epoch": 1.7213528520949015, + "grad_norm": 0.566842257976532, + "learning_rate": 1.967188288743059e-05, + "loss": 0.0623, "step": 3410 }, { - "epoch": 1.0157410157410158, - "grad_norm": 0.3242335617542267, - "learning_rate": 2.3905553905553906e-05, - "loss": 0.0678, + "epoch": 1.7264008076728925, + "grad_norm": 0.9110565781593323, + "learning_rate": 1.9641595153962646e-05, + "loss": 0.0712, "step": 3420 }, { - "epoch": 1.0187110187110187, - "grad_norm": 0.3997895121574402, - "learning_rate": 2.388773388773389e-05, - "loss": 0.0812, + "epoch": 1.7314487632508833, + "grad_norm": 0.5621252059936523, + "learning_rate": 1.96113074204947e-05, + "loss": 0.0624, "step": 3430 }, { - "epoch": 1.0216810216810217, - "grad_norm": 0.752778172492981, - "learning_rate": 2.386991386991387e-05, - "loss": 0.0884, + "epoch": 1.7364967188288745, + "grad_norm": 0.6153441667556763, + "learning_rate": 1.9581019687026753e-05, + "loss": 0.0679, "step": 3440 }, { - "epoch": 1.0246510246510248, - "grad_norm": 0.8602269291877747, - "learning_rate": 2.3852093852093853e-05, - "loss": 0.0878, + "epoch": 1.7415446744068652, + "grad_norm": 0.7521117925643921, + "learning_rate": 1.955073195355881e-05, + "loss": 0.073, "step": 3450 }, { - "epoch": 1.0276210276210276, - "grad_norm": 0.4281240403652191, - "learning_rate": 2.3834273834273835e-05, - "loss": 0.0718, + "epoch": 1.746592629984856, + "grad_norm": 0.7781336307525635, + "learning_rate": 1.9520444220090863e-05, + "loss": 0.0576, "step": 3460 }, { - "epoch": 1.0305910305910306, - "grad_norm": 0.5941810607910156, - "learning_rate": 2.3816453816453817e-05, - "loss": 0.0737, + "epoch": 1.7516405855628472, + "grad_norm": 0.5981038808822632, + "learning_rate": 1.949015648662292e-05, + "loss": 0.0558, "step": 3470 }, { - "epoch": 1.0335610335610335, - "grad_norm": 0.573628306388855, - "learning_rate": 2.37986337986338e-05, - "loss": 0.0659, + "epoch": 1.756688541140838, + "grad_norm": 0.5716273188591003, + "learning_rate": 1.9459868753154973e-05, + "loss": 0.0615, "step": 3480 }, { - "epoch": 1.0365310365310365, - "grad_norm": 0.6910396814346313, - "learning_rate": 2.378081378081378e-05, - "loss": 0.084, + "epoch": 1.761736496718829, + "grad_norm": 1.0969016551971436, + "learning_rate": 1.9429581019687027e-05, + "loss": 0.0695, "step": 3490 }, { - "epoch": 1.0395010395010396, - "grad_norm": 0.38856300711631775, - "learning_rate": 2.3762993762993764e-05, - "loss": 0.0761, + "epoch": 1.76678445229682, + "grad_norm": 0.4081050157546997, + "learning_rate": 1.9399293286219084e-05, + "loss": 0.0569, "step": 3500 }, { - "epoch": 1.0424710424710424, - "grad_norm": 0.41457536816596985, - "learning_rate": 2.3745173745173746e-05, - "loss": 0.082, + "epoch": 1.7718324078748107, + "grad_norm": 0.6996564269065857, + "learning_rate": 1.9369005552751137e-05, + "loss": 0.0615, "step": 3510 }, { - "epoch": 1.0454410454410454, - "grad_norm": 0.6538494825363159, - "learning_rate": 2.3727353727353728e-05, - "loss": 0.0817, + "epoch": 1.7768803634528016, + "grad_norm": 0.7040839791297913, + "learning_rate": 1.933871781928319e-05, + "loss": 0.0609, "step": 3520 }, { - "epoch": 1.0484110484110485, - "grad_norm": 0.3478659689426422, - "learning_rate": 2.370953370953371e-05, - "loss": 0.0851, + "epoch": 1.7819283190307926, + "grad_norm": 0.6955099105834961, + "learning_rate": 1.9308430085815244e-05, + "loss": 0.0596, "step": 3530 }, { - "epoch": 1.0513810513810513, - "grad_norm": 0.546033501625061, - "learning_rate": 2.3691713691713692e-05, - "loss": 0.084, + "epoch": 1.7869762746087834, + "grad_norm": 0.49400514364242554, + "learning_rate": 1.9278142352347298e-05, + "loss": 0.0531, "step": 3540 }, { - "epoch": 1.0543510543510544, - "grad_norm": 0.4026525020599365, - "learning_rate": 2.3673893673893675e-05, - "loss": 0.0725, + "epoch": 1.7920242301867744, + "grad_norm": 0.6069557666778564, + "learning_rate": 1.9247854618879354e-05, + "loss": 0.0663, "step": 3550 }, { - "epoch": 1.0573210573210574, - "grad_norm": 0.5000739097595215, - "learning_rate": 2.3656073656073657e-05, - "loss": 0.0822, + "epoch": 1.7970721857647654, + "grad_norm": 0.859195351600647, + "learning_rate": 1.9217566885411408e-05, + "loss": 0.0539, "step": 3560 }, { - "epoch": 1.0602910602910602, - "grad_norm": 0.48692411184310913, - "learning_rate": 2.363825363825364e-05, - "loss": 0.0796, + "epoch": 1.802120141342756, + "grad_norm": 0.8939780592918396, + "learning_rate": 1.9187279151943465e-05, + "loss": 0.0668, "step": 3570 }, { - "epoch": 1.0632610632610633, - "grad_norm": 0.5664608478546143, - "learning_rate": 2.362043362043362e-05, - "loss": 0.0661, + "epoch": 1.807168096920747, + "grad_norm": 0.7258803248405457, + "learning_rate": 1.9156991418475518e-05, + "loss": 0.0585, "step": 3580 }, { - "epoch": 1.0662310662310661, - "grad_norm": 0.502124547958374, - "learning_rate": 2.3602613602613603e-05, - "loss": 0.0638, + "epoch": 1.812216052498738, + "grad_norm": 0.38900288939476013, + "learning_rate": 1.912670368500757e-05, + "loss": 0.0686, "step": 3590 }, { - "epoch": 1.0692010692010692, - "grad_norm": 0.5469791889190674, - "learning_rate": 2.3584793584793586e-05, - "loss": 0.0719, + "epoch": 1.8172640080767288, + "grad_norm": 0.38506415486335754, + "learning_rate": 1.909641595153963e-05, + "loss": 0.0625, "step": 3600 }, { - "epoch": 1.0721710721710722, - "grad_norm": 0.5133867859840393, - "learning_rate": 2.3566973566973568e-05, - "loss": 0.0772, + "epoch": 1.8223119636547198, + "grad_norm": 0.5235381722450256, + "learning_rate": 1.9066128218071682e-05, + "loss": 0.0597, "step": 3610 }, { - "epoch": 1.075141075141075, - "grad_norm": 0.5197412371635437, - "learning_rate": 2.354915354915355e-05, - "loss": 0.0862, + "epoch": 1.8273599192327108, + "grad_norm": 0.4835253357887268, + "learning_rate": 1.903584048460374e-05, + "loss": 0.0667, "step": 3620 }, { - "epoch": 1.078111078111078, - "grad_norm": 0.4368208050727844, - "learning_rate": 2.3531333531333532e-05, - "loss": 0.0618, + "epoch": 1.8324078748107016, + "grad_norm": 0.6338971257209778, + "learning_rate": 1.9005552751135792e-05, + "loss": 0.0635, "step": 3630 }, { - "epoch": 1.0810810810810811, - "grad_norm": 0.46737584471702576, - "learning_rate": 2.3513513513513514e-05, - "loss": 0.0618, + "epoch": 1.8374558303886925, + "grad_norm": 1.0663739442825317, + "learning_rate": 1.8975265017667846e-05, + "loss": 0.0744, "step": 3640 }, { - "epoch": 1.084051084051084, - "grad_norm": 0.46774664521217346, - "learning_rate": 2.3495693495693496e-05, - "loss": 0.0844, + "epoch": 1.8425037859666835, + "grad_norm": 0.6655123829841614, + "learning_rate": 1.89449772841999e-05, + "loss": 0.0654, "step": 3650 }, { - "epoch": 1.087021087021087, - "grad_norm": 0.5892476439476013, - "learning_rate": 2.347787347787348e-05, - "loss": 0.0823, + "epoch": 1.8475517415446743, + "grad_norm": 0.582611083984375, + "learning_rate": 1.8914689550731952e-05, + "loss": 0.0661, "step": 3660 }, { - "epoch": 1.08999108999109, - "grad_norm": 0.32615166902542114, - "learning_rate": 2.346005346005346e-05, - "loss": 0.0579, + "epoch": 1.8525996971226655, + "grad_norm": 0.6533240079879761, + "learning_rate": 1.888440181726401e-05, + "loss": 0.0613, "step": 3670 }, { - "epoch": 1.092961092961093, - "grad_norm": 0.4170238673686981, - "learning_rate": 2.3442233442233443e-05, - "loss": 0.0655, + "epoch": 1.8576476527006562, + "grad_norm": 0.4978090524673462, + "learning_rate": 1.8854114083796063e-05, + "loss": 0.0627, "step": 3680 }, { - "epoch": 1.095931095931096, - "grad_norm": 0.4704936146736145, - "learning_rate": 2.3424413424413425e-05, - "loss": 0.0814, + "epoch": 1.862695608278647, + "grad_norm": 0.7043678164482117, + "learning_rate": 1.8823826350328116e-05, + "loss": 0.0578, "step": 3690 }, { - "epoch": 1.098901098901099, - "grad_norm": 0.5191180109977722, - "learning_rate": 2.3406593406593407e-05, - "loss": 0.0787, + "epoch": 1.8677435638566382, + "grad_norm": 0.7941015362739563, + "learning_rate": 1.8793538616860173e-05, + "loss": 0.0622, "step": 3700 }, { - "epoch": 1.1018711018711018, - "grad_norm": 0.48460114002227783, - "learning_rate": 2.338877338877339e-05, - "loss": 0.0522, + "epoch": 1.872791519434629, + "grad_norm": 0.4428146183490753, + "learning_rate": 1.8763250883392226e-05, + "loss": 0.0613, "step": 3710 }, { - "epoch": 1.1048411048411049, - "grad_norm": 0.5503575205802917, - "learning_rate": 2.337095337095337e-05, - "loss": 0.0769, + "epoch": 1.87783947501262, + "grad_norm": 0.6554248929023743, + "learning_rate": 1.873296314992428e-05, + "loss": 0.0643, "step": 3720 }, { - "epoch": 1.107811107811108, - "grad_norm": 0.6398834586143494, - "learning_rate": 2.3353133353133354e-05, - "loss": 0.0664, + "epoch": 1.882887430590611, + "grad_norm": 0.48168087005615234, + "learning_rate": 1.8702675416456337e-05, + "loss": 0.055, "step": 3730 }, { - "epoch": 1.1107811107811107, - "grad_norm": 0.39908480644226074, - "learning_rate": 2.3335313335313336e-05, - "loss": 0.0759, + "epoch": 1.8879353861686017, + "grad_norm": 0.509777307510376, + "learning_rate": 1.867238768298839e-05, + "loss": 0.058, "step": 3740 }, { - "epoch": 1.1137511137511138, - "grad_norm": 0.4675776958465576, - "learning_rate": 2.3317493317493318e-05, - "loss": 0.0765, + "epoch": 1.8929833417465927, + "grad_norm": 0.5132505893707275, + "learning_rate": 1.8642099949520447e-05, + "loss": 0.0623, "step": 3750 }, { - "epoch": 1.1167211167211166, - "grad_norm": 0.350972443819046, - "learning_rate": 2.32996732996733e-05, - "loss": 0.0777, + "epoch": 1.8980312973245836, + "grad_norm": 0.7474920749664307, + "learning_rate": 1.86118122160525e-05, + "loss": 0.0489, "step": 3760 }, { - "epoch": 1.1196911196911197, - "grad_norm": 0.4611550569534302, - "learning_rate": 2.3281853281853282e-05, - "loss": 0.0709, + "epoch": 1.9030792529025744, + "grad_norm": 1.0404279232025146, + "learning_rate": 1.8581524482584554e-05, + "loss": 0.0687, "step": 3770 }, { - "epoch": 1.1226611226611227, - "grad_norm": 0.5342544913291931, - "learning_rate": 2.3264033264033265e-05, - "loss": 0.0649, + "epoch": 1.9081272084805654, + "grad_norm": 0.6796401143074036, + "learning_rate": 1.8551236749116607e-05, + "loss": 0.0679, "step": 3780 }, { - "epoch": 1.1256311256311256, - "grad_norm": 0.6507514119148254, - "learning_rate": 2.3246213246213247e-05, - "loss": 0.076, + "epoch": 1.9131751640585564, + "grad_norm": 0.9071604609489441, + "learning_rate": 1.852094901564866e-05, + "loss": 0.0725, "step": 3790 }, { - "epoch": 1.1286011286011286, - "grad_norm": 0.7478254437446594, - "learning_rate": 2.322839322839323e-05, - "loss": 0.0857, + "epoch": 1.9182231196365471, + "grad_norm": 0.7023878693580627, + "learning_rate": 1.8490661282180718e-05, + "loss": 0.0702, "step": 3800 }, { - "epoch": 1.1315711315711316, - "grad_norm": 0.5067834258079529, - "learning_rate": 2.321057321057321e-05, - "loss": 0.0745, + "epoch": 1.923271075214538, + "grad_norm": 0.7312602996826172, + "learning_rate": 1.846037354871277e-05, + "loss": 0.0532, "step": 3810 }, { - "epoch": 1.1345411345411345, - "grad_norm": 0.6091060042381287, - "learning_rate": 2.3192753192753193e-05, - "loss": 0.0761, + "epoch": 1.928319030792529, + "grad_norm": 0.6224806904792786, + "learning_rate": 1.8430085815244825e-05, + "loss": 0.0638, "step": 3820 }, { - "epoch": 1.1375111375111375, - "grad_norm": 0.4694317579269409, - "learning_rate": 2.3174933174933175e-05, - "loss": 0.0815, + "epoch": 1.9333669863705198, + "grad_norm": 0.7255429029464722, + "learning_rate": 1.839979808177688e-05, + "loss": 0.0641, "step": 3830 }, { - "epoch": 1.1404811404811406, - "grad_norm": 0.5222705006599426, - "learning_rate": 2.3157113157113158e-05, - "loss": 0.0788, + "epoch": 1.9384149419485108, + "grad_norm": 0.584086000919342, + "learning_rate": 1.8369510348308935e-05, + "loss": 0.0692, "step": 3840 }, { - "epoch": 1.1434511434511434, - "grad_norm": 0.5226296782493591, - "learning_rate": 2.313929313929314e-05, - "loss": 0.0773, + "epoch": 1.9434628975265018, + "grad_norm": 0.4826408326625824, + "learning_rate": 1.833922261484099e-05, + "loss": 0.0627, "step": 3850 }, { - "epoch": 1.1464211464211465, - "grad_norm": 0.5545721054077148, - "learning_rate": 2.3121473121473122e-05, - "loss": 0.0675, + "epoch": 1.9485108531044926, + "grad_norm": 0.5803766846656799, + "learning_rate": 1.8308934881373045e-05, + "loss": 0.0635, "step": 3860 }, { - "epoch": 1.1493911493911493, - "grad_norm": 0.5250979065895081, - "learning_rate": 2.3103653103653104e-05, - "loss": 0.0815, + "epoch": 1.9535588086824835, + "grad_norm": 0.7855948209762573, + "learning_rate": 1.82786471479051e-05, + "loss": 0.0659, "step": 3870 }, { - "epoch": 1.1523611523611523, - "grad_norm": 0.4267248213291168, - "learning_rate": 2.3085833085833086e-05, - "loss": 0.0704, + "epoch": 1.9586067642604745, + "grad_norm": 0.5980962514877319, + "learning_rate": 1.8248359414437155e-05, + "loss": 0.0651, "step": 3880 }, { - "epoch": 1.1553311553311554, - "grad_norm": 0.3308209478855133, - "learning_rate": 2.306801306801307e-05, - "loss": 0.08, + "epoch": 1.9636547198384653, + "grad_norm": 0.6440220475196838, + "learning_rate": 1.821807168096921e-05, + "loss": 0.0639, "step": 3890 }, { - "epoch": 1.1583011583011582, - "grad_norm": 0.49279993772506714, - "learning_rate": 2.305019305019305e-05, - "loss": 0.0868, + "epoch": 1.9687026754164565, + "grad_norm": 0.7104585766792297, + "learning_rate": 1.8187783947501262e-05, + "loss": 0.056, "step": 3900 }, { - "epoch": 1.1612711612711613, - "grad_norm": 0.49307748675346375, - "learning_rate": 2.3032373032373033e-05, - "loss": 0.081, + "epoch": 1.9737506309944473, + "grad_norm": 0.7219833731651306, + "learning_rate": 1.8157496214033316e-05, + "loss": 0.0574, "step": 3910 }, { - "epoch": 1.1642411642411643, - "grad_norm": 0.691349446773529, - "learning_rate": 2.3014553014553015e-05, - "loss": 0.0712, + "epoch": 1.978798586572438, + "grad_norm": 0.5478711724281311, + "learning_rate": 1.812720848056537e-05, + "loss": 0.0657, "step": 3920 }, { - "epoch": 1.1672111672111671, - "grad_norm": 0.4932047724723816, - "learning_rate": 2.2996732996732997e-05, - "loss": 0.0683, + "epoch": 1.9838465421504292, + "grad_norm": 0.6501402854919434, + "learning_rate": 1.8096920747097426e-05, + "loss": 0.0641, "step": 3930 }, { - "epoch": 1.1701811701811702, - "grad_norm": 0.5138940811157227, - "learning_rate": 2.297891297891298e-05, - "loss": 0.0646, + "epoch": 1.98889449772842, + "grad_norm": 0.7231020331382751, + "learning_rate": 1.806663301362948e-05, + "loss": 0.0692, "step": 3940 }, { - "epoch": 1.1731511731511732, - "grad_norm": 0.4573695659637451, - "learning_rate": 2.2961092961092965e-05, - "loss": 0.0593, + "epoch": 1.993942453306411, + "grad_norm": 0.6480854749679565, + "learning_rate": 1.8036345280161536e-05, + "loss": 0.0632, "step": 3950 }, { - "epoch": 1.176121176121176, - "grad_norm": 0.6048777103424072, - "learning_rate": 2.2943272943272944e-05, - "loss": 0.0768, + "epoch": 1.998990408884402, + "grad_norm": 0.4803590774536133, + "learning_rate": 1.800605754669359e-05, + "loss": 0.0678, "step": 3960 }, { - "epoch": 1.179091179091179, - "grad_norm": 0.6311981678009033, - "learning_rate": 2.2925452925452926e-05, - "loss": 0.0901, + "epoch": 2.0, + "eval_f1": 0.9705180789481339, + "eval_loss": 0.0446692518889904, + "eval_runtime": 584.4017, + "eval_samples_per_second": 352.946, + "eval_steps_per_second": 2.758, + "step": 3962 + }, + { + "epoch": 2.0040383644623927, + "grad_norm": 0.680855393409729, + "learning_rate": 1.7975769813225643e-05, + "loss": 0.0567, "step": 3970 }, { - "epoch": 1.1820611820611822, - "grad_norm": 0.4408791661262512, - "learning_rate": 2.2907632907632908e-05, - "loss": 0.0729, + "epoch": 2.0090863200403835, + "grad_norm": 0.47991836071014404, + "learning_rate": 1.79454820797577e-05, + "loss": 0.0562, "step": 3980 }, { - "epoch": 1.185031185031185, - "grad_norm": 0.3359534740447998, - "learning_rate": 2.288981288981289e-05, - "loss": 0.071, + "epoch": 2.0141342756183747, + "grad_norm": 0.8615912199020386, + "learning_rate": 1.7915194346289753e-05, + "loss": 0.0679, "step": 3990 }, { - "epoch": 1.188001188001188, - "grad_norm": 0.3939429223537445, - "learning_rate": 2.2871992871992872e-05, - "loss": 0.0681, + "epoch": 2.0191822311963654, + "grad_norm": 0.5970327258110046, + "learning_rate": 1.7884906612821807e-05, + "loss": 0.053, "step": 4000 }, { - "epoch": 1.190971190971191, - "grad_norm": 0.46291255950927734, - "learning_rate": 2.2854172854172855e-05, - "loss": 0.0721, + "epoch": 2.024230186774356, + "grad_norm": 0.5402255654335022, + "learning_rate": 1.7854618879353864e-05, + "loss": 0.0574, "step": 4010 }, { - "epoch": 1.193941193941194, - "grad_norm": 0.4679121971130371, - "learning_rate": 2.283635283635284e-05, - "loss": 0.0823, + "epoch": 2.0292781423523474, + "grad_norm": 0.5014840364456177, + "learning_rate": 1.7824331145885917e-05, + "loss": 0.0649, "step": 4020 }, { - "epoch": 1.196911196911197, - "grad_norm": 0.6498029232025146, - "learning_rate": 2.281853281853282e-05, - "loss": 0.064, + "epoch": 2.034326097930338, + "grad_norm": 0.7147154808044434, + "learning_rate": 1.779404341241797e-05, + "loss": 0.0687, "step": 4030 }, { - "epoch": 1.1998811998811998, - "grad_norm": 0.5375266671180725, - "learning_rate": 2.28007128007128e-05, - "loss": 0.0827, + "epoch": 2.0393740535083293, + "grad_norm": 0.5346552729606628, + "learning_rate": 1.7763755678950024e-05, + "loss": 0.0638, "step": 4040 }, { - "epoch": 1.2028512028512028, - "grad_norm": 0.7022712230682373, - "learning_rate": 2.2782892782892783e-05, - "loss": 0.0723, + "epoch": 2.04442200908632, + "grad_norm": 0.5596599578857422, + "learning_rate": 1.7733467945482078e-05, + "loss": 0.0669, "step": 4050 }, { - "epoch": 1.2058212058212059, - "grad_norm": 0.888565182685852, - "learning_rate": 2.2765072765072765e-05, - "loss": 0.0699, + "epoch": 2.049469964664311, + "grad_norm": 0.40591198205947876, + "learning_rate": 1.7703180212014134e-05, + "loss": 0.0564, "step": 4060 }, { - "epoch": 1.2087912087912087, - "grad_norm": 0.615304172039032, - "learning_rate": 2.2747252747252748e-05, - "loss": 0.0639, + "epoch": 2.054517920242302, + "grad_norm": 0.609337568283081, + "learning_rate": 1.7672892478546188e-05, + "loss": 0.0576, "step": 4070 }, { - "epoch": 1.2117612117612118, - "grad_norm": 1.067995309829712, - "learning_rate": 2.272943272943273e-05, - "loss": 0.0727, + "epoch": 2.059565875820293, + "grad_norm": 0.5424002408981323, + "learning_rate": 1.7642604745078245e-05, + "loss": 0.0585, "step": 4080 }, { - "epoch": 1.2147312147312148, - "grad_norm": 0.38957396149635315, - "learning_rate": 2.2711612711612715e-05, - "loss": 0.0817, + "epoch": 2.0646138313982836, + "grad_norm": 0.9868631362915039, + "learning_rate": 1.7612317011610298e-05, + "loss": 0.0684, "step": 4090 }, { - "epoch": 1.2177012177012176, - "grad_norm": 0.4814799726009369, - "learning_rate": 2.2693792693792694e-05, - "loss": 0.0676, + "epoch": 2.069661786976275, + "grad_norm": 0.6492929458618164, + "learning_rate": 1.758202927814235e-05, + "loss": 0.0638, "step": 4100 }, { - "epoch": 1.2206712206712207, - "grad_norm": 0.33193427324295044, - "learning_rate": 2.2675972675972676e-05, - "loss": 0.0717, + "epoch": 2.0747097425542655, + "grad_norm": 0.7837685346603394, + "learning_rate": 1.755174154467441e-05, + "loss": 0.0675, "step": 4110 }, { - "epoch": 1.2236412236412235, - "grad_norm": 0.5651602149009705, - "learning_rate": 2.265815265815266e-05, - "loss": 0.0669, + "epoch": 2.0797576981322563, + "grad_norm": 0.5961639881134033, + "learning_rate": 1.7521453811206462e-05, + "loss": 0.0575, "step": 4120 }, { - "epoch": 1.2266112266112266, - "grad_norm": 0.6378253102302551, - "learning_rate": 2.264033264033264e-05, - "loss": 0.0897, + "epoch": 2.0848056537102475, + "grad_norm": 0.4114825427532196, + "learning_rate": 1.749116607773852e-05, + "loss": 0.0659, "step": 4130 }, { - "epoch": 1.2295812295812296, - "grad_norm": 0.6030372977256775, - "learning_rate": 2.2622512622512623e-05, - "loss": 0.0896, + "epoch": 2.0898536092882383, + "grad_norm": 0.4567316174507141, + "learning_rate": 1.7460878344270572e-05, + "loss": 0.0661, "step": 4140 }, { - "epoch": 1.2325512325512324, - "grad_norm": 0.8515591621398926, - "learning_rate": 2.2604692604692605e-05, - "loss": 0.0645, + "epoch": 2.094901564866229, + "grad_norm": 0.6321776509284973, + "learning_rate": 1.7430590610802626e-05, + "loss": 0.066, "step": 4150 }, { - "epoch": 1.2355212355212355, - "grad_norm": 0.6547635197639465, - "learning_rate": 2.258687258687259e-05, - "loss": 0.0736, + "epoch": 2.0999495204442202, + "grad_norm": 0.8911116719245911, + "learning_rate": 1.740030287733468e-05, + "loss": 0.0585, "step": 4160 }, { - "epoch": 1.2384912384912385, - "grad_norm": 0.4761018753051758, - "learning_rate": 2.256905256905257e-05, - "loss": 0.0689, + "epoch": 2.104997476022211, + "grad_norm": 0.4896914064884186, + "learning_rate": 1.7370015143866733e-05, + "loss": 0.0612, "step": 4170 }, { - "epoch": 1.2414612414612414, - "grad_norm": 0.39740657806396484, - "learning_rate": 2.255123255123255e-05, - "loss": 0.0696, + "epoch": 2.1100454316002017, + "grad_norm": 0.7571251392364502, + "learning_rate": 1.733972741039879e-05, + "loss": 0.0563, "step": 4180 }, { - "epoch": 1.2444312444312444, - "grad_norm": 0.49501290917396545, - "learning_rate": 2.2533412533412534e-05, - "loss": 0.0779, + "epoch": 2.115093387178193, + "grad_norm": 0.9115099310874939, + "learning_rate": 1.7309439676930843e-05, + "loss": 0.0698, "step": 4190 }, { - "epoch": 1.2474012474012475, - "grad_norm": 0.5703093409538269, - "learning_rate": 2.2515592515592516e-05, - "loss": 0.0663, + "epoch": 2.1201413427561837, + "grad_norm": 0.5267325639724731, + "learning_rate": 1.7279151943462896e-05, + "loss": 0.0604, "step": 4200 }, { - "epoch": 1.2503712503712503, - "grad_norm": 0.4675036370754242, - "learning_rate": 2.2497772497772498e-05, - "loss": 0.0772, + "epoch": 2.1251892983341745, + "grad_norm": 0.6659255623817444, + "learning_rate": 1.7248864209994953e-05, + "loss": 0.0627, "step": 4210 }, { - "epoch": 1.2533412533412533, - "grad_norm": 0.6520904898643494, - "learning_rate": 2.247995247995248e-05, - "loss": 0.074, + "epoch": 2.1302372539121657, + "grad_norm": 0.89178466796875, + "learning_rate": 1.7218576476527007e-05, + "loss": 0.0552, "step": 4220 }, { - "epoch": 1.2563112563112564, - "grad_norm": 0.4377146065235138, - "learning_rate": 2.2462132462132466e-05, - "loss": 0.0752, + "epoch": 2.1352852094901564, + "grad_norm": 0.4615127742290497, + "learning_rate": 1.7188288743059063e-05, + "loss": 0.0557, "step": 4230 }, { - "epoch": 1.2592812592812592, - "grad_norm": 0.4791605472564697, - "learning_rate": 2.2444312444312444e-05, - "loss": 0.0614, + "epoch": 2.1403331650681476, + "grad_norm": 0.6602596044540405, + "learning_rate": 1.7158001009591117e-05, + "loss": 0.0548, "step": 4240 }, { - "epoch": 1.2622512622512623, - "grad_norm": 0.5933295488357544, - "learning_rate": 2.2426492426492427e-05, - "loss": 0.0832, + "epoch": 2.1453811206461384, + "grad_norm": 0.7081389427185059, + "learning_rate": 1.712771327612317e-05, + "loss": 0.0606, "step": 4250 }, { - "epoch": 1.2652212652212653, - "grad_norm": 0.4189813435077667, - "learning_rate": 2.240867240867241e-05, - "loss": 0.069, + "epoch": 2.150429076224129, + "grad_norm": 0.5817338824272156, + "learning_rate": 1.7097425542655227e-05, + "loss": 0.0606, "step": 4260 }, { - "epoch": 1.2681912681912682, - "grad_norm": 0.651421070098877, - "learning_rate": 2.239085239085239e-05, - "loss": 0.0791, + "epoch": 2.1554770318021204, + "grad_norm": 0.4401390254497528, + "learning_rate": 1.706713780918728e-05, + "loss": 0.0607, "step": 4270 }, { - "epoch": 1.2711612711612712, - "grad_norm": 0.40593355894088745, - "learning_rate": 2.2373032373032373e-05, - "loss": 0.0638, + "epoch": 2.160524987380111, + "grad_norm": 1.0127087831497192, + "learning_rate": 1.7036850075719337e-05, + "loss": 0.0615, "step": 4280 }, { - "epoch": 1.2741312741312742, - "grad_norm": 0.5226801037788391, - "learning_rate": 2.2355212355212355e-05, - "loss": 0.077, + "epoch": 2.165572942958102, + "grad_norm": 0.5774319171905518, + "learning_rate": 1.7006562342251387e-05, + "loss": 0.0525, "step": 4290 }, { - "epoch": 1.277101277101277, - "grad_norm": 0.6062614321708679, - "learning_rate": 2.233739233739234e-05, - "loss": 0.068, + "epoch": 2.170620898536093, + "grad_norm": 0.47623270750045776, + "learning_rate": 1.697627460878344e-05, + "loss": 0.0591, "step": 4300 }, { - "epoch": 1.2800712800712801, - "grad_norm": 0.48023584485054016, - "learning_rate": 2.231957231957232e-05, - "loss": 0.0622, + "epoch": 2.175668854114084, + "grad_norm": 0.7083358764648438, + "learning_rate": 1.6945986875315498e-05, + "loss": 0.0631, "step": 4310 }, { - "epoch": 1.2830412830412832, - "grad_norm": 0.4292398989200592, - "learning_rate": 2.2301752301752302e-05, - "loss": 0.0951, + "epoch": 2.1807168096920746, + "grad_norm": 0.6057601571083069, + "learning_rate": 1.691569914184755e-05, + "loss": 0.0595, "step": 4320 }, { - "epoch": 1.286011286011286, - "grad_norm": 0.509908139705658, - "learning_rate": 2.2283932283932284e-05, - "loss": 0.0703, + "epoch": 2.185764765270066, + "grad_norm": 0.8947880864143372, + "learning_rate": 1.6885411408379605e-05, + "loss": 0.0666, "step": 4330 }, { - "epoch": 1.288981288981289, - "grad_norm": 0.36277303099632263, - "learning_rate": 2.2266112266112266e-05, - "loss": 0.0752, + "epoch": 2.1908127208480566, + "grad_norm": 0.6460204720497131, + "learning_rate": 1.685512367491166e-05, + "loss": 0.0669, "step": 4340 }, { - "epoch": 1.2919512919512919, - "grad_norm": 0.4135016202926636, - "learning_rate": 2.2248292248292248e-05, - "loss": 0.0673, + "epoch": 2.1958606764260473, + "grad_norm": 0.9029686450958252, + "learning_rate": 1.6824835941443715e-05, + "loss": 0.0607, "step": 4350 }, { - "epoch": 1.294921294921295, - "grad_norm": 0.4465673863887787, - "learning_rate": 2.223047223047223e-05, - "loss": 0.0774, + "epoch": 2.2009086320040385, + "grad_norm": 0.5201438665390015, + "learning_rate": 1.6794548207975772e-05, + "loss": 0.0514, "step": 4360 }, { - "epoch": 1.2978912978912978, - "grad_norm": 0.3581428825855255, - "learning_rate": 2.2212652212652216e-05, - "loss": 0.0722, + "epoch": 2.2059565875820293, + "grad_norm": 0.39414748549461365, + "learning_rate": 1.6764260474507825e-05, + "loss": 0.0581, "step": 4370 }, { - "epoch": 1.3008613008613008, - "grad_norm": 0.8216081261634827, - "learning_rate": 2.2194832194832195e-05, - "loss": 0.0695, + "epoch": 2.21100454316002, + "grad_norm": 0.642257034778595, + "learning_rate": 1.673397274103988e-05, + "loss": 0.0611, "step": 4380 }, { - "epoch": 1.3038313038313039, - "grad_norm": 0.3974524736404419, - "learning_rate": 2.2177012177012177e-05, - "loss": 0.0548, + "epoch": 2.2160524987380112, + "grad_norm": 0.7225739359855652, + "learning_rate": 1.6703685007571935e-05, + "loss": 0.0569, "step": 4390 }, { - "epoch": 1.3068013068013067, - "grad_norm": 0.40166157484054565, - "learning_rate": 2.215919215919216e-05, - "loss": 0.0821, + "epoch": 2.221100454316002, + "grad_norm": 0.6948502659797668, + "learning_rate": 1.667339727410399e-05, + "loss": 0.0652, "step": 4400 }, { - "epoch": 1.3097713097713097, - "grad_norm": 0.6108930706977844, - "learning_rate": 2.214137214137214e-05, - "loss": 0.0771, + "epoch": 2.2261484098939928, + "grad_norm": 0.5755937695503235, + "learning_rate": 1.6643109540636042e-05, + "loss": 0.0566, "step": 4410 }, { - "epoch": 1.3127413127413128, - "grad_norm": 0.33659735321998596, - "learning_rate": 2.2123552123552123e-05, - "loss": 0.0866, + "epoch": 2.231196365471984, + "grad_norm": 0.4249815046787262, + "learning_rate": 1.6612821807168096e-05, + "loss": 0.0642, "step": 4420 }, { - "epoch": 1.3157113157113156, - "grad_norm": 0.41419750452041626, - "learning_rate": 2.2105732105732106e-05, - "loss": 0.066, + "epoch": 2.2362443210499747, + "grad_norm": 0.5442089438438416, + "learning_rate": 1.658253407370015e-05, + "loss": 0.0685, "step": 4430 }, { - "epoch": 1.3186813186813187, - "grad_norm": 0.39843958616256714, - "learning_rate": 2.208791208791209e-05, - "loss": 0.0717, + "epoch": 2.2412922766279655, + "grad_norm": 0.8074495792388916, + "learning_rate": 1.6552246340232206e-05, + "loss": 0.0558, "step": 4440 }, { - "epoch": 1.3216513216513217, - "grad_norm": 0.4193469285964966, - "learning_rate": 2.207009207009207e-05, - "loss": 0.0608, + "epoch": 2.2463402322059567, + "grad_norm": 0.8810071349143982, + "learning_rate": 1.652195860676426e-05, + "loss": 0.0685, "step": 4450 }, { - "epoch": 1.3246213246213245, - "grad_norm": 0.310855507850647, - "learning_rate": 2.2052272052272052e-05, - "loss": 0.0623, + "epoch": 2.2513881877839474, + "grad_norm": 0.5399377942085266, + "learning_rate": 1.6491670873296316e-05, + "loss": 0.0607, "step": 4460 }, { - "epoch": 1.3275913275913276, - "grad_norm": 0.3885134160518646, - "learning_rate": 2.2034452034452034e-05, - "loss": 0.0565, + "epoch": 2.256436143361938, + "grad_norm": 0.7178535461425781, + "learning_rate": 1.646138313982837e-05, + "loss": 0.0504, "step": 4470 }, { - "epoch": 1.3305613305613306, - "grad_norm": 0.31589820981025696, - "learning_rate": 2.2016632016632017e-05, - "loss": 0.0591, + "epoch": 2.2614840989399294, + "grad_norm": 0.4272046983242035, + "learning_rate": 1.6431095406360423e-05, + "loss": 0.0583, "step": 4480 }, { - "epoch": 1.3335313335313335, - "grad_norm": 0.4833143651485443, - "learning_rate": 2.1998811998812e-05, - "loss": 0.0758, + "epoch": 2.26653205451792, + "grad_norm": 0.6807524561882019, + "learning_rate": 1.640080767289248e-05, + "loss": 0.0639, "step": 4490 }, { - "epoch": 1.3365013365013365, - "grad_norm": 0.47030189633369446, - "learning_rate": 2.198099198099198e-05, - "loss": 0.0644, + "epoch": 2.271580010095911, + "grad_norm": 0.5895000100135803, + "learning_rate": 1.6370519939424534e-05, + "loss": 0.0675, "step": 4500 }, { - "epoch": 1.3394713394713396, - "grad_norm": 0.44581151008605957, - "learning_rate": 2.1963171963171966e-05, - "loss": 0.0675, + "epoch": 2.276627965673902, + "grad_norm": 0.6640876531600952, + "learning_rate": 1.634023220595659e-05, + "loss": 0.0603, "step": 4510 }, { - "epoch": 1.3424413424413424, - "grad_norm": 0.5004817247390747, - "learning_rate": 2.1945351945351945e-05, - "loss": 0.0835, + "epoch": 2.281675921251893, + "grad_norm": 0.4367890954017639, + "learning_rate": 1.6309944472488644e-05, + "loss": 0.0517, "step": 4520 }, { - "epoch": 1.3454113454113454, - "grad_norm": 0.5188937783241272, - "learning_rate": 2.1927531927531927e-05, - "loss": 0.0739, + "epoch": 2.2867238768298837, + "grad_norm": 1.082713007926941, + "learning_rate": 1.6279656739020697e-05, + "loss": 0.0524, "step": 4530 }, { - "epoch": 1.3483813483813485, - "grad_norm": 0.386055052280426, - "learning_rate": 2.190971190971191e-05, - "loss": 0.0752, + "epoch": 2.291771832407875, + "grad_norm": 0.5186300277709961, + "learning_rate": 1.624936900555275e-05, + "loss": 0.0566, "step": 4540 }, { - "epoch": 1.3513513513513513, - "grad_norm": 0.5287050008773804, - "learning_rate": 2.1891891891891892e-05, - "loss": 0.0704, + "epoch": 2.2968197879858656, + "grad_norm": 1.2778280973434448, + "learning_rate": 1.6219081272084804e-05, + "loss": 0.0531, "step": 4550 }, { - "epoch": 1.3543213543213544, - "grad_norm": 0.5197706818580627, - "learning_rate": 2.1874071874071874e-05, - "loss": 0.0722, + "epoch": 2.301867743563857, + "grad_norm": 0.46757417917251587, + "learning_rate": 1.618879353861686e-05, + "loss": 0.0637, "step": 4560 }, { - "epoch": 1.3572913572913574, - "grad_norm": 1.044822335243225, - "learning_rate": 2.1856251856251856e-05, - "loss": 0.0774, + "epoch": 2.3069156991418476, + "grad_norm": 0.6333388686180115, + "learning_rate": 1.6158505805148914e-05, + "loss": 0.0557, "step": 4570 }, { - "epoch": 1.3602613602613602, - "grad_norm": 0.35167747735977173, - "learning_rate": 2.183843183843184e-05, - "loss": 0.0688, + "epoch": 2.3119636547198383, + "grad_norm": 0.4005846381187439, + "learning_rate": 1.6128218071680968e-05, + "loss": 0.0512, "step": 4580 }, { - "epoch": 1.3632313632313633, - "grad_norm": 0.5518337488174438, - "learning_rate": 2.1820611820611824e-05, - "loss": 0.0899, + "epoch": 2.3170116102978295, + "grad_norm": 1.0479962825775146, + "learning_rate": 1.6097930338213025e-05, + "loss": 0.0639, "step": 4590 }, { - "epoch": 1.3662013662013661, - "grad_norm": 0.5644456148147583, - "learning_rate": 2.1802791802791803e-05, - "loss": 0.0808, + "epoch": 2.3220595658758203, + "grad_norm": 1.1324669122695923, + "learning_rate": 1.6067642604745078e-05, + "loss": 0.0642, "step": 4600 }, { - "epoch": 1.3691713691713692, - "grad_norm": 0.45010289549827576, - "learning_rate": 2.1784971784971785e-05, - "loss": 0.0839, + "epoch": 2.327107521453811, + "grad_norm": 0.827215313911438, + "learning_rate": 1.6037354871277135e-05, + "loss": 0.0654, "step": 4610 }, { - "epoch": 1.3721413721413722, - "grad_norm": 0.6567732095718384, - "learning_rate": 2.1767151767151767e-05, - "loss": 0.0761, + "epoch": 2.3321554770318023, + "grad_norm": 0.8228656649589539, + "learning_rate": 1.600706713780919e-05, + "loss": 0.0648, "step": 4620 }, { - "epoch": 1.375111375111375, - "grad_norm": 0.582931399345398, - "learning_rate": 2.174933174933175e-05, - "loss": 0.0669, + "epoch": 2.337203432609793, + "grad_norm": 0.5897762775421143, + "learning_rate": 1.5976779404341242e-05, + "loss": 0.0546, "step": 4630 }, { - "epoch": 1.378081378081378, - "grad_norm": 0.39117926359176636, - "learning_rate": 2.173151173151173e-05, - "loss": 0.0763, + "epoch": 2.342251388187784, + "grad_norm": 0.6223641633987427, + "learning_rate": 1.59464916708733e-05, + "loss": 0.0712, "step": 4640 }, { - "epoch": 1.381051381051381, - "grad_norm": 0.44285526871681213, - "learning_rate": 2.1713691713691717e-05, - "loss": 0.071, + "epoch": 2.347299343765775, + "grad_norm": 0.5593187808990479, + "learning_rate": 1.5916203937405352e-05, + "loss": 0.0707, "step": 4650 }, { - "epoch": 1.384021384021384, - "grad_norm": 0.6497974395751953, - "learning_rate": 2.16958716958717e-05, - "loss": 0.0647, + "epoch": 2.3523472993437657, + "grad_norm": 0.9349427223205566, + "learning_rate": 1.5885916203937406e-05, + "loss": 0.0581, "step": 4660 }, { - "epoch": 1.386991386991387, - "grad_norm": 0.4394398033618927, - "learning_rate": 2.1678051678051678e-05, - "loss": 0.0666, + "epoch": 2.3573952549217565, + "grad_norm": 0.47101134061813354, + "learning_rate": 1.585562847046946e-05, + "loss": 0.0688, "step": 4670 }, { - "epoch": 1.3899613899613898, - "grad_norm": 0.6339782476425171, - "learning_rate": 2.166023166023166e-05, - "loss": 0.0693, + "epoch": 2.3624432104997477, + "grad_norm": 0.5073738098144531, + "learning_rate": 1.5825340737001513e-05, + "loss": 0.0678, "step": 4680 }, { - "epoch": 1.392931392931393, - "grad_norm": 0.24844326078891754, - "learning_rate": 2.1642411642411642e-05, - "loss": 0.0631, + "epoch": 2.3674911660777385, + "grad_norm": 0.5324171781539917, + "learning_rate": 1.579505300353357e-05, + "loss": 0.0614, "step": 4690 }, { - "epoch": 1.395901395901396, - "grad_norm": 0.41448843479156494, - "learning_rate": 2.1624591624591624e-05, - "loss": 0.0667, + "epoch": 2.3725391216557297, + "grad_norm": 0.662965714931488, + "learning_rate": 1.5764765270065623e-05, + "loss": 0.0507, "step": 4700 }, { - "epoch": 1.3988713988713988, - "grad_norm": 0.30131953954696655, - "learning_rate": 2.1606771606771606e-05, - "loss": 0.0625, + "epoch": 2.3775870772337204, + "grad_norm": 0.6482782959938049, + "learning_rate": 1.5734477536597676e-05, + "loss": 0.0537, "step": 4710 }, { - "epoch": 1.4018414018414018, - "grad_norm": 0.7573267817497253, - "learning_rate": 2.1588951588951592e-05, - "loss": 0.0672, + "epoch": 2.382635032811711, + "grad_norm": 1.0039052963256836, + "learning_rate": 1.5704189803129733e-05, + "loss": 0.059, "step": 4720 }, { - "epoch": 1.4048114048114049, - "grad_norm": 0.5527480840682983, - "learning_rate": 2.1571131571131574e-05, - "loss": 0.0597, + "epoch": 2.3876829883897024, + "grad_norm": 0.8546132445335388, + "learning_rate": 1.5673902069661787e-05, + "loss": 0.0691, "step": 4730 }, { - "epoch": 1.4077814077814077, - "grad_norm": 0.5866405367851257, - "learning_rate": 2.1553311553311553e-05, - "loss": 0.0676, + "epoch": 2.392730943967693, + "grad_norm": 0.4903261363506317, + "learning_rate": 1.5643614336193843e-05, + "loss": 0.0535, "step": 4740 }, { - "epoch": 1.4107514107514108, - "grad_norm": 0.3691079318523407, - "learning_rate": 2.1535491535491535e-05, - "loss": 0.073, + "epoch": 2.397778899545684, + "grad_norm": 0.8538033962249756, + "learning_rate": 1.5613326602725897e-05, + "loss": 0.0616, "step": 4750 }, { - "epoch": 1.4137214137214138, - "grad_norm": 0.46354126930236816, - "learning_rate": 2.1517671517671517e-05, - "loss": 0.062, + "epoch": 2.402826855123675, + "grad_norm": 0.7978336215019226, + "learning_rate": 1.558303886925795e-05, + "loss": 0.0613, "step": 4760 }, { - "epoch": 1.4166914166914166, - "grad_norm": 0.4648849368095398, - "learning_rate": 2.14998514998515e-05, - "loss": 0.0854, + "epoch": 2.407874810701666, + "grad_norm": 0.6981778740882874, + "learning_rate": 1.5552751135790007e-05, + "loss": 0.0646, "step": 4770 }, { - "epoch": 1.4196614196614197, - "grad_norm": 0.4591132402420044, - "learning_rate": 2.148203148203148e-05, - "loss": 0.0571, + "epoch": 2.4129227662796566, + "grad_norm": 0.8517895936965942, + "learning_rate": 1.552246340232206e-05, + "loss": 0.0705, "step": 4780 }, { - "epoch": 1.4226314226314227, - "grad_norm": 0.6278248429298401, - "learning_rate": 2.1464211464211467e-05, - "loss": 0.0669, + "epoch": 2.417970721857648, + "grad_norm": 0.4087599813938141, + "learning_rate": 1.5492175668854117e-05, + "loss": 0.0638, "step": 4790 }, { - "epoch": 1.4256014256014256, - "grad_norm": 0.7873584032058716, - "learning_rate": 2.144639144639145e-05, - "loss": 0.0708, + "epoch": 2.4230186774356386, + "grad_norm": 0.3779948651790619, + "learning_rate": 1.5461887935386168e-05, + "loss": 0.0524, "step": 4800 }, { - "epoch": 1.4285714285714286, - "grad_norm": 0.42913201451301575, - "learning_rate": 2.1428571428571428e-05, - "loss": 0.0666, + "epoch": 2.4280666330136293, + "grad_norm": 0.42263171076774597, + "learning_rate": 1.543160020191822e-05, + "loss": 0.0623, "step": 4810 }, { - "epoch": 1.4315414315414317, - "grad_norm": 0.34143778681755066, - "learning_rate": 2.141075141075141e-05, - "loss": 0.0829, + "epoch": 2.4331145885916206, + "grad_norm": 0.5812351107597351, + "learning_rate": 1.5401312468450278e-05, + "loss": 0.0573, "step": 4820 }, { - "epoch": 1.4345114345114345, - "grad_norm": 0.47077706456184387, - "learning_rate": 2.1392931392931392e-05, - "loss": 0.0794, + "epoch": 2.4381625441696113, + "grad_norm": 0.6073315143585205, + "learning_rate": 1.537102473498233e-05, + "loss": 0.057, "step": 4830 }, { - "epoch": 1.4374814374814375, - "grad_norm": 0.4886973202228546, - "learning_rate": 2.1375111375111375e-05, - "loss": 0.0646, + "epoch": 2.443210499747602, + "grad_norm": 0.8706870079040527, + "learning_rate": 1.5340737001514388e-05, + "loss": 0.0606, "step": 4840 }, { - "epoch": 1.4404514404514406, - "grad_norm": 0.4241088628768921, - "learning_rate": 2.1357291357291357e-05, - "loss": 0.0762, + "epoch": 2.4482584553255933, + "grad_norm": 0.9355966448783875, + "learning_rate": 1.531044926804644e-05, + "loss": 0.0563, "step": 4850 }, { - "epoch": 1.4434214434214434, - "grad_norm": 0.4464230537414551, - "learning_rate": 2.1339471339471342e-05, - "loss": 0.0698, + "epoch": 2.453306410903584, + "grad_norm": 0.6352431774139404, + "learning_rate": 1.5280161534578495e-05, + "loss": 0.0537, "step": 4860 }, { - "epoch": 1.4463914463914465, - "grad_norm": 0.36223044991493225, - "learning_rate": 2.1321651321651325e-05, - "loss": 0.0587, + "epoch": 2.458354366481575, + "grad_norm": 0.5970965623855591, + "learning_rate": 1.524987380111055e-05, + "loss": 0.0663, "step": 4870 }, { - "epoch": 1.4493614493614493, - "grad_norm": 0.5170213580131531, - "learning_rate": 2.1303831303831303e-05, - "loss": 0.0663, + "epoch": 2.463402322059566, + "grad_norm": 0.40907353162765503, + "learning_rate": 1.5219586067642605e-05, + "loss": 0.0502, "step": 4880 }, { - "epoch": 1.4523314523314523, - "grad_norm": 0.43765634298324585, - "learning_rate": 2.1286011286011286e-05, - "loss": 0.07, + "epoch": 2.4684502776375568, + "grad_norm": 0.5130166411399841, + "learning_rate": 1.518929833417466e-05, + "loss": 0.0538, "step": 4890 }, { - "epoch": 1.4553014553014554, - "grad_norm": 0.30947327613830566, - "learning_rate": 2.1268191268191268e-05, - "loss": 0.0682, + "epoch": 2.4734982332155475, + "grad_norm": 0.9824861288070679, + "learning_rate": 1.5159010600706716e-05, + "loss": 0.0518, "step": 4900 }, { - "epoch": 1.4582714582714582, - "grad_norm": 0.480027973651886, - "learning_rate": 2.125037125037125e-05, - "loss": 0.0652, + "epoch": 2.4785461887935387, + "grad_norm": 0.6424157023429871, + "learning_rate": 1.512872286723877e-05, + "loss": 0.0599, "step": 4910 }, { - "epoch": 1.4612414612414613, - "grad_norm": 0.7047821283340454, - "learning_rate": 2.1232551232551232e-05, - "loss": 0.0643, + "epoch": 2.4835941443715295, + "grad_norm": 0.8797338008880615, + "learning_rate": 1.5098435133770824e-05, + "loss": 0.0534, "step": 4920 }, { - "epoch": 1.464211464211464, - "grad_norm": 0.741016685962677, - "learning_rate": 2.1214731214731218e-05, - "loss": 0.0645, + "epoch": 2.4886420999495202, + "grad_norm": 1.0275185108184814, + "learning_rate": 1.5068147400302876e-05, + "loss": 0.063, "step": 4930 }, { - "epoch": 1.4671814671814671, - "grad_norm": 0.5473170280456543, - "learning_rate": 2.11969111969112e-05, - "loss": 0.0636, + "epoch": 2.4936900555275114, + "grad_norm": 0.6370276808738708, + "learning_rate": 1.5037859666834931e-05, + "loss": 0.0584, "step": 4940 }, { - "epoch": 1.4701514701514702, - "grad_norm": 0.4111592471599579, - "learning_rate": 2.117909117909118e-05, - "loss": 0.0676, + "epoch": 2.498738011105502, + "grad_norm": 0.5083595514297485, + "learning_rate": 1.5007571933366986e-05, + "loss": 0.0635, "step": 4950 }, { - "epoch": 1.473121473121473, - "grad_norm": 0.7355438470840454, - "learning_rate": 2.116127116127116e-05, - "loss": 0.0666, + "epoch": 2.503785966683493, + "grad_norm": 0.8423396348953247, + "learning_rate": 1.4977284199899041e-05, + "loss": 0.0593, "step": 4960 }, { - "epoch": 1.476091476091476, - "grad_norm": 0.2529616355895996, - "learning_rate": 2.1143451143451143e-05, - "loss": 0.0626, + "epoch": 2.508833922261484, + "grad_norm": 0.6133778691291809, + "learning_rate": 1.4946996466431095e-05, + "loss": 0.0652, "step": 4970 }, { - "epoch": 1.4790614790614791, - "grad_norm": 0.7075737714767456, - "learning_rate": 2.1125631125631125e-05, + "epoch": 2.513881877839475, + "grad_norm": 0.5626839995384216, + "learning_rate": 1.491670873296315e-05, "loss": 0.061, "step": 4980 }, { - "epoch": 1.482031482031482, - "grad_norm": 0.39400067925453186, - "learning_rate": 2.1107811107811107e-05, - "loss": 0.0714, + "epoch": 2.5189298334174657, + "grad_norm": 0.6379786729812622, + "learning_rate": 1.4886420999495205e-05, + "loss": 0.0583, "step": 4990 }, { - "epoch": 1.485001485001485, - "grad_norm": 0.4059322774410248, - "learning_rate": 2.1089991089991093e-05, - "loss": 0.0587, + "epoch": 2.523977788995457, + "grad_norm": 0.39859360456466675, + "learning_rate": 1.485613326602726e-05, + "loss": 0.057, "step": 5000 }, { - "epoch": 1.487971487971488, - "grad_norm": 0.3679432272911072, - "learning_rate": 2.1072171072171075e-05, - "loss": 0.0707, + "epoch": 2.5290257445734476, + "grad_norm": 0.4674101173877716, + "learning_rate": 1.4825845532559315e-05, + "loss": 0.0584, "step": 5010 }, { - "epoch": 1.4909414909414909, - "grad_norm": 0.45325401425361633, - "learning_rate": 2.1054351054351054e-05, - "loss": 0.0789, + "epoch": 2.5340737001514384, + "grad_norm": 0.6018111705780029, + "learning_rate": 1.4795557799091367e-05, + "loss": 0.0606, "step": 5020 }, { - "epoch": 1.493911493911494, - "grad_norm": 0.36480912566185, - "learning_rate": 2.1036531036531036e-05, - "loss": 0.0742, + "epoch": 2.5391216557294296, + "grad_norm": 0.4932622015476227, + "learning_rate": 1.4765270065623422e-05, + "loss": 0.0551, "step": 5030 }, { - "epoch": 1.496881496881497, - "grad_norm": 0.4680189788341522, - "learning_rate": 2.1018711018711018e-05, - "loss": 0.0691, + "epoch": 2.5441696113074204, + "grad_norm": 0.5576731562614441, + "learning_rate": 1.4734982332155477e-05, + "loss": 0.0562, "step": 5040 }, { - "epoch": 1.4998514998514998, - "grad_norm": 0.40691086649894714, - "learning_rate": 2.1000891000891e-05, - "loss": 0.0841, + "epoch": 2.5492175668854116, + "grad_norm": 0.5910426378250122, + "learning_rate": 1.4704694598687533e-05, + "loss": 0.0632, "step": 5050 }, { - "epoch": 1.5028215028215028, - "grad_norm": 0.30459049344062805, - "learning_rate": 2.0983070983070982e-05, - "loss": 0.0569, + "epoch": 2.5542655224634023, + "grad_norm": 0.42830216884613037, + "learning_rate": 1.4674406865219586e-05, + "loss": 0.0589, "step": 5060 }, { - "epoch": 1.505791505791506, - "grad_norm": 0.8983843326568604, - "learning_rate": 2.0965250965250968e-05, - "loss": 0.0719, + "epoch": 2.559313478041393, + "grad_norm": 0.657305896282196, + "learning_rate": 1.4644119131751641e-05, + "loss": 0.0666, "step": 5070 }, { - "epoch": 1.5087615087615087, - "grad_norm": 0.5937901139259338, - "learning_rate": 2.094743094743095e-05, - "loss": 0.076, + "epoch": 2.5643614336193843, + "grad_norm": 0.5498583912849426, + "learning_rate": 1.4613831398283696e-05, + "loss": 0.0677, "step": 5080 }, { - "epoch": 1.5117315117315118, - "grad_norm": 0.3914330005645752, - "learning_rate": 2.092961092961093e-05, - "loss": 0.0698, + "epoch": 2.569409389197375, + "grad_norm": 1.5641086101531982, + "learning_rate": 1.458354366481575e-05, + "loss": 0.0618, "step": 5090 }, { - "epoch": 1.5147015147015148, - "grad_norm": 0.38608691096305847, - "learning_rate": 2.091179091179091e-05, - "loss": 0.0777, + "epoch": 2.5744573447753663, + "grad_norm": 0.576878011226654, + "learning_rate": 1.4553255931347805e-05, + "loss": 0.0596, "step": 5100 }, { - "epoch": 1.5176715176715176, - "grad_norm": 0.7738357186317444, - "learning_rate": 2.0893970893970893e-05, - "loss": 0.0693, + "epoch": 2.579505300353357, + "grad_norm": 0.6855084896087646, + "learning_rate": 1.4522968197879858e-05, + "loss": 0.0684, "step": 5110 }, { - "epoch": 1.5206415206415207, - "grad_norm": 0.6664383411407471, - "learning_rate": 2.0876150876150875e-05, - "loss": 0.072, + "epoch": 2.5845532559313478, + "grad_norm": 0.46760818362236023, + "learning_rate": 1.4492680464411913e-05, + "loss": 0.0628, "step": 5120 }, { - "epoch": 1.5236115236115237, - "grad_norm": 0.38639238476753235, - "learning_rate": 2.0858330858330858e-05, - "loss": 0.0724, + "epoch": 2.589601211509339, + "grad_norm": 0.4708857834339142, + "learning_rate": 1.4462392730943969e-05, + "loss": 0.0656, "step": 5130 }, { - "epoch": 1.5265815265815266, - "grad_norm": 0.7060205936431885, - "learning_rate": 2.0840510840510843e-05, - "loss": 0.0665, + "epoch": 2.5946491670873297, + "grad_norm": 0.957336962223053, + "learning_rate": 1.4432104997476024e-05, + "loss": 0.0527, "step": 5140 }, { - "epoch": 1.5295515295515294, - "grad_norm": 0.674248993396759, - "learning_rate": 2.0822690822690825e-05, - "loss": 0.0769, + "epoch": 2.5996971226653205, + "grad_norm": 0.6079381704330444, + "learning_rate": 1.4401817264008077e-05, + "loss": 0.0499, "step": 5150 }, { - "epoch": 1.5325215325215327, - "grad_norm": 0.45502710342407227, - "learning_rate": 2.0804870804870808e-05, - "loss": 0.0808, + "epoch": 2.6047450782433117, + "grad_norm": 0.644965410232544, + "learning_rate": 1.437152953054013e-05, + "loss": 0.0567, "step": 5160 }, { - "epoch": 1.5354915354915355, - "grad_norm": 0.4794248938560486, - "learning_rate": 2.0787050787050786e-05, - "loss": 0.0701, + "epoch": 2.6097930338213025, + "grad_norm": 0.9058682322502136, + "learning_rate": 1.4341241797072186e-05, + "loss": 0.059, "step": 5170 }, { - "epoch": 1.5384615384615383, - "grad_norm": 0.6008143424987793, - "learning_rate": 2.076923076923077e-05, - "loss": 0.0697, + "epoch": 2.614840989399293, + "grad_norm": 0.6784061789512634, + "learning_rate": 1.4310954063604241e-05, + "loss": 0.0577, "step": 5180 }, { - "epoch": 1.5414315414315416, - "grad_norm": 0.5068689584732056, - "learning_rate": 2.075141075141075e-05, - "loss": 0.079, + "epoch": 2.6198889449772844, + "grad_norm": 0.7699759602546692, + "learning_rate": 1.4280666330136296e-05, + "loss": 0.056, "step": 5190 }, { - "epoch": 1.5444015444015444, - "grad_norm": 0.4885605275630951, - "learning_rate": 2.0733590733590733e-05, - "loss": 0.0749, + "epoch": 2.624936900555275, + "grad_norm": 1.0204094648361206, + "learning_rate": 1.425037859666835e-05, + "loss": 0.0595, "step": 5200 }, { - "epoch": 1.5473715473715473, - "grad_norm": 0.5522565841674805, - "learning_rate": 2.071577071577072e-05, - "loss": 0.071, + "epoch": 2.629984856133266, + "grad_norm": 0.3317660987377167, + "learning_rate": 1.4220090863200403e-05, + "loss": 0.0579, "step": 5210 }, { - "epoch": 1.5503415503415503, - "grad_norm": 0.32774174213409424, - "learning_rate": 2.06979506979507e-05, - "loss": 0.0556, + "epoch": 2.635032811711257, + "grad_norm": 0.7586853504180908, + "learning_rate": 1.4189803129732458e-05, + "loss": 0.0612, "step": 5220 }, { - "epoch": 1.5533115533115534, - "grad_norm": 0.5104330778121948, - "learning_rate": 2.0680130680130683e-05, - "loss": 0.0738, + "epoch": 2.640080767289248, + "grad_norm": 0.43295013904571533, + "learning_rate": 1.4159515396264513e-05, + "loss": 0.0584, "step": 5230 }, { - "epoch": 1.5562815562815562, - "grad_norm": 0.5387243628501892, - "learning_rate": 2.066231066231066e-05, - "loss": 0.0656, + "epoch": 2.6451287228672387, + "grad_norm": 0.9083705544471741, + "learning_rate": 1.4129227662796568e-05, + "loss": 0.0698, "step": 5240 }, { - "epoch": 1.5592515592515592, - "grad_norm": 0.49494025111198425, - "learning_rate": 2.0644490644490644e-05, - "loss": 0.0684, + "epoch": 2.65017667844523, + "grad_norm": 0.6299885511398315, + "learning_rate": 1.4098939929328622e-05, + "loss": 0.0602, "step": 5250 }, { - "epoch": 1.5622215622215623, - "grad_norm": 0.5351789593696594, - "learning_rate": 2.0626670626670626e-05, - "loss": 0.0629, + "epoch": 2.6552246340232206, + "grad_norm": 0.538589358329773, + "learning_rate": 1.4068652195860677e-05, + "loss": 0.0634, "step": 5260 }, { - "epoch": 1.565191565191565, - "grad_norm": 0.5836585760116577, - "learning_rate": 2.0608850608850608e-05, - "loss": 0.0725, + "epoch": 2.6602725896012114, + "grad_norm": 0.5712538361549377, + "learning_rate": 1.4038364462392732e-05, + "loss": 0.0625, "step": 5270 }, { - "epoch": 1.5681615681615682, - "grad_norm": 0.5254115462303162, - "learning_rate": 2.0591030591030594e-05, - "loss": 0.0617, + "epoch": 2.6653205451792026, + "grad_norm": 0.5739433765411377, + "learning_rate": 1.4008076728924786e-05, + "loss": 0.0647, "step": 5280 }, { - "epoch": 1.5711315711315712, - "grad_norm": 0.9055864810943604, - "learning_rate": 2.0573210573210576e-05, - "loss": 0.0702, + "epoch": 2.6703685007571933, + "grad_norm": 0.5050386786460876, + "learning_rate": 1.397778899545684e-05, + "loss": 0.0592, "step": 5290 }, { - "epoch": 1.574101574101574, - "grad_norm": 0.6344959139823914, - "learning_rate": 2.0555390555390558e-05, - "loss": 0.0571, + "epoch": 2.675416456335184, + "grad_norm": 0.41851407289505005, + "learning_rate": 1.3947501261988894e-05, + "loss": 0.0581, "step": 5300 }, { - "epoch": 1.577071577071577, - "grad_norm": 0.4069235622882843, - "learning_rate": 2.0537570537570537e-05, - "loss": 0.0562, + "epoch": 2.6804644119131753, + "grad_norm": 0.5866436958312988, + "learning_rate": 1.391721352852095e-05, + "loss": 0.0656, "step": 5310 }, { - "epoch": 1.5800415800415801, - "grad_norm": 0.4786476194858551, - "learning_rate": 2.051975051975052e-05, - "loss": 0.0661, + "epoch": 2.685512367491166, + "grad_norm": 0.47498345375061035, + "learning_rate": 1.3886925795053004e-05, + "loss": 0.0657, "step": 5320 }, { - "epoch": 1.583011583011583, - "grad_norm": 0.45690423250198364, - "learning_rate": 2.05019305019305e-05, - "loss": 0.0754, + "epoch": 2.690560323069157, + "grad_norm": 0.5748500227928162, + "learning_rate": 1.385663806158506e-05, + "loss": 0.0588, "step": 5330 }, { - "epoch": 1.585981585981586, - "grad_norm": 0.3506830036640167, - "learning_rate": 2.0484110484110483e-05, - "loss": 0.0671, + "epoch": 2.695608278647148, + "grad_norm": 0.685787558555603, + "learning_rate": 1.3826350328117113e-05, + "loss": 0.0621, "step": 5340 }, { - "epoch": 1.588951588951589, - "grad_norm": 0.6035703420639038, - "learning_rate": 2.046629046629047e-05, - "loss": 0.0697, + "epoch": 2.700656234225139, + "grad_norm": 0.5321753025054932, + "learning_rate": 1.3796062594649166e-05, + "loss": 0.0665, "step": 5350 }, { - "epoch": 1.5919215919215919, - "grad_norm": 0.5453073382377625, - "learning_rate": 2.044847044847045e-05, - "loss": 0.0782, + "epoch": 2.7057041898031295, + "grad_norm": 0.4687628746032715, + "learning_rate": 1.3765774861181222e-05, + "loss": 0.0622, "step": 5360 }, { - "epoch": 1.594891594891595, - "grad_norm": 0.5534022450447083, - "learning_rate": 2.0430650430650433e-05, - "loss": 0.058, + "epoch": 2.7107521453811207, + "grad_norm": 0.6931032538414001, + "learning_rate": 1.3735487127713277e-05, + "loss": 0.0542, "step": 5370 }, { - "epoch": 1.597861597861598, - "grad_norm": 0.6920284032821655, - "learning_rate": 2.0412830412830412e-05, - "loss": 0.0824, + "epoch": 2.7158001009591115, + "grad_norm": 0.6347541213035583, + "learning_rate": 1.3705199394245332e-05, + "loss": 0.0618, "step": 5380 }, { - "epoch": 1.6008316008316008, - "grad_norm": 0.4295329451560974, - "learning_rate": 2.0395010395010394e-05, - "loss": 0.0708, + "epoch": 2.7208480565371023, + "grad_norm": 0.5090097188949585, + "learning_rate": 1.3674911660777385e-05, + "loss": 0.0577, "step": 5390 }, { - "epoch": 1.6038016038016036, - "grad_norm": 0.6782526969909668, - "learning_rate": 2.0377190377190376e-05, - "loss": 0.0687, + "epoch": 2.7258960121150935, + "grad_norm": 0.557161808013916, + "learning_rate": 1.3644623927309439e-05, + "loss": 0.0485, "step": 5400 }, { - "epoch": 1.606771606771607, - "grad_norm": 0.37526410818099976, - "learning_rate": 2.035937035937036e-05, - "loss": 0.0706, + "epoch": 2.7309439676930842, + "grad_norm": 0.7229135036468506, + "learning_rate": 1.3614336193841494e-05, + "loss": 0.0642, "step": 5410 }, { - "epoch": 1.6097416097416097, - "grad_norm": 0.581466555595398, - "learning_rate": 2.0341550341550344e-05, - "loss": 0.0658, + "epoch": 2.735991923271075, + "grad_norm": 0.7802084684371948, + "learning_rate": 1.3584048460373549e-05, + "loss": 0.0721, "step": 5420 }, { - "epoch": 1.6127116127116126, - "grad_norm": 0.6016833186149597, - "learning_rate": 2.0323730323730326e-05, - "loss": 0.0778, + "epoch": 2.741039878849066, + "grad_norm": 0.8350520730018616, + "learning_rate": 1.3553760726905604e-05, + "loss": 0.05, "step": 5430 }, { - "epoch": 1.6156816156816158, - "grad_norm": 0.46572285890579224, - "learning_rate": 2.0305910305910308e-05, - "loss": 0.0704, + "epoch": 2.746087834427057, + "grad_norm": 0.24809196591377258, + "learning_rate": 1.3523472993437658e-05, + "loss": 0.0577, "step": 5440 }, { - "epoch": 1.6186516186516187, - "grad_norm": 0.4586747884750366, - "learning_rate": 2.0288090288090287e-05, - "loss": 0.0582, + "epoch": 2.7511357900050477, + "grad_norm": 0.5501554608345032, + "learning_rate": 1.3493185259969713e-05, + "loss": 0.0613, "step": 5450 }, { - "epoch": 1.6216216216216215, - "grad_norm": 0.5045327544212341, - "learning_rate": 2.027027027027027e-05, - "loss": 0.0774, + "epoch": 2.756183745583039, + "grad_norm": 0.6459994912147522, + "learning_rate": 1.3462897526501768e-05, + "loss": 0.0545, "step": 5460 }, { - "epoch": 1.6245916245916245, - "grad_norm": 0.4661884307861328, - "learning_rate": 2.025245025245025e-05, - "loss": 0.0706, + "epoch": 2.7612317011610297, + "grad_norm": 1.0892735719680786, + "learning_rate": 1.3432609793033821e-05, + "loss": 0.0517, "step": 5470 }, { - "epoch": 1.6275616275616276, - "grad_norm": 0.3280268609523773, - "learning_rate": 2.0234630234630234e-05, - "loss": 0.0663, + "epoch": 2.7662796567390204, + "grad_norm": 0.8553361296653748, + "learning_rate": 1.3402322059565877e-05, + "loss": 0.055, "step": 5480 }, { - "epoch": 1.6305316305316304, - "grad_norm": 0.4486147165298462, - "learning_rate": 2.021681021681022e-05, - "loss": 0.0698, + "epoch": 2.7713276123170116, + "grad_norm": 0.5909534692764282, + "learning_rate": 1.337203432609793e-05, + "loss": 0.0583, "step": 5490 }, { - "epoch": 1.6335016335016335, - "grad_norm": 0.5801326036453247, - "learning_rate": 2.01989901989902e-05, - "loss": 0.0808, + "epoch": 2.7763755678950024, + "grad_norm": 0.3620651662349701, + "learning_rate": 1.3341746592629985e-05, + "loss": 0.053, "step": 5500 }, { - "epoch": 1.6364716364716365, - "grad_norm": 0.43352991342544556, - "learning_rate": 2.0181170181170183e-05, - "loss": 0.0522, + "epoch": 2.7814235234729936, + "grad_norm": 0.6525430083274841, + "learning_rate": 1.331145885916204e-05, + "loss": 0.0667, "step": 5510 }, { - "epoch": 1.6394416394416393, - "grad_norm": 0.5242543816566467, - "learning_rate": 2.0163350163350162e-05, - "loss": 0.0744, + "epoch": 2.7864714790509844, + "grad_norm": 0.6129066944122314, + "learning_rate": 1.3281171125694095e-05, + "loss": 0.0578, "step": 5520 }, { - "epoch": 1.6424116424116424, - "grad_norm": 0.5735893249511719, - "learning_rate": 2.0145530145530144e-05, - "loss": 0.0663, + "epoch": 2.791519434628975, + "grad_norm": 0.6374188661575317, + "learning_rate": 1.3250883392226147e-05, + "loss": 0.0598, "step": 5530 }, { - "epoch": 1.6453816453816454, - "grad_norm": 0.3472582697868347, - "learning_rate": 2.0127710127710127e-05, - "loss": 0.0685, + "epoch": 2.7965673902069663, + "grad_norm": 0.6404274702072144, + "learning_rate": 1.3220595658758202e-05, + "loss": 0.064, "step": 5540 }, { - "epoch": 1.6483516483516483, - "grad_norm": 0.4069629907608032, - "learning_rate": 2.010989010989011e-05, - "loss": 0.0619, + "epoch": 2.801615345784957, + "grad_norm": 0.3882500231266022, + "learning_rate": 1.3190307925290257e-05, + "loss": 0.0556, "step": 5550 }, { - "epoch": 1.6513216513216513, - "grad_norm": 0.5932218432426453, - "learning_rate": 2.0092070092070094e-05, - "loss": 0.0711, + "epoch": 2.8066633013629483, + "grad_norm": 0.827498197555542, + "learning_rate": 1.3160020191822313e-05, + "loss": 0.056, "step": 5560 }, { - "epoch": 1.6542916542916544, - "grad_norm": 0.7638351321220398, - "learning_rate": 2.0074250074250076e-05, - "loss": 0.0837, + "epoch": 2.811711256940939, + "grad_norm": 0.5474889874458313, + "learning_rate": 1.3129732458354368e-05, + "loss": 0.0559, "step": 5570 }, { - "epoch": 1.6572616572616572, - "grad_norm": 0.7104766368865967, - "learning_rate": 2.005643005643006e-05, - "loss": 0.0659, + "epoch": 2.81675921251893, + "grad_norm": 0.7505003809928894, + "learning_rate": 1.3099444724886421e-05, + "loss": 0.0562, "step": 5580 }, { - "epoch": 1.6602316602316602, - "grad_norm": 0.6623921394348145, - "learning_rate": 2.0038610038610037e-05, - "loss": 0.0693, + "epoch": 2.821807168096921, + "grad_norm": 0.7723977565765381, + "learning_rate": 1.3069156991418476e-05, + "loss": 0.0711, "step": 5590 }, { - "epoch": 1.6632016632016633, - "grad_norm": 0.5632063746452332, - "learning_rate": 2.002079002079002e-05, - "loss": 0.0647, + "epoch": 2.8268551236749118, + "grad_norm": 0.5930567979812622, + "learning_rate": 1.303886925795053e-05, + "loss": 0.0666, "step": 5600 }, { - "epoch": 1.6661716661716661, - "grad_norm": 0.36101019382476807, - "learning_rate": 2.0002970002970002e-05, - "loss": 0.0724, + "epoch": 2.8319030792529025, + "grad_norm": 0.9205801486968994, + "learning_rate": 1.3008581524482585e-05, + "loss": 0.0635, "step": 5610 }, { - "epoch": 1.6691416691416692, - "grad_norm": 0.4157385230064392, - "learning_rate": 1.9985149985149984e-05, - "loss": 0.0609, + "epoch": 2.8369510348308937, + "grad_norm": 0.6520891189575195, + "learning_rate": 1.297829379101464e-05, + "loss": 0.0503, "step": 5620 }, { - "epoch": 1.6721116721116722, - "grad_norm": 0.4751082956790924, - "learning_rate": 1.996732996732997e-05, - "loss": 0.0681, + "epoch": 2.8419989904088845, + "grad_norm": 0.697742760181427, + "learning_rate": 1.2948006057546693e-05, + "loss": 0.0527, "step": 5630 }, { - "epoch": 1.675081675081675, - "grad_norm": 0.5545091032981873, - "learning_rate": 1.994950994950995e-05, - "loss": 0.0726, + "epoch": 2.8470469459868752, + "grad_norm": 0.5600337386131287, + "learning_rate": 1.2917718324078749e-05, + "loss": 0.0658, "step": 5640 }, { - "epoch": 1.678051678051678, - "grad_norm": 0.7477673888206482, - "learning_rate": 1.9931689931689934e-05, - "loss": 0.0707, + "epoch": 2.8520949015648664, + "grad_norm": 0.7648780941963196, + "learning_rate": 1.2887430590610804e-05, + "loss": 0.0503, "step": 5650 }, { - "epoch": 1.6810216810216811, - "grad_norm": 0.8139877915382385, - "learning_rate": 1.9913869913869913e-05, - "loss": 0.0708, + "epoch": 2.857142857142857, + "grad_norm": 0.44580090045928955, + "learning_rate": 1.2857142857142857e-05, + "loss": 0.0569, "step": 5660 }, { - "epoch": 1.683991683991684, - "grad_norm": 0.26891762018203735, - "learning_rate": 1.9896049896049895e-05, - "loss": 0.0703, + "epoch": 2.862190812720848, + "grad_norm": 0.6274628043174744, + "learning_rate": 1.2826855123674912e-05, + "loss": 0.0544, "step": 5670 }, { - "epoch": 1.6869616869616868, - "grad_norm": 0.47424066066741943, - "learning_rate": 1.9878229878229877e-05, - "loss": 0.0772, + "epoch": 2.867238768298839, + "grad_norm": 0.5967713594436646, + "learning_rate": 1.2796567390206966e-05, + "loss": 0.049, "step": 5680 }, { - "epoch": 1.68993168993169, - "grad_norm": 0.41330039501190186, - "learning_rate": 1.986040986040986e-05, - "loss": 0.078, + "epoch": 2.87228672387683, + "grad_norm": 0.49563518166542053, + "learning_rate": 1.2766279656739021e-05, + "loss": 0.0637, "step": 5690 }, { - "epoch": 1.692901692901693, - "grad_norm": 0.45802241563796997, - "learning_rate": 1.9842589842589845e-05, - "loss": 0.0588, + "epoch": 2.8773346794548207, + "grad_norm": 0.5065841674804688, + "learning_rate": 1.2735991923271076e-05, + "loss": 0.0635, "step": 5700 }, { - "epoch": 1.6958716958716957, - "grad_norm": 0.5270569920539856, - "learning_rate": 1.9824769824769827e-05, - "loss": 0.066, + "epoch": 2.882382635032812, + "grad_norm": 0.4228837490081787, + "learning_rate": 1.2705704189803131e-05, + "loss": 0.0561, "step": 5710 }, { - "epoch": 1.698841698841699, - "grad_norm": 0.5334698557853699, - "learning_rate": 1.980694980694981e-05, - "loss": 0.0795, + "epoch": 2.8874305906108026, + "grad_norm": 0.36254429817199707, + "learning_rate": 1.2675416456335183e-05, + "loss": 0.0564, "step": 5720 }, { - "epoch": 1.7018117018117018, - "grad_norm": 0.4093966484069824, - "learning_rate": 1.978912978912979e-05, - "loss": 0.0752, + "epoch": 2.8924785461887934, + "grad_norm": 0.6964749097824097, + "learning_rate": 1.2645128722867238e-05, + "loss": 0.0566, "step": 5730 }, { - "epoch": 1.7047817047817047, - "grad_norm": 0.5499134659767151, - "learning_rate": 1.977130977130977e-05, - "loss": 0.071, + "epoch": 2.8975265017667846, + "grad_norm": 1.2399131059646606, + "learning_rate": 1.2614840989399293e-05, + "loss": 0.0528, "step": 5740 }, { - "epoch": 1.7077517077517077, - "grad_norm": 0.5507758259773254, - "learning_rate": 1.9753489753489752e-05, - "loss": 0.0762, + "epoch": 2.9025744573447754, + "grad_norm": 0.45011046528816223, + "learning_rate": 1.2584553255931348e-05, + "loss": 0.0605, "step": 5750 }, { - "epoch": 1.7107217107217108, - "grad_norm": 0.726193904876709, - "learning_rate": 1.9735669735669734e-05, - "loss": 0.073, + "epoch": 2.907622412922766, + "grad_norm": 0.6450422406196594, + "learning_rate": 1.2554265522463404e-05, + "loss": 0.0579, "step": 5760 }, { - "epoch": 1.7136917136917136, - "grad_norm": 0.499423086643219, - "learning_rate": 1.971784971784972e-05, - "loss": 0.0669, + "epoch": 2.9126703685007573, + "grad_norm": 0.6685008406639099, + "learning_rate": 1.2523977788995457e-05, + "loss": 0.0596, "step": 5770 }, { - "epoch": 1.7166617166617166, - "grad_norm": 0.4177100956439972, - "learning_rate": 1.9700029700029702e-05, - "loss": 0.0643, + "epoch": 2.917718324078748, + "grad_norm": 0.7710725665092468, + "learning_rate": 1.2493690055527512e-05, + "loss": 0.063, "step": 5780 }, { - "epoch": 1.7196317196317197, - "grad_norm": 0.7960310578346252, - "learning_rate": 1.9682209682209684e-05, - "loss": 0.0724, + "epoch": 2.922766279656739, + "grad_norm": 0.6229269504547119, + "learning_rate": 1.2463402322059566e-05, + "loss": 0.0542, "step": 5790 }, { - "epoch": 1.7226017226017225, - "grad_norm": 0.4406733512878418, - "learning_rate": 1.9664389664389666e-05, - "loss": 0.0776, + "epoch": 2.92781423523473, + "grad_norm": 0.41364407539367676, + "learning_rate": 1.243311458859162e-05, + "loss": 0.0588, "step": 5800 }, { - "epoch": 1.7255717255717256, - "grad_norm": 0.530737042427063, - "learning_rate": 1.9646569646569645e-05, - "loss": 0.0693, + "epoch": 2.932862190812721, + "grad_norm": 0.5546961426734924, + "learning_rate": 1.2402826855123676e-05, + "loss": 0.0607, "step": 5810 }, { - "epoch": 1.7285417285417286, - "grad_norm": 0.29855164885520935, - "learning_rate": 1.9628749628749627e-05, - "loss": 0.0719, + "epoch": 2.9379101463907116, + "grad_norm": 0.6814476251602173, + "learning_rate": 1.237253912165573e-05, + "loss": 0.0587, "step": 5820 }, { - "epoch": 1.7315117315117314, - "grad_norm": 0.5606129765510559, - "learning_rate": 1.961092961092961e-05, - "loss": 0.0773, + "epoch": 2.9429581019687028, + "grad_norm": 0.7745892405509949, + "learning_rate": 1.2342251388187784e-05, + "loss": 0.0484, "step": 5830 }, { - "epoch": 1.7344817344817345, - "grad_norm": 0.4716852307319641, - "learning_rate": 1.9593109593109595e-05, - "loss": 0.0699, + "epoch": 2.9480060575466935, + "grad_norm": 0.9947149157524109, + "learning_rate": 1.231196365471984e-05, + "loss": 0.056, "step": 5840 }, { - "epoch": 1.7374517374517375, - "grad_norm": 0.39249199628829956, - "learning_rate": 1.9575289575289577e-05, - "loss": 0.078, + "epoch": 2.9530540131246843, + "grad_norm": 0.599892258644104, + "learning_rate": 1.2281675921251893e-05, + "loss": 0.0603, "step": 5850 }, { - "epoch": 1.7404217404217404, - "grad_norm": 0.5014438629150391, - "learning_rate": 1.955746955746956e-05, - "loss": 0.0753, + "epoch": 2.9581019687026755, + "grad_norm": 0.4991750121116638, + "learning_rate": 1.2251388187783947e-05, + "loss": 0.0603, "step": 5860 }, { - "epoch": 1.7433917433917434, - "grad_norm": 0.535271167755127, - "learning_rate": 1.953964953964954e-05, - "loss": 0.0722, + "epoch": 2.9631499242806663, + "grad_norm": 0.44697603583335876, + "learning_rate": 1.2221100454316002e-05, + "loss": 0.0614, "step": 5870 }, { - "epoch": 1.7463617463617465, - "grad_norm": 0.3693440854549408, - "learning_rate": 1.952182952182952e-05, - "loss": 0.0824, + "epoch": 2.968197879858657, + "grad_norm": 0.34608447551727295, + "learning_rate": 1.2190812720848057e-05, + "loss": 0.0633, "step": 5880 }, { - "epoch": 1.7493317493317493, - "grad_norm": 0.6997837424278259, - "learning_rate": 1.9504009504009503e-05, - "loss": 0.0719, + "epoch": 2.973245835436648, + "grad_norm": 0.6991161108016968, + "learning_rate": 1.2160524987380112e-05, + "loss": 0.0713, "step": 5890 }, { - "epoch": 1.7523017523017523, - "grad_norm": 0.3417227864265442, - "learning_rate": 1.9486189486189485e-05, - "loss": 0.0701, + "epoch": 2.978293791014639, + "grad_norm": 0.7053156495094299, + "learning_rate": 1.2130237253912167e-05, + "loss": 0.0642, "step": 5900 }, { - "epoch": 1.7552717552717554, - "grad_norm": 1.077194094657898, - "learning_rate": 1.946836946836947e-05, - "loss": 0.0787, + "epoch": 2.9833417465926297, + "grad_norm": 0.4541454315185547, + "learning_rate": 1.209994952044422e-05, + "loss": 0.0583, "step": 5910 }, { - "epoch": 1.7582417582417582, - "grad_norm": 0.7957248687744141, - "learning_rate": 1.9450549450549452e-05, - "loss": 0.0878, + "epoch": 2.988389702170621, + "grad_norm": 0.5963706970214844, + "learning_rate": 1.2069661786976274e-05, + "loss": 0.0551, "step": 5920 }, { - "epoch": 1.7612117612117613, - "grad_norm": 0.7661225199699402, - "learning_rate": 1.9432729432729435e-05, - "loss": 0.0701, + "epoch": 2.9934376577486117, + "grad_norm": 0.37611526250839233, + "learning_rate": 1.2039374053508329e-05, + "loss": 0.0551, "step": 5930 }, { - "epoch": 1.7641817641817643, - "grad_norm": 0.4629841446876526, - "learning_rate": 1.9414909414909417e-05, - "loss": 0.0704, + "epoch": 2.9984856133266025, + "grad_norm": 0.5949448943138123, + "learning_rate": 1.2009086320040384e-05, + "loss": 0.0615, "step": 5940 }, { - "epoch": 1.7671517671517671, - "grad_norm": 0.9389346241950989, - "learning_rate": 1.9397089397089396e-05, - "loss": 0.076, + "epoch": 3.0, + "eval_f1": 0.9705180789481339, + "eval_loss": 0.04155249148607254, + "eval_runtime": 582.0561, + "eval_samples_per_second": 354.368, + "eval_steps_per_second": 2.769, + "step": 5943 + }, + { + "epoch": 3.0035335689045937, + "grad_norm": 0.732612133026123, + "learning_rate": 1.197879858657244e-05, + "loss": 0.0473, "step": 5950 }, { - "epoch": 1.77012177012177, - "grad_norm": 0.3709728717803955, - "learning_rate": 1.9379269379269378e-05, - "loss": 0.064, + "epoch": 3.0085815244825844, + "grad_norm": 0.8803137540817261, + "learning_rate": 1.1948510853104493e-05, + "loss": 0.0513, "step": 5960 }, { - "epoch": 1.7730917730917732, - "grad_norm": 0.4123302102088928, - "learning_rate": 1.936144936144936e-05, - "loss": 0.0762, + "epoch": 3.0136294800605756, + "grad_norm": 0.5578094720840454, + "learning_rate": 1.1918223119636548e-05, + "loss": 0.0603, "step": 5970 }, { - "epoch": 1.776061776061776, - "grad_norm": 0.5153429508209229, - "learning_rate": 1.9343629343629345e-05, - "loss": 0.0825, + "epoch": 3.0186774356385664, + "grad_norm": 0.9948665499687195, + "learning_rate": 1.1887935386168601e-05, + "loss": 0.0592, "step": 5980 }, { - "epoch": 1.779031779031779, - "grad_norm": 0.2630942761898041, - "learning_rate": 1.9325809325809328e-05, - "loss": 0.0578, + "epoch": 3.023725391216557, + "grad_norm": 0.6967259049415588, + "learning_rate": 1.1857647652700657e-05, + "loss": 0.0741, "step": 5990 }, { - "epoch": 1.7820017820017822, - "grad_norm": 0.4419863522052765, - "learning_rate": 1.930798930798931e-05, - "loss": 0.0548, + "epoch": 3.0287733467945483, + "grad_norm": 0.48011064529418945, + "learning_rate": 1.182735991923271e-05, + "loss": 0.055, "step": 6000 }, { - "epoch": 1.784971784971785, - "grad_norm": 0.46090295910835266, - "learning_rate": 1.9290169290169292e-05, - "loss": 0.073, + "epoch": 3.033821302372539, + "grad_norm": 0.663847804069519, + "learning_rate": 1.1797072185764765e-05, + "loss": 0.0591, "step": 6010 }, { - "epoch": 1.7879417879417878, - "grad_norm": 1.1012392044067383, - "learning_rate": 1.927234927234927e-05, - "loss": 0.0725, + "epoch": 3.03886925795053, + "grad_norm": 0.589154839515686, + "learning_rate": 1.176678445229682e-05, + "loss": 0.0508, "step": 6020 }, { - "epoch": 1.7909117909117909, - "grad_norm": 0.422880083322525, - "learning_rate": 1.9254529254529253e-05, - "loss": 0.0674, + "epoch": 3.043917213528521, + "grad_norm": 0.7075181007385254, + "learning_rate": 1.1736496718828875e-05, + "loss": 0.0493, "step": 6030 }, { - "epoch": 1.793881793881794, - "grad_norm": 0.6051161885261536, - "learning_rate": 1.9236709236709235e-05, - "loss": 0.0669, + "epoch": 3.048965169106512, + "grad_norm": 0.6230030655860901, + "learning_rate": 1.1706208985360929e-05, + "loss": 0.0589, "step": 6040 }, { - "epoch": 1.7968517968517967, - "grad_norm": 0.351578027009964, - "learning_rate": 1.921888921888922e-05, - "loss": 0.069, + "epoch": 3.0540131246845026, + "grad_norm": 0.6204888820648193, + "learning_rate": 1.1675921251892982e-05, + "loss": 0.0602, "step": 6050 }, { - "epoch": 1.7998217998217998, - "grad_norm": 0.606691300868988, - "learning_rate": 1.9201069201069203e-05, - "loss": 0.0641, + "epoch": 3.059061080262494, + "grad_norm": 0.456939160823822, + "learning_rate": 1.1645633518425038e-05, + "loss": 0.059, "step": 6060 }, { - "epoch": 1.8027918027918028, - "grad_norm": 0.8968992829322815, - "learning_rate": 1.9183249183249185e-05, - "loss": 0.0734, + "epoch": 3.0641090358404846, + "grad_norm": 0.7607660889625549, + "learning_rate": 1.1615345784957093e-05, + "loss": 0.0488, "step": 6070 }, { - "epoch": 1.8057618057618057, - "grad_norm": 0.5204905867576599, - "learning_rate": 1.9165429165429167e-05, - "loss": 0.0741, + "epoch": 3.0691569914184753, + "grad_norm": 1.2064040899276733, + "learning_rate": 1.1585058051489148e-05, + "loss": 0.0695, "step": 6080 }, { - "epoch": 1.8087318087318087, - "grad_norm": 0.6135872602462769, - "learning_rate": 1.9147609147609146e-05, - "loss": 0.0791, + "epoch": 3.0742049469964665, + "grad_norm": 0.5143324732780457, + "learning_rate": 1.1554770318021203e-05, + "loss": 0.0606, "step": 6090 }, { - "epoch": 1.8117018117018118, - "grad_norm": 0.5273720622062683, - "learning_rate": 1.9129789129789128e-05, - "loss": 0.0655, + "epoch": 3.0792529025744573, + "grad_norm": 0.6567758917808533, + "learning_rate": 1.1524482584553256e-05, + "loss": 0.0581, "step": 6100 }, { - "epoch": 1.8146718146718146, - "grad_norm": 0.4117693305015564, - "learning_rate": 1.911196911196911e-05, - "loss": 0.0658, + "epoch": 3.0843008581524485, + "grad_norm": 0.7469787001609802, + "learning_rate": 1.149419485108531e-05, + "loss": 0.0535, "step": 6110 }, { - "epoch": 1.8176418176418176, - "grad_norm": 0.5177286267280579, - "learning_rate": 1.9094149094149096e-05, - "loss": 0.0773, + "epoch": 3.0893488137304392, + "grad_norm": 0.40161028504371643, + "learning_rate": 1.1463907117617365e-05, + "loss": 0.056, "step": 6120 }, { - "epoch": 1.8206118206118207, - "grad_norm": 0.5179166793823242, - "learning_rate": 1.9076329076329078e-05, - "loss": 0.06, + "epoch": 3.09439676930843, + "grad_norm": 0.7404605150222778, + "learning_rate": 1.143361938414942e-05, + "loss": 0.0471, "step": 6130 }, { - "epoch": 1.8235818235818235, - "grad_norm": 0.48499223589897156, - "learning_rate": 1.905850905850906e-05, - "loss": 0.0876, + "epoch": 3.099444724886421, + "grad_norm": 0.8587531447410583, + "learning_rate": 1.1403331650681475e-05, + "loss": 0.0558, "step": 6140 }, { - "epoch": 1.8265518265518266, - "grad_norm": 0.5573757886886597, - "learning_rate": 1.9040689040689042e-05, - "loss": 0.0684, + "epoch": 3.104492680464412, + "grad_norm": 0.424450159072876, + "learning_rate": 1.1373043917213529e-05, + "loss": 0.0558, "step": 6150 }, { - "epoch": 1.8295218295218296, - "grad_norm": 0.481963574886322, - "learning_rate": 1.902286902286902e-05, - "loss": 0.0807, + "epoch": 3.1095406360424027, + "grad_norm": 0.9383788704872131, + "learning_rate": 1.1342756183745584e-05, + "loss": 0.0517, "step": 6160 }, { - "epoch": 1.8324918324918325, - "grad_norm": 0.4293064475059509, - "learning_rate": 1.9005049005049003e-05, - "loss": 0.0779, + "epoch": 3.114588591620394, + "grad_norm": 0.8069589734077454, + "learning_rate": 1.1312468450277637e-05, + "loss": 0.0588, "step": 6170 }, { - "epoch": 1.8354618354618355, - "grad_norm": 0.4655805826187134, - "learning_rate": 1.8987228987228986e-05, - "loss": 0.0782, + "epoch": 3.1196365471983847, + "grad_norm": 0.8677689433097839, + "learning_rate": 1.1282180716809692e-05, + "loss": 0.0611, "step": 6180 }, { - "epoch": 1.8384318384318385, - "grad_norm": 0.5430210828781128, - "learning_rate": 1.896940896940897e-05, - "loss": 0.0601, + "epoch": 3.1246845027763754, + "grad_norm": 0.7949932813644409, + "learning_rate": 1.1251892983341746e-05, + "loss": 0.0553, "step": 6190 }, { - "epoch": 1.8414018414018414, - "grad_norm": 0.9118245244026184, - "learning_rate": 1.8951588951588953e-05, - "loss": 0.076, + "epoch": 3.1297324583543666, + "grad_norm": 0.6563514471054077, + "learning_rate": 1.1221605249873801e-05, + "loss": 0.0549, "step": 6200 }, { - "epoch": 1.8443718443718444, - "grad_norm": 0.3974968194961548, - "learning_rate": 1.8933768933768935e-05, - "loss": 0.0742, + "epoch": 3.1347804139323574, + "grad_norm": 0.5856168866157532, + "learning_rate": 1.1191317516405856e-05, + "loss": 0.0585, "step": 6210 }, { - "epoch": 1.8473418473418475, - "grad_norm": 0.393530935049057, - "learning_rate": 1.8915948915948918e-05, - "loss": 0.0755, + "epoch": 3.139828369510348, + "grad_norm": 0.6840217709541321, + "learning_rate": 1.1161029782937911e-05, + "loss": 0.0683, "step": 6220 }, { - "epoch": 1.8503118503118503, - "grad_norm": 0.6730740070343018, - "learning_rate": 1.8898128898128896e-05, - "loss": 0.0675, + "epoch": 3.1448763250883394, + "grad_norm": 1.310652494430542, + "learning_rate": 1.1130742049469966e-05, + "loss": 0.057, "step": 6230 }, { - "epoch": 1.8532818532818531, - "grad_norm": 0.5142623782157898, - "learning_rate": 1.888030888030888e-05, - "loss": 0.0694, + "epoch": 3.14992428066633, + "grad_norm": 0.6700050830841064, + "learning_rate": 1.1100454316002018e-05, + "loss": 0.0562, "step": 6240 }, { - "epoch": 1.8562518562518564, - "grad_norm": 0.344099223613739, - "learning_rate": 1.886248886248886e-05, - "loss": 0.0591, + "epoch": 3.154972236244321, + "grad_norm": 0.5210493803024292, + "learning_rate": 1.1070166582534073e-05, + "loss": 0.0545, "step": 6250 }, { - "epoch": 1.8592218592218592, - "grad_norm": 0.5664836168289185, - "learning_rate": 1.8844668844668846e-05, - "loss": 0.0725, + "epoch": 3.160020191822312, + "grad_norm": 0.44693487882614136, + "learning_rate": 1.1039878849066128e-05, + "loss": 0.0614, "step": 6260 }, { - "epoch": 1.862191862191862, - "grad_norm": 0.2773604989051819, - "learning_rate": 1.882684882684883e-05, - "loss": 0.0653, + "epoch": 3.165068147400303, + "grad_norm": 0.8827401995658875, + "learning_rate": 1.1009591115598184e-05, + "loss": 0.06, "step": 6270 }, { - "epoch": 1.865161865161865, - "grad_norm": 0.35496875643730164, - "learning_rate": 1.880902880902881e-05, - "loss": 0.0711, + "epoch": 3.1701161029782936, + "grad_norm": 0.29074421525001526, + "learning_rate": 1.0979303382130239e-05, + "loss": 0.059, "step": 6280 }, { - "epoch": 1.8681318681318682, - "grad_norm": 0.2887316644191742, - "learning_rate": 1.8791208791208793e-05, - "loss": 0.0661, + "epoch": 3.175164058556285, + "grad_norm": 0.8659618496894836, + "learning_rate": 1.0949015648662292e-05, + "loss": 0.0541, "step": 6290 }, { - "epoch": 1.871101871101871, - "grad_norm": 0.5518425107002258, - "learning_rate": 1.8773388773388775e-05, - "loss": 0.0663, + "epoch": 3.1802120141342756, + "grad_norm": 0.8624622821807861, + "learning_rate": 1.0918727915194346e-05, + "loss": 0.0661, "step": 6300 }, { - "epoch": 1.874071874071874, - "grad_norm": 0.6332146525382996, - "learning_rate": 1.8755568755568754e-05, - "loss": 0.079, + "epoch": 3.1852599697122663, + "grad_norm": 0.6411763429641724, + "learning_rate": 1.08884401817264e-05, + "loss": 0.0642, "step": 6310 }, { - "epoch": 1.877041877041877, - "grad_norm": 0.49415746331214905, - "learning_rate": 1.8737748737748736e-05, - "loss": 0.071, + "epoch": 3.1903079252902575, + "grad_norm": 0.5271298289299011, + "learning_rate": 1.0858152448258456e-05, + "loss": 0.0552, "step": 6320 }, { - "epoch": 1.88001188001188, - "grad_norm": 0.6736321449279785, - "learning_rate": 1.871992871992872e-05, - "loss": 0.0749, + "epoch": 3.1953558808682483, + "grad_norm": 0.9701720476150513, + "learning_rate": 1.082786471479051e-05, + "loss": 0.0586, "step": 6330 }, { - "epoch": 1.882981882981883, - "grad_norm": 0.9153728485107422, - "learning_rate": 1.8702108702108704e-05, - "loss": 0.0689, + "epoch": 3.200403836446239, + "grad_norm": 0.5633390545845032, + "learning_rate": 1.0797576981322565e-05, + "loss": 0.0554, "step": 6340 }, { - "epoch": 1.885951885951886, - "grad_norm": 0.3608382046222687, - "learning_rate": 1.8684288684288686e-05, - "loss": 0.064, + "epoch": 3.2054517920242303, + "grad_norm": 0.45846840739250183, + "learning_rate": 1.076728924785462e-05, + "loss": 0.0582, "step": 6350 }, { - "epoch": 1.8889218889218888, - "grad_norm": 0.3779090344905853, - "learning_rate": 1.8666468666468668e-05, - "loss": 0.072, + "epoch": 3.210499747602221, + "grad_norm": 0.43338650465011597, + "learning_rate": 1.0737001514386673e-05, + "loss": 0.0588, "step": 6360 }, { - "epoch": 1.8918918918918919, - "grad_norm": 0.5436325669288635, - "learning_rate": 1.864864864864865e-05, - "loss": 0.0738, + "epoch": 3.215547703180212, + "grad_norm": 0.8287716507911682, + "learning_rate": 1.0706713780918728e-05, + "loss": 0.053, "step": 6370 }, { - "epoch": 1.894861894861895, - "grad_norm": 0.720585823059082, - "learning_rate": 1.863082863082863e-05, - "loss": 0.0651, + "epoch": 3.220595658758203, + "grad_norm": 0.5174350142478943, + "learning_rate": 1.0676426047450782e-05, + "loss": 0.0587, "step": 6380 }, { - "epoch": 1.8978318978318978, - "grad_norm": 0.3137255609035492, - "learning_rate": 1.861300861300861e-05, - "loss": 0.0634, + "epoch": 3.2256436143361937, + "grad_norm": 0.47460228204727173, + "learning_rate": 1.0646138313982837e-05, + "loss": 0.0598, "step": 6390 }, { - "epoch": 1.9008019008019008, - "grad_norm": 0.6744168400764465, - "learning_rate": 1.8595188595188597e-05, - "loss": 0.0652, + "epoch": 3.230691569914185, + "grad_norm": 0.49122539162635803, + "learning_rate": 1.0615850580514892e-05, + "loss": 0.0535, "step": 6400 }, { - "epoch": 1.9037719037719039, - "grad_norm": 0.33474233746528625, - "learning_rate": 1.857736857736858e-05, - "loss": 0.0663, + "epoch": 3.2357395254921757, + "grad_norm": 0.5462148189544678, + "learning_rate": 1.0585562847046947e-05, + "loss": 0.0518, "step": 6410 }, { - "epoch": 1.9067419067419067, - "grad_norm": 0.3249685764312744, - "learning_rate": 1.855954855954856e-05, - "loss": 0.0772, + "epoch": 3.2407874810701665, + "grad_norm": 0.7671846747398376, + "learning_rate": 1.0555275113579002e-05, + "loss": 0.0611, "step": 6420 }, { - "epoch": 1.9097119097119097, - "grad_norm": 0.8481233716011047, - "learning_rate": 1.8541728541728543e-05, - "loss": 0.0715, + "epoch": 3.2458354366481577, + "grad_norm": 0.6748913526535034, + "learning_rate": 1.0524987380111054e-05, + "loss": 0.0561, "step": 6430 }, { - "epoch": 1.9126819126819128, - "grad_norm": 0.40865710377693176, - "learning_rate": 1.8523908523908525e-05, - "loss": 0.0672, + "epoch": 3.2508833922261484, + "grad_norm": 0.5004613399505615, + "learning_rate": 1.049469964664311e-05, + "loss": 0.0534, "step": 6440 }, { - "epoch": 1.9156519156519156, - "grad_norm": 0.40034782886505127, - "learning_rate": 1.8506088506088504e-05, - "loss": 0.0599, + "epoch": 3.255931347804139, + "grad_norm": 0.4895551800727844, + "learning_rate": 1.0464411913175164e-05, + "loss": 0.0459, "step": 6450 }, { - "epoch": 1.9186219186219187, - "grad_norm": 0.37332504987716675, - "learning_rate": 1.8488268488268486e-05, - "loss": 0.0726, + "epoch": 3.2609793033821304, + "grad_norm": 0.47480469942092896, + "learning_rate": 1.043412417970722e-05, + "loss": 0.0601, "step": 6460 }, { - "epoch": 1.9215919215919217, - "grad_norm": 0.39983534812927246, - "learning_rate": 1.8470448470448472e-05, - "loss": 0.0673, + "epoch": 3.266027258960121, + "grad_norm": 0.4885694086551666, + "learning_rate": 1.0403836446239273e-05, + "loss": 0.0598, "step": 6470 }, { - "epoch": 1.9245619245619245, - "grad_norm": 0.3581918776035309, - "learning_rate": 1.8452628452628454e-05, - "loss": 0.0644, + "epoch": 3.271075214538112, + "grad_norm": 0.6375486254692078, + "learning_rate": 1.0373548712771328e-05, + "loss": 0.0602, "step": 6480 }, { - "epoch": 1.9275319275319274, - "grad_norm": 0.4143809676170349, - "learning_rate": 1.8434808434808436e-05, - "loss": 0.0582, + "epoch": 3.276123170116103, + "grad_norm": 0.7264606356620789, + "learning_rate": 1.0343260979303382e-05, + "loss": 0.0579, "step": 6490 }, { - "epoch": 1.9305019305019306, - "grad_norm": 0.42415744066238403, - "learning_rate": 1.841698841698842e-05, - "loss": 0.067, + "epoch": 3.281171125694094, + "grad_norm": 0.5704456567764282, + "learning_rate": 1.0312973245835437e-05, + "loss": 0.056, "step": 6500 }, { - "epoch": 1.9334719334719335, - "grad_norm": 0.5267529487609863, - "learning_rate": 1.83991683991684e-05, - "loss": 0.0562, + "epoch": 3.2862190812720846, + "grad_norm": 0.6324512362480164, + "learning_rate": 1.0282685512367492e-05, + "loss": 0.0515, "step": 6510 }, { - "epoch": 1.9364419364419363, - "grad_norm": 0.44437262415885925, - "learning_rate": 1.838134838134838e-05, - "loss": 0.0723, + "epoch": 3.291267036850076, + "grad_norm": 0.5736483931541443, + "learning_rate": 1.0252397778899545e-05, + "loss": 0.0538, "step": 6520 }, { - "epoch": 1.9394119394119396, - "grad_norm": 0.5024462938308716, - "learning_rate": 1.836352836352836e-05, - "loss": 0.0788, + "epoch": 3.2963149924280666, + "grad_norm": 0.48032522201538086, + "learning_rate": 1.02221100454316e-05, + "loss": 0.0568, "step": 6530 }, { - "epoch": 1.9423819423819424, - "grad_norm": 0.3392117917537689, - "learning_rate": 1.8345708345708347e-05, - "loss": 0.0685, + "epoch": 3.301362948006058, + "grad_norm": 0.6696997880935669, + "learning_rate": 1.0191822311963656e-05, + "loss": 0.0537, "step": 6540 }, { - "epoch": 1.9453519453519452, - "grad_norm": 0.4275413751602173, - "learning_rate": 1.832788832788833e-05, - "loss": 0.0728, + "epoch": 3.3064109035840485, + "grad_norm": 0.44333356618881226, + "learning_rate": 1.016153457849571e-05, + "loss": 0.0514, "step": 6550 }, { - "epoch": 1.9483219483219483, - "grad_norm": 0.3413922190666199, - "learning_rate": 1.831006831006831e-05, - "loss": 0.0694, + "epoch": 3.3114588591620393, + "grad_norm": 0.6224443912506104, + "learning_rate": 1.0131246845027764e-05, + "loss": 0.0607, "step": 6560 }, { - "epoch": 1.9512919512919513, - "grad_norm": 0.4779782295227051, - "learning_rate": 1.8292248292248294e-05, - "loss": 0.0654, + "epoch": 3.3165068147400305, + "grad_norm": 0.7066437602043152, + "learning_rate": 1.0100959111559818e-05, + "loss": 0.0563, "step": 6570 }, { - "epoch": 1.9542619542619541, - "grad_norm": 0.4912964701652527, - "learning_rate": 1.8274428274428276e-05, - "loss": 0.0729, + "epoch": 3.3215547703180213, + "grad_norm": 0.6406083106994629, + "learning_rate": 1.0070671378091873e-05, + "loss": 0.0573, "step": 6580 }, { - "epoch": 1.9572319572319572, - "grad_norm": 0.3358478546142578, - "learning_rate": 1.8256608256608254e-05, - "loss": 0.069, + "epoch": 3.326602725896012, + "grad_norm": 0.44534462690353394, + "learning_rate": 1.0040383644623928e-05, + "loss": 0.059, "step": 6590 }, { - "epoch": 1.9602019602019602, - "grad_norm": 0.5066028237342834, - "learning_rate": 1.8238788238788237e-05, - "loss": 0.0626, + "epoch": 3.3316506814740032, + "grad_norm": 0.7137624025344849, + "learning_rate": 1.0010095911155983e-05, + "loss": 0.0568, "step": 6600 }, { - "epoch": 1.963171963171963, - "grad_norm": 0.5891350507736206, - "learning_rate": 1.8220968220968222e-05, - "loss": 0.0643, + "epoch": 3.336698637051994, + "grad_norm": 0.6909269690513611, + "learning_rate": 9.979808177688038e-06, + "loss": 0.0493, "step": 6610 }, { - "epoch": 1.9661419661419661, - "grad_norm": 0.5142768621444702, - "learning_rate": 1.8203148203148204e-05, - "loss": 0.06, + "epoch": 3.3417465926299847, + "grad_norm": 0.6987153887748718, + "learning_rate": 9.94952044422009e-06, + "loss": 0.059, "step": 6620 }, { - "epoch": 1.9691119691119692, - "grad_norm": 0.463016539812088, - "learning_rate": 1.8185328185328187e-05, - "loss": 0.062, + "epoch": 3.346794548207976, + "grad_norm": 0.538732647895813, + "learning_rate": 9.919232710752145e-06, + "loss": 0.0582, "step": 6630 }, { - "epoch": 1.972081972081972, - "grad_norm": 0.27797237038612366, - "learning_rate": 1.816750816750817e-05, - "loss": 0.0638, + "epoch": 3.3518425037859667, + "grad_norm": 0.6330693960189819, + "learning_rate": 9.8889449772842e-06, + "loss": 0.0506, "step": 6640 }, { - "epoch": 1.975051975051975, - "grad_norm": 0.8923652768135071, - "learning_rate": 1.814968814968815e-05, - "loss": 0.0815, + "epoch": 3.3568904593639575, + "grad_norm": 0.5216783881187439, + "learning_rate": 9.858657243816255e-06, + "loss": 0.0544, "step": 6650 }, { - "epoch": 1.978021978021978, - "grad_norm": 0.43557631969451904, - "learning_rate": 1.813186813186813e-05, - "loss": 0.066, + "epoch": 3.3619384149419487, + "grad_norm": 0.7052462697029114, + "learning_rate": 9.828369510348309e-06, + "loss": 0.0553, "step": 6660 }, { - "epoch": 1.980991980991981, - "grad_norm": 0.40481114387512207, - "learning_rate": 1.8114048114048112e-05, - "loss": 0.0685, + "epoch": 3.3669863705199394, + "grad_norm": 0.7679615616798401, + "learning_rate": 9.798081776880364e-06, + "loss": 0.061, "step": 6670 }, { - "epoch": 1.983961983961984, - "grad_norm": 0.5298916101455688, - "learning_rate": 1.8096228096228097e-05, - "loss": 0.0546, + "epoch": 3.37203432609793, + "grad_norm": 0.530564546585083, + "learning_rate": 9.767794043412417e-06, + "loss": 0.0567, "step": 6680 }, { - "epoch": 1.986931986931987, - "grad_norm": 0.687917172908783, - "learning_rate": 1.807840807840808e-05, - "loss": 0.0713, + "epoch": 3.3770822816759214, + "grad_norm": 0.6907301545143127, + "learning_rate": 9.737506309944473e-06, + "loss": 0.0561, "step": 6690 }, { - "epoch": 1.9899019899019899, - "grad_norm": 0.44359517097473145, - "learning_rate": 1.8060588060588062e-05, - "loss": 0.0659, + "epoch": 3.382130237253912, + "grad_norm": 0.7837420105934143, + "learning_rate": 9.707218576476528e-06, + "loss": 0.0618, "step": 6700 }, { - "epoch": 1.992871992871993, - "grad_norm": 0.48727744817733765, - "learning_rate": 1.8042768042768044e-05, - "loss": 0.075, + "epoch": 3.387178192831903, + "grad_norm": 0.6361984014511108, + "learning_rate": 9.676930843008581e-06, + "loss": 0.0533, "step": 6710 }, { - "epoch": 1.995841995841996, - "grad_norm": 0.46480777859687805, - "learning_rate": 1.8024948024948026e-05, - "loss": 0.0758, + "epoch": 3.392226148409894, + "grad_norm": 0.6775834560394287, + "learning_rate": 9.646643109540636e-06, + "loss": 0.0571, "step": 6720 }, { - "epoch": 1.9988119988119988, - "grad_norm": 0.7983739376068115, - "learning_rate": 1.8007128007128005e-05, - "loss": 0.0611, + "epoch": 3.397274103987885, + "grad_norm": 0.4820801615715027, + "learning_rate": 9.616355376072691e-06, + "loss": 0.063, "step": 6730 }, { - "epoch": 2.0, - "eval_f1": 0.49727767695099817, - "eval_loss": 0.05854379013180733, - "eval_runtime": 176.456, - "eval_samples_per_second": 215.459, - "eval_steps_per_second": 3.372, - "step": 6734 - }, - { - "epoch": 2.0017820017820016, - "grad_norm": 0.7805753946304321, - "learning_rate": 1.7989307989307987e-05, - "loss": 0.0529, + "epoch": 3.4023220595658756, + "grad_norm": 0.511091411113739, + "learning_rate": 9.586067642604747e-06, + "loss": 0.0621, "step": 6740 }, { - "epoch": 2.004752004752005, - "grad_norm": 0.436716765165329, - "learning_rate": 1.7971487971487973e-05, - "loss": 0.0657, + "epoch": 3.407370015143867, + "grad_norm": 0.5163900852203369, + "learning_rate": 9.5557799091368e-06, + "loss": 0.0606, "step": 6750 }, { - "epoch": 2.0077220077220077, - "grad_norm": 0.347323477268219, - "learning_rate": 1.7953667953667955e-05, - "loss": 0.0733, + "epoch": 3.4124179707218576, + "grad_norm": 0.4652441740036011, + "learning_rate": 9.525492175668853e-06, + "loss": 0.0539, "step": 6760 }, { - "epoch": 2.0106920106920105, - "grad_norm": 0.4401879608631134, - "learning_rate": 1.7935847935847937e-05, - "loss": 0.0709, + "epoch": 3.4174659262998484, + "grad_norm": 0.5968872904777527, + "learning_rate": 9.495204442200909e-06, + "loss": 0.0599, "step": 6770 }, { - "epoch": 2.013662013662014, - "grad_norm": 0.6687297224998474, - "learning_rate": 1.791802791802792e-05, - "loss": 0.0655, + "epoch": 3.4225138818778396, + "grad_norm": 0.4634818732738495, + "learning_rate": 9.464916708732964e-06, + "loss": 0.0518, "step": 6780 }, { - "epoch": 2.0166320166320166, - "grad_norm": 0.6250702142715454, - "learning_rate": 1.79002079002079e-05, - "loss": 0.072, + "epoch": 3.4275618374558303, + "grad_norm": 0.34169018268585205, + "learning_rate": 9.434628975265019e-06, + "loss": 0.0588, "step": 6790 }, { - "epoch": 2.0196020196020195, - "grad_norm": 0.7498565912246704, - "learning_rate": 1.788238788238788e-05, - "loss": 0.0724, + "epoch": 3.432609793033821, + "grad_norm": 0.719494640827179, + "learning_rate": 9.404341241797072e-06, + "loss": 0.0538, "step": 6800 }, { - "epoch": 2.0225720225720227, - "grad_norm": 0.5209968090057373, - "learning_rate": 1.7864567864567862e-05, - "loss": 0.0686, + "epoch": 3.4376577486118123, + "grad_norm": 0.4465346336364746, + "learning_rate": 9.374053508329126e-06, + "loss": 0.0577, "step": 6810 }, { - "epoch": 2.0255420255420256, - "grad_norm": 0.6656198501586914, - "learning_rate": 1.7846747846747848e-05, - "loss": 0.0636, + "epoch": 3.442705704189803, + "grad_norm": 0.6223052740097046, + "learning_rate": 9.343765774861181e-06, + "loss": 0.0598, "step": 6820 }, { - "epoch": 2.0285120285120284, - "grad_norm": 0.4398493766784668, - "learning_rate": 1.782892782892783e-05, - "loss": 0.0626, + "epoch": 3.447753659767794, + "grad_norm": 0.6854692697525024, + "learning_rate": 9.313478041393236e-06, + "loss": 0.0544, "step": 6830 }, { - "epoch": 2.0314820314820317, - "grad_norm": 0.3464367985725403, - "learning_rate": 1.7811107811107812e-05, - "loss": 0.0636, + "epoch": 3.452801615345785, + "grad_norm": 1.0640225410461426, + "learning_rate": 9.283190307925291e-06, + "loss": 0.0569, "step": 6840 }, { - "epoch": 2.0344520344520345, - "grad_norm": 0.5368358492851257, - "learning_rate": 1.7793287793287794e-05, - "loss": 0.0661, + "epoch": 3.4578495709237758, + "grad_norm": 0.5437650680541992, + "learning_rate": 9.252902574457345e-06, + "loss": 0.0612, "step": 6850 }, { - "epoch": 2.0374220374220373, - "grad_norm": 0.6472384929656982, - "learning_rate": 1.7775467775467776e-05, - "loss": 0.0647, + "epoch": 3.462897526501767, + "grad_norm": 0.5767130255699158, + "learning_rate": 9.2226148409894e-06, + "loss": 0.0618, "step": 6860 }, { - "epoch": 2.0403920403920406, - "grad_norm": 0.49248170852661133, - "learning_rate": 1.7757647757647755e-05, - "loss": 0.0689, + "epoch": 3.4679454820797577, + "grad_norm": 0.5814956426620483, + "learning_rate": 9.192327107521453e-06, + "loss": 0.0571, "step": 6870 }, { - "epoch": 2.0433620433620434, - "grad_norm": 0.2520935535430908, - "learning_rate": 1.7739827739827737e-05, - "loss": 0.074, + "epoch": 3.4729934376577485, + "grad_norm": 0.31469887495040894, + "learning_rate": 9.162039374053508e-06, + "loss": 0.0573, "step": 6880 }, { - "epoch": 2.0463320463320462, - "grad_norm": 0.5810942053794861, - "learning_rate": 1.7722007722007723e-05, - "loss": 0.0675, + "epoch": 3.4780413932357397, + "grad_norm": 0.3987484872341156, + "learning_rate": 9.131751640585563e-06, + "loss": 0.0534, "step": 6890 }, { - "epoch": 2.0493020493020495, - "grad_norm": 0.7744080424308777, - "learning_rate": 1.7704187704187705e-05, - "loss": 0.0636, + "epoch": 3.4830893488137304, + "grad_norm": 0.47312065958976746, + "learning_rate": 9.101463907117617e-06, + "loss": 0.0608, "step": 6900 }, { - "epoch": 2.0522720522720523, - "grad_norm": 0.6021985411643982, - "learning_rate": 1.7686367686367687e-05, - "loss": 0.0683, + "epoch": 3.488137304391721, + "grad_norm": 0.4635220170021057, + "learning_rate": 9.071176173649672e-06, + "loss": 0.05, "step": 6910 }, { - "epoch": 2.055242055242055, - "grad_norm": 0.6123180985450745, - "learning_rate": 1.766854766854767e-05, - "loss": 0.0791, + "epoch": 3.4931852599697124, + "grad_norm": 1.146721363067627, + "learning_rate": 9.040888440181727e-06, + "loss": 0.0548, "step": 6920 }, { - "epoch": 2.0582120582120584, - "grad_norm": 0.6447744965553284, - "learning_rate": 1.765072765072765e-05, - "loss": 0.0705, + "epoch": 3.498233215547703, + "grad_norm": 0.42057961225509644, + "learning_rate": 9.010600706713782e-06, + "loss": 0.0463, "step": 6930 }, { - "epoch": 2.0611820611820613, - "grad_norm": 0.5168854594230652, - "learning_rate": 1.7632907632907634e-05, - "loss": 0.0611, + "epoch": 3.5032811711256944, + "grad_norm": 0.7835047841072083, + "learning_rate": 8.980312973245836e-06, + "loss": 0.0507, "step": 6940 }, { - "epoch": 2.064152064152064, - "grad_norm": 0.9751071333885193, - "learning_rate": 1.7615087615087613e-05, - "loss": 0.0662, + "epoch": 3.508329126703685, + "grad_norm": 0.6441161036491394, + "learning_rate": 8.95002523977789e-06, + "loss": 0.0571, "step": 6950 }, { - "epoch": 2.067122067122067, - "grad_norm": 0.5001913905143738, - "learning_rate": 1.7597267597267598e-05, - "loss": 0.0654, + "epoch": 3.513377082281676, + "grad_norm": 0.6828143000602722, + "learning_rate": 8.919737506309944e-06, + "loss": 0.0525, "step": 6960 }, { - "epoch": 2.07009207009207, - "grad_norm": 0.6123823523521423, - "learning_rate": 1.757944757944758e-05, - "loss": 0.0806, + "epoch": 3.518425037859667, + "grad_norm": 0.8285954594612122, + "learning_rate": 8.889449772842e-06, + "loss": 0.0621, "step": 6970 }, { - "epoch": 2.073062073062073, - "grad_norm": 0.5449308156967163, - "learning_rate": 1.7561627561627563e-05, - "loss": 0.0643, + "epoch": 3.523472993437658, + "grad_norm": 0.4954177439212799, + "learning_rate": 8.859162039374055e-06, + "loss": 0.0625, "step": 6980 }, { - "epoch": 2.076032076032076, - "grad_norm": 0.31201791763305664, - "learning_rate": 1.7543807543807545e-05, - "loss": 0.0739, + "epoch": 3.5285209490156486, + "grad_norm": 0.7900820374488831, + "learning_rate": 8.828874305906108e-06, + "loss": 0.0603, "step": 6990 }, { - "epoch": 2.079002079002079, - "grad_norm": 0.8544298410415649, - "learning_rate": 1.7525987525987527e-05, - "loss": 0.069, + "epoch": 3.53356890459364, + "grad_norm": 0.6767242550849915, + "learning_rate": 8.798586572438162e-06, + "loss": 0.0586, "step": 7000 }, { - "epoch": 2.081972081972082, - "grad_norm": 0.5308842062950134, - "learning_rate": 1.750816750816751e-05, - "loss": 0.0658, + "epoch": 3.5386168601716306, + "grad_norm": 0.5408624410629272, + "learning_rate": 8.768298838970217e-06, + "loss": 0.0561, "step": 7010 }, { - "epoch": 2.0849420849420848, - "grad_norm": 0.47668206691741943, - "learning_rate": 1.7490347490347488e-05, - "loss": 0.076, + "epoch": 3.5436648157496213, + "grad_norm": 0.4577973484992981, + "learning_rate": 8.738011105502272e-06, + "loss": 0.057, "step": 7020 }, { - "epoch": 2.087912087912088, - "grad_norm": 0.47977229952812195, - "learning_rate": 1.7472527472527473e-05, - "loss": 0.0683, + "epoch": 3.5487127713276125, + "grad_norm": 0.7334242463111877, + "learning_rate": 8.707723372034327e-06, + "loss": 0.0602, "step": 7030 }, { - "epoch": 2.090882090882091, - "grad_norm": 0.9374080896377563, - "learning_rate": 1.7454707454707456e-05, - "loss": 0.0743, + "epoch": 3.5537607269056033, + "grad_norm": 0.5569146275520325, + "learning_rate": 8.67743563856638e-06, + "loss": 0.0564, "step": 7040 }, { - "epoch": 2.0938520938520937, - "grad_norm": 0.5665603280067444, - "learning_rate": 1.7436887436887438e-05, - "loss": 0.0725, + "epoch": 3.558808682483594, + "grad_norm": 0.5739743709564209, + "learning_rate": 8.647147905098436e-06, + "loss": 0.0605, "step": 7050 }, { - "epoch": 2.096822096822097, - "grad_norm": 0.44158872961997986, - "learning_rate": 1.741906741906742e-05, - "loss": 0.0769, + "epoch": 3.5638566380615853, + "grad_norm": 0.5553867816925049, + "learning_rate": 8.61686017163049e-06, + "loss": 0.0573, "step": 7060 }, { - "epoch": 2.0997920997921, - "grad_norm": 0.36570894718170166, - "learning_rate": 1.7401247401247402e-05, - "loss": 0.0686, + "epoch": 3.568904593639576, + "grad_norm": 0.7109550833702087, + "learning_rate": 8.586572438162544e-06, + "loss": 0.0634, "step": 7070 }, { - "epoch": 2.1027621027621026, - "grad_norm": 0.4633289575576782, - "learning_rate": 1.7383427383427384e-05, - "loss": 0.0717, + "epoch": 3.5739525492175668, + "grad_norm": 0.46534502506256104, + "learning_rate": 8.5562847046946e-06, + "loss": 0.0494, "step": 7080 }, { - "epoch": 2.105732105732106, - "grad_norm": 0.6178393959999084, - "learning_rate": 1.7365607365607363e-05, - "loss": 0.0719, + "epoch": 3.579000504795558, + "grad_norm": 0.47850191593170166, + "learning_rate": 8.525996971226653e-06, + "loss": 0.0613, "step": 7090 }, { - "epoch": 2.1087021087021087, - "grad_norm": 0.3964090049266815, - "learning_rate": 1.734778734778735e-05, - "loss": 0.0542, + "epoch": 3.5840484603735487, + "grad_norm": 0.3749614953994751, + "learning_rate": 8.495709237758708e-06, + "loss": 0.0574, "step": 7100 }, { - "epoch": 2.1116721116721116, - "grad_norm": 0.3831978738307953, - "learning_rate": 1.732996732996733e-05, - "loss": 0.0711, + "epoch": 3.5890964159515395, + "grad_norm": 0.5852258801460266, + "learning_rate": 8.465421504290763e-06, + "loss": 0.064, "step": 7110 }, { - "epoch": 2.114642114642115, - "grad_norm": 0.4152994453907013, - "learning_rate": 1.7312147312147313e-05, - "loss": 0.0617, + "epoch": 3.5941443715295307, + "grad_norm": 0.3820860981941223, + "learning_rate": 8.435133770822818e-06, + "loss": 0.0559, "step": 7120 }, { - "epoch": 2.1176121176121177, - "grad_norm": 0.5786647796630859, - "learning_rate": 1.7294327294327295e-05, - "loss": 0.0678, + "epoch": 3.5991923271075215, + "grad_norm": 0.5200080275535583, + "learning_rate": 8.40484603735487e-06, + "loss": 0.0556, "step": 7130 }, { - "epoch": 2.1205821205821205, - "grad_norm": 0.5444033145904541, - "learning_rate": 1.7276507276507277e-05, - "loss": 0.0605, + "epoch": 3.604240282685512, + "grad_norm": 0.6472256183624268, + "learning_rate": 8.374558303886925e-06, + "loss": 0.0596, "step": 7140 }, { - "epoch": 2.1235521235521237, - "grad_norm": 0.18499556183815002, - "learning_rate": 1.725868725868726e-05, - "loss": 0.0568, + "epoch": 3.6092882382635034, + "grad_norm": 0.43182119727134705, + "learning_rate": 8.34427057041898e-06, + "loss": 0.0478, "step": 7150 }, { - "epoch": 2.1265221265221266, - "grad_norm": 0.3817172050476074, - "learning_rate": 1.7240867240867238e-05, - "loss": 0.0559, + "epoch": 3.614336193841494, + "grad_norm": 0.6659020781517029, + "learning_rate": 8.313982836951035e-06, + "loss": 0.054, "step": 7160 }, { - "epoch": 2.1294921294921294, - "grad_norm": 0.5504813194274902, - "learning_rate": 1.7223047223047224e-05, - "loss": 0.0642, + "epoch": 3.619384149419485, + "grad_norm": 0.6561934947967529, + "learning_rate": 8.28369510348309e-06, + "loss": 0.0583, "step": 7170 }, { - "epoch": 2.1324621324621322, - "grad_norm": 0.34808218479156494, - "learning_rate": 1.7205227205227206e-05, - "loss": 0.0483, + "epoch": 3.624432104997476, + "grad_norm": 0.7083423733711243, + "learning_rate": 8.253407370015144e-06, + "loss": 0.0598, "step": 7180 }, { - "epoch": 2.1354321354321355, - "grad_norm": 0.45135316252708435, - "learning_rate": 1.7187407187407188e-05, - "loss": 0.0591, + "epoch": 3.629480060575467, + "grad_norm": 0.6030146479606628, + "learning_rate": 8.223119636547197e-06, + "loss": 0.0569, "step": 7190 }, { - "epoch": 2.1384021384021383, - "grad_norm": 0.5405902862548828, - "learning_rate": 1.716958716958717e-05, - "loss": 0.0548, + "epoch": 3.6345280161534577, + "grad_norm": 0.4650856554508209, + "learning_rate": 8.192831903079253e-06, + "loss": 0.0593, "step": 7200 }, { - "epoch": 2.141372141372141, - "grad_norm": 0.4525381624698639, - "learning_rate": 1.7151767151767152e-05, - "loss": 0.0775, + "epoch": 3.639575971731449, + "grad_norm": 0.5656235814094543, + "learning_rate": 8.162544169611308e-06, + "loss": 0.058, "step": 7210 }, { - "epoch": 2.1443421443421444, - "grad_norm": 0.9278238415718079, - "learning_rate": 1.7133947133947135e-05, - "loss": 0.0748, + "epoch": 3.6446239273094396, + "grad_norm": 0.5745735764503479, + "learning_rate": 8.132256436143363e-06, + "loss": 0.0582, "step": 7220 }, { - "epoch": 2.1473121473121473, - "grad_norm": 0.34462785720825195, - "learning_rate": 1.7116127116127117e-05, - "loss": 0.0693, + "epoch": 3.6496718828874304, + "grad_norm": 0.7879515886306763, + "learning_rate": 8.101968702675416e-06, + "loss": 0.0593, "step": 7230 }, { - "epoch": 2.15028215028215, - "grad_norm": 0.5502927899360657, - "learning_rate": 1.70983070983071e-05, - "loss": 0.0704, + "epoch": 3.6547198384654216, + "grad_norm": 0.7000477313995361, + "learning_rate": 8.071680969207471e-06, + "loss": 0.0517, "step": 7240 }, { - "epoch": 2.1532521532521534, - "grad_norm": 0.5558304786682129, - "learning_rate": 1.708048708048708e-05, - "loss": 0.072, + "epoch": 3.6597677940434123, + "grad_norm": 0.44397464394569397, + "learning_rate": 8.041393235739527e-06, + "loss": 0.0569, "step": 7250 }, { - "epoch": 2.156222156222156, - "grad_norm": 0.43772050738334656, - "learning_rate": 1.7062667062667063e-05, - "loss": 0.0609, + "epoch": 3.664815749621403, + "grad_norm": 0.55961674451828, + "learning_rate": 8.01110550227158e-06, + "loss": 0.0529, "step": 7260 }, { - "epoch": 2.159192159192159, - "grad_norm": 0.85486900806427, - "learning_rate": 1.7044847044847045e-05, - "loss": 0.0768, + "epoch": 3.6698637051993943, + "grad_norm": 0.5441805720329285, + "learning_rate": 7.980817768803635e-06, + "loss": 0.0537, "step": 7270 }, { - "epoch": 2.1621621621621623, - "grad_norm": 0.31786465644836426, - "learning_rate": 1.7027027027027028e-05, - "loss": 0.0578, + "epoch": 3.674911660777385, + "grad_norm": 0.5779780149459839, + "learning_rate": 7.950530035335689e-06, + "loss": 0.0549, "step": 7280 }, { - "epoch": 2.165132165132165, - "grad_norm": 0.37934377789497375, - "learning_rate": 1.700920700920701e-05, - "loss": 0.0724, + "epoch": 3.679959616355376, + "grad_norm": 0.4491129517555237, + "learning_rate": 7.920242301867744e-06, + "loss": 0.0527, "step": 7290 }, { - "epoch": 2.168102168102168, - "grad_norm": 0.5212098360061646, - "learning_rate": 1.6991386991386992e-05, - "loss": 0.0587, + "epoch": 3.685007571933367, + "grad_norm": 0.6601787209510803, + "learning_rate": 7.889954568399799e-06, + "loss": 0.0545, "step": 7300 }, { - "epoch": 2.171072171072171, - "grad_norm": 0.4610010087490082, - "learning_rate": 1.6973566973566974e-05, - "loss": 0.0659, + "epoch": 3.690055527511358, + "grad_norm": 0.7920609712600708, + "learning_rate": 7.859666834931854e-06, + "loss": 0.0607, "step": 7310 }, { - "epoch": 2.174042174042174, - "grad_norm": 0.4683549404144287, - "learning_rate": 1.6955746955746956e-05, - "loss": 0.0559, + "epoch": 3.6951034830893486, + "grad_norm": 0.6220458149909973, + "learning_rate": 7.829379101463906e-06, + "loss": 0.0574, "step": 7320 }, { - "epoch": 2.177012177012177, - "grad_norm": 1.537858009338379, - "learning_rate": 1.693792693792694e-05, - "loss": 0.0657, + "epoch": 3.7001514386673398, + "grad_norm": 0.6900739669799805, + "learning_rate": 7.799091367995961e-06, + "loss": 0.0549, "step": 7330 }, { - "epoch": 2.17998217998218, - "grad_norm": 0.8588612675666809, - "learning_rate": 1.692010692010692e-05, - "loss": 0.0818, + "epoch": 3.7051993942453305, + "grad_norm": 1.071191430091858, + "learning_rate": 7.768803634528016e-06, + "loss": 0.0644, "step": 7340 }, { - "epoch": 2.182952182952183, - "grad_norm": 0.5644201636314392, - "learning_rate": 1.6902286902286903e-05, - "loss": 0.0553, + "epoch": 3.7102473498233217, + "grad_norm": 0.5342854261398315, + "learning_rate": 7.738515901060071e-06, + "loss": 0.0639, "step": 7350 }, { - "epoch": 2.185922185922186, - "grad_norm": 0.5589690804481506, - "learning_rate": 1.6884466884466885e-05, - "loss": 0.0634, + "epoch": 3.7152953054013125, + "grad_norm": 0.49695709347724915, + "learning_rate": 7.708228167592126e-06, + "loss": 0.0525, "step": 7360 }, { - "epoch": 2.188892188892189, - "grad_norm": 0.4421149790287018, - "learning_rate": 1.6866646866646867e-05, - "loss": 0.0714, + "epoch": 3.7203432609793032, + "grad_norm": 0.6041547060012817, + "learning_rate": 7.67794043412418e-06, + "loss": 0.0596, "step": 7370 }, { - "epoch": 2.191862191862192, - "grad_norm": 0.7251006960868835, - "learning_rate": 1.684882684882685e-05, - "loss": 0.0741, + "epoch": 3.7253912165572944, + "grad_norm": 0.6425964832305908, + "learning_rate": 7.647652700656235e-06, + "loss": 0.0626, "step": 7380 }, { - "epoch": 2.1948321948321947, - "grad_norm": 0.5653437972068787, - "learning_rate": 1.683100683100683e-05, - "loss": 0.0636, + "epoch": 3.730439172135285, + "grad_norm": 0.5185597538948059, + "learning_rate": 7.617364967188288e-06, + "loss": 0.063, "step": 7390 }, { - "epoch": 2.197802197802198, - "grad_norm": 0.37989261746406555, - "learning_rate": 1.6813186813186814e-05, - "loss": 0.0681, + "epoch": 3.7354871277132764, + "grad_norm": 0.48031681776046753, + "learning_rate": 7.587077233720343e-06, + "loss": 0.0633, "step": 7400 }, { - "epoch": 2.200772200772201, - "grad_norm": 0.38947612047195435, - "learning_rate": 1.6795366795366796e-05, - "loss": 0.0584, + "epoch": 3.740535083291267, + "grad_norm": 0.46377626061439514, + "learning_rate": 7.556789500252398e-06, + "loss": 0.0581, "step": 7410 }, { - "epoch": 2.2037422037422036, - "grad_norm": 0.5566168427467346, - "learning_rate": 1.6777546777546778e-05, - "loss": 0.0567, + "epoch": 3.745583038869258, + "grad_norm": 0.7336452007293701, + "learning_rate": 7.526501766784453e-06, + "loss": 0.0572, "step": 7420 }, { - "epoch": 2.206712206712207, - "grad_norm": 0.664364755153656, - "learning_rate": 1.675972675972676e-05, - "loss": 0.0735, + "epoch": 3.750630994447249, + "grad_norm": 0.8720684051513672, + "learning_rate": 7.4962140333165064e-06, + "loss": 0.0558, "step": 7430 }, { - "epoch": 2.2096822096822097, - "grad_norm": 0.3879406154155731, - "learning_rate": 1.6741906741906742e-05, - "loss": 0.0684, + "epoch": 3.75567895002524, + "grad_norm": 0.372592031955719, + "learning_rate": 7.465926299848562e-06, + "loss": 0.0613, "step": 7440 }, { - "epoch": 2.2126522126522126, - "grad_norm": 0.34745240211486816, - "learning_rate": 1.6724086724086725e-05, - "loss": 0.0727, + "epoch": 3.7607269056032306, + "grad_norm": 0.5049020648002625, + "learning_rate": 7.435638566380616e-06, + "loss": 0.058, "step": 7450 }, { - "epoch": 2.215622215622216, - "grad_norm": 0.48188093304634094, - "learning_rate": 1.6706266706266707e-05, - "loss": 0.0622, + "epoch": 3.765774861181222, + "grad_norm": 0.5402325391769409, + "learning_rate": 7.405350832912671e-06, + "loss": 0.0484, "step": 7460 }, { - "epoch": 2.2185922185922187, - "grad_norm": 0.6234533786773682, - "learning_rate": 1.668844668844669e-05, - "loss": 0.0681, + "epoch": 3.7708228167592126, + "grad_norm": 0.5662652850151062, + "learning_rate": 7.375063099444725e-06, + "loss": 0.0613, "step": 7470 }, { - "epoch": 2.2215622215622215, - "grad_norm": 0.6129441261291504, - "learning_rate": 1.667062667062667e-05, - "loss": 0.0674, + "epoch": 3.7758707723372034, + "grad_norm": 0.6431825160980225, + "learning_rate": 7.34477536597678e-06, + "loss": 0.0522, "step": 7480 }, { - "epoch": 2.2245322245322248, - "grad_norm": 0.36409103870391846, - "learning_rate": 1.6652806652806653e-05, - "loss": 0.0673, + "epoch": 3.7809187279151946, + "grad_norm": 0.9309275150299072, + "learning_rate": 7.314487632508835e-06, + "loss": 0.0602, "step": 7490 }, { - "epoch": 2.2275022275022276, - "grad_norm": 0.8309186697006226, - "learning_rate": 1.6634986634986635e-05, - "loss": 0.0543, + "epoch": 3.7859666834931853, + "grad_norm": 0.801145076751709, + "learning_rate": 7.284199899040888e-06, + "loss": 0.0581, "step": 7500 }, { - "epoch": 2.2304722304722304, - "grad_norm": 0.4031508266925812, - "learning_rate": 1.6617166617166618e-05, - "loss": 0.0842, + "epoch": 3.791014639071176, + "grad_norm": 0.5122712850570679, + "learning_rate": 7.253912165572943e-06, + "loss": 0.0552, "step": 7510 }, { - "epoch": 2.2334422334422332, - "grad_norm": 0.35200947523117065, - "learning_rate": 1.65993465993466e-05, - "loss": 0.0611, + "epoch": 3.7960625946491673, + "grad_norm": 0.39402052760124207, + "learning_rate": 7.223624432104998e-06, + "loss": 0.0552, "step": 7520 }, { - "epoch": 2.2364122364122365, - "grad_norm": 0.5327655673027039, - "learning_rate": 1.6581526581526582e-05, - "loss": 0.0611, + "epoch": 3.801110550227158, + "grad_norm": 0.5302004814147949, + "learning_rate": 7.193336698637052e-06, + "loss": 0.0626, "step": 7530 }, { - "epoch": 2.2393822393822393, - "grad_norm": 0.3595449924468994, - "learning_rate": 1.6563706563706564e-05, - "loss": 0.0718, + "epoch": 3.806158505805149, + "grad_norm": 0.4123098850250244, + "learning_rate": 7.163048965169107e-06, + "loss": 0.0569, "step": 7540 }, { - "epoch": 2.242352242352242, - "grad_norm": 0.6577255129814148, - "learning_rate": 1.6545886545886546e-05, - "loss": 0.0748, + "epoch": 3.81120646138314, + "grad_norm": 0.8736279010772705, + "learning_rate": 7.132761231701161e-06, + "loss": 0.0537, "step": 7550 }, { - "epoch": 2.2453222453222454, - "grad_norm": 0.46405327320098877, - "learning_rate": 1.652806652806653e-05, - "loss": 0.0735, + "epoch": 3.8162544169611308, + "grad_norm": 0.4374080002307892, + "learning_rate": 7.102473498233216e-06, + "loss": 0.057, "step": 7560 }, { - "epoch": 2.2482922482922483, - "grad_norm": 0.6792459487915039, - "learning_rate": 1.651024651024651e-05, - "loss": 0.0809, + "epoch": 3.8213023725391215, + "grad_norm": 0.863776445388794, + "learning_rate": 7.07218576476527e-06, + "loss": 0.049, "step": 7570 }, { - "epoch": 2.251262251262251, - "grad_norm": 0.4969274401664734, - "learning_rate": 1.6492426492426496e-05, - "loss": 0.0616, + "epoch": 3.8263503281171127, + "grad_norm": 0.5356324315071106, + "learning_rate": 7.041898031297325e-06, + "loss": 0.0578, "step": 7580 }, { - "epoch": 2.2542322542322544, - "grad_norm": 0.7882120609283447, - "learning_rate": 1.6474606474606475e-05, - "loss": 0.0756, + "epoch": 3.8313982836951035, + "grad_norm": 0.5422727465629578, + "learning_rate": 7.0116102978293786e-06, + "loss": 0.0577, "step": 7590 }, { - "epoch": 2.257202257202257, - "grad_norm": 0.4611985683441162, - "learning_rate": 1.6456786456786457e-05, - "loss": 0.0658, + "epoch": 3.8364462392730942, + "grad_norm": 0.6234108805656433, + "learning_rate": 6.981322564361434e-06, + "loss": 0.0573, "step": 7600 }, { - "epoch": 2.26017226017226, - "grad_norm": 0.5099868774414062, - "learning_rate": 1.643896643896644e-05, - "loss": 0.0691, + "epoch": 3.8414941948510855, + "grad_norm": 0.9067860841751099, + "learning_rate": 6.951034830893489e-06, + "loss": 0.0471, "step": 7610 }, { - "epoch": 2.2631422631422633, - "grad_norm": 0.461518257856369, - "learning_rate": 1.642114642114642e-05, - "loss": 0.0604, + "epoch": 3.846542150429076, + "grad_norm": 0.5522469878196716, + "learning_rate": 6.920747097425543e-06, + "loss": 0.053, "step": 7620 }, { - "epoch": 2.266112266112266, - "grad_norm": 0.3580944240093231, - "learning_rate": 1.6403326403326404e-05, - "loss": 0.0609, + "epoch": 3.851590106007067, + "grad_norm": 0.7358270287513733, + "learning_rate": 6.8904593639575974e-06, + "loss": 0.0561, "step": 7630 }, { - "epoch": 2.269082269082269, - "grad_norm": 0.36803242564201355, - "learning_rate": 1.6385506385506386e-05, - "loss": 0.0674, + "epoch": 3.856638061585058, + "grad_norm": 0.5285794138908386, + "learning_rate": 6.860171630489652e-06, + "loss": 0.0618, "step": 7640 }, { - "epoch": 2.2720522720522722, - "grad_norm": 0.3887629806995392, - "learning_rate": 1.636768636768637e-05, - "loss": 0.0733, + "epoch": 3.861686017163049, + "grad_norm": 0.6937068700790405, + "learning_rate": 6.829883897021707e-06, + "loss": 0.059, "step": 7650 }, { - "epoch": 2.275022275022275, - "grad_norm": 0.6474005579948425, - "learning_rate": 1.634986634986635e-05, - "loss": 0.0679, + "epoch": 3.8667339727410397, + "grad_norm": 0.6941738724708557, + "learning_rate": 6.79959616355376e-06, + "loss": 0.0515, "step": 7660 }, { - "epoch": 2.277992277992278, - "grad_norm": 0.6048378944396973, - "learning_rate": 1.6332046332046332e-05, - "loss": 0.0643, + "epoch": 3.871781928319031, + "grad_norm": 0.8964054584503174, + "learning_rate": 6.7693084300858155e-06, + "loss": 0.0526, "step": 7670 }, { - "epoch": 2.280962280962281, - "grad_norm": 0.45778071880340576, - "learning_rate": 1.6314226314226314e-05, - "loss": 0.0673, + "epoch": 3.8768298838970217, + "grad_norm": 0.5919986367225647, + "learning_rate": 6.739020696617871e-06, + "loss": 0.0577, "step": 7680 }, { - "epoch": 2.283932283932284, - "grad_norm": 0.6061732172966003, - "learning_rate": 1.6296406296406297e-05, - "loss": 0.0786, + "epoch": 3.8818778394750124, + "grad_norm": 0.4616561532020569, + "learning_rate": 6.708732963149924e-06, + "loss": 0.0509, "step": 7690 }, { - "epoch": 2.286902286902287, - "grad_norm": 0.4730798602104187, - "learning_rate": 1.627858627858628e-05, - "loss": 0.0572, + "epoch": 3.8869257950530036, + "grad_norm": 0.6349731087684631, + "learning_rate": 6.678445229681979e-06, + "loss": 0.0535, "step": 7700 }, { - "epoch": 2.2898722898722896, - "grad_norm": 0.43137332797050476, - "learning_rate": 1.626076626076626e-05, - "loss": 0.0572, + "epoch": 3.8919737506309944, + "grad_norm": 0.6474828720092773, + "learning_rate": 6.6481574962140335e-06, + "loss": 0.0552, "step": 7710 }, { - "epoch": 2.292842292842293, - "grad_norm": 0.36243513226509094, - "learning_rate": 1.6242946242946247e-05, - "loss": 0.0722, + "epoch": 3.897021706208985, + "grad_norm": 0.5433930158615112, + "learning_rate": 6.617869762746088e-06, + "loss": 0.062, "step": 7720 }, { - "epoch": 2.2958122958122957, - "grad_norm": 0.6039568781852722, - "learning_rate": 1.6225126225126225e-05, - "loss": 0.0628, + "epoch": 3.9020696617869763, + "grad_norm": 0.6113614439964294, + "learning_rate": 6.587582029278142e-06, + "loss": 0.06, "step": 7730 }, { - "epoch": 2.2987822987822986, - "grad_norm": 0.7253831028938293, - "learning_rate": 1.6207306207306207e-05, - "loss": 0.054, + "epoch": 3.907117617364967, + "grad_norm": 0.8800488114356995, + "learning_rate": 6.557294295810197e-06, + "loss": 0.0578, "step": 7740 }, { - "epoch": 2.301752301752302, - "grad_norm": 0.675613522529602, - "learning_rate": 1.618948618948619e-05, - "loss": 0.0689, + "epoch": 3.912165572942958, + "grad_norm": 0.5158660411834717, + "learning_rate": 6.5270065623422515e-06, + "loss": 0.0524, "step": 7750 }, { - "epoch": 2.3047223047223047, - "grad_norm": 0.5215617418289185, - "learning_rate": 1.6171666171666172e-05, - "loss": 0.0733, + "epoch": 3.917213528520949, + "grad_norm": 0.5676606297492981, + "learning_rate": 6.496718828874306e-06, + "loss": 0.0474, "step": 7760 }, { - "epoch": 2.3076923076923075, - "grad_norm": 0.6846175193786621, - "learning_rate": 1.6153846153846154e-05, - "loss": 0.0615, + "epoch": 3.92226148409894, + "grad_norm": 0.6438203454017639, + "learning_rate": 6.466431095406361e-06, + "loss": 0.0587, "step": 7770 }, { - "epoch": 2.3106623106623108, - "grad_norm": 0.397480845451355, - "learning_rate": 1.6136026136026136e-05, - "loss": 0.0641, + "epoch": 3.9273094396769306, + "grad_norm": 0.6570119857788086, + "learning_rate": 6.436143361938415e-06, + "loss": 0.0489, "step": 7780 }, { - "epoch": 2.3136323136323136, - "grad_norm": 0.33144304156303406, - "learning_rate": 1.6118206118206122e-05, - "loss": 0.0605, + "epoch": 3.932357395254922, + "grad_norm": 0.5620145201683044, + "learning_rate": 6.4058556284704695e-06, + "loss": 0.0559, "step": 7790 }, { - "epoch": 2.3166023166023164, - "grad_norm": 0.452396035194397, - "learning_rate": 1.61003861003861e-05, - "loss": 0.0787, + "epoch": 3.9374053508329125, + "grad_norm": 0.6886317729949951, + "learning_rate": 6.375567895002524e-06, + "loss": 0.0581, "step": 7800 }, { - "epoch": 2.3195723195723197, - "grad_norm": 0.4840039908885956, - "learning_rate": 1.6082566082566083e-05, - "loss": 0.0756, + "epoch": 3.9424533064109037, + "grad_norm": 0.7463077306747437, + "learning_rate": 6.345280161534579e-06, + "loss": 0.0477, "step": 7810 }, { - "epoch": 2.3225423225423225, - "grad_norm": 0.3425714671611786, - "learning_rate": 1.6064746064746065e-05, - "loss": 0.0661, + "epoch": 3.9475012619888945, + "grad_norm": 0.5246394276618958, + "learning_rate": 6.314992428066633e-06, + "loss": 0.0501, "step": 7820 }, { - "epoch": 2.3255123255123253, - "grad_norm": 0.5093306303024292, - "learning_rate": 1.6046926046926047e-05, - "loss": 0.0765, + "epoch": 3.9525492175668853, + "grad_norm": 0.5147930979728699, + "learning_rate": 6.2847046945986876e-06, + "loss": 0.0603, "step": 7830 }, { - "epoch": 2.3284823284823286, - "grad_norm": 0.37477800250053406, - "learning_rate": 1.602910602910603e-05, - "loss": 0.0751, + "epoch": 3.9575971731448765, + "grad_norm": 0.3963003158569336, + "learning_rate": 6.254416961130743e-06, + "loss": 0.059, "step": 7840 }, { - "epoch": 2.3314523314523314, - "grad_norm": 0.6057155132293701, - "learning_rate": 1.601128601128601e-05, - "loss": 0.0738, + "epoch": 3.9626451287228672, + "grad_norm": 0.7148598432540894, + "learning_rate": 6.224129227662796e-06, + "loss": 0.0524, "step": 7850 }, { - "epoch": 2.3344223344223343, - "grad_norm": 0.5092906355857849, - "learning_rate": 1.5993465993465997e-05, - "loss": 0.0684, + "epoch": 3.967693084300858, + "grad_norm": 0.5985211133956909, + "learning_rate": 6.193841494194851e-06, + "loss": 0.0609, "step": 7860 }, { - "epoch": 2.3373923373923375, - "grad_norm": 0.464747816324234, - "learning_rate": 1.5975645975645976e-05, - "loss": 0.0614, + "epoch": 3.972741039878849, + "grad_norm": 0.6152123808860779, + "learning_rate": 6.163553760726906e-06, + "loss": 0.0622, "step": 7870 }, { - "epoch": 2.3403623403623404, - "grad_norm": 0.4768252372741699, - "learning_rate": 1.5957825957825958e-05, - "loss": 0.0722, + "epoch": 3.97778899545684, + "grad_norm": 0.49580270051956177, + "learning_rate": 6.13326602725896e-06, + "loss": 0.056, "step": 7880 }, { - "epoch": 2.343332343332343, - "grad_norm": 0.5153640508651733, - "learning_rate": 1.594000594000594e-05, - "loss": 0.0713, + "epoch": 3.982836951034831, + "grad_norm": 0.8874292373657227, + "learning_rate": 6.102978293791015e-06, + "loss": 0.0599, "step": 7890 }, { - "epoch": 2.3463023463023465, - "grad_norm": 0.6367037296295166, - "learning_rate": 1.5922185922185922e-05, - "loss": 0.0681, + "epoch": 3.987884906612822, + "grad_norm": 0.6198350787162781, + "learning_rate": 6.072690560323069e-06, + "loss": 0.0546, "step": 7900 }, { - "epoch": 2.3492723492723493, - "grad_norm": 0.36707803606987, - "learning_rate": 1.5904365904365904e-05, - "loss": 0.0625, + "epoch": 3.9929328621908127, + "grad_norm": 0.39257192611694336, + "learning_rate": 6.042402826855124e-06, + "loss": 0.0523, "step": 7910 }, { - "epoch": 2.352242352242352, - "grad_norm": 0.21663016080856323, - "learning_rate": 1.5886545886545887e-05, - "loss": 0.0602, + "epoch": 3.997980817768804, + "grad_norm": 0.4612904191017151, + "learning_rate": 6.012115093387178e-06, + "loss": 0.0685, "step": 7920 }, { - "epoch": 2.3552123552123554, - "grad_norm": 0.5676469206809998, - "learning_rate": 1.5868725868725872e-05, - "loss": 0.0605, + "epoch": 4.0, + "eval_f1": 0.9705180789481339, + "eval_loss": 0.038467586040496826, + "eval_runtime": 578.9562, + "eval_samples_per_second": 356.265, + "eval_steps_per_second": 2.784, + "step": 7924 + }, + { + "epoch": 4.003028773346794, + "grad_norm": 0.7146291732788086, + "learning_rate": 5.981827359919233e-06, + "loss": 0.0575, "step": 7930 }, { - "epoch": 2.358182358182358, - "grad_norm": 0.4324367940425873, - "learning_rate": 1.585090585090585e-05, - "loss": 0.0661, + "epoch": 4.008076728924785, + "grad_norm": 0.6313480138778687, + "learning_rate": 5.951539626451287e-06, + "loss": 0.0581, "step": 7940 }, { - "epoch": 2.361152361152361, - "grad_norm": 0.40506285429000854, - "learning_rate": 1.5833085833085833e-05, - "loss": 0.0555, + "epoch": 4.013124684502777, + "grad_norm": 0.4977870583534241, + "learning_rate": 5.921251892983342e-06, + "loss": 0.0582, "step": 7950 }, { - "epoch": 2.3641223641223643, - "grad_norm": 0.30328163504600525, - "learning_rate": 1.5815265815265815e-05, - "loss": 0.0676, + "epoch": 4.018172640080767, + "grad_norm": 0.4447147250175476, + "learning_rate": 5.890964159515397e-06, + "loss": 0.0544, "step": 7960 }, { - "epoch": 2.367092367092367, - "grad_norm": 0.449945330619812, - "learning_rate": 1.5797445797445797e-05, - "loss": 0.0731, + "epoch": 4.023220595658758, + "grad_norm": 0.6496310234069824, + "learning_rate": 5.860676426047451e-06, + "loss": 0.0595, "step": 7970 }, { - "epoch": 2.37006237006237, - "grad_norm": 0.5466241836547852, - "learning_rate": 1.577962577962578e-05, - "loss": 0.0587, + "epoch": 4.028268551236749, + "grad_norm": 0.4380001127719879, + "learning_rate": 5.830388692579505e-06, + "loss": 0.0549, "step": 7980 }, { - "epoch": 2.3730323730323732, - "grad_norm": 0.2828434407711029, - "learning_rate": 1.5761805761805762e-05, - "loss": 0.0699, + "epoch": 4.03331650681474, + "grad_norm": 0.5718368887901306, + "learning_rate": 5.80010095911156e-06, + "loss": 0.0559, "step": 7990 }, { - "epoch": 2.376002376002376, - "grad_norm": 0.7119054794311523, - "learning_rate": 1.5743985743985747e-05, - "loss": 0.0615, + "epoch": 4.038364462392731, + "grad_norm": 0.5859358906745911, + "learning_rate": 5.769813225643615e-06, + "loss": 0.0572, "step": 8000 }, { - "epoch": 2.378972378972379, - "grad_norm": 0.5612084269523621, - "learning_rate": 1.5726165726165726e-05, - "loss": 0.0713, + "epoch": 4.043412417970722, + "grad_norm": 0.49378788471221924, + "learning_rate": 5.739525492175669e-06, + "loss": 0.054, "step": 8010 }, { - "epoch": 2.381942381942382, - "grad_norm": 0.3906906545162201, - "learning_rate": 1.5708345708345708e-05, - "loss": 0.0628, + "epoch": 4.048460373548712, + "grad_norm": 0.6780097484588623, + "learning_rate": 5.709237758707723e-06, + "loss": 0.0568, "step": 8020 }, { - "epoch": 2.384912384912385, - "grad_norm": 0.4246062636375427, - "learning_rate": 1.569052569052569e-05, - "loss": 0.0693, + "epoch": 4.053508329126704, + "grad_norm": 0.8048389554023743, + "learning_rate": 5.6789500252397786e-06, + "loss": 0.0527, "step": 8030 }, { - "epoch": 2.387882387882388, - "grad_norm": 0.7282578349113464, - "learning_rate": 1.5672705672705673e-05, - "loss": 0.0627, + "epoch": 4.058556284704695, + "grad_norm": 0.4513346254825592, + "learning_rate": 5.648662291771832e-06, + "loss": 0.0597, "step": 8040 }, { - "epoch": 2.390852390852391, - "grad_norm": 0.4457809627056122, - "learning_rate": 1.5654885654885655e-05, - "loss": 0.0756, + "epoch": 4.063604240282685, + "grad_norm": 0.6877405643463135, + "learning_rate": 5.618374558303887e-06, + "loss": 0.0594, "step": 8050 }, { - "epoch": 2.393822393822394, - "grad_norm": 0.460112065076828, - "learning_rate": 1.5637065637065637e-05, - "loss": 0.0715, + "epoch": 4.068652195860676, + "grad_norm": 0.41468387842178345, + "learning_rate": 5.5880868248359414e-06, + "loss": 0.0563, "step": 8060 }, { - "epoch": 2.3967923967923968, - "grad_norm": 0.8285795450210571, - "learning_rate": 1.5619245619245622e-05, - "loss": 0.0731, + "epoch": 4.0737001514386675, + "grad_norm": 0.5062978267669678, + "learning_rate": 5.557799091367996e-06, + "loss": 0.0598, "step": 8070 }, { - "epoch": 2.3997623997623996, - "grad_norm": 0.6187976002693176, - "learning_rate": 1.56014256014256e-05, - "loss": 0.0711, + "epoch": 4.078748107016659, + "grad_norm": 0.6427041888237, + "learning_rate": 5.527511357900051e-06, + "loss": 0.057, "step": 8080 }, { - "epoch": 2.402732402732403, - "grad_norm": 0.42191964387893677, - "learning_rate": 1.5583605583605583e-05, - "loss": 0.062, + "epoch": 4.083796062594649, + "grad_norm": 0.5508936643600464, + "learning_rate": 5.497223624432105e-06, + "loss": 0.0472, "step": 8090 }, { - "epoch": 2.4057024057024057, - "grad_norm": 0.3665825128555298, - "learning_rate": 1.5565785565785566e-05, - "loss": 0.0726, + "epoch": 4.08884401817264, + "grad_norm": 0.39490872621536255, + "learning_rate": 5.4669358909641595e-06, + "loss": 0.0589, "step": 8100 }, { - "epoch": 2.4086724086724085, - "grad_norm": 0.535072386264801, - "learning_rate": 1.5547965547965548e-05, - "loss": 0.0767, + "epoch": 4.093891973750631, + "grad_norm": 0.5776220560073853, + "learning_rate": 5.436648157496214e-06, + "loss": 0.0602, "step": 8110 }, { - "epoch": 2.4116424116424118, - "grad_norm": 0.5114570260047913, - "learning_rate": 1.553014553014553e-05, - "loss": 0.0571, + "epoch": 4.098939929328622, + "grad_norm": 0.36714500188827515, + "learning_rate": 5.406360424028269e-06, + "loss": 0.0474, "step": 8120 }, { - "epoch": 2.4146124146124146, - "grad_norm": 0.5549605488777161, - "learning_rate": 1.5512325512325512e-05, - "loss": 0.0741, + "epoch": 4.103987884906613, + "grad_norm": 0.7429747581481934, + "learning_rate": 5.376072690560323e-06, + "loss": 0.0516, "step": 8130 }, { - "epoch": 2.4175824175824174, - "grad_norm": 0.47435063123703003, - "learning_rate": 1.5494505494505498e-05, - "loss": 0.0715, + "epoch": 4.109035840484604, + "grad_norm": 0.7167190909385681, + "learning_rate": 5.3457849570923775e-06, + "loss": 0.0559, "step": 8140 }, { - "epoch": 2.4205524205524207, - "grad_norm": 0.45239725708961487, - "learning_rate": 1.547668547668548e-05, - "loss": 0.076, + "epoch": 4.1140837960625944, + "grad_norm": 0.5668296217918396, + "learning_rate": 5.315497223624433e-06, + "loss": 0.0558, "step": 8150 }, { - "epoch": 2.4235224235224235, - "grad_norm": 0.3530510663986206, - "learning_rate": 1.545886545886546e-05, - "loss": 0.0603, + "epoch": 4.119131751640586, + "grad_norm": 0.5577311515808105, + "learning_rate": 5.285209490156487e-06, + "loss": 0.0589, "step": 8160 }, { - "epoch": 2.4264924264924264, - "grad_norm": 0.34246182441711426, - "learning_rate": 1.544104544104544e-05, - "loss": 0.0687, + "epoch": 4.124179707218577, + "grad_norm": 0.611304759979248, + "learning_rate": 5.254921756688541e-06, + "loss": 0.0546, "step": 8170 }, { - "epoch": 2.4294624294624296, - "grad_norm": 0.4563390612602234, - "learning_rate": 1.5423225423225423e-05, - "loss": 0.0592, + "epoch": 4.129227662796567, + "grad_norm": 0.5540894865989685, + "learning_rate": 5.2246340232205955e-06, + "loss": 0.0611, "step": 8180 }, { - "epoch": 2.4324324324324325, - "grad_norm": 0.6311094760894775, - "learning_rate": 1.5405405405405405e-05, - "loss": 0.0734, + "epoch": 4.134275618374558, + "grad_norm": 0.5128312706947327, + "learning_rate": 5.194346289752651e-06, + "loss": 0.0552, "step": 8190 }, { - "epoch": 2.4354024354024353, - "grad_norm": 0.398874431848526, - "learning_rate": 1.5387585387585387e-05, - "loss": 0.0658, + "epoch": 4.13932357395255, + "grad_norm": 0.6017599105834961, + "learning_rate": 5.164058556284704e-06, + "loss": 0.0494, "step": 8200 }, { - "epoch": 2.4383724383724386, - "grad_norm": 0.47651001811027527, - "learning_rate": 1.5369765369765373e-05, - "loss": 0.0653, + "epoch": 4.14437152953054, + "grad_norm": 0.42843466997146606, + "learning_rate": 5.133770822816759e-06, + "loss": 0.0534, "step": 8210 }, { - "epoch": 2.4413424413424414, - "grad_norm": 0.5543814897537231, - "learning_rate": 1.5351945351945355e-05, - "loss": 0.077, + "epoch": 4.149419485108531, + "grad_norm": 0.6050401926040649, + "learning_rate": 5.103483089348814e-06, + "loss": 0.0524, "step": 8220 }, { - "epoch": 2.444312444312444, - "grad_norm": 0.6091018915176392, - "learning_rate": 1.5334125334125334e-05, - "loss": 0.0673, + "epoch": 4.154467440686522, + "grad_norm": 0.512793242931366, + "learning_rate": 5.073195355880868e-06, + "loss": 0.0562, "step": 8230 }, { - "epoch": 2.447282447282447, - "grad_norm": 0.5908657312393188, - "learning_rate": 1.5316305316305316e-05, - "loss": 0.0592, + "epoch": 4.159515396264513, + "grad_norm": 0.5130860209465027, + "learning_rate": 5.042907622412923e-06, + "loss": 0.0413, "step": 8240 }, { - "epoch": 2.4502524502524503, - "grad_norm": 0.707524836063385, - "learning_rate": 1.5298485298485298e-05, - "loss": 0.067, + "epoch": 4.164563351842504, + "grad_norm": 0.6443082690238953, + "learning_rate": 5.012619888944977e-06, + "loss": 0.0593, "step": 8250 }, { - "epoch": 2.453222453222453, - "grad_norm": 0.5802726745605469, - "learning_rate": 1.528066528066528e-05, - "loss": 0.0717, + "epoch": 4.169611307420495, + "grad_norm": 0.6051344871520996, + "learning_rate": 4.982332155477032e-06, + "loss": 0.0542, "step": 8260 }, { - "epoch": 2.456192456192456, - "grad_norm": 0.5758719444274902, - "learning_rate": 1.5262845262845263e-05, - "loss": 0.0654, + "epoch": 4.174659262998485, + "grad_norm": 0.5795598030090332, + "learning_rate": 4.952044422009086e-06, + "loss": 0.0569, "step": 8270 }, { - "epoch": 2.4591624591624592, - "grad_norm": 0.2951982617378235, - "learning_rate": 1.5245025245025246e-05, - "loss": 0.0646, + "epoch": 4.1797072185764765, + "grad_norm": 0.6054142117500305, + "learning_rate": 4.921756688541141e-06, + "loss": 0.0575, "step": 8280 }, { - "epoch": 2.462132462132462, - "grad_norm": 0.6033930778503418, - "learning_rate": 1.5227205227205229e-05, - "loss": 0.0616, + "epoch": 4.184755174154468, + "grad_norm": 0.6954050660133362, + "learning_rate": 4.891468955073196e-06, + "loss": 0.0609, "step": 8290 }, { - "epoch": 2.465102465102465, - "grad_norm": 0.7335968613624573, - "learning_rate": 1.520938520938521e-05, - "loss": 0.0527, + "epoch": 4.189803129732458, + "grad_norm": 0.7217870354652405, + "learning_rate": 4.86118122160525e-06, + "loss": 0.0559, "step": 8300 }, { - "epoch": 2.468072468072468, - "grad_norm": 0.2696143686771393, - "learning_rate": 1.5191565191565193e-05, - "loss": 0.062, + "epoch": 4.194851085310449, + "grad_norm": 0.49758586287498474, + "learning_rate": 4.830893488137305e-06, + "loss": 0.0506, "step": 8310 }, { - "epoch": 2.471042471042471, - "grad_norm": 0.24488599598407745, - "learning_rate": 1.5173745173745173e-05, - "loss": 0.0661, + "epoch": 4.1998990408884405, + "grad_norm": 0.4497081935405731, + "learning_rate": 4.800605754669359e-06, + "loss": 0.0581, "step": 8320 }, { - "epoch": 2.474012474012474, - "grad_norm": 0.43688690662384033, - "learning_rate": 1.5155925155925156e-05, - "loss": 0.0563, + "epoch": 4.204946996466431, + "grad_norm": 0.6054022312164307, + "learning_rate": 4.770318021201413e-06, + "loss": 0.0596, "step": 8330 }, { - "epoch": 2.476982476982477, - "grad_norm": 0.557025134563446, - "learning_rate": 1.5138105138105138e-05, - "loss": 0.0598, + "epoch": 4.209994952044422, + "grad_norm": 0.7262012958526611, + "learning_rate": 4.7400302877334685e-06, + "loss": 0.0489, "step": 8340 }, { - "epoch": 2.47995247995248, - "grad_norm": 0.6465451717376709, - "learning_rate": 1.5120285120285122e-05, - "loss": 0.0669, + "epoch": 4.215042907622413, + "grad_norm": 0.6226342916488647, + "learning_rate": 4.709742554265523e-06, + "loss": 0.0596, "step": 8350 }, { - "epoch": 2.4829224829224827, - "grad_norm": 0.38359346985816956, - "learning_rate": 1.5102465102465104e-05, - "loss": 0.0715, + "epoch": 4.2200908632004035, + "grad_norm": 0.8234953284263611, + "learning_rate": 4.679454820797577e-06, + "loss": 0.057, "step": 8360 }, { - "epoch": 2.485892485892486, - "grad_norm": 0.6876797080039978, - "learning_rate": 1.5084645084645086e-05, - "loss": 0.0799, + "epoch": 4.225138818778395, + "grad_norm": 0.8438859581947327, + "learning_rate": 4.649167087329631e-06, + "loss": 0.0516, "step": 8370 }, { - "epoch": 2.488862488862489, - "grad_norm": 0.47395193576812744, - "learning_rate": 1.5066825066825068e-05, - "loss": 0.0699, + "epoch": 4.230186774356386, + "grad_norm": 0.5095875263214111, + "learning_rate": 4.6188793538616865e-06, + "loss": 0.0646, "step": 8380 }, { - "epoch": 2.4918324918324917, - "grad_norm": 0.4975154399871826, - "learning_rate": 1.5049005049005049e-05, - "loss": 0.0655, + "epoch": 4.235234729934376, + "grad_norm": 0.5543855428695679, + "learning_rate": 4.58859162039374e-06, + "loss": 0.0482, "step": 8390 }, { - "epoch": 2.494802494802495, - "grad_norm": 0.5417085886001587, - "learning_rate": 1.503118503118503e-05, - "loss": 0.0768, + "epoch": 4.240282685512367, + "grad_norm": 0.7510880827903748, + "learning_rate": 4.558303886925795e-06, + "loss": 0.0595, "step": 8400 }, { - "epoch": 2.4977724977724978, - "grad_norm": 0.593275785446167, - "learning_rate": 1.5013365013365013e-05, - "loss": 0.0613, + "epoch": 4.245330641090359, + "grad_norm": 0.5140940546989441, + "learning_rate": 4.52801615345785e-06, + "loss": 0.0568, "step": 8410 }, { - "epoch": 2.5007425007425006, - "grad_norm": 0.5146422386169434, - "learning_rate": 1.4995544995544995e-05, - "loss": 0.0539, + "epoch": 4.250378596668349, + "grad_norm": 0.43089789152145386, + "learning_rate": 4.497728419989904e-06, + "loss": 0.058, "step": 8420 }, { - "epoch": 2.503712503712504, - "grad_norm": 0.5107398629188538, - "learning_rate": 1.4977724977724977e-05, - "loss": 0.0642, + "epoch": 4.25542655224634, + "grad_norm": 0.6229716539382935, + "learning_rate": 4.467440686521959e-06, + "loss": 0.0538, "step": 8430 }, { - "epoch": 2.5066825066825067, - "grad_norm": 0.40161600708961487, - "learning_rate": 1.4959904959904961e-05, - "loss": 0.0598, + "epoch": 4.260474507824331, + "grad_norm": 0.6465341448783875, + "learning_rate": 4.437152953054013e-06, + "loss": 0.0544, "step": 8440 }, { - "epoch": 2.5096525096525095, - "grad_norm": 0.3634410500526428, - "learning_rate": 1.4942084942084943e-05, - "loss": 0.064, + "epoch": 4.265522463402322, + "grad_norm": 0.42706695199012756, + "learning_rate": 4.406865219586068e-06, + "loss": 0.0562, "step": 8450 }, { - "epoch": 2.512622512622513, - "grad_norm": 0.29765084385871887, - "learning_rate": 1.4924264924264924e-05, - "loss": 0.0596, + "epoch": 4.270570418980313, + "grad_norm": 0.5305337309837341, + "learning_rate": 4.376577486118122e-06, + "loss": 0.0567, "step": 8460 }, { - "epoch": 2.5155925155925156, - "grad_norm": 0.5146110653877258, - "learning_rate": 1.4906444906444908e-05, - "loss": 0.0794, + "epoch": 4.275618374558304, + "grad_norm": 0.7307097315788269, + "learning_rate": 4.346289752650177e-06, + "loss": 0.0486, "step": 8470 }, { - "epoch": 2.5185625185625184, - "grad_norm": 0.49415668845176697, - "learning_rate": 1.488862488862489e-05, - "loss": 0.0612, + "epoch": 4.280666330136295, + "grad_norm": 0.5940870046615601, + "learning_rate": 4.316002019182232e-06, + "loss": 0.0514, "step": 8480 }, { - "epoch": 2.5215325215325217, - "grad_norm": 0.3173198997974396, - "learning_rate": 1.487080487080487e-05, - "loss": 0.0576, + "epoch": 4.285714285714286, + "grad_norm": 0.4446733593940735, + "learning_rate": 4.2857142857142855e-06, + "loss": 0.0545, "step": 8490 }, { - "epoch": 2.5245025245025245, - "grad_norm": 0.4311143159866333, - "learning_rate": 1.4852984852984852e-05, - "loss": 0.0721, + "epoch": 4.290762241292277, + "grad_norm": 0.9121294617652893, + "learning_rate": 4.255426552246341e-06, + "loss": 0.0557, "step": 8500 }, { - "epoch": 2.5274725274725274, - "grad_norm": 0.3858831524848938, - "learning_rate": 1.4835164835164836e-05, - "loss": 0.0553, + "epoch": 4.295810196870267, + "grad_norm": 0.568056583404541, + "learning_rate": 4.225138818778395e-06, + "loss": 0.0522, "step": 8510 }, { - "epoch": 2.5304425304425306, - "grad_norm": 0.4288255572319031, - "learning_rate": 1.4817344817344818e-05, - "loss": 0.0598, + "epoch": 4.300858152448258, + "grad_norm": 0.8788109421730042, + "learning_rate": 4.194851085310449e-06, + "loss": 0.0433, "step": 8520 }, { - "epoch": 2.5334125334125335, - "grad_norm": 0.6533911228179932, - "learning_rate": 1.4799524799524799e-05, - "loss": 0.0776, + "epoch": 4.3059061080262495, + "grad_norm": 0.7445030808448792, + "learning_rate": 4.1645633518425035e-06, + "loss": 0.05, "step": 8530 }, { - "epoch": 2.5363825363825363, - "grad_norm": 0.4716707468032837, - "learning_rate": 1.4781704781704783e-05, - "loss": 0.0586, + "epoch": 4.310954063604241, + "grad_norm": 0.8348667621612549, + "learning_rate": 4.134275618374559e-06, + "loss": 0.0584, "step": 8540 }, { - "epoch": 2.5393525393525396, - "grad_norm": 0.40273916721343994, - "learning_rate": 1.4763884763884765e-05, - "loss": 0.0665, + "epoch": 4.316002019182231, + "grad_norm": 0.462342232465744, + "learning_rate": 4.103987884906613e-06, + "loss": 0.0555, "step": 8550 }, { - "epoch": 2.5423225423225424, - "grad_norm": 0.5408886075019836, - "learning_rate": 1.4746064746064745e-05, - "loss": 0.0606, + "epoch": 4.321049974760222, + "grad_norm": 0.42785176634788513, + "learning_rate": 4.073700151438667e-06, + "loss": 0.0607, "step": 8560 }, { - "epoch": 2.5452925452925452, - "grad_norm": 0.4306439757347107, - "learning_rate": 1.4728244728244728e-05, - "loss": 0.0725, + "epoch": 4.326097930338213, + "grad_norm": 0.7172122597694397, + "learning_rate": 4.043412417970722e-06, + "loss": 0.0675, "step": 8570 }, { - "epoch": 2.5482625482625485, - "grad_norm": 0.6419402360916138, - "learning_rate": 1.4710424710424711e-05, - "loss": 0.0808, + "epoch": 4.331145885916204, + "grad_norm": 0.4495554566383362, + "learning_rate": 4.013124684502776e-06, + "loss": 0.0546, "step": 8580 }, { - "epoch": 2.5512325512325513, - "grad_norm": 0.4105408489704132, - "learning_rate": 1.4692604692604694e-05, - "loss": 0.0618, + "epoch": 4.336193841494195, + "grad_norm": 0.5083460807800293, + "learning_rate": 3.982836951034831e-06, + "loss": 0.06, "step": 8590 }, { - "epoch": 2.554202554202554, - "grad_norm": 0.6038805246353149, - "learning_rate": 1.4674784674784674e-05, - "loss": 0.0583, + "epoch": 4.341241797072186, + "grad_norm": 0.4353145360946655, + "learning_rate": 3.952549217566885e-06, + "loss": 0.0535, "step": 8600 }, { - "epoch": 2.5571725571725574, - "grad_norm": 0.6562811732292175, - "learning_rate": 1.4656964656964658e-05, - "loss": 0.0631, + "epoch": 4.3462897526501765, + "grad_norm": 0.6741386651992798, + "learning_rate": 3.92226148409894e-06, + "loss": 0.0581, "step": 8610 }, { - "epoch": 2.5601425601425603, - "grad_norm": 0.5605425238609314, - "learning_rate": 1.463914463914464e-05, - "loss": 0.0679, + "epoch": 4.351337708228168, + "grad_norm": 0.47798269987106323, + "learning_rate": 3.891973750630995e-06, + "loss": 0.0541, "step": 8620 }, { - "epoch": 2.563112563112563, - "grad_norm": 0.4276842474937439, - "learning_rate": 1.4621324621324622e-05, - "loss": 0.0592, + "epoch": 4.356385663806159, + "grad_norm": 0.49109166860580444, + "learning_rate": 3.861686017163049e-06, + "loss": 0.0608, "step": 8630 }, { - "epoch": 2.5660825660825664, - "grad_norm": 0.46107247471809387, - "learning_rate": 1.4603504603504603e-05, - "loss": 0.0585, + "epoch": 4.361433619384149, + "grad_norm": 0.8310505747795105, + "learning_rate": 3.831398283695104e-06, + "loss": 0.0514, "step": 8640 }, { - "epoch": 2.569052569052569, - "grad_norm": 0.5134992003440857, - "learning_rate": 1.4585684585684587e-05, - "loss": 0.0734, + "epoch": 4.36648157496214, + "grad_norm": 0.4586045742034912, + "learning_rate": 3.801110550227158e-06, + "loss": 0.0538, "step": 8650 }, { - "epoch": 2.572022572022572, - "grad_norm": 0.4581449031829834, - "learning_rate": 1.4567864567864569e-05, - "loss": 0.0549, + "epoch": 4.371529530540132, + "grad_norm": 0.4350300133228302, + "learning_rate": 3.7708228167592127e-06, + "loss": 0.0526, "step": 8660 }, { - "epoch": 2.574992574992575, - "grad_norm": 0.5128817558288574, - "learning_rate": 1.455004455004455e-05, - "loss": 0.0648, + "epoch": 4.376577486118122, + "grad_norm": 0.6310685276985168, + "learning_rate": 3.740535083291267e-06, + "loss": 0.0597, "step": 8670 }, { - "epoch": 2.577962577962578, - "grad_norm": 0.5839508771896362, - "learning_rate": 1.4532224532224533e-05, - "loss": 0.0794, + "epoch": 4.381625441696113, + "grad_norm": 0.6845548152923584, + "learning_rate": 3.7102473498233217e-06, + "loss": 0.0542, "step": 8680 }, { - "epoch": 2.580932580932581, - "grad_norm": 0.4006098210811615, - "learning_rate": 1.4514404514404515e-05, - "loss": 0.0679, + "epoch": 4.386673397274104, + "grad_norm": 1.085631012916565, + "learning_rate": 3.679959616355376e-06, + "loss": 0.0601, "step": 8690 }, { - "epoch": 2.5839025839025838, - "grad_norm": 0.24022406339645386, - "learning_rate": 1.4496584496584498e-05, - "loss": 0.0671, + "epoch": 4.391721352852095, + "grad_norm": 0.6232538223266602, + "learning_rate": 3.6496718828874303e-06, + "loss": 0.0557, "step": 8700 }, { - "epoch": 2.586872586872587, - "grad_norm": 0.390082448720932, - "learning_rate": 1.4478764478764478e-05, - "loss": 0.0533, + "epoch": 4.396769308430086, + "grad_norm": 0.4568091630935669, + "learning_rate": 3.6193841494194855e-06, + "loss": 0.0494, "step": 8710 }, { - "epoch": 2.58984258984259, - "grad_norm": 0.5063132643699646, - "learning_rate": 1.4460944460944462e-05, - "loss": 0.065, + "epoch": 4.401817264008077, + "grad_norm": 0.7550612092018127, + "learning_rate": 3.5890964159515398e-06, + "loss": 0.0562, "step": 8720 }, { - "epoch": 2.5928125928125927, - "grad_norm": 0.4413723647594452, - "learning_rate": 1.4443124443124444e-05, - "loss": 0.0628, + "epoch": 4.406865219586067, + "grad_norm": 0.5380585789680481, + "learning_rate": 3.5588086824835945e-06, + "loss": 0.0521, "step": 8730 }, { - "epoch": 2.5957825957825955, - "grad_norm": 0.5134592056274414, - "learning_rate": 1.4425304425304425e-05, - "loss": 0.0726, + "epoch": 4.411913175164059, + "grad_norm": 0.42225027084350586, + "learning_rate": 3.5285209490156488e-06, + "loss": 0.0515, "step": 8740 }, { - "epoch": 2.598752598752599, - "grad_norm": 0.6060248613357544, - "learning_rate": 1.4407484407484408e-05, - "loss": 0.0685, + "epoch": 4.41696113074205, + "grad_norm": 0.5831999778747559, + "learning_rate": 3.498233215547703e-06, + "loss": 0.0465, "step": 8750 }, { - "epoch": 2.6017226017226016, - "grad_norm": 0.54691481590271, - "learning_rate": 1.438966438966439e-05, - "loss": 0.0707, + "epoch": 4.42200908632004, + "grad_norm": 0.7943524718284607, + "learning_rate": 3.4679454820797578e-06, + "loss": 0.062, "step": 8760 }, { - "epoch": 2.6046926046926044, - "grad_norm": 0.3673838675022125, - "learning_rate": 1.4371844371844373e-05, - "loss": 0.0735, + "epoch": 4.427057041898031, + "grad_norm": 0.634747326374054, + "learning_rate": 3.437657748611812e-06, + "loss": 0.0496, "step": 8770 }, { - "epoch": 2.6076626076626077, - "grad_norm": 0.47034141421318054, - "learning_rate": 1.4354024354024353e-05, - "loss": 0.0683, + "epoch": 4.4321049974760225, + "grad_norm": 0.5734288692474365, + "learning_rate": 3.407370015143867e-06, + "loss": 0.0612, "step": 8780 }, { - "epoch": 2.6106326106326105, - "grad_norm": 0.6322289109230042, - "learning_rate": 1.4336204336204337e-05, - "loss": 0.0712, + "epoch": 4.437152953054013, + "grad_norm": 0.7079018354415894, + "learning_rate": 3.3770822816759215e-06, + "loss": 0.0578, "step": 8790 }, { - "epoch": 2.6136026136026134, - "grad_norm": 0.3621010482311249, - "learning_rate": 1.431838431838432e-05, - "loss": 0.0599, + "epoch": 4.442200908632004, + "grad_norm": 0.44444698095321655, + "learning_rate": 3.346794548207976e-06, + "loss": 0.0559, "step": 8800 }, { - "epoch": 2.6165726165726166, - "grad_norm": 0.3426471948623657, - "learning_rate": 1.43005643005643e-05, - "loss": 0.0699, + "epoch": 4.447248864209995, + "grad_norm": 0.7473122477531433, + "learning_rate": 3.3165068147400305e-06, + "loss": 0.0544, "step": 8810 }, { - "epoch": 2.6195426195426195, - "grad_norm": 0.4808509647846222, - "learning_rate": 1.4282744282744284e-05, - "loss": 0.0563, + "epoch": 4.4522968197879855, + "grad_norm": 0.6658338308334351, + "learning_rate": 3.286219081272085e-06, + "loss": 0.0552, "step": 8820 }, { - "epoch": 2.6225126225126223, - "grad_norm": 0.588964581489563, - "learning_rate": 1.4264924264924266e-05, - "loss": 0.0667, + "epoch": 4.457344775365977, + "grad_norm": 0.48870500922203064, + "learning_rate": 3.255931347804139e-06, + "loss": 0.0566, "step": 8830 }, { - "epoch": 2.6254826254826256, - "grad_norm": 0.48968905210494995, - "learning_rate": 1.4247104247104248e-05, - "loss": 0.0648, + "epoch": 4.462392730943968, + "grad_norm": 0.6261917948722839, + "learning_rate": 3.2256436143361943e-06, + "loss": 0.0487, "step": 8840 }, { - "epoch": 2.6284526284526284, - "grad_norm": 0.4962276816368103, - "learning_rate": 1.4229284229284228e-05, - "loss": 0.0719, + "epoch": 4.467440686521958, + "grad_norm": 0.6060011982917786, + "learning_rate": 3.1953558808682486e-06, + "loss": 0.0514, "step": 8850 }, { - "epoch": 2.631422631422631, - "grad_norm": 0.5036596059799194, - "learning_rate": 1.4211464211464212e-05, - "loss": 0.0736, + "epoch": 4.4724886420999495, + "grad_norm": 0.4858971834182739, + "learning_rate": 3.165068147400303e-06, + "loss": 0.05, "step": 8860 }, { - "epoch": 2.6343926343926345, - "grad_norm": 0.47525274753570557, - "learning_rate": 1.4193644193644194e-05, - "loss": 0.0606, + "epoch": 4.477536597677941, + "grad_norm": 0.6394979357719421, + "learning_rate": 3.1347804139323576e-06, + "loss": 0.0604, "step": 8870 }, { - "epoch": 2.6373626373626373, - "grad_norm": 0.6138589978218079, - "learning_rate": 1.4175824175824177e-05, - "loss": 0.064, + "epoch": 4.482584553255931, + "grad_norm": 0.6840482950210571, + "learning_rate": 3.104492680464412e-06, + "loss": 0.0514, "step": 8880 }, { - "epoch": 2.64033264033264, - "grad_norm": 0.2877761721611023, - "learning_rate": 1.4158004158004159e-05, - "loss": 0.0645, + "epoch": 4.487632508833922, + "grad_norm": 0.388715535402298, + "learning_rate": 3.0742049469964666e-06, + "loss": 0.0479, "step": 8890 }, { - "epoch": 2.6433026433026434, - "grad_norm": 0.4664807617664337, - "learning_rate": 1.4140184140184141e-05, - "loss": 0.0573, + "epoch": 4.492680464411913, + "grad_norm": 0.6516565084457397, + "learning_rate": 3.043917213528521e-06, + "loss": 0.0608, "step": 8900 }, { - "epoch": 2.6462726462726462, - "grad_norm": 0.38519200682640076, - "learning_rate": 1.4122364122364123e-05, - "loss": 0.0666, + "epoch": 4.497728419989904, + "grad_norm": 0.76282799243927, + "learning_rate": 3.0136294800605756e-06, + "loss": 0.0572, "step": 8910 }, { - "epoch": 2.649242649242649, - "grad_norm": 0.7016706466674805, - "learning_rate": 1.4104544104544104e-05, - "loss": 0.0597, + "epoch": 4.502776375567895, + "grad_norm": 0.49448370933532715, + "learning_rate": 2.9833417465926303e-06, + "loss": 0.0575, "step": 8920 }, { - "epoch": 2.6522126522126523, - "grad_norm": 0.5760989785194397, - "learning_rate": 1.4086724086724087e-05, - "loss": 0.0606, + "epoch": 4.507824331145886, + "grad_norm": 0.5593730807304382, + "learning_rate": 2.9530540131246846e-06, + "loss": 0.0486, "step": 8930 }, { - "epoch": 2.655182655182655, - "grad_norm": 0.47811734676361084, - "learning_rate": 1.406890406890407e-05, - "loss": 0.0688, + "epoch": 4.512872286723876, + "grad_norm": 0.5773325562477112, + "learning_rate": 2.922766279656739e-06, + "loss": 0.0541, "step": 8940 }, { - "epoch": 2.658152658152658, - "grad_norm": 0.3496223986148834, - "learning_rate": 1.4051084051084052e-05, - "loss": 0.0569, + "epoch": 4.517920242301868, + "grad_norm": 0.34630000591278076, + "learning_rate": 2.8924785461887936e-06, + "loss": 0.0606, "step": 8950 }, { - "epoch": 2.6611226611226613, - "grad_norm": 0.6245877742767334, - "learning_rate": 1.4033264033264034e-05, - "loss": 0.0694, + "epoch": 4.522968197879859, + "grad_norm": 0.5409483313560486, + "learning_rate": 2.862190812720848e-06, + "loss": 0.0589, "step": 8960 }, { - "epoch": 2.664092664092664, - "grad_norm": 0.38785070180892944, - "learning_rate": 1.4015444015444016e-05, - "loss": 0.0599, + "epoch": 4.52801615345785, + "grad_norm": 0.5004202127456665, + "learning_rate": 2.8319030792529026e-06, + "loss": 0.0621, "step": 8970 }, { - "epoch": 2.667062667062667, - "grad_norm": 0.3740558624267578, - "learning_rate": 1.3997623997623998e-05, - "loss": 0.0562, + "epoch": 4.53306410903584, + "grad_norm": 0.4979722797870636, + "learning_rate": 2.8016153457849574e-06, + "loss": 0.0537, "step": 8980 }, { - "epoch": 2.67003267003267, - "grad_norm": 0.4402414560317993, - "learning_rate": 1.3979803979803979e-05, - "loss": 0.0595, + "epoch": 4.5381120646138315, + "grad_norm": 0.6733251214027405, + "learning_rate": 2.7713276123170117e-06, + "loss": 0.069, "step": 8990 }, { - "epoch": 2.673002673002673, - "grad_norm": 0.6891340017318726, - "learning_rate": 1.3961983961983963e-05, - "loss": 0.0712, + "epoch": 4.543160020191822, + "grad_norm": 0.4152880609035492, + "learning_rate": 2.7410398788490664e-06, + "loss": 0.0565, "step": 9000 }, { - "epoch": 2.675972675972676, - "grad_norm": 0.44056686758995056, - "learning_rate": 1.3944163944163945e-05, - "loss": 0.0712, + "epoch": 4.548207975769813, + "grad_norm": 0.6170037984848022, + "learning_rate": 2.7107521453811207e-06, + "loss": 0.0589, "step": 9010 }, { - "epoch": 2.678942678942679, - "grad_norm": 0.42997923493385315, - "learning_rate": 1.3926343926343927e-05, - "loss": 0.0509, + "epoch": 4.553255931347804, + "grad_norm": 0.5258937478065491, + "learning_rate": 2.680464411913175e-06, + "loss": 0.0548, "step": 9020 }, { - "epoch": 2.681912681912682, - "grad_norm": 0.4868006110191345, - "learning_rate": 1.390852390852391e-05, - "loss": 0.0722, + "epoch": 4.5583038869257955, + "grad_norm": 0.534015417098999, + "learning_rate": 2.6501766784452297e-06, + "loss": 0.0447, "step": 9030 }, { - "epoch": 2.684882684882685, - "grad_norm": 0.4716143310070038, - "learning_rate": 1.3890703890703891e-05, - "loss": 0.0643, + "epoch": 4.563351842503786, + "grad_norm": 0.86041259765625, + "learning_rate": 2.6198889449772844e-06, + "loss": 0.0578, "step": 9040 }, { - "epoch": 2.687852687852688, - "grad_norm": 0.4905288815498352, - "learning_rate": 1.3872883872883874e-05, - "loss": 0.0592, + "epoch": 4.568399798081777, + "grad_norm": 0.8807480335235596, + "learning_rate": 2.589601211509339e-06, + "loss": 0.0479, "step": 9050 }, { - "epoch": 2.690822690822691, - "grad_norm": 0.4081631302833557, - "learning_rate": 1.3855063855063854e-05, - "loss": 0.0736, + "epoch": 4.573447753659767, + "grad_norm": 0.6071127653121948, + "learning_rate": 2.5593134780413934e-06, + "loss": 0.0521, "step": 9060 }, { - "epoch": 2.6937926937926937, - "grad_norm": 0.447644978761673, - "learning_rate": 1.3837243837243838e-05, - "loss": 0.0654, + "epoch": 4.5784957092377585, + "grad_norm": 0.9106950759887695, + "learning_rate": 2.5290257445734477e-06, + "loss": 0.056, "step": 9070 }, { - "epoch": 2.696762696762697, - "grad_norm": 0.22904683649539948, - "learning_rate": 1.381942381942382e-05, - "loss": 0.0562, + "epoch": 4.58354366481575, + "grad_norm": 0.6179044246673584, + "learning_rate": 2.4987380111055024e-06, + "loss": 0.0548, "step": 9080 }, { - "epoch": 2.6997326997327, - "grad_norm": 0.5609009861946106, - "learning_rate": 1.3801603801603802e-05, - "loss": 0.0558, + "epoch": 4.588591620393741, + "grad_norm": 0.9295970797538757, + "learning_rate": 2.4684502776375567e-06, + "loss": 0.0626, "step": 9090 }, { - "epoch": 2.7027027027027026, - "grad_norm": 0.6101239919662476, - "learning_rate": 1.3783783783783784e-05, - "loss": 0.0665, + "epoch": 4.593639575971731, + "grad_norm": 0.4483726918697357, + "learning_rate": 2.438162544169611e-06, + "loss": 0.0531, "step": 9100 }, { - "epoch": 2.705672705672706, - "grad_norm": 0.49575671553611755, - "learning_rate": 1.3765963765963767e-05, - "loss": 0.0589, + "epoch": 4.598687531549722, + "grad_norm": 0.38749760389328003, + "learning_rate": 2.407874810701666e-06, + "loss": 0.0514, "step": 9110 }, { - "epoch": 2.7086427086427087, - "grad_norm": 0.5980531573295593, - "learning_rate": 1.3748143748143749e-05, - "loss": 0.0715, + "epoch": 4.603735487127714, + "grad_norm": 0.7203320860862732, + "learning_rate": 2.3775870772337205e-06, + "loss": 0.0603, "step": 9120 }, { - "epoch": 2.7116127116127116, - "grad_norm": 0.3581327497959137, - "learning_rate": 1.373032373032373e-05, - "loss": 0.0641, + "epoch": 4.608783442705704, + "grad_norm": 0.8010473251342773, + "learning_rate": 2.347299343765775e-06, + "loss": 0.053, "step": 9130 }, { - "epoch": 2.714582714582715, - "grad_norm": 0.5521288514137268, - "learning_rate": 1.3712503712503713e-05, - "loss": 0.0611, + "epoch": 4.613831398283695, + "grad_norm": 0.7866964936256409, + "learning_rate": 2.3170116102978295e-06, + "loss": 0.0544, "step": 9140 }, { - "epoch": 2.7175527175527177, - "grad_norm": 0.617689847946167, - "learning_rate": 1.3694683694683695e-05, - "loss": 0.0556, + "epoch": 4.618879353861686, + "grad_norm": 0.9333378076553345, + "learning_rate": 2.2867238768298838e-06, + "loss": 0.0472, "step": 9150 }, { - "epoch": 2.7205227205227205, - "grad_norm": 0.32165491580963135, - "learning_rate": 1.3676863676863677e-05, - "loss": 0.0714, + "epoch": 4.623927309439677, + "grad_norm": 0.5904621481895447, + "learning_rate": 2.2564361433619385e-06, + "loss": 0.0515, "step": 9160 }, { - "epoch": 2.7234927234927238, - "grad_norm": 0.3842147886753082, - "learning_rate": 1.365904365904366e-05, - "loss": 0.0599, + "epoch": 4.628975265017668, + "grad_norm": 0.6837446093559265, + "learning_rate": 2.2261484098939928e-06, + "loss": 0.0566, "step": 9170 }, { - "epoch": 2.7264627264627266, - "grad_norm": 0.41680991649627686, - "learning_rate": 1.3641223641223642e-05, - "loss": 0.0628, + "epoch": 4.634023220595659, + "grad_norm": 0.5726220607757568, + "learning_rate": 2.1958606764260475e-06, + "loss": 0.0521, "step": 9180 }, { - "epoch": 2.7294327294327294, - "grad_norm": 0.6326974630355835, - "learning_rate": 1.3623403623403624e-05, - "loss": 0.0571, + "epoch": 4.639071176173649, + "grad_norm": 0.5920945405960083, + "learning_rate": 2.1655729429581022e-06, + "loss": 0.0527, "step": 9190 }, { - "epoch": 2.7324027324027322, - "grad_norm": 0.4563412070274353, - "learning_rate": 1.3605583605583606e-05, - "loss": 0.0647, + "epoch": 4.644119131751641, + "grad_norm": 0.5921088457107544, + "learning_rate": 2.1352852094901565e-06, + "loss": 0.0594, "step": 9200 }, { - "epoch": 2.7353727353727355, - "grad_norm": 0.5637513995170593, - "learning_rate": 1.3587763587763588e-05, - "loss": 0.0661, + "epoch": 4.649167087329632, + "grad_norm": 0.8026402592658997, + "learning_rate": 2.1049974760222112e-06, + "loss": 0.058, "step": 9210 }, { - "epoch": 2.7383427383427383, - "grad_norm": 0.373116135597229, - "learning_rate": 1.356994356994357e-05, - "loss": 0.0552, + "epoch": 4.654215042907622, + "grad_norm": 0.9913181066513062, + "learning_rate": 2.0747097425542655e-06, + "loss": 0.0591, "step": 9220 }, { - "epoch": 2.741312741312741, - "grad_norm": 0.6611971259117126, - "learning_rate": 1.3552123552123553e-05, - "loss": 0.0694, + "epoch": 4.659262998485613, + "grad_norm": 0.675123393535614, + "learning_rate": 2.04442200908632e-06, + "loss": 0.0561, "step": 9230 }, { - "epoch": 2.7442827442827444, - "grad_norm": 0.5132849812507629, - "learning_rate": 1.3534303534303535e-05, - "loss": 0.0607, + "epoch": 4.6643109540636045, + "grad_norm": 0.5947641730308533, + "learning_rate": 2.014134275618375e-06, + "loss": 0.0486, "step": 9240 }, { - "epoch": 2.7472527472527473, - "grad_norm": 0.40150707960128784, - "learning_rate": 1.3516483516483517e-05, - "loss": 0.0682, + "epoch": 4.669358909641595, + "grad_norm": 0.5389765501022339, + "learning_rate": 1.9838465421504293e-06, + "loss": 0.0586, "step": 9250 }, { - "epoch": 2.75022275022275, - "grad_norm": 0.8982331156730652, - "learning_rate": 1.3498663498663499e-05, - "loss": 0.058, + "epoch": 4.674406865219586, + "grad_norm": 0.5905711054801941, + "learning_rate": 1.9535588086824836e-06, + "loss": 0.0523, "step": 9260 }, { - "epoch": 2.753192753192753, - "grad_norm": 0.42595192790031433, - "learning_rate": 1.3480843480843481e-05, - "loss": 0.0673, + "epoch": 4.679454820797577, + "grad_norm": 0.36754655838012695, + "learning_rate": 1.9232710752145383e-06, + "loss": 0.0518, "step": 9270 }, { - "epoch": 2.756162756162756, - "grad_norm": 0.5409243106842041, - "learning_rate": 1.3463023463023463e-05, - "loss": 0.0553, + "epoch": 4.684502776375568, + "grad_norm": 0.5583412647247314, + "learning_rate": 1.8929833417465926e-06, + "loss": 0.0536, "step": 9280 }, { - "epoch": 2.759132759132759, - "grad_norm": 0.5729924440383911, - "learning_rate": 1.3445203445203446e-05, - "loss": 0.0677, + "epoch": 4.689550731953559, + "grad_norm": 0.4586925506591797, + "learning_rate": 1.8626956082786473e-06, + "loss": 0.0482, "step": 9290 }, { - "epoch": 2.762102762102762, - "grad_norm": 0.4854719638824463, - "learning_rate": 1.3427383427383428e-05, - "loss": 0.054, + "epoch": 4.69459868753155, + "grad_norm": 0.4932919442653656, + "learning_rate": 1.8324078748107018e-06, + "loss": 0.0484, "step": 9300 }, { - "epoch": 2.765072765072765, - "grad_norm": 0.7021495699882507, - "learning_rate": 1.340956340956341e-05, - "loss": 0.0618, + "epoch": 4.69964664310954, + "grad_norm": 0.3211473524570465, + "learning_rate": 1.802120141342756e-06, + "loss": 0.0522, "step": 9310 }, { - "epoch": 2.768042768042768, - "grad_norm": 0.5088809132575989, - "learning_rate": 1.3391743391743392e-05, - "loss": 0.0652, + "epoch": 4.7046945986875315, + "grad_norm": 0.8603491187095642, + "learning_rate": 1.7718324078748106e-06, + "loss": 0.0585, "step": 9320 }, { - "epoch": 2.7710127710127708, - "grad_norm": 0.3599695861339569, - "learning_rate": 1.3373923373923374e-05, - "loss": 0.0758, + "epoch": 4.709742554265523, + "grad_norm": 0.7181740999221802, + "learning_rate": 1.7415446744068653e-06, + "loss": 0.0522, "step": 9330 }, { - "epoch": 2.773982773982774, - "grad_norm": 0.2429090142250061, - "learning_rate": 1.3356103356103356e-05, - "loss": 0.0721, + "epoch": 4.714790509843513, + "grad_norm": 0.49415314197540283, + "learning_rate": 1.7112569409389198e-06, + "loss": 0.0417, "step": 9340 }, { - "epoch": 2.776952776952777, - "grad_norm": 0.42269906401634216, - "learning_rate": 1.3338283338283339e-05, - "loss": 0.0642, + "epoch": 4.719838465421504, + "grad_norm": 0.758638322353363, + "learning_rate": 1.6809692074709741e-06, + "loss": 0.0608, "step": 9350 }, { - "epoch": 2.7799227799227797, - "grad_norm": 0.5263569951057434, - "learning_rate": 1.332046332046332e-05, - "loss": 0.0635, + "epoch": 4.724886420999495, + "grad_norm": 0.6659887433052063, + "learning_rate": 1.6506814740030288e-06, + "loss": 0.0468, "step": 9360 }, { - "epoch": 2.782892782892783, - "grad_norm": 0.3662327527999878, - "learning_rate": 1.3302643302643303e-05, - "loss": 0.0628, + "epoch": 4.729934376577486, + "grad_norm": 0.3270837962627411, + "learning_rate": 1.6203937405350833e-06, + "loss": 0.0602, "step": 9370 }, { - "epoch": 2.785862785862786, - "grad_norm": 0.43335428833961487, - "learning_rate": 1.3284823284823285e-05, - "loss": 0.0713, + "epoch": 4.734982332155477, + "grad_norm": 0.6695159077644348, + "learning_rate": 1.5901060070671379e-06, + "loss": 0.0515, "step": 9380 }, { - "epoch": 2.7888327888327886, - "grad_norm": 0.5907623767852783, - "learning_rate": 1.3267003267003267e-05, - "loss": 0.0534, + "epoch": 4.740030287733468, + "grad_norm": 0.8143603205680847, + "learning_rate": 1.5598182735991924e-06, + "loss": 0.0613, "step": 9390 }, { - "epoch": 2.791802791802792, - "grad_norm": 0.340541809797287, - "learning_rate": 1.324918324918325e-05, - "loss": 0.052, + "epoch": 4.745078243311459, + "grad_norm": 0.6727936863899231, + "learning_rate": 1.5295305401312469e-06, + "loss": 0.0505, "step": 9400 }, { - "epoch": 2.7947727947727947, - "grad_norm": 0.4090157151222229, - "learning_rate": 1.3231363231363232e-05, - "loss": 0.0686, + "epoch": 4.75012619888945, + "grad_norm": 0.5365564823150635, + "learning_rate": 1.4992428066633014e-06, + "loss": 0.0512, "step": 9410 }, { - "epoch": 2.7977427977427975, - "grad_norm": 0.3752903640270233, - "learning_rate": 1.3213543213543214e-05, - "loss": 0.0612, + "epoch": 4.755174154467441, + "grad_norm": 0.5240725874900818, + "learning_rate": 1.4689550731953559e-06, + "loss": 0.0526, "step": 9420 }, { - "epoch": 2.800712800712801, - "grad_norm": 0.48351070284843445, - "learning_rate": 1.3195723195723196e-05, - "loss": 0.058, + "epoch": 4.760222110045431, + "grad_norm": 0.6975441575050354, + "learning_rate": 1.4386673397274104e-06, + "loss": 0.0592, "step": 9430 }, { - "epoch": 2.8036828036828036, - "grad_norm": 0.7287899851799011, - "learning_rate": 1.3177903177903178e-05, - "loss": 0.0787, + "epoch": 4.765270065623422, + "grad_norm": 0.44649407267570496, + "learning_rate": 1.408379606259465e-06, + "loss": 0.0597, "step": 9440 }, { - "epoch": 2.8066528066528065, - "grad_norm": 0.4591059684753418, - "learning_rate": 1.316008316008316e-05, - "loss": 0.0507, + "epoch": 4.770318021201414, + "grad_norm": 0.598850429058075, + "learning_rate": 1.3780918727915194e-06, + "loss": 0.0606, "step": 9450 }, { - "epoch": 2.8096228096228097, - "grad_norm": 0.6308128833770752, - "learning_rate": 1.3142263142263142e-05, - "loss": 0.0783, + "epoch": 4.775365976779405, + "grad_norm": 0.57352614402771, + "learning_rate": 1.3478041393235741e-06, + "loss": 0.0502, "step": 9460 }, { - "epoch": 2.8125928125928126, - "grad_norm": 0.5566859841346741, - "learning_rate": 1.3124443124443125e-05, - "loss": 0.067, + "epoch": 4.780413932357395, + "grad_norm": 0.7437055706977844, + "learning_rate": 1.3175164058556284e-06, + "loss": 0.0521, "step": 9470 }, { - "epoch": 2.8155628155628154, - "grad_norm": 0.42038193345069885, - "learning_rate": 1.3106623106623107e-05, - "loss": 0.0554, + "epoch": 4.785461887935386, + "grad_norm": 0.6993494629859924, + "learning_rate": 1.287228672387683e-06, + "loss": 0.0565, "step": 9480 }, { - "epoch": 2.8185328185328187, - "grad_norm": 0.34577420353889465, - "learning_rate": 1.3088803088803089e-05, - "loss": 0.0742, + "epoch": 4.790509843513377, + "grad_norm": 0.8067084550857544, + "learning_rate": 1.2569409389197376e-06, + "loss": 0.0575, "step": 9490 }, { - "epoch": 2.8215028215028215, - "grad_norm": 0.5111622214317322, - "learning_rate": 1.3070983070983071e-05, - "loss": 0.068, + "epoch": 4.795557799091368, + "grad_norm": 0.5363942384719849, + "learning_rate": 1.2266532054517921e-06, + "loss": 0.058, "step": 9500 }, { - "epoch": 2.8244728244728243, - "grad_norm": 0.24577349424362183, - "learning_rate": 1.3053163053163053e-05, - "loss": 0.0617, + "epoch": 4.800605754669359, + "grad_norm": 0.8145700693130493, + "learning_rate": 1.1963654719838464e-06, + "loss": 0.0488, "step": 9510 }, { - "epoch": 2.8274428274428276, - "grad_norm": 0.3329918682575226, - "learning_rate": 1.3035343035343037e-05, - "loss": 0.072, + "epoch": 4.80565371024735, + "grad_norm": 0.7701184153556824, + "learning_rate": 1.166077738515901e-06, + "loss": 0.0577, "step": 9520 }, { - "epoch": 2.8304128304128304, - "grad_norm": 0.5380098819732666, - "learning_rate": 1.3017523017523018e-05, - "loss": 0.0618, + "epoch": 4.8107016658253405, + "grad_norm": 0.5177111625671387, + "learning_rate": 1.1357900050479557e-06, + "loss": 0.0605, "step": 9530 }, { - "epoch": 2.8333828333828333, - "grad_norm": 0.539607584476471, - "learning_rate": 1.2999702999703e-05, - "loss": 0.0529, + "epoch": 4.815749621403332, + "grad_norm": 0.44751742482185364, + "learning_rate": 1.1055022715800102e-06, + "loss": 0.0565, "step": 9540 }, { - "epoch": 2.8363528363528365, - "grad_norm": 0.6192976236343384, - "learning_rate": 1.2981882981882982e-05, - "loss": 0.0695, + "epoch": 4.820797576981323, + "grad_norm": 0.37919309735298157, + "learning_rate": 1.0752145381120645e-06, + "loss": 0.0454, "step": 9550 }, { - "epoch": 2.8393228393228394, - "grad_norm": 0.44225507974624634, - "learning_rate": 1.2964062964062964e-05, - "loss": 0.0537, + "epoch": 4.825845532559313, + "grad_norm": 0.6037785410881042, + "learning_rate": 1.0449268046441192e-06, + "loss": 0.0606, "step": 9560 }, { - "epoch": 2.842292842292842, - "grad_norm": 0.5681447386741638, - "learning_rate": 1.2946242946242946e-05, - "loss": 0.0737, + "epoch": 4.8308934881373045, + "grad_norm": 0.3584793508052826, + "learning_rate": 1.0146390711761737e-06, + "loss": 0.0503, "step": 9570 }, { - "epoch": 2.8452628452628455, - "grad_norm": 0.5931240320205688, - "learning_rate": 1.2928422928422929e-05, - "loss": 0.0661, + "epoch": 4.835941443715296, + "grad_norm": 0.49841853976249695, + "learning_rate": 9.843513377082282e-07, + "loss": 0.0434, "step": 9580 }, { - "epoch": 2.8482328482328483, - "grad_norm": 0.4011771082878113, - "learning_rate": 1.2910602910602912e-05, - "loss": 0.0661, + "epoch": 4.840989399293286, + "grad_norm": 0.5114769339561462, + "learning_rate": 9.540636042402827e-07, + "loss": 0.0535, "step": 9590 }, { - "epoch": 2.851202851202851, - "grad_norm": 0.574195921421051, - "learning_rate": 1.2892782892782893e-05, - "loss": 0.0677, + "epoch": 4.846037354871277, + "grad_norm": 0.5932824611663818, + "learning_rate": 9.237758707723372e-07, + "loss": 0.0547, "step": 9600 }, { - "epoch": 2.8541728541728544, - "grad_norm": 0.5977892875671387, - "learning_rate": 1.2874962874962875e-05, - "loss": 0.075, + "epoch": 4.851085310449268, + "grad_norm": 0.6020333766937256, + "learning_rate": 8.934881373043917e-07, + "loss": 0.0597, "step": 9610 }, { - "epoch": 2.857142857142857, - "grad_norm": 0.3630739152431488, - "learning_rate": 1.2857142857142857e-05, - "loss": 0.0555, + "epoch": 4.856133266027259, + "grad_norm": 0.721193790435791, + "learning_rate": 8.632004038364462e-07, + "loss": 0.0614, "step": 9620 }, { - "epoch": 2.86011286011286, - "grad_norm": 0.39152857661247253, - "learning_rate": 1.283932283932284e-05, - "loss": 0.0692, + "epoch": 4.86118122160525, + "grad_norm": 0.4858354926109314, + "learning_rate": 8.329126703685008e-07, + "loss": 0.0555, "step": 9630 }, { - "epoch": 2.8630828630828633, - "grad_norm": 0.2847200036048889, - "learning_rate": 1.2821502821502822e-05, - "loss": 0.0504, + "epoch": 4.866229177183241, + "grad_norm": 0.7863103747367859, + "learning_rate": 8.026249369005552e-07, + "loss": 0.0554, "step": 9640 }, { - "epoch": 2.866052866052866, - "grad_norm": 0.46334296464920044, - "learning_rate": 1.2803682803682804e-05, - "loss": 0.067, + "epoch": 4.871277132761231, + "grad_norm": 0.8363025784492493, + "learning_rate": 7.723372034326099e-07, + "loss": 0.0565, "step": 9650 }, { - "epoch": 2.869022869022869, - "grad_norm": 0.6711926460266113, - "learning_rate": 1.2785862785862788e-05, - "loss": 0.0739, + "epoch": 4.876325088339223, + "grad_norm": 0.6137521266937256, + "learning_rate": 7.420494699646643e-07, + "loss": 0.0575, "step": 9660 }, { - "epoch": 2.8719928719928722, - "grad_norm": 0.5789605975151062, - "learning_rate": 1.2768042768042768e-05, - "loss": 0.0649, + "epoch": 4.881373043917214, + "grad_norm": 0.4781091511249542, + "learning_rate": 7.117617364967189e-07, + "loss": 0.0478, "step": 9670 }, { - "epoch": 2.874962874962875, - "grad_norm": 0.5450757741928101, - "learning_rate": 1.275022275022275e-05, - "loss": 0.0659, + "epoch": 4.886420999495204, + "grad_norm": 0.8294112086296082, + "learning_rate": 6.814740030287734e-07, + "loss": 0.0593, "step": 9680 }, { - "epoch": 2.877932877932878, - "grad_norm": 0.4336056709289551, - "learning_rate": 1.2732402732402732e-05, - "loss": 0.064, + "epoch": 4.891468955073195, + "grad_norm": 0.5780894160270691, + "learning_rate": 6.511862695608279e-07, + "loss": 0.0518, "step": 9690 }, { - "epoch": 2.880902880902881, - "grad_norm": 0.43332991003990173, - "learning_rate": 1.2714582714582715e-05, - "loss": 0.0703, + "epoch": 4.8965169106511865, + "grad_norm": 0.4407060146331787, + "learning_rate": 6.208985360928824e-07, + "loss": 0.0522, "step": 9700 }, { - "epoch": 2.883872883872884, - "grad_norm": 0.26582634449005127, - "learning_rate": 1.2696762696762697e-05, - "loss": 0.0573, + "epoch": 4.901564866229177, + "grad_norm": 0.4369337558746338, + "learning_rate": 5.906108026249369e-07, + "loss": 0.0522, "step": 9710 }, { - "epoch": 2.886842886842887, - "grad_norm": 0.39930054545402527, - "learning_rate": 1.2678942678942679e-05, - "loss": 0.0608, + "epoch": 4.906612821807168, + "grad_norm": 0.8428089022636414, + "learning_rate": 5.603230691569914e-07, + "loss": 0.0468, "step": 9720 }, { - "epoch": 2.88981288981289, - "grad_norm": 0.6703673601150513, - "learning_rate": 1.2661122661122663e-05, - "loss": 0.0706, + "epoch": 4.911660777385159, + "grad_norm": 0.6303294897079468, + "learning_rate": 5.30035335689046e-07, + "loss": 0.0577, "step": 9730 }, { - "epoch": 2.892782892782893, - "grad_norm": 0.3226848542690277, - "learning_rate": 1.2643302643302643e-05, - "loss": 0.0565, + "epoch": 4.91670873296315, + "grad_norm": 0.4869242012500763, + "learning_rate": 4.997476022211004e-07, + "loss": 0.0472, "step": 9740 }, { - "epoch": 2.8957528957528957, - "grad_norm": 0.4727514386177063, - "learning_rate": 1.2625482625482625e-05, - "loss": 0.0482, + "epoch": 4.921756688541141, + "grad_norm": 0.5907611846923828, + "learning_rate": 4.69459868753155e-07, + "loss": 0.0455, "step": 9750 }, { - "epoch": 2.8987228987228986, - "grad_norm": 0.744326651096344, - "learning_rate": 1.2607662607662608e-05, - "loss": 0.0679, + "epoch": 4.926804644119132, + "grad_norm": 0.6162139177322388, + "learning_rate": 4.3917213528520954e-07, + "loss": 0.0475, "step": 9760 }, { - "epoch": 2.901692901692902, - "grad_norm": 0.46024101972579956, - "learning_rate": 1.258984258984259e-05, - "loss": 0.0648, + "epoch": 4.931852599697122, + "grad_norm": 0.5222154259681702, + "learning_rate": 4.0888440181726405e-07, + "loss": 0.0513, "step": 9770 }, { - "epoch": 2.9046629046629047, - "grad_norm": 0.5013512969017029, - "learning_rate": 1.2572022572022572e-05, - "loss": 0.0563, + "epoch": 4.9369005552751135, + "grad_norm": 0.5132977366447449, + "learning_rate": 3.7859666834931856e-07, + "loss": 0.043, "step": 9780 }, { - "epoch": 2.9076329076329075, - "grad_norm": 0.7148948907852173, - "learning_rate": 1.2554202554202554e-05, - "loss": 0.0735, + "epoch": 4.941948510853105, + "grad_norm": 0.6620015501976013, + "learning_rate": 3.4830893488137306e-07, + "loss": 0.0598, "step": 9790 }, { - "epoch": 2.9106029106029108, - "grad_norm": 0.4620581865310669, - "learning_rate": 1.2536382536382538e-05, - "loss": 0.0678, + "epoch": 4.946996466431095, + "grad_norm": 0.7160341143608093, + "learning_rate": 3.1802120141342757e-07, + "loss": 0.0539, "step": 9800 }, { - "epoch": 2.9135729135729136, - "grad_norm": 0.5615851879119873, - "learning_rate": 1.2518562518562518e-05, - "loss": 0.0599, + "epoch": 4.952044422009086, + "grad_norm": 0.5954631567001343, + "learning_rate": 2.8773346794548213e-07, + "loss": 0.0581, "step": 9810 }, { - "epoch": 2.9165429165429164, - "grad_norm": 0.5745916366577148, - "learning_rate": 1.25007425007425e-05, - "loss": 0.0663, + "epoch": 4.957092377587077, + "grad_norm": 1.0010461807250977, + "learning_rate": 2.5744573447753664e-07, + "loss": 0.0499, "step": 9820 }, { - "epoch": 2.9195129195129192, - "grad_norm": 0.34011173248291016, - "learning_rate": 1.2482922482922483e-05, - "loss": 0.0524, + "epoch": 4.962140333165069, + "grad_norm": 0.5768128633499146, + "learning_rate": 2.2715800100959112e-07, + "loss": 0.0562, "step": 9830 }, { - "epoch": 2.9224829224829225, - "grad_norm": 0.5845355987548828, - "learning_rate": 1.2465102465102467e-05, - "loss": 0.0625, + "epoch": 4.967188288743059, + "grad_norm": 0.6427052617073059, + "learning_rate": 1.9687026754164563e-07, + "loss": 0.0545, "step": 9840 }, { - "epoch": 2.9254529254529253, - "grad_norm": 0.5317063331604004, - "learning_rate": 1.2447282447282447e-05, - "loss": 0.0589, + "epoch": 4.97223624432105, + "grad_norm": 0.6932212114334106, + "learning_rate": 1.6658253407370016e-07, + "loss": 0.0575, "step": 9850 }, { - "epoch": 2.928422928422928, - "grad_norm": 0.3282083570957184, - "learning_rate": 1.242946242946243e-05, - "loss": 0.059, + "epoch": 4.9772841998990405, + "grad_norm": 0.4219547510147095, + "learning_rate": 1.3629480060575467e-07, + "loss": 0.0491, "step": 9860 }, { - "epoch": 2.9313929313929314, - "grad_norm": 0.3801690638065338, - "learning_rate": 1.2411642411642413e-05, - "loss": 0.0628, + "epoch": 4.982332155477032, + "grad_norm": 0.5215485692024231, + "learning_rate": 1.0600706713780919e-07, + "loss": 0.0438, "step": 9870 }, { - "epoch": 2.9343629343629343, - "grad_norm": 0.5469937324523926, - "learning_rate": 1.2393822393822394e-05, - "loss": 0.0681, + "epoch": 4.987380111055023, + "grad_norm": 0.36851760745048523, + "learning_rate": 7.57193336698637e-08, + "loss": 0.052, "step": 9880 }, { - "epoch": 2.937332937332937, - "grad_norm": 0.7467171549797058, - "learning_rate": 1.2376002376002376e-05, - "loss": 0.0555, + "epoch": 4.992428066633014, + "grad_norm": 0.5213483572006226, + "learning_rate": 4.5431600201918226e-08, + "loss": 0.0472, "step": 9890 }, { - "epoch": 2.9403029403029404, - "grad_norm": 0.5576099157333374, - "learning_rate": 1.2358182358182358e-05, - "loss": 0.0722, + "epoch": 4.997476022211004, + "grad_norm": 0.710657000541687, + "learning_rate": 1.514386673397274e-08, + "loss": 0.0582, "step": 9900 }, { - "epoch": 2.943272943272943, - "grad_norm": 0.5140604972839355, - "learning_rate": 1.2340362340362342e-05, - "loss": 0.0628, - "step": 9910 - }, - { - "epoch": 2.946242946242946, - "grad_norm": 0.6918432116508484, - "learning_rate": 1.2322542322542322e-05, - "loss": 0.0709, - "step": 9920 - }, - { - "epoch": 2.9492129492129493, - "grad_norm": 0.4932166635990143, - "learning_rate": 1.2304722304722305e-05, - "loss": 0.0685, - "step": 9930 - }, - { - "epoch": 2.952182952182952, - "grad_norm": 0.42789584398269653, - "learning_rate": 1.2286902286902288e-05, - "loss": 0.0654, - "step": 9940 - }, - { - "epoch": 2.955152955152955, - "grad_norm": 0.6951575875282288, - "learning_rate": 1.2269082269082269e-05, - "loss": 0.0677, - "step": 9950 - }, - { - "epoch": 2.9581229581229582, - "grad_norm": 0.5306366682052612, - "learning_rate": 1.2251262251262251e-05, - "loss": 0.0561, - "step": 9960 - }, - { - "epoch": 2.961092961092961, - "grad_norm": 0.45280131697654724, - "learning_rate": 1.2233442233442233e-05, - "loss": 0.0594, - "step": 9970 - }, - { - "epoch": 2.964062964062964, - "grad_norm": 0.5789720416069031, - "learning_rate": 1.2215622215622217e-05, - "loss": 0.0691, - "step": 9980 - }, - { - "epoch": 2.967032967032967, - "grad_norm": 0.4295837879180908, - "learning_rate": 1.2197802197802198e-05, - "loss": 0.0492, - "step": 9990 - }, - { - "epoch": 2.97000297000297, - "grad_norm": 0.3032509684562683, - "learning_rate": 1.217998217998218e-05, - "loss": 0.0634, - "step": 10000 - }, - { - "epoch": 2.972972972972973, - "grad_norm": 0.6462733149528503, - "learning_rate": 1.2162162162162164e-05, - "loss": 0.0667, - "step": 10010 - }, - { - "epoch": 2.975942975942976, - "grad_norm": 0.5056395530700684, - "learning_rate": 1.2144342144342144e-05, - "loss": 0.0637, - "step": 10020 - }, - { - "epoch": 2.978912978912979, - "grad_norm": 0.3662366569042206, - "learning_rate": 1.2126522126522126e-05, - "loss": 0.0709, - "step": 10030 - }, - { - "epoch": 2.9818829818829817, - "grad_norm": 0.49650683999061584, - "learning_rate": 1.2108702108702108e-05, - "loss": 0.0711, - "step": 10040 - }, - { - "epoch": 2.984852984852985, - "grad_norm": 0.44112861156463623, - "learning_rate": 1.2090882090882092e-05, - "loss": 0.066, - "step": 10050 - }, - { - "epoch": 2.987822987822988, - "grad_norm": 0.5365132689476013, - "learning_rate": 1.2073062073062073e-05, - "loss": 0.0589, - "step": 10060 - }, - { - "epoch": 2.9907929907929907, - "grad_norm": 0.4564819931983948, - "learning_rate": 1.2055242055242055e-05, - "loss": 0.0686, - "step": 10070 - }, - { - "epoch": 2.993762993762994, - "grad_norm": 0.6063446402549744, - "learning_rate": 1.2037422037422039e-05, - "loss": 0.0673, - "step": 10080 - }, - { - "epoch": 2.9967329967329968, - "grad_norm": 0.516140878200531, - "learning_rate": 1.2019602019602021e-05, - "loss": 0.0454, - "step": 10090 - }, - { - "epoch": 2.9997029997029996, - "grad_norm": 0.36144575476646423, - "learning_rate": 1.2001782001782001e-05, - "loss": 0.0581, - "step": 10100 - }, - { - "epoch": 3.0, - "eval_f1": 0.49727767695099817, - "eval_loss": 0.059147998690605164, - "eval_runtime": 179.7759, - "eval_samples_per_second": 211.48, - "eval_steps_per_second": 3.31, - "step": 10101 - }, - { - "epoch": 3.002673002673003, - "grad_norm": 0.46553778648376465, - "learning_rate": 1.1983961983961984e-05, - "loss": 0.0572, - "step": 10110 - }, - { - "epoch": 3.0056430056430057, - "grad_norm": 0.38310161232948303, - "learning_rate": 1.1966141966141967e-05, - "loss": 0.0653, - "step": 10120 - }, - { - "epoch": 3.0086130086130085, - "grad_norm": 0.7176486253738403, - "learning_rate": 1.1948321948321948e-05, - "loss": 0.0696, - "step": 10130 - }, - { - "epoch": 3.011583011583012, - "grad_norm": 0.3964185118675232, - "learning_rate": 1.193050193050193e-05, - "loss": 0.0559, - "step": 10140 - }, - { - "epoch": 3.0145530145530146, - "grad_norm": 0.480051189661026, - "learning_rate": 1.1912681912681914e-05, - "loss": 0.0688, - "step": 10150 - }, - { - "epoch": 3.0175230175230174, - "grad_norm": 0.4801310896873474, - "learning_rate": 1.1894861894861896e-05, - "loss": 0.0653, - "step": 10160 - }, - { - "epoch": 3.0204930204930207, - "grad_norm": 0.7674263119697571, - "learning_rate": 1.1877041877041877e-05, - "loss": 0.0639, - "step": 10170 - }, - { - "epoch": 3.0234630234630235, - "grad_norm": 0.35185834765434265, - "learning_rate": 1.1859221859221859e-05, - "loss": 0.0446, - "step": 10180 - }, - { - "epoch": 3.0264330264330264, - "grad_norm": 0.6630620956420898, - "learning_rate": 1.1841401841401843e-05, - "loss": 0.0705, - "step": 10190 - }, - { - "epoch": 3.029403029403029, - "grad_norm": 0.5050874352455139, - "learning_rate": 1.1823581823581823e-05, - "loss": 0.0563, - "step": 10200 - }, - { - "epoch": 3.0323730323730325, - "grad_norm": 0.29523542523384094, - "learning_rate": 1.1805761805761805e-05, - "loss": 0.0598, - "step": 10210 - }, - { - "epoch": 3.0353430353430353, - "grad_norm": 0.5692099928855896, - "learning_rate": 1.1787941787941789e-05, - "loss": 0.0733, - "step": 10220 - }, - { - "epoch": 3.038313038313038, - "grad_norm": 0.714964747428894, - "learning_rate": 1.1770121770121771e-05, - "loss": 0.0547, - "step": 10230 - }, - { - "epoch": 3.0412830412830414, - "grad_norm": 0.4890214502811432, - "learning_rate": 1.1752301752301752e-05, - "loss": 0.0679, - "step": 10240 - }, - { - "epoch": 3.044253044253044, - "grad_norm": 0.5631494522094727, - "learning_rate": 1.1734481734481734e-05, - "loss": 0.06, - "step": 10250 - }, - { - "epoch": 3.047223047223047, - "grad_norm": 0.6472118496894836, - "learning_rate": 1.1716661716661718e-05, - "loss": 0.0533, - "step": 10260 - }, - { - "epoch": 3.0501930501930503, - "grad_norm": 0.6611066460609436, - "learning_rate": 1.1698841698841698e-05, - "loss": 0.0591, - "step": 10270 - }, - { - "epoch": 3.053163053163053, - "grad_norm": 0.4274856448173523, - "learning_rate": 1.168102168102168e-05, - "loss": 0.0652, - "step": 10280 - }, - { - "epoch": 3.056133056133056, - "grad_norm": 0.32548412680625916, - "learning_rate": 1.1663201663201664e-05, - "loss": 0.0678, - "step": 10290 - }, - { - "epoch": 3.0591030591030592, - "grad_norm": 0.36015450954437256, - "learning_rate": 1.1645381645381647e-05, - "loss": 0.0691, - "step": 10300 - }, - { - "epoch": 3.062073062073062, - "grad_norm": 0.5831524133682251, - "learning_rate": 1.1627561627561627e-05, - "loss": 0.0735, - "step": 10310 - }, - { - "epoch": 3.065043065043065, - "grad_norm": 0.7021368741989136, - "learning_rate": 1.160974160974161e-05, - "loss": 0.0728, - "step": 10320 - }, - { - "epoch": 3.068013068013068, - "grad_norm": 0.5424765944480896, - "learning_rate": 1.1591921591921593e-05, - "loss": 0.0617, - "step": 10330 - }, - { - "epoch": 3.070983070983071, - "grad_norm": 0.7176571488380432, - "learning_rate": 1.1574101574101574e-05, - "loss": 0.0677, - "step": 10340 - }, - { - "epoch": 3.073953073953074, - "grad_norm": 0.33526375889778137, - "learning_rate": 1.1556281556281556e-05, - "loss": 0.0626, - "step": 10350 + "epoch": 5.0, + "eval_f1": 0.9705180789481339, + "eval_loss": 0.03909851238131523, + "eval_runtime": 579.4034, + "eval_samples_per_second": 355.99, + "eval_steps_per_second": 2.782, + "step": 9905 }, { - "epoch": 3.076923076923077, - "grad_norm": 0.4724681079387665, - "learning_rate": 1.153846153846154e-05, - "loss": 0.0575, - "step": 10360 + "epoch": 5.0, + "step": 9905, + "total_flos": 9.82152667464321e+19, + "train_loss": 0.0, + "train_runtime": 0.0928, + "train_samples_per_second": 13658896.327, + "train_steps_per_second": 106755.597 + } + ], + "logging_steps": 10, + "max_steps": 9905, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 2, + "early_stopping_threshold": 0.0 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } }, - { - "epoch": 3.07989307989308, - "grad_norm": 0.6367087364196777, - "learning_rate": 1.1520641520641522e-05, - "loss": 0.0538, - "step": 10370 - }, - { - "epoch": 3.0828630828630827, - "grad_norm": 0.31437206268310547, - "learning_rate": 1.1502821502821502e-05, - "loss": 0.0643, - "step": 10380 - }, - { - "epoch": 3.085833085833086, - "grad_norm": 0.4423040449619293, - "learning_rate": 1.1485001485001484e-05, - "loss": 0.0684, - "step": 10390 - }, - { - "epoch": 3.088803088803089, - "grad_norm": 0.4041610360145569, - "learning_rate": 1.1467181467181468e-05, - "loss": 0.0552, - "step": 10400 - }, - { - "epoch": 3.0917730917730917, - "grad_norm": 0.4148096442222595, - "learning_rate": 1.144936144936145e-05, - "loss": 0.0693, - "step": 10410 - }, - { - "epoch": 3.094743094743095, - "grad_norm": 0.30476808547973633, - "learning_rate": 1.1431541431541431e-05, - "loss": 0.0684, - "step": 10420 - }, - { - "epoch": 3.0977130977130978, - "grad_norm": 0.7706785798072815, - "learning_rate": 1.1413721413721415e-05, - "loss": 0.0707, - "step": 10430 - }, - { - "epoch": 3.1006831006831006, - "grad_norm": 0.3732987940311432, - "learning_rate": 1.1395901395901397e-05, - "loss": 0.0619, - "step": 10440 - }, - { - "epoch": 3.1036531036531034, - "grad_norm": 0.4054795503616333, - "learning_rate": 1.1378081378081377e-05, - "loss": 0.0666, - "step": 10450 - }, - { - "epoch": 3.1066231066231067, - "grad_norm": 0.660860538482666, - "learning_rate": 1.136026136026136e-05, - "loss": 0.0639, - "step": 10460 - }, - { - "epoch": 3.1095931095931095, - "grad_norm": 0.5180338025093079, - "learning_rate": 1.1342441342441343e-05, - "loss": 0.068, - "step": 10470 - }, - { - "epoch": 3.1125631125631124, - "grad_norm": 0.44153326749801636, - "learning_rate": 1.1324621324621326e-05, - "loss": 0.0621, - "step": 10480 - }, - { - "epoch": 3.1155331155331156, - "grad_norm": 0.6957278251647949, - "learning_rate": 1.1306801306801306e-05, - "loss": 0.0553, - "step": 10490 - }, - { - "epoch": 3.1185031185031185, - "grad_norm": 0.29442107677459717, - "learning_rate": 1.128898128898129e-05, - "loss": 0.0646, - "step": 10500 - }, - { - "epoch": 3.1214731214731213, - "grad_norm": 0.4631500244140625, - "learning_rate": 1.1271161271161272e-05, - "loss": 0.0699, - "step": 10510 - }, - { - "epoch": 3.1244431244431246, - "grad_norm": 0.4856095314025879, - "learning_rate": 1.1253341253341253e-05, - "loss": 0.0685, - "step": 10520 - }, - { - "epoch": 3.1274131274131274, - "grad_norm": 0.7424579858779907, - "learning_rate": 1.1235521235521235e-05, - "loss": 0.0639, - "step": 10530 - }, - { - "epoch": 3.13038313038313, - "grad_norm": 0.5345817804336548, - "learning_rate": 1.1217701217701219e-05, - "loss": 0.0641, - "step": 10540 - }, - { - "epoch": 3.1333531333531335, - "grad_norm": 0.5012867450714111, - "learning_rate": 1.11998811998812e-05, - "loss": 0.0803, - "step": 10550 - }, - { - "epoch": 3.1363231363231363, - "grad_norm": 0.5213742852210999, - "learning_rate": 1.1182061182061181e-05, - "loss": 0.0469, - "step": 10560 - }, - { - "epoch": 3.139293139293139, - "grad_norm": 0.39430922269821167, - "learning_rate": 1.1164241164241165e-05, - "loss": 0.0502, - "step": 10570 - }, - { - "epoch": 3.1422631422631424, - "grad_norm": 0.6875708699226379, - "learning_rate": 1.1146421146421147e-05, - "loss": 0.0617, - "step": 10580 - }, - { - "epoch": 3.1452331452331452, - "grad_norm": 0.4213047921657562, - "learning_rate": 1.1128601128601128e-05, - "loss": 0.067, - "step": 10590 - }, - { - "epoch": 3.148203148203148, - "grad_norm": 0.9495222568511963, - "learning_rate": 1.111078111078111e-05, - "loss": 0.0446, - "step": 10600 - }, - { - "epoch": 3.1511731511731513, - "grad_norm": 0.37120023369789124, - "learning_rate": 1.1092961092961094e-05, - "loss": 0.0727, - "step": 10610 - }, - { - "epoch": 3.154143154143154, - "grad_norm": 0.44335830211639404, - "learning_rate": 1.1075141075141076e-05, - "loss": 0.0574, - "step": 10620 - }, - { - "epoch": 3.157113157113157, - "grad_norm": 0.6420602798461914, - "learning_rate": 1.1057321057321056e-05, - "loss": 0.0708, - "step": 10630 - }, - { - "epoch": 3.1600831600831603, - "grad_norm": 0.4319610297679901, - "learning_rate": 1.103950103950104e-05, - "loss": 0.0646, - "step": 10640 - }, - { - "epoch": 3.163053163053163, - "grad_norm": 0.34275874495506287, - "learning_rate": 1.1021681021681022e-05, - "loss": 0.0511, - "step": 10650 - }, - { - "epoch": 3.166023166023166, - "grad_norm": 0.32853662967681885, - "learning_rate": 1.1003861003861003e-05, - "loss": 0.0476, - "step": 10660 - }, - { - "epoch": 3.168993168993169, - "grad_norm": 0.7371835708618164, - "learning_rate": 1.0986040986040985e-05, - "loss": 0.067, - "step": 10670 - }, - { - "epoch": 3.171963171963172, - "grad_norm": 0.23537606000900269, - "learning_rate": 1.0968220968220969e-05, - "loss": 0.0636, - "step": 10680 - }, - { - "epoch": 3.174933174933175, - "grad_norm": 0.638041615486145, - "learning_rate": 1.0950400950400951e-05, - "loss": 0.0658, - "step": 10690 - }, - { - "epoch": 3.177903177903178, - "grad_norm": 0.7828889489173889, - "learning_rate": 1.0932580932580932e-05, - "loss": 0.0603, - "step": 10700 - }, - { - "epoch": 3.180873180873181, - "grad_norm": 0.41569939255714417, - "learning_rate": 1.0914760914760916e-05, - "loss": 0.0528, - "step": 10710 - }, - { - "epoch": 3.1838431838431838, - "grad_norm": 0.4870140552520752, - "learning_rate": 1.0896940896940898e-05, - "loss": 0.0565, - "step": 10720 - }, - { - "epoch": 3.186813186813187, - "grad_norm": 0.3599897623062134, - "learning_rate": 1.087912087912088e-05, - "loss": 0.0469, - "step": 10730 - }, - { - "epoch": 3.18978318978319, - "grad_norm": 0.26678797602653503, - "learning_rate": 1.086130086130086e-05, - "loss": 0.0708, - "step": 10740 - }, - { - "epoch": 3.1927531927531927, - "grad_norm": 0.6243604421615601, - "learning_rate": 1.0843480843480844e-05, - "loss": 0.0486, - "step": 10750 - }, - { - "epoch": 3.1957231957231955, - "grad_norm": 0.5825532674789429, - "learning_rate": 1.0825660825660826e-05, - "loss": 0.0663, - "step": 10760 - }, - { - "epoch": 3.198693198693199, - "grad_norm": 0.4092167913913727, - "learning_rate": 1.0807840807840807e-05, - "loss": 0.0704, - "step": 10770 - }, - { - "epoch": 3.2016632016632016, - "grad_norm": 0.5701293349266052, - "learning_rate": 1.079002079002079e-05, - "loss": 0.0635, - "step": 10780 - }, - { - "epoch": 3.2046332046332044, - "grad_norm": 0.25863227248191833, - "learning_rate": 1.0772200772200773e-05, - "loss": 0.0641, - "step": 10790 - }, - { - "epoch": 3.2076032076032077, - "grad_norm": 0.3742627203464508, - "learning_rate": 1.0754380754380755e-05, - "loss": 0.0698, - "step": 10800 - }, - { - "epoch": 3.2105732105732105, - "grad_norm": 0.3190728724002838, - "learning_rate": 1.0736560736560736e-05, - "loss": 0.0585, - "step": 10810 - }, - { - "epoch": 3.2135432135432134, - "grad_norm": 0.49537599086761475, - "learning_rate": 1.071874071874072e-05, - "loss": 0.0635, - "step": 10820 - }, - { - "epoch": 3.2165132165132166, - "grad_norm": 0.3896566927433014, - "learning_rate": 1.0700920700920702e-05, - "loss": 0.0753, - "step": 10830 - }, - { - "epoch": 3.2194832194832195, - "grad_norm": 0.6234869956970215, - "learning_rate": 1.0683100683100682e-05, - "loss": 0.0545, - "step": 10840 - }, - { - "epoch": 3.2224532224532223, - "grad_norm": 0.421795129776001, - "learning_rate": 1.0665280665280666e-05, - "loss": 0.0714, - "step": 10850 - }, - { - "epoch": 3.2254232254232256, - "grad_norm": 0.6576681733131409, - "learning_rate": 1.0647460647460648e-05, - "loss": 0.0661, - "step": 10860 - }, - { - "epoch": 3.2283932283932284, - "grad_norm": 0.5803960561752319, - "learning_rate": 1.062964062964063e-05, - "loss": 0.0732, - "step": 10870 - }, - { - "epoch": 3.2313632313632312, - "grad_norm": 0.39635559916496277, - "learning_rate": 1.0611820611820612e-05, - "loss": 0.0473, - "step": 10880 - }, - { - "epoch": 3.2343332343332345, - "grad_norm": 0.5573329329490662, - "learning_rate": 1.0594000594000595e-05, - "loss": 0.059, - "step": 10890 - }, - { - "epoch": 3.2373032373032373, - "grad_norm": 0.6418017148971558, - "learning_rate": 1.0576180576180577e-05, - "loss": 0.0633, - "step": 10900 - }, - { - "epoch": 3.24027324027324, - "grad_norm": 0.6030585169792175, - "learning_rate": 1.0558360558360557e-05, - "loss": 0.0596, - "step": 10910 - }, - { - "epoch": 3.2432432432432434, - "grad_norm": 0.41735953092575073, - "learning_rate": 1.0540540540540541e-05, - "loss": 0.0585, - "step": 10920 - }, - { - "epoch": 3.2462132462132463, - "grad_norm": 0.7560169696807861, - "learning_rate": 1.0522720522720523e-05, - "loss": 0.0566, - "step": 10930 - }, - { - "epoch": 3.249183249183249, - "grad_norm": 0.2606422007083893, - "learning_rate": 1.0504900504900505e-05, - "loss": 0.0498, - "step": 10940 - }, - { - "epoch": 3.252153252153252, - "grad_norm": 0.5863521695137024, - "learning_rate": 1.0487080487080488e-05, - "loss": 0.0603, - "step": 10950 - }, - { - "epoch": 3.255123255123255, - "grad_norm": 0.4618661403656006, - "learning_rate": 1.046926046926047e-05, - "loss": 0.0709, - "step": 10960 - }, - { - "epoch": 3.258093258093258, - "grad_norm": 0.3728097975254059, - "learning_rate": 1.0451440451440452e-05, - "loss": 0.0605, - "step": 10970 - }, - { - "epoch": 3.261063261063261, - "grad_norm": 0.4798294007778168, - "learning_rate": 1.0433620433620434e-05, - "loss": 0.0559, - "step": 10980 - }, - { - "epoch": 3.264033264033264, - "grad_norm": 0.6178519129753113, - "learning_rate": 1.0415800415800416e-05, - "loss": 0.0696, - "step": 10990 - }, - { - "epoch": 3.267003267003267, - "grad_norm": 0.4793247580528259, - "learning_rate": 1.0397980397980398e-05, - "loss": 0.059, - "step": 11000 - }, - { - "epoch": 3.2699732699732698, - "grad_norm": 0.33142969012260437, - "learning_rate": 1.038016038016038e-05, - "loss": 0.0439, - "step": 11010 - }, - { - "epoch": 3.272943272943273, - "grad_norm": 0.261089950799942, - "learning_rate": 1.0362340362340363e-05, - "loss": 0.0586, - "step": 11020 - }, - { - "epoch": 3.275913275913276, - "grad_norm": 0.34269529581069946, - "learning_rate": 1.0344520344520345e-05, - "loss": 0.0679, - "step": 11030 - }, - { - "epoch": 3.2788832788832787, - "grad_norm": 0.4112348258495331, - "learning_rate": 1.0326700326700327e-05, - "loss": 0.0599, - "step": 11040 - }, - { - "epoch": 3.281853281853282, - "grad_norm": 0.5969886183738708, - "learning_rate": 1.030888030888031e-05, - "loss": 0.0719, - "step": 11050 - }, - { - "epoch": 3.284823284823285, - "grad_norm": 0.5105575323104858, - "learning_rate": 1.0291060291060291e-05, - "loss": 0.053, - "step": 11060 - }, - { - "epoch": 3.2877932877932876, - "grad_norm": 0.4884382486343384, - "learning_rate": 1.0273240273240274e-05, - "loss": 0.0553, - "step": 11070 - }, - { - "epoch": 3.290763290763291, - "grad_norm": 0.4914264678955078, - "learning_rate": 1.0255420255420256e-05, - "loss": 0.0592, - "step": 11080 - }, - { - "epoch": 3.2937332937332937, - "grad_norm": 0.44552749395370483, - "learning_rate": 1.0237600237600238e-05, - "loss": 0.0527, - "step": 11090 - }, - { - "epoch": 3.2967032967032965, - "grad_norm": 0.43704137206077576, - "learning_rate": 1.021978021978022e-05, - "loss": 0.062, - "step": 11100 - }, - { - "epoch": 3.2996732996733, - "grad_norm": 0.45537468791007996, - "learning_rate": 1.0201960201960202e-05, - "loss": 0.0685, - "step": 11110 - }, - { - "epoch": 3.3026433026433026, - "grad_norm": 0.45990774035453796, - "learning_rate": 1.0184140184140184e-05, - "loss": 0.0675, - "step": 11120 - }, - { - "epoch": 3.3056133056133055, - "grad_norm": 0.375456303358078, - "learning_rate": 1.0166320166320167e-05, - "loss": 0.0585, - "step": 11130 - }, - { - "epoch": 3.3085833085833087, - "grad_norm": 0.42089733481407166, - "learning_rate": 1.0148500148500149e-05, - "loss": 0.0636, - "step": 11140 - }, - { - "epoch": 3.3115533115533116, - "grad_norm": 0.4135701060295105, - "learning_rate": 1.0130680130680131e-05, - "loss": 0.0742, - "step": 11150 - }, - { - "epoch": 3.3145233145233144, - "grad_norm": 0.47297653555870056, - "learning_rate": 1.0112860112860113e-05, - "loss": 0.0723, - "step": 11160 - }, - { - "epoch": 3.3174933174933177, - "grad_norm": 0.5323516726493835, - "learning_rate": 1.0095040095040095e-05, - "loss": 0.0577, - "step": 11170 - }, - { - "epoch": 3.3204633204633205, - "grad_norm": 0.37327128648757935, - "learning_rate": 1.0077220077220078e-05, - "loss": 0.0606, - "step": 11180 - }, - { - "epoch": 3.3234333234333233, - "grad_norm": 0.9338498711585999, - "learning_rate": 1.005940005940006e-05, - "loss": 0.0679, - "step": 11190 - }, - { - "epoch": 3.3264033264033266, - "grad_norm": 0.5313315987586975, - "learning_rate": 1.0041580041580042e-05, - "loss": 0.0592, - "step": 11200 - }, - { - "epoch": 3.3293733293733294, - "grad_norm": 0.27918875217437744, - "learning_rate": 1.0023760023760024e-05, - "loss": 0.0661, - "step": 11210 - }, - { - "epoch": 3.3323433323433322, - "grad_norm": 0.3626916706562042, - "learning_rate": 1.0005940005940006e-05, - "loss": 0.0493, - "step": 11220 - }, - { - "epoch": 3.3353133353133355, - "grad_norm": 0.43011564016342163, - "learning_rate": 9.988119988119988e-06, - "loss": 0.0681, - "step": 11230 - }, - { - "epoch": 3.3382833382833383, - "grad_norm": 0.5412601232528687, - "learning_rate": 9.97029997029997e-06, - "loss": 0.0639, - "step": 11240 - }, - { - "epoch": 3.341253341253341, - "grad_norm": 0.6399582028388977, - "learning_rate": 9.952479952479953e-06, - "loss": 0.0497, - "step": 11250 - }, - { - "epoch": 3.3442233442233444, - "grad_norm": 0.44264036417007446, - "learning_rate": 9.934659934659935e-06, - "loss": 0.0582, - "step": 11260 - }, - { - "epoch": 3.3471933471933473, - "grad_norm": 0.3276296854019165, - "learning_rate": 9.916839916839917e-06, - "loss": 0.0463, - "step": 11270 - }, - { - "epoch": 3.35016335016335, - "grad_norm": 0.3752717077732086, - "learning_rate": 9.8990198990199e-06, - "loss": 0.0725, - "step": 11280 - }, - { - "epoch": 3.3531333531333534, - "grad_norm": 0.5361660718917847, - "learning_rate": 9.881199881199881e-06, - "loss": 0.0674, - "step": 11290 - }, - { - "epoch": 3.356103356103356, - "grad_norm": 0.5567395687103271, - "learning_rate": 9.863379863379865e-06, - "loss": 0.0708, - "step": 11300 - }, - { - "epoch": 3.359073359073359, - "grad_norm": 0.4132004678249359, - "learning_rate": 9.845559845559846e-06, - "loss": 0.0667, - "step": 11310 - }, - { - "epoch": 3.362043362043362, - "grad_norm": 0.5917862057685852, - "learning_rate": 9.827739827739828e-06, - "loss": 0.0494, - "step": 11320 - }, - { - "epoch": 3.365013365013365, - "grad_norm": 0.41967251896858215, - "learning_rate": 9.80991980991981e-06, - "loss": 0.0526, - "step": 11330 - }, - { - "epoch": 3.367983367983368, - "grad_norm": 0.2610296308994293, - "learning_rate": 9.792099792099792e-06, - "loss": 0.0597, - "step": 11340 - }, - { - "epoch": 3.3709533709533708, - "grad_norm": 0.6055606603622437, - "learning_rate": 9.774279774279774e-06, - "loss": 0.0615, - "step": 11350 - }, - { - "epoch": 3.373923373923374, - "grad_norm": 0.6907655596733093, - "learning_rate": 9.756459756459757e-06, - "loss": 0.0592, - "step": 11360 - }, - { - "epoch": 3.376893376893377, - "grad_norm": 0.5287322402000427, - "learning_rate": 9.73863973863974e-06, - "loss": 0.0501, - "step": 11370 - }, - { - "epoch": 3.3798633798633797, - "grad_norm": 0.3826773762702942, - "learning_rate": 9.720819720819721e-06, - "loss": 0.0654, - "step": 11380 - }, - { - "epoch": 3.382833382833383, - "grad_norm": 0.4057276248931885, - "learning_rate": 9.702999702999703e-06, - "loss": 0.0686, - "step": 11390 - }, - { - "epoch": 3.385803385803386, - "grad_norm": 0.3789379596710205, - "learning_rate": 9.685179685179685e-06, - "loss": 0.0705, - "step": 11400 - }, - { - "epoch": 3.3887733887733886, - "grad_norm": 0.6244688630104065, - "learning_rate": 9.667359667359667e-06, - "loss": 0.0529, - "step": 11410 - }, - { - "epoch": 3.391743391743392, - "grad_norm": 0.4109695255756378, - "learning_rate": 9.64953964953965e-06, - "loss": 0.0606, - "step": 11420 - }, - { - "epoch": 3.3947133947133947, - "grad_norm": 0.5615403652191162, - "learning_rate": 9.631719631719632e-06, - "loss": 0.0619, - "step": 11430 - }, - { - "epoch": 3.3976833976833976, - "grad_norm": 0.5328549742698669, - "learning_rate": 9.613899613899616e-06, - "loss": 0.0577, - "step": 11440 - }, - { - "epoch": 3.400653400653401, - "grad_norm": 0.6064325571060181, - "learning_rate": 9.596079596079596e-06, - "loss": 0.0627, - "step": 11450 - }, - { - "epoch": 3.4036234036234037, - "grad_norm": 0.3764317035675049, - "learning_rate": 9.578259578259578e-06, - "loss": 0.0492, - "step": 11460 - }, - { - "epoch": 3.4065934065934065, - "grad_norm": 0.40372684597969055, - "learning_rate": 9.56043956043956e-06, - "loss": 0.0625, - "step": 11470 - }, - { - "epoch": 3.4095634095634098, - "grad_norm": 0.5874956250190735, - "learning_rate": 9.542619542619543e-06, - "loss": 0.0554, - "step": 11480 - }, - { - "epoch": 3.4125334125334126, - "grad_norm": 0.6757147908210754, - "learning_rate": 9.524799524799525e-06, - "loss": 0.0503, - "step": 11490 - }, - { - "epoch": 3.4155034155034154, - "grad_norm": 0.33406156301498413, - "learning_rate": 9.506979506979507e-06, - "loss": 0.0751, - "step": 11500 - }, - { - "epoch": 3.4184734184734182, - "grad_norm": 0.22471563518047333, - "learning_rate": 9.48915948915949e-06, - "loss": 0.0608, - "step": 11510 - }, - { - "epoch": 3.4214434214434215, - "grad_norm": 0.29463276267051697, - "learning_rate": 9.471339471339471e-06, - "loss": 0.0541, - "step": 11520 - }, - { - "epoch": 3.4244134244134243, - "grad_norm": 0.5052198171615601, - "learning_rate": 9.453519453519453e-06, - "loss": 0.0705, - "step": 11530 - }, - { - "epoch": 3.427383427383427, - "grad_norm": 0.4901563823223114, - "learning_rate": 9.435699435699436e-06, - "loss": 0.0676, - "step": 11540 - }, - { - "epoch": 3.4303534303534304, - "grad_norm": 0.8629029989242554, - "learning_rate": 9.417879417879418e-06, - "loss": 0.0601, - "step": 11550 - }, - { - "epoch": 3.4333234333234333, - "grad_norm": 0.4550071656703949, - "learning_rate": 9.4000594000594e-06, - "loss": 0.056, - "step": 11560 - }, - { - "epoch": 3.436293436293436, - "grad_norm": 0.4358525574207306, - "learning_rate": 9.382239382239382e-06, - "loss": 0.0581, - "step": 11570 - }, - { - "epoch": 3.4392634392634394, - "grad_norm": 0.5264983773231506, - "learning_rate": 9.364419364419366e-06, - "loss": 0.0545, - "step": 11580 - }, - { - "epoch": 3.442233442233442, - "grad_norm": 0.6651470065116882, - "learning_rate": 9.346599346599347e-06, - "loss": 0.0502, - "step": 11590 - }, - { - "epoch": 3.445203445203445, - "grad_norm": 0.8447353839874268, - "learning_rate": 9.328779328779329e-06, - "loss": 0.0635, - "step": 11600 - }, - { - "epoch": 3.4481734481734483, - "grad_norm": 0.39546453952789307, - "learning_rate": 9.31095931095931e-06, - "loss": 0.0569, - "step": 11610 - }, - { - "epoch": 3.451143451143451, - "grad_norm": 0.5016785860061646, - "learning_rate": 9.293139293139295e-06, - "loss": 0.0566, - "step": 11620 - }, - { - "epoch": 3.454113454113454, - "grad_norm": 0.4469011723995209, - "learning_rate": 9.275319275319275e-06, - "loss": 0.0561, - "step": 11630 - }, - { - "epoch": 3.457083457083457, - "grad_norm": 0.7788525819778442, - "learning_rate": 9.257499257499257e-06, - "loss": 0.0609, - "step": 11640 - }, - { - "epoch": 3.46005346005346, - "grad_norm": 0.38840779662132263, - "learning_rate": 9.239679239679241e-06, - "loss": 0.0563, - "step": 11650 - }, - { - "epoch": 3.463023463023463, - "grad_norm": 0.4505913257598877, - "learning_rate": 9.221859221859222e-06, - "loss": 0.0551, - "step": 11660 - }, - { - "epoch": 3.465993465993466, - "grad_norm": 0.4120921492576599, - "learning_rate": 9.204039204039204e-06, - "loss": 0.0626, - "step": 11670 - }, - { - "epoch": 3.468963468963469, - "grad_norm": 0.32375073432922363, - "learning_rate": 9.186219186219186e-06, - "loss": 0.0702, - "step": 11680 - }, - { - "epoch": 3.471933471933472, - "grad_norm": 0.7593043446540833, - "learning_rate": 9.16839916839917e-06, - "loss": 0.0585, - "step": 11690 - }, - { - "epoch": 3.474903474903475, - "grad_norm": 0.8873201608657837, - "learning_rate": 9.15057915057915e-06, - "loss": 0.0556, - "step": 11700 - }, - { - "epoch": 3.477873477873478, - "grad_norm": 0.23573075234889984, - "learning_rate": 9.132759132759133e-06, - "loss": 0.0691, - "step": 11710 - }, - { - "epoch": 3.4808434808434807, - "grad_norm": 0.6220734119415283, - "learning_rate": 9.114939114939116e-06, - "loss": 0.0666, - "step": 11720 - }, - { - "epoch": 3.483813483813484, - "grad_norm": 0.34220409393310547, - "learning_rate": 9.097119097119097e-06, - "loss": 0.064, - "step": 11730 - }, - { - "epoch": 3.486783486783487, - "grad_norm": 0.46370092034339905, - "learning_rate": 9.079299079299079e-06, - "loss": 0.0723, - "step": 11740 - }, - { - "epoch": 3.4897534897534896, - "grad_norm": 0.3304176926612854, - "learning_rate": 9.061479061479061e-06, - "loss": 0.0536, - "step": 11750 - }, - { - "epoch": 3.492723492723493, - "grad_norm": 0.31319358944892883, - "learning_rate": 9.043659043659045e-06, - "loss": 0.0643, - "step": 11760 - }, - { - "epoch": 3.4956934956934957, - "grad_norm": 0.7758733034133911, - "learning_rate": 9.025839025839026e-06, - "loss": 0.056, - "step": 11770 - }, - { - "epoch": 3.4986634986634986, - "grad_norm": 0.48000404238700867, - "learning_rate": 9.008019008019008e-06, - "loss": 0.0533, - "step": 11780 - }, - { - "epoch": 3.501633501633502, - "grad_norm": 0.6242011189460754, - "learning_rate": 8.990198990198992e-06, - "loss": 0.06, - "step": 11790 - }, - { - "epoch": 3.5046035046035047, - "grad_norm": 0.6063371896743774, - "learning_rate": 8.972378972378972e-06, - "loss": 0.0478, - "step": 11800 - }, - { - "epoch": 3.5075735075735075, - "grad_norm": 0.4791490137577057, - "learning_rate": 8.954558954558954e-06, - "loss": 0.0486, - "step": 11810 - }, - { - "epoch": 3.5105435105435108, - "grad_norm": 0.3279794752597809, - "learning_rate": 8.936738936738936e-06, - "loss": 0.0655, - "step": 11820 - }, - { - "epoch": 3.5135135135135136, - "grad_norm": 0.42596834897994995, - "learning_rate": 8.91891891891892e-06, - "loss": 0.0733, - "step": 11830 - }, - { - "epoch": 3.5164835164835164, - "grad_norm": 0.4257424771785736, - "learning_rate": 8.9010989010989e-06, - "loss": 0.0624, - "step": 11840 - }, - { - "epoch": 3.5194535194535197, - "grad_norm": 0.46473416686058044, - "learning_rate": 8.883278883278883e-06, - "loss": 0.0562, - "step": 11850 - }, - { - "epoch": 3.5224235224235225, - "grad_norm": 0.6375032663345337, - "learning_rate": 8.865458865458867e-06, - "loss": 0.0589, - "step": 11860 - }, - { - "epoch": 3.5253935253935254, - "grad_norm": 0.35437679290771484, - "learning_rate": 8.847638847638847e-06, - "loss": 0.0731, - "step": 11870 - }, - { - "epoch": 3.5283635283635286, - "grad_norm": 0.5066477060317993, - "learning_rate": 8.82981882981883e-06, - "loss": 0.0614, - "step": 11880 - }, - { - "epoch": 3.5313335313335315, - "grad_norm": 0.5429478883743286, - "learning_rate": 8.811998811998812e-06, - "loss": 0.0571, - "step": 11890 - }, - { - "epoch": 3.5343035343035343, - "grad_norm": 0.6550652980804443, - "learning_rate": 8.794178794178795e-06, - "loss": 0.0569, - "step": 11900 - }, - { - "epoch": 3.5372735372735375, - "grad_norm": 0.7283503413200378, - "learning_rate": 8.776358776358776e-06, - "loss": 0.0543, - "step": 11910 - }, - { - "epoch": 3.5402435402435404, - "grad_norm": 0.3151548504829407, - "learning_rate": 8.758538758538758e-06, - "loss": 0.0566, - "step": 11920 - }, - { - "epoch": 3.543213543213543, - "grad_norm": 0.507347583770752, - "learning_rate": 8.740718740718742e-06, - "loss": 0.053, - "step": 11930 - }, - { - "epoch": 3.546183546183546, - "grad_norm": 0.7897441387176514, - "learning_rate": 8.722898722898724e-06, - "loss": 0.0688, - "step": 11940 - }, - { - "epoch": 3.5491535491535493, - "grad_norm": 0.37791678309440613, - "learning_rate": 8.705078705078705e-06, - "loss": 0.0757, - "step": 11950 - }, - { - "epoch": 3.552123552123552, - "grad_norm": 0.5913348197937012, - "learning_rate": 8.687258687258687e-06, - "loss": 0.0689, - "step": 11960 - }, - { - "epoch": 3.555093555093555, - "grad_norm": 0.7024880647659302, - "learning_rate": 8.66943866943867e-06, - "loss": 0.0648, - "step": 11970 - }, - { - "epoch": 3.5580635580635582, - "grad_norm": 0.37222567200660706, - "learning_rate": 8.651618651618651e-06, - "loss": 0.0595, - "step": 11980 - }, - { - "epoch": 3.561033561033561, - "grad_norm": 0.45320606231689453, - "learning_rate": 8.633798633798633e-06, - "loss": 0.0638, - "step": 11990 - }, - { - "epoch": 3.564003564003564, - "grad_norm": 0.46902260184288025, - "learning_rate": 8.615978615978617e-06, - "loss": 0.0776, - "step": 12000 - }, - { - "epoch": 3.5669735669735667, - "grad_norm": 0.5722383260726929, - "learning_rate": 8.5981585981586e-06, - "loss": 0.0531, - "step": 12010 - }, - { - "epoch": 3.56994356994357, - "grad_norm": 0.5090997815132141, - "learning_rate": 8.58033858033858e-06, - "loss": 0.0534, - "step": 12020 - }, - { - "epoch": 3.572913572913573, - "grad_norm": 0.4689802825450897, - "learning_rate": 8.562518562518562e-06, - "loss": 0.0717, - "step": 12030 - }, - { - "epoch": 3.5758835758835756, - "grad_norm": 0.4180223345756531, - "learning_rate": 8.544698544698546e-06, - "loss": 0.0456, - "step": 12040 - }, - { - "epoch": 3.578853578853579, - "grad_norm": 0.30135074257850647, - "learning_rate": 8.526878526878526e-06, - "loss": 0.0548, - "step": 12050 - }, - { - "epoch": 3.5818235818235817, - "grad_norm": 0.5609501600265503, - "learning_rate": 8.509058509058509e-06, - "loss": 0.0569, - "step": 12060 - }, - { - "epoch": 3.5847935847935846, - "grad_norm": 0.30133068561553955, - "learning_rate": 8.491238491238492e-06, - "loss": 0.0499, - "step": 12070 - }, - { - "epoch": 3.587763587763588, - "grad_norm": 0.4278302490711212, - "learning_rate": 8.473418473418475e-06, - "loss": 0.0556, - "step": 12080 - }, - { - "epoch": 3.5907335907335907, - "grad_norm": 0.570552408695221, - "learning_rate": 8.455598455598455e-06, - "loss": 0.0667, - "step": 12090 - }, - { - "epoch": 3.5937035937035935, - "grad_norm": 0.3945624828338623, - "learning_rate": 8.437778437778437e-06, - "loss": 0.0588, - "step": 12100 - }, - { - "epoch": 3.5966735966735968, - "grad_norm": 0.5016827583312988, - "learning_rate": 8.419958419958421e-06, - "loss": 0.0454, - "step": 12110 - }, - { - "epoch": 3.5996435996435996, - "grad_norm": 0.4540993869304657, - "learning_rate": 8.402138402138402e-06, - "loss": 0.0692, - "step": 12120 - }, - { - "epoch": 3.6026136026136024, - "grad_norm": 0.4178672730922699, - "learning_rate": 8.384318384318384e-06, - "loss": 0.054, - "step": 12130 - }, - { - "epoch": 3.6055836055836057, - "grad_norm": 0.6967900395393372, - "learning_rate": 8.366498366498368e-06, - "loss": 0.0629, - "step": 12140 - }, - { - "epoch": 3.6085536085536085, - "grad_norm": 0.5746413469314575, - "learning_rate": 8.34867834867835e-06, - "loss": 0.0635, - "step": 12150 - }, - { - "epoch": 3.6115236115236113, - "grad_norm": 0.45530760288238525, - "learning_rate": 8.33085833085833e-06, - "loss": 0.0632, - "step": 12160 - }, - { - "epoch": 3.6144936144936146, - "grad_norm": 0.5083603858947754, - "learning_rate": 8.313038313038312e-06, - "loss": 0.0522, - "step": 12170 - }, - { - "epoch": 3.6174636174636174, - "grad_norm": 0.4417908489704132, - "learning_rate": 8.295218295218296e-06, - "loss": 0.0537, - "step": 12180 - }, - { - "epoch": 3.6204336204336203, - "grad_norm": 0.5929802656173706, - "learning_rate": 8.277398277398278e-06, - "loss": 0.0628, - "step": 12190 - }, - { - "epoch": 3.6234036234036235, - "grad_norm": 0.35801371932029724, - "learning_rate": 8.259578259578259e-06, - "loss": 0.0568, - "step": 12200 - }, - { - "epoch": 3.6263736263736264, - "grad_norm": 0.42152899503707886, - "learning_rate": 8.241758241758243e-06, - "loss": 0.0575, - "step": 12210 - }, - { - "epoch": 3.629343629343629, - "grad_norm": 0.5134592652320862, - "learning_rate": 8.223938223938225e-06, - "loss": 0.0589, - "step": 12220 - }, - { - "epoch": 3.6323136323136325, - "grad_norm": 0.5800890922546387, - "learning_rate": 8.206118206118205e-06, - "loss": 0.0711, - "step": 12230 - }, - { - "epoch": 3.6352836352836353, - "grad_norm": 0.6621565222740173, - "learning_rate": 8.188298188298188e-06, - "loss": 0.0626, - "step": 12240 - }, - { - "epoch": 3.638253638253638, - "grad_norm": 0.19206875562667847, - "learning_rate": 8.170478170478171e-06, - "loss": 0.054, - "step": 12250 - }, - { - "epoch": 3.6412236412236414, - "grad_norm": 0.3461471199989319, - "learning_rate": 8.152658152658154e-06, - "loss": 0.063, - "step": 12260 - }, - { - "epoch": 3.644193644193644, - "grad_norm": 0.5503948926925659, - "learning_rate": 8.134838134838134e-06, - "loss": 0.0674, - "step": 12270 - }, - { - "epoch": 3.647163647163647, - "grad_norm": 0.3993360698223114, - "learning_rate": 8.117018117018118e-06, - "loss": 0.0523, - "step": 12280 - }, - { - "epoch": 3.6501336501336503, - "grad_norm": 0.5561977624893188, - "learning_rate": 8.0991980991981e-06, - "loss": 0.0601, - "step": 12290 - }, - { - "epoch": 3.653103653103653, - "grad_norm": 0.4218428134918213, - "learning_rate": 8.08137808137808e-06, - "loss": 0.0597, - "step": 12300 - }, - { - "epoch": 3.656073656073656, - "grad_norm": 0.6830678582191467, - "learning_rate": 8.063558063558063e-06, - "loss": 0.0624, - "step": 12310 - }, - { - "epoch": 3.6590436590436592, - "grad_norm": 0.5021694302558899, - "learning_rate": 8.045738045738047e-06, - "loss": 0.0725, - "step": 12320 - }, - { - "epoch": 3.662013662013662, - "grad_norm": 0.6278291344642639, - "learning_rate": 8.027918027918029e-06, - "loss": 0.0715, - "step": 12330 - }, - { - "epoch": 3.664983664983665, - "grad_norm": 0.7712084650993347, - "learning_rate": 8.01009801009801e-06, - "loss": 0.0609, - "step": 12340 - }, - { - "epoch": 3.667953667953668, - "grad_norm": 0.47669193148612976, - "learning_rate": 7.992277992277993e-06, - "loss": 0.0645, - "step": 12350 - }, - { - "epoch": 3.670923670923671, - "grad_norm": 0.5000527501106262, - "learning_rate": 7.974457974457975e-06, - "loss": 0.0463, - "step": 12360 - }, - { - "epoch": 3.673893673893674, - "grad_norm": 0.409820020198822, - "learning_rate": 7.956637956637956e-06, - "loss": 0.0552, - "step": 12370 - }, - { - "epoch": 3.676863676863677, - "grad_norm": 0.48183321952819824, - "learning_rate": 7.938817938817938e-06, - "loss": 0.0605, - "step": 12380 - }, - { - "epoch": 3.67983367983368, - "grad_norm": 0.5534571409225464, - "learning_rate": 7.920997920997922e-06, - "loss": 0.0638, - "step": 12390 - }, - { - "epoch": 3.6828036828036828, - "grad_norm": 0.4206744432449341, - "learning_rate": 7.903177903177904e-06, - "loss": 0.0634, - "step": 12400 - }, - { - "epoch": 3.685773685773686, - "grad_norm": 0.5539330244064331, - "learning_rate": 7.885357885357884e-06, - "loss": 0.0583, - "step": 12410 - }, - { - "epoch": 3.688743688743689, - "grad_norm": 0.32335200905799866, - "learning_rate": 7.867537867537868e-06, - "loss": 0.0604, - "step": 12420 - }, - { - "epoch": 3.6917136917136917, - "grad_norm": 0.6858915686607361, - "learning_rate": 7.84971784971785e-06, - "loss": 0.0711, - "step": 12430 - }, - { - "epoch": 3.694683694683695, - "grad_norm": 0.4419819116592407, - "learning_rate": 7.831897831897831e-06, - "loss": 0.0535, - "step": 12440 - }, - { - "epoch": 3.697653697653698, - "grad_norm": 0.5330691933631897, - "learning_rate": 7.814077814077813e-06, - "loss": 0.0604, - "step": 12450 - }, - { - "epoch": 3.7006237006237006, - "grad_norm": 0.5260715484619141, - "learning_rate": 7.796257796257797e-06, - "loss": 0.0607, - "step": 12460 - }, - { - "epoch": 3.7035937035937034, - "grad_norm": 0.7059239149093628, - "learning_rate": 7.77843777843778e-06, - "loss": 0.0683, - "step": 12470 - }, - { - "epoch": 3.7065637065637067, - "grad_norm": 0.31892430782318115, - "learning_rate": 7.76061776061776e-06, - "loss": 0.0701, - "step": 12480 - }, - { - "epoch": 3.7095337095337095, - "grad_norm": 0.4127281606197357, - "learning_rate": 7.742797742797744e-06, - "loss": 0.0682, - "step": 12490 - }, - { - "epoch": 3.7125037125037124, - "grad_norm": 0.23683589696884155, - "learning_rate": 7.724977724977726e-06, - "loss": 0.0612, - "step": 12500 - }, - { - "epoch": 3.7154737154737156, - "grad_norm": 0.47517532110214233, - "learning_rate": 7.707157707157708e-06, - "loss": 0.0462, - "step": 12510 - }, - { - "epoch": 3.7184437184437185, - "grad_norm": 0.6467389464378357, - "learning_rate": 7.689337689337688e-06, - "loss": 0.0664, - "step": 12520 - }, - { - "epoch": 3.7214137214137213, - "grad_norm": 0.6246938705444336, - "learning_rate": 7.671517671517672e-06, - "loss": 0.0638, - "step": 12530 - }, - { - "epoch": 3.724383724383724, - "grad_norm": 0.4938197433948517, - "learning_rate": 7.653697653697654e-06, - "loss": 0.0629, - "step": 12540 - }, - { - "epoch": 3.7273537273537274, - "grad_norm": 0.5381590127944946, - "learning_rate": 7.635877635877635e-06, - "loss": 0.0621, - "step": 12550 - }, - { - "epoch": 3.73032373032373, - "grad_norm": 0.2848157286643982, - "learning_rate": 7.618057618057619e-06, - "loss": 0.0641, - "step": 12560 - }, - { - "epoch": 3.733293733293733, - "grad_norm": 0.4204511046409607, - "learning_rate": 7.600237600237601e-06, - "loss": 0.0606, - "step": 12570 - }, - { - "epoch": 3.7362637362637363, - "grad_norm": 0.5741158723831177, - "learning_rate": 7.582417582417582e-06, - "loss": 0.0478, - "step": 12580 - }, - { - "epoch": 3.739233739233739, - "grad_norm": 0.3851994574069977, - "learning_rate": 7.564597564597564e-06, - "loss": 0.067, - "step": 12590 - }, - { - "epoch": 3.742203742203742, - "grad_norm": 0.35587117075920105, - "learning_rate": 7.546777546777547e-06, - "loss": 0.0665, - "step": 12600 - }, - { - "epoch": 3.7451737451737452, - "grad_norm": 0.30616384744644165, - "learning_rate": 7.528957528957529e-06, - "loss": 0.064, - "step": 12610 - }, - { - "epoch": 3.748143748143748, - "grad_norm": 0.5584198832511902, - "learning_rate": 7.511137511137511e-06, - "loss": 0.0546, - "step": 12620 - }, - { - "epoch": 3.751113751113751, - "grad_norm": 0.3456946015357971, - "learning_rate": 7.493317493317493e-06, - "loss": 0.0571, - "step": 12630 - }, - { - "epoch": 3.754083754083754, - "grad_norm": 0.5522321462631226, - "learning_rate": 7.475497475497476e-06, - "loss": 0.0809, - "step": 12640 - }, - { - "epoch": 3.757053757053757, - "grad_norm": 0.42469412088394165, - "learning_rate": 7.457677457677457e-06, - "loss": 0.0629, - "step": 12650 - }, - { - "epoch": 3.76002376002376, - "grad_norm": 0.5727609395980835, - "learning_rate": 7.4398574398574404e-06, - "loss": 0.0635, - "step": 12660 - }, - { - "epoch": 3.762993762993763, - "grad_norm": 0.3833814859390259, - "learning_rate": 7.422037422037423e-06, - "loss": 0.0667, - "step": 12670 - }, - { - "epoch": 3.765963765963766, - "grad_norm": 0.7218992114067078, - "learning_rate": 7.404217404217404e-06, - "loss": 0.0589, - "step": 12680 - }, - { - "epoch": 3.7689337689337687, - "grad_norm": 0.5727225542068481, - "learning_rate": 7.386397386397387e-06, - "loss": 0.0528, - "step": 12690 - }, - { - "epoch": 3.771903771903772, - "grad_norm": 0.38015714287757874, - "learning_rate": 7.368577368577368e-06, - "loss": 0.0538, - "step": 12700 - }, - { - "epoch": 3.774873774873775, - "grad_norm": 0.32746824622154236, - "learning_rate": 7.350757350757351e-06, - "loss": 0.0511, - "step": 12710 - }, - { - "epoch": 3.7778437778437777, - "grad_norm": 0.3238430321216583, - "learning_rate": 7.332937332937333e-06, - "loss": 0.0572, - "step": 12720 - }, - { - "epoch": 3.780813780813781, - "grad_norm": 0.3043205142021179, - "learning_rate": 7.315117315117316e-06, - "loss": 0.0543, - "step": 12730 - }, - { - "epoch": 3.7837837837837838, - "grad_norm": 0.23511236906051636, - "learning_rate": 7.297297297297298e-06, - "loss": 0.0567, - "step": 12740 - }, - { - "epoch": 3.7867537867537866, - "grad_norm": 0.44706740975379944, - "learning_rate": 7.27947727947728e-06, - "loss": 0.0598, - "step": 12750 - }, - { - "epoch": 3.78972378972379, - "grad_norm": 0.700774610042572, - "learning_rate": 7.261657261657262e-06, - "loss": 0.0671, - "step": 12760 - }, - { - "epoch": 3.7926937926937927, - "grad_norm": 0.35849860310554504, - "learning_rate": 7.2438372438372435e-06, - "loss": 0.0586, - "step": 12770 - }, - { - "epoch": 3.7956637956637955, - "grad_norm": 0.4785964787006378, - "learning_rate": 7.2260172260172265e-06, - "loss": 0.0616, - "step": 12780 - }, - { - "epoch": 3.798633798633799, - "grad_norm": 0.6433180570602417, - "learning_rate": 7.208197208197208e-06, - "loss": 0.0582, - "step": 12790 - }, - { - "epoch": 3.8016038016038016, - "grad_norm": 0.37284335494041443, - "learning_rate": 7.190377190377191e-06, - "loss": 0.0491, - "step": 12800 - }, - { - "epoch": 3.8045738045738045, - "grad_norm": 0.576884388923645, - "learning_rate": 7.172557172557173e-06, - "loss": 0.0644, - "step": 12810 - }, - { - "epoch": 3.8075438075438077, - "grad_norm": 0.5406507253646851, - "learning_rate": 7.154737154737155e-06, - "loss": 0.0621, - "step": 12820 - }, - { - "epoch": 3.8105138105138106, - "grad_norm": 0.5398954749107361, - "learning_rate": 7.136917136917137e-06, - "loss": 0.0606, - "step": 12830 - }, - { - "epoch": 3.8134838134838134, - "grad_norm": 0.3366580605506897, - "learning_rate": 7.119097119097119e-06, - "loss": 0.0543, - "step": 12840 - }, - { - "epoch": 3.8164538164538166, - "grad_norm": 0.4284731149673462, - "learning_rate": 7.101277101277102e-06, - "loss": 0.0608, - "step": 12850 - }, - { - "epoch": 3.8194238194238195, - "grad_norm": 0.6001728773117065, - "learning_rate": 7.083457083457083e-06, - "loss": 0.052, - "step": 12860 - }, - { - "epoch": 3.8223938223938223, - "grad_norm": 0.6029428243637085, - "learning_rate": 7.065637065637066e-06, - "loss": 0.0516, - "step": 12870 - }, - { - "epoch": 3.8253638253638256, - "grad_norm": 0.39351001381874084, - "learning_rate": 7.047817047817048e-06, - "loss": 0.0705, - "step": 12880 - }, - { - "epoch": 3.8283338283338284, - "grad_norm": 0.5574610829353333, - "learning_rate": 7.02999702999703e-06, - "loss": 0.0749, - "step": 12890 - }, - { - "epoch": 3.8313038313038312, - "grad_norm": 0.35019442439079285, - "learning_rate": 7.0121770121770125e-06, - "loss": 0.0612, - "step": 12900 - }, - { - "epoch": 3.8342738342738345, - "grad_norm": 0.4754871129989624, - "learning_rate": 6.994356994356995e-06, - "loss": 0.0553, - "step": 12910 - }, - { - "epoch": 3.8372438372438373, - "grad_norm": 0.42024219036102295, - "learning_rate": 6.976536976536977e-06, - "loss": 0.0655, - "step": 12920 - }, - { - "epoch": 3.84021384021384, - "grad_norm": 0.5387091636657715, - "learning_rate": 6.958716958716958e-06, - "loss": 0.0599, - "step": 12930 - }, - { - "epoch": 3.8431838431838434, - "grad_norm": 0.570585310459137, - "learning_rate": 6.940896940896941e-06, - "loss": 0.0614, - "step": 12940 - }, - { - "epoch": 3.8461538461538463, - "grad_norm": 0.456065833568573, - "learning_rate": 6.923076923076923e-06, - "loss": 0.0654, - "step": 12950 - }, - { - "epoch": 3.849123849123849, - "grad_norm": 0.7067524790763855, - "learning_rate": 6.9052569052569056e-06, - "loss": 0.0756, - "step": 12960 - }, - { - "epoch": 3.8520938520938524, - "grad_norm": 0.46172401309013367, - "learning_rate": 6.887436887436888e-06, - "loss": 0.053, - "step": 12970 - }, - { - "epoch": 3.855063855063855, - "grad_norm": 0.561151921749115, - "learning_rate": 6.86961686961687e-06, - "loss": 0.0503, - "step": 12980 - }, - { - "epoch": 3.858033858033858, - "grad_norm": 0.3379230201244354, - "learning_rate": 6.851796851796852e-06, - "loss": 0.0626, - "step": 12990 - }, - { - "epoch": 3.861003861003861, - "grad_norm": 0.6056146621704102, - "learning_rate": 6.833976833976834e-06, - "loss": 0.0628, - "step": 13000 - }, - { - "epoch": 3.863973863973864, - "grad_norm": 0.48145750164985657, - "learning_rate": 6.816156816156816e-06, - "loss": 0.061, - "step": 13010 - }, - { - "epoch": 3.866943866943867, - "grad_norm": 0.4073619246482849, - "learning_rate": 6.7983367983367986e-06, - "loss": 0.068, - "step": 13020 - }, - { - "epoch": 3.8699138699138698, - "grad_norm": 0.4736767113208771, - "learning_rate": 6.780516780516781e-06, - "loss": 0.0595, - "step": 13030 - }, - { - "epoch": 3.872883872883873, - "grad_norm": 0.4397349953651428, - "learning_rate": 6.762696762696763e-06, - "loss": 0.0676, - "step": 13040 - }, - { - "epoch": 3.875853875853876, - "grad_norm": 0.4046313166618347, - "learning_rate": 6.744876744876745e-06, - "loss": 0.0552, - "step": 13050 - }, - { - "epoch": 3.8788238788238787, - "grad_norm": 0.3561536371707916, - "learning_rate": 6.727056727056727e-06, - "loss": 0.0595, - "step": 13060 - }, - { - "epoch": 3.8817938817938815, - "grad_norm": 0.5443368554115295, - "learning_rate": 6.7092367092367094e-06, - "loss": 0.0557, - "step": 13070 - }, - { - "epoch": 3.884763884763885, - "grad_norm": 0.515012264251709, - "learning_rate": 6.691416691416692e-06, - "loss": 0.0488, - "step": 13080 - }, - { - "epoch": 3.8877338877338876, - "grad_norm": 0.37932658195495605, - "learning_rate": 6.673596673596674e-06, - "loss": 0.0629, - "step": 13090 - }, - { - "epoch": 3.8907038907038904, - "grad_norm": 0.4500630795955658, - "learning_rate": 6.655776655776656e-06, - "loss": 0.0589, - "step": 13100 - }, - { - "epoch": 3.8936738936738937, - "grad_norm": 0.759432852268219, - "learning_rate": 6.637956637956638e-06, - "loss": 0.0547, - "step": 13110 - }, - { - "epoch": 3.8966438966438965, - "grad_norm": 0.6136061549186707, - "learning_rate": 6.62013662013662e-06, - "loss": 0.0572, - "step": 13120 - }, - { - "epoch": 3.8996138996138994, - "grad_norm": 0.5580719709396362, - "learning_rate": 6.6023166023166025e-06, - "loss": 0.0573, - "step": 13130 - }, - { - "epoch": 3.9025839025839026, - "grad_norm": 0.5806880593299866, - "learning_rate": 6.584496584496585e-06, - "loss": 0.0624, - "step": 13140 - }, - { - "epoch": 3.9055539055539055, - "grad_norm": 0.7236099243164062, - "learning_rate": 6.566676566676567e-06, - "loss": 0.069, - "step": 13150 - }, - { - "epoch": 3.9085239085239083, - "grad_norm": 0.4525713622570038, - "learning_rate": 6.548856548856549e-06, - "loss": 0.065, - "step": 13160 - }, - { - "epoch": 3.9114939114939116, - "grad_norm": 0.5687326788902283, - "learning_rate": 6.531036531036531e-06, - "loss": 0.0592, - "step": 13170 - }, - { - "epoch": 3.9144639144639144, - "grad_norm": 0.4574839770793915, - "learning_rate": 6.513216513216513e-06, - "loss": 0.0691, - "step": 13180 - }, - { - "epoch": 3.9174339174339172, - "grad_norm": 0.4538971483707428, - "learning_rate": 6.4953964953964955e-06, - "loss": 0.0562, - "step": 13190 - }, - { - "epoch": 3.9204039204039205, - "grad_norm": 0.32448068261146545, - "learning_rate": 6.477576477576478e-06, - "loss": 0.0555, - "step": 13200 - }, - { - "epoch": 3.9233739233739233, - "grad_norm": 0.5266978144645691, - "learning_rate": 6.45975645975646e-06, - "loss": 0.0532, - "step": 13210 - }, - { - "epoch": 3.926343926343926, - "grad_norm": 0.48830193281173706, - "learning_rate": 6.441936441936442e-06, - "loss": 0.0609, - "step": 13220 - }, - { - "epoch": 3.9293139293139294, - "grad_norm": 0.48386427760124207, - "learning_rate": 6.424116424116425e-06, - "loss": 0.0595, - "step": 13230 - }, - { - "epoch": 3.9322839322839322, - "grad_norm": 0.33438950777053833, - "learning_rate": 6.406296406296406e-06, - "loss": 0.0638, - "step": 13240 - }, - { - "epoch": 3.935253935253935, - "grad_norm": 0.5018361806869507, - "learning_rate": 6.3884763884763885e-06, - "loss": 0.0589, - "step": 13250 - }, - { - "epoch": 3.9382239382239383, - "grad_norm": 0.5178138613700867, - "learning_rate": 6.370656370656371e-06, - "loss": 0.0619, - "step": 13260 - }, - { - "epoch": 3.941193941193941, - "grad_norm": 0.4681033194065094, - "learning_rate": 6.352836352836353e-06, - "loss": 0.0577, - "step": 13270 - }, - { - "epoch": 3.944163944163944, - "grad_norm": 0.6118980050086975, - "learning_rate": 6.335016335016335e-06, - "loss": 0.0563, - "step": 13280 - }, - { - "epoch": 3.9471339471339473, - "grad_norm": 0.4309462010860443, - "learning_rate": 6.317196317196317e-06, - "loss": 0.0465, - "step": 13290 - }, - { - "epoch": 3.95010395010395, - "grad_norm": 0.5277587175369263, - "learning_rate": 6.2993762993763e-06, - "loss": 0.0706, - "step": 13300 - }, - { - "epoch": 3.953073953073953, - "grad_norm": 0.5027768611907959, - "learning_rate": 6.2815562815562815e-06, - "loss": 0.0491, - "step": 13310 - }, - { - "epoch": 3.956043956043956, - "grad_norm": 0.560326874256134, - "learning_rate": 6.2637362637362645e-06, - "loss": 0.0487, - "step": 13320 - }, - { - "epoch": 3.959013959013959, - "grad_norm": 0.5669682621955872, - "learning_rate": 6.245916245916246e-06, - "loss": 0.0491, - "step": 13330 - }, - { - "epoch": 3.961983961983962, - "grad_norm": 0.49655118584632874, - "learning_rate": 6.228096228096228e-06, - "loss": 0.0497, - "step": 13340 - }, - { - "epoch": 3.964953964953965, - "grad_norm": 0.8173234462738037, - "learning_rate": 6.21027621027621e-06, - "loss": 0.071, - "step": 13350 - }, - { - "epoch": 3.967923967923968, - "grad_norm": 0.5078877210617065, - "learning_rate": 6.192456192456192e-06, - "loss": 0.0482, - "step": 13360 - }, - { - "epoch": 3.970893970893971, - "grad_norm": 0.4183073937892914, - "learning_rate": 6.174636174636175e-06, - "loss": 0.0537, - "step": 13370 - }, - { - "epoch": 3.973863973863974, - "grad_norm": 0.5460306406021118, - "learning_rate": 6.156816156816157e-06, - "loss": 0.0707, - "step": 13380 - }, - { - "epoch": 3.976833976833977, - "grad_norm": 0.8355798125267029, - "learning_rate": 6.13899613899614e-06, - "loss": 0.0663, - "step": 13390 - }, - { - "epoch": 3.9798039798039797, - "grad_norm": 0.5097036361694336, - "learning_rate": 6.121176121176121e-06, - "loss": 0.0652, - "step": 13400 - }, - { - "epoch": 3.982773982773983, - "grad_norm": 0.5116889476776123, - "learning_rate": 6.103356103356103e-06, - "loss": 0.0584, - "step": 13410 - }, - { - "epoch": 3.985743985743986, - "grad_norm": 0.4749346971511841, - "learning_rate": 6.085536085536085e-06, - "loss": 0.0598, - "step": 13420 - }, - { - "epoch": 3.9887139887139886, - "grad_norm": 0.3732450306415558, - "learning_rate": 6.0677160677160676e-06, - "loss": 0.0623, - "step": 13430 - }, - { - "epoch": 3.991683991683992, - "grad_norm": 0.37360072135925293, - "learning_rate": 6.049896049896051e-06, - "loss": 0.0577, - "step": 13440 - }, - { - "epoch": 3.9946539946539947, - "grad_norm": 0.5997447967529297, - "learning_rate": 6.032076032076032e-06, - "loss": 0.0634, - "step": 13450 - }, - { - "epoch": 3.9976239976239976, - "grad_norm": 0.4489789605140686, - "learning_rate": 6.014256014256015e-06, - "loss": 0.059, - "step": 13460 - }, - { - "epoch": 4.0, - "eval_f1": 0.49727767695099817, - "eval_loss": 0.054960619658231735, - "eval_runtime": 178.7066, - "eval_samples_per_second": 212.745, - "eval_steps_per_second": 3.329, - "step": 13468 - }, - { - "epoch": 4.000594000594001, - "grad_norm": 0.8586022257804871, - "learning_rate": 5.996435996435996e-06, - "loss": 0.0538, - "step": 13470 - }, - { - "epoch": 4.003564003564003, - "grad_norm": 0.5165749192237854, - "learning_rate": 5.978615978615979e-06, - "loss": 0.0637, - "step": 13480 - }, - { - "epoch": 4.0065340065340065, - "grad_norm": 0.6126316785812378, - "learning_rate": 5.960795960795961e-06, - "loss": 0.0514, - "step": 13490 - }, - { - "epoch": 4.00950400950401, - "grad_norm": 0.572414219379425, - "learning_rate": 5.942975942975943e-06, - "loss": 0.0592, - "step": 13500 - }, - { - "epoch": 4.012474012474012, - "grad_norm": 0.615561842918396, - "learning_rate": 5.925155925155926e-06, - "loss": 0.0551, - "step": 13510 - }, - { - "epoch": 4.015444015444015, - "grad_norm": 0.5001277923583984, - "learning_rate": 5.907335907335907e-06, - "loss": 0.0586, - "step": 13520 - }, - { - "epoch": 4.018414018414019, - "grad_norm": 0.4393932521343231, - "learning_rate": 5.88951588951589e-06, - "loss": 0.0567, - "step": 13530 - }, - { - "epoch": 4.021384021384021, - "grad_norm": 0.3287888467311859, - "learning_rate": 5.8716958716958714e-06, - "loss": 0.0525, - "step": 13540 - }, - { - "epoch": 4.024354024354024, - "grad_norm": 0.6189230680465698, - "learning_rate": 5.8538758538758545e-06, - "loss": 0.061, - "step": 13550 - }, - { - "epoch": 4.027324027324028, - "grad_norm": 0.8372416496276855, - "learning_rate": 5.836055836055836e-06, - "loss": 0.0665, - "step": 13560 - }, - { - "epoch": 4.03029403029403, - "grad_norm": 0.4526776373386383, - "learning_rate": 5.818235818235818e-06, - "loss": 0.0617, - "step": 13570 - }, - { - "epoch": 4.033264033264033, - "grad_norm": 0.6300592422485352, - "learning_rate": 5.800415800415801e-06, - "loss": 0.0684, - "step": 13580 - }, - { - "epoch": 4.0362340362340365, - "grad_norm": 0.7106212973594666, - "learning_rate": 5.782595782595782e-06, - "loss": 0.0635, - "step": 13590 - }, - { - "epoch": 4.039204039204039, - "grad_norm": 0.47979801893234253, - "learning_rate": 5.764775764775765e-06, - "loss": 0.0543, - "step": 13600 - }, - { - "epoch": 4.042174042174042, - "grad_norm": 0.704913854598999, - "learning_rate": 5.746955746955747e-06, - "loss": 0.0495, - "step": 13610 - }, - { - "epoch": 4.0451440451440455, - "grad_norm": 0.5323979258537292, - "learning_rate": 5.72913572913573e-06, - "loss": 0.0549, - "step": 13620 - }, - { - "epoch": 4.048114048114048, - "grad_norm": 0.6989266276359558, - "learning_rate": 5.711315711315711e-06, - "loss": 0.07, - "step": 13630 - }, - { - "epoch": 4.051084051084051, - "grad_norm": 0.6013164520263672, - "learning_rate": 5.693495693495694e-06, - "loss": 0.0565, - "step": 13640 - }, - { - "epoch": 4.054054054054054, - "grad_norm": 0.43801942467689514, - "learning_rate": 5.675675675675676e-06, - "loss": 0.0574, - "step": 13650 - }, - { - "epoch": 4.057024057024057, - "grad_norm": 0.6650937795639038, - "learning_rate": 5.6578556578556575e-06, - "loss": 0.0637, - "step": 13660 - }, - { - "epoch": 4.05999405999406, - "grad_norm": 0.4909881055355072, - "learning_rate": 5.6400356400356405e-06, - "loss": 0.0612, - "step": 13670 - }, - { - "epoch": 4.062964062964063, - "grad_norm": 0.3323568105697632, - "learning_rate": 5.622215622215622e-06, - "loss": 0.0584, - "step": 13680 - }, - { - "epoch": 4.065934065934066, - "grad_norm": 0.6184719800949097, - "learning_rate": 5.604395604395605e-06, - "loss": 0.0504, - "step": 13690 - }, - { - "epoch": 4.068904068904069, - "grad_norm": 0.5047394037246704, - "learning_rate": 5.586575586575586e-06, - "loss": 0.0531, - "step": 13700 - }, - { - "epoch": 4.071874071874072, - "grad_norm": 0.6481796503067017, - "learning_rate": 5.568755568755569e-06, - "loss": 0.0613, - "step": 13710 - }, - { - "epoch": 4.074844074844075, - "grad_norm": 0.7215176224708557, - "learning_rate": 5.550935550935551e-06, - "loss": 0.0633, - "step": 13720 - }, - { - "epoch": 4.077814077814078, - "grad_norm": 0.16339148581027985, - "learning_rate": 5.533115533115533e-06, - "loss": 0.058, - "step": 13730 - }, - { - "epoch": 4.080784080784081, - "grad_norm": 0.5942262411117554, - "learning_rate": 5.515295515295516e-06, - "loss": 0.0733, - "step": 13740 - }, - { - "epoch": 4.0837540837540836, - "grad_norm": 0.4894910454750061, - "learning_rate": 5.497475497475497e-06, - "loss": 0.0645, - "step": 13750 - }, - { - "epoch": 4.086724086724087, - "grad_norm": 0.44156116247177124, - "learning_rate": 5.47965547965548e-06, - "loss": 0.0555, - "step": 13760 - }, - { - "epoch": 4.08969408969409, - "grad_norm": 0.45034366846084595, - "learning_rate": 5.461835461835461e-06, - "loss": 0.0648, - "step": 13770 - }, - { - "epoch": 4.0926640926640925, - "grad_norm": 0.43881091475486755, - "learning_rate": 5.444015444015444e-06, - "loss": 0.059, - "step": 13780 - }, - { - "epoch": 4.095634095634096, - "grad_norm": 0.6181434988975525, - "learning_rate": 5.4261954261954265e-06, - "loss": 0.0698, - "step": 13790 - }, - { - "epoch": 4.098604098604099, - "grad_norm": 0.39531105756759644, - "learning_rate": 5.408375408375409e-06, - "loss": 0.0563, - "step": 13800 - }, - { - "epoch": 4.101574101574101, - "grad_norm": 0.44663333892822266, - "learning_rate": 5.390555390555391e-06, - "loss": 0.0529, - "step": 13810 - }, - { - "epoch": 4.104544104544105, - "grad_norm": 0.591187059879303, - "learning_rate": 5.372735372735372e-06, - "loss": 0.0524, - "step": 13820 - }, - { - "epoch": 4.107514107514108, - "grad_norm": 0.5794005990028381, - "learning_rate": 5.354915354915355e-06, - "loss": 0.0529, - "step": 13830 - }, - { - "epoch": 4.11048411048411, - "grad_norm": 0.312919944524765, - "learning_rate": 5.337095337095337e-06, - "loss": 0.0651, - "step": 13840 - }, - { - "epoch": 4.113454113454114, - "grad_norm": 0.5957525968551636, - "learning_rate": 5.3192753192753196e-06, - "loss": 0.0523, - "step": 13850 - }, - { - "epoch": 4.116424116424117, - "grad_norm": 0.6151428818702698, - "learning_rate": 5.301455301455302e-06, - "loss": 0.0581, - "step": 13860 - }, - { - "epoch": 4.119394119394119, - "grad_norm": 0.4753796458244324, - "learning_rate": 5.283635283635284e-06, - "loss": 0.0514, - "step": 13870 - }, - { - "epoch": 4.1223641223641225, - "grad_norm": 0.45062291622161865, - "learning_rate": 5.265815265815266e-06, - "loss": 0.0467, - "step": 13880 - }, - { - "epoch": 4.125334125334125, - "grad_norm": 0.4602527320384979, - "learning_rate": 5.247995247995247e-06, - "loss": 0.0583, - "step": 13890 - }, - { - "epoch": 4.128304128304128, - "grad_norm": 0.543065071105957, - "learning_rate": 5.2301752301752304e-06, - "loss": 0.0493, - "step": 13900 - }, - { - "epoch": 4.1312741312741315, - "grad_norm": 0.40139061212539673, - "learning_rate": 5.212355212355213e-06, - "loss": 0.0543, - "step": 13910 - }, - { - "epoch": 4.134244134244134, - "grad_norm": 0.40932586789131165, - "learning_rate": 5.194535194535195e-06, - "loss": 0.0491, - "step": 13920 - }, - { - "epoch": 4.137214137214137, - "grad_norm": 0.7136752605438232, - "learning_rate": 5.176715176715177e-06, - "loss": 0.0665, - "step": 13930 - }, - { - "epoch": 4.14018414018414, - "grad_norm": 0.81020188331604, - "learning_rate": 5.158895158895159e-06, - "loss": 0.0516, - "step": 13940 - }, - { - "epoch": 4.143154143154143, - "grad_norm": 0.3689301311969757, - "learning_rate": 5.141075141075141e-06, - "loss": 0.0617, - "step": 13950 - }, - { - "epoch": 4.146124146124146, - "grad_norm": 0.296916663646698, - "learning_rate": 5.1232551232551234e-06, - "loss": 0.0507, - "step": 13960 - }, - { - "epoch": 4.149094149094149, - "grad_norm": 0.45669737458229065, - "learning_rate": 5.105435105435106e-06, - "loss": 0.0589, - "step": 13970 - }, - { - "epoch": 4.152064152064152, - "grad_norm": 0.7257834076881409, - "learning_rate": 5.087615087615088e-06, - "loss": 0.0762, - "step": 13980 - }, - { - "epoch": 4.155034155034155, - "grad_norm": 0.4654732942581177, - "learning_rate": 5.06979506979507e-06, - "loss": 0.0503, - "step": 13990 - }, - { - "epoch": 4.158004158004158, - "grad_norm": 0.4994029700756073, - "learning_rate": 5.051975051975052e-06, - "loss": 0.0535, - "step": 14000 - }, - { - "epoch": 4.160974160974161, - "grad_norm": 0.47293511033058167, - "learning_rate": 5.034155034155034e-06, - "loss": 0.0564, - "step": 14010 - }, - { - "epoch": 4.163944163944164, - "grad_norm": 0.3141496777534485, - "learning_rate": 5.0163350163350165e-06, - "loss": 0.0597, - "step": 14020 - }, - { - "epoch": 4.166914166914167, - "grad_norm": 0.2851223051548004, - "learning_rate": 4.998514998514999e-06, - "loss": 0.0549, - "step": 14030 - }, - { - "epoch": 4.1698841698841695, - "grad_norm": 0.9652001261711121, - "learning_rate": 4.980694980694981e-06, - "loss": 0.0605, - "step": 14040 - }, - { - "epoch": 4.172854172854173, - "grad_norm": 0.6175165772438049, - "learning_rate": 4.962874962874963e-06, - "loss": 0.0574, - "step": 14050 - }, - { - "epoch": 4.175824175824176, - "grad_norm": 0.39955687522888184, - "learning_rate": 4.945054945054945e-06, - "loss": 0.0733, - "step": 14060 - }, - { - "epoch": 4.1787941787941785, - "grad_norm": 0.5539454817771912, - "learning_rate": 4.927234927234927e-06, - "loss": 0.0587, - "step": 14070 - }, - { - "epoch": 4.181764181764182, - "grad_norm": 0.574409008026123, - "learning_rate": 4.9094149094149095e-06, - "loss": 0.0642, - "step": 14080 - }, - { - "epoch": 4.184734184734185, - "grad_norm": 0.4297143816947937, - "learning_rate": 4.891594891594892e-06, - "loss": 0.0567, - "step": 14090 - }, - { - "epoch": 4.187704187704187, - "grad_norm": 0.49302181601524353, - "learning_rate": 4.873774873774874e-06, - "loss": 0.0562, - "step": 14100 - }, - { - "epoch": 4.190674190674191, - "grad_norm": 0.8171068429946899, - "learning_rate": 4.855954855954856e-06, - "loss": 0.0656, - "step": 14110 - }, - { - "epoch": 4.193644193644194, - "grad_norm": 0.6117607951164246, - "learning_rate": 4.838134838134839e-06, - "loss": 0.0596, - "step": 14120 - }, - { - "epoch": 4.196614196614196, - "grad_norm": 0.33238255977630615, - "learning_rate": 4.82031482031482e-06, - "loss": 0.0493, - "step": 14130 - }, - { - "epoch": 4.1995841995842, - "grad_norm": 0.3627205789089203, - "learning_rate": 4.8024948024948025e-06, - "loss": 0.0579, - "step": 14140 - }, - { - "epoch": 4.202554202554203, - "grad_norm": 0.6033427119255066, - "learning_rate": 4.784674784674785e-06, - "loss": 0.0568, - "step": 14150 - }, - { - "epoch": 4.205524205524205, - "grad_norm": 0.5274185538291931, - "learning_rate": 4.766854766854767e-06, - "loss": 0.062, - "step": 14160 - }, - { - "epoch": 4.2084942084942085, - "grad_norm": 0.4550093114376068, - "learning_rate": 4.749034749034749e-06, - "loss": 0.0747, - "step": 14170 - }, - { - "epoch": 4.211464211464212, - "grad_norm": 0.382213294506073, - "learning_rate": 4.731214731214731e-06, - "loss": 0.0457, - "step": 14180 - }, - { - "epoch": 4.214434214434214, - "grad_norm": 0.5736550092697144, - "learning_rate": 4.713394713394714e-06, - "loss": 0.0614, - "step": 14190 - }, - { - "epoch": 4.2174042174042174, - "grad_norm": 0.5673187971115112, - "learning_rate": 4.6955746955746955e-06, - "loss": 0.0582, - "step": 14200 - }, - { - "epoch": 4.220374220374221, - "grad_norm": 0.6587729454040527, - "learning_rate": 4.677754677754678e-06, - "loss": 0.0681, - "step": 14210 - }, - { - "epoch": 4.223344223344223, - "grad_norm": 0.6249194741249084, - "learning_rate": 4.65993465993466e-06, - "loss": 0.0662, - "step": 14220 - }, - { - "epoch": 4.226314226314226, - "grad_norm": 0.6569053530693054, - "learning_rate": 4.642114642114642e-06, - "loss": 0.0634, - "step": 14230 - }, - { - "epoch": 4.22928422928423, - "grad_norm": 0.6076725125312805, - "learning_rate": 4.624294624294624e-06, - "loss": 0.0641, - "step": 14240 - }, - { - "epoch": 4.232254232254232, - "grad_norm": 0.4433649182319641, - "learning_rate": 4.606474606474606e-06, - "loss": 0.0471, - "step": 14250 - }, - { - "epoch": 4.235224235224235, - "grad_norm": 0.34535735845565796, - "learning_rate": 4.588654588654589e-06, - "loss": 0.0619, - "step": 14260 - }, - { - "epoch": 4.238194238194239, - "grad_norm": 0.3933964967727661, - "learning_rate": 4.570834570834571e-06, - "loss": 0.0588, - "step": 14270 - }, - { - "epoch": 4.241164241164241, - "grad_norm": 0.577758252620697, - "learning_rate": 4.553014553014554e-06, - "loss": 0.0625, - "step": 14280 - }, - { - "epoch": 4.244134244134244, - "grad_norm": 0.4267483353614807, - "learning_rate": 4.535194535194535e-06, - "loss": 0.0456, - "step": 14290 - }, - { - "epoch": 4.2471042471042475, - "grad_norm": 0.42397600412368774, - "learning_rate": 4.517374517374517e-06, - "loss": 0.056, - "step": 14300 - }, - { - "epoch": 4.25007425007425, - "grad_norm": 0.3087056279182434, - "learning_rate": 4.499554499554499e-06, - "loss": 0.0564, - "step": 14310 - }, - { - "epoch": 4.253044253044253, - "grad_norm": 0.3736560046672821, - "learning_rate": 4.481734481734482e-06, - "loss": 0.0647, - "step": 14320 - }, - { - "epoch": 4.256014256014256, - "grad_norm": 0.37401074171066284, - "learning_rate": 4.463914463914465e-06, - "loss": 0.0526, - "step": 14330 - }, - { - "epoch": 4.258984258984259, - "grad_norm": 0.6431254744529724, - "learning_rate": 4.446094446094446e-06, - "loss": 0.072, - "step": 14340 - }, - { - "epoch": 4.261954261954262, - "grad_norm": 0.3994961380958557, - "learning_rate": 4.428274428274429e-06, - "loss": 0.0539, - "step": 14350 - }, - { - "epoch": 4.2649242649242645, - "grad_norm": 0.5059460997581482, - "learning_rate": 4.41045441045441e-06, - "loss": 0.0557, - "step": 14360 - }, - { - "epoch": 4.267894267894268, - "grad_norm": 0.201277494430542, - "learning_rate": 4.392634392634393e-06, - "loss": 0.0557, - "step": 14370 - }, - { - "epoch": 4.270864270864271, - "grad_norm": 0.22198070585727692, - "learning_rate": 4.374814374814375e-06, - "loss": 0.0526, - "step": 14380 - }, - { - "epoch": 4.273834273834273, - "grad_norm": 0.2608140707015991, - "learning_rate": 4.356994356994357e-06, - "loss": 0.0518, - "step": 14390 - }, - { - "epoch": 4.276804276804277, - "grad_norm": 0.3986319601535797, - "learning_rate": 4.33917433917434e-06, - "loss": 0.0596, - "step": 14400 - }, - { - "epoch": 4.27977427977428, - "grad_norm": 0.5883366465568542, - "learning_rate": 4.321354321354321e-06, - "loss": 0.058, - "step": 14410 - }, - { - "epoch": 4.282744282744282, - "grad_norm": 0.7146860361099243, - "learning_rate": 4.303534303534304e-06, - "loss": 0.0638, - "step": 14420 - }, - { - "epoch": 4.285714285714286, - "grad_norm": 0.630452036857605, - "learning_rate": 4.2857142857142855e-06, - "loss": 0.0519, - "step": 14430 - }, - { - "epoch": 4.288684288684289, - "grad_norm": 0.7049713730812073, - "learning_rate": 4.2678942678942685e-06, - "loss": 0.0559, - "step": 14440 - }, - { - "epoch": 4.291654291654291, - "grad_norm": 0.31321823596954346, - "learning_rate": 4.25007425007425e-06, - "loss": 0.054, - "step": 14450 - }, - { - "epoch": 4.2946242946242945, - "grad_norm": 0.8444371223449707, - "learning_rate": 4.232254232254232e-06, - "loss": 0.0589, - "step": 14460 - }, - { - "epoch": 4.297594297594298, - "grad_norm": 0.5905739665031433, - "learning_rate": 4.214434214434215e-06, - "loss": 0.0633, - "step": 14470 - }, - { - "epoch": 4.3005643005643, - "grad_norm": 0.4641624093055725, - "learning_rate": 4.196614196614196e-06, - "loss": 0.0611, - "step": 14480 - }, - { - "epoch": 4.303534303534303, - "grad_norm": 0.5575865507125854, - "learning_rate": 4.178794178794179e-06, - "loss": 0.0576, - "step": 14490 - }, - { - "epoch": 4.306504306504307, - "grad_norm": 0.7232492566108704, - "learning_rate": 4.160974160974161e-06, - "loss": 0.0575, - "step": 14500 - }, - { - "epoch": 4.309474309474309, - "grad_norm": 0.5242018103599548, - "learning_rate": 4.143154143154144e-06, - "loss": 0.0692, - "step": 14510 - }, - { - "epoch": 4.312444312444312, - "grad_norm": 0.622914731502533, - "learning_rate": 4.125334125334125e-06, - "loss": 0.0688, - "step": 14520 - }, - { - "epoch": 4.315414315414316, - "grad_norm": 0.5062875151634216, - "learning_rate": 4.107514107514108e-06, - "loss": 0.0542, - "step": 14530 - }, - { - "epoch": 4.318384318384318, - "grad_norm": 0.5135970711708069, - "learning_rate": 4.08969408969409e-06, - "loss": 0.0525, - "step": 14540 - }, - { - "epoch": 4.321354321354321, - "grad_norm": 0.2701030969619751, - "learning_rate": 4.0718740718740715e-06, - "loss": 0.0525, - "step": 14550 - }, - { - "epoch": 4.324324324324325, - "grad_norm": 0.7602173089981079, - "learning_rate": 4.0540540540540545e-06, - "loss": 0.053, - "step": 14560 - }, - { - "epoch": 4.327294327294327, - "grad_norm": 0.7320886254310608, - "learning_rate": 4.036234036234036e-06, - "loss": 0.0576, - "step": 14570 - }, - { - "epoch": 4.33026433026433, - "grad_norm": 0.422878623008728, - "learning_rate": 4.018414018414019e-06, - "loss": 0.0578, - "step": 14580 - }, - { - "epoch": 4.3332343332343335, - "grad_norm": 0.449724018573761, - "learning_rate": 4.000594000594e-06, - "loss": 0.0664, - "step": 14590 - }, - { - "epoch": 4.336204336204336, - "grad_norm": 0.22872653603553772, - "learning_rate": 3.982773982773983e-06, - "loss": 0.0529, - "step": 14600 - }, - { - "epoch": 4.339174339174339, - "grad_norm": 0.4547821581363678, - "learning_rate": 3.964953964953965e-06, - "loss": 0.0541, - "step": 14610 - }, - { - "epoch": 4.342144342144342, - "grad_norm": 0.5161837339401245, - "learning_rate": 3.947133947133947e-06, - "loss": 0.0439, - "step": 14620 - }, - { - "epoch": 4.345114345114345, - "grad_norm": 0.6731418371200562, - "learning_rate": 3.92931392931393e-06, - "loss": 0.0554, - "step": 14630 - }, - { - "epoch": 4.348084348084348, - "grad_norm": 0.46018585562705994, - "learning_rate": 3.911493911493911e-06, - "loss": 0.0632, - "step": 14640 - }, - { - "epoch": 4.351054351054351, - "grad_norm": 0.3375426232814789, - "learning_rate": 3.893673893673894e-06, - "loss": 0.065, - "step": 14650 - }, - { - "epoch": 4.354024354024354, - "grad_norm": 0.5720539093017578, - "learning_rate": 3.875853875853875e-06, - "loss": 0.0595, - "step": 14660 - }, - { - "epoch": 4.356994356994357, - "grad_norm": 0.542365312576294, - "learning_rate": 3.858033858033858e-06, - "loss": 0.0493, - "step": 14670 - }, - { - "epoch": 4.35996435996436, - "grad_norm": 0.6491771340370178, - "learning_rate": 3.8402138402138406e-06, - "loss": 0.0588, - "step": 14680 - }, - { - "epoch": 4.362934362934363, - "grad_norm": 0.7092576622962952, - "learning_rate": 3.822393822393823e-06, - "loss": 0.0637, - "step": 14690 - }, - { - "epoch": 4.365904365904366, - "grad_norm": 0.5155068635940552, - "learning_rate": 3.804573804573805e-06, - "loss": 0.0502, - "step": 14700 - }, - { - "epoch": 4.368874368874369, - "grad_norm": 0.31838563084602356, - "learning_rate": 3.7867537867537867e-06, - "loss": 0.0575, - "step": 14710 - }, - { - "epoch": 4.371844371844372, - "grad_norm": 0.7911087274551392, - "learning_rate": 3.7689337689337693e-06, - "loss": 0.0628, - "step": 14720 - }, - { - "epoch": 4.374814374814375, - "grad_norm": 0.26239511370658875, - "learning_rate": 3.751113751113751e-06, - "loss": 0.0452, - "step": 14730 - }, - { - "epoch": 4.377784377784378, - "grad_norm": 0.5743318796157837, - "learning_rate": 3.733293733293733e-06, - "loss": 0.0618, - "step": 14740 - }, - { - "epoch": 4.3807543807543805, - "grad_norm": 0.520468533039093, - "learning_rate": 3.7154737154737153e-06, - "loss": 0.0578, - "step": 14750 - }, - { - "epoch": 4.383724383724384, - "grad_norm": 0.30406662821769714, - "learning_rate": 3.6976536976536975e-06, - "loss": 0.0479, - "step": 14760 - }, - { - "epoch": 4.386694386694387, - "grad_norm": 0.363372266292572, - "learning_rate": 3.67983367983368e-06, - "loss": 0.0523, - "step": 14770 - }, - { - "epoch": 4.389664389664389, - "grad_norm": 0.6119177341461182, - "learning_rate": 3.6620136620136623e-06, - "loss": 0.0593, - "step": 14780 - }, - { - "epoch": 4.392634392634393, - "grad_norm": 0.7049959897994995, - "learning_rate": 3.6441936441936444e-06, - "loss": 0.0554, - "step": 14790 - }, - { - "epoch": 4.395604395604396, - "grad_norm": 0.6827198266983032, - "learning_rate": 3.6263736263736266e-06, - "loss": 0.0536, - "step": 14800 - }, - { - "epoch": 4.398574398574398, - "grad_norm": 0.4505496025085449, - "learning_rate": 3.6085536085536088e-06, - "loss": 0.044, - "step": 14810 - }, - { - "epoch": 4.401544401544402, - "grad_norm": 0.36443957686424255, - "learning_rate": 3.5907335907335905e-06, - "loss": 0.065, - "step": 14820 - }, - { - "epoch": 4.404514404514405, - "grad_norm": 0.4884301424026489, - "learning_rate": 3.5729135729135727e-06, - "loss": 0.0476, - "step": 14830 - }, - { - "epoch": 4.407484407484407, - "grad_norm": 0.504188597202301, - "learning_rate": 3.5550935550935553e-06, - "loss": 0.0563, - "step": 14840 - }, - { - "epoch": 4.410454410454411, - "grad_norm": 0.19332559406757355, - "learning_rate": 3.5372735372735375e-06, - "loss": 0.0727, - "step": 14850 - }, - { - "epoch": 4.413424413424414, - "grad_norm": 0.33928439021110535, - "learning_rate": 3.5194535194535196e-06, - "loss": 0.0538, - "step": 14860 - }, - { - "epoch": 4.416394416394416, - "grad_norm": 0.6077583432197571, - "learning_rate": 3.501633501633502e-06, - "loss": 0.0583, - "step": 14870 - }, - { - "epoch": 4.4193644193644195, - "grad_norm": 0.5217536091804504, - "learning_rate": 3.483813483813484e-06, - "loss": 0.0515, - "step": 14880 - }, - { - "epoch": 4.422334422334423, - "grad_norm": 0.7069948315620422, - "learning_rate": 3.465993465993466e-06, - "loss": 0.0787, - "step": 14890 - }, - { - "epoch": 4.425304425304425, - "grad_norm": 0.5601736307144165, - "learning_rate": 3.448173448173448e-06, - "loss": 0.0603, - "step": 14900 - }, - { - "epoch": 4.428274428274428, - "grad_norm": 0.687710702419281, - "learning_rate": 3.4303534303534305e-06, - "loss": 0.0743, - "step": 14910 - }, - { - "epoch": 4.431244431244432, - "grad_norm": 0.4097294807434082, - "learning_rate": 3.4125334125334127e-06, - "loss": 0.053, - "step": 14920 - }, - { - "epoch": 4.434214434214434, - "grad_norm": 0.42233291268348694, - "learning_rate": 3.394713394713395e-06, - "loss": 0.0547, - "step": 14930 - }, - { - "epoch": 4.437184437184437, - "grad_norm": 0.44154420495033264, - "learning_rate": 3.376893376893377e-06, - "loss": 0.0619, - "step": 14940 - }, - { - "epoch": 4.440154440154441, - "grad_norm": 0.8544827699661255, - "learning_rate": 3.359073359073359e-06, - "loss": 0.0567, - "step": 14950 - }, - { - "epoch": 4.443124443124443, - "grad_norm": 0.5352084636688232, - "learning_rate": 3.3412533412533413e-06, - "loss": 0.0544, - "step": 14960 - }, - { - "epoch": 4.446094446094446, - "grad_norm": 0.3393558859825134, - "learning_rate": 3.3234333234333235e-06, - "loss": 0.048, - "step": 14970 - }, - { - "epoch": 4.4490644490644495, - "grad_norm": 0.4206056296825409, - "learning_rate": 3.3056133056133057e-06, - "loss": 0.0599, - "step": 14980 - }, - { - "epoch": 4.452034452034452, - "grad_norm": 0.4742394983768463, - "learning_rate": 3.287793287793288e-06, - "loss": 0.067, - "step": 14990 - }, - { - "epoch": 4.455004455004455, - "grad_norm": 0.5058844685554504, - "learning_rate": 3.26997326997327e-06, - "loss": 0.0625, - "step": 15000 - }, - { - "epoch": 4.457974457974458, - "grad_norm": 0.39022761583328247, - "learning_rate": 3.252153252153252e-06, - "loss": 0.0634, - "step": 15010 - }, - { - "epoch": 4.460944460944461, - "grad_norm": 0.46778586506843567, - "learning_rate": 3.2343332343332344e-06, - "loss": 0.0609, - "step": 15020 - }, - { - "epoch": 4.463914463914464, - "grad_norm": 0.7826917767524719, - "learning_rate": 3.2165132165132165e-06, - "loss": 0.0652, - "step": 15030 - }, - { - "epoch": 4.4668844668844665, - "grad_norm": 0.3851190507411957, - "learning_rate": 3.1986931986931987e-06, - "loss": 0.0706, - "step": 15040 - }, - { - "epoch": 4.46985446985447, - "grad_norm": 0.5744338631629944, - "learning_rate": 3.1808731808731813e-06, - "loss": 0.0616, - "step": 15050 - }, - { - "epoch": 4.472824472824473, - "grad_norm": 0.2627275288105011, - "learning_rate": 3.163053163053163e-06, - "loss": 0.0545, - "step": 15060 - }, - { - "epoch": 4.475794475794475, - "grad_norm": 0.6903046369552612, - "learning_rate": 3.1452331452331452e-06, - "loss": 0.0538, - "step": 15070 - }, - { - "epoch": 4.478764478764479, - "grad_norm": 0.49576228857040405, - "learning_rate": 3.1274131274131274e-06, - "loss": 0.0555, - "step": 15080 - }, - { - "epoch": 4.481734481734482, - "grad_norm": 0.5750555396080017, - "learning_rate": 3.1095931095931096e-06, - "loss": 0.0526, - "step": 15090 - }, - { - "epoch": 4.484704484704484, - "grad_norm": 0.5842902660369873, - "learning_rate": 3.0917730917730917e-06, - "loss": 0.0654, - "step": 15100 - }, - { - "epoch": 4.487674487674488, - "grad_norm": 0.6240746974945068, - "learning_rate": 3.073953073953074e-06, - "loss": 0.0542, - "step": 15110 - }, - { - "epoch": 4.490644490644491, - "grad_norm": 0.5041930079460144, - "learning_rate": 3.0561330561330565e-06, - "loss": 0.0539, - "step": 15120 - }, - { - "epoch": 4.493614493614493, - "grad_norm": 0.7403512597084045, - "learning_rate": 3.0383130383130387e-06, - "loss": 0.0514, - "step": 15130 - }, - { - "epoch": 4.4965844965844965, - "grad_norm": 0.39922061562538147, - "learning_rate": 3.0204930204930204e-06, - "loss": 0.0552, - "step": 15140 - }, - { - "epoch": 4.4995544995545, - "grad_norm": 0.2986512780189514, - "learning_rate": 3.0026730026730026e-06, - "loss": 0.0417, - "step": 15150 - }, - { - "epoch": 4.502524502524502, - "grad_norm": 0.5681390166282654, - "learning_rate": 2.9848529848529848e-06, - "loss": 0.0576, - "step": 15160 - }, - { - "epoch": 4.5054945054945055, - "grad_norm": 0.3006349802017212, - "learning_rate": 2.967032967032967e-06, - "loss": 0.0556, - "step": 15170 - }, - { - "epoch": 4.508464508464509, - "grad_norm": 0.35743093490600586, - "learning_rate": 2.949212949212949e-06, - "loss": 0.0598, - "step": 15180 - }, - { - "epoch": 4.511434511434511, - "grad_norm": 0.7890453934669495, - "learning_rate": 2.9313929313929317e-06, - "loss": 0.0611, - "step": 15190 - }, - { - "epoch": 4.514404514404514, - "grad_norm": 0.5027909874916077, - "learning_rate": 2.913572913572914e-06, - "loss": 0.0599, - "step": 15200 - }, - { - "epoch": 4.517374517374518, - "grad_norm": 0.41626325249671936, - "learning_rate": 2.895752895752896e-06, - "loss": 0.0689, - "step": 15210 - }, - { - "epoch": 4.52034452034452, - "grad_norm": 0.48036375641822815, - "learning_rate": 2.877932877932878e-06, - "loss": 0.0591, - "step": 15220 - }, - { - "epoch": 4.523314523314523, - "grad_norm": 0.3339380919933319, - "learning_rate": 2.86011286011286e-06, - "loss": 0.0522, - "step": 15230 - }, - { - "epoch": 4.526284526284527, - "grad_norm": 0.5192808508872986, - "learning_rate": 2.842292842292842e-06, - "loss": 0.0495, - "step": 15240 - }, - { - "epoch": 4.529254529254529, - "grad_norm": 0.39185869693756104, - "learning_rate": 2.8244728244728243e-06, - "loss": 0.0523, - "step": 15250 - }, - { - "epoch": 4.532224532224532, - "grad_norm": 0.5810967683792114, - "learning_rate": 2.806652806652807e-06, - "loss": 0.0596, - "step": 15260 - }, - { - "epoch": 4.5351945351945355, - "grad_norm": 0.48891574144363403, - "learning_rate": 2.788832788832789e-06, - "loss": 0.0684, - "step": 15270 - }, - { - "epoch": 4.538164538164538, - "grad_norm": 0.6249604821205139, - "learning_rate": 2.7710127710127712e-06, - "loss": 0.0605, - "step": 15280 - }, - { - "epoch": 4.541134541134541, - "grad_norm": 0.5719090700149536, - "learning_rate": 2.7531927531927534e-06, - "loss": 0.0513, - "step": 15290 - }, - { - "epoch": 4.5441045441045445, - "grad_norm": 0.5488110780715942, - "learning_rate": 2.7353727353727356e-06, - "loss": 0.0685, - "step": 15300 - }, - { - "epoch": 4.547074547074547, - "grad_norm": 0.38646382093429565, - "learning_rate": 2.7175527175527173e-06, - "loss": 0.0557, - "step": 15310 - }, - { - "epoch": 4.55004455004455, - "grad_norm": 0.4562876224517822, - "learning_rate": 2.6997326997326995e-06, - "loss": 0.0538, - "step": 15320 - }, - { - "epoch": 4.553014553014553, - "grad_norm": 0.3206016719341278, - "learning_rate": 2.681912681912682e-06, - "loss": 0.0469, - "step": 15330 - }, - { - "epoch": 4.555984555984556, - "grad_norm": 0.4230201542377472, - "learning_rate": 2.6640926640926642e-06, - "loss": 0.064, - "step": 15340 - }, - { - "epoch": 4.558954558954559, - "grad_norm": 0.6635040640830994, - "learning_rate": 2.6462726462726464e-06, - "loss": 0.0572, - "step": 15350 - }, - { - "epoch": 4.561924561924562, - "grad_norm": 0.6302227973937988, - "learning_rate": 2.6284526284526286e-06, - "loss": 0.0706, - "step": 15360 - }, - { - "epoch": 4.564894564894565, - "grad_norm": 0.6194272637367249, - "learning_rate": 2.6106326106326108e-06, - "loss": 0.0787, - "step": 15370 - }, - { - "epoch": 4.567864567864568, - "grad_norm": 0.7719616293907166, - "learning_rate": 2.592812592812593e-06, - "loss": 0.0657, - "step": 15380 - }, - { - "epoch": 4.57083457083457, - "grad_norm": 0.541800856590271, - "learning_rate": 2.574992574992575e-06, - "loss": 0.0691, - "step": 15390 - }, - { - "epoch": 4.573804573804574, - "grad_norm": 0.4170493483543396, - "learning_rate": 2.5571725571725573e-06, - "loss": 0.0623, - "step": 15400 - }, - { - "epoch": 4.576774576774577, - "grad_norm": 0.6817463636398315, - "learning_rate": 2.5393525393525394e-06, - "loss": 0.0475, - "step": 15410 - }, - { - "epoch": 4.579744579744579, - "grad_norm": 0.5324716567993164, - "learning_rate": 2.5215325215325216e-06, - "loss": 0.0609, - "step": 15420 - }, - { - "epoch": 4.5827145827145825, - "grad_norm": 0.3345739245414734, - "learning_rate": 2.5037125037125038e-06, - "loss": 0.0547, - "step": 15430 - }, - { - "epoch": 4.585684585684586, - "grad_norm": 0.5235359072685242, - "learning_rate": 2.485892485892486e-06, - "loss": 0.0439, - "step": 15440 - }, - { - "epoch": 4.588654588654588, - "grad_norm": 0.5767530202865601, - "learning_rate": 2.468072468072468e-06, - "loss": 0.0509, - "step": 15450 - }, - { - "epoch": 4.5916245916245915, - "grad_norm": 0.37491995096206665, - "learning_rate": 2.4502524502524507e-06, - "loss": 0.0537, - "step": 15460 - }, - { - "epoch": 4.594594594594595, - "grad_norm": 0.49927496910095215, - "learning_rate": 2.4324324324324325e-06, - "loss": 0.0612, - "step": 15470 - }, - { - "epoch": 4.597564597564597, - "grad_norm": 0.8037787079811096, - "learning_rate": 2.4146124146124146e-06, - "loss": 0.059, - "step": 15480 - }, - { - "epoch": 4.6005346005346, - "grad_norm": 0.6241805553436279, - "learning_rate": 2.396792396792397e-06, - "loss": 0.0667, - "step": 15490 - }, - { - "epoch": 4.603504603504604, - "grad_norm": 0.4899803698062897, - "learning_rate": 2.378972378972379e-06, - "loss": 0.0547, - "step": 15500 - }, - { - "epoch": 4.606474606474606, - "grad_norm": 0.7477651834487915, - "learning_rate": 2.361152361152361e-06, - "loss": 0.055, - "step": 15510 - }, - { - "epoch": 4.609444609444609, - "grad_norm": 0.35865089297294617, - "learning_rate": 2.3433323433323433e-06, - "loss": 0.0518, - "step": 15520 - }, - { - "epoch": 4.612414612414613, - "grad_norm": 0.6939175128936768, - "learning_rate": 2.325512325512326e-06, - "loss": 0.0556, - "step": 15530 - }, - { - "epoch": 4.615384615384615, - "grad_norm": 0.6515450477600098, - "learning_rate": 2.307692307692308e-06, - "loss": 0.066, - "step": 15540 - }, - { - "epoch": 4.618354618354618, - "grad_norm": 0.5336460471153259, - "learning_rate": 2.28987228987229e-06, - "loss": 0.0601, - "step": 15550 - }, - { - "epoch": 4.6213246213246215, - "grad_norm": 0.43573182821273804, - "learning_rate": 2.272052272052272e-06, - "loss": 0.062, - "step": 15560 - }, - { - "epoch": 4.624294624294624, - "grad_norm": 0.5898021459579468, - "learning_rate": 2.254232254232254e-06, - "loss": 0.0508, - "step": 15570 - }, - { - "epoch": 4.627264627264627, - "grad_norm": 0.4121955931186676, - "learning_rate": 2.2364122364122363e-06, - "loss": 0.0572, - "step": 15580 - }, - { - "epoch": 4.63023463023463, - "grad_norm": 0.4609485864639282, - "learning_rate": 2.2185922185922185e-06, - "loss": 0.0569, - "step": 15590 - }, - { - "epoch": 4.633204633204633, - "grad_norm": 0.5508905053138733, - "learning_rate": 2.200772200772201e-06, - "loss": 0.0514, - "step": 15600 - }, - { - "epoch": 4.636174636174636, - "grad_norm": 0.2802213430404663, - "learning_rate": 2.1829521829521833e-06, - "loss": 0.0655, - "step": 15610 - }, - { - "epoch": 4.639144639144639, - "grad_norm": 0.4371926784515381, - "learning_rate": 2.1651321651321654e-06, - "loss": 0.0651, - "step": 15620 - }, - { - "epoch": 4.642114642114642, - "grad_norm": 0.42453938722610474, - "learning_rate": 2.147312147312147e-06, - "loss": 0.0511, - "step": 15630 - }, - { - "epoch": 4.645084645084645, - "grad_norm": 0.5437641143798828, - "learning_rate": 2.1294921294921294e-06, - "loss": 0.0668, - "step": 15640 - }, - { - "epoch": 4.648054648054648, - "grad_norm": 0.3894469141960144, - "learning_rate": 2.1116721116721115e-06, - "loss": 0.062, - "step": 15650 - }, - { - "epoch": 4.651024651024651, - "grad_norm": 0.47583234310150146, - "learning_rate": 2.0938520938520937e-06, - "loss": 0.0566, - "step": 15660 - }, - { - "epoch": 4.653994653994654, - "grad_norm": 0.4548485279083252, - "learning_rate": 2.0760320760320763e-06, - "loss": 0.056, - "step": 15670 - }, - { - "epoch": 4.656964656964657, - "grad_norm": 0.802291989326477, - "learning_rate": 2.0582120582120585e-06, - "loss": 0.0655, - "step": 15680 - }, - { - "epoch": 4.65993465993466, - "grad_norm": 0.7516531944274902, - "learning_rate": 2.0403920403920406e-06, - "loss": 0.0639, - "step": 15690 - }, - { - "epoch": 4.662904662904663, - "grad_norm": 0.32423585653305054, - "learning_rate": 2.022572022572023e-06, - "loss": 0.0569, - "step": 15700 - }, - { - "epoch": 4.665874665874666, - "grad_norm": 0.6043174266815186, - "learning_rate": 2.0047520047520046e-06, - "loss": 0.0579, - "step": 15710 - }, - { - "epoch": 4.6688446688446685, - "grad_norm": 0.6407160758972168, - "learning_rate": 1.9869319869319867e-06, - "loss": 0.0567, - "step": 15720 - }, - { - "epoch": 4.671814671814672, - "grad_norm": 0.4470699727535248, - "learning_rate": 1.969111969111969e-06, - "loss": 0.0588, - "step": 15730 - }, - { - "epoch": 4.674784674784675, - "grad_norm": 0.582695484161377, - "learning_rate": 1.9512919512919515e-06, - "loss": 0.0617, - "step": 15740 - }, - { - "epoch": 4.6777546777546775, - "grad_norm": 0.5506203770637512, - "learning_rate": 1.9334719334719337e-06, - "loss": 0.0561, - "step": 15750 - }, - { - "epoch": 4.680724680724681, - "grad_norm": 0.47693562507629395, - "learning_rate": 1.915651915651916e-06, - "loss": 0.0681, - "step": 15760 - }, - { - "epoch": 4.683694683694684, - "grad_norm": 0.5993645191192627, - "learning_rate": 1.8978318978318978e-06, - "loss": 0.0568, - "step": 15770 - }, - { - "epoch": 4.686664686664686, - "grad_norm": 0.4940324127674103, - "learning_rate": 1.88001188001188e-06, - "loss": 0.0598, - "step": 15780 - }, - { - "epoch": 4.68963468963469, - "grad_norm": 0.2504422664642334, - "learning_rate": 1.8621918621918623e-06, - "loss": 0.0529, - "step": 15790 - }, - { - "epoch": 4.692604692604693, - "grad_norm": 0.46793416142463684, - "learning_rate": 1.8443718443718445e-06, - "loss": 0.0512, - "step": 15800 - }, - { - "epoch": 4.695574695574695, - "grad_norm": 0.5812580585479736, - "learning_rate": 1.8265518265518265e-06, - "loss": 0.0651, - "step": 15810 - }, - { - "epoch": 4.698544698544699, - "grad_norm": 0.6013128757476807, - "learning_rate": 1.8087318087318088e-06, - "loss": 0.0468, - "step": 15820 - }, - { - "epoch": 4.701514701514702, - "grad_norm": 0.7705390453338623, - "learning_rate": 1.790911790911791e-06, - "loss": 0.068, - "step": 15830 - }, - { - "epoch": 4.704484704484704, - "grad_norm": 0.3266391158103943, - "learning_rate": 1.7730917730917732e-06, - "loss": 0.0599, - "step": 15840 - }, - { - "epoch": 4.7074547074547075, - "grad_norm": 0.5801645517349243, - "learning_rate": 1.7552717552717551e-06, - "loss": 0.05, - "step": 15850 - }, - { - "epoch": 4.710424710424711, - "grad_norm": 0.4369991719722748, - "learning_rate": 1.7374517374517375e-06, - "loss": 0.0555, - "step": 15860 - }, - { - "epoch": 4.713394713394713, - "grad_norm": 0.411531925201416, - "learning_rate": 1.7196317196317197e-06, - "loss": 0.0548, - "step": 15870 - }, - { - "epoch": 4.716364716364716, - "grad_norm": 0.3529096841812134, - "learning_rate": 1.7018117018117019e-06, - "loss": 0.0603, - "step": 15880 - }, - { - "epoch": 4.71933471933472, - "grad_norm": 0.4712686240673065, - "learning_rate": 1.683991683991684e-06, - "loss": 0.044, - "step": 15890 - }, - { - "epoch": 4.722304722304722, - "grad_norm": 0.5159938335418701, - "learning_rate": 1.6661716661716662e-06, - "loss": 0.0581, - "step": 15900 - }, - { - "epoch": 4.725274725274725, - "grad_norm": 0.7334055304527283, - "learning_rate": 1.6483516483516484e-06, - "loss": 0.0515, - "step": 15910 - }, - { - "epoch": 4.728244728244729, - "grad_norm": 0.6379159688949585, - "learning_rate": 1.6305316305316306e-06, - "loss": 0.0504, - "step": 15920 - }, - { - "epoch": 4.731214731214731, - "grad_norm": 0.45981767773628235, - "learning_rate": 1.6127116127116127e-06, - "loss": 0.0656, - "step": 15930 - }, - { - "epoch": 4.734184734184734, - "grad_norm": 0.39534708857536316, - "learning_rate": 1.594891594891595e-06, - "loss": 0.0563, - "step": 15940 - }, - { - "epoch": 4.737154737154738, - "grad_norm": 0.40459519624710083, - "learning_rate": 1.577071577071577e-06, - "loss": 0.0539, - "step": 15950 - }, - { - "epoch": 4.74012474012474, - "grad_norm": 0.353635311126709, - "learning_rate": 1.5592515592515594e-06, - "loss": 0.0473, - "step": 15960 - }, - { - "epoch": 4.743094743094743, - "grad_norm": 0.45498237013816833, - "learning_rate": 1.5414315414315414e-06, - "loss": 0.0547, - "step": 15970 - }, - { - "epoch": 4.7460647460647465, - "grad_norm": 0.47604072093963623, - "learning_rate": 1.5236115236115236e-06, - "loss": 0.0664, - "step": 15980 - }, - { - "epoch": 4.749034749034749, - "grad_norm": 0.5030866265296936, - "learning_rate": 1.5057915057915057e-06, - "loss": 0.0542, - "step": 15990 - }, - { - "epoch": 4.752004752004752, - "grad_norm": 0.6311854124069214, - "learning_rate": 1.4879714879714881e-06, - "loss": 0.0574, - "step": 16000 - }, - { - "epoch": 4.754974754974755, - "grad_norm": 0.44515228271484375, - "learning_rate": 1.47015147015147e-06, - "loss": 0.0396, - "step": 16010 - }, - { - "epoch": 4.757944757944758, - "grad_norm": 0.31190499663352966, - "learning_rate": 1.4523314523314523e-06, - "loss": 0.048, - "step": 16020 - }, - { - "epoch": 4.760914760914761, - "grad_norm": 0.3565562069416046, - "learning_rate": 1.4345114345114346e-06, - "loss": 0.0548, - "step": 16030 - }, - { - "epoch": 4.763884763884764, - "grad_norm": 0.4140501320362091, - "learning_rate": 1.4166914166914168e-06, - "loss": 0.0577, - "step": 16040 - }, - { - "epoch": 4.766854766854767, - "grad_norm": 0.5318161845207214, - "learning_rate": 1.3988713988713988e-06, - "loss": 0.052, - "step": 16050 - }, - { - "epoch": 4.76982476982477, - "grad_norm": 0.48852646350860596, - "learning_rate": 1.381051381051381e-06, - "loss": 0.0738, - "step": 16060 - }, - { - "epoch": 4.772794772794773, - "grad_norm": 0.501015305519104, - "learning_rate": 1.3632313632313633e-06, - "loss": 0.0584, - "step": 16070 - }, - { - "epoch": 4.775764775764776, - "grad_norm": 0.46425512433052063, - "learning_rate": 1.3454113454113455e-06, - "loss": 0.0639, - "step": 16080 - }, - { - "epoch": 4.778734778734779, - "grad_norm": 0.4860481321811676, - "learning_rate": 1.3275913275913275e-06, - "loss": 0.0624, - "step": 16090 - }, - { - "epoch": 4.781704781704782, - "grad_norm": 0.7363678812980652, - "learning_rate": 1.3097713097713098e-06, - "loss": 0.0735, - "step": 16100 - }, - { - "epoch": 4.784674784674785, - "grad_norm": 0.6220631003379822, - "learning_rate": 1.291951291951292e-06, - "loss": 0.0591, - "step": 16110 - }, - { - "epoch": 4.787644787644788, - "grad_norm": 0.3801935613155365, - "learning_rate": 1.2741312741312742e-06, - "loss": 0.0486, - "step": 16120 - }, - { - "epoch": 4.79061479061479, - "grad_norm": 0.39542245864868164, - "learning_rate": 1.2563112563112563e-06, - "loss": 0.0567, - "step": 16130 - }, - { - "epoch": 4.7935847935847935, - "grad_norm": 0.6074013113975525, - "learning_rate": 1.2384912384912385e-06, - "loss": 0.0597, - "step": 16140 - }, - { - "epoch": 4.796554796554797, - "grad_norm": 0.3309972584247589, - "learning_rate": 1.2206712206712207e-06, - "loss": 0.0726, - "step": 16150 - }, - { - "epoch": 4.799524799524799, - "grad_norm": 0.5621445775032043, - "learning_rate": 1.2028512028512029e-06, - "loss": 0.0561, - "step": 16160 - }, - { - "epoch": 4.802494802494802, - "grad_norm": 0.571205198764801, - "learning_rate": 1.185031185031185e-06, - "loss": 0.0476, - "step": 16170 - }, - { - "epoch": 4.805464805464806, - "grad_norm": 0.4768125116825104, - "learning_rate": 1.1672111672111672e-06, - "loss": 0.0513, - "step": 16180 - }, - { - "epoch": 4.808434808434808, - "grad_norm": 0.5495672821998596, - "learning_rate": 1.1493911493911494e-06, - "loss": 0.0585, - "step": 16190 - }, - { - "epoch": 4.811404811404811, - "grad_norm": 0.4319486916065216, - "learning_rate": 1.1315711315711318e-06, - "loss": 0.0559, - "step": 16200 - }, - { - "epoch": 4.814374814374815, - "grad_norm": 0.5664613246917725, - "learning_rate": 1.1137511137511137e-06, - "loss": 0.0524, - "step": 16210 - }, - { - "epoch": 4.817344817344817, - "grad_norm": 0.4833865463733673, - "learning_rate": 1.0959310959310959e-06, - "loss": 0.0592, - "step": 16220 - }, - { - "epoch": 4.82031482031482, - "grad_norm": 0.49978017807006836, - "learning_rate": 1.078111078111078e-06, - "loss": 0.0616, - "step": 16230 - }, - { - "epoch": 4.8232848232848236, - "grad_norm": 0.434505432844162, - "learning_rate": 1.0602910602910604e-06, - "loss": 0.0699, - "step": 16240 - }, - { - "epoch": 4.826254826254826, - "grad_norm": 0.4497815668582916, - "learning_rate": 1.0424710424710424e-06, - "loss": 0.0573, - "step": 16250 - }, - { - "epoch": 4.829224829224829, - "grad_norm": 0.5861119031906128, - "learning_rate": 1.0246510246510246e-06, - "loss": 0.0623, - "step": 16260 - }, - { - "epoch": 4.8321948321948325, - "grad_norm": 0.3796347677707672, - "learning_rate": 1.006831006831007e-06, - "loss": 0.0628, - "step": 16270 - }, - { - "epoch": 4.835164835164835, - "grad_norm": 0.5198697447776794, - "learning_rate": 9.890109890109891e-07, - "loss": 0.0502, - "step": 16280 - }, - { - "epoch": 4.838134838134838, - "grad_norm": 0.8420373797416687, - "learning_rate": 9.711909711909713e-07, - "loss": 0.0627, - "step": 16290 - }, - { - "epoch": 4.841104841104841, - "grad_norm": 0.5385600328445435, - "learning_rate": 9.533709533709534e-07, - "loss": 0.054, - "step": 16300 - }, - { - "epoch": 4.844074844074844, - "grad_norm": 0.675041913986206, - "learning_rate": 9.355509355509356e-07, - "loss": 0.056, - "step": 16310 - }, - { - "epoch": 4.847044847044847, - "grad_norm": 0.6432201862335205, - "learning_rate": 9.177309177309178e-07, - "loss": 0.046, - "step": 16320 - }, - { - "epoch": 4.85001485001485, - "grad_norm": 0.26743176579475403, - "learning_rate": 8.999108999109e-07, - "loss": 0.0611, - "step": 16330 - }, - { - "epoch": 4.852984852984853, - "grad_norm": 0.4432642459869385, - "learning_rate": 8.820908820908821e-07, - "loss": 0.0514, - "step": 16340 - }, - { - "epoch": 4.855954855954856, - "grad_norm": 0.6377805471420288, - "learning_rate": 8.642708642708643e-07, - "loss": 0.0523, - "step": 16350 - }, - { - "epoch": 4.858924858924859, - "grad_norm": 0.34250327944755554, - "learning_rate": 8.464508464508465e-07, - "loss": 0.053, - "step": 16360 - }, - { - "epoch": 4.861894861894862, - "grad_norm": 0.32881438732147217, - "learning_rate": 8.286308286308286e-07, - "loss": 0.0568, - "step": 16370 - }, - { - "epoch": 4.864864864864865, - "grad_norm": 0.38570886850357056, - "learning_rate": 8.108108108108109e-07, - "loss": 0.0611, - "step": 16380 - }, - { - "epoch": 4.867834867834868, - "grad_norm": 0.34529945254325867, - "learning_rate": 7.92990792990793e-07, - "loss": 0.0528, - "step": 16390 - }, - { - "epoch": 4.870804870804871, - "grad_norm": 0.401865690946579, - "learning_rate": 7.751707751707753e-07, - "loss": 0.0592, - "step": 16400 - }, - { - "epoch": 4.873774873774874, - "grad_norm": 0.4996122419834137, - "learning_rate": 7.573507573507573e-07, - "loss": 0.0717, - "step": 16410 - }, - { - "epoch": 4.876744876744877, - "grad_norm": 0.9507100582122803, - "learning_rate": 7.395307395307396e-07, - "loss": 0.0518, - "step": 16420 - }, - { - "epoch": 4.8797148797148795, - "grad_norm": 0.5046108961105347, - "learning_rate": 7.217107217107217e-07, - "loss": 0.0649, - "step": 16430 - }, - { - "epoch": 4.882684882684883, - "grad_norm": 0.6091616153717041, - "learning_rate": 7.03890703890704e-07, - "loss": 0.0544, - "step": 16440 - }, - { - "epoch": 4.885654885654886, - "grad_norm": 0.526391863822937, - "learning_rate": 6.860706860706861e-07, - "loss": 0.0581, - "step": 16450 - }, - { - "epoch": 4.888624888624888, - "grad_norm": 0.6778053641319275, - "learning_rate": 6.682506682506683e-07, - "loss": 0.0534, - "step": 16460 - }, - { - "epoch": 4.891594891594892, - "grad_norm": 0.5965909957885742, - "learning_rate": 6.504306504306505e-07, - "loss": 0.0584, - "step": 16470 - }, - { - "epoch": 4.894564894564894, - "grad_norm": 0.4215756058692932, - "learning_rate": 6.326106326106326e-07, - "loss": 0.0603, - "step": 16480 - }, - { - "epoch": 4.897534897534897, - "grad_norm": 0.3755158483982086, - "learning_rate": 6.147906147906148e-07, - "loss": 0.0571, - "step": 16490 - }, - { - "epoch": 4.900504900504901, - "grad_norm": 0.6438432335853577, - "learning_rate": 5.969705969705971e-07, - "loss": 0.0544, - "step": 16500 - }, - { - "epoch": 4.903474903474903, - "grad_norm": 0.4326302111148834, - "learning_rate": 5.791505791505791e-07, - "loss": 0.064, - "step": 16510 - }, - { - "epoch": 4.906444906444906, - "grad_norm": 0.39046719670295715, - "learning_rate": 5.613305613305614e-07, - "loss": 0.0541, - "step": 16520 - }, - { - "epoch": 4.9094149094149095, - "grad_norm": 0.35211583971977234, - "learning_rate": 5.435105435105435e-07, - "loss": 0.0629, - "step": 16530 - }, - { - "epoch": 4.912384912384912, - "grad_norm": 0.6884411573410034, - "learning_rate": 5.256905256905258e-07, - "loss": 0.0538, - "step": 16540 - }, - { - "epoch": 4.915354915354915, - "grad_norm": 0.322244793176651, - "learning_rate": 5.078705078705078e-07, - "loss": 0.0696, - "step": 16550 - }, - { - "epoch": 4.9183249183249185, - "grad_norm": 0.5389405488967896, - "learning_rate": 4.900504900504901e-07, - "loss": 0.0568, - "step": 16560 - }, - { - "epoch": 4.921294921294921, - "grad_norm": 0.5457450747489929, - "learning_rate": 4.7223047223047227e-07, - "loss": 0.0529, - "step": 16570 - }, - { - "epoch": 4.924264924264924, - "grad_norm": 0.4502221643924713, - "learning_rate": 4.544104544104544e-07, - "loss": 0.0438, - "step": 16580 - }, - { - "epoch": 4.927234927234927, - "grad_norm": 0.486514151096344, - "learning_rate": 4.365904365904366e-07, - "loss": 0.0625, - "step": 16590 - }, - { - "epoch": 4.93020493020493, - "grad_norm": 0.5433149337768555, - "learning_rate": 4.187704187704188e-07, - "loss": 0.0605, - "step": 16600 - }, - { - "epoch": 4.933174933174933, - "grad_norm": 0.7155877351760864, - "learning_rate": 4.0095040095040095e-07, - "loss": 0.0538, - "step": 16610 - }, - { - "epoch": 4.936144936144936, - "grad_norm": 0.24715302884578705, - "learning_rate": 3.831303831303831e-07, - "loss": 0.0484, - "step": 16620 - }, - { - "epoch": 4.939114939114939, - "grad_norm": 0.47398287057876587, - "learning_rate": 3.653103653103653e-07, - "loss": 0.0638, - "step": 16630 - }, - { - "epoch": 4.942084942084942, - "grad_norm": 0.31817615032196045, - "learning_rate": 3.4749034749034746e-07, - "loss": 0.0498, - "step": 16640 - }, - { - "epoch": 4.945054945054945, - "grad_norm": 0.30618026852607727, - "learning_rate": 3.296703296703297e-07, - "loss": 0.0606, - "step": 16650 - }, - { - "epoch": 4.948024948024948, - "grad_norm": 0.29916951060295105, - "learning_rate": 3.1185031185031186e-07, - "loss": 0.0471, - "step": 16660 - }, - { - "epoch": 4.950994950994951, - "grad_norm": 0.711303174495697, - "learning_rate": 2.9403029403029403e-07, - "loss": 0.0548, - "step": 16670 - }, - { - "epoch": 4.953964953964954, - "grad_norm": 0.34268659353256226, - "learning_rate": 2.762102762102762e-07, - "loss": 0.052, - "step": 16680 - }, - { - "epoch": 4.956934956934957, - "grad_norm": 0.5266901850700378, - "learning_rate": 2.5839025839025837e-07, - "loss": 0.061, - "step": 16690 - }, - { - "epoch": 4.95990495990496, - "grad_norm": 0.5868252515792847, - "learning_rate": 2.4057024057024054e-07, - "loss": 0.0563, - "step": 16700 - }, - { - "epoch": 4.962874962874963, - "grad_norm": 0.24245183169841766, - "learning_rate": 2.2275022275022276e-07, - "loss": 0.0639, - "step": 16710 - }, - { - "epoch": 4.9658449658449655, - "grad_norm": 0.4494710862636566, - "learning_rate": 2.0493020493020493e-07, - "loss": 0.0498, - "step": 16720 - }, - { - "epoch": 4.968814968814969, - "grad_norm": 0.525842010974884, - "learning_rate": 1.8711018711018713e-07, - "loss": 0.0578, - "step": 16730 - }, - { - "epoch": 4.971784971784972, - "grad_norm": 0.6086418032646179, - "learning_rate": 1.692901692901693e-07, - "loss": 0.0424, - "step": 16740 - }, - { - "epoch": 4.974754974754974, - "grad_norm": 0.4772314429283142, - "learning_rate": 1.5147015147015147e-07, - "loss": 0.0534, - "step": 16750 - }, - { - "epoch": 4.977724977724978, - "grad_norm": 0.7088252305984497, - "learning_rate": 1.3365013365013367e-07, - "loss": 0.055, - "step": 16760 - }, - { - "epoch": 4.980694980694981, - "grad_norm": 0.5903290510177612, - "learning_rate": 1.1583011583011584e-07, - "loss": 0.0549, - "step": 16770 - }, - { - "epoch": 4.983664983664983, - "grad_norm": 0.48824286460876465, - "learning_rate": 9.801009801009801e-08, - "loss": 0.0468, - "step": 16780 - }, - { - "epoch": 4.986634986634987, - "grad_norm": 0.3941417634487152, - "learning_rate": 8.019008019008019e-08, - "loss": 0.0708, - "step": 16790 - }, - { - "epoch": 4.98960498960499, - "grad_norm": 0.5667988657951355, - "learning_rate": 6.237006237006238e-08, - "loss": 0.0532, - "step": 16800 - }, - { - "epoch": 4.992574992574992, - "grad_norm": 0.4835197627544403, - "learning_rate": 4.4550044550044554e-08, - "loss": 0.0511, - "step": 16810 - }, - { - "epoch": 4.9955449955449955, - "grad_norm": 0.5256985425949097, - "learning_rate": 2.673002673002673e-08, - "loss": 0.0624, - "step": 16820 - }, - { - "epoch": 4.998514998514999, - "grad_norm": 0.398252934217453, - "learning_rate": 8.91000891000891e-09, - "loss": 0.0563, - "step": 16830 - }, - { - "epoch": 5.0, - "eval_f1": 0.49727767695099817, - "eval_loss": 0.053983673453330994, - "eval_runtime": 176.2895, - "eval_samples_per_second": 215.662, - "eval_steps_per_second": 3.375, - "step": 16835 - }, - { - "epoch": 4.251451653622823, - "grad_norm": 0.4622216820716858, - "learning_rate": 4.491290078263065e-06, - "loss": 0.0754, - "step": 16840 - }, - { - "epoch": 4.253976268619035, - "grad_norm": 0.38623130321502686, - "learning_rate": 4.476142388285787e-06, - "loss": 0.0505, - "step": 16850 - }, - { - "epoch": 4.2565008836152485, - "grad_norm": 0.32597488164901733, - "learning_rate": 4.460994698308508e-06, - "loss": 0.0473, - "step": 16860 - }, - { - "epoch": 4.259025498611462, - "grad_norm": 0.599904477596283, - "learning_rate": 4.44584700833123e-06, - "loss": 0.0524, - "step": 16870 - }, - { - "epoch": 4.261550113607675, - "grad_norm": 0.4074048101902008, - "learning_rate": 4.4306993183539506e-06, - "loss": 0.0605, - "step": 16880 - }, - { - "epoch": 4.264074728603888, - "grad_norm": 0.626695454120636, - "learning_rate": 4.415551628376672e-06, - "loss": 0.0584, - "step": 16890 - }, - { - "epoch": 4.266599343600101, - "grad_norm": 0.46520286798477173, - "learning_rate": 4.400403938399395e-06, - "loss": 0.0452, - "step": 16900 - }, - { - "epoch": 4.269123958596314, - "grad_norm": 0.7951592206954956, - "learning_rate": 4.385256248422115e-06, - "loss": 0.071, - "step": 16910 - }, - { - "epoch": 4.271648573592527, - "grad_norm": 0.5409834384918213, - "learning_rate": 4.370108558444837e-06, - "loss": 0.0467, - "step": 16920 - }, - { - "epoch": 4.27417318858874, - "grad_norm": 0.6036372780799866, - "learning_rate": 4.354960868467559e-06, - "loss": 0.064, - "step": 16930 - }, - { - "epoch": 4.276697803584954, - "grad_norm": 0.4542910158634186, - "learning_rate": 4.33981317849028e-06, - "loss": 0.0547, - "step": 16940 - }, - { - "epoch": 4.279222418581166, - "grad_norm": 0.6374622583389282, - "learning_rate": 4.324665488513002e-06, - "loss": 0.0537, - "step": 16950 - }, - { - "epoch": 4.281747033577379, - "grad_norm": 0.6870420575141907, - "learning_rate": 4.309517798535723e-06, - "loss": 0.0639, - "step": 16960 - }, - { - "epoch": 4.2842716485735926, - "grad_norm": 0.24296802282333374, - "learning_rate": 4.294370108558445e-06, - "loss": 0.0614, - "step": 16970 - }, - { - "epoch": 4.286796263569806, - "grad_norm": 0.5068966150283813, - "learning_rate": 4.2792224185811665e-06, - "loss": 0.0667, - "step": 16980 - }, - { - "epoch": 4.289320878566019, - "grad_norm": 0.49634042382240295, - "learning_rate": 4.264074728603888e-06, - "loss": 0.0482, - "step": 16990 - }, - { - "epoch": 4.291845493562231, - "grad_norm": 0.8153424263000488, - "learning_rate": 4.24892703862661e-06, - "loss": 0.0542, - "step": 17000 - }, - { - "epoch": 4.294370108558445, - "grad_norm": 0.19083431363105774, - "learning_rate": 4.233779348649331e-06, - "loss": 0.0612, - "step": 17010 - }, - { - "epoch": 4.296894723554658, - "grad_norm": 0.4229993522167206, - "learning_rate": 4.218631658672053e-06, - "loss": 0.0518, - "step": 17020 - }, - { - "epoch": 4.299419338550871, - "grad_norm": 0.8197377920150757, - "learning_rate": 4.2034839686947745e-06, - "loss": 0.0547, - "step": 17030 - }, - { - "epoch": 4.3019439535470845, - "grad_norm": 0.44996774196624756, - "learning_rate": 4.188336278717495e-06, - "loss": 0.0495, - "step": 17040 - }, - { - "epoch": 4.304468568543297, - "grad_norm": 0.4352714419364929, - "learning_rate": 4.173188588740217e-06, - "loss": 0.047, - "step": 17050 - }, - { - "epoch": 4.30699318353951, - "grad_norm": 0.3896523714065552, - "learning_rate": 4.158040898762939e-06, - "loss": 0.0599, - "step": 17060 - }, - { - "epoch": 4.309517798535723, - "grad_norm": 0.6314728260040283, - "learning_rate": 4.14289320878566e-06, - "loss": 0.0604, - "step": 17070 - }, - { - "epoch": 4.312042413531937, - "grad_norm": 0.6164297461509705, - "learning_rate": 4.127745518808382e-06, - "loss": 0.0649, - "step": 17080 - }, - { - "epoch": 4.314567028528149, - "grad_norm": 0.47392478585243225, - "learning_rate": 4.112597828831104e-06, - "loss": 0.0482, - "step": 17090 - }, - { - "epoch": 4.317091643524362, - "grad_norm": 0.4184396266937256, - "learning_rate": 4.097450138853825e-06, - "loss": 0.0576, - "step": 17100 - }, - { - "epoch": 4.3196162585205755, - "grad_norm": 0.3965582251548767, - "learning_rate": 4.082302448876546e-06, - "loss": 0.0658, - "step": 17110 - }, - { - "epoch": 4.322140873516789, - "grad_norm": 0.4759332835674286, - "learning_rate": 4.067154758899268e-06, - "loss": 0.0694, - "step": 17120 - }, - { - "epoch": 4.324665488513002, - "grad_norm": 0.6103851795196533, - "learning_rate": 4.0520070689219896e-06, - "loss": 0.0632, - "step": 17130 - }, - { - "epoch": 4.327190103509215, - "grad_norm": 0.3435596525669098, - "learning_rate": 4.036859378944711e-06, - "loss": 0.0632, - "step": 17140 - }, - { - "epoch": 4.329714718505428, - "grad_norm": 0.6255317330360413, - "learning_rate": 4.021711688967433e-06, - "loss": 0.0607, - "step": 17150 - }, - { - "epoch": 4.332239333501641, - "grad_norm": 0.8034877181053162, - "learning_rate": 4.006563998990154e-06, - "loss": 0.0624, - "step": 17160 - }, - { - "epoch": 4.334763948497854, - "grad_norm": 0.5104978084564209, - "learning_rate": 3.991416309012875e-06, - "loss": 0.0723, - "step": 17170 - }, - { - "epoch": 4.337288563494067, - "grad_norm": 0.6457841992378235, - "learning_rate": 3.976268619035597e-06, - "loss": 0.0622, - "step": 17180 - }, - { - "epoch": 4.33981317849028, - "grad_norm": 0.5124953985214233, - "learning_rate": 3.961120929058319e-06, - "loss": 0.0608, - "step": 17190 - }, - { - "epoch": 4.342337793486493, - "grad_norm": 0.4378756582736969, - "learning_rate": 3.94597323908104e-06, - "loss": 0.0483, - "step": 17200 - }, - { - "epoch": 4.344862408482706, - "grad_norm": 0.47140154242515564, - "learning_rate": 3.9308255491037615e-06, - "loss": 0.0649, - "step": 17210 - }, - { - "epoch": 4.34738702347892, - "grad_norm": 0.39003312587738037, - "learning_rate": 3.915677859126484e-06, - "loss": 0.0589, - "step": 17220 - }, - { - "epoch": 4.349911638475133, - "grad_norm": 0.5201835036277771, - "learning_rate": 3.900530169149205e-06, - "loss": 0.0528, - "step": 17230 - }, - { - "epoch": 4.352436253471345, - "grad_norm": 0.4116949439048767, - "learning_rate": 3.885382479171926e-06, - "loss": 0.0531, - "step": 17240 - }, - { - "epoch": 4.3549608684675585, - "grad_norm": 0.39697498083114624, - "learning_rate": 3.870234789194648e-06, - "loss": 0.0733, - "step": 17250 - }, - { - "epoch": 4.357485483463772, - "grad_norm": 0.4850797653198242, - "learning_rate": 3.8550870992173695e-06, - "loss": 0.0654, - "step": 17260 - }, - { - "epoch": 4.360010098459985, - "grad_norm": 0.42553943395614624, - "learning_rate": 3.839939409240091e-06, - "loss": 0.0451, - "step": 17270 - }, - { - "epoch": 4.362534713456198, - "grad_norm": 0.27774763107299805, - "learning_rate": 3.824791719262813e-06, - "loss": 0.0532, - "step": 17280 - }, - { - "epoch": 4.365059328452411, - "grad_norm": 0.36856329441070557, - "learning_rate": 3.8096440292855342e-06, - "loss": 0.0499, - "step": 17290 - }, - { - "epoch": 4.367583943448624, - "grad_norm": 0.6865664720535278, - "learning_rate": 3.7944963393082554e-06, - "loss": 0.0646, - "step": 17300 - }, - { - "epoch": 4.370108558444837, - "grad_norm": 0.809834897518158, - "learning_rate": 3.7793486493309766e-06, - "loss": 0.0623, - "step": 17310 - }, - { - "epoch": 4.37263317344105, - "grad_norm": 0.5114462971687317, - "learning_rate": 3.764200959353699e-06, - "loss": 0.055, - "step": 17320 - }, - { - "epoch": 4.375157788437264, - "grad_norm": 0.6078599095344543, - "learning_rate": 3.74905326937642e-06, - "loss": 0.0654, - "step": 17330 - }, - { - "epoch": 4.377682403433476, - "grad_norm": 0.48811349272727966, - "learning_rate": 3.733905579399142e-06, - "loss": 0.0555, - "step": 17340 - }, - { - "epoch": 4.380207018429689, - "grad_norm": 0.7374588847160339, - "learning_rate": 3.718757889421863e-06, - "loss": 0.0593, - "step": 17350 - }, - { - "epoch": 4.3827316334259026, - "grad_norm": 0.6511560678482056, - "learning_rate": 3.703610199444585e-06, - "loss": 0.0544, - "step": 17360 - }, - { - "epoch": 4.385256248422116, - "grad_norm": 0.4263114333152771, - "learning_rate": 3.6884625094673066e-06, - "loss": 0.064, - "step": 17370 - }, - { - "epoch": 4.387780863418329, - "grad_norm": 0.2922056317329407, - "learning_rate": 3.6733148194900277e-06, - "loss": 0.0573, - "step": 17380 - }, - { - "epoch": 4.390305478414541, - "grad_norm": 0.7642768025398254, - "learning_rate": 3.6581671295127493e-06, - "loss": 0.0558, - "step": 17390 - }, - { - "epoch": 4.392830093410755, - "grad_norm": 0.5975914597511292, - "learning_rate": 3.643019439535471e-06, - "loss": 0.075, - "step": 17400 - }, - { - "epoch": 4.395354708406968, - "grad_norm": 0.4351644515991211, - "learning_rate": 3.6278717495581925e-06, - "loss": 0.0635, - "step": 17410 - }, - { - "epoch": 4.397879323403181, - "grad_norm": 0.6523928046226501, - "learning_rate": 3.612724059580914e-06, - "loss": 0.0545, - "step": 17420 - }, - { - "epoch": 4.4004039383993945, - "grad_norm": 0.4286153018474579, - "learning_rate": 3.5975763696036353e-06, - "loss": 0.0507, - "step": 17430 - }, - { - "epoch": 4.402928553395607, - "grad_norm": 0.402811735868454, - "learning_rate": 3.582428679626357e-06, - "loss": 0.0595, - "step": 17440 - }, - { - "epoch": 4.40545316839182, - "grad_norm": 0.5500208139419556, - "learning_rate": 3.567280989649079e-06, - "loss": 0.0553, - "step": 17450 - }, - { - "epoch": 4.407977783388033, - "grad_norm": 0.7133852243423462, - "learning_rate": 3.5521332996718e-06, - "loss": 0.059, - "step": 17460 - }, - { - "epoch": 4.410502398384247, - "grad_norm": 0.8194918036460876, - "learning_rate": 3.5369856096945217e-06, - "loss": 0.0521, - "step": 17470 - }, - { - "epoch": 4.41302701338046, - "grad_norm": 0.5027428865432739, - "learning_rate": 3.521837919717243e-06, - "loss": 0.0491, - "step": 17480 - }, - { - "epoch": 4.415551628376672, - "grad_norm": 0.46674638986587524, - "learning_rate": 3.506690229739965e-06, - "loss": 0.0543, - "step": 17490 - }, - { - "epoch": 4.4180762433728855, - "grad_norm": 0.6677160263061523, - "learning_rate": 3.4915425397626865e-06, - "loss": 0.0464, - "step": 17500 - }, - { - "epoch": 4.420600858369099, - "grad_norm": 0.3993780314922333, - "learning_rate": 3.4763948497854076e-06, - "loss": 0.0635, - "step": 17510 - }, - { - "epoch": 4.423125473365312, - "grad_norm": 0.44299226999282837, - "learning_rate": 3.4612471598081292e-06, - "loss": 0.0675, - "step": 17520 - }, - { - "epoch": 4.425650088361525, - "grad_norm": 0.47991326451301575, - "learning_rate": 3.4460994698308512e-06, - "loss": 0.0491, - "step": 17530 - }, - { - "epoch": 4.428174703357738, - "grad_norm": 0.5460741519927979, - "learning_rate": 3.4309517798535724e-06, - "loss": 0.0688, - "step": 17540 - }, - { - "epoch": 4.430699318353951, - "grad_norm": 0.5100826621055603, - "learning_rate": 3.415804089876294e-06, - "loss": 0.0686, - "step": 17550 - }, - { - "epoch": 4.433223933350164, - "grad_norm": 0.7981113195419312, - "learning_rate": 3.400656399899015e-06, - "loss": 0.0605, - "step": 17560 - }, - { - "epoch": 4.435748548346377, - "grad_norm": 0.42095330357551575, - "learning_rate": 3.385508709921737e-06, - "loss": 0.0621, - "step": 17570 - }, - { - "epoch": 4.43827316334259, - "grad_norm": 0.4400339722633362, - "learning_rate": 3.3703610199444588e-06, - "loss": 0.0636, - "step": 17580 - }, - { - "epoch": 4.440797778338803, - "grad_norm": 0.4648873805999756, - "learning_rate": 3.35521332996718e-06, - "loss": 0.0746, - "step": 17590 - }, - { - "epoch": 4.443322393335016, - "grad_norm": 0.4564558267593384, - "learning_rate": 3.3400656399899016e-06, - "loss": 0.0546, - "step": 17600 - }, - { - "epoch": 4.44584700833123, - "grad_norm": 0.4136642515659332, - "learning_rate": 3.324917950012623e-06, - "loss": 0.0557, - "step": 17610 - }, - { - "epoch": 4.448371623327443, - "grad_norm": 0.4328581392765045, - "learning_rate": 3.3097702600353447e-06, - "loss": 0.0565, - "step": 17620 - }, - { - "epoch": 4.450896238323656, - "grad_norm": 0.3888933062553406, - "learning_rate": 3.2946225700580663e-06, - "loss": 0.057, - "step": 17630 - }, - { - "epoch": 4.4534208533198685, - "grad_norm": 0.5712131857872009, - "learning_rate": 3.2794748800807875e-06, - "loss": 0.0484, - "step": 17640 - }, - { - "epoch": 4.455945468316082, - "grad_norm": 0.5881834626197815, - "learning_rate": 3.2643271901035095e-06, - "loss": 0.0531, - "step": 17650 - }, - { - "epoch": 4.458470083312295, - "grad_norm": 0.5216571688652039, - "learning_rate": 3.249179500126231e-06, - "loss": 0.0522, - "step": 17660 - }, - { - "epoch": 4.460994698308508, - "grad_norm": 0.5654059648513794, - "learning_rate": 3.2340318101489523e-06, - "loss": 0.0498, - "step": 17670 - }, - { - "epoch": 4.463519313304721, - "grad_norm": 0.6211521625518799, - "learning_rate": 3.218884120171674e-06, - "loss": 0.0546, - "step": 17680 - }, - { - "epoch": 4.466043928300934, - "grad_norm": 0.48394614458084106, - "learning_rate": 3.2037364301943955e-06, - "loss": 0.0588, - "step": 17690 - }, - { - "epoch": 4.468568543297147, - "grad_norm": 0.7053552269935608, - "learning_rate": 3.188588740217117e-06, - "loss": 0.0598, - "step": 17700 - }, - { - "epoch": 4.47109315829336, - "grad_norm": 0.4579329192638397, - "learning_rate": 3.1734410502398387e-06, - "loss": 0.0512, - "step": 17710 - }, - { - "epoch": 4.473617773289574, - "grad_norm": 0.3756571114063263, - "learning_rate": 3.15829336026256e-06, - "loss": 0.0512, - "step": 17720 - }, - { - "epoch": 4.476142388285786, - "grad_norm": 0.3513215482234955, - "learning_rate": 3.1431456702852814e-06, - "loss": 0.0557, - "step": 17730 - }, - { - "epoch": 4.478667003281999, - "grad_norm": 0.5204163193702698, - "learning_rate": 3.127997980308003e-06, - "loss": 0.0799, - "step": 17740 - }, - { - "epoch": 4.4811916182782126, - "grad_norm": 0.5801984071731567, - "learning_rate": 3.1128502903307246e-06, - "loss": 0.0419, - "step": 17750 - }, - { - "epoch": 4.483716233274426, - "grad_norm": 0.6490535140037537, - "learning_rate": 3.0977026003534462e-06, - "loss": 0.0551, - "step": 17760 - }, - { - "epoch": 4.486240848270639, - "grad_norm": 0.5970304012298584, - "learning_rate": 3.0825549103761674e-06, - "loss": 0.061, - "step": 17770 - }, - { - "epoch": 4.488765463266851, - "grad_norm": 0.8191946744918823, - "learning_rate": 3.0674072203988894e-06, - "loss": 0.058, - "step": 17780 - }, - { - "epoch": 4.491290078263065, - "grad_norm": 0.7532091736793518, - "learning_rate": 3.052259530421611e-06, - "loss": 0.0578, - "step": 17790 - }, - { - "epoch": 4.493814693259278, - "grad_norm": 0.6891248226165771, - "learning_rate": 3.037111840444332e-06, - "loss": 0.0776, - "step": 17800 - }, - { - "epoch": 4.496339308255491, - "grad_norm": 0.3589613139629364, - "learning_rate": 3.0219641504670538e-06, - "loss": 0.0578, - "step": 17810 - }, - { - "epoch": 4.4988639232517045, - "grad_norm": 0.4397825598716736, - "learning_rate": 3.0068164604897754e-06, - "loss": 0.0477, - "step": 17820 - }, - { - "epoch": 4.501388538247917, - "grad_norm": 0.6630678772926331, - "learning_rate": 2.991668770512497e-06, - "loss": 0.0621, - "step": 17830 - }, - { - "epoch": 4.50391315324413, - "grad_norm": 0.4310142695903778, - "learning_rate": 2.9765210805352185e-06, - "loss": 0.0486, - "step": 17840 - }, - { - "epoch": 4.506437768240343, - "grad_norm": 0.5123319625854492, - "learning_rate": 2.9613733905579397e-06, - "loss": 0.0419, - "step": 17850 - }, - { - "epoch": 4.508962383236557, - "grad_norm": 0.8451969027519226, - "learning_rate": 2.9462257005806617e-06, - "loss": 0.0636, - "step": 17860 - }, - { - "epoch": 4.51148699823277, - "grad_norm": 0.5869598388671875, - "learning_rate": 2.9310780106033833e-06, - "loss": 0.0592, - "step": 17870 - }, - { - "epoch": 4.514011613228982, - "grad_norm": 0.8282822370529175, - "learning_rate": 2.9159303206261045e-06, - "loss": 0.0614, - "step": 17880 - }, - { - "epoch": 4.5165362282251955, - "grad_norm": 0.5392325520515442, - "learning_rate": 2.900782630648826e-06, - "loss": 0.0537, - "step": 17890 - }, - { - "epoch": 4.519060843221409, - "grad_norm": 0.6844165325164795, - "learning_rate": 2.8856349406715477e-06, - "loss": 0.0628, - "step": 17900 - }, - { - "epoch": 4.521585458217622, - "grad_norm": 0.5177090764045715, - "learning_rate": 2.8704872506942693e-06, - "loss": 0.0536, - "step": 17910 - }, - { - "epoch": 4.524110073213835, - "grad_norm": 0.395877480506897, - "learning_rate": 2.855339560716991e-06, - "loss": 0.0602, - "step": 17920 - }, - { - "epoch": 4.526634688210048, - "grad_norm": 0.30185338854789734, - "learning_rate": 2.840191870739712e-06, - "loss": 0.0624, - "step": 17930 - }, - { - "epoch": 4.529159303206261, - "grad_norm": 0.5236132740974426, - "learning_rate": 2.825044180762434e-06, - "loss": 0.0648, - "step": 17940 - }, - { - "epoch": 4.531683918202474, - "grad_norm": 0.5160847306251526, - "learning_rate": 2.8098964907851552e-06, - "loss": 0.0554, - "step": 17950 - }, - { - "epoch": 4.534208533198687, - "grad_norm": 0.5891533493995667, - "learning_rate": 2.794748800807877e-06, - "loss": 0.065, - "step": 17960 - }, - { - "epoch": 4.5367331481949, - "grad_norm": 0.3929848074913025, - "learning_rate": 2.7796011108305984e-06, - "loss": 0.064, - "step": 17970 - }, - { - "epoch": 4.539257763191113, - "grad_norm": 0.4245711863040924, - "learning_rate": 2.76445342085332e-06, - "loss": 0.0589, - "step": 17980 - }, - { - "epoch": 4.541782378187326, - "grad_norm": 0.3840174376964569, - "learning_rate": 2.7493057308760416e-06, - "loss": 0.0533, - "step": 17990 - }, - { - "epoch": 4.54430699318354, - "grad_norm": 0.5826268196105957, - "learning_rate": 2.734158040898763e-06, - "loss": 0.0574, - "step": 18000 - }, - { - "epoch": 4.546831608179753, - "grad_norm": 0.34730613231658936, - "learning_rate": 2.7190103509214844e-06, - "loss": 0.0567, - "step": 18010 - }, - { - "epoch": 4.549356223175966, - "grad_norm": 0.45773857831954956, - "learning_rate": 2.703862660944206e-06, - "loss": 0.0601, - "step": 18020 - }, - { - "epoch": 4.5518808381721785, - "grad_norm": 0.3971637189388275, - "learning_rate": 2.6887149709669276e-06, - "loss": 0.0551, - "step": 18030 - }, - { - "epoch": 4.554405453168392, - "grad_norm": 0.46903976798057556, - "learning_rate": 2.673567280989649e-06, - "loss": 0.0553, - "step": 18040 - }, - { - "epoch": 4.556930068164605, - "grad_norm": 0.5735597014427185, - "learning_rate": 2.6584195910123708e-06, - "loss": 0.0544, - "step": 18050 - }, - { - "epoch": 4.559454683160818, - "grad_norm": 0.7702049612998962, - "learning_rate": 2.643271901035092e-06, - "loss": 0.0424, - "step": 18060 - }, - { - "epoch": 4.561979298157031, - "grad_norm": 0.7742976546287537, - "learning_rate": 2.628124211057814e-06, - "loss": 0.0625, - "step": 18070 - }, - { - "epoch": 4.564503913153244, - "grad_norm": 0.7160853147506714, - "learning_rate": 2.612976521080535e-06, - "loss": 0.058, - "step": 18080 - }, - { - "epoch": 4.567028528149457, - "grad_norm": 0.23403172194957733, - "learning_rate": 2.5978288311032567e-06, - "loss": 0.0477, - "step": 18090 - }, - { - "epoch": 4.56955314314567, - "grad_norm": 0.25679340958595276, - "learning_rate": 2.5826811411259783e-06, - "loss": 0.0578, - "step": 18100 - }, - { - "epoch": 4.572077758141884, - "grad_norm": 0.6108934879302979, - "learning_rate": 2.5675334511487e-06, - "loss": 0.0567, - "step": 18110 - }, - { - "epoch": 4.574602373138097, - "grad_norm": 0.570832371711731, - "learning_rate": 2.5523857611714215e-06, - "loss": 0.0503, - "step": 18120 - }, - { - "epoch": 4.577126988134309, - "grad_norm": 0.49613040685653687, - "learning_rate": 2.537238071194143e-06, - "loss": 0.0552, - "step": 18130 - }, - { - "epoch": 4.5796516031305226, - "grad_norm": 0.43599942326545715, - "learning_rate": 2.5220903812168643e-06, - "loss": 0.0736, - "step": 18140 - }, - { - "epoch": 4.582176218126736, - "grad_norm": 0.5823941230773926, - "learning_rate": 2.5069426912395863e-06, - "loss": 0.0452, - "step": 18150 - }, - { - "epoch": 4.584700833122949, - "grad_norm": 0.6966807842254639, - "learning_rate": 2.4917950012623075e-06, - "loss": 0.0567, - "step": 18160 - }, - { - "epoch": 4.587225448119161, - "grad_norm": 0.9933467507362366, - "learning_rate": 2.476647311285029e-06, - "loss": 0.0621, - "step": 18170 - }, - { - "epoch": 4.589750063115375, - "grad_norm": 0.44380226731300354, - "learning_rate": 2.4614996213077506e-06, - "loss": 0.0607, - "step": 18180 - }, - { - "epoch": 4.592274678111588, - "grad_norm": 0.3192310631275177, - "learning_rate": 2.4463519313304722e-06, - "loss": 0.0601, - "step": 18190 - }, - { - "epoch": 4.594799293107801, - "grad_norm": 0.5151782035827637, - "learning_rate": 2.431204241353194e-06, - "loss": 0.0441, - "step": 18200 - }, - { - "epoch": 4.5973239081040145, - "grad_norm": 0.8137912154197693, - "learning_rate": 2.4160565513759154e-06, - "loss": 0.0708, - "step": 18210 - }, - { - "epoch": 4.599848523100227, - "grad_norm": 0.4802444875240326, - "learning_rate": 2.4009088613986366e-06, - "loss": 0.0553, - "step": 18220 - }, - { - "epoch": 4.60237313809644, - "grad_norm": 0.41935741901397705, - "learning_rate": 2.3857611714213586e-06, - "loss": 0.057, - "step": 18230 - }, - { - "epoch": 4.604897753092653, - "grad_norm": 0.42669227719306946, - "learning_rate": 2.3706134814440798e-06, - "loss": 0.0671, - "step": 18240 - }, - { - "epoch": 4.607422368088867, - "grad_norm": 0.5261390209197998, - "learning_rate": 2.3554657914668014e-06, - "loss": 0.0561, - "step": 18250 - }, - { - "epoch": 4.60994698308508, - "grad_norm": 0.495779424905777, - "learning_rate": 2.340318101489523e-06, - "loss": 0.059, - "step": 18260 - }, - { - "epoch": 4.612471598081292, - "grad_norm": 0.5515862107276917, - "learning_rate": 2.325170411512244e-06, - "loss": 0.061, - "step": 18270 - }, - { - "epoch": 4.6149962130775055, - "grad_norm": 0.8136048913002014, - "learning_rate": 2.310022721534966e-06, - "loss": 0.0572, - "step": 18280 - }, - { - "epoch": 4.617520828073719, - "grad_norm": 0.393250972032547, - "learning_rate": 2.2948750315576873e-06, - "loss": 0.0523, - "step": 18290 - }, - { - "epoch": 4.620045443069932, - "grad_norm": 0.5420840978622437, - "learning_rate": 2.279727341580409e-06, - "loss": 0.0605, - "step": 18300 - }, - { - "epoch": 4.622570058066145, - "grad_norm": 0.5676819086074829, - "learning_rate": 2.2645796516031305e-06, - "loss": 0.0523, - "step": 18310 - }, - { - "epoch": 4.625094673062358, - "grad_norm": 0.36500880122184753, - "learning_rate": 2.249431961625852e-06, - "loss": 0.0562, - "step": 18320 - }, - { - "epoch": 4.627619288058571, - "grad_norm": 0.5303543210029602, - "learning_rate": 2.2342842716485737e-06, - "loss": 0.0474, - "step": 18330 - }, - { - "epoch": 4.630143903054784, - "grad_norm": 0.4387858510017395, - "learning_rate": 2.2191365816712953e-06, - "loss": 0.0438, - "step": 18340 - }, - { - "epoch": 4.632668518050997, - "grad_norm": 0.2990294098854065, - "learning_rate": 2.2039888916940165e-06, - "loss": 0.0447, - "step": 18350 - }, - { - "epoch": 4.63519313304721, - "grad_norm": 0.37644967436790466, - "learning_rate": 2.1888412017167385e-06, - "loss": 0.049, - "step": 18360 - }, - { - "epoch": 4.637717748043423, - "grad_norm": 0.5996664762496948, - "learning_rate": 2.1736935117394597e-06, - "loss": 0.0618, - "step": 18370 - }, - { - "epoch": 4.640242363039636, - "grad_norm": 0.5396886467933655, - "learning_rate": 2.1585458217621813e-06, - "loss": 0.054, - "step": 18380 - }, - { - "epoch": 4.64276697803585, - "grad_norm": 0.5311841368675232, - "learning_rate": 2.143398131784903e-06, - "loss": 0.059, - "step": 18390 - }, - { - "epoch": 4.645291593032063, - "grad_norm": 0.6080347299575806, - "learning_rate": 2.1282504418076244e-06, - "loss": 0.063, - "step": 18400 - }, - { - "epoch": 4.647816208028276, - "grad_norm": 0.720029354095459, - "learning_rate": 2.113102751830346e-06, - "loss": 0.0674, - "step": 18410 - }, - { - "epoch": 4.6503408230244885, - "grad_norm": 0.26142123341560364, - "learning_rate": 2.0979550618530672e-06, - "loss": 0.043, - "step": 18420 - }, - { - "epoch": 4.652865438020702, - "grad_norm": 0.8284344673156738, - "learning_rate": 2.082807371875789e-06, - "loss": 0.0632, - "step": 18430 - }, - { - "epoch": 4.655390053016915, - "grad_norm": 0.5045512914657593, - "learning_rate": 2.067659681898511e-06, - "loss": 0.0558, - "step": 18440 - }, - { - "epoch": 4.657914668013128, - "grad_norm": 0.3474113345146179, - "learning_rate": 2.052511991921232e-06, - "loss": 0.0484, - "step": 18450 - }, - { - "epoch": 4.660439283009341, - "grad_norm": 0.4987110197544098, - "learning_rate": 2.0373643019439536e-06, - "loss": 0.0542, - "step": 18460 - }, - { - "epoch": 4.662963898005554, - "grad_norm": 0.48412591218948364, - "learning_rate": 2.022216611966675e-06, - "loss": 0.0585, - "step": 18470 - }, - { - "epoch": 4.665488513001767, - "grad_norm": 0.48798561096191406, - "learning_rate": 2.0070689219893968e-06, - "loss": 0.0512, - "step": 18480 - }, - { - "epoch": 4.66801312799798, - "grad_norm": 0.3808564245700836, - "learning_rate": 1.9919212320121184e-06, - "loss": 0.0607, - "step": 18490 - }, - { - "epoch": 4.670537742994194, - "grad_norm": 0.3918999135494232, - "learning_rate": 1.9767735420348395e-06, - "loss": 0.061, - "step": 18500 - }, - { - "epoch": 4.673062357990407, - "grad_norm": 0.7011407017707825, - "learning_rate": 1.961625852057561e-06, - "loss": 0.0621, - "step": 18510 - }, - { - "epoch": 4.675586972986619, - "grad_norm": 0.31626319885253906, - "learning_rate": 1.946478162080283e-06, - "loss": 0.0564, - "step": 18520 - }, - { - "epoch": 4.6781115879828326, - "grad_norm": 0.5955636501312256, - "learning_rate": 1.9313304721030043e-06, - "loss": 0.0473, - "step": 18530 - }, - { - "epoch": 4.680636202979046, - "grad_norm": 0.50102698802948, - "learning_rate": 1.916182782125726e-06, - "loss": 0.0585, - "step": 18540 - }, - { - "epoch": 4.683160817975259, - "grad_norm": 0.46877047419548035, - "learning_rate": 1.9010350921484475e-06, - "loss": 0.0519, - "step": 18550 - }, - { - "epoch": 4.685685432971471, - "grad_norm": 0.45812228322029114, - "learning_rate": 1.885887402171169e-06, - "loss": 0.0466, - "step": 18560 - }, - { - "epoch": 4.688210047967685, - "grad_norm": 0.8704932332038879, - "learning_rate": 1.8707397121938903e-06, - "loss": 0.0538, - "step": 18570 - }, - { - "epoch": 4.690734662963898, - "grad_norm": 0.61441969871521, - "learning_rate": 1.855592022216612e-06, - "loss": 0.0577, - "step": 18580 - }, - { - "epoch": 4.693259277960111, - "grad_norm": 0.32448869943618774, - "learning_rate": 1.8404443322393335e-06, - "loss": 0.0501, - "step": 18590 - }, - { - "epoch": 4.6957838929563245, - "grad_norm": 0.5979995727539062, - "learning_rate": 1.825296642262055e-06, - "loss": 0.0502, - "step": 18600 - }, - { - "epoch": 4.698308507952537, - "grad_norm": 0.38927385210990906, - "learning_rate": 1.8101489522847765e-06, - "loss": 0.0562, - "step": 18610 - }, - { - "epoch": 4.70083312294875, - "grad_norm": 0.6443197131156921, - "learning_rate": 1.7950012623074983e-06, - "loss": 0.0485, - "step": 18620 - }, - { - "epoch": 4.703357737944963, - "grad_norm": 0.5753923654556274, - "learning_rate": 1.7798535723302196e-06, - "loss": 0.0455, - "step": 18630 - }, - { - "epoch": 4.705882352941177, - "grad_norm": 0.5932863354682922, - "learning_rate": 1.7647058823529412e-06, - "loss": 0.0651, - "step": 18640 - }, - { - "epoch": 4.70840696793739, - "grad_norm": 0.3984706401824951, - "learning_rate": 1.7495581923756626e-06, - "loss": 0.0575, - "step": 18650 - }, - { - "epoch": 4.710931582933602, - "grad_norm": 0.5167907476425171, - "learning_rate": 1.7344105023983844e-06, - "loss": 0.0639, - "step": 18660 - }, - { - "epoch": 4.7134561979298155, - "grad_norm": 0.5363221764564514, - "learning_rate": 1.7192628124211058e-06, - "loss": 0.0418, - "step": 18670 - }, - { - "epoch": 4.715980812926029, - "grad_norm": 0.584365963935852, - "learning_rate": 1.7041151224438274e-06, - "loss": 0.0477, - "step": 18680 - }, - { - "epoch": 4.718505427922242, - "grad_norm": 0.2417188286781311, - "learning_rate": 1.6889674324665488e-06, - "loss": 0.0543, - "step": 18690 - }, - { - "epoch": 4.721030042918455, - "grad_norm": 0.5733222365379333, - "learning_rate": 1.6738197424892706e-06, - "loss": 0.0684, - "step": 18700 - }, - { - "epoch": 4.723554657914668, - "grad_norm": 0.7107726335525513, - "learning_rate": 1.658672052511992e-06, - "loss": 0.0543, - "step": 18710 - }, - { - "epoch": 4.726079272910881, - "grad_norm": 0.7579614520072937, - "learning_rate": 1.6435243625347136e-06, - "loss": 0.0507, - "step": 18720 - }, - { - "epoch": 4.728603887907094, - "grad_norm": 0.4801480174064636, - "learning_rate": 1.628376672557435e-06, - "loss": 0.0611, - "step": 18730 - }, - { - "epoch": 4.731128502903307, - "grad_norm": 0.5679193139076233, - "learning_rate": 1.6132289825801565e-06, - "loss": 0.0555, - "step": 18740 - }, - { - "epoch": 4.733653117899521, - "grad_norm": 0.4143296480178833, - "learning_rate": 1.5980812926028781e-06, - "loss": 0.0564, - "step": 18750 - }, - { - "epoch": 4.736177732895733, - "grad_norm": 0.5309060215950012, - "learning_rate": 1.5829336026255997e-06, - "loss": 0.047, - "step": 18760 - }, - { - "epoch": 4.738702347891946, - "grad_norm": 0.49305611848831177, - "learning_rate": 1.5677859126483211e-06, - "loss": 0.053, - "step": 18770 - }, - { - "epoch": 4.74122696288816, - "grad_norm": 0.5996381044387817, - "learning_rate": 1.5526382226710427e-06, - "loss": 0.0659, - "step": 18780 - }, - { - "epoch": 4.743751577884373, - "grad_norm": 0.6321601271629333, - "learning_rate": 1.5374905326937643e-06, - "loss": 0.0447, - "step": 18790 - }, - { - "epoch": 4.746276192880586, - "grad_norm": 0.7180649638175964, - "learning_rate": 1.522342842716486e-06, - "loss": 0.0489, - "step": 18800 - }, - { - "epoch": 4.7488008078767985, - "grad_norm": 0.40703126788139343, - "learning_rate": 1.5071951527392073e-06, - "loss": 0.0584, - "step": 18810 - }, - { - "epoch": 4.751325422873012, - "grad_norm": 0.5110107064247131, - "learning_rate": 1.4920474627619289e-06, - "loss": 0.0486, - "step": 18820 - }, - { - "epoch": 4.753850037869225, - "grad_norm": 0.5644401907920837, - "learning_rate": 1.4768997727846505e-06, - "loss": 0.0562, - "step": 18830 - }, - { - "epoch": 4.756374652865438, - "grad_norm": 0.5056483745574951, - "learning_rate": 1.4617520828073719e-06, - "loss": 0.0705, - "step": 18840 - }, - { - "epoch": 4.758899267861651, - "grad_norm": 0.48723912239074707, - "learning_rate": 1.4466043928300934e-06, - "loss": 0.0561, - "step": 18850 - }, - { - "epoch": 4.761423882857864, - "grad_norm": 0.47025883197784424, - "learning_rate": 1.4314567028528148e-06, - "loss": 0.0602, - "step": 18860 - }, - { - "epoch": 4.763948497854077, - "grad_norm": 0.3964708745479584, - "learning_rate": 1.4163090128755366e-06, - "loss": 0.0634, - "step": 18870 - }, - { - "epoch": 4.76647311285029, - "grad_norm": 0.7490302920341492, - "learning_rate": 1.401161322898258e-06, - "loss": 0.0648, - "step": 18880 - }, - { - "epoch": 4.768997727846504, - "grad_norm": 0.3066612184047699, - "learning_rate": 1.3860136329209796e-06, - "loss": 0.063, - "step": 18890 - }, - { - "epoch": 4.771522342842717, - "grad_norm": 0.4892236590385437, - "learning_rate": 1.370865942943701e-06, - "loss": 0.0498, - "step": 18900 - }, - { - "epoch": 4.774046957838929, - "grad_norm": 0.5352323651313782, - "learning_rate": 1.3557182529664226e-06, - "loss": 0.0514, - "step": 18910 - }, - { - "epoch": 4.7765715728351426, - "grad_norm": 0.6631128191947937, - "learning_rate": 1.3405705629891442e-06, - "loss": 0.0567, - "step": 18920 - }, - { - "epoch": 4.779096187831356, - "grad_norm": 0.49421215057373047, - "learning_rate": 1.3254228730118658e-06, - "loss": 0.0556, - "step": 18930 - }, - { - "epoch": 4.781620802827569, - "grad_norm": 0.36637288331985474, - "learning_rate": 1.3102751830345872e-06, - "loss": 0.0569, - "step": 18940 - }, - { - "epoch": 4.784145417823781, - "grad_norm": 0.32764676213264465, - "learning_rate": 1.2951274930573088e-06, - "loss": 0.0584, - "step": 18950 - }, - { - "epoch": 4.786670032819995, - "grad_norm": 0.7302456498146057, - "learning_rate": 1.2799798030800304e-06, - "loss": 0.0535, - "step": 18960 - }, - { - "epoch": 4.789194647816208, - "grad_norm": 0.5171737670898438, - "learning_rate": 1.264832113102752e-06, - "loss": 0.0524, - "step": 18970 - }, - { - "epoch": 4.791719262812421, - "grad_norm": 0.6158316135406494, - "learning_rate": 1.2496844231254733e-06, - "loss": 0.0626, - "step": 18980 - }, - { - "epoch": 4.7942438778086345, - "grad_norm": 0.5882306694984436, - "learning_rate": 1.234536733148195e-06, - "loss": 0.0542, - "step": 18990 - }, - { - "epoch": 4.796768492804848, - "grad_norm": 0.6305384039878845, - "learning_rate": 1.2193890431709165e-06, - "loss": 0.0576, - "step": 19000 - }, - { - "epoch": 4.79929310780106, - "grad_norm": 0.46403953433036804, - "learning_rate": 1.2042413531936381e-06, - "loss": 0.0453, - "step": 19010 - }, - { - "epoch": 4.801817722797273, - "grad_norm": 0.6074075698852539, - "learning_rate": 1.1890936632163595e-06, - "loss": 0.0682, - "step": 19020 - }, - { - "epoch": 4.804342337793487, - "grad_norm": 0.43722423911094666, - "learning_rate": 1.173945973239081e-06, - "loss": 0.0542, - "step": 19030 - }, - { - "epoch": 4.8068669527897, - "grad_norm": 0.35191863775253296, - "learning_rate": 1.1587982832618027e-06, - "loss": 0.0639, - "step": 19040 - }, - { - "epoch": 4.809391567785912, - "grad_norm": 0.6911765336990356, - "learning_rate": 1.1436505932845243e-06, - "loss": 0.0553, - "step": 19050 - }, - { - "epoch": 4.8119161827821255, - "grad_norm": 0.22319677472114563, - "learning_rate": 1.1285029033072457e-06, - "loss": 0.066, - "step": 19060 - }, - { - "epoch": 4.814440797778339, - "grad_norm": 0.520487904548645, - "learning_rate": 1.1133552133299673e-06, - "loss": 0.0596, - "step": 19070 - }, - { - "epoch": 4.816965412774552, - "grad_norm": 0.40240347385406494, - "learning_rate": 1.0982075233526886e-06, - "loss": 0.0618, - "step": 19080 - }, - { - "epoch": 4.819490027770765, - "grad_norm": 0.6012730598449707, - "learning_rate": 1.0830598333754104e-06, - "loss": 0.0401, - "step": 19090 - }, - { - "epoch": 4.822014642766978, - "grad_norm": 0.6411862373352051, - "learning_rate": 1.0679121433981318e-06, - "loss": 0.0569, - "step": 19100 - }, - { - "epoch": 4.824539257763191, - "grad_norm": 0.7546837329864502, - "learning_rate": 1.0527644534208532e-06, - "loss": 0.0619, - "step": 19110 - }, - { - "epoch": 4.827063872759404, - "grad_norm": 0.5956974625587463, - "learning_rate": 1.0376167634435748e-06, - "loss": 0.0651, - "step": 19120 - }, - { - "epoch": 4.829588487755617, - "grad_norm": 0.41826269030570984, - "learning_rate": 1.0224690734662964e-06, - "loss": 0.0459, - "step": 19130 - }, - { - "epoch": 4.832113102751831, - "grad_norm": 0.39252969622612, - "learning_rate": 1.007321383489018e-06, - "loss": 0.0549, - "step": 19140 - }, - { - "epoch": 4.834637717748043, - "grad_norm": 0.45689401030540466, - "learning_rate": 9.921736935117394e-07, - "loss": 0.048, - "step": 19150 - }, - { - "epoch": 4.837162332744256, - "grad_norm": 0.47611868381500244, - "learning_rate": 9.77026003534461e-07, - "loss": 0.0472, - "step": 19160 - }, - { - "epoch": 4.83968694774047, - "grad_norm": 0.5146605968475342, - "learning_rate": 9.618783135571826e-07, - "loss": 0.0537, - "step": 19170 - }, - { - "epoch": 4.842211562736683, - "grad_norm": 0.5189658999443054, - "learning_rate": 9.467306235799042e-07, - "loss": 0.0682, - "step": 19180 - }, - { - "epoch": 4.844736177732896, - "grad_norm": 0.37280428409576416, - "learning_rate": 9.315829336026256e-07, - "loss": 0.0581, - "step": 19190 - }, - { - "epoch": 4.8472607927291085, - "grad_norm": 0.5254796743392944, - "learning_rate": 9.164352436253472e-07, - "loss": 0.0653, - "step": 19200 - }, - { - "epoch": 4.849785407725322, - "grad_norm": 0.5634022951126099, - "learning_rate": 9.012875536480687e-07, - "loss": 0.0641, - "step": 19210 - }, - { - "epoch": 4.852310022721535, - "grad_norm": 0.6558578014373779, - "learning_rate": 8.861398636707903e-07, - "loss": 0.0665, - "step": 19220 - }, - { - "epoch": 4.854834637717748, - "grad_norm": 0.7143204808235168, - "learning_rate": 8.709921736935118e-07, - "loss": 0.0615, - "step": 19230 - }, - { - "epoch": 4.857359252713961, - "grad_norm": 0.40588563680648804, - "learning_rate": 8.558444837162333e-07, - "loss": 0.059, - "step": 19240 - }, - { - "epoch": 4.859883867710174, - "grad_norm": 0.5825479030609131, - "learning_rate": 8.406967937389548e-07, - "loss": 0.0575, - "step": 19250 - }, - { - "epoch": 4.862408482706387, - "grad_norm": 0.6735103726387024, - "learning_rate": 8.255491037616763e-07, - "loss": 0.0667, - "step": 19260 - }, - { - "epoch": 4.8649330977026, - "grad_norm": 0.5344639420509338, - "learning_rate": 8.104014137843979e-07, - "loss": 0.0563, - "step": 19270 - }, - { - "epoch": 4.867457712698814, - "grad_norm": 0.611815869808197, - "learning_rate": 7.952537238071194e-07, - "loss": 0.0589, - "step": 19280 - }, - { - "epoch": 4.869982327695027, - "grad_norm": 0.5727031826972961, - "learning_rate": 7.80106033829841e-07, - "loss": 0.0601, - "step": 19290 - }, - { - "epoch": 4.872506942691239, - "grad_norm": 0.39414718747138977, - "learning_rate": 7.649583438525624e-07, - "loss": 0.0542, - "step": 19300 - }, - { - "epoch": 4.8750315576874526, - "grad_norm": 0.49244511127471924, - "learning_rate": 7.49810653875284e-07, - "loss": 0.0598, - "step": 19310 - }, - { - "epoch": 4.877556172683666, - "grad_norm": 0.5638169050216675, - "learning_rate": 7.346629638980055e-07, - "loss": 0.0537, - "step": 19320 - }, - { - "epoch": 4.880080787679879, - "grad_norm": 0.4944647550582886, - "learning_rate": 7.195152739207271e-07, - "loss": 0.0515, - "step": 19330 - }, - { - "epoch": 4.882605402676091, - "grad_norm": 0.847815215587616, - "learning_rate": 7.043675839434486e-07, - "loss": 0.0653, - "step": 19340 - }, - { - "epoch": 4.885130017672305, - "grad_norm": 0.7950305938720703, - "learning_rate": 6.892198939661702e-07, - "loss": 0.057, - "step": 19350 - }, - { - "epoch": 4.887654632668518, - "grad_norm": 0.680915892124176, - "learning_rate": 6.740722039888917e-07, - "loss": 0.0554, - "step": 19360 - }, - { - "epoch": 4.890179247664731, - "grad_norm": 0.42906680703163147, - "learning_rate": 6.589245140116133e-07, - "loss": 0.0535, - "step": 19370 - }, - { - "epoch": 4.8927038626609445, - "grad_norm": 0.872386634349823, - "learning_rate": 6.437768240343348e-07, - "loss": 0.0622, - "step": 19380 - }, - { - "epoch": 4.895228477657158, - "grad_norm": 0.619981586933136, - "learning_rate": 6.286291340570563e-07, - "loss": 0.0672, - "step": 19390 - }, - { - "epoch": 4.89775309265337, - "grad_norm": 0.538330614566803, - "learning_rate": 6.134814440797779e-07, - "loss": 0.0525, - "step": 19400 - }, - { - "epoch": 4.900277707649583, - "grad_norm": 0.4021759033203125, - "learning_rate": 5.983337541024993e-07, - "loss": 0.0546, - "step": 19410 - }, - { - "epoch": 4.902802322645797, - "grad_norm": 0.6232868432998657, - "learning_rate": 5.831860641252209e-07, - "loss": 0.0581, - "step": 19420 - }, - { - "epoch": 4.90532693764201, - "grad_norm": 0.6456800699234009, - "learning_rate": 5.680383741479424e-07, - "loss": 0.0545, - "step": 19430 - }, - { - "epoch": 4.907851552638222, - "grad_norm": 0.5507019758224487, - "learning_rate": 5.52890684170664e-07, - "loss": 0.0576, - "step": 19440 - }, - { - "epoch": 4.9103761676344355, - "grad_norm": 0.2918814718723297, - "learning_rate": 5.377429941933855e-07, - "loss": 0.0527, - "step": 19450 - }, - { - "epoch": 4.912900782630649, - "grad_norm": 0.35016146302223206, - "learning_rate": 5.225953042161071e-07, - "loss": 0.0636, - "step": 19460 - }, - { - "epoch": 4.915425397626862, - "grad_norm": 0.5368366837501526, - "learning_rate": 5.074476142388286e-07, - "loss": 0.0569, - "step": 19470 - }, - { - "epoch": 4.917950012623075, - "grad_norm": 0.5466439723968506, - "learning_rate": 4.922999242615502e-07, - "loss": 0.0508, - "step": 19480 - }, - { - "epoch": 4.9204746276192886, - "grad_norm": 0.6173040270805359, - "learning_rate": 4.771522342842717e-07, - "loss": 0.0504, - "step": 19490 - }, - { - "epoch": 4.922999242615501, - "grad_norm": 0.28498920798301697, - "learning_rate": 4.6200454430699317e-07, - "loss": 0.0572, - "step": 19500 - }, - { - "epoch": 4.925523857611714, - "grad_norm": 0.7897679209709167, - "learning_rate": 4.468568543297147e-07, - "loss": 0.053, - "step": 19510 - }, - { - "epoch": 4.928048472607927, - "grad_norm": 0.4405366778373718, - "learning_rate": 4.3170916435243625e-07, - "loss": 0.051, - "step": 19520 - }, - { - "epoch": 4.930573087604141, - "grad_norm": 0.7264717221260071, - "learning_rate": 4.165614743751578e-07, - "loss": 0.0535, - "step": 19530 - }, - { - "epoch": 4.933097702600353, - "grad_norm": 0.47195565700531006, - "learning_rate": 4.0141378439787934e-07, - "loss": 0.0416, - "step": 19540 - }, - { - "epoch": 4.935622317596566, - "grad_norm": 0.4767369031906128, - "learning_rate": 3.862660944206009e-07, - "loss": 0.049, - "step": 19550 - }, - { - "epoch": 4.93814693259278, - "grad_norm": 0.5228800177574158, - "learning_rate": 3.711184044433224e-07, - "loss": 0.0568, - "step": 19560 - }, - { - "epoch": 4.940671547588993, - "grad_norm": 0.5455029010772705, - "learning_rate": 3.5597071446604396e-07, - "loss": 0.061, - "step": 19570 - }, - { - "epoch": 4.943196162585206, - "grad_norm": 0.4548329710960388, - "learning_rate": 3.408230244887655e-07, - "loss": 0.0543, - "step": 19580 - }, - { - "epoch": 4.9457207775814185, - "grad_norm": 0.41128185391426086, - "learning_rate": 3.2567533451148704e-07, - "loss": 0.0568, - "step": 19590 - }, - { - "epoch": 4.948245392577632, - "grad_norm": 0.3675704598426819, - "learning_rate": 3.1052764453420853e-07, - "loss": 0.0646, - "step": 19600 - }, - { - "epoch": 4.950770007573845, - "grad_norm": 0.49481600522994995, - "learning_rate": 2.9537995455693007e-07, - "loss": 0.0659, - "step": 19610 - }, - { - "epoch": 4.953294622570058, - "grad_norm": 0.3610905706882477, - "learning_rate": 2.802322645796516e-07, - "loss": 0.0606, - "step": 19620 - }, - { - "epoch": 4.9558192375662715, - "grad_norm": 0.4303690493106842, - "learning_rate": 2.6508457460237316e-07, - "loss": 0.0485, - "step": 19630 - }, - { - "epoch": 4.958343852562484, - "grad_norm": 0.4692881405353546, - "learning_rate": 2.4993688462509464e-07, - "loss": 0.0552, - "step": 19640 - }, - { - "epoch": 4.960868467558697, - "grad_norm": 0.7063325047492981, - "learning_rate": 2.3478919464781619e-07, - "loss": 0.0614, - "step": 19650 - }, - { - "epoch": 4.96339308255491, - "grad_norm": 0.6039048433303833, - "learning_rate": 2.1964150467053775e-07, - "loss": 0.0729, - "step": 19660 - }, - { - "epoch": 4.965917697551124, - "grad_norm": 0.38355937600135803, - "learning_rate": 2.044938146932593e-07, - "loss": 0.054, - "step": 19670 - }, - { - "epoch": 4.968442312547337, - "grad_norm": 0.7297325134277344, - "learning_rate": 1.893461247159808e-07, - "loss": 0.057, - "step": 19680 - }, - { - "epoch": 4.970966927543549, - "grad_norm": 0.618418276309967, - "learning_rate": 1.7419843473870235e-07, - "loss": 0.0533, - "step": 19690 - }, - { - "epoch": 4.9734915425397626, - "grad_norm": 0.44941627979278564, - "learning_rate": 1.5905074476142387e-07, - "loss": 0.0626, - "step": 19700 - }, - { - "epoch": 4.976016157535976, - "grad_norm": 0.5745902061462402, - "learning_rate": 1.439030547841454e-07, - "loss": 0.0528, - "step": 19710 - }, - { - "epoch": 4.978540772532189, - "grad_norm": 0.6372010707855225, - "learning_rate": 1.2875536480686695e-07, - "loss": 0.0548, - "step": 19720 - }, - { - "epoch": 4.9810653875284014, - "grad_norm": 0.5590953826904297, - "learning_rate": 1.1360767482958849e-07, - "loss": 0.0515, - "step": 19730 - }, - { - "epoch": 4.983590002524615, - "grad_norm": 0.3603893518447876, - "learning_rate": 9.845998485231003e-08, - "loss": 0.057, - "step": 19740 - }, - { - "epoch": 4.986114617520828, - "grad_norm": 0.42396554350852966, - "learning_rate": 8.331229487503156e-08, - "loss": 0.0522, - "step": 19750 - }, - { - "epoch": 4.988639232517041, - "grad_norm": 0.6315743327140808, - "learning_rate": 6.816460489775309e-08, - "loss": 0.0475, - "step": 19760 - }, - { - "epoch": 4.9911638475132545, - "grad_norm": 0.535829484462738, - "learning_rate": 5.301691492047463e-08, - "loss": 0.0602, - "step": 19770 - }, - { - "epoch": 4.993688462509468, - "grad_norm": 0.706295371055603, - "learning_rate": 3.786922494319616e-08, - "loss": 0.0476, - "step": 19780 - }, - { - "epoch": 4.99621307750568, - "grad_norm": 0.5887550711631775, - "learning_rate": 2.2721534965917698e-08, - "loss": 0.0555, - "step": 19790 - }, - { - "epoch": 4.998737692501893, - "grad_norm": 0.2900368273258209, - "learning_rate": 7.573844988639233e-09, - "loss": 0.0532, - "step": 19800 - }, - { - "epoch": 5.0, - "eval_f1": 0.9705180789481339, - "eval_loss": 0.04290741682052612, - "eval_runtime": 1160.2076, - "eval_samples_per_second": 177.78, - "eval_steps_per_second": 2.778, - "step": 19805 - }, - { - "epoch": 5.0, - "step": 19805, - "total_flos": 9.82001462664467e+19, - "train_loss": 0.0, - "train_runtime": 0.0658, - "train_samples_per_second": 19271134.706, - "train_steps_per_second": 301163.752 - } - ], - "logging_steps": 10, - "max_steps": 19805, - "num_input_tokens_seen": 0, - "num_train_epochs": 5, - "save_steps": 500, - "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, @@ -13949,8 +7019,8 @@ "attributes": {} } }, - "total_flos": 9.82001462664467e+19, - "train_batch_size": 64, + "total_flos": 9.82152667464321e+19, + "train_batch_size": 128, "trial_name": null, "trial_params": null }