diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,3920 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9989187240944313, + "eval_steps": 500, + "global_step": 5547, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005406379527842855, + "grad_norm": 5.67321238470604, + "learning_rate": 1.801801801801802e-07, + "loss": 0.8785, + "step": 10 + }, + { + "epoch": 0.01081275905568571, + "grad_norm": 5.2575759647356906, + "learning_rate": 3.603603603603604e-07, + "loss": 0.8654, + "step": 20 + }, + { + "epoch": 0.016219138583528563, + "grad_norm": 3.8360253130807958, + "learning_rate": 5.405405405405406e-07, + "loss": 0.8205, + "step": 30 + }, + { + "epoch": 0.02162551811137142, + "grad_norm": 1.722668988638544, + "learning_rate": 7.207207207207208e-07, + "loss": 0.778, + "step": 40 + }, + { + "epoch": 0.027031897639214274, + "grad_norm": 1.3197714991034968, + "learning_rate": 9.00900900900901e-07, + "loss": 0.7286, + "step": 50 + }, + { + "epoch": 0.03243827716705713, + "grad_norm": 0.8474482237034886, + "learning_rate": 1.0810810810810812e-06, + "loss": 0.6968, + "step": 60 + }, + { + "epoch": 0.03784465669489998, + "grad_norm": 0.5645420283585227, + "learning_rate": 1.2612612612612613e-06, + "loss": 0.6689, + "step": 70 + }, + { + "epoch": 0.04325103622274284, + "grad_norm": 0.43605656948964683, + "learning_rate": 1.4414414414414416e-06, + "loss": 0.6408, + "step": 80 + }, + { + "epoch": 0.04865741575058569, + "grad_norm": 0.4339497028480959, + "learning_rate": 1.6216216216216219e-06, + "loss": 0.6153, + "step": 90 + }, + { + "epoch": 0.05406379527842855, + "grad_norm": 0.3843592033040236, + "learning_rate": 1.801801801801802e-06, + "loss": 0.6082, + "step": 100 + }, + { + "epoch": 0.0594701748062714, + "grad_norm": 0.37685068673558353, + "learning_rate": 1.9819819819819822e-06, + "loss": 0.6049, + "step": 110 + }, + { + "epoch": 0.06487655433411425, + "grad_norm": 0.4392453448959536, + "learning_rate": 2.1621621621621623e-06, + "loss": 0.5889, + "step": 120 + }, + { + "epoch": 0.07028293386195711, + "grad_norm": 0.4212233804351266, + "learning_rate": 2.3423423423423424e-06, + "loss": 0.5842, + "step": 130 + }, + { + "epoch": 0.07568931338979996, + "grad_norm": 0.38709432000579613, + "learning_rate": 2.5225225225225225e-06, + "loss": 0.592, + "step": 140 + }, + { + "epoch": 0.08109569291764282, + "grad_norm": 0.3988233764060424, + "learning_rate": 2.702702702702703e-06, + "loss": 0.5732, + "step": 150 + }, + { + "epoch": 0.08650207244548568, + "grad_norm": 0.41395637177292804, + "learning_rate": 2.882882882882883e-06, + "loss": 0.5679, + "step": 160 + }, + { + "epoch": 0.09190845197332853, + "grad_norm": 0.37677030114794524, + "learning_rate": 3.063063063063063e-06, + "loss": 0.5583, + "step": 170 + }, + { + "epoch": 0.09731483150117139, + "grad_norm": 0.38451911721974225, + "learning_rate": 3.2432432432432437e-06, + "loss": 0.5658, + "step": 180 + }, + { + "epoch": 0.10272121102901424, + "grad_norm": 0.36190379869625294, + "learning_rate": 3.423423423423424e-06, + "loss": 0.5554, + "step": 190 + }, + { + "epoch": 0.1081275905568571, + "grad_norm": 0.3927866832932917, + "learning_rate": 3.603603603603604e-06, + "loss": 0.5534, + "step": 200 + }, + { + "epoch": 0.11353397008469994, + "grad_norm": 0.4109637951464883, + "learning_rate": 3.7837837837837844e-06, + "loss": 0.5527, + "step": 210 + }, + { + "epoch": 0.1189403496125428, + "grad_norm": 0.4189875109517182, + "learning_rate": 3.9639639639639645e-06, + "loss": 0.5521, + "step": 220 + }, + { + "epoch": 0.12434672914038565, + "grad_norm": 0.44103289873218365, + "learning_rate": 4.1441441441441446e-06, + "loss": 0.55, + "step": 230 + }, + { + "epoch": 0.1297531086682285, + "grad_norm": 0.47624121719255225, + "learning_rate": 4.324324324324325e-06, + "loss": 0.5455, + "step": 240 + }, + { + "epoch": 0.13515948819607138, + "grad_norm": 0.4127382950104387, + "learning_rate": 4.504504504504505e-06, + "loss": 0.5392, + "step": 250 + }, + { + "epoch": 0.14056586772391422, + "grad_norm": 0.42849081039324655, + "learning_rate": 4.684684684684685e-06, + "loss": 0.5317, + "step": 260 + }, + { + "epoch": 0.1459722472517571, + "grad_norm": 0.4104060308344588, + "learning_rate": 4.864864864864866e-06, + "loss": 0.5317, + "step": 270 + }, + { + "epoch": 0.15137862677959993, + "grad_norm": 0.5046982359974199, + "learning_rate": 5.045045045045045e-06, + "loss": 0.5342, + "step": 280 + }, + { + "epoch": 0.15678500630744277, + "grad_norm": 0.4507880118410215, + "learning_rate": 5.225225225225226e-06, + "loss": 0.5325, + "step": 290 + }, + { + "epoch": 0.16219138583528564, + "grad_norm": 0.42877102726223915, + "learning_rate": 5.405405405405406e-06, + "loss": 0.5236, + "step": 300 + }, + { + "epoch": 0.16759776536312848, + "grad_norm": 0.5283894117116334, + "learning_rate": 5.585585585585585e-06, + "loss": 0.5316, + "step": 310 + }, + { + "epoch": 0.17300414489097135, + "grad_norm": 0.45448942603717846, + "learning_rate": 5.765765765765766e-06, + "loss": 0.5304, + "step": 320 + }, + { + "epoch": 0.1784105244188142, + "grad_norm": 0.4459611601163911, + "learning_rate": 5.945945945945947e-06, + "loss": 0.5307, + "step": 330 + }, + { + "epoch": 0.18381690394665706, + "grad_norm": 0.4167802385045301, + "learning_rate": 6.126126126126126e-06, + "loss": 0.5142, + "step": 340 + }, + { + "epoch": 0.1892232834744999, + "grad_norm": 0.45167071134408077, + "learning_rate": 6.3063063063063065e-06, + "loss": 0.5252, + "step": 350 + }, + { + "epoch": 0.19462966300234277, + "grad_norm": 0.3815004250489287, + "learning_rate": 6.486486486486487e-06, + "loss": 0.5203, + "step": 360 + }, + { + "epoch": 0.20003604253018561, + "grad_norm": 0.4189611440474181, + "learning_rate": 6.666666666666667e-06, + "loss": 0.5198, + "step": 370 + }, + { + "epoch": 0.20544242205802848, + "grad_norm": 0.4356383135556994, + "learning_rate": 6.846846846846848e-06, + "loss": 0.5164, + "step": 380 + }, + { + "epoch": 0.21084880158587133, + "grad_norm": 0.4146665581812368, + "learning_rate": 7.027027027027028e-06, + "loss": 0.5201, + "step": 390 + }, + { + "epoch": 0.2162551811137142, + "grad_norm": 0.46098403607909094, + "learning_rate": 7.207207207207208e-06, + "loss": 0.5241, + "step": 400 + }, + { + "epoch": 0.22166156064155704, + "grad_norm": 0.4173832279688485, + "learning_rate": 7.387387387387388e-06, + "loss": 0.5141, + "step": 410 + }, + { + "epoch": 0.22706794016939988, + "grad_norm": 0.45342411753034784, + "learning_rate": 7.567567567567569e-06, + "loss": 0.5058, + "step": 420 + }, + { + "epoch": 0.23247431969724275, + "grad_norm": 0.5556218847582134, + "learning_rate": 7.747747747747749e-06, + "loss": 0.5132, + "step": 430 + }, + { + "epoch": 0.2378806992250856, + "grad_norm": 0.4159604294450067, + "learning_rate": 7.927927927927929e-06, + "loss": 0.5116, + "step": 440 + }, + { + "epoch": 0.24328707875292846, + "grad_norm": 0.5011827344554423, + "learning_rate": 8.108108108108109e-06, + "loss": 0.5168, + "step": 450 + }, + { + "epoch": 0.2486934582807713, + "grad_norm": 0.4837033851909487, + "learning_rate": 8.288288288288289e-06, + "loss": 0.5078, + "step": 460 + }, + { + "epoch": 0.25409983780861417, + "grad_norm": 0.43704376571990733, + "learning_rate": 8.46846846846847e-06, + "loss": 0.5033, + "step": 470 + }, + { + "epoch": 0.259506217336457, + "grad_norm": 0.3998543920237395, + "learning_rate": 8.64864864864865e-06, + "loss": 0.5023, + "step": 480 + }, + { + "epoch": 0.26491259686429985, + "grad_norm": 0.5026204387708488, + "learning_rate": 8.82882882882883e-06, + "loss": 0.5101, + "step": 490 + }, + { + "epoch": 0.27031897639214275, + "grad_norm": 0.5354755864920291, + "learning_rate": 9.00900900900901e-06, + "loss": 0.508, + "step": 500 + }, + { + "epoch": 0.2757253559199856, + "grad_norm": 0.4703091181508223, + "learning_rate": 9.189189189189191e-06, + "loss": 0.5057, + "step": 510 + }, + { + "epoch": 0.28113173544782843, + "grad_norm": 0.5066877793509437, + "learning_rate": 9.36936936936937e-06, + "loss": 0.5026, + "step": 520 + }, + { + "epoch": 0.2865381149756713, + "grad_norm": 0.46090960041448786, + "learning_rate": 9.54954954954955e-06, + "loss": 0.5106, + "step": 530 + }, + { + "epoch": 0.2919444945035142, + "grad_norm": 0.48562395925030005, + "learning_rate": 9.729729729729732e-06, + "loss": 0.4974, + "step": 540 + }, + { + "epoch": 0.297350874031357, + "grad_norm": 0.4646077201771921, + "learning_rate": 9.90990990990991e-06, + "loss": 0.4999, + "step": 550 + }, + { + "epoch": 0.30275725355919986, + "grad_norm": 0.4546070354869126, + "learning_rate": 9.999975246862685e-06, + "loss": 0.5103, + "step": 560 + }, + { + "epoch": 0.3081636330870427, + "grad_norm": 0.4529892857679444, + "learning_rate": 9.999777223234682e-06, + "loss": 0.5015, + "step": 570 + }, + { + "epoch": 0.31357001261488554, + "grad_norm": 0.42533238661448763, + "learning_rate": 9.999381183821387e-06, + "loss": 0.5079, + "step": 580 + }, + { + "epoch": 0.31897639214272844, + "grad_norm": 0.4319966793689572, + "learning_rate": 9.998787144307906e-06, + "loss": 0.4946, + "step": 590 + }, + { + "epoch": 0.3243827716705713, + "grad_norm": 0.5664739889982127, + "learning_rate": 9.997995128221131e-06, + "loss": 0.4963, + "step": 600 + }, + { + "epoch": 0.3297891511984141, + "grad_norm": 0.4571640893613164, + "learning_rate": 9.9970051669288e-06, + "loss": 0.4937, + "step": 610 + }, + { + "epoch": 0.33519553072625696, + "grad_norm": 0.46148944851299945, + "learning_rate": 9.995817299638244e-06, + "loss": 0.5002, + "step": 620 + }, + { + "epoch": 0.34060191025409986, + "grad_norm": 0.4844168889608816, + "learning_rate": 9.994431573394861e-06, + "loss": 0.5029, + "step": 630 + }, + { + "epoch": 0.3460082897819427, + "grad_norm": 0.4279693386473206, + "learning_rate": 9.99284804308023e-06, + "loss": 0.4952, + "step": 640 + }, + { + "epoch": 0.35141466930978554, + "grad_norm": 0.5233101609153901, + "learning_rate": 9.991066771409941e-06, + "loss": 0.4915, + "step": 650 + }, + { + "epoch": 0.3568210488376284, + "grad_norm": 0.4633208414221673, + "learning_rate": 9.989087828931121e-06, + "loss": 0.4981, + "step": 660 + }, + { + "epoch": 0.3622274283654713, + "grad_norm": 0.450997223108701, + "learning_rate": 9.986911294019631e-06, + "loss": 0.4975, + "step": 670 + }, + { + "epoch": 0.3676338078933141, + "grad_norm": 0.42452529740346523, + "learning_rate": 9.984537252876969e-06, + "loss": 0.4908, + "step": 680 + }, + { + "epoch": 0.37304018742115697, + "grad_norm": 0.46365207035760786, + "learning_rate": 9.981965799526846e-06, + "loss": 0.5016, + "step": 690 + }, + { + "epoch": 0.3784465669489998, + "grad_norm": 0.5296232726547591, + "learning_rate": 9.97919703581147e-06, + "loss": 0.4876, + "step": 700 + }, + { + "epoch": 0.38385294647684265, + "grad_norm": 0.401880074927354, + "learning_rate": 9.976231071387513e-06, + "loss": 0.4903, + "step": 710 + }, + { + "epoch": 0.38925932600468555, + "grad_norm": 0.42396559048043103, + "learning_rate": 9.973068023721761e-06, + "loss": 0.4898, + "step": 720 + }, + { + "epoch": 0.3946657055325284, + "grad_norm": 0.46944427807049693, + "learning_rate": 9.969708018086472e-06, + "loss": 0.4881, + "step": 730 + }, + { + "epoch": 0.40007208506037123, + "grad_norm": 0.4333253518146232, + "learning_rate": 9.966151187554403e-06, + "loss": 0.4895, + "step": 740 + }, + { + "epoch": 0.40547846458821407, + "grad_norm": 0.37661719489991125, + "learning_rate": 9.962397672993552e-06, + "loss": 0.487, + "step": 750 + }, + { + "epoch": 0.41088484411605697, + "grad_norm": 0.4603392631171023, + "learning_rate": 9.958447623061564e-06, + "loss": 0.4872, + "step": 760 + }, + { + "epoch": 0.4162912236438998, + "grad_norm": 0.3927558003883759, + "learning_rate": 9.954301194199864e-06, + "loss": 0.4903, + "step": 770 + }, + { + "epoch": 0.42169760317174265, + "grad_norm": 0.42897879593990096, + "learning_rate": 9.949958550627436e-06, + "loss": 0.4885, + "step": 780 + }, + { + "epoch": 0.4271039826995855, + "grad_norm": 0.4924374446694773, + "learning_rate": 9.945419864334344e-06, + "loss": 0.4774, + "step": 790 + }, + { + "epoch": 0.4325103622274284, + "grad_norm": 0.42518945879483444, + "learning_rate": 9.940685315074898e-06, + "loss": 0.4754, + "step": 800 + }, + { + "epoch": 0.43791674175527123, + "grad_norm": 0.399260485682431, + "learning_rate": 9.935755090360554e-06, + "loss": 0.4765, + "step": 810 + }, + { + "epoch": 0.4433231212831141, + "grad_norm": 0.37083672732602235, + "learning_rate": 9.930629385452475e-06, + "loss": 0.4757, + "step": 820 + }, + { + "epoch": 0.4487295008109569, + "grad_norm": 0.41759222116367195, + "learning_rate": 9.925308403353801e-06, + "loss": 0.4871, + "step": 830 + }, + { + "epoch": 0.45413588033879976, + "grad_norm": 0.4969932090759188, + "learning_rate": 9.919792354801614e-06, + "loss": 0.4792, + "step": 840 + }, + { + "epoch": 0.45954225986664266, + "grad_norm": 0.5029960802938596, + "learning_rate": 9.914081458258582e-06, + "loss": 0.4896, + "step": 850 + }, + { + "epoch": 0.4649486393944855, + "grad_norm": 0.40244747307174517, + "learning_rate": 9.908175939904317e-06, + "loss": 0.492, + "step": 860 + }, + { + "epoch": 0.47035501892232834, + "grad_norm": 0.4109529990790928, + "learning_rate": 9.902076033626409e-06, + "loss": 0.4863, + "step": 870 + }, + { + "epoch": 0.4757613984501712, + "grad_norm": 0.4151789891424962, + "learning_rate": 9.89578198101117e-06, + "loss": 0.48, + "step": 880 + }, + { + "epoch": 0.4811677779780141, + "grad_norm": 0.4884869421566706, + "learning_rate": 9.88929403133406e-06, + "loss": 0.4875, + "step": 890 + }, + { + "epoch": 0.4865741575058569, + "grad_norm": 0.39469839728031286, + "learning_rate": 9.882612441549817e-06, + "loss": 0.4886, + "step": 900 + }, + { + "epoch": 0.49198053703369976, + "grad_norm": 0.41142281651530643, + "learning_rate": 9.875737476282283e-06, + "loss": 0.4837, + "step": 910 + }, + { + "epoch": 0.4973869165615426, + "grad_norm": 0.4420691443729092, + "learning_rate": 9.868669407813919e-06, + "loss": 0.4877, + "step": 920 + }, + { + "epoch": 0.5027932960893855, + "grad_norm": 0.37836126000922937, + "learning_rate": 9.86140851607502e-06, + "loss": 0.4826, + "step": 930 + }, + { + "epoch": 0.5081996756172283, + "grad_norm": 0.42066137745562854, + "learning_rate": 9.85395508863264e-06, + "loss": 0.4827, + "step": 940 + }, + { + "epoch": 0.5136060551450712, + "grad_norm": 0.45522508321704436, + "learning_rate": 9.846309420679181e-06, + "loss": 0.4807, + "step": 950 + }, + { + "epoch": 0.519012434672914, + "grad_norm": 0.424109403832704, + "learning_rate": 9.838471815020731e-06, + "loss": 0.483, + "step": 960 + }, + { + "epoch": 0.5244188142007569, + "grad_norm": 0.4571075574503357, + "learning_rate": 9.830442582065046e-06, + "loss": 0.4847, + "step": 970 + }, + { + "epoch": 0.5298251937285997, + "grad_norm": 0.39544147521974715, + "learning_rate": 9.822222039809265e-06, + "loss": 0.4894, + "step": 980 + }, + { + "epoch": 0.5352315732564425, + "grad_norm": 0.41512982878770877, + "learning_rate": 9.813810513827324e-06, + "loss": 0.4757, + "step": 990 + }, + { + "epoch": 0.5406379527842855, + "grad_norm": 0.44241530882704766, + "learning_rate": 9.805208337257048e-06, + "loss": 0.4844, + "step": 1000 + }, + { + "epoch": 0.5460443323121283, + "grad_norm": 0.39829234416158904, + "learning_rate": 9.79641585078697e-06, + "loss": 0.4712, + "step": 1010 + }, + { + "epoch": 0.5514507118399712, + "grad_norm": 0.37741532471866907, + "learning_rate": 9.787433402642823e-06, + "loss": 0.4793, + "step": 1020 + }, + { + "epoch": 0.556857091367814, + "grad_norm": 0.4148300916885638, + "learning_rate": 9.778261348573766e-06, + "loss": 0.4838, + "step": 1030 + }, + { + "epoch": 0.5622634708956569, + "grad_norm": 0.4432803310345476, + "learning_rate": 9.76890005183828e-06, + "loss": 0.4808, + "step": 1040 + }, + { + "epoch": 0.5676698504234997, + "grad_norm": 0.44053440283249773, + "learning_rate": 9.759349883189788e-06, + "loss": 0.4855, + "step": 1050 + }, + { + "epoch": 0.5730762299513426, + "grad_norm": 0.47129417304470445, + "learning_rate": 9.749611220861975e-06, + "loss": 0.4825, + "step": 1060 + }, + { + "epoch": 0.5784826094791854, + "grad_norm": 0.3519052622952217, + "learning_rate": 9.739684450553796e-06, + "loss": 0.4672, + "step": 1070 + }, + { + "epoch": 0.5838889890070283, + "grad_norm": 0.41946435282373756, + "learning_rate": 9.729569965414214e-06, + "loss": 0.4749, + "step": 1080 + }, + { + "epoch": 0.5892953685348712, + "grad_norm": 0.40367405116733107, + "learning_rate": 9.719268166026619e-06, + "loss": 0.4714, + "step": 1090 + }, + { + "epoch": 0.594701748062714, + "grad_norm": 0.389163994716956, + "learning_rate": 9.70877946039297e-06, + "loss": 0.4762, + "step": 1100 + }, + { + "epoch": 0.6001081275905569, + "grad_norm": 0.3924144038563765, + "learning_rate": 9.698104263917632e-06, + "loss": 0.479, + "step": 1110 + }, + { + "epoch": 0.6055145071183997, + "grad_norm": 0.38077440580004723, + "learning_rate": 9.687242999390923e-06, + "loss": 0.4743, + "step": 1120 + }, + { + "epoch": 0.6109208866462426, + "grad_norm": 0.4144915670436874, + "learning_rate": 9.676196096972375e-06, + "loss": 0.4831, + "step": 1130 + }, + { + "epoch": 0.6163272661740854, + "grad_norm": 0.4019523099418982, + "learning_rate": 9.664963994173695e-06, + "loss": 0.4811, + "step": 1140 + }, + { + "epoch": 0.6217336457019282, + "grad_norm": 0.3870772083799463, + "learning_rate": 9.653547135841432e-06, + "loss": 0.482, + "step": 1150 + }, + { + "epoch": 0.6271400252297711, + "grad_norm": 0.3774486403943126, + "learning_rate": 9.641945974139368e-06, + "loss": 0.4808, + "step": 1160 + }, + { + "epoch": 0.632546404757614, + "grad_norm": 0.3669418201630717, + "learning_rate": 9.630160968530601e-06, + "loss": 0.4742, + "step": 1170 + }, + { + "epoch": 0.6379527842854569, + "grad_norm": 0.3767330377559856, + "learning_rate": 9.618192585759358e-06, + "loss": 0.4793, + "step": 1180 + }, + { + "epoch": 0.6433591638132997, + "grad_norm": 0.4109728050110914, + "learning_rate": 9.606041299832499e-06, + "loss": 0.476, + "step": 1190 + }, + { + "epoch": 0.6487655433411426, + "grad_norm": 0.42214280261521075, + "learning_rate": 9.593707592000751e-06, + "loss": 0.4719, + "step": 1200 + }, + { + "epoch": 0.6541719228689854, + "grad_norm": 0.40015675805718526, + "learning_rate": 9.581191950739651e-06, + "loss": 0.4802, + "step": 1210 + }, + { + "epoch": 0.6595783023968282, + "grad_norm": 0.3652325798758447, + "learning_rate": 9.568494871730184e-06, + "loss": 0.4751, + "step": 1220 + }, + { + "epoch": 0.6649846819246711, + "grad_norm": 0.4758040665812572, + "learning_rate": 9.555616857839171e-06, + "loss": 0.476, + "step": 1230 + }, + { + "epoch": 0.6703910614525139, + "grad_norm": 0.4088256926011169, + "learning_rate": 9.542558419099348e-06, + "loss": 0.4671, + "step": 1240 + }, + { + "epoch": 0.6757974409803568, + "grad_norm": 0.3777516778350075, + "learning_rate": 9.529320072689157e-06, + "loss": 0.4663, + "step": 1250 + }, + { + "epoch": 0.6812038205081997, + "grad_norm": 0.40279858714603456, + "learning_rate": 9.515902342912268e-06, + "loss": 0.4696, + "step": 1260 + }, + { + "epoch": 0.6866102000360426, + "grad_norm": 0.4553420901856075, + "learning_rate": 9.50230576117682e-06, + "loss": 0.4742, + "step": 1270 + }, + { + "epoch": 0.6920165795638854, + "grad_norm": 0.4339586123054069, + "learning_rate": 9.488530865974365e-06, + "loss": 0.4701, + "step": 1280 + }, + { + "epoch": 0.6974229590917282, + "grad_norm": 0.4249972919470697, + "learning_rate": 9.47457820285855e-06, + "loss": 0.4701, + "step": 1290 + }, + { + "epoch": 0.7028293386195711, + "grad_norm": 0.5108244833979698, + "learning_rate": 9.460448324423508e-06, + "loss": 0.4767, + "step": 1300 + }, + { + "epoch": 0.7082357181474139, + "grad_norm": 0.41029950466124815, + "learning_rate": 9.446141790281961e-06, + "loss": 0.4757, + "step": 1310 + }, + { + "epoch": 0.7136420976752568, + "grad_norm": 0.395665406767247, + "learning_rate": 9.431659167043079e-06, + "loss": 0.4657, + "step": 1320 + }, + { + "epoch": 0.7190484772030996, + "grad_norm": 0.3916187354896928, + "learning_rate": 9.417001028290019e-06, + "loss": 0.47, + "step": 1330 + }, + { + "epoch": 0.7244548567309426, + "grad_norm": 0.3841663885450239, + "learning_rate": 9.402167954557218e-06, + "loss": 0.4622, + "step": 1340 + }, + { + "epoch": 0.7298612362587854, + "grad_norm": 0.33000158409293234, + "learning_rate": 9.387160533307398e-06, + "loss": 0.4735, + "step": 1350 + }, + { + "epoch": 0.7352676157866282, + "grad_norm": 0.35110054752545317, + "learning_rate": 9.371979358908302e-06, + "loss": 0.4647, + "step": 1360 + }, + { + "epoch": 0.7406739953144711, + "grad_norm": 0.4060026085740451, + "learning_rate": 9.356625032609157e-06, + "loss": 0.4716, + "step": 1370 + }, + { + "epoch": 0.7460803748423139, + "grad_norm": 0.4014001214789219, + "learning_rate": 9.341098162516848e-06, + "loss": 0.4753, + "step": 1380 + }, + { + "epoch": 0.7514867543701568, + "grad_norm": 0.4466537387424745, + "learning_rate": 9.325399363571853e-06, + "loss": 0.4637, + "step": 1390 + }, + { + "epoch": 0.7568931338979996, + "grad_norm": 0.3789496760613153, + "learning_rate": 9.309529257523873e-06, + "loss": 0.4833, + "step": 1400 + }, + { + "epoch": 0.7622995134258425, + "grad_norm": 0.3871711262176569, + "learning_rate": 9.293488472907213e-06, + "loss": 0.4741, + "step": 1410 + }, + { + "epoch": 0.7677058929536853, + "grad_norm": 0.33522935773230744, + "learning_rate": 9.277277645015895e-06, + "loss": 0.4645, + "step": 1420 + }, + { + "epoch": 0.7731122724815283, + "grad_norm": 0.36926574454217775, + "learning_rate": 9.260897415878484e-06, + "loss": 0.4737, + "step": 1430 + }, + { + "epoch": 0.7785186520093711, + "grad_norm": 0.38628683202935965, + "learning_rate": 9.244348434232676e-06, + "loss": 0.4807, + "step": 1440 + }, + { + "epoch": 0.7839250315372139, + "grad_norm": 0.3723802508008121, + "learning_rate": 9.227631355499588e-06, + "loss": 0.4711, + "step": 1450 + }, + { + "epoch": 0.7893314110650568, + "grad_norm": 0.43275316141725356, + "learning_rate": 9.210746841757816e-06, + "loss": 0.4606, + "step": 1460 + }, + { + "epoch": 0.7947377905928996, + "grad_norm": 0.36470233384616396, + "learning_rate": 9.193695561717207e-06, + "loss": 0.4789, + "step": 1470 + }, + { + "epoch": 0.8001441701207425, + "grad_norm": 0.39548085338311784, + "learning_rate": 9.176478190692369e-06, + "loss": 0.4713, + "step": 1480 + }, + { + "epoch": 0.8055505496485853, + "grad_norm": 0.3553750033222167, + "learning_rate": 9.159095410575931e-06, + "loss": 0.4725, + "step": 1490 + }, + { + "epoch": 0.8109569291764281, + "grad_norm": 0.3637209745858356, + "learning_rate": 9.14154790981154e-06, + "loss": 0.4594, + "step": 1500 + }, + { + "epoch": 0.816363308704271, + "grad_norm": 0.3827679215177506, + "learning_rate": 9.12383638336659e-06, + "loss": 0.4731, + "step": 1510 + }, + { + "epoch": 0.8217696882321139, + "grad_norm": 0.3932319357502074, + "learning_rate": 9.105961532704695e-06, + "loss": 0.4744, + "step": 1520 + }, + { + "epoch": 0.8271760677599568, + "grad_norm": 0.37420610924572006, + "learning_rate": 9.08792406575792e-06, + "loss": 0.4596, + "step": 1530 + }, + { + "epoch": 0.8325824472877996, + "grad_norm": 0.36958869694379687, + "learning_rate": 9.069724696898727e-06, + "loss": 0.4644, + "step": 1540 + }, + { + "epoch": 0.8379888268156425, + "grad_norm": 0.4296266126218128, + "learning_rate": 9.051364146911696e-06, + "loss": 0.4695, + "step": 1550 + }, + { + "epoch": 0.8433952063434853, + "grad_norm": 0.3552866307907092, + "learning_rate": 9.03284314296497e-06, + "loss": 0.4699, + "step": 1560 + }, + { + "epoch": 0.8488015858713281, + "grad_norm": 0.36327016829544306, + "learning_rate": 9.01416241858146e-06, + "loss": 0.4669, + "step": 1570 + }, + { + "epoch": 0.854207965399171, + "grad_norm": 0.375420429355353, + "learning_rate": 8.995322713609792e-06, + "loss": 0.4672, + "step": 1580 + }, + { + "epoch": 0.8596143449270138, + "grad_norm": 0.5173900256611019, + "learning_rate": 8.976324774195005e-06, + "loss": 0.4683, + "step": 1590 + }, + { + "epoch": 0.8650207244548568, + "grad_norm": 0.39427484151317893, + "learning_rate": 8.957169352749005e-06, + "loss": 0.4652, + "step": 1600 + }, + { + "epoch": 0.8704271039826996, + "grad_norm": 0.4127231026821577, + "learning_rate": 8.937857207920751e-06, + "loss": 0.4693, + "step": 1610 + }, + { + "epoch": 0.8758334835105425, + "grad_norm": 0.3557084122875894, + "learning_rate": 8.918389104566232e-06, + "loss": 0.4653, + "step": 1620 + }, + { + "epoch": 0.8812398630383853, + "grad_norm": 0.32279027303173025, + "learning_rate": 8.898765813718155e-06, + "loss": 0.4575, + "step": 1630 + }, + { + "epoch": 0.8866462425662281, + "grad_norm": 0.3597815860403744, + "learning_rate": 8.878988112555415e-06, + "loss": 0.4635, + "step": 1640 + }, + { + "epoch": 0.892052622094071, + "grad_norm": 0.3672011391559523, + "learning_rate": 8.85905678437232e-06, + "loss": 0.4637, + "step": 1650 + }, + { + "epoch": 0.8974590016219138, + "grad_norm": 0.39802107641409196, + "learning_rate": 8.838972618547561e-06, + "loss": 0.4668, + "step": 1660 + }, + { + "epoch": 0.9028653811497567, + "grad_norm": 0.35901725656975336, + "learning_rate": 8.81873641051295e-06, + "loss": 0.4626, + "step": 1670 + }, + { + "epoch": 0.9082717606775995, + "grad_norm": 0.45574284613082794, + "learning_rate": 8.798348961721925e-06, + "loss": 0.4618, + "step": 1680 + }, + { + "epoch": 0.9136781402054425, + "grad_norm": 0.33960849857370073, + "learning_rate": 8.777811079617793e-06, + "loss": 0.4735, + "step": 1690 + }, + { + "epoch": 0.9190845197332853, + "grad_norm": 0.36806947123886746, + "learning_rate": 8.757123577601771e-06, + "loss": 0.4642, + "step": 1700 + }, + { + "epoch": 0.9244908992611282, + "grad_norm": 0.36728162811734544, + "learning_rate": 8.736287275000755e-06, + "loss": 0.465, + "step": 1710 + }, + { + "epoch": 0.929897278788971, + "grad_norm": 0.38164336488797146, + "learning_rate": 8.715302997034876e-06, + "loss": 0.4702, + "step": 1720 + }, + { + "epoch": 0.9353036583168138, + "grad_norm": 0.34605322849280384, + "learning_rate": 8.694171574784818e-06, + "loss": 0.4674, + "step": 1730 + }, + { + "epoch": 0.9407100378446567, + "grad_norm": 0.3353439147558085, + "learning_rate": 8.672893845158908e-06, + "loss": 0.4701, + "step": 1740 + }, + { + "epoch": 0.9461164173724995, + "grad_norm": 0.3437002297587831, + "learning_rate": 8.651470650859955e-06, + "loss": 0.4599, + "step": 1750 + }, + { + "epoch": 0.9515227969003424, + "grad_norm": 0.3431363969879203, + "learning_rate": 8.629902840351898e-06, + "loss": 0.4637, + "step": 1760 + }, + { + "epoch": 0.9569291764281853, + "grad_norm": 0.3765462141591892, + "learning_rate": 8.608191267826179e-06, + "loss": 0.4694, + "step": 1770 + }, + { + "epoch": 0.9623355559560282, + "grad_norm": 0.420048049416004, + "learning_rate": 8.586336793167926e-06, + "loss": 0.4641, + "step": 1780 + }, + { + "epoch": 0.967741935483871, + "grad_norm": 0.412279889648995, + "learning_rate": 8.5643402819219e-06, + "loss": 0.4566, + "step": 1790 + }, + { + "epoch": 0.9731483150117138, + "grad_norm": 0.3299568555620076, + "learning_rate": 8.542202605258204e-06, + "loss": 0.463, + "step": 1800 + }, + { + "epoch": 0.9785546945395567, + "grad_norm": 0.32198105439404867, + "learning_rate": 8.519924639937786e-06, + "loss": 0.4617, + "step": 1810 + }, + { + "epoch": 0.9839610740673995, + "grad_norm": 0.3549245136848414, + "learning_rate": 8.49750726827772e-06, + "loss": 0.4565, + "step": 1820 + }, + { + "epoch": 0.9893674535952424, + "grad_norm": 0.3392271575380573, + "learning_rate": 8.474951378116253e-06, + "loss": 0.4639, + "step": 1830 + }, + { + "epoch": 0.9947738331230852, + "grad_norm": 0.3208227345701, + "learning_rate": 8.452257862777653e-06, + "loss": 0.4546, + "step": 1840 + }, + { + "epoch": 1.000180212650928, + "grad_norm": 0.4559641919273857, + "learning_rate": 8.42942762103681e-06, + "loss": 0.4837, + "step": 1850 + }, + { + "epoch": 1.005586592178771, + "grad_norm": 0.3598410288175877, + "learning_rate": 8.406461557083666e-06, + "loss": 0.4404, + "step": 1860 + }, + { + "epoch": 1.0109929717066137, + "grad_norm": 0.3857145460836866, + "learning_rate": 8.383360580487378e-06, + "loss": 0.4393, + "step": 1870 + }, + { + "epoch": 1.0163993512344567, + "grad_norm": 0.34505752597289024, + "learning_rate": 8.360125606160323e-06, + "loss": 0.4422, + "step": 1880 + }, + { + "epoch": 1.0218057307622994, + "grad_norm": 0.3739277339941646, + "learning_rate": 8.336757554321832e-06, + "loss": 0.4424, + "step": 1890 + }, + { + "epoch": 1.0272121102901424, + "grad_norm": 0.3968787668713752, + "learning_rate": 8.313257350461774e-06, + "loss": 0.4376, + "step": 1900 + }, + { + "epoch": 1.0326184898179853, + "grad_norm": 0.3451897271410753, + "learning_rate": 8.289625925303877e-06, + "loss": 0.4425, + "step": 1910 + }, + { + "epoch": 1.038024869345828, + "grad_norm": 0.40010047495902706, + "learning_rate": 8.265864214768883e-06, + "loss": 0.4503, + "step": 1920 + }, + { + "epoch": 1.043431248873671, + "grad_norm": 0.3736188460908676, + "learning_rate": 8.241973159937482e-06, + "loss": 0.4406, + "step": 1930 + }, + { + "epoch": 1.0488376284015137, + "grad_norm": 0.3394542766186862, + "learning_rate": 8.217953707013025e-06, + "loss": 0.4393, + "step": 1940 + }, + { + "epoch": 1.0542440079293567, + "grad_norm": 0.35077872709329283, + "learning_rate": 8.193806807284064e-06, + "loss": 0.4383, + "step": 1950 + }, + { + "epoch": 1.0596503874571994, + "grad_norm": 0.3441941331677373, + "learning_rate": 8.169533417086673e-06, + "loss": 0.4286, + "step": 1960 + }, + { + "epoch": 1.0650567669850424, + "grad_norm": 0.34884852607611294, + "learning_rate": 8.145134497766566e-06, + "loss": 0.4467, + "step": 1970 + }, + { + "epoch": 1.070463146512885, + "grad_norm": 0.40097746242132437, + "learning_rate": 8.120611015641036e-06, + "loss": 0.4363, + "step": 1980 + }, + { + "epoch": 1.075869526040728, + "grad_norm": 0.33184835023647064, + "learning_rate": 8.095963941960667e-06, + "loss": 0.437, + "step": 1990 + }, + { + "epoch": 1.081275905568571, + "grad_norm": 0.394546885758411, + "learning_rate": 8.071194252870887e-06, + "loss": 0.432, + "step": 2000 + }, + { + "epoch": 1.0866822850964137, + "grad_norm": 0.472784994513626, + "learning_rate": 8.046302929373286e-06, + "loss": 0.4367, + "step": 2010 + }, + { + "epoch": 1.0920886646242567, + "grad_norm": 0.3602670786653786, + "learning_rate": 8.021290957286787e-06, + "loss": 0.4352, + "step": 2020 + }, + { + "epoch": 1.0974950441520994, + "grad_norm": 0.3963387130392289, + "learning_rate": 7.996159327208581e-06, + "loss": 0.4434, + "step": 2030 + }, + { + "epoch": 1.1029014236799424, + "grad_norm": 0.37403782295160953, + "learning_rate": 7.97090903447491e-06, + "loss": 0.4326, + "step": 2040 + }, + { + "epoch": 1.108307803207785, + "grad_norm": 0.37350913921356577, + "learning_rate": 7.945541079121642e-06, + "loss": 0.4485, + "step": 2050 + }, + { + "epoch": 1.113714182735628, + "grad_norm": 0.3661212920976343, + "learning_rate": 7.920056465844658e-06, + "loss": 0.4328, + "step": 2060 + }, + { + "epoch": 1.119120562263471, + "grad_norm": 0.3507951321263283, + "learning_rate": 7.894456203960075e-06, + "loss": 0.4339, + "step": 2070 + }, + { + "epoch": 1.1245269417913137, + "grad_norm": 0.31935101139873434, + "learning_rate": 7.868741307364255e-06, + "loss": 0.4307, + "step": 2080 + }, + { + "epoch": 1.1299333213191567, + "grad_norm": 0.3240469373544592, + "learning_rate": 7.842912794493667e-06, + "loss": 0.4357, + "step": 2090 + }, + { + "epoch": 1.1353397008469994, + "grad_norm": 0.4024576218630106, + "learning_rate": 7.81697168828454e-06, + "loss": 0.4429, + "step": 2100 + }, + { + "epoch": 1.1407460803748424, + "grad_norm": 0.4057186928939639, + "learning_rate": 7.790919016132351e-06, + "loss": 0.4435, + "step": 2110 + }, + { + "epoch": 1.146152459902685, + "grad_norm": 0.4339123108369387, + "learning_rate": 7.764755809851141e-06, + "loss": 0.4375, + "step": 2120 + }, + { + "epoch": 1.151558839430528, + "grad_norm": 0.3423301493159426, + "learning_rate": 7.738483105632644e-06, + "loss": 0.4408, + "step": 2130 + }, + { + "epoch": 1.1569652189583708, + "grad_norm": 0.3049599421413694, + "learning_rate": 7.712101944005256e-06, + "loss": 0.442, + "step": 2140 + }, + { + "epoch": 1.1623715984862137, + "grad_norm": 0.3235699906736669, + "learning_rate": 7.685613369792815e-06, + "loss": 0.4389, + "step": 2150 + }, + { + "epoch": 1.1677779780140565, + "grad_norm": 0.38824198475727123, + "learning_rate": 7.65901843207323e-06, + "loss": 0.4372, + "step": 2160 + }, + { + "epoch": 1.1731843575418994, + "grad_norm": 0.3485465278129701, + "learning_rate": 7.63231818413692e-06, + "loss": 0.4313, + "step": 2170 + }, + { + "epoch": 1.1785907370697424, + "grad_norm": 0.3607061695090595, + "learning_rate": 7.605513683445118e-06, + "loss": 0.433, + "step": 2180 + }, + { + "epoch": 1.183997116597585, + "grad_norm": 0.35864049794241826, + "learning_rate": 7.578605991587974e-06, + "loss": 0.43, + "step": 2190 + }, + { + "epoch": 1.189403496125428, + "grad_norm": 0.3622129404816991, + "learning_rate": 7.5515961742425146e-06, + "loss": 0.4357, + "step": 2200 + }, + { + "epoch": 1.1948098756532708, + "grad_norm": 0.37719764002603634, + "learning_rate": 7.524485301130443e-06, + "loss": 0.4363, + "step": 2210 + }, + { + "epoch": 1.2002162551811137, + "grad_norm": 0.32038054153975193, + "learning_rate": 7.497274445975762e-06, + "loss": 0.4283, + "step": 2220 + }, + { + "epoch": 1.2056226347089565, + "grad_norm": 0.3897896894072551, + "learning_rate": 7.469964686462261e-06, + "loss": 0.4416, + "step": 2230 + }, + { + "epoch": 1.2110290142367994, + "grad_norm": 0.32144151391797593, + "learning_rate": 7.4425571041908254e-06, + "loss": 0.4388, + "step": 2240 + }, + { + "epoch": 1.2164353937646424, + "grad_norm": 0.3553047783046372, + "learning_rate": 7.415052784636603e-06, + "loss": 0.4401, + "step": 2250 + }, + { + "epoch": 1.2218417732924851, + "grad_norm": 0.31787401750902194, + "learning_rate": 7.387452817106017e-06, + "loss": 0.4313, + "step": 2260 + }, + { + "epoch": 1.227248152820328, + "grad_norm": 0.3736244875654426, + "learning_rate": 7.359758294693618e-06, + "loss": 0.4392, + "step": 2270 + }, + { + "epoch": 1.2326545323481708, + "grad_norm": 0.34863542131710556, + "learning_rate": 7.331970314238799e-06, + "loss": 0.4405, + "step": 2280 + }, + { + "epoch": 1.2380609118760137, + "grad_norm": 0.414690288534652, + "learning_rate": 7.304089976282348e-06, + "loss": 0.4401, + "step": 2290 + }, + { + "epoch": 1.2434672914038565, + "grad_norm": 0.356866165228421, + "learning_rate": 7.276118385022865e-06, + "loss": 0.4241, + "step": 2300 + }, + { + "epoch": 1.2488736709316994, + "grad_norm": 0.33264484884680307, + "learning_rate": 7.248056648273034e-06, + "loss": 0.4425, + "step": 2310 + }, + { + "epoch": 1.2542800504595424, + "grad_norm": 0.4175310788334551, + "learning_rate": 7.2199058774157375e-06, + "loss": 0.4276, + "step": 2320 + }, + { + "epoch": 1.2596864299873851, + "grad_norm": 0.38229588901030637, + "learning_rate": 7.1916671873600515e-06, + "loss": 0.4312, + "step": 2330 + }, + { + "epoch": 1.2650928095152278, + "grad_norm": 0.338696312422094, + "learning_rate": 7.163341696497084e-06, + "loss": 0.4405, + "step": 2340 + }, + { + "epoch": 1.2704991890430708, + "grad_norm": 0.32136223620818055, + "learning_rate": 7.134930526655679e-06, + "loss": 0.4347, + "step": 2350 + }, + { + "epoch": 1.2759055685709138, + "grad_norm": 0.3590441906111087, + "learning_rate": 7.106434803057998e-06, + "loss": 0.4392, + "step": 2360 + }, + { + "epoch": 1.2813119480987565, + "grad_norm": 0.3822900334441054, + "learning_rate": 7.077855654274939e-06, + "loss": 0.4329, + "step": 2370 + }, + { + "epoch": 1.2867183276265994, + "grad_norm": 0.4150924729603716, + "learning_rate": 7.04919421218145e-06, + "loss": 0.4344, + "step": 2380 + }, + { + "epoch": 1.2921247071544422, + "grad_norm": 0.31977805162237566, + "learning_rate": 7.020451611911703e-06, + "loss": 0.4274, + "step": 2390 + }, + { + "epoch": 1.2975310866822851, + "grad_norm": 0.4042413750463481, + "learning_rate": 6.9916289918141265e-06, + "loss": 0.4383, + "step": 2400 + }, + { + "epoch": 1.3029374662101278, + "grad_norm": 0.32750161889881924, + "learning_rate": 6.962727493406335e-06, + "loss": 0.4363, + "step": 2410 + }, + { + "epoch": 1.3083438457379708, + "grad_norm": 0.34681784503652924, + "learning_rate": 6.9337482613299065e-06, + "loss": 0.4251, + "step": 2420 + }, + { + "epoch": 1.3137502252658138, + "grad_norm": 0.31392667825247955, + "learning_rate": 6.904692443305059e-06, + "loss": 0.439, + "step": 2430 + }, + { + "epoch": 1.3191566047936565, + "grad_norm": 0.3080535811767778, + "learning_rate": 6.87556119008519e-06, + "loss": 0.4268, + "step": 2440 + }, + { + "epoch": 1.3245629843214994, + "grad_norm": 0.37030845399385603, + "learning_rate": 6.8463556554113005e-06, + "loss": 0.4353, + "step": 2450 + }, + { + "epoch": 1.3299693638493422, + "grad_norm": 0.3473034342384458, + "learning_rate": 6.8170769959663045e-06, + "loss": 0.4292, + "step": 2460 + }, + { + "epoch": 1.3353757433771851, + "grad_norm": 0.322256198293079, + "learning_rate": 6.787726371329214e-06, + "loss": 0.4402, + "step": 2470 + }, + { + "epoch": 1.3407821229050279, + "grad_norm": 0.3907219151376363, + "learning_rate": 6.7583049439292205e-06, + "loss": 0.4369, + "step": 2480 + }, + { + "epoch": 1.3461885024328708, + "grad_norm": 0.34928113227903806, + "learning_rate": 6.728813878999652e-06, + "loss": 0.4377, + "step": 2490 + }, + { + "epoch": 1.3515948819607138, + "grad_norm": 0.35544626757027864, + "learning_rate": 6.699254344531821e-06, + "loss": 0.4309, + "step": 2500 + }, + { + "epoch": 1.3570012614885565, + "grad_norm": 0.366218747083373, + "learning_rate": 6.669627511228778e-06, + "loss": 0.434, + "step": 2510 + }, + { + "epoch": 1.3624076410163992, + "grad_norm": 0.3580871935273299, + "learning_rate": 6.6399345524589366e-06, + "loss": 0.4401, + "step": 2520 + }, + { + "epoch": 1.3678140205442422, + "grad_norm": 0.29886314913995143, + "learning_rate": 6.610176644209602e-06, + "loss": 0.4266, + "step": 2530 + }, + { + "epoch": 1.3732204000720851, + "grad_norm": 0.3571328312104908, + "learning_rate": 6.580354965040396e-06, + "loss": 0.4393, + "step": 2540 + }, + { + "epoch": 1.3786267795999279, + "grad_norm": 0.3568154757493318, + "learning_rate": 6.550470696036591e-06, + "loss": 0.4276, + "step": 2550 + }, + { + "epoch": 1.3840331591277708, + "grad_norm": 0.3020834353942124, + "learning_rate": 6.520525020762318e-06, + "loss": 0.4374, + "step": 2560 + }, + { + "epoch": 1.3894395386556138, + "grad_norm": 0.4345861239807074, + "learning_rate": 6.490519125213701e-06, + "loss": 0.44, + "step": 2570 + }, + { + "epoch": 1.3948459181834565, + "grad_norm": 0.4164116140474957, + "learning_rate": 6.460454197771881e-06, + "loss": 0.4347, + "step": 2580 + }, + { + "epoch": 1.4002522977112992, + "grad_norm": 0.3698597319632245, + "learning_rate": 6.430331429155956e-06, + "loss": 0.4398, + "step": 2590 + }, + { + "epoch": 1.4056586772391422, + "grad_norm": 0.3557941383592286, + "learning_rate": 6.400152012375818e-06, + "loss": 0.4361, + "step": 2600 + }, + { + "epoch": 1.4110650567669851, + "grad_norm": 0.3703620913980966, + "learning_rate": 6.3699171426849036e-06, + "loss": 0.433, + "step": 2610 + }, + { + "epoch": 1.4164714362948279, + "grad_norm": 0.312372238883981, + "learning_rate": 6.339628017532858e-06, + "loss": 0.4305, + "step": 2620 + }, + { + "epoch": 1.4218778158226708, + "grad_norm": 0.32819677760603516, + "learning_rate": 6.309285836518113e-06, + "loss": 0.4289, + "step": 2630 + }, + { + "epoch": 1.4272841953505135, + "grad_norm": 0.34835896987461035, + "learning_rate": 6.2788918013403695e-06, + "loss": 0.4312, + "step": 2640 + }, + { + "epoch": 1.4326905748783565, + "grad_norm": 0.34043287674955064, + "learning_rate": 6.248447115753009e-06, + "loss": 0.4327, + "step": 2650 + }, + { + "epoch": 1.4380969544061992, + "grad_norm": 0.32777806734674225, + "learning_rate": 6.21795298551542e-06, + "loss": 0.4206, + "step": 2660 + }, + { + "epoch": 1.4435033339340422, + "grad_norm": 0.2839690869238431, + "learning_rate": 6.187410618345241e-06, + "loss": 0.4337, + "step": 2670 + }, + { + "epoch": 1.4489097134618851, + "grad_norm": 0.2845491198333412, + "learning_rate": 6.156821223870533e-06, + "loss": 0.428, + "step": 2680 + }, + { + "epoch": 1.4543160929897279, + "grad_norm": 0.3381278947086419, + "learning_rate": 6.126186013581868e-06, + "loss": 0.4442, + "step": 2690 + }, + { + "epoch": 1.4597224725175708, + "grad_norm": 0.2678673584947001, + "learning_rate": 6.095506200784349e-06, + "loss": 0.4313, + "step": 2700 + }, + { + "epoch": 1.4651288520454135, + "grad_norm": 0.32064492812884415, + "learning_rate": 6.06478300054956e-06, + "loss": 0.4443, + "step": 2710 + }, + { + "epoch": 1.4705352315732565, + "grad_norm": 0.33114310721210843, + "learning_rate": 6.034017629667439e-06, + "loss": 0.4321, + "step": 2720 + }, + { + "epoch": 1.4759416111010992, + "grad_norm": 0.3407274170049336, + "learning_rate": 6.003211306598089e-06, + "loss": 0.4302, + "step": 2730 + }, + { + "epoch": 1.4813479906289422, + "grad_norm": 0.3655959799961016, + "learning_rate": 5.972365251423521e-06, + "loss": 0.4331, + "step": 2740 + }, + { + "epoch": 1.4867543701567851, + "grad_norm": 0.3707027911602118, + "learning_rate": 5.941480685799338e-06, + "loss": 0.433, + "step": 2750 + }, + { + "epoch": 1.4921607496846279, + "grad_norm": 0.30224309374010494, + "learning_rate": 5.910558832906341e-06, + "loss": 0.4378, + "step": 2760 + }, + { + "epoch": 1.4975671292124706, + "grad_norm": 0.3421553953269554, + "learning_rate": 5.879600917402089e-06, + "loss": 0.4322, + "step": 2770 + }, + { + "epoch": 1.5029735087403135, + "grad_norm": 0.33381909956811917, + "learning_rate": 5.848608165372403e-06, + "loss": 0.425, + "step": 2780 + }, + { + "epoch": 1.5083798882681565, + "grad_norm": 0.3189833875248174, + "learning_rate": 5.8175818042828e-06, + "loss": 0.4357, + "step": 2790 + }, + { + "epoch": 1.5137862677959992, + "grad_norm": 0.36173513055424256, + "learning_rate": 5.78652306292988e-06, + "loss": 0.4395, + "step": 2800 + }, + { + "epoch": 1.5191926473238422, + "grad_norm": 0.3265416603091211, + "learning_rate": 5.75543317139266e-06, + "loss": 0.4426, + "step": 2810 + }, + { + "epoch": 1.5245990268516851, + "grad_norm": 0.33495795652653004, + "learning_rate": 5.724313360983859e-06, + "loss": 0.4335, + "step": 2820 + }, + { + "epoch": 1.5300054063795279, + "grad_norm": 0.35637908471545576, + "learning_rate": 5.693164864201134e-06, + "loss": 0.4343, + "step": 2830 + }, + { + "epoch": 1.5354117859073706, + "grad_norm": 0.3422755476029069, + "learning_rate": 5.661988914678257e-06, + "loss": 0.4201, + "step": 2840 + }, + { + "epoch": 1.5408181654352135, + "grad_norm": 0.29401423880776295, + "learning_rate": 5.630786747136269e-06, + "loss": 0.4263, + "step": 2850 + }, + { + "epoch": 1.5462245449630565, + "grad_norm": 0.35559246067713574, + "learning_rate": 5.599559597334568e-06, + "loss": 0.4327, + "step": 2860 + }, + { + "epoch": 1.5516309244908992, + "grad_norm": 0.3234026109207772, + "learning_rate": 5.56830870202198e-06, + "loss": 0.4284, + "step": 2870 + }, + { + "epoch": 1.557037304018742, + "grad_norm": 0.3041181368480941, + "learning_rate": 5.537035298887764e-06, + "loss": 0.4291, + "step": 2880 + }, + { + "epoch": 1.562443683546585, + "grad_norm": 0.4152034967270183, + "learning_rate": 5.505740626512601e-06, + "loss": 0.4333, + "step": 2890 + }, + { + "epoch": 1.5678500630744279, + "grad_norm": 0.32189843480023705, + "learning_rate": 5.474425924319538e-06, + "loss": 0.4313, + "step": 2900 + }, + { + "epoch": 1.5732564426022706, + "grad_norm": 0.3400408960358337, + "learning_rate": 5.443092432524906e-06, + "loss": 0.4446, + "step": 2910 + }, + { + "epoch": 1.5786628221301136, + "grad_norm": 0.3253331216756115, + "learning_rate": 5.411741392089192e-06, + "loss": 0.4276, + "step": 2920 + }, + { + "epoch": 1.5840692016579565, + "grad_norm": 0.34364169352732366, + "learning_rate": 5.380374044667896e-06, + "loss": 0.4363, + "step": 2930 + }, + { + "epoch": 1.5894755811857992, + "grad_norm": 0.2993302543547276, + "learning_rate": 5.348991632562355e-06, + "loss": 0.4347, + "step": 2940 + }, + { + "epoch": 1.594881960713642, + "grad_norm": 0.31140003151111195, + "learning_rate": 5.317595398670543e-06, + "loss": 0.4203, + "step": 2950 + }, + { + "epoch": 1.600288340241485, + "grad_norm": 0.34917215566088183, + "learning_rate": 5.286186586437845e-06, + "loss": 0.4394, + "step": 2960 + }, + { + "epoch": 1.6056947197693279, + "grad_norm": 0.3099678473182354, + "learning_rate": 5.254766439807807e-06, + "loss": 0.4224, + "step": 2970 + }, + { + "epoch": 1.6111010992971706, + "grad_norm": 0.32027842285858055, + "learning_rate": 5.223336203172874e-06, + "loss": 0.4289, + "step": 2980 + }, + { + "epoch": 1.6165074788250136, + "grad_norm": 0.29377503624337103, + "learning_rate": 5.191897121325111e-06, + "loss": 0.43, + "step": 2990 + }, + { + "epoch": 1.6219138583528565, + "grad_norm": 0.3286814138894788, + "learning_rate": 5.16045043940689e-06, + "loss": 0.4344, + "step": 3000 + }, + { + "epoch": 1.6273202378806992, + "grad_norm": 0.35588674616258936, + "learning_rate": 5.128997402861584e-06, + "loss": 0.4306, + "step": 3010 + }, + { + "epoch": 1.632726617408542, + "grad_norm": 0.33501603495492577, + "learning_rate": 5.09753925738424e-06, + "loss": 0.4154, + "step": 3020 + }, + { + "epoch": 1.638132996936385, + "grad_norm": 0.3011476898703049, + "learning_rate": 5.06607724887225e-06, + "loss": 0.4314, + "step": 3030 + }, + { + "epoch": 1.6435393764642279, + "grad_norm": 0.3879201939655995, + "learning_rate": 5.034612623375993e-06, + "loss": 0.4412, + "step": 3040 + }, + { + "epoch": 1.6489457559920706, + "grad_norm": 0.3426764786646151, + "learning_rate": 5.003146627049499e-06, + "loss": 0.4295, + "step": 3050 + }, + { + "epoch": 1.6543521355199133, + "grad_norm": 0.3408786770769329, + "learning_rate": 4.971680506101086e-06, + "loss": 0.4259, + "step": 3060 + }, + { + "epoch": 1.6597585150477565, + "grad_norm": 0.3689333373771858, + "learning_rate": 4.940215506744011e-06, + "loss": 0.4254, + "step": 3070 + }, + { + "epoch": 1.6651648945755992, + "grad_norm": 0.33725311763702437, + "learning_rate": 4.90875287514711e-06, + "loss": 0.4286, + "step": 3080 + }, + { + "epoch": 1.670571274103442, + "grad_norm": 0.3106105413402686, + "learning_rate": 4.87729385738544e-06, + "loss": 0.426, + "step": 3090 + }, + { + "epoch": 1.675977653631285, + "grad_norm": 0.361491556160267, + "learning_rate": 4.845839699390936e-06, + "loss": 0.4229, + "step": 3100 + }, + { + "epoch": 1.6813840331591279, + "grad_norm": 0.3012437306295753, + "learning_rate": 4.814391646903063e-06, + "loss": 0.4296, + "step": 3110 + }, + { + "epoch": 1.6867904126869706, + "grad_norm": 0.3142934287582159, + "learning_rate": 4.782950945419475e-06, + "loss": 0.4304, + "step": 3120 + }, + { + "epoch": 1.6921967922148133, + "grad_norm": 0.3024864799296645, + "learning_rate": 4.751518840146695e-06, + "loss": 0.4329, + "step": 3130 + }, + { + "epoch": 1.6976031717426563, + "grad_norm": 0.3081924919099197, + "learning_rate": 4.720096575950784e-06, + "loss": 0.4319, + "step": 3140 + }, + { + "epoch": 1.7030095512704992, + "grad_norm": 0.32189094915170496, + "learning_rate": 4.688685397308061e-06, + "loss": 0.42, + "step": 3150 + }, + { + "epoch": 1.708415930798342, + "grad_norm": 0.33972262308693657, + "learning_rate": 4.657286548255789e-06, + "loss": 0.4369, + "step": 3160 + }, + { + "epoch": 1.713822310326185, + "grad_norm": 0.30741331028975344, + "learning_rate": 4.6259012723429285e-06, + "loss": 0.4274, + "step": 3170 + }, + { + "epoch": 1.7192286898540279, + "grad_norm": 0.28971622178653267, + "learning_rate": 4.594530812580876e-06, + "loss": 0.4216, + "step": 3180 + }, + { + "epoch": 1.7246350693818706, + "grad_norm": 0.2792098363578085, + "learning_rate": 4.563176411394229e-06, + "loss": 0.4238, + "step": 3190 + }, + { + "epoch": 1.7300414489097133, + "grad_norm": 0.29274514837335597, + "learning_rate": 4.531839310571595e-06, + "loss": 0.4291, + "step": 3200 + }, + { + "epoch": 1.7354478284375563, + "grad_norm": 0.32996912353874136, + "learning_rate": 4.5005207512163914e-06, + "loss": 0.4388, + "step": 3210 + }, + { + "epoch": 1.7408542079653992, + "grad_norm": 0.34282857698540753, + "learning_rate": 4.469221973697714e-06, + "loss": 0.4373, + "step": 3220 + }, + { + "epoch": 1.746260587493242, + "grad_norm": 0.3147983795136612, + "learning_rate": 4.43794421760119e-06, + "loss": 0.4291, + "step": 3230 + }, + { + "epoch": 1.751666967021085, + "grad_norm": 0.2953517288607898, + "learning_rate": 4.4066887216799055e-06, + "loss": 0.4219, + "step": 3240 + }, + { + "epoch": 1.7570733465489279, + "grad_norm": 0.30489564567587807, + "learning_rate": 4.375456723805321e-06, + "loss": 0.4308, + "step": 3250 + }, + { + "epoch": 1.7624797260767706, + "grad_norm": 0.30950501632812377, + "learning_rate": 4.344249460918271e-06, + "loss": 0.4213, + "step": 3260 + }, + { + "epoch": 1.7678861056046133, + "grad_norm": 0.30230325895579757, + "learning_rate": 4.313068168979957e-06, + "loss": 0.4364, + "step": 3270 + }, + { + "epoch": 1.7732924851324563, + "grad_norm": 0.30774095159515363, + "learning_rate": 4.281914082923002e-06, + "loss": 0.4165, + "step": 3280 + }, + { + "epoch": 1.7786988646602993, + "grad_norm": 0.3275433264912912, + "learning_rate": 4.250788436602548e-06, + "loss": 0.4269, + "step": 3290 + }, + { + "epoch": 1.784105244188142, + "grad_norm": 0.3270523212461865, + "learning_rate": 4.2196924627473715e-06, + "loss": 0.4304, + "step": 3300 + }, + { + "epoch": 1.7895116237159847, + "grad_norm": 0.28953105726529316, + "learning_rate": 4.188627392911091e-06, + "loss": 0.4281, + "step": 3310 + }, + { + "epoch": 1.7949180032438277, + "grad_norm": 0.34157770345495453, + "learning_rate": 4.157594457423357e-06, + "loss": 0.432, + "step": 3320 + }, + { + "epoch": 1.8003243827716706, + "grad_norm": 0.2952227481543905, + "learning_rate": 4.1265948853411506e-06, + "loss": 0.427, + "step": 3330 + }, + { + "epoch": 1.8057307622995133, + "grad_norm": 0.3058432699391948, + "learning_rate": 4.095629904400097e-06, + "loss": 0.4268, + "step": 3340 + }, + { + "epoch": 1.8111371418273563, + "grad_norm": 0.32888818257409286, + "learning_rate": 4.06470074096584e-06, + "loss": 0.4334, + "step": 3350 + }, + { + "epoch": 1.8165435213551993, + "grad_norm": 0.29929296938295863, + "learning_rate": 4.0338086199854765e-06, + "loss": 0.4248, + "step": 3360 + }, + { + "epoch": 1.821949900883042, + "grad_norm": 0.33418978699429813, + "learning_rate": 4.0029547649390346e-06, + "loss": 0.4307, + "step": 3370 + }, + { + "epoch": 1.8273562804108847, + "grad_norm": 0.2991040804166494, + "learning_rate": 3.97214039779103e-06, + "loss": 0.435, + "step": 3380 + }, + { + "epoch": 1.8327626599387277, + "grad_norm": 0.2829911428105187, + "learning_rate": 3.941366738942058e-06, + "loss": 0.4246, + "step": 3390 + }, + { + "epoch": 1.8381690394665706, + "grad_norm": 0.2990384176756561, + "learning_rate": 3.910635007180468e-06, + "loss": 0.4394, + "step": 3400 + }, + { + "epoch": 1.8435754189944134, + "grad_norm": 0.28487793163600966, + "learning_rate": 3.879946419634087e-06, + "loss": 0.4268, + "step": 3410 + }, + { + "epoch": 1.8489817985222563, + "grad_norm": 0.30066911074015307, + "learning_rate": 3.8493021917220225e-06, + "loss": 0.4289, + "step": 3420 + }, + { + "epoch": 1.8543881780500993, + "grad_norm": 0.3145700146426358, + "learning_rate": 3.818703537106522e-06, + "loss": 0.427, + "step": 3430 + }, + { + "epoch": 1.859794557577942, + "grad_norm": 0.3121437364875441, + "learning_rate": 3.7881516676449014e-06, + "loss": 0.4334, + "step": 3440 + }, + { + "epoch": 1.8652009371057847, + "grad_norm": 0.2914138429548545, + "learning_rate": 3.7576477933415612e-06, + "loss": 0.4358, + "step": 3450 + }, + { + "epoch": 1.8706073166336277, + "grad_norm": 0.3263366427961882, + "learning_rate": 3.7271931223000507e-06, + "loss": 0.4294, + "step": 3460 + }, + { + "epoch": 1.8760136961614706, + "grad_norm": 0.3181986581808925, + "learning_rate": 3.6967888606752345e-06, + "loss": 0.433, + "step": 3470 + }, + { + "epoch": 1.8814200756893134, + "grad_norm": 0.31837041508546626, + "learning_rate": 3.6664362126255087e-06, + "loss": 0.4283, + "step": 3480 + }, + { + "epoch": 1.886826455217156, + "grad_norm": 0.2876960972161682, + "learning_rate": 3.636136380265124e-06, + "loss": 0.4189, + "step": 3490 + }, + { + "epoch": 1.8922328347449993, + "grad_norm": 0.30867320900321366, + "learning_rate": 3.6058905636165674e-06, + "loss": 0.4309, + "step": 3500 + }, + { + "epoch": 1.897639214272842, + "grad_norm": 0.29104980848951667, + "learning_rate": 3.575699960563038e-06, + "loss": 0.4184, + "step": 3510 + }, + { + "epoch": 1.9030455938006847, + "grad_norm": 0.2859389528274554, + "learning_rate": 3.5455657668010057e-06, + "loss": 0.4253, + "step": 3520 + }, + { + "epoch": 1.9084519733285277, + "grad_norm": 0.30910611127718657, + "learning_rate": 3.5154891757928523e-06, + "loss": 0.4257, + "step": 3530 + }, + { + "epoch": 1.9138583528563706, + "grad_norm": 0.31381289055858025, + "learning_rate": 3.4854713787196105e-06, + "loss": 0.4324, + "step": 3540 + }, + { + "epoch": 1.9192647323842134, + "grad_norm": 0.33654431291917486, + "learning_rate": 3.4555135644337803e-06, + "loss": 0.4262, + "step": 3550 + }, + { + "epoch": 1.924671111912056, + "grad_norm": 0.30712399081960845, + "learning_rate": 3.42561691941225e-06, + "loss": 0.4344, + "step": 3560 + }, + { + "epoch": 1.930077491439899, + "grad_norm": 0.2989668977037765, + "learning_rate": 3.3957826277093074e-06, + "loss": 0.4278, + "step": 3570 + }, + { + "epoch": 1.935483870967742, + "grad_norm": 0.3259516671848096, + "learning_rate": 3.3660118709097347e-06, + "loss": 0.4242, + "step": 3580 + }, + { + "epoch": 1.9408902504955847, + "grad_norm": 0.29719187591192203, + "learning_rate": 3.336305828082024e-06, + "loss": 0.4319, + "step": 3590 + }, + { + "epoch": 1.9462966300234277, + "grad_norm": 0.3250815058947025, + "learning_rate": 3.306665675731674e-06, + "loss": 0.4324, + "step": 3600 + }, + { + "epoch": 1.9517030095512706, + "grad_norm": 0.3196705993035981, + "learning_rate": 3.277092587754598e-06, + "loss": 0.4283, + "step": 3610 + }, + { + "epoch": 1.9571093890791134, + "grad_norm": 0.2836241969868925, + "learning_rate": 3.247587735390628e-06, + "loss": 0.4285, + "step": 3620 + }, + { + "epoch": 1.962515768606956, + "grad_norm": 0.2963451307813687, + "learning_rate": 3.218152287177133e-06, + "loss": 0.4233, + "step": 3630 + }, + { + "epoch": 1.967922148134799, + "grad_norm": 0.32162438964611967, + "learning_rate": 3.1887874089027304e-06, + "loss": 0.4275, + "step": 3640 + }, + { + "epoch": 1.973328527662642, + "grad_norm": 0.2858747270839711, + "learning_rate": 3.159494263561126e-06, + "loss": 0.429, + "step": 3650 + }, + { + "epoch": 1.9787349071904847, + "grad_norm": 0.294205581889964, + "learning_rate": 3.130274011305047e-06, + "loss": 0.4261, + "step": 3660 + }, + { + "epoch": 1.9841412867183277, + "grad_norm": 0.3271655262933234, + "learning_rate": 3.1011278094002928e-06, + "loss": 0.4352, + "step": 3670 + }, + { + "epoch": 1.9895476662461706, + "grad_norm": 0.3151321646815863, + "learning_rate": 3.0720568121799105e-06, + "loss": 0.4302, + "step": 3680 + }, + { + "epoch": 1.9949540457740134, + "grad_norm": 0.3069606817223593, + "learning_rate": 3.043062170998464e-06, + "loss": 0.4274, + "step": 3690 + }, + { + "epoch": 2.000360425301856, + "grad_norm": 0.3418886732932903, + "learning_rate": 3.0141450341864486e-06, + "loss": 0.4368, + "step": 3700 + }, + { + "epoch": 2.005766804829699, + "grad_norm": 0.28231273100784204, + "learning_rate": 2.9853065470048016e-06, + "loss": 0.4084, + "step": 3710 + }, + { + "epoch": 2.011173184357542, + "grad_norm": 0.27285411121752895, + "learning_rate": 2.956547851599548e-06, + "loss": 0.3899, + "step": 3720 + }, + { + "epoch": 2.0165795638853847, + "grad_norm": 0.31740692003997667, + "learning_rate": 2.9278700869565713e-06, + "loss": 0.406, + "step": 3730 + }, + { + "epoch": 2.0219859434132275, + "grad_norm": 0.32723222207620034, + "learning_rate": 2.8992743888564886e-06, + "loss": 0.4107, + "step": 3740 + }, + { + "epoch": 2.0273923229410706, + "grad_norm": 0.3293876655149398, + "learning_rate": 2.8707618898296864e-06, + "loss": 0.4052, + "step": 3750 + }, + { + "epoch": 2.0327987024689134, + "grad_norm": 0.26473497263074053, + "learning_rate": 2.8423337191114495e-06, + "loss": 0.402, + "step": 3760 + }, + { + "epoch": 2.038205081996756, + "grad_norm": 0.31910999655360905, + "learning_rate": 2.8139910025972622e-06, + "loss": 0.4134, + "step": 3770 + }, + { + "epoch": 2.043611461524599, + "grad_norm": 0.29154253424627524, + "learning_rate": 2.785734862798184e-06, + "loss": 0.4086, + "step": 3780 + }, + { + "epoch": 2.049017841052442, + "grad_norm": 0.2910125618297838, + "learning_rate": 2.7575664187964236e-06, + "loss": 0.4007, + "step": 3790 + }, + { + "epoch": 2.0544242205802847, + "grad_norm": 0.28793585101610353, + "learning_rate": 2.7294867862009937e-06, + "loss": 0.4053, + "step": 3800 + }, + { + "epoch": 2.0598306001081275, + "grad_norm": 0.2731032601573403, + "learning_rate": 2.7014970771035474e-06, + "loss": 0.4138, + "step": 3810 + }, + { + "epoch": 2.0652369796359706, + "grad_norm": 0.29876809472359783, + "learning_rate": 2.6735984000343216e-06, + "loss": 0.4156, + "step": 3820 + }, + { + "epoch": 2.0706433591638134, + "grad_norm": 0.3100743441240049, + "learning_rate": 2.645791859918234e-06, + "loss": 0.4089, + "step": 3830 + }, + { + "epoch": 2.076049738691656, + "grad_norm": 0.34676569440909566, + "learning_rate": 2.6180785580311284e-06, + "loss": 0.3998, + "step": 3840 + }, + { + "epoch": 2.081456118219499, + "grad_norm": 0.28331404223893575, + "learning_rate": 2.5904595919561563e-06, + "loss": 0.3935, + "step": 3850 + }, + { + "epoch": 2.086862497747342, + "grad_norm": 0.2892120423588288, + "learning_rate": 2.562936055540307e-06, + "loss": 0.411, + "step": 3860 + }, + { + "epoch": 2.0922688772751847, + "grad_norm": 0.29210558202813347, + "learning_rate": 2.5355090388510806e-06, + "loss": 0.4108, + "step": 3870 + }, + { + "epoch": 2.0976752568030275, + "grad_norm": 0.29027866503096267, + "learning_rate": 2.508179628133326e-06, + "loss": 0.4016, + "step": 3880 + }, + { + "epoch": 2.1030816363308706, + "grad_norm": 0.2876065349136538, + "learning_rate": 2.4809489057662168e-06, + "loss": 0.4101, + "step": 3890 + }, + { + "epoch": 2.1084880158587134, + "grad_norm": 0.3135899601532618, + "learning_rate": 2.4538179502203753e-06, + "loss": 0.4001, + "step": 3900 + }, + { + "epoch": 2.113894395386556, + "grad_norm": 0.30848425065584256, + "learning_rate": 2.4267878360151747e-06, + "loss": 0.3997, + "step": 3910 + }, + { + "epoch": 2.119300774914399, + "grad_norm": 0.2923032276510183, + "learning_rate": 2.399859633676165e-06, + "loss": 0.4049, + "step": 3920 + }, + { + "epoch": 2.124707154442242, + "grad_norm": 0.29055776768248115, + "learning_rate": 2.3730344096926974e-06, + "loss": 0.3981, + "step": 3930 + }, + { + "epoch": 2.1301135339700847, + "grad_norm": 0.3161385412337821, + "learning_rate": 2.3463132264756617e-06, + "loss": 0.4075, + "step": 3940 + }, + { + "epoch": 2.1355199134979275, + "grad_norm": 0.2828900068372096, + "learning_rate": 2.319697142315428e-06, + "loss": 0.3906, + "step": 3950 + }, + { + "epoch": 2.14092629302577, + "grad_norm": 0.26292390614915356, + "learning_rate": 2.293187211339926e-06, + "loss": 0.3991, + "step": 3960 + }, + { + "epoch": 2.1463326725536134, + "grad_norm": 0.2987394527032652, + "learning_rate": 2.2667844834728923e-06, + "loss": 0.3999, + "step": 3970 + }, + { + "epoch": 2.151739052081456, + "grad_norm": 0.27915670540136367, + "learning_rate": 2.2404900043922996e-06, + "loss": 0.3995, + "step": 3980 + }, + { + "epoch": 2.157145431609299, + "grad_norm": 0.2818164391888048, + "learning_rate": 2.2143048154889272e-06, + "loss": 0.4015, + "step": 3990 + }, + { + "epoch": 2.162551811137142, + "grad_norm": 0.26044900685376793, + "learning_rate": 2.1882299538251352e-06, + "loss": 0.4003, + "step": 4000 + }, + { + "epoch": 2.1679581906649847, + "grad_norm": 0.27297932069072756, + "learning_rate": 2.162266452093774e-06, + "loss": 0.4149, + "step": 4010 + }, + { + "epoch": 2.1733645701928275, + "grad_norm": 0.2978434115081757, + "learning_rate": 2.1364153385773007e-06, + "loss": 0.4018, + "step": 4020 + }, + { + "epoch": 2.17877094972067, + "grad_norm": 0.31586609932366294, + "learning_rate": 2.110677637107036e-06, + "loss": 0.4053, + "step": 4030 + }, + { + "epoch": 2.1841773292485134, + "grad_norm": 0.29030802044428805, + "learning_rate": 2.0850543670226318e-06, + "loss": 0.4065, + "step": 4040 + }, + { + "epoch": 2.189583708776356, + "grad_norm": 0.3365802334808058, + "learning_rate": 2.059546543131696e-06, + "loss": 0.405, + "step": 4050 + }, + { + "epoch": 2.194990088304199, + "grad_norm": 0.2995355365322975, + "learning_rate": 2.034155175669592e-06, + "loss": 0.4044, + "step": 4060 + }, + { + "epoch": 2.200396467832042, + "grad_norm": 0.2868235821916637, + "learning_rate": 2.0088812702594424e-06, + "loss": 0.4023, + "step": 4070 + }, + { + "epoch": 2.2058028473598847, + "grad_norm": 0.29532698621262965, + "learning_rate": 1.9837258278722855e-06, + "loss": 0.413, + "step": 4080 + }, + { + "epoch": 2.2112092268877275, + "grad_norm": 0.282345122194298, + "learning_rate": 1.9586898447874543e-06, + "loss": 0.4033, + "step": 4090 + }, + { + "epoch": 2.21661560641557, + "grad_norm": 0.28744059302390934, + "learning_rate": 1.933774312553092e-06, + "loss": 0.4002, + "step": 4100 + }, + { + "epoch": 2.2220219859434134, + "grad_norm": 0.29637974416632634, + "learning_rate": 1.9089802179469036e-06, + "loss": 0.397, + "step": 4110 + }, + { + "epoch": 2.227428365471256, + "grad_norm": 0.29136812414474506, + "learning_rate": 1.884308542937065e-06, + "loss": 0.4198, + "step": 4120 + }, + { + "epoch": 2.232834744999099, + "grad_norm": 0.28845833396948634, + "learning_rate": 1.8597602646433294e-06, + "loss": 0.4012, + "step": 4130 + }, + { + "epoch": 2.238241124526942, + "grad_norm": 0.31515767696033387, + "learning_rate": 1.8353363552983382e-06, + "loss": 0.4084, + "step": 4140 + }, + { + "epoch": 2.2436475040547847, + "grad_norm": 0.2852056906534805, + "learning_rate": 1.8110377822091057e-06, + "loss": 0.4129, + "step": 4150 + }, + { + "epoch": 2.2490538835826275, + "grad_norm": 0.2961534698999477, + "learning_rate": 1.7868655077187175e-06, + "loss": 0.404, + "step": 4160 + }, + { + "epoch": 2.25446026311047, + "grad_norm": 0.3026130823215708, + "learning_rate": 1.76282048916821e-06, + "loss": 0.4105, + "step": 4170 + }, + { + "epoch": 2.2598666426383134, + "grad_norm": 0.295103201693147, + "learning_rate": 1.7389036788586627e-06, + "loss": 0.4057, + "step": 4180 + }, + { + "epoch": 2.265273022166156, + "grad_norm": 0.26979492433946, + "learning_rate": 1.7151160240134702e-06, + "loss": 0.4027, + "step": 4190 + }, + { + "epoch": 2.270679401693999, + "grad_norm": 0.3069718829915049, + "learning_rate": 1.6914584667408408e-06, + "loss": 0.407, + "step": 4200 + }, + { + "epoch": 2.276085781221842, + "grad_norm": 0.2582555297518662, + "learning_rate": 1.6679319439964797e-06, + "loss": 0.3943, + "step": 4210 + }, + { + "epoch": 2.2814921607496847, + "grad_norm": 0.30300112933414725, + "learning_rate": 1.6445373875464738e-06, + "loss": 0.4073, + "step": 4220 + }, + { + "epoch": 2.2868985402775275, + "grad_norm": 0.27640155584834986, + "learning_rate": 1.6212757239304e-06, + "loss": 0.4074, + "step": 4230 + }, + { + "epoch": 2.29230491980537, + "grad_norm": 0.288482277273483, + "learning_rate": 1.5981478744246242e-06, + "loss": 0.3961, + "step": 4240 + }, + { + "epoch": 2.297711299333213, + "grad_norm": 0.2968944260811366, + "learning_rate": 1.575154755005816e-06, + "loss": 0.403, + "step": 4250 + }, + { + "epoch": 2.303117678861056, + "grad_norm": 0.29278471655933946, + "learning_rate": 1.5522972763146653e-06, + "loss": 0.4019, + "step": 4260 + }, + { + "epoch": 2.308524058388899, + "grad_norm": 0.2729883421366084, + "learning_rate": 1.5295763436198274e-06, + "loss": 0.4148, + "step": 4270 + }, + { + "epoch": 2.3139304379167416, + "grad_norm": 0.30284845140590294, + "learning_rate": 1.5069928567820635e-06, + "loss": 0.4016, + "step": 4280 + }, + { + "epoch": 2.3193368174445848, + "grad_norm": 0.3044664985270554, + "learning_rate": 1.4845477102185974e-06, + "loss": 0.4092, + "step": 4290 + }, + { + "epoch": 2.3247431969724275, + "grad_norm": 0.30467048506977945, + "learning_rate": 1.4622417928677034e-06, + "loss": 0.3997, + "step": 4300 + }, + { + "epoch": 2.33014957650027, + "grad_norm": 0.25546815283849933, + "learning_rate": 1.4400759881534886e-06, + "loss": 0.3988, + "step": 4310 + }, + { + "epoch": 2.335555956028113, + "grad_norm": 0.2852027186621198, + "learning_rate": 1.418051173950914e-06, + "loss": 0.4124, + "step": 4320 + }, + { + "epoch": 2.340962335555956, + "grad_norm": 0.28906302811953016, + "learning_rate": 1.3961682225510203e-06, + "loss": 0.3993, + "step": 4330 + }, + { + "epoch": 2.346368715083799, + "grad_norm": 0.27197836639387235, + "learning_rate": 1.3744280006263839e-06, + "loss": 0.408, + "step": 4340 + }, + { + "epoch": 2.3517750946116416, + "grad_norm": 0.2668399923208869, + "learning_rate": 1.3528313691967926e-06, + "loss": 0.4134, + "step": 4350 + }, + { + "epoch": 2.3571814741394848, + "grad_norm": 0.2872848077693314, + "learning_rate": 1.3313791835951396e-06, + "loss": 0.4045, + "step": 4360 + }, + { + "epoch": 2.3625878536673275, + "grad_norm": 0.29802601615160446, + "learning_rate": 1.310072293433558e-06, + "loss": 0.4014, + "step": 4370 + }, + { + "epoch": 2.36799423319517, + "grad_norm": 0.25723071187565805, + "learning_rate": 1.2889115425697612e-06, + "loss": 0.399, + "step": 4380 + }, + { + "epoch": 2.373400612723013, + "grad_norm": 0.2842104581531295, + "learning_rate": 1.2678977690736311e-06, + "loss": 0.4015, + "step": 4390 + }, + { + "epoch": 2.378806992250856, + "grad_norm": 0.2813179130833351, + "learning_rate": 1.2470318051940205e-06, + "loss": 0.4026, + "step": 4400 + }, + { + "epoch": 2.384213371778699, + "grad_norm": 0.27762098429764004, + "learning_rate": 1.2263144773257967e-06, + "loss": 0.4068, + "step": 4410 + }, + { + "epoch": 2.3896197513065416, + "grad_norm": 0.27848678899943174, + "learning_rate": 1.2057466059771035e-06, + "loss": 0.4006, + "step": 4420 + }, + { + "epoch": 2.3950261308343848, + "grad_norm": 0.27875535013460345, + "learning_rate": 1.1853290057368754e-06, + "loss": 0.4088, + "step": 4430 + }, + { + "epoch": 2.4004325103622275, + "grad_norm": 0.2662344684523685, + "learning_rate": 1.165062485242574e-06, + "loss": 0.4019, + "step": 4440 + }, + { + "epoch": 2.40583888989007, + "grad_norm": 0.3005215328293971, + "learning_rate": 1.1449478471481512e-06, + "loss": 0.411, + "step": 4450 + }, + { + "epoch": 2.411245269417913, + "grad_norm": 0.2712567161403629, + "learning_rate": 1.1249858880922771e-06, + "loss": 0.4059, + "step": 4460 + }, + { + "epoch": 2.416651648945756, + "grad_norm": 0.26211955276644977, + "learning_rate": 1.1051773986667735e-06, + "loss": 0.4051, + "step": 4470 + }, + { + "epoch": 2.422058028473599, + "grad_norm": 0.26165210615685336, + "learning_rate": 1.0855231633853137e-06, + "loss": 0.4068, + "step": 4480 + }, + { + "epoch": 2.4274644080014416, + "grad_norm": 0.2765363606523804, + "learning_rate": 1.0660239606523466e-06, + "loss": 0.4128, + "step": 4490 + }, + { + "epoch": 2.4328707875292848, + "grad_norm": 0.2770223660740028, + "learning_rate": 1.0466805627322685e-06, + "loss": 0.4055, + "step": 4500 + }, + { + "epoch": 2.4382771670571275, + "grad_norm": 0.266013699998984, + "learning_rate": 1.0274937357188414e-06, + "loss": 0.4049, + "step": 4510 + }, + { + "epoch": 2.4436835465849702, + "grad_norm": 0.25683355130670393, + "learning_rate": 1.0084642395048428e-06, + "loss": 0.4078, + "step": 4520 + }, + { + "epoch": 2.449089926112813, + "grad_norm": 0.2811697424270643, + "learning_rate": 9.895928277519822e-07, + "loss": 0.4092, + "step": 4530 + }, + { + "epoch": 2.454496305640656, + "grad_norm": 0.2836256278223854, + "learning_rate": 9.708802478610413e-07, + "loss": 0.4059, + "step": 4540 + }, + { + "epoch": 2.459902685168499, + "grad_norm": 0.2771952071252828, + "learning_rate": 9.523272409422829e-07, + "loss": 0.4112, + "step": 4550 + }, + { + "epoch": 2.4653090646963416, + "grad_norm": 0.2965292468618203, + "learning_rate": 9.339345417860918e-07, + "loss": 0.4028, + "step": 4560 + }, + { + "epoch": 2.4707154442241848, + "grad_norm": 0.307263683184186, + "learning_rate": 9.157028788338795e-07, + "loss": 0.4029, + "step": 4570 + }, + { + "epoch": 2.4761218237520275, + "grad_norm": 0.2922545833760392, + "learning_rate": 8.976329741492262e-07, + "loss": 0.3939, + "step": 4580 + }, + { + "epoch": 2.4815282032798702, + "grad_norm": 0.29211120065069335, + "learning_rate": 8.797255433892926e-07, + "loss": 0.4086, + "step": 4590 + }, + { + "epoch": 2.486934582807713, + "grad_norm": 0.28634400793358533, + "learning_rate": 8.619812957764729e-07, + "loss": 0.4059, + "step": 4600 + }, + { + "epoch": 2.492340962335556, + "grad_norm": 0.2646272575948771, + "learning_rate": 8.444009340703008e-07, + "loss": 0.398, + "step": 4610 + }, + { + "epoch": 2.497747341863399, + "grad_norm": 0.29066647888917396, + "learning_rate": 8.269851545396279e-07, + "loss": 0.4025, + "step": 4620 + }, + { + "epoch": 2.5031537213912416, + "grad_norm": 0.28424280479329644, + "learning_rate": 8.097346469350348e-07, + "loss": 0.4013, + "step": 4630 + }, + { + "epoch": 2.5085601009190848, + "grad_norm": 0.2896529003620974, + "learning_rate": 7.926500944615267e-07, + "loss": 0.4108, + "step": 4640 + }, + { + "epoch": 2.5139664804469275, + "grad_norm": 0.27346406286896946, + "learning_rate": 7.757321737514645e-07, + "loss": 0.3941, + "step": 4650 + }, + { + "epoch": 2.5193728599747702, + "grad_norm": 0.26882609264045565, + "learning_rate": 7.589815548377738e-07, + "loss": 0.4035, + "step": 4660 + }, + { + "epoch": 2.524779239502613, + "grad_norm": 0.27733293233890505, + "learning_rate": 7.423989011274052e-07, + "loss": 0.4085, + "step": 4670 + }, + { + "epoch": 2.5301856190304557, + "grad_norm": 0.25627085107348396, + "learning_rate": 7.259848693750582e-07, + "loss": 0.4017, + "step": 4680 + }, + { + "epoch": 2.535591998558299, + "grad_norm": 0.2691243234604463, + "learning_rate": 7.097401096571765e-07, + "loss": 0.3996, + "step": 4690 + }, + { + "epoch": 2.5409983780861416, + "grad_norm": 0.2764529789534093, + "learning_rate": 6.936652653461939e-07, + "loss": 0.4145, + "step": 4700 + }, + { + "epoch": 2.5464047576139848, + "grad_norm": 0.2902741811813119, + "learning_rate": 6.777609730850615e-07, + "loss": 0.4007, + "step": 4710 + }, + { + "epoch": 2.5518111371418275, + "grad_norm": 0.265969991168333, + "learning_rate": 6.620278627620286e-07, + "loss": 0.402, + "step": 4720 + }, + { + "epoch": 2.5572175166696702, + "grad_norm": 0.259196836837019, + "learning_rate": 6.464665574856977e-07, + "loss": 0.4124, + "step": 4730 + }, + { + "epoch": 2.562623896197513, + "grad_norm": 0.2829926842253021, + "learning_rate": 6.310776735603452e-07, + "loss": 0.3989, + "step": 4740 + }, + { + "epoch": 2.5680302757253557, + "grad_norm": 0.2694529736291035, + "learning_rate": 6.158618204615119e-07, + "loss": 0.4032, + "step": 4750 + }, + { + "epoch": 2.573436655253199, + "grad_norm": 0.2630102431201598, + "learning_rate": 6.008196008118705e-07, + "loss": 0.407, + "step": 4760 + }, + { + "epoch": 2.5788430347810416, + "grad_norm": 0.27146999027694685, + "learning_rate": 5.859516103573492e-07, + "loss": 0.3982, + "step": 4770 + }, + { + "epoch": 2.5842494143088843, + "grad_norm": 0.28346284777141134, + "learning_rate": 5.712584379435482e-07, + "loss": 0.3984, + "step": 4780 + }, + { + "epoch": 2.5896557938367275, + "grad_norm": 0.28197172604169823, + "learning_rate": 5.567406654924074e-07, + "loss": 0.3988, + "step": 4790 + }, + { + "epoch": 2.5950621733645702, + "grad_norm": 0.2717022634001503, + "learning_rate": 5.423988679791686e-07, + "loss": 0.4098, + "step": 4800 + }, + { + "epoch": 2.600468552892413, + "grad_norm": 0.276903744178795, + "learning_rate": 5.282336134095994e-07, + "loss": 0.4043, + "step": 4810 + }, + { + "epoch": 2.6058749324202557, + "grad_norm": 0.25453566586188486, + "learning_rate": 5.142454627974969e-07, + "loss": 0.3976, + "step": 4820 + }, + { + "epoch": 2.611281311948099, + "grad_norm": 0.2784736093310705, + "learning_rate": 5.00434970142471e-07, + "loss": 0.4062, + "step": 4830 + }, + { + "epoch": 2.6166876914759416, + "grad_norm": 0.24784017038474418, + "learning_rate": 4.868026824080008e-07, + "loss": 0.4061, + "step": 4840 + }, + { + "epoch": 2.6220940710037843, + "grad_norm": 0.2807417719405863, + "learning_rate": 4.7334913949977526e-07, + "loss": 0.4075, + "step": 4850 + }, + { + "epoch": 2.6275004505316275, + "grad_norm": 0.25346910500895187, + "learning_rate": 4.6007487424430565e-07, + "loss": 0.3964, + "step": 4860 + }, + { + "epoch": 2.6329068300594702, + "grad_norm": 0.27364761903392193, + "learning_rate": 4.46980412367829e-07, + "loss": 0.3938, + "step": 4870 + }, + { + "epoch": 2.638313209587313, + "grad_norm": 0.2765709048501121, + "learning_rate": 4.3406627247548184e-07, + "loss": 0.4074, + "step": 4880 + }, + { + "epoch": 2.6437195891151557, + "grad_norm": 0.2776500402889704, + "learning_rate": 4.21332966030763e-07, + "loss": 0.3994, + "step": 4890 + }, + { + "epoch": 2.649125968642999, + "grad_norm": 0.26079072827311783, + "learning_rate": 4.08780997335278e-07, + "loss": 0.4045, + "step": 4900 + }, + { + "epoch": 2.6545323481708416, + "grad_norm": 0.2397016051949167, + "learning_rate": 3.9641086350876155e-07, + "loss": 0.4029, + "step": 4910 + }, + { + "epoch": 2.6599387276986843, + "grad_norm": 0.29754617724142174, + "learning_rate": 3.84223054469397e-07, + "loss": 0.4018, + "step": 4920 + }, + { + "epoch": 2.6653451072265275, + "grad_norm": 0.27568276310419043, + "learning_rate": 3.722180529144054e-07, + "loss": 0.4096, + "step": 4930 + }, + { + "epoch": 2.6707514867543702, + "grad_norm": 0.25544292907340554, + "learning_rate": 3.6039633430093367e-07, + "loss": 0.4006, + "step": 4940 + }, + { + "epoch": 2.676157866282213, + "grad_norm": 0.2904302979415872, + "learning_rate": 3.4875836682722096e-07, + "loss": 0.4093, + "step": 4950 + }, + { + "epoch": 2.6815642458100557, + "grad_norm": 0.2796446372356396, + "learning_rate": 3.373046114140571e-07, + "loss": 0.4037, + "step": 4960 + }, + { + "epoch": 2.686970625337899, + "grad_norm": 0.2690617997319961, + "learning_rate": 3.260355216865291e-07, + "loss": 0.4058, + "step": 4970 + }, + { + "epoch": 2.6923770048657416, + "grad_norm": 0.27708751977237855, + "learning_rate": 3.149515439560524e-07, + "loss": 0.4084, + "step": 4980 + }, + { + "epoch": 2.6977833843935843, + "grad_norm": 0.25923770611284674, + "learning_rate": 3.040531172026978e-07, + "loss": 0.4035, + "step": 4990 + }, + { + "epoch": 2.7031897639214275, + "grad_norm": 0.2503752240400745, + "learning_rate": 2.933406730578009e-07, + "loss": 0.4094, + "step": 5000 + }, + { + "epoch": 2.7085961434492702, + "grad_norm": 0.27256002841564525, + "learning_rate": 2.828146357868755e-07, + "loss": 0.4049, + "step": 5010 + }, + { + "epoch": 2.714002522977113, + "grad_norm": 0.262526407381437, + "learning_rate": 2.7247542227280155e-07, + "loss": 0.399, + "step": 5020 + }, + { + "epoch": 2.7194089025049557, + "grad_norm": 0.26889496739047675, + "learning_rate": 2.6232344199932034e-07, + "loss": 0.3974, + "step": 5030 + }, + { + "epoch": 2.7248152820327984, + "grad_norm": 0.2581699169174531, + "learning_rate": 2.523590970348166e-07, + "loss": 0.4078, + "step": 5040 + }, + { + "epoch": 2.7302216615606416, + "grad_norm": 0.2681313769671267, + "learning_rate": 2.4258278201639117e-07, + "loss": 0.4083, + "step": 5050 + }, + { + "epoch": 2.7356280410884843, + "grad_norm": 0.2583458633767275, + "learning_rate": 2.3299488413423554e-07, + "loss": 0.4033, + "step": 5060 + }, + { + "epoch": 2.7410344206163275, + "grad_norm": 0.27176652448537475, + "learning_rate": 2.2359578311629272e-07, + "loss": 0.41, + "step": 5070 + }, + { + "epoch": 2.7464408001441702, + "grad_norm": 0.2651677980954859, + "learning_rate": 2.1438585121322465e-07, + "loss": 0.4048, + "step": 5080 + }, + { + "epoch": 2.751847179672013, + "grad_norm": 0.26468667998207535, + "learning_rate": 2.0536545318366018e-07, + "loss": 0.4089, + "step": 5090 + }, + { + "epoch": 2.7572535591998557, + "grad_norm": 0.2682578170402083, + "learning_rate": 1.9653494627975888e-07, + "loss": 0.404, + "step": 5100 + }, + { + "epoch": 2.7626599387276984, + "grad_norm": 0.27087994511441277, + "learning_rate": 1.8789468023305334e-07, + "loss": 0.4033, + "step": 5110 + }, + { + "epoch": 2.7680663182555416, + "grad_norm": 0.25252752081120117, + "learning_rate": 1.7944499724060484e-07, + "loss": 0.4086, + "step": 5120 + }, + { + "epoch": 2.7734726977833843, + "grad_norm": 0.2765603337180068, + "learning_rate": 1.711862319514457e-07, + "loss": 0.4058, + "step": 5130 + }, + { + "epoch": 2.7788790773112275, + "grad_norm": 0.2662570880480703, + "learning_rate": 1.6311871145332836e-07, + "loss": 0.4016, + "step": 5140 + }, + { + "epoch": 2.7842854568390702, + "grad_norm": 0.26536562491010973, + "learning_rate": 1.5524275525977073e-07, + "loss": 0.3961, + "step": 5150 + }, + { + "epoch": 2.789691836366913, + "grad_norm": 0.2696933797225792, + "learning_rate": 1.4755867529740064e-07, + "loss": 0.402, + "step": 5160 + }, + { + "epoch": 2.7950982158947557, + "grad_norm": 0.26230277928432566, + "learning_rate": 1.4006677589360307e-07, + "loss": 0.4006, + "step": 5170 + }, + { + "epoch": 2.8005045954225984, + "grad_norm": 0.2618189445881308, + "learning_rate": 1.3276735376446693e-07, + "loss": 0.4101, + "step": 5180 + }, + { + "epoch": 2.8059109749504416, + "grad_norm": 0.26154419260033057, + "learning_rate": 1.2566069800303393e-07, + "loss": 0.4007, + "step": 5190 + }, + { + "epoch": 2.8113173544782843, + "grad_norm": 0.26129803510244903, + "learning_rate": 1.1874709006784891e-07, + "loss": 0.4108, + "step": 5200 + }, + { + "epoch": 2.816723734006127, + "grad_norm": 0.2755262239215911, + "learning_rate": 1.1202680377181252e-07, + "loss": 0.4081, + "step": 5210 + }, + { + "epoch": 2.8221301135339703, + "grad_norm": 0.27615467193849846, + "learning_rate": 1.055001052713378e-07, + "loss": 0.4057, + "step": 5220 + }, + { + "epoch": 2.827536493061813, + "grad_norm": 0.2565394448779921, + "learning_rate": 9.916725305580632e-08, + "loss": 0.4074, + "step": 5230 + }, + { + "epoch": 2.8329428725896557, + "grad_norm": 0.29481883515723867, + "learning_rate": 9.302849793733526e-08, + "loss": 0.4037, + "step": 5240 + }, + { + "epoch": 2.8383492521174984, + "grad_norm": 0.2628737439763179, + "learning_rate": 8.708408304083927e-08, + "loss": 0.3982, + "step": 5250 + }, + { + "epoch": 2.8437556316453416, + "grad_norm": 0.2856973586242492, + "learning_rate": 8.133424379440535e-08, + "loss": 0.4098, + "step": 5260 + }, + { + "epoch": 2.8491620111731844, + "grad_norm": 0.2573191532815954, + "learning_rate": 7.577920791996595e-08, + "loss": 0.4021, + "step": 5270 + }, + { + "epoch": 2.854568390701027, + "grad_norm": 0.2671924144995498, + "learning_rate": 7.041919542428221e-08, + "loss": 0.4046, + "step": 5280 + }, + { + "epoch": 2.8599747702288703, + "grad_norm": 0.27125026996972024, + "learning_rate": 6.525441859022873e-08, + "loss": 0.3996, + "step": 5290 + }, + { + "epoch": 2.865381149756713, + "grad_norm": 0.2597885306736867, + "learning_rate": 6.028508196838811e-08, + "loss": 0.3991, + "step": 5300 + }, + { + "epoch": 2.8707875292845557, + "grad_norm": 0.2661065612840173, + "learning_rate": 5.551138236894793e-08, + "loss": 0.4082, + "step": 5310 + }, + { + "epoch": 2.8761939088123984, + "grad_norm": 0.27596106902272594, + "learning_rate": 5.093350885390591e-08, + "loss": 0.4092, + "step": 5320 + }, + { + "epoch": 2.8816002883402416, + "grad_norm": 0.2798778899386736, + "learning_rate": 4.655164272958534e-08, + "loss": 0.3935, + "step": 5330 + }, + { + "epoch": 2.8870066678680844, + "grad_norm": 0.2675281011170649, + "learning_rate": 4.236595753944972e-08, + "loss": 0.4049, + "step": 5340 + }, + { + "epoch": 2.892413047395927, + "grad_norm": 0.24219018671622744, + "learning_rate": 3.837661905723378e-08, + "loss": 0.4061, + "step": 5350 + }, + { + "epoch": 2.8978194269237703, + "grad_norm": 0.26852051522723963, + "learning_rate": 3.458378528037598e-08, + "loss": 0.3982, + "step": 5360 + }, + { + "epoch": 2.903225806451613, + "grad_norm": 0.2598218760743794, + "learning_rate": 3.0987606423759644e-08, + "loss": 0.3978, + "step": 5370 + }, + { + "epoch": 2.9086321859794557, + "grad_norm": 0.24224454585639746, + "learning_rate": 2.7588224913768225e-08, + "loss": 0.4056, + "step": 5380 + }, + { + "epoch": 2.9140385655072985, + "grad_norm": 0.28293842876891173, + "learning_rate": 2.438577538263931e-08, + "loss": 0.4041, + "step": 5390 + }, + { + "epoch": 2.9194449450351416, + "grad_norm": 0.24273867782068695, + "learning_rate": 2.1380384663135523e-08, + "loss": 0.4046, + "step": 5400 + }, + { + "epoch": 2.9248513245629844, + "grad_norm": 0.2589867572465761, + "learning_rate": 1.8572171783521885e-08, + "loss": 0.4016, + "step": 5410 + }, + { + "epoch": 2.930257704090827, + "grad_norm": 0.26040920179163585, + "learning_rate": 1.596124796284848e-08, + "loss": 0.4048, + "step": 5420 + }, + { + "epoch": 2.9356640836186703, + "grad_norm": 0.28129280293565423, + "learning_rate": 1.3547716606548967e-08, + "loss": 0.4082, + "step": 5430 + }, + { + "epoch": 2.941070463146513, + "grad_norm": 0.27263421805264343, + "learning_rate": 1.133167330234386e-08, + "loss": 0.3957, + "step": 5440 + }, + { + "epoch": 2.9464768426743557, + "grad_norm": 0.27306797377575853, + "learning_rate": 9.313205816454674e-09, + "loss": 0.4097, + "step": 5450 + }, + { + "epoch": 2.9518832222021985, + "grad_norm": 0.26535989264790094, + "learning_rate": 7.492394090128364e-09, + "loss": 0.4091, + "step": 5460 + }, + { + "epoch": 2.957289601730041, + "grad_norm": 0.26682062170730547, + "learning_rate": 5.8693102364698604e-09, + "loss": 0.3975, + "step": 5470 + }, + { + "epoch": 2.9626959812578844, + "grad_norm": 0.2848285894683682, + "learning_rate": 4.444018537588801e-09, + "loss": 0.4075, + "step": 5480 + }, + { + "epoch": 2.968102360785727, + "grad_norm": 0.2853108418534249, + "learning_rate": 3.2165754420510063e-09, + "loss": 0.4107, + "step": 5490 + }, + { + "epoch": 2.9735087403135703, + "grad_norm": 0.26447810990716136, + "learning_rate": 2.1870295626441607e-09, + "loss": 0.4022, + "step": 5500 + }, + { + "epoch": 2.978915119841413, + "grad_norm": 0.2661971477507847, + "learning_rate": 1.3554216744521287e-09, + "loss": 0.4041, + "step": 5510 + }, + { + "epoch": 2.9843214993692557, + "grad_norm": 0.25582504114161564, + "learning_rate": 7.217847132401367e-10, + "loss": 0.4064, + "step": 5520 + }, + { + "epoch": 2.9897278788970985, + "grad_norm": 0.26069476073784237, + "learning_rate": 2.861437741508155e-10, + "loss": 0.4115, + "step": 5530 + }, + { + "epoch": 2.995134258424941, + "grad_norm": 0.27554755453273777, + "learning_rate": 4.851611070832984e-11, + "loss": 0.4016, + "step": 5540 + }, + { + "epoch": 2.9989187240944313, + "step": 5547, + "total_flos": 8484146955288576.0, + "train_loss": 0.44718967426225087, + "train_runtime": 93872.001, + "train_samples_per_second": 5.675, + "train_steps_per_second": 0.059 + } + ], + "logging_steps": 10, + "max_steps": 5547, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 8484146955288576.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}