{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 1656,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.006041839740200891,
      "grad_norm": 8.275440544361343,
      "learning_rate": 5.421686746987952e-07,
      "loss": 0.818,
      "step": 10
    },
    {
      "epoch": 0.012083679480401783,
      "grad_norm": 2.7914609484184627,
      "learning_rate": 1.1445783132530121e-06,
      "loss": 0.6987,
      "step": 20
    },
    {
      "epoch": 0.018125519220602675,
      "grad_norm": 1.3664105415610248,
      "learning_rate": 1.7469879518072292e-06,
      "loss": 0.5802,
      "step": 30
    },
    {
      "epoch": 0.024167358960803565,
      "grad_norm": 0.8594872429754616,
      "learning_rate": 2.349397590361446e-06,
      "loss": 0.5258,
      "step": 40
    },
    {
      "epoch": 0.030209198701004455,
      "grad_norm": 0.7434393729226938,
      "learning_rate": 2.9518072289156627e-06,
      "loss": 0.5145,
      "step": 50
    },
    {
      "epoch": 0.03625103844120535,
      "grad_norm": 0.8872227027342147,
      "learning_rate": 3.5542168674698798e-06,
      "loss": 0.5015,
      "step": 60
    },
    {
      "epoch": 0.042292878181406236,
      "grad_norm": 0.8429713488596823,
      "learning_rate": 4.156626506024097e-06,
      "loss": 0.4901,
      "step": 70
    },
    {
      "epoch": 0.04833471792160713,
      "grad_norm": 0.9065488209295363,
      "learning_rate": 4.759036144578314e-06,
      "loss": 0.4796,
      "step": 80
    },
    {
      "epoch": 0.05437655766180802,
      "grad_norm": 0.9651498125714989,
      "learning_rate": 5.361445783132531e-06,
      "loss": 0.4758,
      "step": 90
    },
    {
      "epoch": 0.06041839740200891,
      "grad_norm": 0.8301785431193227,
      "learning_rate": 5.963855421686747e-06,
      "loss": 0.4829,
      "step": 100
    },
    {
      "epoch": 0.0664602371422098,
      "grad_norm": 0.8886113651753562,
      "learning_rate": 6.566265060240964e-06,
      "loss": 0.4812,
      "step": 110
    },
    {
      "epoch": 0.0725020768824107,
      "grad_norm": 0.9862797318907437,
      "learning_rate": 7.168674698795182e-06,
      "loss": 0.4789,
      "step": 120
    },
    {
      "epoch": 0.07854391662261158,
      "grad_norm": 1.0637568246181046,
      "learning_rate": 7.771084337349398e-06,
      "loss": 0.4767,
      "step": 130
    },
    {
      "epoch": 0.08458575636281247,
      "grad_norm": 1.019587639434787,
      "learning_rate": 8.373493975903614e-06,
      "loss": 0.4748,
      "step": 140
    },
    {
      "epoch": 0.09062759610301337,
      "grad_norm": 1.0494143456183387,
      "learning_rate": 8.975903614457832e-06,
      "loss": 0.471,
      "step": 150
    },
    {
      "epoch": 0.09666943584321426,
      "grad_norm": 1.0167856442156524,
      "learning_rate": 9.57831325301205e-06,
      "loss": 0.482,
      "step": 160
    },
    {
      "epoch": 0.10271127558341515,
      "grad_norm": 0.9927765526681102,
      "learning_rate": 9.99989997506481e-06,
      "loss": 0.4867,
      "step": 170
    },
    {
      "epoch": 0.10875311532361603,
      "grad_norm": 0.8252285160527216,
      "learning_rate": 9.998121865323769e-06,
      "loss": 0.4832,
      "step": 180
    },
    {
      "epoch": 0.11479495506381693,
      "grad_norm": 1.0494152999510933,
      "learning_rate": 9.994121889085426e-06,
      "loss": 0.4836,
      "step": 190
    },
    {
      "epoch": 0.12083679480401782,
      "grad_norm": 0.8683126435696938,
      "learning_rate": 9.987901824500342e-06,
      "loss": 0.4796,
      "step": 200
    },
    {
      "epoch": 0.12687863454421872,
      "grad_norm": 1.0765238837274924,
      "learning_rate": 9.979464436637771e-06,
      "loss": 0.479,
      "step": 210
    },
    {
      "epoch": 0.1329204742844196,
      "grad_norm": 0.9747782968326728,
      "learning_rate": 9.968813476256483e-06,
      "loss": 0.4774,
      "step": 220
    },
    {
      "epoch": 0.1389623140246205,
      "grad_norm": 1.0422947760995596,
      "learning_rate": 9.955953678137397e-06,
      "loss": 0.4775,
      "step": 230
    },
    {
      "epoch": 0.1450041537648214,
      "grad_norm": 0.8638869125481701,
      "learning_rate": 9.940890758978781e-06,
      "loss": 0.4809,
      "step": 240
    },
    {
      "epoch": 0.1510459935050223,
      "grad_norm": 0.9908759029078118,
      "learning_rate": 9.923631414854946e-06,
      "loss": 0.4823,
      "step": 250
    },
    {
      "epoch": 0.15708783324522316,
      "grad_norm": 0.8594590054265251,
      "learning_rate": 9.904183318239573e-06,
      "loss": 0.4732,
      "step": 260
    },
    {
      "epoch": 0.16312967298542405,
      "grad_norm": 0.8537166815636835,
      "learning_rate": 9.882555114594994e-06,
      "loss": 0.4746,
      "step": 270
    },
    {
      "epoch": 0.16917151272562495,
      "grad_norm": 0.7988607086870596,
      "learning_rate": 9.858756418528928e-06,
      "loss": 0.4771,
      "step": 280
    },
    {
      "epoch": 0.17521335246582584,
      "grad_norm": 0.8128733549061342,
      "learning_rate": 9.832797809520404e-06,
      "loss": 0.4703,
      "step": 290
    },
    {
      "epoch": 0.18125519220602673,
      "grad_norm": 0.8670976233152772,
      "learning_rate": 9.804690827216764e-06,
      "loss": 0.4763,
      "step": 300
    },
    {
      "epoch": 0.18729703194622763,
      "grad_norm": 0.9163703807052254,
      "learning_rate": 9.77444796630381e-06,
      "loss": 0.4665,
      "step": 310
    },
    {
      "epoch": 0.19333887168642852,
      "grad_norm": 1.0505077270930072,
      "learning_rate": 9.742082670951423e-06,
      "loss": 0.4781,
      "step": 320
    },
    {
      "epoch": 0.19938071142662941,
      "grad_norm": 0.9771619804123264,
      "learning_rate": 9.707609328837085e-06,
      "loss": 0.4695,
      "step": 330
    },
    {
      "epoch": 0.2054225511668303,
      "grad_norm": 0.8443160970596196,
      "learning_rate": 9.67104326474998e-06,
      "loss": 0.4709,
      "step": 340
    },
    {
      "epoch": 0.2114643909070312,
      "grad_norm": 0.7405442085507775,
      "learning_rate": 9.632400733778504e-06,
      "loss": 0.4687,
      "step": 350
    },
    {
      "epoch": 0.21750623064723207,
      "grad_norm": 1.06910781919214,
      "learning_rate": 9.591698914084224e-06,
      "loss": 0.4727,
      "step": 360
    },
    {
      "epoch": 0.22354807038743296,
      "grad_norm": 0.9783321165505232,
      "learning_rate": 9.548955899265495e-06,
      "loss": 0.4757,
      "step": 370
    },
    {
      "epoch": 0.22958991012763386,
      "grad_norm": 0.856614778477577,
      "learning_rate": 9.504190690314124e-06,
      "loss": 0.4628,
      "step": 380
    },
    {
      "epoch": 0.23563174986783475,
      "grad_norm": 0.8407994854798428,
      "learning_rate": 9.457423187168667e-06,
      "loss": 0.4751,
      "step": 390
    },
    {
      "epoch": 0.24167358960803564,
      "grad_norm": 0.835330085907184,
      "learning_rate": 9.408674179868101e-06,
      "loss": 0.4755,
      "step": 400
    },
    {
      "epoch": 0.24771542934823654,
      "grad_norm": 0.8184694750958316,
      "learning_rate": 9.357965339309823e-06,
      "loss": 0.4688,
      "step": 410
    },
    {
      "epoch": 0.25375726908843743,
      "grad_norm": 0.8556998098855371,
      "learning_rate": 9.305319207616048e-06,
      "loss": 0.468,
      "step": 420
    },
    {
      "epoch": 0.2597991088286383,
      "grad_norm": 0.7125631433834922,
      "learning_rate": 9.250759188112944e-06,
      "loss": 0.4721,
      "step": 430
    },
    {
      "epoch": 0.2658409485688392,
      "grad_norm": 0.8684064304089488,
      "learning_rate": 9.194309534926896e-06,
      "loss": 0.4719,
      "step": 440
    },
    {
      "epoch": 0.2718827883090401,
      "grad_norm": 0.8581281745281804,
      "learning_rate": 9.13599534220258e-06,
      "loss": 0.4676,
      "step": 450
    },
    {
      "epoch": 0.277924628049241,
      "grad_norm": 0.8073909041822805,
      "learning_rate": 9.075842532947605e-06,
      "loss": 0.4729,
      "step": 460
    },
    {
      "epoch": 0.2839664677894419,
      "grad_norm": 0.7760503268897376,
      "learning_rate": 9.013877847508684e-06,
      "loss": 0.4735,
      "step": 470
    },
    {
      "epoch": 0.2900083075296428,
      "grad_norm": 1.007511777834999,
      "learning_rate": 8.95012883168448e-06,
      "loss": 0.4655,
      "step": 480
    },
    {
      "epoch": 0.2960501472698437,
      "grad_norm": 0.8792795612951879,
      "learning_rate": 8.884623824480388e-06,
      "loss": 0.4604,
      "step": 490
    },
    {
      "epoch": 0.3020919870100446,
      "grad_norm": 0.7649844789740483,
      "learning_rate": 8.817391945510697e-06,
      "loss": 0.4727,
      "step": 500
    },
    {
      "epoch": 0.3081338267502455,
      "grad_norm": 0.7112188721793731,
      "learning_rate": 8.748463082053765e-06,
      "loss": 0.4696,
      "step": 510
    },
    {
      "epoch": 0.3141756664904463,
      "grad_norm": 0.982736127204613,
      "learning_rate": 8.677867875765904e-06,
      "loss": 0.4667,
      "step": 520
    },
    {
      "epoch": 0.3202175062306472,
      "grad_norm": 0.7380835460961964,
      "learning_rate": 8.605637709059937e-06,
      "loss": 0.4665,
      "step": 530
    },
    {
      "epoch": 0.3262593459708481,
      "grad_norm": 0.889341527636709,
      "learning_rate": 8.531804691154454e-06,
      "loss": 0.4596,
      "step": 540
    },
    {
      "epoch": 0.332301185711049,
      "grad_norm": 0.8717590020119719,
      "learning_rate": 8.45640164379996e-06,
      "loss": 0.4635,
      "step": 550
    },
    {
      "epoch": 0.3383430254512499,
      "grad_norm": 0.8593981074116124,
      "learning_rate": 8.379462086688302e-06,
      "loss": 0.4614,
      "step": 560
    },
    {
      "epoch": 0.3443848651914508,
      "grad_norm": 0.7452870255566968,
      "learning_rate": 8.301020222551795e-06,
      "loss": 0.4613,
      "step": 570
    },
    {
      "epoch": 0.3504267049316517,
      "grad_norm": 0.6738237626130682,
      "learning_rate": 8.221110921958748e-06,
      "loss": 0.4625,
      "step": 580
    },
    {
      "epoch": 0.35646854467185257,
      "grad_norm": 0.7475917228131528,
      "learning_rate": 8.139769707812083e-06,
      "loss": 0.4643,
      "step": 590
    },
    {
      "epoch": 0.36251038441205347,
      "grad_norm": 0.8625312431506086,
      "learning_rate": 8.057032739557973e-06,
      "loss": 0.4647,
      "step": 600
    },
    {
      "epoch": 0.36855222415225436,
      "grad_norm": 0.7686541350728158,
      "learning_rate": 7.972936797111515e-06,
      "loss": 0.4664,
      "step": 610
    },
    {
      "epoch": 0.37459406389245525,
      "grad_norm": 0.793893791173724,
      "learning_rate": 7.887519264506577e-06,
      "loss": 0.4687,
      "step": 620
    },
    {
      "epoch": 0.38063590363265615,
      "grad_norm": 0.8889160126310801,
      "learning_rate": 7.800818113277085e-06,
      "loss": 0.4675,
      "step": 630
    },
    {
      "epoch": 0.38667774337285704,
      "grad_norm": 0.73855111981923,
      "learning_rate": 7.712871885577147e-06,
      "loss": 0.4696,
      "step": 640
    },
    {
      "epoch": 0.39271958311305794,
      "grad_norm": 0.9038531350448926,
      "learning_rate": 7.623719677047521e-06,
      "loss": 0.4679,
      "step": 650
    },
    {
      "epoch": 0.39876142285325883,
      "grad_norm": 0.8258158060448149,
      "learning_rate": 7.533401119436012e-06,
      "loss": 0.4636,
      "step": 660
    },
    {
      "epoch": 0.4048032625934597,
      "grad_norm": 0.7984996623335708,
      "learning_rate": 7.441956362979584e-06,
      "loss": 0.4573,
      "step": 670
    },
    {
      "epoch": 0.4108451023336606,
      "grad_norm": 0.9068733833902644,
      "learning_rate": 7.349426058555943e-06,
      "loss": 0.467,
      "step": 680
    },
    {
      "epoch": 0.4168869420738615,
      "grad_norm": 0.7305483298541591,
      "learning_rate": 7.255851339612597e-06,
      "loss": 0.4619,
      "step": 690
    },
    {
      "epoch": 0.4229287818140624,
      "grad_norm": 0.7441065423891078,
      "learning_rate": 7.161273803881381e-06,
      "loss": 0.4491,
      "step": 700
    },
    {
      "epoch": 0.4289706215542633,
      "grad_norm": 0.9276808876681103,
      "learning_rate": 7.0657354948865786e-06,
      "loss": 0.4532,
      "step": 710
    },
    {
      "epoch": 0.43501246129446414,
      "grad_norm": 0.6605412717277142,
      "learning_rate": 6.969278883254896e-06,
      "loss": 0.4646,
      "step": 720
    },
    {
      "epoch": 0.44105430103466503,
      "grad_norm": 0.6593545188739028,
      "learning_rate": 6.871946847835548e-06,
      "loss": 0.4542,
      "step": 730
    },
    {
      "epoch": 0.4470961407748659,
      "grad_norm": 0.7295116693541216,
      "learning_rate": 6.773782656638889e-06,
      "loss": 0.4532,
      "step": 740
    },
    {
      "epoch": 0.4531379805150668,
      "grad_norm": 0.7877222556349484,
      "learning_rate": 6.674829947602034e-06,
      "loss": 0.464,
      "step": 750
    },
    {
      "epoch": 0.4591798202552677,
      "grad_norm": 0.7208466663864777,
      "learning_rate": 6.575132709190041e-06,
      "loss": 0.4604,
      "step": 760
    },
    {
      "epoch": 0.4652216599954686,
      "grad_norm": 0.8296261142467121,
      "learning_rate": 6.474735260841264e-06,
      "loss": 0.4604,
      "step": 770
    },
    {
      "epoch": 0.4712634997356695,
      "grad_norm": 0.7971141568003346,
      "learning_rate": 6.373682233265581e-06,
      "loss": 0.464,
      "step": 780
    },
    {
      "epoch": 0.4773053394758704,
      "grad_norm": 0.8736349174540989,
      "learning_rate": 6.2720185486042405e-06,
      "loss": 0.4679,
      "step": 790
    },
    {
      "epoch": 0.4833471792160713,
      "grad_norm": 0.9309796515999401,
      "learning_rate": 6.169789400460167e-06,
      "loss": 0.4613,
      "step": 800
    },
    {
      "epoch": 0.4893890189562722,
      "grad_norm": 0.7752657579479202,
      "learning_rate": 6.067040233807579e-06,
      "loss": 0.4561,
      "step": 810
    },
    {
      "epoch": 0.4954308586964731,
      "grad_norm": 0.8534120955438902,
      "learning_rate": 5.963816724789868e-06,
      "loss": 0.4488,
      "step": 820
    },
    {
      "epoch": 0.501472698436674,
      "grad_norm": 0.8496125433352629,
      "learning_rate": 5.860164760414715e-06,
      "loss": 0.4618,
      "step": 830
    },
    {
      "epoch": 0.5075145381768749,
      "grad_norm": 0.8251415980778435,
      "learning_rate": 5.7561304181554626e-06,
      "loss": 0.4593,
      "step": 840
    },
    {
      "epoch": 0.5135563779170758,
      "grad_norm": 0.7139950510608393,
      "learning_rate": 5.651759945467829e-06,
      "loss": 0.4502,
      "step": 850
    },
    {
      "epoch": 0.5195982176572767,
      "grad_norm": 0.601943553224525,
      "learning_rate": 5.5470997392310475e-06,
      "loss": 0.4619,
      "step": 860
    },
    {
      "epoch": 0.5256400573974775,
      "grad_norm": 0.8153866464407951,
      "learning_rate": 5.442196325122583e-06,
      "loss": 0.4588,
      "step": 870
    },
    {
      "epoch": 0.5316818971376784,
      "grad_norm": 0.8344346217932002,
      "learning_rate": 5.337096336935596e-06,
      "loss": 0.4606,
      "step": 880
    },
    {
      "epoch": 0.5377237368778793,
      "grad_norm": 0.7492780666259381,
      "learning_rate": 5.231846495848343e-06,
      "loss": 0.4408,
      "step": 890
    },
    {
      "epoch": 0.5437655766180802,
      "grad_norm": 0.7261206428098568,
      "learning_rate": 5.12649358965473e-06,
      "loss": 0.4538,
      "step": 900
    },
    {
      "epoch": 0.5498074163582811,
      "grad_norm": 0.7279393955754347,
      "learning_rate": 5.021084451965257e-06,
      "loss": 0.4532,
      "step": 910
    },
    {
      "epoch": 0.555849256098482,
      "grad_norm": 0.825743775909579,
      "learning_rate": 4.915665941387589e-06,
      "loss": 0.4496,
      "step": 920
    },
    {
      "epoch": 0.5618910958386829,
      "grad_norm": 0.7524696024763261,
      "learning_rate": 4.8102849206960144e-06,
      "loss": 0.4521,
      "step": 930
    },
    {
      "epoch": 0.5679329355788838,
      "grad_norm": 0.6103762891076722,
      "learning_rate": 4.704988235999059e-06,
      "loss": 0.4537,
      "step": 940
    },
    {
      "epoch": 0.5739747753190847,
      "grad_norm": 0.7615515295808474,
      "learning_rate": 4.5998226959145e-06,
      "loss": 0.4437,
      "step": 950
    },
    {
      "epoch": 0.5800166150592856,
      "grad_norm": 0.8148357524689337,
      "learning_rate": 4.494835050761055e-06,
      "loss": 0.4506,
      "step": 960
    },
    {
      "epoch": 0.5860584547994865,
      "grad_norm": 0.746845262451762,
      "learning_rate": 4.390071971775977e-06,
      "loss": 0.4496,
      "step": 970
    },
    {
      "epoch": 0.5921002945396874,
      "grad_norm": 0.7650124837023387,
      "learning_rate": 4.285580030367812e-06,
      "loss": 0.4442,
      "step": 980
    },
    {
      "epoch": 0.5981421342798883,
      "grad_norm": 0.7044872850784959,
      "learning_rate": 4.18140567741353e-06,
      "loss": 0.4515,
      "step": 990
    },
    {
      "epoch": 0.6041839740200892,
      "grad_norm": 0.6835203586178121,
      "learning_rate": 4.07759522260924e-06,
      "loss": 0.4484,
      "step": 1000
    },
    {
      "epoch": 0.6102258137602901,
      "grad_norm": 0.7875522158264618,
      "learning_rate": 3.974194813883672e-06,
      "loss": 0.4487,
      "step": 1010
    },
    {
      "epoch": 0.616267653500491,
      "grad_norm": 1.0731871419616035,
      "learning_rate": 3.871250416883547e-06,
      "loss": 0.4487,
      "step": 1020
    },
    {
      "epoch": 0.6223094932406917,
      "grad_norm": 0.7645842030603645,
      "learning_rate": 3.7688077945400135e-06,
      "loss": 0.4476,
      "step": 1030
    },
    {
      "epoch": 0.6283513329808926,
      "grad_norm": 0.8262016423595615,
      "learning_rate": 3.6669124867251705e-06,
      "loss": 0.4419,
      "step": 1040
    },
    {
      "epoch": 0.6343931727210935,
      "grad_norm": 0.8630082766433129,
      "learning_rate": 3.56560979000776e-06,
      "loss": 0.4433,
      "step": 1050
    },
    {
      "epoch": 0.6404350124612944,
      "grad_norm": 0.6739630234814128,
      "learning_rate": 3.4649447375170243e-06,
      "loss": 0.434,
      "step": 1060
    },
    {
      "epoch": 0.6464768522014953,
      "grad_norm": 0.8005468876181695,
      "learning_rate": 3.3649620789236613e-06,
      "loss": 0.4449,
      "step": 1070
    },
    {
      "epoch": 0.6525186919416962,
      "grad_norm": 0.6927017507913021,
      "learning_rate": 3.265706260546793e-06,
      "loss": 0.4487,
      "step": 1080
    },
    {
      "epoch": 0.6585605316818971,
      "grad_norm": 0.7470116822150206,
      "learning_rate": 3.1672214055957885e-06,
      "loss": 0.4492,
      "step": 1090
    },
    {
      "epoch": 0.664602371422098,
      "grad_norm": 0.7165981129271416,
      "learning_rate": 3.0695512945557175e-06,
      "loss": 0.4435,
      "step": 1100
    },
    {
      "epoch": 0.6706442111622989,
      "grad_norm": 0.6713758708690007,
      "learning_rate": 2.972739345725163e-06,
      "loss": 0.445,
      "step": 1110
    },
    {
      "epoch": 0.6766860509024998,
      "grad_norm": 0.7528881198034107,
      "learning_rate": 2.876828595915043e-06,
      "loss": 0.4458,
      "step": 1120
    },
    {
      "epoch": 0.6827278906427007,
      "grad_norm": 1.3727779517766678,
      "learning_rate": 2.781861681317004e-06,
      "loss": 0.4362,
      "step": 1130
    },
    {
      "epoch": 0.6887697303829016,
      "grad_norm": 0.8146352120444986,
      "learning_rate": 2.687880818549927e-06,
      "loss": 0.4396,
      "step": 1140
    },
    {
      "epoch": 0.6948115701231025,
      "grad_norm": 0.6965825401455522,
      "learning_rate": 2.5949277858929297e-06,
      "loss": 0.4362,
      "step": 1150
    },
    {
      "epoch": 0.7008534098633034,
      "grad_norm": 0.6495955670121708,
      "learning_rate": 2.5030439047132484e-06,
      "loss": 0.4453,
      "step": 1160
    },
    {
      "epoch": 0.7068952496035042,
      "grad_norm": 0.6114269361750394,
      "learning_rate": 2.4122700210972218e-06,
      "loss": 0.4436,
      "step": 1170
    },
    {
      "epoch": 0.7129370893437051,
      "grad_norm": 0.851795747172313,
      "learning_rate": 2.322646487692556e-06,
      "loss": 0.4437,
      "step": 1180
    },
    {
      "epoch": 0.718978929083906,
      "grad_norm": 0.775135762171182,
      "learning_rate": 2.2342131457699575e-06,
      "loss": 0.4495,
      "step": 1190
    },
    {
      "epoch": 0.7250207688241069,
      "grad_norm": 0.6479257056864712,
      "learning_rate": 2.1470093075120686e-06,
      "loss": 0.4383,
      "step": 1200
    },
    {
      "epoch": 0.7310626085643078,
      "grad_norm": 0.8263286588325557,
      "learning_rate": 2.061073738537635e-06,
      "loss": 0.4463,
      "step": 1210
    },
    {
      "epoch": 0.7371044483045087,
      "grad_norm": 0.6982438518266665,
      "learning_rate": 1.9764446406686177e-06,
      "loss": 0.4436,
      "step": 1220
    },
    {
      "epoch": 0.7431462880447096,
      "grad_norm": 0.6700592747869402,
      "learning_rate": 1.8931596349479385e-06,
      "loss": 0.4432,
      "step": 1230
    },
    {
      "epoch": 0.7491881277849105,
      "grad_norm": 0.5967953856950272,
      "learning_rate": 1.8112557449154316e-06,
      "loss": 0.4338,
      "step": 1240
    },
    {
      "epoch": 0.7552299675251114,
      "grad_norm": 0.6985853030295077,
      "learning_rate": 1.7307693801493619e-06,
      "loss": 0.4407,
      "step": 1250
    },
    {
      "epoch": 0.7612718072653123,
      "grad_norm": 0.7016362891096635,
      "learning_rate": 1.6517363200809222e-06,
      "loss": 0.4488,
      "step": 1260
    },
    {
      "epoch": 0.7673136470055132,
      "grad_norm": 0.8402985241208492,
      "learning_rate": 1.5741916980888267e-06,
      "loss": 0.435,
      "step": 1270
    },
    {
      "epoch": 0.7733554867457141,
      "grad_norm": 0.8240304279049092,
      "learning_rate": 1.4981699858811038e-06,
      "loss": 0.4436,
      "step": 1280
    },
    {
      "epoch": 0.779397326485915,
      "grad_norm": 0.6660945380738031,
      "learning_rate": 1.423704978171046e-06,
      "loss": 0.4407,
      "step": 1290
    },
    {
      "epoch": 0.7854391662261159,
      "grad_norm": 0.5874782271939601,
      "learning_rate": 1.3508297776540845e-06,
      "loss": 0.4498,
      "step": 1300
    },
    {
      "epoch": 0.7914810059663168,
      "grad_norm": 0.5796653208549384,
      "learning_rate": 1.2795767802923192e-06,
      "loss": 0.4352,
      "step": 1310
    },
    {
      "epoch": 0.7975228457065177,
      "grad_norm": 0.9463374855160357,
      "learning_rate": 1.2099776609132048e-06,
      "loss": 0.4384,
      "step": 1320
    },
    {
      "epoch": 0.8035646854467186,
      "grad_norm": 0.6012369149099516,
      "learning_rate": 1.1420633591288072e-06,
      "loss": 0.441,
      "step": 1330
    },
    {
      "epoch": 0.8096065251869194,
      "grad_norm": 0.7550415373921882,
      "learning_rate": 1.0758640655819107e-06,
      "loss": 0.4375,
      "step": 1340
    },
    {
      "epoch": 0.8156483649271203,
      "grad_norm": 0.7006495875182843,
      "learning_rate": 1.0114092085250566e-06,
      "loss": 0.4439,
      "step": 1350
    },
    {
      "epoch": 0.8216902046673212,
      "grad_norm": 0.6206161322638641,
      "learning_rate": 9.487274407384972e-07,
      "loss": 0.4359,
      "step": 1360
    },
    {
      "epoch": 0.8277320444075221,
      "grad_norm": 0.6290155968597686,
      "learning_rate": 8.878466267928814e-07,
      "loss": 0.4366,
      "step": 1370
    },
    {
      "epoch": 0.833773884147723,
      "grad_norm": 0.6195315234512612,
      "learning_rate": 8.287938306623349e-07,
      "loss": 0.4398,
      "step": 1380
    },
    {
      "epoch": 0.8398157238879239,
      "grad_norm": 0.8102446928428507,
      "learning_rate": 7.715953036934304e-07,
      "loss": 0.4409,
      "step": 1390
    },
    {
      "epoch": 0.8458575636281248,
      "grad_norm": 0.789309132604837,
      "learning_rate": 7.162764729354033e-07,
      "loss": 0.4407,
      "step": 1400
    },
    {
      "epoch": 0.8518994033683257,
      "grad_norm": 0.6558463034047459,
      "learning_rate": 6.628619298368133e-07,
      "loss": 0.4297,
      "step": 1410
    },
    {
      "epoch": 0.8579412431085266,
      "grad_norm": 0.6589202868449741,
      "learning_rate": 6.11375419313644e-07,
      "loss": 0.4397,
      "step": 1420
    },
    {
      "epoch": 0.8639830828487275,
      "grad_norm": 0.6171089586498126,
      "learning_rate": 5.618398291937393e-07,
      "loss": 0.4473,
      "step": 1430
    },
    {
      "epoch": 0.8700249225889283,
      "grad_norm": 0.8171417567736048,
      "learning_rate": 5.142771800422403e-07,
      "loss": 0.4424,
      "step": 1440
    },
    {
      "epoch": 0.8760667623291292,
      "grad_norm": 0.813136783771392,
      "learning_rate": 4.687086153725534e-07,
      "loss": 0.4412,
      "step": 1450
    },
    {
      "epoch": 0.8821086020693301,
      "grad_norm": 0.6976453824184702,
      "learning_rate": 4.2515439224721066e-07,
      "loss": 0.4332,
      "step": 1460
    },
    {
      "epoch": 0.888150441809531,
      "grad_norm": 0.5748251232369562,
      "learning_rate": 3.8363387227278947e-07,
      "loss": 0.4348,
      "step": 1470
    },
    {
      "epoch": 0.8941922815497318,
      "grad_norm": 0.720005515762108,
      "learning_rate": 3.441655129928972e-07,
      "loss": 0.4442,
      "step": 1480
    },
    {
      "epoch": 0.9002341212899327,
      "grad_norm": 0.7252408165512142,
      "learning_rate": 3.067668596830481e-07,
      "loss": 0.4395,
      "step": 1490
    },
    {
      "epoch": 0.9062759610301336,
      "grad_norm": 0.7144631857492548,
      "learning_rate": 2.714545375510852e-07,
      "loss": 0.4366,
      "step": 1500
    },
    {
      "epoch": 0.9123178007703345,
      "grad_norm": 0.6635911944336934,
      "learning_rate": 2.382442443466043e-07,
      "loss": 0.4321,
      "step": 1510
    },
    {
      "epoch": 0.9183596405105354,
      "grad_norm": 0.7758826608638922,
      "learning_rate": 2.0715074338266915e-07,
      "loss": 0.4312,
      "step": 1520
    },
    {
      "epoch": 0.9244014802507363,
      "grad_norm": 0.7230232505488107,
      "learning_rate": 1.7818785697292895e-07,
      "loss": 0.4388,
      "step": 1530
    },
    {
      "epoch": 0.9304433199909372,
      "grad_norm": 0.6580184350979571,
      "learning_rate": 1.5136846028704132e-07,
      "loss": 0.4491,
      "step": 1540
    },
    {
      "epoch": 0.9364851597311381,
      "grad_norm": 0.5586825913101796,
      "learning_rate": 1.2670447562713684e-07,
      "loss": 0.4322,
      "step": 1550
    },
    {
      "epoch": 0.942526999471339,
      "grad_norm": 0.647195128183336,
      "learning_rate": 1.042068671278823e-07,
      "loss": 0.4354,
      "step": 1560
    },
    {
      "epoch": 0.9485688392115399,
      "grad_norm": 0.6035271594962103,
      "learning_rate": 8.388563588247523e-08,
      "loss": 0.4316,
      "step": 1570
    },
    {
      "epoch": 0.9546106789517408,
      "grad_norm": 0.6113112433338288,
      "learning_rate": 6.574981549676007e-08,
      "loss": 0.4354,
      "step": 1580
    },
    {
      "epoch": 0.9606525186919417,
      "grad_norm": 0.7146951417286433,
      "learning_rate": 4.980746807342285e-08,
      "loss": 0.4331,
      "step": 1590
    },
    {
      "epoch": 0.9666943584321426,
      "grad_norm": 0.6521596489801758,
      "learning_rate": 3.6065680628062924e-08,
      "loss": 0.4397,
      "step": 1600
    },
    {
      "epoch": 0.9727361981723435,
      "grad_norm": 0.8208153994914311,
      "learning_rate": 2.4530561938729825e-08,
      "loss": 0.4266,
      "step": 1610
    },
    {
      "epoch": 0.9787780379125444,
      "grad_norm": 0.6500288352676025,
      "learning_rate": 1.520723983032324e-08,
      "loss": 0.4335,
      "step": 1620
    },
    {
      "epoch": 0.9848198776527453,
      "grad_norm": 0.6538530895079608,
      "learning_rate": 8.099858895072587e-09,
      "loss": 0.4373,
      "step": 1630
    },
    {
      "epoch": 0.9908617173929462,
      "grad_norm": 0.8433082091701005,
      "learning_rate": 3.2115786500924728e-09,
      "loss": 0.4371,
      "step": 1640
    },
    {
      "epoch": 0.996903557133147,
      "grad_norm": 0.6909298344072548,
      "learning_rate": 5.445721328567466e-10,
      "loss": 0.4329,
      "step": 1650
    },
    {
      "epoch": 1.0,
      "step": 1656,
      "total_flos": 2042833294327808.0,
      "train_loss": 0.4611289887036678,
      "train_runtime": 68178.0817,
      "train_samples_per_second": 1.554,
      "train_steps_per_second": 0.024
    }
  ],
  "logging_steps": 10,
  "max_steps": 1656,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2042833294327808.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}