| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.458882239345328, |
| "eval_steps": 500, |
| "global_step": 3000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0015296074644844267, |
| "grad_norm": 39.202430725097656, |
| "learning_rate": 4.0000000000000003e-07, |
| "loss": 2.2544, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.0030592149289688534, |
| "grad_norm": 6.937076568603516, |
| "learning_rate": 8.000000000000001e-07, |
| "loss": 1.9815, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.00458882239345328, |
| "grad_norm": 12.800159454345703, |
| "learning_rate": 1.2000000000000002e-06, |
| "loss": 1.8989, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.006118429857937707, |
| "grad_norm": 18.019540786743164, |
| "learning_rate": 1.6000000000000001e-06, |
| "loss": 1.894, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.007648037322422133, |
| "grad_norm": 19.988229751586914, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 1.8713, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.00917764478690656, |
| "grad_norm": 21.80278968811035, |
| "learning_rate": 2.4000000000000003e-06, |
| "loss": 1.8662, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.010707252251390987, |
| "grad_norm": 23.45534324645996, |
| "learning_rate": 2.8000000000000003e-06, |
| "loss": 1.8661, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.012236859715875414, |
| "grad_norm": 23.260648727416992, |
| "learning_rate": 3.2000000000000003e-06, |
| "loss": 1.8222, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.013766467180359841, |
| "grad_norm": 23.629636764526367, |
| "learning_rate": 3.6000000000000003e-06, |
| "loss": 1.8312, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.015296074644844266, |
| "grad_norm": 25.211572647094727, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 1.8441, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.016825682109328693, |
| "grad_norm": 23.6651668548584, |
| "learning_rate": 4.4e-06, |
| "loss": 1.8113, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.01835528957381312, |
| "grad_norm": 24.198888778686523, |
| "learning_rate": 4.800000000000001e-06, |
| "loss": 1.8209, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.019884897038297548, |
| "grad_norm": 21.0020694732666, |
| "learning_rate": 5.2e-06, |
| "loss": 1.8178, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.021414504502781973, |
| "grad_norm": 22.20139503479004, |
| "learning_rate": 5.600000000000001e-06, |
| "loss": 1.8184, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.0229441119672664, |
| "grad_norm": 20.499448776245117, |
| "learning_rate": 6e-06, |
| "loss": 1.7967, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.024473719431750827, |
| "grad_norm": 27.421058654785156, |
| "learning_rate": 6.4000000000000006e-06, |
| "loss": 1.7887, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.026003326896235253, |
| "grad_norm": 19.557756423950195, |
| "learning_rate": 6.800000000000001e-06, |
| "loss": 1.7954, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.027532934360719682, |
| "grad_norm": 27.11305809020996, |
| "learning_rate": 7.2000000000000005e-06, |
| "loss": 1.7946, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.029062541825204107, |
| "grad_norm": 19.00750160217285, |
| "learning_rate": 7.600000000000001e-06, |
| "loss": 1.7845, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.030592149289688533, |
| "grad_norm": 18.32988929748535, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 1.7817, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.03212175675417296, |
| "grad_norm": 22.121030807495117, |
| "learning_rate": 8.400000000000001e-06, |
| "loss": 1.8228, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.03365136421865739, |
| "grad_norm": 30.522912979125977, |
| "learning_rate": 8.8e-06, |
| "loss": 1.8359, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.03518097168314181, |
| "grad_norm": 26.51453399658203, |
| "learning_rate": 9.200000000000002e-06, |
| "loss": 1.7971, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.03671057914762624, |
| "grad_norm": 28.695058822631836, |
| "learning_rate": 9.600000000000001e-06, |
| "loss": 1.8302, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.03824018661211067, |
| "grad_norm": 27.09485626220703, |
| "learning_rate": 1e-05, |
| "loss": 1.829, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.039769794076595096, |
| "grad_norm": 19.422021865844727, |
| "learning_rate": 1.04e-05, |
| "loss": 1.7814, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.04129940154107952, |
| "grad_norm": 26.004735946655273, |
| "learning_rate": 1.0800000000000002e-05, |
| "loss": 1.8323, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.042829009005563946, |
| "grad_norm": 21.66231346130371, |
| "learning_rate": 1.1200000000000001e-05, |
| "loss": 1.8503, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.04435861647004837, |
| "grad_norm": 16.89419174194336, |
| "learning_rate": 1.16e-05, |
| "loss": 1.8079, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.0458882239345328, |
| "grad_norm": 11.961407661437988, |
| "learning_rate": 1.2e-05, |
| "loss": 1.8598, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.04741783139901723, |
| "grad_norm": 22.449304580688477, |
| "learning_rate": 1.2400000000000002e-05, |
| "loss": 1.8461, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.048947438863501655, |
| "grad_norm": 17.80685806274414, |
| "learning_rate": 1.2800000000000001e-05, |
| "loss": 1.8374, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.05047704632798608, |
| "grad_norm": 20.850351333618164, |
| "learning_rate": 1.3200000000000002e-05, |
| "loss": 1.8325, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.052006653792470506, |
| "grad_norm": 18.351491928100586, |
| "learning_rate": 1.3600000000000002e-05, |
| "loss": 1.8575, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.05353626125695493, |
| "grad_norm": 23.839378356933594, |
| "learning_rate": 1.4e-05, |
| "loss": 1.8521, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.055065868721439364, |
| "grad_norm": 17.616167068481445, |
| "learning_rate": 1.4400000000000001e-05, |
| "loss": 1.8278, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.05659547618592379, |
| "grad_norm": 25.248546600341797, |
| "learning_rate": 1.48e-05, |
| "loss": 1.8904, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.058125083650408214, |
| "grad_norm": 23.11628532409668, |
| "learning_rate": 1.5200000000000002e-05, |
| "loss": 1.8552, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.05965469111489264, |
| "grad_norm": 21.66451072692871, |
| "learning_rate": 1.5600000000000003e-05, |
| "loss": 1.8795, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.061184298579377065, |
| "grad_norm": 22.456846237182617, |
| "learning_rate": 1.6000000000000003e-05, |
| "loss": 1.8819, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.0627139060438615, |
| "grad_norm": 18.99188232421875, |
| "learning_rate": 1.64e-05, |
| "loss": 1.8874, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.06424351350834592, |
| "grad_norm": 20.916900634765625, |
| "learning_rate": 1.6800000000000002e-05, |
| "loss": 1.8895, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.06577312097283035, |
| "grad_norm": 25.91555404663086, |
| "learning_rate": 1.72e-05, |
| "loss": 1.8769, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.06730272843731477, |
| "grad_norm": 20.888111114501953, |
| "learning_rate": 1.76e-05, |
| "loss": 1.8992, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.0688323359017992, |
| "grad_norm": 18.96579933166504, |
| "learning_rate": 1.8e-05, |
| "loss": 1.9045, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.07036194336628362, |
| "grad_norm": 27.60817527770996, |
| "learning_rate": 1.8400000000000003e-05, |
| "loss": 1.8765, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.07189155083076805, |
| "grad_norm": 21.365825653076172, |
| "learning_rate": 1.88e-05, |
| "loss": 1.9315, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.07342115829525248, |
| "grad_norm": 21.732152938842773, |
| "learning_rate": 1.9200000000000003e-05, |
| "loss": 1.9313, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.0749507657597369, |
| "grad_norm": 21.766752243041992, |
| "learning_rate": 1.9600000000000002e-05, |
| "loss": 1.9151, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.07648037322422134, |
| "grad_norm": 21.943374633789062, |
| "learning_rate": 2e-05, |
| "loss": 1.9768, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.07800998068870577, |
| "grad_norm": 20.251033782958984, |
| "learning_rate": 1.9999756307053947e-05, |
| "loss": 1.9019, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.07953958815319019, |
| "grad_norm": 26.903671264648438, |
| "learning_rate": 1.9999025240093045e-05, |
| "loss": 1.9229, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.08106919561767462, |
| "grad_norm": 17.911582946777344, |
| "learning_rate": 1.9997806834748455e-05, |
| "loss": 1.9123, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.08259880308215904, |
| "grad_norm": 17.10667610168457, |
| "learning_rate": 1.9996101150403543e-05, |
| "loss": 1.8725, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.08412841054664347, |
| "grad_norm": 23.082595825195312, |
| "learning_rate": 1.999390827019096e-05, |
| "loss": 1.9008, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.08565801801112789, |
| "grad_norm": 16.786951065063477, |
| "learning_rate": 1.9991228300988586e-05, |
| "loss": 1.9283, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.08718762547561232, |
| "grad_norm": 22.904605865478516, |
| "learning_rate": 1.9988061373414342e-05, |
| "loss": 1.886, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.08871723294009674, |
| "grad_norm": 18.19251823425293, |
| "learning_rate": 1.9984407641819812e-05, |
| "loss": 1.8846, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.09024684040458117, |
| "grad_norm": 17.656436920166016, |
| "learning_rate": 1.9980267284282718e-05, |
| "loss": 1.8561, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.0917764478690656, |
| "grad_norm": 20.029586791992188, |
| "learning_rate": 1.9975640502598243e-05, |
| "loss": 1.867, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.09330605533355003, |
| "grad_norm": 15.413036346435547, |
| "learning_rate": 1.9970527522269204e-05, |
| "loss": 1.867, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.09483566279803446, |
| "grad_norm": 17.78459930419922, |
| "learning_rate": 1.9964928592495046e-05, |
| "loss": 1.8522, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.09636527026251888, |
| "grad_norm": 18.63687515258789, |
| "learning_rate": 1.9958843986159705e-05, |
| "loss": 1.8715, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.09789487772700331, |
| "grad_norm": 18.381534576416016, |
| "learning_rate": 1.9952273999818312e-05, |
| "loss": 1.8483, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.09942448519148774, |
| "grad_norm": 17.309951782226562, |
| "learning_rate": 1.9945218953682736e-05, |
| "loss": 1.8411, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.10095409265597216, |
| "grad_norm": 16.78116798400879, |
| "learning_rate": 1.9937679191605964e-05, |
| "loss": 1.8417, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.10248370012045659, |
| "grad_norm": 14.930190086364746, |
| "learning_rate": 1.992965508106537e-05, |
| "loss": 1.7848, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.10401330758494101, |
| "grad_norm": 15.964579582214355, |
| "learning_rate": 1.9921147013144782e-05, |
| "loss": 1.8235, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.10554291504942544, |
| "grad_norm": 19.765722274780273, |
| "learning_rate": 1.991215540251542e-05, |
| "loss": 1.8351, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.10707252251390986, |
| "grad_norm": 22.259653091430664, |
| "learning_rate": 1.9902680687415704e-05, |
| "loss": 1.873, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.10860212997839429, |
| "grad_norm": 17.007463455200195, |
| "learning_rate": 1.9892723329629885e-05, |
| "loss": 1.7792, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.11013173744287873, |
| "grad_norm": 16.560100555419922, |
| "learning_rate": 1.988228381446553e-05, |
| "loss": 1.7872, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.11166134490736315, |
| "grad_norm": 20.133487701416016, |
| "learning_rate": 1.987136265072988e-05, |
| "loss": 1.8089, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.11319095237184758, |
| "grad_norm": 13.525500297546387, |
| "learning_rate": 1.985996037070505e-05, |
| "loss": 1.8169, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.114720559836332, |
| "grad_norm": 17.305484771728516, |
| "learning_rate": 1.9848077530122083e-05, |
| "loss": 1.8002, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.11625016730081643, |
| "grad_norm": 14.32529067993164, |
| "learning_rate": 1.983571470813386e-05, |
| "loss": 1.7969, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.11777977476530085, |
| "grad_norm": 16.328943252563477, |
| "learning_rate": 1.982287250728689e-05, |
| "loss": 1.7679, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.11930938222978528, |
| "grad_norm": 10.76165771484375, |
| "learning_rate": 1.9809551553491918e-05, |
| "loss": 1.8304, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.1208389896942697, |
| "grad_norm": 14.789531707763672, |
| "learning_rate": 1.979575249599344e-05, |
| "loss": 1.8215, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.12236859715875413, |
| "grad_norm": 16.026445388793945, |
| "learning_rate": 1.9781476007338058e-05, |
| "loss": 1.7581, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.12389820462323856, |
| "grad_norm": 17.013036727905273, |
| "learning_rate": 1.9766722783341682e-05, |
| "loss": 1.7698, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.125427812087723, |
| "grad_norm": 24.33189582824707, |
| "learning_rate": 1.9751493543055634e-05, |
| "loss": 1.7896, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.1269574195522074, |
| "grad_norm": 18.09388542175293, |
| "learning_rate": 1.9735789028731603e-05, |
| "loss": 1.754, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.12848702701669185, |
| "grad_norm": 12.945524215698242, |
| "learning_rate": 1.9719610005785466e-05, |
| "loss": 1.7693, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.13001663448117626, |
| "grad_norm": 12.579715728759766, |
| "learning_rate": 1.9702957262759964e-05, |
| "loss": 1.7323, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.1315462419456607, |
| "grad_norm": 13.929019927978516, |
| "learning_rate": 1.9685831611286312e-05, |
| "loss": 1.7814, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.1330758494101451, |
| "grad_norm": 16.18221664428711, |
| "learning_rate": 1.9668233886044597e-05, |
| "loss": 1.7715, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.13460545687462955, |
| "grad_norm": 17.0338191986084, |
| "learning_rate": 1.9650164944723116e-05, |
| "loss": 1.7783, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.136135064339114, |
| "grad_norm": 12.997424125671387, |
| "learning_rate": 1.9631625667976584e-05, |
| "loss": 1.7303, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.1376646718035984, |
| "grad_norm": 14.024730682373047, |
| "learning_rate": 1.961261695938319e-05, |
| "loss": 1.7659, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.13919427926808284, |
| "grad_norm": 15.084492683410645, |
| "learning_rate": 1.9593139745400575e-05, |
| "loss": 1.7504, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.14072388673256725, |
| "grad_norm": 13.797721862792969, |
| "learning_rate": 1.9573194975320672e-05, |
| "loss": 1.7514, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.1422534941970517, |
| "grad_norm": 12.69803237915039, |
| "learning_rate": 1.9552783621223437e-05, |
| "loss": 1.7344, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.1437831016615361, |
| "grad_norm": 11.39012622833252, |
| "learning_rate": 1.9531906677929472e-05, |
| "loss": 1.7235, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.14531270912602054, |
| "grad_norm": 15.734513282775879, |
| "learning_rate": 1.9510565162951538e-05, |
| "loss": 1.7329, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.14684231659050495, |
| "grad_norm": 11.408859252929688, |
| "learning_rate": 1.9488760116444966e-05, |
| "loss": 1.7163, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.1483719240549894, |
| "grad_norm": 11.509827613830566, |
| "learning_rate": 1.9466492601156964e-05, |
| "loss": 1.7178, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.1499015315194738, |
| "grad_norm": 6.658020496368408, |
| "learning_rate": 1.944376370237481e-05, |
| "loss": 1.7269, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.15143113898395824, |
| "grad_norm": 15.084721565246582, |
| "learning_rate": 1.942057452787297e-05, |
| "loss": 1.7002, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.15296074644844268, |
| "grad_norm": 12.061055183410645, |
| "learning_rate": 1.9396926207859085e-05, |
| "loss": 1.7466, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.1544903539129271, |
| "grad_norm": 12.866460800170898, |
| "learning_rate": 1.937281989491892e-05, |
| "loss": 1.7213, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.15601996137741153, |
| "grad_norm": 10.788905143737793, |
| "learning_rate": 1.9348256763960146e-05, |
| "loss": 1.7204, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.15754956884189594, |
| "grad_norm": 14.169967651367188, |
| "learning_rate": 1.9323238012155125e-05, |
| "loss": 1.6906, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.15907917630638038, |
| "grad_norm": 15.22304916381836, |
| "learning_rate": 1.9297764858882516e-05, |
| "loss": 1.7012, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.1606087837708648, |
| "grad_norm": 15.920389175415039, |
| "learning_rate": 1.9271838545667876e-05, |
| "loss": 1.7151, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.16213839123534923, |
| "grad_norm": 14.644919395446777, |
| "learning_rate": 1.9245460336123136e-05, |
| "loss": 1.7011, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.16366799869983364, |
| "grad_norm": 11.079008102416992, |
| "learning_rate": 1.9218631515885007e-05, |
| "loss": 1.6773, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.16519760616431808, |
| "grad_norm": 13.276355743408203, |
| "learning_rate": 1.9191353392552346e-05, |
| "loss": 1.6895, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.1667272136288025, |
| "grad_norm": 12.620210647583008, |
| "learning_rate": 1.9163627295622397e-05, |
| "loss": 1.7153, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.16825682109328693, |
| "grad_norm": 13.213116645812988, |
| "learning_rate": 1.913545457642601e-05, |
| "loss": 1.6805, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.16978642855777137, |
| "grad_norm": 13.620598793029785, |
| "learning_rate": 1.910683660806177e-05, |
| "loss": 1.6882, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.17131603602225579, |
| "grad_norm": 14.6329927444458, |
| "learning_rate": 1.907777478532909e-05, |
| "loss": 1.6843, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.17284564348674022, |
| "grad_norm": 12.677013397216797, |
| "learning_rate": 1.9048270524660197e-05, |
| "loss": 1.7041, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.17437525095122464, |
| "grad_norm": 12.261626243591309, |
| "learning_rate": 1.901832526405114e-05, |
| "loss": 1.6819, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.17590485841570908, |
| "grad_norm": 14.508549690246582, |
| "learning_rate": 1.8987940462991673e-05, |
| "loss": 1.6847, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.1774344658801935, |
| "grad_norm": 14.293961524963379, |
| "learning_rate": 1.895711760239413e-05, |
| "loss": 1.6928, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.17896407334467793, |
| "grad_norm": 13.377256393432617, |
| "learning_rate": 1.892585818452126e-05, |
| "loss": 1.6838, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.18049368080916234, |
| "grad_norm": 12.398565292358398, |
| "learning_rate": 1.889416373291298e-05, |
| "loss": 1.6692, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.18202328827364678, |
| "grad_norm": 11.622918128967285, |
| "learning_rate": 1.8862035792312148e-05, |
| "loss": 1.6429, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.1835528957381312, |
| "grad_norm": 9.335916519165039, |
| "learning_rate": 1.8829475928589272e-05, |
| "loss": 1.6535, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.18508250320261563, |
| "grad_norm": 30.566650390625, |
| "learning_rate": 1.879648572866617e-05, |
| "loss": 1.6625, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.18661211066710007, |
| "grad_norm": 10.248709678649902, |
| "learning_rate": 1.8763066800438638e-05, |
| "loss": 1.657, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.18814171813158448, |
| "grad_norm": 12.403678894042969, |
| "learning_rate": 1.8729220772698096e-05, |
| "loss": 1.6588, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.18967132559606892, |
| "grad_norm": 12.880125999450684, |
| "learning_rate": 1.869494929505219e-05, |
| "loss": 1.6782, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.19120093306055333, |
| "grad_norm": 11.847280502319336, |
| "learning_rate": 1.866025403784439e-05, |
| "loss": 1.6612, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.19273054052503777, |
| "grad_norm": 13.305404663085938, |
| "learning_rate": 1.8625136692072577e-05, |
| "loss": 1.6565, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.19426014798952218, |
| "grad_norm": 14.423601150512695, |
| "learning_rate": 1.8589598969306646e-05, |
| "loss": 1.677, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.19578975545400662, |
| "grad_norm": 15.052698135375977, |
| "learning_rate": 1.855364260160507e-05, |
| "loss": 1.6467, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.19731936291849103, |
| "grad_norm": 11.820367813110352, |
| "learning_rate": 1.851726934143048e-05, |
| "loss": 1.6384, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.19884897038297547, |
| "grad_norm": 15.453312873840332, |
| "learning_rate": 1.848048096156426e-05, |
| "loss": 1.637, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.20037857784745988, |
| "grad_norm": 10.9462308883667, |
| "learning_rate": 1.8443279255020153e-05, |
| "loss": 1.6397, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.20190818531194432, |
| "grad_norm": 12.902162551879883, |
| "learning_rate": 1.8405666034956842e-05, |
| "loss": 1.6289, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.20343779277642876, |
| "grad_norm": 9.770879745483398, |
| "learning_rate": 1.836764313458962e-05, |
| "loss": 1.6349, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.20496740024091317, |
| "grad_norm": 11.91503620147705, |
| "learning_rate": 1.8329212407100996e-05, |
| "loss": 1.6101, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.2064970077053976, |
| "grad_norm": 9.018235206604004, |
| "learning_rate": 1.8290375725550417e-05, |
| "loss": 1.6194, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.20802661516988202, |
| "grad_norm": 13.213310241699219, |
| "learning_rate": 1.8251134982782952e-05, |
| "loss": 1.6173, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.20955622263436646, |
| "grad_norm": 13.184313774108887, |
| "learning_rate": 1.821149209133704e-05, |
| "loss": 1.644, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.21108583009885087, |
| "grad_norm": 11.71191692352295, |
| "learning_rate": 1.8171448983351284e-05, |
| "loss": 1.6355, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.2126154375633353, |
| "grad_norm": 11.976449966430664, |
| "learning_rate": 1.8131007610470278e-05, |
| "loss": 1.6196, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.21414504502781972, |
| "grad_norm": 12.10886287689209, |
| "learning_rate": 1.8090169943749477e-05, |
| "loss": 1.637, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.21567465249230416, |
| "grad_norm": 9.17182731628418, |
| "learning_rate": 1.804893797355914e-05, |
| "loss": 1.5883, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.21720425995678858, |
| "grad_norm": 15.71194076538086, |
| "learning_rate": 1.8007313709487334e-05, |
| "loss": 1.6215, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.21873386742127301, |
| "grad_norm": 10.610137939453125, |
| "learning_rate": 1.7965299180241963e-05, |
| "loss": 1.6228, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.22026347488575745, |
| "grad_norm": 12.10354232788086, |
| "learning_rate": 1.792289643355191e-05, |
| "loss": 1.6052, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.22179308235024187, |
| "grad_norm": 10.069908142089844, |
| "learning_rate": 1.788010753606722e-05, |
| "loss": 1.5964, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.2233226898147263, |
| "grad_norm": 12.154913902282715, |
| "learning_rate": 1.78369345732584e-05, |
| "loss": 1.6269, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.22485229727921072, |
| "grad_norm": 12.322149276733398, |
| "learning_rate": 1.7793379649314743e-05, |
| "loss": 1.5855, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.22638190474369516, |
| "grad_norm": 10.875051498413086, |
| "learning_rate": 1.7749444887041797e-05, |
| "loss": 1.6005, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.22791151220817957, |
| "grad_norm": 11.17204761505127, |
| "learning_rate": 1.7705132427757895e-05, |
| "loss": 1.5849, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.229441119672664, |
| "grad_norm": 10.732039451599121, |
| "learning_rate": 1.766044443118978e-05, |
| "loss": 1.5825, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.23097072713714842, |
| "grad_norm": 13.450652122497559, |
| "learning_rate": 1.761538307536737e-05, |
| "loss": 1.6146, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.23250033460163286, |
| "grad_norm": 11.979947090148926, |
| "learning_rate": 1.7569950556517566e-05, |
| "loss": 1.5928, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.23402994206611727, |
| "grad_norm": 11.305917739868164, |
| "learning_rate": 1.7524149088957244e-05, |
| "loss": 1.5866, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.2355595495306017, |
| "grad_norm": 10.360641479492188, |
| "learning_rate": 1.747798090498532e-05, |
| "loss": 1.6028, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.23708915699508615, |
| "grad_norm": 8.656867027282715, |
| "learning_rate": 1.7431448254773943e-05, |
| "loss": 1.6014, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.23861876445957056, |
| "grad_norm": 11.687288284301758, |
| "learning_rate": 1.7384553406258842e-05, |
| "loss": 1.5948, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.240148371924055, |
| "grad_norm": 10.943402290344238, |
| "learning_rate": 1.7337298645028764e-05, |
| "loss": 1.5923, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.2416779793885394, |
| "grad_norm": 10.023202896118164, |
| "learning_rate": 1.7289686274214116e-05, |
| "loss": 1.5719, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.24320758685302385, |
| "grad_norm": 9.700725555419922, |
| "learning_rate": 1.7241718614374678e-05, |
| "loss": 1.5872, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.24473719431750826, |
| "grad_norm": 11.386409759521484, |
| "learning_rate": 1.7193398003386514e-05, |
| "loss": 1.5832, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.2462668017819927, |
| "grad_norm": 9.78838062286377, |
| "learning_rate": 1.7144726796328034e-05, |
| "loss": 1.5904, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.2477964092464771, |
| "grad_norm": 12.67321491241455, |
| "learning_rate": 1.709570736536521e-05, |
| "loss": 1.5779, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.24932601671096155, |
| "grad_norm": 10.230249404907227, |
| "learning_rate": 1.7046342099635948e-05, |
| "loss": 1.5931, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.250855624175446, |
| "grad_norm": 9.709312438964844, |
| "learning_rate": 1.6996633405133656e-05, |
| "loss": 1.5728, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.25238523163993043, |
| "grad_norm": 10.340200424194336, |
| "learning_rate": 1.6946583704589973e-05, |
| "loss": 1.561, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.2539148391044148, |
| "grad_norm": 9.730533599853516, |
| "learning_rate": 1.68961954373567e-05, |
| "loss": 1.5487, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.25544444656889925, |
| "grad_norm": 8.787372589111328, |
| "learning_rate": 1.684547105928689e-05, |
| "loss": 1.5704, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.2569740540333837, |
| "grad_norm": 9.325871467590332, |
| "learning_rate": 1.6794413042615168e-05, |
| "loss": 1.5352, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.25850366149786813, |
| "grad_norm": 10.094749450683594, |
| "learning_rate": 1.6743023875837233e-05, |
| "loss": 1.5718, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.2600332689623525, |
| "grad_norm": 9.554350852966309, |
| "learning_rate": 1.6691306063588583e-05, |
| "loss": 1.5563, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.26156287642683695, |
| "grad_norm": 10.526246070861816, |
| "learning_rate": 1.6639262126522417e-05, |
| "loss": 1.5399, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.2630924838913214, |
| "grad_norm": 8.69581413269043, |
| "learning_rate": 1.6586894601186804e-05, |
| "loss": 1.5456, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.26462209135580583, |
| "grad_norm": 9.803443908691406, |
| "learning_rate": 1.6534206039901057e-05, |
| "loss": 1.54, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.2661516988202902, |
| "grad_norm": 9.926239013671875, |
| "learning_rate": 1.6481199010631312e-05, |
| "loss": 1.5468, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.26768130628477466, |
| "grad_norm": 11.072277069091797, |
| "learning_rate": 1.6427876096865394e-05, |
| "loss": 1.5495, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.2692109137492591, |
| "grad_norm": 10.02304744720459, |
| "learning_rate": 1.63742398974869e-05, |
| "loss": 1.567, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.27074052121374353, |
| "grad_norm": 9.076205253601074, |
| "learning_rate": 1.632029302664851e-05, |
| "loss": 1.553, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.272270128678228, |
| "grad_norm": 7.942831039428711, |
| "learning_rate": 1.6266038113644605e-05, |
| "loss": 1.5408, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.27379973614271236, |
| "grad_norm": 10.165443420410156, |
| "learning_rate": 1.6211477802783105e-05, |
| "loss": 1.5317, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.2753293436071968, |
| "grad_norm": 10.491219520568848, |
| "learning_rate": 1.6156614753256583e-05, |
| "loss": 1.5528, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.27685895107168124, |
| "grad_norm": 4.210089206695557, |
| "learning_rate": 1.610145163901268e-05, |
| "loss": 1.5295, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.2783885585361657, |
| "grad_norm": 12.827298164367676, |
| "learning_rate": 1.6045991148623752e-05, |
| "loss": 1.5366, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.27991816600065006, |
| "grad_norm": 11.551325798034668, |
| "learning_rate": 1.599023598515586e-05, |
| "loss": 1.5226, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.2814477734651345, |
| "grad_norm": 11.47888469696045, |
| "learning_rate": 1.5934188866037017e-05, |
| "loss": 1.5285, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.28297738092961894, |
| "grad_norm": 9.28005599975586, |
| "learning_rate": 1.5877852522924733e-05, |
| "loss": 1.504, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.2845069883941034, |
| "grad_norm": 10.573029518127441, |
| "learning_rate": 1.5821229701572897e-05, |
| "loss": 1.5212, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.2860365958585878, |
| "grad_norm": 11.156023025512695, |
| "learning_rate": 1.5764323161697933e-05, |
| "loss": 1.5257, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.2875662033230722, |
| "grad_norm": 9.835415840148926, |
| "learning_rate": 1.570713567684432e-05, |
| "loss": 1.5119, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.28909581078755664, |
| "grad_norm": 8.080103874206543, |
| "learning_rate": 1.564967003424938e-05, |
| "loss": 1.5048, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.2906254182520411, |
| "grad_norm": 10.452638626098633, |
| "learning_rate": 1.5591929034707468e-05, |
| "loss": 1.5197, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.2921550257165255, |
| "grad_norm": 10.441474914550781, |
| "learning_rate": 1.553391549243344e-05, |
| "loss": 1.5317, |
| "step": 1910 |
| }, |
| { |
| "epoch": 0.2936846331810099, |
| "grad_norm": 9.752622604370117, |
| "learning_rate": 1.5475632234925505e-05, |
| "loss": 1.5069, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.29521424064549434, |
| "grad_norm": 8.496747016906738, |
| "learning_rate": 1.54170821028274e-05, |
| "loss": 1.5285, |
| "step": 1930 |
| }, |
| { |
| "epoch": 0.2967438481099788, |
| "grad_norm": 9.566315650939941, |
| "learning_rate": 1.5358267949789968e-05, |
| "loss": 1.5162, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.2982734555744632, |
| "grad_norm": 6.093721389770508, |
| "learning_rate": 1.529919264233205e-05, |
| "loss": 1.5008, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.2998030630389476, |
| "grad_norm": 7.963770866394043, |
| "learning_rate": 1.5239859059700794e-05, |
| "loss": 1.4732, |
| "step": 1960 |
| }, |
| { |
| "epoch": 0.30133267050343204, |
| "grad_norm": 8.183985710144043, |
| "learning_rate": 1.5180270093731305e-05, |
| "loss": 1.5126, |
| "step": 1970 |
| }, |
| { |
| "epoch": 0.3028622779679165, |
| "grad_norm": 10.149001121520996, |
| "learning_rate": 1.5120428648705716e-05, |
| "loss": 1.5145, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.3043918854324009, |
| "grad_norm": 8.544008255004883, |
| "learning_rate": 1.5060337641211637e-05, |
| "loss": 1.5061, |
| "step": 1990 |
| }, |
| { |
| "epoch": 0.30592149289688536, |
| "grad_norm": 9.953203201293945, |
| "learning_rate": 1.5000000000000002e-05, |
| "loss": 1.5209, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.30745110036136974, |
| "grad_norm": 10.6850004196167, |
| "learning_rate": 1.493941866584231e-05, |
| "loss": 1.5163, |
| "step": 2010 |
| }, |
| { |
| "epoch": 0.3089807078258542, |
| "grad_norm": 8.49984359741211, |
| "learning_rate": 1.4878596591387329e-05, |
| "loss": 1.4868, |
| "step": 2020 |
| }, |
| { |
| "epoch": 0.3105103152903386, |
| "grad_norm": 8.950973510742188, |
| "learning_rate": 1.4817536741017153e-05, |
| "loss": 1.481, |
| "step": 2030 |
| }, |
| { |
| "epoch": 0.31203992275482306, |
| "grad_norm": 10.185256958007812, |
| "learning_rate": 1.4756242090702756e-05, |
| "loss": 1.49, |
| "step": 2040 |
| }, |
| { |
| "epoch": 0.31356953021930745, |
| "grad_norm": 8.98540210723877, |
| "learning_rate": 1.469471562785891e-05, |
| "loss": 1.4905, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.3150991376837919, |
| "grad_norm": 7.6039299964904785, |
| "learning_rate": 1.463296035119862e-05, |
| "loss": 1.4794, |
| "step": 2060 |
| }, |
| { |
| "epoch": 0.3166287451482763, |
| "grad_norm": 8.91348934173584, |
| "learning_rate": 1.4570979270586944e-05, |
| "loss": 1.4697, |
| "step": 2070 |
| }, |
| { |
| "epoch": 0.31815835261276076, |
| "grad_norm": 8.245038986206055, |
| "learning_rate": 1.4508775406894308e-05, |
| "loss": 1.4827, |
| "step": 2080 |
| }, |
| { |
| "epoch": 0.3196879600772452, |
| "grad_norm": 9.471598625183105, |
| "learning_rate": 1.4446351791849276e-05, |
| "loss": 1.4912, |
| "step": 2090 |
| }, |
| { |
| "epoch": 0.3212175675417296, |
| "grad_norm": 5.9019975662231445, |
| "learning_rate": 1.4383711467890776e-05, |
| "loss": 1.4708, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.322747175006214, |
| "grad_norm": 7.020793437957764, |
| "learning_rate": 1.4320857488019826e-05, |
| "loss": 1.4702, |
| "step": 2110 |
| }, |
| { |
| "epoch": 0.32427678247069847, |
| "grad_norm": 9.424378395080566, |
| "learning_rate": 1.4257792915650728e-05, |
| "loss": 1.4805, |
| "step": 2120 |
| }, |
| { |
| "epoch": 0.3258063899351829, |
| "grad_norm": 7.374673366546631, |
| "learning_rate": 1.4194520824461773e-05, |
| "loss": 1.4682, |
| "step": 2130 |
| }, |
| { |
| "epoch": 0.3273359973996673, |
| "grad_norm": 10.07297134399414, |
| "learning_rate": 1.413104429824542e-05, |
| "loss": 1.4817, |
| "step": 2140 |
| }, |
| { |
| "epoch": 0.32886560486415173, |
| "grad_norm": 8.525253295898438, |
| "learning_rate": 1.4067366430758004e-05, |
| "loss": 1.4686, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.33039521232863617, |
| "grad_norm": 9.093647956848145, |
| "learning_rate": 1.4003490325568953e-05, |
| "loss": 1.4779, |
| "step": 2160 |
| }, |
| { |
| "epoch": 0.3319248197931206, |
| "grad_norm": 7.984882831573486, |
| "learning_rate": 1.3939419095909513e-05, |
| "loss": 1.4756, |
| "step": 2170 |
| }, |
| { |
| "epoch": 0.333454427257605, |
| "grad_norm": 7.52358341217041, |
| "learning_rate": 1.3875155864521031e-05, |
| "loss": 1.4531, |
| "step": 2180 |
| }, |
| { |
| "epoch": 0.33498403472208943, |
| "grad_norm": 9.23783016204834, |
| "learning_rate": 1.3810703763502744e-05, |
| "loss": 1.4685, |
| "step": 2190 |
| }, |
| { |
| "epoch": 0.33651364218657387, |
| "grad_norm": 10.533625602722168, |
| "learning_rate": 1.3746065934159123e-05, |
| "loss": 1.4532, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.3380432496510583, |
| "grad_norm": 10.983015060424805, |
| "learning_rate": 1.3681245526846782e-05, |
| "loss": 1.4631, |
| "step": 2210 |
| }, |
| { |
| "epoch": 0.33957285711554275, |
| "grad_norm": 7.50883150100708, |
| "learning_rate": 1.3616245700820922e-05, |
| "loss": 1.4627, |
| "step": 2220 |
| }, |
| { |
| "epoch": 0.34110246458002713, |
| "grad_norm": 6.438501834869385, |
| "learning_rate": 1.3551069624081372e-05, |
| "loss": 1.4517, |
| "step": 2230 |
| }, |
| { |
| "epoch": 0.34263207204451157, |
| "grad_norm": 8.066999435424805, |
| "learning_rate": 1.3485720473218153e-05, |
| "loss": 1.4548, |
| "step": 2240 |
| }, |
| { |
| "epoch": 0.344161679508996, |
| "grad_norm": 8.656118392944336, |
| "learning_rate": 1.342020143325669e-05, |
| "loss": 1.4445, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.34569128697348045, |
| "grad_norm": 7.250131607055664, |
| "learning_rate": 1.3354515697502552e-05, |
| "loss": 1.4321, |
| "step": 2260 |
| }, |
| { |
| "epoch": 0.34722089443796483, |
| "grad_norm": 8.546892166137695, |
| "learning_rate": 1.3288666467385834e-05, |
| "loss": 1.4464, |
| "step": 2270 |
| }, |
| { |
| "epoch": 0.3487505019024493, |
| "grad_norm": 8.729716300964355, |
| "learning_rate": 1.3222656952305113e-05, |
| "loss": 1.4541, |
| "step": 2280 |
| }, |
| { |
| "epoch": 0.3502801093669337, |
| "grad_norm": 8.721868515014648, |
| "learning_rate": 1.3156490369471026e-05, |
| "loss": 1.4475, |
| "step": 2290 |
| }, |
| { |
| "epoch": 0.35180971683141815, |
| "grad_norm": 8.831208229064941, |
| "learning_rate": 1.3090169943749475e-05, |
| "loss": 1.4302, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.3533393242959026, |
| "grad_norm": 8.883235931396484, |
| "learning_rate": 1.3023698907504447e-05, |
| "loss": 1.4432, |
| "step": 2310 |
| }, |
| { |
| "epoch": 0.354868931760387, |
| "grad_norm": 8.794004440307617, |
| "learning_rate": 1.2957080500440469e-05, |
| "loss": 1.4444, |
| "step": 2320 |
| }, |
| { |
| "epoch": 0.3563985392248714, |
| "grad_norm": 4.8828935623168945, |
| "learning_rate": 1.2890317969444716e-05, |
| "loss": 1.4384, |
| "step": 2330 |
| }, |
| { |
| "epoch": 0.35792814668935585, |
| "grad_norm": 15.948801040649414, |
| "learning_rate": 1.2823414568428767e-05, |
| "loss": 1.4551, |
| "step": 2340 |
| }, |
| { |
| "epoch": 0.3594577541538403, |
| "grad_norm": 7.99323844909668, |
| "learning_rate": 1.2756373558169992e-05, |
| "loss": 1.4202, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.3609873616183247, |
| "grad_norm": 5.047421455383301, |
| "learning_rate": 1.2689198206152657e-05, |
| "loss": 1.4278, |
| "step": 2360 |
| }, |
| { |
| "epoch": 0.3625169690828091, |
| "grad_norm": 5.64467191696167, |
| "learning_rate": 1.2621891786408648e-05, |
| "loss": 1.4138, |
| "step": 2370 |
| }, |
| { |
| "epoch": 0.36404657654729355, |
| "grad_norm": 8.789236068725586, |
| "learning_rate": 1.2554457579357906e-05, |
| "loss": 1.4153, |
| "step": 2380 |
| }, |
| { |
| "epoch": 0.365576184011778, |
| "grad_norm": 8.083710670471191, |
| "learning_rate": 1.2486898871648552e-05, |
| "loss": 1.4093, |
| "step": 2390 |
| }, |
| { |
| "epoch": 0.3671057914762624, |
| "grad_norm": 5.929847240447998, |
| "learning_rate": 1.2419218955996677e-05, |
| "loss": 1.4153, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.3686353989407468, |
| "grad_norm": 7.852176666259766, |
| "learning_rate": 1.23514211310259e-05, |
| "loss": 1.4145, |
| "step": 2410 |
| }, |
| { |
| "epoch": 0.37016500640523126, |
| "grad_norm": 9.10372257232666, |
| "learning_rate": 1.2283508701106559e-05, |
| "loss": 1.414, |
| "step": 2420 |
| }, |
| { |
| "epoch": 0.3716946138697157, |
| "grad_norm": 9.65334415435791, |
| "learning_rate": 1.2215484976194675e-05, |
| "loss": 1.4363, |
| "step": 2430 |
| }, |
| { |
| "epoch": 0.37322422133420013, |
| "grad_norm": 7.612096786499023, |
| "learning_rate": 1.2147353271670634e-05, |
| "loss": 1.4188, |
| "step": 2440 |
| }, |
| { |
| "epoch": 0.3747538287986845, |
| "grad_norm": 5.171387672424316, |
| "learning_rate": 1.2079116908177592e-05, |
| "loss": 1.4108, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.37628343626316896, |
| "grad_norm": 6.974627494812012, |
| "learning_rate": 1.2010779211459649e-05, |
| "loss": 1.4164, |
| "step": 2460 |
| }, |
| { |
| "epoch": 0.3778130437276534, |
| "grad_norm": 7.995100021362305, |
| "learning_rate": 1.194234351219972e-05, |
| "loss": 1.4241, |
| "step": 2470 |
| }, |
| { |
| "epoch": 0.37934265119213784, |
| "grad_norm": 9.874503135681152, |
| "learning_rate": 1.187381314585725e-05, |
| "loss": 1.3957, |
| "step": 2480 |
| }, |
| { |
| "epoch": 0.3808722586566222, |
| "grad_norm": 9.190123558044434, |
| "learning_rate": 1.1805191452505602e-05, |
| "loss": 1.4234, |
| "step": 2490 |
| }, |
| { |
| "epoch": 0.38240186612110666, |
| "grad_norm": 9.317654609680176, |
| "learning_rate": 1.1736481776669307e-05, |
| "loss": 1.4251, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.3839314735855911, |
| "grad_norm": 8.3729829788208, |
| "learning_rate": 1.1667687467161025e-05, |
| "loss": 1.4024, |
| "step": 2510 |
| }, |
| { |
| "epoch": 0.38546108105007554, |
| "grad_norm": 8.835628509521484, |
| "learning_rate": 1.159881187691835e-05, |
| "loss": 1.4098, |
| "step": 2520 |
| }, |
| { |
| "epoch": 0.38699068851456, |
| "grad_norm": 8.480125427246094, |
| "learning_rate": 1.1529858362840383e-05, |
| "loss": 1.4, |
| "step": 2530 |
| }, |
| { |
| "epoch": 0.38852029597904436, |
| "grad_norm": 8.7410306930542, |
| "learning_rate": 1.1460830285624119e-05, |
| "loss": 1.4084, |
| "step": 2540 |
| }, |
| { |
| "epoch": 0.3900499034435288, |
| "grad_norm": 5.247309684753418, |
| "learning_rate": 1.1391731009600655e-05, |
| "loss": 1.3918, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.39157951090801324, |
| "grad_norm": 6.82070779800415, |
| "learning_rate": 1.1322563902571227e-05, |
| "loss": 1.3829, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.3931091183724977, |
| "grad_norm": 7.315955638885498, |
| "learning_rate": 1.1253332335643043e-05, |
| "loss": 1.3879, |
| "step": 2570 |
| }, |
| { |
| "epoch": 0.39463872583698206, |
| "grad_norm": 8.81851863861084, |
| "learning_rate": 1.1184039683065014e-05, |
| "loss": 1.3809, |
| "step": 2580 |
| }, |
| { |
| "epoch": 0.3961683333014665, |
| "grad_norm": 7.224653720855713, |
| "learning_rate": 1.1114689322063255e-05, |
| "loss": 1.3958, |
| "step": 2590 |
| }, |
| { |
| "epoch": 0.39769794076595094, |
| "grad_norm": 6.905256748199463, |
| "learning_rate": 1.1045284632676535e-05, |
| "loss": 1.3854, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.3992275482304354, |
| "grad_norm": 7.848160266876221, |
| "learning_rate": 1.0975828997591496e-05, |
| "loss": 1.3876, |
| "step": 2610 |
| }, |
| { |
| "epoch": 0.40075715569491976, |
| "grad_norm": 6.962865352630615, |
| "learning_rate": 1.0906325801977804e-05, |
| "loss": 1.3909, |
| "step": 2620 |
| }, |
| { |
| "epoch": 0.4022867631594042, |
| "grad_norm": 7.401321887969971, |
| "learning_rate": 1.083677843332316e-05, |
| "loss": 1.3825, |
| "step": 2630 |
| }, |
| { |
| "epoch": 0.40381637062388864, |
| "grad_norm": 5.543237209320068, |
| "learning_rate": 1.0767190281268187e-05, |
| "loss": 1.3773, |
| "step": 2640 |
| }, |
| { |
| "epoch": 0.4053459780883731, |
| "grad_norm": 7.603894233703613, |
| "learning_rate": 1.0697564737441254e-05, |
| "loss": 1.3961, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.4068755855528575, |
| "grad_norm": 7.263538837432861, |
| "learning_rate": 1.0627905195293135e-05, |
| "loss": 1.3696, |
| "step": 2660 |
| }, |
| { |
| "epoch": 0.4084051930173419, |
| "grad_norm": 9.08191967010498, |
| "learning_rate": 1.055821504993164e-05, |
| "loss": 1.374, |
| "step": 2670 |
| }, |
| { |
| "epoch": 0.40993480048182634, |
| "grad_norm": 6.848121166229248, |
| "learning_rate": 1.0488497697956134e-05, |
| "loss": 1.3765, |
| "step": 2680 |
| }, |
| { |
| "epoch": 0.4114644079463108, |
| "grad_norm": 6.978294849395752, |
| "learning_rate": 1.0418756537291996e-05, |
| "loss": 1.375, |
| "step": 2690 |
| }, |
| { |
| "epoch": 0.4129940154107952, |
| "grad_norm": 7.876370429992676, |
| "learning_rate": 1.0348994967025012e-05, |
| "loss": 1.3681, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.4145236228752796, |
| "grad_norm": 5.700248718261719, |
| "learning_rate": 1.0279216387235691e-05, |
| "loss": 1.3733, |
| "step": 2710 |
| }, |
| { |
| "epoch": 0.41605323033976405, |
| "grad_norm": 5.646441459655762, |
| "learning_rate": 1.0209424198833571e-05, |
| "loss": 1.3487, |
| "step": 2720 |
| }, |
| { |
| "epoch": 0.4175828378042485, |
| "grad_norm": 6.236967086791992, |
| "learning_rate": 1.0139621803391454e-05, |
| "loss": 1.3621, |
| "step": 2730 |
| }, |
| { |
| "epoch": 0.4191124452687329, |
| "grad_norm": 6.606427192687988, |
| "learning_rate": 1.0069812602979617e-05, |
| "loss": 1.3551, |
| "step": 2740 |
| }, |
| { |
| "epoch": 0.42064205273321736, |
| "grad_norm": 5.275976181030273, |
| "learning_rate": 1e-05, |
| "loss": 1.3554, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.42217166019770175, |
| "grad_norm": 7.677177429199219, |
| "learning_rate": 9.930187397020385e-06, |
| "loss": 1.3601, |
| "step": 2760 |
| }, |
| { |
| "epoch": 0.4237012676621862, |
| "grad_norm": 7.980209827423096, |
| "learning_rate": 9.860378196608549e-06, |
| "loss": 1.3671, |
| "step": 2770 |
| }, |
| { |
| "epoch": 0.4252308751266706, |
| "grad_norm": 4.75565767288208, |
| "learning_rate": 9.790575801166432e-06, |
| "loss": 1.3557, |
| "step": 2780 |
| }, |
| { |
| "epoch": 0.42676048259115507, |
| "grad_norm": 5.737847328186035, |
| "learning_rate": 9.720783612764314e-06, |
| "loss": 1.3475, |
| "step": 2790 |
| }, |
| { |
| "epoch": 0.42829009005563945, |
| "grad_norm": 7.28814172744751, |
| "learning_rate": 9.651005032974994e-06, |
| "loss": 1.3516, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.4298196975201239, |
| "grad_norm": 6.400086879730225, |
| "learning_rate": 9.581243462708007e-06, |
| "loss": 1.3552, |
| "step": 2810 |
| }, |
| { |
| "epoch": 0.43134930498460833, |
| "grad_norm": 14.871561050415039, |
| "learning_rate": 9.511502302043867e-06, |
| "loss": 1.338, |
| "step": 2820 |
| }, |
| { |
| "epoch": 0.43287891244909277, |
| "grad_norm": 7.7634100914001465, |
| "learning_rate": 9.441784950068362e-06, |
| "loss": 1.346, |
| "step": 2830 |
| }, |
| { |
| "epoch": 0.43440851991357715, |
| "grad_norm": 7.211244106292725, |
| "learning_rate": 9.372094804706867e-06, |
| "loss": 1.3526, |
| "step": 2840 |
| }, |
| { |
| "epoch": 0.4359381273780616, |
| "grad_norm": 6.430041313171387, |
| "learning_rate": 9.302435262558748e-06, |
| "loss": 1.3343, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.43746773484254603, |
| "grad_norm": 5.903066635131836, |
| "learning_rate": 9.232809718731815e-06, |
| "loss": 1.3353, |
| "step": 2860 |
| }, |
| { |
| "epoch": 0.43899734230703047, |
| "grad_norm": 7.717010021209717, |
| "learning_rate": 9.163221566676847e-06, |
| "loss": 1.3438, |
| "step": 2870 |
| }, |
| { |
| "epoch": 0.4405269497715149, |
| "grad_norm": 7.673276901245117, |
| "learning_rate": 9.093674198022201e-06, |
| "loss": 1.3394, |
| "step": 2880 |
| }, |
| { |
| "epoch": 0.4420565572359993, |
| "grad_norm": 6.96506929397583, |
| "learning_rate": 9.024171002408507e-06, |
| "loss": 1.35, |
| "step": 2890 |
| }, |
| { |
| "epoch": 0.44358616470048373, |
| "grad_norm": 5.854609966278076, |
| "learning_rate": 8.954715367323468e-06, |
| "loss": 1.3376, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.44511577216496817, |
| "grad_norm": 6.3792572021484375, |
| "learning_rate": 8.885310677936746e-06, |
| "loss": 1.3359, |
| "step": 2910 |
| }, |
| { |
| "epoch": 0.4466453796294526, |
| "grad_norm": 6.389842510223389, |
| "learning_rate": 8.815960316934991e-06, |
| "loss": 1.3299, |
| "step": 2920 |
| }, |
| { |
| "epoch": 0.448174987093937, |
| "grad_norm": 6.603227138519287, |
| "learning_rate": 8.746667664356957e-06, |
| "loss": 1.323, |
| "step": 2930 |
| }, |
| { |
| "epoch": 0.44970459455842143, |
| "grad_norm": 6.4203338623046875, |
| "learning_rate": 8.677436097428775e-06, |
| "loss": 1.3459, |
| "step": 2940 |
| }, |
| { |
| "epoch": 0.4512342020229059, |
| "grad_norm": 5.568302154541016, |
| "learning_rate": 8.60826899039935e-06, |
| "loss": 1.3434, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.4527638094873903, |
| "grad_norm": 6.737658977508545, |
| "learning_rate": 8.539169714375885e-06, |
| "loss": 1.3146, |
| "step": 2960 |
| }, |
| { |
| "epoch": 0.45429341695187475, |
| "grad_norm": 5.4618940353393555, |
| "learning_rate": 8.47014163715962e-06, |
| "loss": 1.3205, |
| "step": 2970 |
| }, |
| { |
| "epoch": 0.45582302441635913, |
| "grad_norm": 6.360799312591553, |
| "learning_rate": 8.401188123081653e-06, |
| "loss": 1.3198, |
| "step": 2980 |
| }, |
| { |
| "epoch": 0.4573526318808436, |
| "grad_norm": 6.304644584655762, |
| "learning_rate": 8.332312532838978e-06, |
| "loss": 1.3261, |
| "step": 2990 |
| }, |
| { |
| "epoch": 0.458882239345328, |
| "grad_norm": 4.984965801239014, |
| "learning_rate": 8.263518223330698e-06, |
| "loss": 1.3181, |
| "step": 3000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 5000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 5.73984932364288e+17, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|