{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 2805, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010703773080010704, "grad_norm": 2.640625, "learning_rate": 1.993582887700535e-05, "loss": 1.5584056854248047, "step": 10 }, { "epoch": 0.02140754616002141, "grad_norm": 2.53125, "learning_rate": 1.9864527629233515e-05, "loss": 1.562470054626465, "step": 20 }, { "epoch": 0.03211131924003211, "grad_norm": 2.46875, "learning_rate": 1.9793226381461677e-05, "loss": 1.5967525482177733, "step": 30 }, { "epoch": 0.04281509232004282, "grad_norm": 2.546875, "learning_rate": 1.972192513368984e-05, "loss": 1.5231587409973144, "step": 40 }, { "epoch": 0.05351886540005352, "grad_norm": 2.015625, "learning_rate": 1.9650623885918005e-05, "loss": 1.4803240776062012, "step": 50 }, { "epoch": 0.06422263848006422, "grad_norm": 2.21875, "learning_rate": 1.957932263814617e-05, "loss": 1.5196543693542481, "step": 60 }, { "epoch": 0.07492641156007493, "grad_norm": 2.21875, "learning_rate": 1.9508021390374332e-05, "loss": 1.4593828201293946, "step": 70 }, { "epoch": 0.08563018464008564, "grad_norm": 2.28125, "learning_rate": 1.9436720142602497e-05, "loss": 1.4797739028930663, "step": 80 }, { "epoch": 0.09633395772009633, "grad_norm": 2.078125, "learning_rate": 1.9365418894830663e-05, "loss": 1.5016543388366699, "step": 90 }, { "epoch": 0.10703773080010703, "grad_norm": 2.6875, "learning_rate": 1.9294117647058825e-05, "loss": 1.4800514221191405, "step": 100 }, { "epoch": 0.11774150388011774, "grad_norm": 1.75, "learning_rate": 1.922281639928699e-05, "loss": 1.445749568939209, "step": 110 }, { "epoch": 0.12844527696012845, "grad_norm": 2.0625, "learning_rate": 1.9151515151515152e-05, "loss": 1.4519201278686524, "step": 120 }, { "epoch": 0.13914905004013914, "grad_norm": 2.03125, "learning_rate": 1.9080213903743317e-05, "loss": 1.3476288795471192, "step": 130 }, { "epoch": 0.14985282312014986, "grad_norm": 2.234375, "learning_rate": 1.9008912655971482e-05, "loss": 1.4554848670959473, "step": 140 }, { "epoch": 0.16055659620016055, "grad_norm": 1.9375, "learning_rate": 1.8937611408199644e-05, "loss": 1.4435239791870118, "step": 150 }, { "epoch": 0.17126036928017127, "grad_norm": 1.671875, "learning_rate": 1.886631016042781e-05, "loss": 1.3767672538757325, "step": 160 }, { "epoch": 0.18196414236018196, "grad_norm": 2.09375, "learning_rate": 1.8795008912655972e-05, "loss": 1.4352334022521973, "step": 170 }, { "epoch": 0.19266791544019266, "grad_norm": 2.0, "learning_rate": 1.8723707664884137e-05, "loss": 1.382822322845459, "step": 180 }, { "epoch": 0.20337168852020338, "grad_norm": 1.953125, "learning_rate": 1.8652406417112302e-05, "loss": 1.429026985168457, "step": 190 }, { "epoch": 0.21407546160021407, "grad_norm": 1.859375, "learning_rate": 1.8581105169340464e-05, "loss": 1.3564122200012207, "step": 200 }, { "epoch": 0.2247792346802248, "grad_norm": 1.7109375, "learning_rate": 1.850980392156863e-05, "loss": 1.457004451751709, "step": 210 }, { "epoch": 0.23548300776023548, "grad_norm": 1.9921875, "learning_rate": 1.843850267379679e-05, "loss": 1.3679749488830566, "step": 220 }, { "epoch": 0.2461867808402462, "grad_norm": 2.0625, "learning_rate": 1.8367201426024957e-05, "loss": 1.4186459541320802, "step": 230 }, { "epoch": 0.2568905539202569, "grad_norm": 1.8984375, "learning_rate": 1.8295900178253122e-05, "loss": 1.3645942687988282, "step": 240 }, { "epoch": 0.2675943270002676, "grad_norm": 1.796875, "learning_rate": 1.8224598930481284e-05, "loss": 1.3659990310668946, "step": 250 }, { "epoch": 0.2782981000802783, "grad_norm": 1.9375, "learning_rate": 1.815329768270945e-05, "loss": 1.3751505851745605, "step": 260 }, { "epoch": 0.289001873160289, "grad_norm": 1.96875, "learning_rate": 1.808199643493761e-05, "loss": 1.394303798675537, "step": 270 }, { "epoch": 0.2997056462402997, "grad_norm": 1.71875, "learning_rate": 1.8010695187165777e-05, "loss": 1.3266244888305665, "step": 280 }, { "epoch": 0.3104094193203104, "grad_norm": 1.6328125, "learning_rate": 1.7939393939393942e-05, "loss": 1.3767006874084473, "step": 290 }, { "epoch": 0.3211131924003211, "grad_norm": 1.8359375, "learning_rate": 1.7868092691622104e-05, "loss": 1.3508996963500977, "step": 300 }, { "epoch": 0.3318169654803318, "grad_norm": 1.8984375, "learning_rate": 1.779679144385027e-05, "loss": 1.299268627166748, "step": 310 }, { "epoch": 0.34252073856034254, "grad_norm": 1.6015625, "learning_rate": 1.772549019607843e-05, "loss": 1.335693073272705, "step": 320 }, { "epoch": 0.35322451164035323, "grad_norm": 1.6171875, "learning_rate": 1.7654188948306597e-05, "loss": 1.3631214141845702, "step": 330 }, { "epoch": 0.3639282847203639, "grad_norm": 1.7421875, "learning_rate": 1.7582887700534762e-05, "loss": 1.349259376525879, "step": 340 }, { "epoch": 0.3746320578003746, "grad_norm": 1.8515625, "learning_rate": 1.7511586452762924e-05, "loss": 1.3234673500061036, "step": 350 }, { "epoch": 0.3853358308803853, "grad_norm": 1.734375, "learning_rate": 1.744028520499109e-05, "loss": 1.34688138961792, "step": 360 }, { "epoch": 0.39603960396039606, "grad_norm": 1.796875, "learning_rate": 1.736898395721925e-05, "loss": 1.310294246673584, "step": 370 }, { "epoch": 0.40674337704040675, "grad_norm": 1.5703125, "learning_rate": 1.7297682709447417e-05, "loss": 1.3146047592163086, "step": 380 }, { "epoch": 0.41744715012041744, "grad_norm": 2.078125, "learning_rate": 1.7226381461675582e-05, "loss": 1.3516902923583984, "step": 390 }, { "epoch": 0.42815092320042814, "grad_norm": 1.6875, "learning_rate": 1.7155080213903744e-05, "loss": 1.3631095886230469, "step": 400 }, { "epoch": 0.4388546962804388, "grad_norm": 1.578125, "learning_rate": 1.708377896613191e-05, "loss": 1.3395885467529296, "step": 410 }, { "epoch": 0.4495584693604496, "grad_norm": 1.7890625, "learning_rate": 1.701247771836007e-05, "loss": 1.322316837310791, "step": 420 }, { "epoch": 0.46026224244046027, "grad_norm": 2.03125, "learning_rate": 1.6941176470588237e-05, "loss": 1.3892762184143066, "step": 430 }, { "epoch": 0.47096601552047096, "grad_norm": 1.765625, "learning_rate": 1.6869875222816402e-05, "loss": 1.3081950187683105, "step": 440 }, { "epoch": 0.48166978860048165, "grad_norm": 1.75, "learning_rate": 1.6798573975044564e-05, "loss": 1.3405800819396974, "step": 450 }, { "epoch": 0.4923735616804924, "grad_norm": 1.6171875, "learning_rate": 1.672727272727273e-05, "loss": 1.3331517219543456, "step": 460 }, { "epoch": 0.5030773347605031, "grad_norm": 1.703125, "learning_rate": 1.665597147950089e-05, "loss": 1.3040351867675781, "step": 470 }, { "epoch": 0.5137811078405138, "grad_norm": 1.7265625, "learning_rate": 1.6584670231729056e-05, "loss": 1.319422149658203, "step": 480 }, { "epoch": 0.5244848809205245, "grad_norm": 1.8125, "learning_rate": 1.6513368983957222e-05, "loss": 1.3433240890502929, "step": 490 }, { "epoch": 0.5351886540005352, "grad_norm": 1.7265625, "learning_rate": 1.6442067736185384e-05, "loss": 1.3346479415893555, "step": 500 }, { "epoch": 0.5458924270805459, "grad_norm": 1.5625, "learning_rate": 1.637076648841355e-05, "loss": 1.3032867431640625, "step": 510 }, { "epoch": 0.5565962001605566, "grad_norm": 1.78125, "learning_rate": 1.629946524064171e-05, "loss": 1.3006314277648925, "step": 520 }, { "epoch": 0.5672999732405672, "grad_norm": 1.765625, "learning_rate": 1.6228163992869876e-05, "loss": 1.3416614532470703, "step": 530 }, { "epoch": 0.578003746320578, "grad_norm": 2.015625, "learning_rate": 1.615686274509804e-05, "loss": 1.303782081604004, "step": 540 }, { "epoch": 0.5887075194005887, "grad_norm": 1.578125, "learning_rate": 1.6085561497326207e-05, "loss": 1.2814931869506836, "step": 550 }, { "epoch": 0.5994112924805994, "grad_norm": 1.53125, "learning_rate": 1.601426024955437e-05, "loss": 1.3404861450195313, "step": 560 }, { "epoch": 0.6101150655606101, "grad_norm": 1.7734375, "learning_rate": 1.594295900178253e-05, "loss": 1.3594398498535156, "step": 570 }, { "epoch": 0.6208188386406208, "grad_norm": 1.609375, "learning_rate": 1.5871657754010696e-05, "loss": 1.2768223762512207, "step": 580 }, { "epoch": 0.6315226117206315, "grad_norm": 1.609375, "learning_rate": 1.580035650623886e-05, "loss": 1.3110815048217774, "step": 590 }, { "epoch": 0.6422263848006422, "grad_norm": 1.6640625, "learning_rate": 1.5729055258467027e-05, "loss": 1.2639217376708984, "step": 600 }, { "epoch": 0.6529301578806529, "grad_norm": 1.8203125, "learning_rate": 1.565775401069519e-05, "loss": 1.3356239318847656, "step": 610 }, { "epoch": 0.6636339309606636, "grad_norm": 1.8828125, "learning_rate": 1.558645276292335e-05, "loss": 1.3733593940734863, "step": 620 }, { "epoch": 0.6743377040406744, "grad_norm": 1.5703125, "learning_rate": 1.5515151515151516e-05, "loss": 1.2768065452575683, "step": 630 }, { "epoch": 0.6850414771206851, "grad_norm": 1.7578125, "learning_rate": 1.544385026737968e-05, "loss": 1.345008659362793, "step": 640 }, { "epoch": 0.6957452502006958, "grad_norm": 1.4921875, "learning_rate": 1.5372549019607847e-05, "loss": 1.2327005386352539, "step": 650 }, { "epoch": 0.7064490232807065, "grad_norm": 1.765625, "learning_rate": 1.530124777183601e-05, "loss": 1.327579879760742, "step": 660 }, { "epoch": 0.7171527963607172, "grad_norm": 1.6640625, "learning_rate": 1.5229946524064172e-05, "loss": 1.2693171501159668, "step": 670 }, { "epoch": 0.7278565694407279, "grad_norm": 1.640625, "learning_rate": 1.5158645276292336e-05, "loss": 1.3229084014892578, "step": 680 }, { "epoch": 0.7385603425207385, "grad_norm": 1.7265625, "learning_rate": 1.5087344028520501e-05, "loss": 1.3010024070739745, "step": 690 }, { "epoch": 0.7492641156007492, "grad_norm": 1.59375, "learning_rate": 1.5016042780748665e-05, "loss": 1.304527473449707, "step": 700 }, { "epoch": 0.7599678886807599, "grad_norm": 1.8671875, "learning_rate": 1.4944741532976827e-05, "loss": 1.2771072387695312, "step": 710 }, { "epoch": 0.7706716617607706, "grad_norm": 1.671875, "learning_rate": 1.4873440285204992e-05, "loss": 1.285037899017334, "step": 720 }, { "epoch": 0.7813754348407814, "grad_norm": 1.6171875, "learning_rate": 1.4802139037433156e-05, "loss": 1.2612761497497558, "step": 730 }, { "epoch": 0.7920792079207921, "grad_norm": 1.8125, "learning_rate": 1.4730837789661321e-05, "loss": 1.3110386848449707, "step": 740 }, { "epoch": 0.8027829810008028, "grad_norm": 1.671875, "learning_rate": 1.4659536541889485e-05, "loss": 1.3450962066650392, "step": 750 }, { "epoch": 0.8134867540808135, "grad_norm": 1.796875, "learning_rate": 1.4588235294117647e-05, "loss": 1.294900608062744, "step": 760 }, { "epoch": 0.8241905271608242, "grad_norm": 1.4765625, "learning_rate": 1.4516934046345812e-05, "loss": 1.3215585708618165, "step": 770 }, { "epoch": 0.8348943002408349, "grad_norm": 1.5859375, "learning_rate": 1.4445632798573976e-05, "loss": 1.3044111251831054, "step": 780 }, { "epoch": 0.8455980733208456, "grad_norm": 1.8671875, "learning_rate": 1.4374331550802141e-05, "loss": 1.3348912239074706, "step": 790 }, { "epoch": 0.8563018464008563, "grad_norm": 1.65625, "learning_rate": 1.4303030303030305e-05, "loss": 1.3434508323669434, "step": 800 }, { "epoch": 0.867005619480867, "grad_norm": 1.625, "learning_rate": 1.4231729055258467e-05, "loss": 1.291652297973633, "step": 810 }, { "epoch": 0.8777093925608777, "grad_norm": 1.4921875, "learning_rate": 1.4160427807486632e-05, "loss": 1.3067720413208008, "step": 820 }, { "epoch": 0.8884131656408885, "grad_norm": 1.9453125, "learning_rate": 1.4089126559714796e-05, "loss": 1.3196195602416991, "step": 830 }, { "epoch": 0.8991169387208992, "grad_norm": 1.5390625, "learning_rate": 1.4017825311942961e-05, "loss": 1.3129652976989745, "step": 840 }, { "epoch": 0.9098207118009098, "grad_norm": 1.5, "learning_rate": 1.3946524064171123e-05, "loss": 1.2702789306640625, "step": 850 }, { "epoch": 0.9205244848809205, "grad_norm": 1.9453125, "learning_rate": 1.3875222816399288e-05, "loss": 1.29964599609375, "step": 860 }, { "epoch": 0.9312282579609312, "grad_norm": 1.5703125, "learning_rate": 1.3803921568627452e-05, "loss": 1.301185131072998, "step": 870 }, { "epoch": 0.9419320310409419, "grad_norm": 1.4140625, "learning_rate": 1.3732620320855616e-05, "loss": 1.2731993675231934, "step": 880 }, { "epoch": 0.9526358041209526, "grad_norm": 1.421875, "learning_rate": 1.3661319073083781e-05, "loss": 1.2806821823120118, "step": 890 }, { "epoch": 0.9633395772009633, "grad_norm": 1.84375, "learning_rate": 1.3590017825311943e-05, "loss": 1.2375809669494628, "step": 900 }, { "epoch": 0.974043350280974, "grad_norm": 1.8046875, "learning_rate": 1.3518716577540108e-05, "loss": 1.2453808784484863, "step": 910 }, { "epoch": 0.9847471233609848, "grad_norm": 2.03125, "learning_rate": 1.3447415329768272e-05, "loss": 1.3074142456054687, "step": 920 }, { "epoch": 0.9954508964409955, "grad_norm": 1.484375, "learning_rate": 1.3376114081996437e-05, "loss": 1.2914584159851075, "step": 930 }, { "epoch": 1.0053518865400053, "grad_norm": 2.0625, "learning_rate": 1.33048128342246e-05, "loss": 1.3543176651000977, "step": 940 }, { "epoch": 1.016055659620016, "grad_norm": 1.765625, "learning_rate": 1.3233511586452763e-05, "loss": 1.3298683166503906, "step": 950 }, { "epoch": 1.0267594327000267, "grad_norm": 1.59375, "learning_rate": 1.3162210338680928e-05, "loss": 1.3020204544067382, "step": 960 }, { "epoch": 1.0374632057800375, "grad_norm": 1.6484375, "learning_rate": 1.3090909090909092e-05, "loss": 1.3046648025512695, "step": 970 }, { "epoch": 1.048166978860048, "grad_norm": 1.65625, "learning_rate": 1.3019607843137257e-05, "loss": 1.2308432579040527, "step": 980 }, { "epoch": 1.0588707519400589, "grad_norm": 1.65625, "learning_rate": 1.294830659536542e-05, "loss": 1.2811461448669434, "step": 990 }, { "epoch": 1.0695745250200697, "grad_norm": 1.6640625, "learning_rate": 1.2877005347593583e-05, "loss": 1.3090335845947265, "step": 1000 }, { "epoch": 1.0802782981000802, "grad_norm": 1.9921875, "learning_rate": 1.2805704099821748e-05, "loss": 1.2958572387695313, "step": 1010 }, { "epoch": 1.090982071180091, "grad_norm": 1.6953125, "learning_rate": 1.2734402852049912e-05, "loss": 1.326209259033203, "step": 1020 }, { "epoch": 1.1016858442601016, "grad_norm": 1.75, "learning_rate": 1.2663101604278077e-05, "loss": 1.2520675659179688, "step": 1030 }, { "epoch": 1.1123896173401124, "grad_norm": 1.546875, "learning_rate": 1.259180035650624e-05, "loss": 1.3478898048400878, "step": 1040 }, { "epoch": 1.123093390420123, "grad_norm": 1.4921875, "learning_rate": 1.2520499108734403e-05, "loss": 1.2806931495666505, "step": 1050 }, { "epoch": 1.1337971635001338, "grad_norm": 1.75, "learning_rate": 1.2449197860962568e-05, "loss": 1.2603809356689453, "step": 1060 }, { "epoch": 1.1445009365801444, "grad_norm": 1.484375, "learning_rate": 1.2377896613190731e-05, "loss": 1.2837313652038573, "step": 1070 }, { "epoch": 1.1552047096601552, "grad_norm": 1.859375, "learning_rate": 1.2306595365418897e-05, "loss": 1.271355152130127, "step": 1080 }, { "epoch": 1.165908482740166, "grad_norm": 1.6015625, "learning_rate": 1.223529411764706e-05, "loss": 1.2751256942749023, "step": 1090 }, { "epoch": 1.1766122558201766, "grad_norm": 1.5390625, "learning_rate": 1.2163992869875222e-05, "loss": 1.2217981338500976, "step": 1100 }, { "epoch": 1.1873160289001874, "grad_norm": 1.46875, "learning_rate": 1.2092691622103388e-05, "loss": 1.3460000038146973, "step": 1110 }, { "epoch": 1.198019801980198, "grad_norm": 1.6328125, "learning_rate": 1.2021390374331551e-05, "loss": 1.3119497299194336, "step": 1120 }, { "epoch": 1.2087235750602088, "grad_norm": 1.5, "learning_rate": 1.1950089126559717e-05, "loss": 1.326594066619873, "step": 1130 }, { "epoch": 1.2194273481402194, "grad_norm": 1.8671875, "learning_rate": 1.187878787878788e-05, "loss": 1.313736343383789, "step": 1140 }, { "epoch": 1.2301311212202302, "grad_norm": 1.625, "learning_rate": 1.1807486631016042e-05, "loss": 1.2580394744873047, "step": 1150 }, { "epoch": 1.2408348943002407, "grad_norm": 1.6953125, "learning_rate": 1.1736185383244208e-05, "loss": 1.3472198486328124, "step": 1160 }, { "epoch": 1.2515386673802515, "grad_norm": 1.6875, "learning_rate": 1.1664884135472371e-05, "loss": 1.3223270416259765, "step": 1170 }, { "epoch": 1.2622424404602621, "grad_norm": 1.5390625, "learning_rate": 1.1593582887700537e-05, "loss": 1.3479475021362304, "step": 1180 }, { "epoch": 1.272946213540273, "grad_norm": 1.46875, "learning_rate": 1.15222816399287e-05, "loss": 1.2691156387329101, "step": 1190 }, { "epoch": 1.2836499866202837, "grad_norm": 1.578125, "learning_rate": 1.1450980392156862e-05, "loss": 1.3078096389770508, "step": 1200 }, { "epoch": 1.2943537597002943, "grad_norm": 1.7265625, "learning_rate": 1.1379679144385028e-05, "loss": 1.2821264266967773, "step": 1210 }, { "epoch": 1.3050575327803051, "grad_norm": 1.6015625, "learning_rate": 1.1308377896613191e-05, "loss": 1.2466256141662597, "step": 1220 }, { "epoch": 1.3157613058603157, "grad_norm": 1.84375, "learning_rate": 1.1237076648841357e-05, "loss": 1.301154327392578, "step": 1230 }, { "epoch": 1.3264650789403265, "grad_norm": 1.6171875, "learning_rate": 1.116577540106952e-05, "loss": 1.3058858871459962, "step": 1240 }, { "epoch": 1.337168852020337, "grad_norm": 1.6875, "learning_rate": 1.1094474153297684e-05, "loss": 1.257982349395752, "step": 1250 }, { "epoch": 1.3478726251003479, "grad_norm": 1.546875, "learning_rate": 1.1023172905525847e-05, "loss": 1.278379535675049, "step": 1260 }, { "epoch": 1.3585763981803587, "grad_norm": 1.890625, "learning_rate": 1.0951871657754011e-05, "loss": 1.2998493194580079, "step": 1270 }, { "epoch": 1.3692801712603693, "grad_norm": 1.5859375, "learning_rate": 1.0880570409982176e-05, "loss": 1.3042527198791505, "step": 1280 }, { "epoch": 1.3799839443403799, "grad_norm": 1.6015625, "learning_rate": 1.0809269162210338e-05, "loss": 1.2903579711914062, "step": 1290 }, { "epoch": 1.3906877174203907, "grad_norm": 1.5390625, "learning_rate": 1.0737967914438504e-05, "loss": 1.216090202331543, "step": 1300 }, { "epoch": 1.4013914905004015, "grad_norm": 1.4296875, "learning_rate": 1.0666666666666667e-05, "loss": 1.2497664451599122, "step": 1310 }, { "epoch": 1.412095263580412, "grad_norm": 1.609375, "learning_rate": 1.0595365418894833e-05, "loss": 1.2592049598693849, "step": 1320 }, { "epoch": 1.4227990366604228, "grad_norm": 1.703125, "learning_rate": 1.0524064171122996e-05, "loss": 1.3062689781188965, "step": 1330 }, { "epoch": 1.4335028097404334, "grad_norm": 1.515625, "learning_rate": 1.0452762923351158e-05, "loss": 1.2577032089233398, "step": 1340 }, { "epoch": 1.4442065828204442, "grad_norm": 1.75, "learning_rate": 1.0381461675579324e-05, "loss": 1.2874650001525878, "step": 1350 }, { "epoch": 1.4549103559004548, "grad_norm": 2.0625, "learning_rate": 1.0310160427807487e-05, "loss": 1.2887776374816895, "step": 1360 }, { "epoch": 1.4656141289804656, "grad_norm": 1.59375, "learning_rate": 1.0238859180035653e-05, "loss": 1.2869946479797363, "step": 1370 }, { "epoch": 1.4763179020604764, "grad_norm": 1.7421875, "learning_rate": 1.0167557932263816e-05, "loss": 1.3055774688720703, "step": 1380 }, { "epoch": 1.487021675140487, "grad_norm": 1.7421875, "learning_rate": 1.0096256684491978e-05, "loss": 1.2925223350524901, "step": 1390 }, { "epoch": 1.4977254482204978, "grad_norm": 1.5390625, "learning_rate": 1.0024955436720143e-05, "loss": 1.3624143600463867, "step": 1400 }, { "epoch": 1.5084292213005084, "grad_norm": 1.7265625, "learning_rate": 9.953654188948307e-06, "loss": 1.3100957870483398, "step": 1410 }, { "epoch": 1.5191329943805192, "grad_norm": 1.734375, "learning_rate": 9.882352941176472e-06, "loss": 1.2667318344116212, "step": 1420 }, { "epoch": 1.5298367674605298, "grad_norm": 1.5, "learning_rate": 9.811051693404634e-06, "loss": 1.2964338302612304, "step": 1430 }, { "epoch": 1.5405405405405406, "grad_norm": 1.703125, "learning_rate": 9.7397504456328e-06, "loss": 1.2451062202453613, "step": 1440 }, { "epoch": 1.5512443136205514, "grad_norm": 1.6484375, "learning_rate": 9.668449197860963e-06, "loss": 1.2622719764709474, "step": 1450 }, { "epoch": 1.561948086700562, "grad_norm": 1.8359375, "learning_rate": 9.597147950089127e-06, "loss": 1.2830778121948243, "step": 1460 }, { "epoch": 1.5726518597805725, "grad_norm": 1.5390625, "learning_rate": 9.525846702317292e-06, "loss": 1.3212904930114746, "step": 1470 }, { "epoch": 1.5833556328605833, "grad_norm": 1.515625, "learning_rate": 9.454545454545456e-06, "loss": 1.301555347442627, "step": 1480 }, { "epoch": 1.5940594059405941, "grad_norm": 1.453125, "learning_rate": 9.38324420677362e-06, "loss": 1.2626118659973145, "step": 1490 }, { "epoch": 1.6047631790206047, "grad_norm": 1.890625, "learning_rate": 9.311942959001783e-06, "loss": 1.2342555046081543, "step": 1500 }, { "epoch": 1.6154669521006153, "grad_norm": 1.7421875, "learning_rate": 9.240641711229947e-06, "loss": 1.3167900085449218, "step": 1510 }, { "epoch": 1.6261707251806263, "grad_norm": 2.0, "learning_rate": 9.169340463458112e-06, "loss": 1.296627902984619, "step": 1520 }, { "epoch": 1.636874498260637, "grad_norm": 1.4453125, "learning_rate": 9.098039215686276e-06, "loss": 1.275075340270996, "step": 1530 }, { "epoch": 1.6475782713406475, "grad_norm": 1.53125, "learning_rate": 9.02673796791444e-06, "loss": 1.2771642684936524, "step": 1540 }, { "epoch": 1.6582820444206583, "grad_norm": 1.8046875, "learning_rate": 8.955436720142603e-06, "loss": 1.2907758712768556, "step": 1550 }, { "epoch": 1.668985817500669, "grad_norm": 1.6953125, "learning_rate": 8.884135472370767e-06, "loss": 1.2778194427490235, "step": 1560 }, { "epoch": 1.6796895905806797, "grad_norm": 1.5859375, "learning_rate": 8.81283422459893e-06, "loss": 1.2820199012756348, "step": 1570 }, { "epoch": 1.6903933636606903, "grad_norm": 1.8984375, "learning_rate": 8.741532976827096e-06, "loss": 1.3197799682617188, "step": 1580 }, { "epoch": 1.701097136740701, "grad_norm": 1.796875, "learning_rate": 8.67023172905526e-06, "loss": 1.2711196899414063, "step": 1590 }, { "epoch": 1.7118009098207119, "grad_norm": 1.7265625, "learning_rate": 8.598930481283423e-06, "loss": 1.3094602584838868, "step": 1600 }, { "epoch": 1.7225046829007225, "grad_norm": 1.65625, "learning_rate": 8.527629233511587e-06, "loss": 1.3037433624267578, "step": 1610 }, { "epoch": 1.7332084559807333, "grad_norm": 1.6484375, "learning_rate": 8.45632798573975e-06, "loss": 1.2734570503234863, "step": 1620 }, { "epoch": 1.743912229060744, "grad_norm": 1.5703125, "learning_rate": 8.385026737967916e-06, "loss": 1.2476407051086427, "step": 1630 }, { "epoch": 1.7546160021407546, "grad_norm": 1.7265625, "learning_rate": 8.31372549019608e-06, "loss": 1.3427558898925782, "step": 1640 }, { "epoch": 1.7653197752207652, "grad_norm": 1.65625, "learning_rate": 8.242424242424243e-06, "loss": 1.273496437072754, "step": 1650 }, { "epoch": 1.776023548300776, "grad_norm": 1.71875, "learning_rate": 8.171122994652407e-06, "loss": 1.2626665115356446, "step": 1660 }, { "epoch": 1.7867273213807868, "grad_norm": 1.8046875, "learning_rate": 8.09982174688057e-06, "loss": 1.2670047760009766, "step": 1670 }, { "epoch": 1.7974310944607974, "grad_norm": 1.7265625, "learning_rate": 8.028520499108736e-06, "loss": 1.349191665649414, "step": 1680 }, { "epoch": 1.808134867540808, "grad_norm": 1.578125, "learning_rate": 7.9572192513369e-06, "loss": 1.2989972114562989, "step": 1690 }, { "epoch": 1.8188386406208188, "grad_norm": 1.6875, "learning_rate": 7.885918003565063e-06, "loss": 1.1850922584533692, "step": 1700 }, { "epoch": 1.8295424137008296, "grad_norm": 1.953125, "learning_rate": 7.814616755793228e-06, "loss": 1.3360312461853028, "step": 1710 }, { "epoch": 1.8402461867808402, "grad_norm": 1.6328125, "learning_rate": 7.74331550802139e-06, "loss": 1.2957257270812987, "step": 1720 }, { "epoch": 1.850949959860851, "grad_norm": 1.6015625, "learning_rate": 7.672014260249555e-06, "loss": 1.2536530494689941, "step": 1730 }, { "epoch": 1.8616537329408618, "grad_norm": 1.7109375, "learning_rate": 7.60071301247772e-06, "loss": 1.2660930633544922, "step": 1740 }, { "epoch": 1.8723575060208724, "grad_norm": 1.7265625, "learning_rate": 7.529411764705883e-06, "loss": 1.3080876350402832, "step": 1750 }, { "epoch": 1.883061279100883, "grad_norm": 1.640625, "learning_rate": 7.458110516934047e-06, "loss": 1.3132406234741212, "step": 1760 }, { "epoch": 1.8937650521808937, "grad_norm": 1.8203125, "learning_rate": 7.386809269162211e-06, "loss": 1.31253080368042, "step": 1770 }, { "epoch": 1.9044688252609046, "grad_norm": 1.4140625, "learning_rate": 7.315508021390375e-06, "loss": 1.3013240814208984, "step": 1780 }, { "epoch": 1.9151725983409151, "grad_norm": 1.4765625, "learning_rate": 7.244206773618538e-06, "loss": 1.2744117736816407, "step": 1790 }, { "epoch": 1.9258763714209257, "grad_norm": 1.4609375, "learning_rate": 7.172905525846703e-06, "loss": 1.3057758331298828, "step": 1800 }, { "epoch": 1.9365801445009367, "grad_norm": 1.515625, "learning_rate": 7.101604278074867e-06, "loss": 1.224927043914795, "step": 1810 }, { "epoch": 1.9472839175809473, "grad_norm": 1.7109375, "learning_rate": 7.030303030303031e-06, "loss": 1.3182221412658692, "step": 1820 }, { "epoch": 1.957987690660958, "grad_norm": 1.5546875, "learning_rate": 6.959001782531195e-06, "loss": 1.2400826454162597, "step": 1830 }, { "epoch": 1.9686914637409687, "grad_norm": 1.7421875, "learning_rate": 6.887700534759358e-06, "loss": 1.2463386535644532, "step": 1840 }, { "epoch": 1.9793952368209795, "grad_norm": 1.46875, "learning_rate": 6.8163992869875225e-06, "loss": 1.3235528945922852, "step": 1850 }, { "epoch": 1.99009900990099, "grad_norm": 1.8203125, "learning_rate": 6.745098039215687e-06, "loss": 1.2812946319580079, "step": 1860 }, { "epoch": 2.0, "grad_norm": 4.1875, "learning_rate": 6.673796791443851e-06, "loss": 1.2953272819519044, "step": 1870 }, { "epoch": 2.0107037730800106, "grad_norm": 1.5390625, "learning_rate": 6.602495543672015e-06, "loss": 1.207719612121582, "step": 1880 }, { "epoch": 2.0214075461600216, "grad_norm": 1.671875, "learning_rate": 6.531194295900179e-06, "loss": 1.2520846366882323, "step": 1890 }, { "epoch": 2.032111319240032, "grad_norm": 1.703125, "learning_rate": 6.459893048128343e-06, "loss": 1.2905988693237305, "step": 1900 }, { "epoch": 2.0428150923200428, "grad_norm": 1.921875, "learning_rate": 6.388591800356507e-06, "loss": 1.3520148277282715, "step": 1910 }, { "epoch": 2.0535188654000534, "grad_norm": 1.8515625, "learning_rate": 6.3172905525846705e-06, "loss": 1.2911107063293457, "step": 1920 }, { "epoch": 2.0642226384800644, "grad_norm": 1.671875, "learning_rate": 6.245989304812835e-06, "loss": 1.2403117179870606, "step": 1930 }, { "epoch": 2.074926411560075, "grad_norm": 1.96875, "learning_rate": 6.174688057040999e-06, "loss": 1.3558055877685546, "step": 1940 }, { "epoch": 2.0856301846400855, "grad_norm": 1.6328125, "learning_rate": 6.103386809269163e-06, "loss": 1.3194045066833495, "step": 1950 }, { "epoch": 2.096333957720096, "grad_norm": 2.140625, "learning_rate": 6.032085561497326e-06, "loss": 1.3321582794189453, "step": 1960 }, { "epoch": 2.107037730800107, "grad_norm": 1.625, "learning_rate": 5.96078431372549e-06, "loss": 1.288839054107666, "step": 1970 }, { "epoch": 2.1177415038801177, "grad_norm": 2.0, "learning_rate": 5.889483065953655e-06, "loss": 1.3260244369506835, "step": 1980 }, { "epoch": 2.1284452769601283, "grad_norm": 1.5703125, "learning_rate": 5.8181818181818185e-06, "loss": 1.2721702575683593, "step": 1990 }, { "epoch": 2.1391490500401393, "grad_norm": 1.6640625, "learning_rate": 5.746880570409983e-06, "loss": 1.2622364044189454, "step": 2000 }, { "epoch": 2.14985282312015, "grad_norm": 1.609375, "learning_rate": 5.675579322638146e-06, "loss": 1.30474796295166, "step": 2010 }, { "epoch": 2.1605565962001605, "grad_norm": 1.7890625, "learning_rate": 5.60427807486631e-06, "loss": 1.3109374046325684, "step": 2020 }, { "epoch": 2.171260369280171, "grad_norm": 1.71875, "learning_rate": 5.532976827094475e-06, "loss": 1.3231799125671386, "step": 2030 }, { "epoch": 2.181964142360182, "grad_norm": 1.796875, "learning_rate": 5.4616755793226384e-06, "loss": 1.2993489265441895, "step": 2040 }, { "epoch": 2.1926679154401927, "grad_norm": 1.875, "learning_rate": 5.390374331550803e-06, "loss": 1.3044631958007813, "step": 2050 }, { "epoch": 2.2033716885202033, "grad_norm": 1.4609375, "learning_rate": 5.3190730837789666e-06, "loss": 1.2702978134155274, "step": 2060 }, { "epoch": 2.2140754616002143, "grad_norm": 1.5546875, "learning_rate": 5.24777183600713e-06, "loss": 1.287952709197998, "step": 2070 }, { "epoch": 2.224779234680225, "grad_norm": 1.5546875, "learning_rate": 5.176470588235295e-06, "loss": 1.2974214553833008, "step": 2080 }, { "epoch": 2.2354830077602355, "grad_norm": 1.703125, "learning_rate": 5.105169340463458e-06, "loss": 1.3148197174072265, "step": 2090 }, { "epoch": 2.246186780840246, "grad_norm": 1.6328125, "learning_rate": 5.033868092691623e-06, "loss": 1.3466445922851562, "step": 2100 }, { "epoch": 2.256890553920257, "grad_norm": 1.484375, "learning_rate": 4.9625668449197864e-06, "loss": 1.334506893157959, "step": 2110 }, { "epoch": 2.2675943270002676, "grad_norm": 1.9765625, "learning_rate": 4.891265597147951e-06, "loss": 1.279165267944336, "step": 2120 }, { "epoch": 2.278298100080278, "grad_norm": 1.890625, "learning_rate": 4.8199643493761146e-06, "loss": 1.2512639045715332, "step": 2130 }, { "epoch": 2.289001873160289, "grad_norm": 1.59375, "learning_rate": 4.748663101604278e-06, "loss": 1.2572649002075196, "step": 2140 }, { "epoch": 2.2997056462403, "grad_norm": 1.5625, "learning_rate": 4.677361853832442e-06, "loss": 1.2503036499023437, "step": 2150 }, { "epoch": 2.3104094193203104, "grad_norm": 1.859375, "learning_rate": 4.606060606060606e-06, "loss": 1.2866994857788085, "step": 2160 }, { "epoch": 2.321113192400321, "grad_norm": 1.6171875, "learning_rate": 4.534759358288771e-06, "loss": 1.2810638427734375, "step": 2170 }, { "epoch": 2.331816965480332, "grad_norm": 1.71875, "learning_rate": 4.4634581105169345e-06, "loss": 1.2588828086853028, "step": 2180 }, { "epoch": 2.3425207385603426, "grad_norm": 1.6875, "learning_rate": 4.392156862745098e-06, "loss": 1.2615557670593263, "step": 2190 }, { "epoch": 2.353224511640353, "grad_norm": 1.921875, "learning_rate": 4.320855614973263e-06, "loss": 1.2974510192871094, "step": 2200 }, { "epoch": 2.3639282847203638, "grad_norm": 1.78125, "learning_rate": 4.249554367201426e-06, "loss": 1.303697681427002, "step": 2210 }, { "epoch": 2.374632057800375, "grad_norm": 1.765625, "learning_rate": 4.178253119429591e-06, "loss": 1.303341007232666, "step": 2220 }, { "epoch": 2.3853358308803854, "grad_norm": 1.46875, "learning_rate": 4.106951871657754e-06, "loss": 1.306796932220459, "step": 2230 }, { "epoch": 2.396039603960396, "grad_norm": 1.828125, "learning_rate": 4.035650623885918e-06, "loss": 1.3068408012390136, "step": 2240 }, { "epoch": 2.4067433770404065, "grad_norm": 1.671875, "learning_rate": 3.9643493761140825e-06, "loss": 1.307657527923584, "step": 2250 }, { "epoch": 2.4174471501204176, "grad_norm": 1.75, "learning_rate": 3.893048128342246e-06, "loss": 1.2932353973388673, "step": 2260 }, { "epoch": 2.428150923200428, "grad_norm": 1.765625, "learning_rate": 3.821746880570411e-06, "loss": 1.2625031471252441, "step": 2270 }, { "epoch": 2.4388546962804387, "grad_norm": 1.703125, "learning_rate": 3.7504456327985743e-06, "loss": 1.3354209899902343, "step": 2280 }, { "epoch": 2.4495584693604497, "grad_norm": 1.3671875, "learning_rate": 3.6791443850267383e-06, "loss": 1.200312042236328, "step": 2290 }, { "epoch": 2.4602622424404603, "grad_norm": 1.6484375, "learning_rate": 3.6078431372549024e-06, "loss": 1.2868337631225586, "step": 2300 }, { "epoch": 2.470966015520471, "grad_norm": 1.78125, "learning_rate": 3.536541889483066e-06, "loss": 1.2731021881103515, "step": 2310 }, { "epoch": 2.4816697886004815, "grad_norm": 2.265625, "learning_rate": 3.46524064171123e-06, "loss": 1.3145703315734862, "step": 2320 }, { "epoch": 2.4923735616804925, "grad_norm": 1.671875, "learning_rate": 3.3939393939393946e-06, "loss": 1.305215549468994, "step": 2330 }, { "epoch": 2.503077334760503, "grad_norm": 1.734375, "learning_rate": 3.322638146167558e-06, "loss": 1.3567096710205078, "step": 2340 }, { "epoch": 2.5137811078405137, "grad_norm": 1.6015625, "learning_rate": 3.2513368983957223e-06, "loss": 1.3081507682800293, "step": 2350 }, { "epoch": 2.5244848809205243, "grad_norm": 1.7109375, "learning_rate": 3.180035650623886e-06, "loss": 1.300461483001709, "step": 2360 }, { "epoch": 2.5351886540005353, "grad_norm": 1.5859375, "learning_rate": 3.10873440285205e-06, "loss": 1.3006972312927245, "step": 2370 }, { "epoch": 2.545892427080546, "grad_norm": 1.734375, "learning_rate": 3.0374331550802145e-06, "loss": 1.3157925605773926, "step": 2380 }, { "epoch": 2.5565962001605564, "grad_norm": 1.5625, "learning_rate": 2.966131907308378e-06, "loss": 1.2608634948730468, "step": 2390 }, { "epoch": 2.5672999732405675, "grad_norm": 1.765625, "learning_rate": 2.894830659536542e-06, "loss": 1.237275981903076, "step": 2400 }, { "epoch": 2.578003746320578, "grad_norm": 1.5859375, "learning_rate": 2.8235294117647062e-06, "loss": 1.274481964111328, "step": 2410 }, { "epoch": 2.5887075194005886, "grad_norm": 1.65625, "learning_rate": 2.75222816399287e-06, "loss": 1.3146997451782227, "step": 2420 }, { "epoch": 2.5994112924805997, "grad_norm": 1.6640625, "learning_rate": 2.680926916221034e-06, "loss": 1.3125761032104493, "step": 2430 }, { "epoch": 2.6101150655606102, "grad_norm": 1.6953125, "learning_rate": 2.6096256684491984e-06, "loss": 1.2919845581054688, "step": 2440 }, { "epoch": 2.620818838640621, "grad_norm": 1.6484375, "learning_rate": 2.538324420677362e-06, "loss": 1.2364542961120606, "step": 2450 }, { "epoch": 2.6315226117206314, "grad_norm": 1.609375, "learning_rate": 2.467023172905526e-06, "loss": 1.2624409675598145, "step": 2460 }, { "epoch": 2.642226384800642, "grad_norm": 1.5859375, "learning_rate": 2.3957219251336898e-06, "loss": 1.3075796127319337, "step": 2470 }, { "epoch": 2.652930157880653, "grad_norm": 2.109375, "learning_rate": 2.3244206773618542e-06, "loss": 1.2835824012756347, "step": 2480 }, { "epoch": 2.6636339309606636, "grad_norm": 1.8359375, "learning_rate": 2.253119429590018e-06, "loss": 1.307276153564453, "step": 2490 }, { "epoch": 2.674337704040674, "grad_norm": 1.7421875, "learning_rate": 2.181818181818182e-06, "loss": 1.2622486114501954, "step": 2500 }, { "epoch": 2.685041477120685, "grad_norm": 1.6796875, "learning_rate": 2.110516934046346e-06, "loss": 1.2514682769775392, "step": 2510 }, { "epoch": 2.6957452502006958, "grad_norm": 1.5625, "learning_rate": 2.03921568627451e-06, "loss": 1.2614849090576172, "step": 2520 }, { "epoch": 2.7064490232807064, "grad_norm": 1.5390625, "learning_rate": 1.9679144385026737e-06, "loss": 1.3241849899291993, "step": 2530 }, { "epoch": 2.7171527963607174, "grad_norm": 1.7890625, "learning_rate": 1.896613190730838e-06, "loss": 1.2841781616210937, "step": 2540 }, { "epoch": 2.727856569440728, "grad_norm": 1.8515625, "learning_rate": 1.8253119429590018e-06, "loss": 1.3062438011169433, "step": 2550 }, { "epoch": 2.7385603425207385, "grad_norm": 1.6328125, "learning_rate": 1.7540106951871661e-06, "loss": 1.3065251350402831, "step": 2560 }, { "epoch": 2.749264115600749, "grad_norm": 1.65625, "learning_rate": 1.68270944741533e-06, "loss": 1.2980451583862305, "step": 2570 }, { "epoch": 2.7599678886807597, "grad_norm": 1.671875, "learning_rate": 1.6114081996434938e-06, "loss": 1.2766281127929688, "step": 2580 }, { "epoch": 2.7706716617607707, "grad_norm": 1.5859375, "learning_rate": 1.5401069518716579e-06, "loss": 1.3033970832824706, "step": 2590 }, { "epoch": 2.7813754348407813, "grad_norm": 1.4921875, "learning_rate": 1.468805704099822e-06, "loss": 1.2335359573364257, "step": 2600 }, { "epoch": 2.792079207920792, "grad_norm": 1.6484375, "learning_rate": 1.3975044563279858e-06, "loss": 1.3184511184692382, "step": 2610 }, { "epoch": 2.802782981000803, "grad_norm": 1.609375, "learning_rate": 1.3262032085561499e-06, "loss": 1.1845362663269043, "step": 2620 }, { "epoch": 2.8134867540808135, "grad_norm": 1.7890625, "learning_rate": 1.2549019607843137e-06, "loss": 1.2506700515747071, "step": 2630 }, { "epoch": 2.824190527160824, "grad_norm": 1.7109375, "learning_rate": 1.1836007130124778e-06, "loss": 1.2360112190246582, "step": 2640 }, { "epoch": 2.834894300240835, "grad_norm": 1.703125, "learning_rate": 1.1122994652406418e-06, "loss": 1.2875761032104491, "step": 2650 }, { "epoch": 2.8455980733208457, "grad_norm": 1.578125, "learning_rate": 1.0409982174688057e-06, "loss": 1.2473506927490234, "step": 2660 }, { "epoch": 2.8563018464008563, "grad_norm": 1.546875, "learning_rate": 9.696969696969698e-07, "loss": 1.3208060264587402, "step": 2670 }, { "epoch": 2.867005619480867, "grad_norm": 1.671875, "learning_rate": 8.983957219251338e-07, "loss": 1.3371116638183593, "step": 2680 }, { "epoch": 2.8777093925608774, "grad_norm": 1.484375, "learning_rate": 8.270944741532977e-07, "loss": 1.2605000495910645, "step": 2690 }, { "epoch": 2.8884131656408885, "grad_norm": 1.6015625, "learning_rate": 7.557932263814617e-07, "loss": 1.267725658416748, "step": 2700 }, { "epoch": 2.899116938720899, "grad_norm": 1.640625, "learning_rate": 6.844919786096257e-07, "loss": 1.27689208984375, "step": 2710 }, { "epoch": 2.9098207118009096, "grad_norm": 1.578125, "learning_rate": 6.131907308377896e-07, "loss": 1.286923885345459, "step": 2720 }, { "epoch": 2.9205244848809206, "grad_norm": 1.765625, "learning_rate": 5.418894830659537e-07, "loss": 1.330905055999756, "step": 2730 }, { "epoch": 2.9312282579609312, "grad_norm": 1.71875, "learning_rate": 4.7058823529411767e-07, "loss": 1.2354840278625487, "step": 2740 }, { "epoch": 2.941932031040942, "grad_norm": 1.4609375, "learning_rate": 3.992869875222817e-07, "loss": 1.2647834777832032, "step": 2750 }, { "epoch": 2.952635804120953, "grad_norm": 1.6875, "learning_rate": 3.2798573975044564e-07, "loss": 1.2786317825317384, "step": 2760 }, { "epoch": 2.9633395772009634, "grad_norm": 1.53125, "learning_rate": 2.5668449197860965e-07, "loss": 1.2594982147216798, "step": 2770 }, { "epoch": 2.974043350280974, "grad_norm": 1.578125, "learning_rate": 1.8538324420677363e-07, "loss": 1.3203317642211914, "step": 2780 }, { "epoch": 2.984747123360985, "grad_norm": 2.046875, "learning_rate": 1.1408199643493762e-07, "loss": 1.224764347076416, "step": 2790 }, { "epoch": 2.9954508964409956, "grad_norm": 1.5078125, "learning_rate": 4.2780748663101606e-08, "loss": 1.2845193862915039, "step": 2800 } ], "logging_steps": 10, "max_steps": 2805, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7314356060356608.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }