| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 2805, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.010703773080010704, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 1.993582887700535e-05, | |
| "loss": 1.5584056854248047, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.02140754616002141, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 1.9864527629233515e-05, | |
| "loss": 1.562470054626465, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.03211131924003211, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 1.9793226381461677e-05, | |
| "loss": 1.5967525482177733, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.04281509232004282, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 1.972192513368984e-05, | |
| "loss": 1.5231587409973144, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.05351886540005352, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 1.9650623885918005e-05, | |
| "loss": 1.4803240776062012, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.06422263848006422, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 1.957932263814617e-05, | |
| "loss": 1.5196543693542481, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.07492641156007493, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 1.9508021390374332e-05, | |
| "loss": 1.4593828201293946, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.08563018464008564, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 1.9436720142602497e-05, | |
| "loss": 1.4797739028930663, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.09633395772009633, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 1.9365418894830663e-05, | |
| "loss": 1.5016543388366699, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.10703773080010703, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 1.9294117647058825e-05, | |
| "loss": 1.4800514221191405, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.11774150388011774, | |
| "grad_norm": 1.75, | |
| "learning_rate": 1.922281639928699e-05, | |
| "loss": 1.445749568939209, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.12844527696012845, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 1.9151515151515152e-05, | |
| "loss": 1.4519201278686524, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.13914905004013914, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 1.9080213903743317e-05, | |
| "loss": 1.3476288795471192, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.14985282312014986, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 1.9008912655971482e-05, | |
| "loss": 1.4554848670959473, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.16055659620016055, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 1.8937611408199644e-05, | |
| "loss": 1.4435239791870118, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.17126036928017127, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 1.886631016042781e-05, | |
| "loss": 1.3767672538757325, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.18196414236018196, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 1.8795008912655972e-05, | |
| "loss": 1.4352334022521973, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.19266791544019266, | |
| "grad_norm": 2.0, | |
| "learning_rate": 1.8723707664884137e-05, | |
| "loss": 1.382822322845459, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.20337168852020338, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 1.8652406417112302e-05, | |
| "loss": 1.429026985168457, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.21407546160021407, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 1.8581105169340464e-05, | |
| "loss": 1.3564122200012207, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.2247792346802248, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 1.850980392156863e-05, | |
| "loss": 1.457004451751709, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.23548300776023548, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 1.843850267379679e-05, | |
| "loss": 1.3679749488830566, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.2461867808402462, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 1.8367201426024957e-05, | |
| "loss": 1.4186459541320802, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.2568905539202569, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 1.8295900178253122e-05, | |
| "loss": 1.3645942687988282, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.2675943270002676, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 1.8224598930481284e-05, | |
| "loss": 1.3659990310668946, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.2782981000802783, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 1.815329768270945e-05, | |
| "loss": 1.3751505851745605, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.289001873160289, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 1.808199643493761e-05, | |
| "loss": 1.394303798675537, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.2997056462402997, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 1.8010695187165777e-05, | |
| "loss": 1.3266244888305665, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.3104094193203104, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 1.7939393939393942e-05, | |
| "loss": 1.3767006874084473, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.3211131924003211, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 1.7868092691622104e-05, | |
| "loss": 1.3508996963500977, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.3318169654803318, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 1.779679144385027e-05, | |
| "loss": 1.299268627166748, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.34252073856034254, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 1.772549019607843e-05, | |
| "loss": 1.335693073272705, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.35322451164035323, | |
| "grad_norm": 1.6171875, | |
| "learning_rate": 1.7654188948306597e-05, | |
| "loss": 1.3631214141845702, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.3639282847203639, | |
| "grad_norm": 1.7421875, | |
| "learning_rate": 1.7582887700534762e-05, | |
| "loss": 1.349259376525879, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.3746320578003746, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 1.7511586452762924e-05, | |
| "loss": 1.3234673500061036, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.3853358308803853, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 1.744028520499109e-05, | |
| "loss": 1.34688138961792, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.39603960396039606, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 1.736898395721925e-05, | |
| "loss": 1.310294246673584, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.40674337704040675, | |
| "grad_norm": 1.5703125, | |
| "learning_rate": 1.7297682709447417e-05, | |
| "loss": 1.3146047592163086, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.41744715012041744, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 1.7226381461675582e-05, | |
| "loss": 1.3516902923583984, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.42815092320042814, | |
| "grad_norm": 1.6875, | |
| "learning_rate": 1.7155080213903744e-05, | |
| "loss": 1.3631095886230469, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.4388546962804388, | |
| "grad_norm": 1.578125, | |
| "learning_rate": 1.708377896613191e-05, | |
| "loss": 1.3395885467529296, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.4495584693604496, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 1.701247771836007e-05, | |
| "loss": 1.322316837310791, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.46026224244046027, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 1.6941176470588237e-05, | |
| "loss": 1.3892762184143066, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.47096601552047096, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 1.6869875222816402e-05, | |
| "loss": 1.3081950187683105, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.48166978860048165, | |
| "grad_norm": 1.75, | |
| "learning_rate": 1.6798573975044564e-05, | |
| "loss": 1.3405800819396974, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.4923735616804924, | |
| "grad_norm": 1.6171875, | |
| "learning_rate": 1.672727272727273e-05, | |
| "loss": 1.3331517219543456, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.5030773347605031, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 1.665597147950089e-05, | |
| "loss": 1.3040351867675781, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.5137811078405138, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 1.6584670231729056e-05, | |
| "loss": 1.319422149658203, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.5244848809205245, | |
| "grad_norm": 1.8125, | |
| "learning_rate": 1.6513368983957222e-05, | |
| "loss": 1.3433240890502929, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.5351886540005352, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 1.6442067736185384e-05, | |
| "loss": 1.3346479415893555, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.5458924270805459, | |
| "grad_norm": 1.5625, | |
| "learning_rate": 1.637076648841355e-05, | |
| "loss": 1.3032867431640625, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.5565962001605566, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 1.629946524064171e-05, | |
| "loss": 1.3006314277648925, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.5672999732405672, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 1.6228163992869876e-05, | |
| "loss": 1.3416614532470703, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.578003746320578, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 1.615686274509804e-05, | |
| "loss": 1.303782081604004, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.5887075194005887, | |
| "grad_norm": 1.578125, | |
| "learning_rate": 1.6085561497326207e-05, | |
| "loss": 1.2814931869506836, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.5994112924805994, | |
| "grad_norm": 1.53125, | |
| "learning_rate": 1.601426024955437e-05, | |
| "loss": 1.3404861450195313, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.6101150655606101, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 1.594295900178253e-05, | |
| "loss": 1.3594398498535156, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.6208188386406208, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 1.5871657754010696e-05, | |
| "loss": 1.2768223762512207, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.6315226117206315, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 1.580035650623886e-05, | |
| "loss": 1.3110815048217774, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.6422263848006422, | |
| "grad_norm": 1.6640625, | |
| "learning_rate": 1.5729055258467027e-05, | |
| "loss": 1.2639217376708984, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.6529301578806529, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 1.565775401069519e-05, | |
| "loss": 1.3356239318847656, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.6636339309606636, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 1.558645276292335e-05, | |
| "loss": 1.3733593940734863, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.6743377040406744, | |
| "grad_norm": 1.5703125, | |
| "learning_rate": 1.5515151515151516e-05, | |
| "loss": 1.2768065452575683, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.6850414771206851, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 1.544385026737968e-05, | |
| "loss": 1.345008659362793, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.6957452502006958, | |
| "grad_norm": 1.4921875, | |
| "learning_rate": 1.5372549019607847e-05, | |
| "loss": 1.2327005386352539, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.7064490232807065, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 1.530124777183601e-05, | |
| "loss": 1.327579879760742, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.7171527963607172, | |
| "grad_norm": 1.6640625, | |
| "learning_rate": 1.5229946524064172e-05, | |
| "loss": 1.2693171501159668, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.7278565694407279, | |
| "grad_norm": 1.640625, | |
| "learning_rate": 1.5158645276292336e-05, | |
| "loss": 1.3229084014892578, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.7385603425207385, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 1.5087344028520501e-05, | |
| "loss": 1.3010024070739745, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.7492641156007492, | |
| "grad_norm": 1.59375, | |
| "learning_rate": 1.5016042780748665e-05, | |
| "loss": 1.304527473449707, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.7599678886807599, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 1.4944741532976827e-05, | |
| "loss": 1.2771072387695312, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.7706716617607706, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 1.4873440285204992e-05, | |
| "loss": 1.285037899017334, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.7813754348407814, | |
| "grad_norm": 1.6171875, | |
| "learning_rate": 1.4802139037433156e-05, | |
| "loss": 1.2612761497497558, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.7920792079207921, | |
| "grad_norm": 1.8125, | |
| "learning_rate": 1.4730837789661321e-05, | |
| "loss": 1.3110386848449707, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.8027829810008028, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 1.4659536541889485e-05, | |
| "loss": 1.3450962066650392, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.8134867540808135, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 1.4588235294117647e-05, | |
| "loss": 1.294900608062744, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.8241905271608242, | |
| "grad_norm": 1.4765625, | |
| "learning_rate": 1.4516934046345812e-05, | |
| "loss": 1.3215585708618165, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.8348943002408349, | |
| "grad_norm": 1.5859375, | |
| "learning_rate": 1.4445632798573976e-05, | |
| "loss": 1.3044111251831054, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.8455980733208456, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 1.4374331550802141e-05, | |
| "loss": 1.3348912239074706, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.8563018464008563, | |
| "grad_norm": 1.65625, | |
| "learning_rate": 1.4303030303030305e-05, | |
| "loss": 1.3434508323669434, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.867005619480867, | |
| "grad_norm": 1.625, | |
| "learning_rate": 1.4231729055258467e-05, | |
| "loss": 1.291652297973633, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.8777093925608777, | |
| "grad_norm": 1.4921875, | |
| "learning_rate": 1.4160427807486632e-05, | |
| "loss": 1.3067720413208008, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.8884131656408885, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 1.4089126559714796e-05, | |
| "loss": 1.3196195602416991, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.8991169387208992, | |
| "grad_norm": 1.5390625, | |
| "learning_rate": 1.4017825311942961e-05, | |
| "loss": 1.3129652976989745, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.9098207118009098, | |
| "grad_norm": 1.5, | |
| "learning_rate": 1.3946524064171123e-05, | |
| "loss": 1.2702789306640625, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.9205244848809205, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 1.3875222816399288e-05, | |
| "loss": 1.29964599609375, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.9312282579609312, | |
| "grad_norm": 1.5703125, | |
| "learning_rate": 1.3803921568627452e-05, | |
| "loss": 1.301185131072998, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.9419320310409419, | |
| "grad_norm": 1.4140625, | |
| "learning_rate": 1.3732620320855616e-05, | |
| "loss": 1.2731993675231934, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.9526358041209526, | |
| "grad_norm": 1.421875, | |
| "learning_rate": 1.3661319073083781e-05, | |
| "loss": 1.2806821823120118, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.9633395772009633, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 1.3590017825311943e-05, | |
| "loss": 1.2375809669494628, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.974043350280974, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 1.3518716577540108e-05, | |
| "loss": 1.2453808784484863, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.9847471233609848, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 1.3447415329768272e-05, | |
| "loss": 1.3074142456054687, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.9954508964409955, | |
| "grad_norm": 1.484375, | |
| "learning_rate": 1.3376114081996437e-05, | |
| "loss": 1.2914584159851075, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 1.0053518865400053, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 1.33048128342246e-05, | |
| "loss": 1.3543176651000977, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 1.016055659620016, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 1.3233511586452763e-05, | |
| "loss": 1.3298683166503906, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.0267594327000267, | |
| "grad_norm": 1.59375, | |
| "learning_rate": 1.3162210338680928e-05, | |
| "loss": 1.3020204544067382, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 1.0374632057800375, | |
| "grad_norm": 1.6484375, | |
| "learning_rate": 1.3090909090909092e-05, | |
| "loss": 1.3046648025512695, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 1.048166978860048, | |
| "grad_norm": 1.65625, | |
| "learning_rate": 1.3019607843137257e-05, | |
| "loss": 1.2308432579040527, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 1.0588707519400589, | |
| "grad_norm": 1.65625, | |
| "learning_rate": 1.294830659536542e-05, | |
| "loss": 1.2811461448669434, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 1.0695745250200697, | |
| "grad_norm": 1.6640625, | |
| "learning_rate": 1.2877005347593583e-05, | |
| "loss": 1.3090335845947265, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.0802782981000802, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 1.2805704099821748e-05, | |
| "loss": 1.2958572387695313, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 1.090982071180091, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 1.2734402852049912e-05, | |
| "loss": 1.326209259033203, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 1.1016858442601016, | |
| "grad_norm": 1.75, | |
| "learning_rate": 1.2663101604278077e-05, | |
| "loss": 1.2520675659179688, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 1.1123896173401124, | |
| "grad_norm": 1.546875, | |
| "learning_rate": 1.259180035650624e-05, | |
| "loss": 1.3478898048400878, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 1.123093390420123, | |
| "grad_norm": 1.4921875, | |
| "learning_rate": 1.2520499108734403e-05, | |
| "loss": 1.2806931495666505, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.1337971635001338, | |
| "grad_norm": 1.75, | |
| "learning_rate": 1.2449197860962568e-05, | |
| "loss": 1.2603809356689453, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 1.1445009365801444, | |
| "grad_norm": 1.484375, | |
| "learning_rate": 1.2377896613190731e-05, | |
| "loss": 1.2837313652038573, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 1.1552047096601552, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 1.2306595365418897e-05, | |
| "loss": 1.271355152130127, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 1.165908482740166, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 1.223529411764706e-05, | |
| "loss": 1.2751256942749023, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 1.1766122558201766, | |
| "grad_norm": 1.5390625, | |
| "learning_rate": 1.2163992869875222e-05, | |
| "loss": 1.2217981338500976, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.1873160289001874, | |
| "grad_norm": 1.46875, | |
| "learning_rate": 1.2092691622103388e-05, | |
| "loss": 1.3460000038146973, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 1.198019801980198, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 1.2021390374331551e-05, | |
| "loss": 1.3119497299194336, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 1.2087235750602088, | |
| "grad_norm": 1.5, | |
| "learning_rate": 1.1950089126559717e-05, | |
| "loss": 1.326594066619873, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 1.2194273481402194, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 1.187878787878788e-05, | |
| "loss": 1.313736343383789, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 1.2301311212202302, | |
| "grad_norm": 1.625, | |
| "learning_rate": 1.1807486631016042e-05, | |
| "loss": 1.2580394744873047, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 1.2408348943002407, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 1.1736185383244208e-05, | |
| "loss": 1.3472198486328124, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 1.2515386673802515, | |
| "grad_norm": 1.6875, | |
| "learning_rate": 1.1664884135472371e-05, | |
| "loss": 1.3223270416259765, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 1.2622424404602621, | |
| "grad_norm": 1.5390625, | |
| "learning_rate": 1.1593582887700537e-05, | |
| "loss": 1.3479475021362304, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 1.272946213540273, | |
| "grad_norm": 1.46875, | |
| "learning_rate": 1.15222816399287e-05, | |
| "loss": 1.2691156387329101, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 1.2836499866202837, | |
| "grad_norm": 1.578125, | |
| "learning_rate": 1.1450980392156862e-05, | |
| "loss": 1.3078096389770508, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.2943537597002943, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 1.1379679144385028e-05, | |
| "loss": 1.2821264266967773, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 1.3050575327803051, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 1.1308377896613191e-05, | |
| "loss": 1.2466256141662597, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 1.3157613058603157, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 1.1237076648841357e-05, | |
| "loss": 1.301154327392578, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 1.3264650789403265, | |
| "grad_norm": 1.6171875, | |
| "learning_rate": 1.116577540106952e-05, | |
| "loss": 1.3058858871459962, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 1.337168852020337, | |
| "grad_norm": 1.6875, | |
| "learning_rate": 1.1094474153297684e-05, | |
| "loss": 1.257982349395752, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 1.3478726251003479, | |
| "grad_norm": 1.546875, | |
| "learning_rate": 1.1023172905525847e-05, | |
| "loss": 1.278379535675049, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 1.3585763981803587, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 1.0951871657754011e-05, | |
| "loss": 1.2998493194580079, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 1.3692801712603693, | |
| "grad_norm": 1.5859375, | |
| "learning_rate": 1.0880570409982176e-05, | |
| "loss": 1.3042527198791505, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 1.3799839443403799, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 1.0809269162210338e-05, | |
| "loss": 1.2903579711914062, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 1.3906877174203907, | |
| "grad_norm": 1.5390625, | |
| "learning_rate": 1.0737967914438504e-05, | |
| "loss": 1.216090202331543, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.4013914905004015, | |
| "grad_norm": 1.4296875, | |
| "learning_rate": 1.0666666666666667e-05, | |
| "loss": 1.2497664451599122, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 1.412095263580412, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 1.0595365418894833e-05, | |
| "loss": 1.2592049598693849, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 1.4227990366604228, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 1.0524064171122996e-05, | |
| "loss": 1.3062689781188965, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 1.4335028097404334, | |
| "grad_norm": 1.515625, | |
| "learning_rate": 1.0452762923351158e-05, | |
| "loss": 1.2577032089233398, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 1.4442065828204442, | |
| "grad_norm": 1.75, | |
| "learning_rate": 1.0381461675579324e-05, | |
| "loss": 1.2874650001525878, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 1.4549103559004548, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 1.0310160427807487e-05, | |
| "loss": 1.2887776374816895, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 1.4656141289804656, | |
| "grad_norm": 1.59375, | |
| "learning_rate": 1.0238859180035653e-05, | |
| "loss": 1.2869946479797363, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 1.4763179020604764, | |
| "grad_norm": 1.7421875, | |
| "learning_rate": 1.0167557932263816e-05, | |
| "loss": 1.3055774688720703, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 1.487021675140487, | |
| "grad_norm": 1.7421875, | |
| "learning_rate": 1.0096256684491978e-05, | |
| "loss": 1.2925223350524901, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 1.4977254482204978, | |
| "grad_norm": 1.5390625, | |
| "learning_rate": 1.0024955436720143e-05, | |
| "loss": 1.3624143600463867, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.5084292213005084, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 9.953654188948307e-06, | |
| "loss": 1.3100957870483398, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 1.5191329943805192, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 9.882352941176472e-06, | |
| "loss": 1.2667318344116212, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 1.5298367674605298, | |
| "grad_norm": 1.5, | |
| "learning_rate": 9.811051693404634e-06, | |
| "loss": 1.2964338302612304, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 1.5405405405405406, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 9.7397504456328e-06, | |
| "loss": 1.2451062202453613, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 1.5512443136205514, | |
| "grad_norm": 1.6484375, | |
| "learning_rate": 9.668449197860963e-06, | |
| "loss": 1.2622719764709474, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 1.561948086700562, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 9.597147950089127e-06, | |
| "loss": 1.2830778121948243, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 1.5726518597805725, | |
| "grad_norm": 1.5390625, | |
| "learning_rate": 9.525846702317292e-06, | |
| "loss": 1.3212904930114746, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 1.5833556328605833, | |
| "grad_norm": 1.515625, | |
| "learning_rate": 9.454545454545456e-06, | |
| "loss": 1.301555347442627, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 1.5940594059405941, | |
| "grad_norm": 1.453125, | |
| "learning_rate": 9.38324420677362e-06, | |
| "loss": 1.2626118659973145, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 1.6047631790206047, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 9.311942959001783e-06, | |
| "loss": 1.2342555046081543, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.6154669521006153, | |
| "grad_norm": 1.7421875, | |
| "learning_rate": 9.240641711229947e-06, | |
| "loss": 1.3167900085449218, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 1.6261707251806263, | |
| "grad_norm": 2.0, | |
| "learning_rate": 9.169340463458112e-06, | |
| "loss": 1.296627902984619, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 1.636874498260637, | |
| "grad_norm": 1.4453125, | |
| "learning_rate": 9.098039215686276e-06, | |
| "loss": 1.275075340270996, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 1.6475782713406475, | |
| "grad_norm": 1.53125, | |
| "learning_rate": 9.02673796791444e-06, | |
| "loss": 1.2771642684936524, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 1.6582820444206583, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 8.955436720142603e-06, | |
| "loss": 1.2907758712768556, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 1.668985817500669, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 8.884135472370767e-06, | |
| "loss": 1.2778194427490235, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 1.6796895905806797, | |
| "grad_norm": 1.5859375, | |
| "learning_rate": 8.81283422459893e-06, | |
| "loss": 1.2820199012756348, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 1.6903933636606903, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 8.741532976827096e-06, | |
| "loss": 1.3197799682617188, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 1.701097136740701, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 8.67023172905526e-06, | |
| "loss": 1.2711196899414063, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 1.7118009098207119, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 8.598930481283423e-06, | |
| "loss": 1.3094602584838868, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.7225046829007225, | |
| "grad_norm": 1.65625, | |
| "learning_rate": 8.527629233511587e-06, | |
| "loss": 1.3037433624267578, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 1.7332084559807333, | |
| "grad_norm": 1.6484375, | |
| "learning_rate": 8.45632798573975e-06, | |
| "loss": 1.2734570503234863, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 1.743912229060744, | |
| "grad_norm": 1.5703125, | |
| "learning_rate": 8.385026737967916e-06, | |
| "loss": 1.2476407051086427, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 1.7546160021407546, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 8.31372549019608e-06, | |
| "loss": 1.3427558898925782, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 1.7653197752207652, | |
| "grad_norm": 1.65625, | |
| "learning_rate": 8.242424242424243e-06, | |
| "loss": 1.273496437072754, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 1.776023548300776, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 8.171122994652407e-06, | |
| "loss": 1.2626665115356446, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 1.7867273213807868, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 8.09982174688057e-06, | |
| "loss": 1.2670047760009766, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 1.7974310944607974, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 8.028520499108736e-06, | |
| "loss": 1.349191665649414, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 1.808134867540808, | |
| "grad_norm": 1.578125, | |
| "learning_rate": 7.9572192513369e-06, | |
| "loss": 1.2989972114562989, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 1.8188386406208188, | |
| "grad_norm": 1.6875, | |
| "learning_rate": 7.885918003565063e-06, | |
| "loss": 1.1850922584533692, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.8295424137008296, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 7.814616755793228e-06, | |
| "loss": 1.3360312461853028, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 1.8402461867808402, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 7.74331550802139e-06, | |
| "loss": 1.2957257270812987, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 1.850949959860851, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 7.672014260249555e-06, | |
| "loss": 1.2536530494689941, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 1.8616537329408618, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 7.60071301247772e-06, | |
| "loss": 1.2660930633544922, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 1.8723575060208724, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 7.529411764705883e-06, | |
| "loss": 1.3080876350402832, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 1.883061279100883, | |
| "grad_norm": 1.640625, | |
| "learning_rate": 7.458110516934047e-06, | |
| "loss": 1.3132406234741212, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 1.8937650521808937, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 7.386809269162211e-06, | |
| "loss": 1.31253080368042, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 1.9044688252609046, | |
| "grad_norm": 1.4140625, | |
| "learning_rate": 7.315508021390375e-06, | |
| "loss": 1.3013240814208984, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 1.9151725983409151, | |
| "grad_norm": 1.4765625, | |
| "learning_rate": 7.244206773618538e-06, | |
| "loss": 1.2744117736816407, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 1.9258763714209257, | |
| "grad_norm": 1.4609375, | |
| "learning_rate": 7.172905525846703e-06, | |
| "loss": 1.3057758331298828, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.9365801445009367, | |
| "grad_norm": 1.515625, | |
| "learning_rate": 7.101604278074867e-06, | |
| "loss": 1.224927043914795, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 1.9472839175809473, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 7.030303030303031e-06, | |
| "loss": 1.3182221412658692, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 1.957987690660958, | |
| "grad_norm": 1.5546875, | |
| "learning_rate": 6.959001782531195e-06, | |
| "loss": 1.2400826454162597, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 1.9686914637409687, | |
| "grad_norm": 1.7421875, | |
| "learning_rate": 6.887700534759358e-06, | |
| "loss": 1.2463386535644532, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 1.9793952368209795, | |
| "grad_norm": 1.46875, | |
| "learning_rate": 6.8163992869875225e-06, | |
| "loss": 1.3235528945922852, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 1.99009900990099, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 6.745098039215687e-06, | |
| "loss": 1.2812946319580079, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 4.1875, | |
| "learning_rate": 6.673796791443851e-06, | |
| "loss": 1.2953272819519044, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 2.0107037730800106, | |
| "grad_norm": 1.5390625, | |
| "learning_rate": 6.602495543672015e-06, | |
| "loss": 1.207719612121582, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 2.0214075461600216, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 6.531194295900179e-06, | |
| "loss": 1.2520846366882323, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 2.032111319240032, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 6.459893048128343e-06, | |
| "loss": 1.2905988693237305, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 2.0428150923200428, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 6.388591800356507e-06, | |
| "loss": 1.3520148277282715, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 2.0535188654000534, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 6.3172905525846705e-06, | |
| "loss": 1.2911107063293457, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 2.0642226384800644, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 6.245989304812835e-06, | |
| "loss": 1.2403117179870606, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 2.074926411560075, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 6.174688057040999e-06, | |
| "loss": 1.3558055877685546, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 2.0856301846400855, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 6.103386809269163e-06, | |
| "loss": 1.3194045066833495, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 2.096333957720096, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 6.032085561497326e-06, | |
| "loss": 1.3321582794189453, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 2.107037730800107, | |
| "grad_norm": 1.625, | |
| "learning_rate": 5.96078431372549e-06, | |
| "loss": 1.288839054107666, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 2.1177415038801177, | |
| "grad_norm": 2.0, | |
| "learning_rate": 5.889483065953655e-06, | |
| "loss": 1.3260244369506835, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 2.1284452769601283, | |
| "grad_norm": 1.5703125, | |
| "learning_rate": 5.8181818181818185e-06, | |
| "loss": 1.2721702575683593, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 2.1391490500401393, | |
| "grad_norm": 1.6640625, | |
| "learning_rate": 5.746880570409983e-06, | |
| "loss": 1.2622364044189454, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 2.14985282312015, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 5.675579322638146e-06, | |
| "loss": 1.30474796295166, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 2.1605565962001605, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 5.60427807486631e-06, | |
| "loss": 1.3109374046325684, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 2.171260369280171, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 5.532976827094475e-06, | |
| "loss": 1.3231799125671386, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 2.181964142360182, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 5.4616755793226384e-06, | |
| "loss": 1.2993489265441895, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 2.1926679154401927, | |
| "grad_norm": 1.875, | |
| "learning_rate": 5.390374331550803e-06, | |
| "loss": 1.3044631958007813, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 2.2033716885202033, | |
| "grad_norm": 1.4609375, | |
| "learning_rate": 5.3190730837789666e-06, | |
| "loss": 1.2702978134155274, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 2.2140754616002143, | |
| "grad_norm": 1.5546875, | |
| "learning_rate": 5.24777183600713e-06, | |
| "loss": 1.287952709197998, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 2.224779234680225, | |
| "grad_norm": 1.5546875, | |
| "learning_rate": 5.176470588235295e-06, | |
| "loss": 1.2974214553833008, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 2.2354830077602355, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 5.105169340463458e-06, | |
| "loss": 1.3148197174072265, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 2.246186780840246, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 5.033868092691623e-06, | |
| "loss": 1.3466445922851562, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 2.256890553920257, | |
| "grad_norm": 1.484375, | |
| "learning_rate": 4.9625668449197864e-06, | |
| "loss": 1.334506893157959, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 2.2675943270002676, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 4.891265597147951e-06, | |
| "loss": 1.279165267944336, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 2.278298100080278, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 4.8199643493761146e-06, | |
| "loss": 1.2512639045715332, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 2.289001873160289, | |
| "grad_norm": 1.59375, | |
| "learning_rate": 4.748663101604278e-06, | |
| "loss": 1.2572649002075196, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 2.2997056462403, | |
| "grad_norm": 1.5625, | |
| "learning_rate": 4.677361853832442e-06, | |
| "loss": 1.2503036499023437, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 2.3104094193203104, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 4.606060606060606e-06, | |
| "loss": 1.2866994857788085, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 2.321113192400321, | |
| "grad_norm": 1.6171875, | |
| "learning_rate": 4.534759358288771e-06, | |
| "loss": 1.2810638427734375, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 2.331816965480332, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 4.4634581105169345e-06, | |
| "loss": 1.2588828086853028, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 2.3425207385603426, | |
| "grad_norm": 1.6875, | |
| "learning_rate": 4.392156862745098e-06, | |
| "loss": 1.2615557670593263, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 2.353224511640353, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 4.320855614973263e-06, | |
| "loss": 1.2974510192871094, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 2.3639282847203638, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 4.249554367201426e-06, | |
| "loss": 1.303697681427002, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 2.374632057800375, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 4.178253119429591e-06, | |
| "loss": 1.303341007232666, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 2.3853358308803854, | |
| "grad_norm": 1.46875, | |
| "learning_rate": 4.106951871657754e-06, | |
| "loss": 1.306796932220459, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 2.396039603960396, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 4.035650623885918e-06, | |
| "loss": 1.3068408012390136, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 2.4067433770404065, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 3.9643493761140825e-06, | |
| "loss": 1.307657527923584, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 2.4174471501204176, | |
| "grad_norm": 1.75, | |
| "learning_rate": 3.893048128342246e-06, | |
| "loss": 1.2932353973388673, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 2.428150923200428, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 3.821746880570411e-06, | |
| "loss": 1.2625031471252441, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 2.4388546962804387, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 3.7504456327985743e-06, | |
| "loss": 1.3354209899902343, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 2.4495584693604497, | |
| "grad_norm": 1.3671875, | |
| "learning_rate": 3.6791443850267383e-06, | |
| "loss": 1.200312042236328, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 2.4602622424404603, | |
| "grad_norm": 1.6484375, | |
| "learning_rate": 3.6078431372549024e-06, | |
| "loss": 1.2868337631225586, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 2.470966015520471, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 3.536541889483066e-06, | |
| "loss": 1.2731021881103515, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 2.4816697886004815, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 3.46524064171123e-06, | |
| "loss": 1.3145703315734862, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 2.4923735616804925, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 3.3939393939393946e-06, | |
| "loss": 1.305215549468994, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 2.503077334760503, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 3.322638146167558e-06, | |
| "loss": 1.3567096710205078, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 2.5137811078405137, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 3.2513368983957223e-06, | |
| "loss": 1.3081507682800293, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 2.5244848809205243, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 3.180035650623886e-06, | |
| "loss": 1.300461483001709, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 2.5351886540005353, | |
| "grad_norm": 1.5859375, | |
| "learning_rate": 3.10873440285205e-06, | |
| "loss": 1.3006972312927245, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 2.545892427080546, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 3.0374331550802145e-06, | |
| "loss": 1.3157925605773926, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 2.5565962001605564, | |
| "grad_norm": 1.5625, | |
| "learning_rate": 2.966131907308378e-06, | |
| "loss": 1.2608634948730468, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 2.5672999732405675, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 2.894830659536542e-06, | |
| "loss": 1.237275981903076, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 2.578003746320578, | |
| "grad_norm": 1.5859375, | |
| "learning_rate": 2.8235294117647062e-06, | |
| "loss": 1.274481964111328, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 2.5887075194005886, | |
| "grad_norm": 1.65625, | |
| "learning_rate": 2.75222816399287e-06, | |
| "loss": 1.3146997451782227, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 2.5994112924805997, | |
| "grad_norm": 1.6640625, | |
| "learning_rate": 2.680926916221034e-06, | |
| "loss": 1.3125761032104493, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 2.6101150655606102, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 2.6096256684491984e-06, | |
| "loss": 1.2919845581054688, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 2.620818838640621, | |
| "grad_norm": 1.6484375, | |
| "learning_rate": 2.538324420677362e-06, | |
| "loss": 1.2364542961120606, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 2.6315226117206314, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 2.467023172905526e-06, | |
| "loss": 1.2624409675598145, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 2.642226384800642, | |
| "grad_norm": 1.5859375, | |
| "learning_rate": 2.3957219251336898e-06, | |
| "loss": 1.3075796127319337, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 2.652930157880653, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 2.3244206773618542e-06, | |
| "loss": 1.2835824012756347, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 2.6636339309606636, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 2.253119429590018e-06, | |
| "loss": 1.307276153564453, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 2.674337704040674, | |
| "grad_norm": 1.7421875, | |
| "learning_rate": 2.181818181818182e-06, | |
| "loss": 1.2622486114501954, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 2.685041477120685, | |
| "grad_norm": 1.6796875, | |
| "learning_rate": 2.110516934046346e-06, | |
| "loss": 1.2514682769775392, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 2.6957452502006958, | |
| "grad_norm": 1.5625, | |
| "learning_rate": 2.03921568627451e-06, | |
| "loss": 1.2614849090576172, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 2.7064490232807064, | |
| "grad_norm": 1.5390625, | |
| "learning_rate": 1.9679144385026737e-06, | |
| "loss": 1.3241849899291993, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 2.7171527963607174, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 1.896613190730838e-06, | |
| "loss": 1.2841781616210937, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 2.727856569440728, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 1.8253119429590018e-06, | |
| "loss": 1.3062438011169433, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 2.7385603425207385, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 1.7540106951871661e-06, | |
| "loss": 1.3065251350402831, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 2.749264115600749, | |
| "grad_norm": 1.65625, | |
| "learning_rate": 1.68270944741533e-06, | |
| "loss": 1.2980451583862305, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 2.7599678886807597, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 1.6114081996434938e-06, | |
| "loss": 1.2766281127929688, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 2.7706716617607707, | |
| "grad_norm": 1.5859375, | |
| "learning_rate": 1.5401069518716579e-06, | |
| "loss": 1.3033970832824706, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 2.7813754348407813, | |
| "grad_norm": 1.4921875, | |
| "learning_rate": 1.468805704099822e-06, | |
| "loss": 1.2335359573364257, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 2.792079207920792, | |
| "grad_norm": 1.6484375, | |
| "learning_rate": 1.3975044563279858e-06, | |
| "loss": 1.3184511184692382, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 2.802782981000803, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 1.3262032085561499e-06, | |
| "loss": 1.1845362663269043, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 2.8134867540808135, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 1.2549019607843137e-06, | |
| "loss": 1.2506700515747071, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 2.824190527160824, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 1.1836007130124778e-06, | |
| "loss": 1.2360112190246582, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 2.834894300240835, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 1.1122994652406418e-06, | |
| "loss": 1.2875761032104491, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 2.8455980733208457, | |
| "grad_norm": 1.578125, | |
| "learning_rate": 1.0409982174688057e-06, | |
| "loss": 1.2473506927490234, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 2.8563018464008563, | |
| "grad_norm": 1.546875, | |
| "learning_rate": 9.696969696969698e-07, | |
| "loss": 1.3208060264587402, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 2.867005619480867, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 8.983957219251338e-07, | |
| "loss": 1.3371116638183593, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 2.8777093925608774, | |
| "grad_norm": 1.484375, | |
| "learning_rate": 8.270944741532977e-07, | |
| "loss": 1.2605000495910645, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 2.8884131656408885, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 7.557932263814617e-07, | |
| "loss": 1.267725658416748, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 2.899116938720899, | |
| "grad_norm": 1.640625, | |
| "learning_rate": 6.844919786096257e-07, | |
| "loss": 1.27689208984375, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 2.9098207118009096, | |
| "grad_norm": 1.578125, | |
| "learning_rate": 6.131907308377896e-07, | |
| "loss": 1.286923885345459, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 2.9205244848809206, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 5.418894830659537e-07, | |
| "loss": 1.330905055999756, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 2.9312282579609312, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 4.7058823529411767e-07, | |
| "loss": 1.2354840278625487, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 2.941932031040942, | |
| "grad_norm": 1.4609375, | |
| "learning_rate": 3.992869875222817e-07, | |
| "loss": 1.2647834777832032, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 2.952635804120953, | |
| "grad_norm": 1.6875, | |
| "learning_rate": 3.2798573975044564e-07, | |
| "loss": 1.2786317825317384, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 2.9633395772009634, | |
| "grad_norm": 1.53125, | |
| "learning_rate": 2.5668449197860965e-07, | |
| "loss": 1.2594982147216798, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 2.974043350280974, | |
| "grad_norm": 1.578125, | |
| "learning_rate": 1.8538324420677363e-07, | |
| "loss": 1.3203317642211914, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 2.984747123360985, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 1.1408199643493762e-07, | |
| "loss": 1.224764347076416, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 2.9954508964409956, | |
| "grad_norm": 1.5078125, | |
| "learning_rate": 4.2780748663101606e-08, | |
| "loss": 1.2845193862915039, | |
| "step": 2800 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 2805, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 7314356060356608.0, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |