| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.7089684509039348, |
| "eval_steps": 500, |
| "global_step": 2000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.003544842254519674, |
| "grad_norm": 132.5248565673828, |
| "learning_rate": 6.373937677053824e-07, |
| "loss": 14.446830749511719, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.007089684509039348, |
| "grad_norm": 54.77531051635742, |
| "learning_rate": 1.3456090651558075e-06, |
| "loss": 13.234834289550781, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.010634526763559022, |
| "grad_norm": 6.283421993255615, |
| "learning_rate": 2.0538243626062327e-06, |
| "loss": 11.701119995117187, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.014179369018078695, |
| "grad_norm": 9.19540023803711, |
| "learning_rate": 2.762039660056657e-06, |
| "loss": 11.269725799560547, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.01772421127259837, |
| "grad_norm": 3.588550329208374, |
| "learning_rate": 3.4702549575070827e-06, |
| "loss": 11.082182312011719, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.021269053527118043, |
| "grad_norm": 4.2412614822387695, |
| "learning_rate": 4.178470254957508e-06, |
| "loss": 10.633071899414062, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.02481389578163772, |
| "grad_norm": 2.81349515914917, |
| "learning_rate": 4.886685552407932e-06, |
| "loss": 10.016432189941407, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.02835873803615739, |
| "grad_norm": 2.157428026199341, |
| "learning_rate": 5.594900849858357e-06, |
| "loss": 9.317854309082032, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.03190358029067707, |
| "grad_norm": 1.53337824344635, |
| "learning_rate": 6.3031161473087825e-06, |
| "loss": 8.773271179199218, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.03544842254519674, |
| "grad_norm": 1.2842110395431519, |
| "learning_rate": 7.011331444759208e-06, |
| "loss": 8.46307373046875, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.03899326479971641, |
| "grad_norm": 0.8940983414649963, |
| "learning_rate": 7.719546742209632e-06, |
| "loss": 8.362952423095702, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.042538107054236086, |
| "grad_norm": 1.2323634624481201, |
| "learning_rate": 8.427762039660058e-06, |
| "loss": 8.313992309570313, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.04608294930875576, |
| "grad_norm": 0.8698780536651611, |
| "learning_rate": 9.135977337110482e-06, |
| "loss": 8.313835906982423, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.04962779156327544, |
| "grad_norm": 0.6485510468482971, |
| "learning_rate": 9.844192634560907e-06, |
| "loss": 8.277175140380859, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.053172633817795106, |
| "grad_norm": 1.8881973028182983, |
| "learning_rate": 1.0552407932011333e-05, |
| "loss": 8.263162231445312, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.05671747607231478, |
| "grad_norm": 1.2178221940994263, |
| "learning_rate": 1.1260623229461757e-05, |
| "loss": 8.26162109375, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.06026231832683446, |
| "grad_norm": 2.23687481880188, |
| "learning_rate": 1.1968838526912181e-05, |
| "loss": 8.220828247070312, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.06380716058135413, |
| "grad_norm": 2.2403409481048584, |
| "learning_rate": 1.2677053824362606e-05, |
| "loss": 8.156272888183594, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.06735200283587381, |
| "grad_norm": 4.979323863983154, |
| "learning_rate": 1.3385269121813032e-05, |
| "loss": 8.121318054199218, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.07089684509039348, |
| "grad_norm": 3.5296337604522705, |
| "learning_rate": 1.4093484419263456e-05, |
| "loss": 8.086737823486327, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.07444168734491315, |
| "grad_norm": 2.324810743331909, |
| "learning_rate": 1.4801699716713882e-05, |
| "loss": 8.067035675048828, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.07798652959943282, |
| "grad_norm": 0.9995874166488647, |
| "learning_rate": 1.5509915014164305e-05, |
| "loss": 8.020206451416016, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.0815313718539525, |
| "grad_norm": 1.183273196220398, |
| "learning_rate": 1.6218130311614733e-05, |
| "loss": 7.9764549255371096, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.08507621410847217, |
| "grad_norm": 0.9968162775039673, |
| "learning_rate": 1.6926345609065157e-05, |
| "loss": 7.941542053222657, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.08862105636299185, |
| "grad_norm": 1.453206181526184, |
| "learning_rate": 1.763456090651558e-05, |
| "loss": 7.900083923339844, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.09216589861751152, |
| "grad_norm": 1.9150718450546265, |
| "learning_rate": 1.8342776203966006e-05, |
| "loss": 7.868098449707031, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.0957107408720312, |
| "grad_norm": 1.4079521894454956, |
| "learning_rate": 1.9050991501416433e-05, |
| "loss": 7.8084770202636715, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.09925558312655088, |
| "grad_norm": 2.327735424041748, |
| "learning_rate": 1.9759206798866854e-05, |
| "loss": 7.770416259765625, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.10280042538107054, |
| "grad_norm": 3.1308300495147705, |
| "learning_rate": 2.0467422096317282e-05, |
| "loss": 7.720582580566406, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.10634526763559021, |
| "grad_norm": 1.5730700492858887, |
| "learning_rate": 2.1175637393767706e-05, |
| "loss": 7.653485107421875, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.10989010989010989, |
| "grad_norm": 1.9993301630020142, |
| "learning_rate": 2.188385269121813e-05, |
| "loss": 7.584654998779297, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.11343495214462956, |
| "grad_norm": 3.556170701980591, |
| "learning_rate": 2.2592067988668555e-05, |
| "loss": 7.472293090820313, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.11697979439914924, |
| "grad_norm": 1.5170807838439941, |
| "learning_rate": 2.3300283286118983e-05, |
| "loss": 7.420698547363282, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.12052463665366892, |
| "grad_norm": 1.8956718444824219, |
| "learning_rate": 2.4008498583569404e-05, |
| "loss": 7.352853393554687, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.12406947890818859, |
| "grad_norm": 3.6424927711486816, |
| "learning_rate": 2.471671388101983e-05, |
| "loss": 7.289952087402344, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.12761432116270827, |
| "grad_norm": 3.950108528137207, |
| "learning_rate": 2.542492917847026e-05, |
| "loss": 7.2670539855957035, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.13115916341722794, |
| "grad_norm": 3.1376140117645264, |
| "learning_rate": 2.613314447592068e-05, |
| "loss": 7.219367980957031, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.13470400567174762, |
| "grad_norm": 2.8113393783569336, |
| "learning_rate": 2.6841359773371104e-05, |
| "loss": 7.1138450622558596, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.1382488479262673, |
| "grad_norm": 1.7290911674499512, |
| "learning_rate": 2.7549575070821532e-05, |
| "loss": 7.069696044921875, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.14179369018078697, |
| "grad_norm": 2.157928705215454, |
| "learning_rate": 2.8257790368271957e-05, |
| "loss": 7.032493591308594, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.14533853243530662, |
| "grad_norm": 2.0297610759735107, |
| "learning_rate": 2.8966005665722377e-05, |
| "loss": 6.974455261230469, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.1488833746898263, |
| "grad_norm": 2.0817925930023193, |
| "learning_rate": 2.9674220963172805e-05, |
| "loss": 6.954932403564453, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.15242821694434597, |
| "grad_norm": 2.302412271499634, |
| "learning_rate": 3.0382436260623233e-05, |
| "loss": 6.9087669372558596, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.15597305919886564, |
| "grad_norm": 2.4860918521881104, |
| "learning_rate": 3.109065155807366e-05, |
| "loss": 6.853008270263672, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.15951790145338532, |
| "grad_norm": 1.9176361560821533, |
| "learning_rate": 3.179886685552408e-05, |
| "loss": 6.821636199951172, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.163062743707905, |
| "grad_norm": 2.412122964859009, |
| "learning_rate": 3.2507082152974506e-05, |
| "loss": 6.7895256042480465, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.16660758596242467, |
| "grad_norm": 2.1598598957061768, |
| "learning_rate": 3.3215297450424934e-05, |
| "loss": 6.758164978027343, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.17015242821694435, |
| "grad_norm": 1.4905693531036377, |
| "learning_rate": 3.3923512747875355e-05, |
| "loss": 6.707500457763672, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.17369727047146402, |
| "grad_norm": 2.570793628692627, |
| "learning_rate": 3.4631728045325776e-05, |
| "loss": 6.675039672851563, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.1772421127259837, |
| "grad_norm": 1.7906855344772339, |
| "learning_rate": 3.53399433427762e-05, |
| "loss": 6.652853393554688, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.18078695498050337, |
| "grad_norm": 2.3503963947296143, |
| "learning_rate": 3.604815864022663e-05, |
| "loss": 6.630236053466797, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.18433179723502305, |
| "grad_norm": 2.17346453666687, |
| "learning_rate": 3.675637393767706e-05, |
| "loss": 6.607561492919922, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.18787663948954272, |
| "grad_norm": 2.7891957759857178, |
| "learning_rate": 3.746458923512748e-05, |
| "loss": 6.555358123779297, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.1914214817440624, |
| "grad_norm": 1.5042469501495361, |
| "learning_rate": 3.817280453257791e-05, |
| "loss": 6.542356872558594, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.19496632399858208, |
| "grad_norm": 2.329241991043091, |
| "learning_rate": 3.888101983002833e-05, |
| "loss": 6.541840362548828, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.19851116625310175, |
| "grad_norm": 2.9417874813079834, |
| "learning_rate": 3.9589235127478756e-05, |
| "loss": 6.506352233886719, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.2020560085076214, |
| "grad_norm": 1.765146255493164, |
| "learning_rate": 4.029745042492918e-05, |
| "loss": 6.494882202148437, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.20560085076214107, |
| "grad_norm": 1.5695878267288208, |
| "learning_rate": 4.1005665722379605e-05, |
| "loss": 6.457882690429687, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.20914569301666075, |
| "grad_norm": 2.1169416904449463, |
| "learning_rate": 4.171388101983003e-05, |
| "loss": 6.4466796875, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.21269053527118043, |
| "grad_norm": 2.836350679397583, |
| "learning_rate": 4.242209631728046e-05, |
| "loss": 6.394796371459961, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.2162353775257001, |
| "grad_norm": 2.057159423828125, |
| "learning_rate": 4.313031161473088e-05, |
| "loss": 6.309791564941406, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.21978021978021978, |
| "grad_norm": 1.9764236211776733, |
| "learning_rate": 4.38385269121813e-05, |
| "loss": 6.143642425537109, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.22332506203473945, |
| "grad_norm": 1.5374716520309448, |
| "learning_rate": 4.454674220963173e-05, |
| "loss": 6.059618377685547, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.22686990428925913, |
| "grad_norm": 1.504021406173706, |
| "learning_rate": 4.525495750708216e-05, |
| "loss": 5.966709899902344, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.2304147465437788, |
| "grad_norm": 1.8329981565475464, |
| "learning_rate": 4.596317280453258e-05, |
| "loss": 5.919542694091797, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.23395958879829848, |
| "grad_norm": 1.955461859703064, |
| "learning_rate": 4.6671388101983006e-05, |
| "loss": 5.863460540771484, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.23750443105281815, |
| "grad_norm": 1.6255284547805786, |
| "learning_rate": 4.7379603399433434e-05, |
| "loss": 5.847004318237305, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.24104927330733783, |
| "grad_norm": 1.6530513763427734, |
| "learning_rate": 4.8087818696883855e-05, |
| "loss": 5.811199569702149, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.2445941155618575, |
| "grad_norm": 1.6961824893951416, |
| "learning_rate": 4.8796033994334276e-05, |
| "loss": 5.7780517578125, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.24813895781637718, |
| "grad_norm": 1.508845567703247, |
| "learning_rate": 4.9504249291784704e-05, |
| "loss": 5.740897750854492, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.25168380007089686, |
| "grad_norm": 1.4073859453201294, |
| "learning_rate": 4.999999381545897e-05, |
| "loss": 5.743928146362305, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.25522864232541653, |
| "grad_norm": 1.2714122533798218, |
| "learning_rate": 4.999988386814785e-05, |
| "loss": 5.7146648406982425, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.2587734845799362, |
| "grad_norm": 1.105147361755371, |
| "learning_rate": 4.999963648728715e-05, |
| "loss": 5.703514862060547, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.2623183268344559, |
| "grad_norm": 1.281786561012268, |
| "learning_rate": 4.99992516742368e-05, |
| "loss": 5.659426879882813, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.26586316908897556, |
| "grad_norm": 1.4539729356765747, |
| "learning_rate": 4.999872943111228e-05, |
| "loss": 5.675305938720703, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.26940801134349524, |
| "grad_norm": 1.2878540754318237, |
| "learning_rate": 4.9998069760784536e-05, |
| "loss": 5.644029998779297, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.2729528535980149, |
| "grad_norm": 1.2199702262878418, |
| "learning_rate": 4.9997272666880024e-05, |
| "loss": 5.6056877136230465, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.2764976958525346, |
| "grad_norm": 1.2377008199691772, |
| "learning_rate": 4.999633815378066e-05, |
| "loss": 5.609469604492188, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.28004253810705426, |
| "grad_norm": 1.278878927230835, |
| "learning_rate": 4.9995266226623807e-05, |
| "loss": 5.626304626464844, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.28358738036157394, |
| "grad_norm": 1.3319532871246338, |
| "learning_rate": 4.999405689130224e-05, |
| "loss": 5.580442428588867, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.28713222261609356, |
| "grad_norm": 1.683655858039856, |
| "learning_rate": 4.9992710154464116e-05, |
| "loss": 5.579409790039063, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.29067706487061323, |
| "grad_norm": 1.1337031126022339, |
| "learning_rate": 4.999122602351296e-05, |
| "loss": 5.577402877807617, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.2942219071251329, |
| "grad_norm": 1.3304811716079712, |
| "learning_rate": 4.9989604506607564e-05, |
| "loss": 5.547556686401367, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.2977667493796526, |
| "grad_norm": 1.2711671590805054, |
| "learning_rate": 4.998784561266201e-05, |
| "loss": 5.548542404174805, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.30131159163417226, |
| "grad_norm": 1.3355711698532104, |
| "learning_rate": 4.998594935134559e-05, |
| "loss": 5.53816032409668, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.30485643388869194, |
| "grad_norm": 1.1520898342132568, |
| "learning_rate": 4.998391573308275e-05, |
| "loss": 5.541797637939453, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.3084012761432116, |
| "grad_norm": 1.3476178646087646, |
| "learning_rate": 4.998174476905303e-05, |
| "loss": 5.528475570678711, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.3119461183977313, |
| "grad_norm": 0.979820191860199, |
| "learning_rate": 4.9979436471191015e-05, |
| "loss": 5.501230621337891, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.31549096065225096, |
| "grad_norm": 0.9973044395446777, |
| "learning_rate": 4.997699085218628e-05, |
| "loss": 5.48109130859375, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.31903580290677064, |
| "grad_norm": 1.2147647142410278, |
| "learning_rate": 4.9974407925483275e-05, |
| "loss": 5.5053356170654295, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.3225806451612903, |
| "grad_norm": 1.174742579460144, |
| "learning_rate": 4.9971687705281305e-05, |
| "loss": 5.525197982788086, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.32612548741581, |
| "grad_norm": 1.0364092588424683, |
| "learning_rate": 4.9968830206534426e-05, |
| "loss": 5.507562255859375, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.32967032967032966, |
| "grad_norm": 1.1423817873001099, |
| "learning_rate": 4.9965835444951345e-05, |
| "loss": 5.50116081237793, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.33321517192484934, |
| "grad_norm": 1.063589096069336, |
| "learning_rate": 4.996270343699539e-05, |
| "loss": 5.483753967285156, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.336760014179369, |
| "grad_norm": 1.156726360321045, |
| "learning_rate": 4.995943419988433e-05, |
| "loss": 5.5090789794921875, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.3403048564338887, |
| "grad_norm": 1.3510740995407104, |
| "learning_rate": 4.995602775159038e-05, |
| "loss": 5.480130767822265, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.34384969868840837, |
| "grad_norm": 1.1639209985733032, |
| "learning_rate": 4.995248411084004e-05, |
| "loss": 5.487713623046875, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.34739454094292804, |
| "grad_norm": 1.0440890789031982, |
| "learning_rate": 4.9948803297114e-05, |
| "loss": 5.465737152099609, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.3509393831974477, |
| "grad_norm": 0.9426578283309937, |
| "learning_rate": 4.9944985330647045e-05, |
| "loss": 5.451010894775391, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.3544842254519674, |
| "grad_norm": 0.8856348991394043, |
| "learning_rate": 4.9941030232427945e-05, |
| "loss": 5.4333232879638675, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.35802906770648707, |
| "grad_norm": 1.196390986442566, |
| "learning_rate": 4.993693802419933e-05, |
| "loss": 5.440399169921875, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.36157390996100675, |
| "grad_norm": 0.9776602387428284, |
| "learning_rate": 4.993270872845756e-05, |
| "loss": 5.427825164794922, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.3651187522155264, |
| "grad_norm": 0.776901125907898, |
| "learning_rate": 4.992834236845264e-05, |
| "loss": 5.455727386474609, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.3686635944700461, |
| "grad_norm": 0.8997649550437927, |
| "learning_rate": 4.992383896818805e-05, |
| "loss": 5.438071823120117, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.37220843672456577, |
| "grad_norm": 0.8250705003738403, |
| "learning_rate": 4.991919855242065e-05, |
| "loss": 5.4653472900390625, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.37575327897908545, |
| "grad_norm": 0.8961722254753113, |
| "learning_rate": 4.991442114666049e-05, |
| "loss": 5.4244224548339846, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.3792981212336051, |
| "grad_norm": 0.9243871569633484, |
| "learning_rate": 4.990950677717073e-05, |
| "loss": 5.4270378112792965, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.3828429634881248, |
| "grad_norm": 0.9992454051971436, |
| "learning_rate": 4.990445547096748e-05, |
| "loss": 5.421928024291992, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.3863878057426445, |
| "grad_norm": 0.8732923269271851, |
| "learning_rate": 4.989926725581962e-05, |
| "loss": 5.399656295776367, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.38993264799716415, |
| "grad_norm": 1.0609304904937744, |
| "learning_rate": 4.989394216024866e-05, |
| "loss": 5.381362533569336, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.3934774902516838, |
| "grad_norm": 0.8973801732063293, |
| "learning_rate": 4.9888480213528624e-05, |
| "loss": 5.3848522186279295, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.3970223325062035, |
| "grad_norm": 0.8304575085639954, |
| "learning_rate": 4.988288144568583e-05, |
| "loss": 5.370730590820313, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.4005671747607231, |
| "grad_norm": 1.0252659320831299, |
| "learning_rate": 4.9877145887498774e-05, |
| "loss": 5.375761032104492, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.4041120170152428, |
| "grad_norm": 0.9789050221443176, |
| "learning_rate": 4.9871273570497924e-05, |
| "loss": 5.376107788085937, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.4076568592697625, |
| "grad_norm": 0.7593803405761719, |
| "learning_rate": 4.986526452696556e-05, |
| "loss": 5.3734375, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.41120170152428215, |
| "grad_norm": 0.7464333772659302, |
| "learning_rate": 4.98591187899356e-05, |
| "loss": 5.366069793701172, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.4147465437788018, |
| "grad_norm": 0.7203890085220337, |
| "learning_rate": 4.9852836393193436e-05, |
| "loss": 5.3826652526855465, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.4182913860333215, |
| "grad_norm": 0.9423730373382568, |
| "learning_rate": 4.984641737127569e-05, |
| "loss": 5.383267974853515, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.4218362282878412, |
| "grad_norm": 0.8559587597846985, |
| "learning_rate": 4.98398617594701e-05, |
| "loss": 5.379903411865234, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.42538107054236085, |
| "grad_norm": 0.7626580595970154, |
| "learning_rate": 4.9833169593815264e-05, |
| "loss": 5.348976135253906, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.4289259127968805, |
| "grad_norm": 0.9729316830635071, |
| "learning_rate": 4.9826340911100484e-05, |
| "loss": 5.38103141784668, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.4324707550514002, |
| "grad_norm": 0.9993200898170471, |
| "learning_rate": 4.981937574886553e-05, |
| "loss": 5.35406265258789, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.4360155973059199, |
| "grad_norm": 0.8553763031959534, |
| "learning_rate": 4.9812274145400476e-05, |
| "loss": 5.344794845581054, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.43956043956043955, |
| "grad_norm": 0.8764554262161255, |
| "learning_rate": 4.980503613974542e-05, |
| "loss": 5.346466064453125, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.44310528181495923, |
| "grad_norm": 0.931968629360199, |
| "learning_rate": 4.9797661771690355e-05, |
| "loss": 5.383474349975586, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.4466501240694789, |
| "grad_norm": 0.9033966660499573, |
| "learning_rate": 4.9790151081774894e-05, |
| "loss": 5.3460533142089846, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.4501949663239986, |
| "grad_norm": 0.7806386947631836, |
| "learning_rate": 4.978250411128805e-05, |
| "loss": 5.311779022216797, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.45373980857851826, |
| "grad_norm": 0.9154277443885803, |
| "learning_rate": 4.9774720902268045e-05, |
| "loss": 5.3727764129638675, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.45728465083303793, |
| "grad_norm": 0.9599602222442627, |
| "learning_rate": 4.9766801497502025e-05, |
| "loss": 5.323087692260742, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.4608294930875576, |
| "grad_norm": 0.7230603098869324, |
| "learning_rate": 4.9758745940525874e-05, |
| "loss": 5.366838073730468, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.4643743353420773, |
| "grad_norm": 1.0107765197753906, |
| "learning_rate": 4.975055427562396e-05, |
| "loss": 5.308674240112305, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.46791917759659696, |
| "grad_norm": 0.8170345425605774, |
| "learning_rate": 4.974222654782885e-05, |
| "loss": 5.34793701171875, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.47146401985111663, |
| "grad_norm": 0.9455390572547913, |
| "learning_rate": 4.973376280292115e-05, |
| "loss": 5.278959274291992, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.4750088621056363, |
| "grad_norm": 0.7414869666099548, |
| "learning_rate": 4.9725163087429164e-05, |
| "loss": 5.352619934082031, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.478553704360156, |
| "grad_norm": 0.6697789430618286, |
| "learning_rate": 4.971642744862869e-05, |
| "loss": 5.344553375244141, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.48209854661467566, |
| "grad_norm": 0.7350028157234192, |
| "learning_rate": 4.9707555934542735e-05, |
| "loss": 5.331580352783203, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.48564338886919534, |
| "grad_norm": 0.7793420553207397, |
| "learning_rate": 4.9698548593941295e-05, |
| "loss": 5.325060272216797, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.489188231123715, |
| "grad_norm": 0.9029025435447693, |
| "learning_rate": 4.968940547634102e-05, |
| "loss": 5.306049728393555, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.4927330733782347, |
| "grad_norm": 0.8494910597801208, |
| "learning_rate": 4.9680126632004984e-05, |
| "loss": 5.327105331420898, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.49627791563275436, |
| "grad_norm": 0.7679411172866821, |
| "learning_rate": 4.967071211194241e-05, |
| "loss": 5.330787277221679, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.49982275788727404, |
| "grad_norm": 0.7874600291252136, |
| "learning_rate": 4.966116196790836e-05, |
| "loss": 5.342826461791992, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.5033676001417937, |
| "grad_norm": 0.9020946025848389, |
| "learning_rate": 4.965147625240351e-05, |
| "loss": 5.3182518005371096, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.5069124423963134, |
| "grad_norm": 0.7656190991401672, |
| "learning_rate": 4.964165501867378e-05, |
| "loss": 5.303837585449219, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.5104572846508331, |
| "grad_norm": 0.7332035303115845, |
| "learning_rate": 4.9631698320710115e-05, |
| "loss": 5.324761962890625, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.5140021269053527, |
| "grad_norm": 0.6765123009681702, |
| "learning_rate": 4.962160621324813e-05, |
| "loss": 5.318471527099609, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.5175469691598724, |
| "grad_norm": 0.7399813532829285, |
| "learning_rate": 4.9611378751767854e-05, |
| "loss": 5.327352905273438, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.5210918114143921, |
| "grad_norm": 0.9081066846847534, |
| "learning_rate": 4.96010159924934e-05, |
| "loss": 5.302130889892578, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.5246366536689118, |
| "grad_norm": 0.8822699189186096, |
| "learning_rate": 4.959051799239267e-05, |
| "loss": 5.313935470581055, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.5281814959234314, |
| "grad_norm": 0.7201648354530334, |
| "learning_rate": 4.9579884809177024e-05, |
| "loss": 5.290200805664062, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.5317263381779511, |
| "grad_norm": 0.6942399740219116, |
| "learning_rate": 4.956911650130098e-05, |
| "loss": 5.262855148315429, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.5352711804324708, |
| "grad_norm": 0.7676780819892883, |
| "learning_rate": 4.955821312796188e-05, |
| "loss": 5.308451461791992, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.5388160226869905, |
| "grad_norm": 0.8454974889755249, |
| "learning_rate": 4.954717474909958e-05, |
| "loss": 5.288463592529297, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.5423608649415101, |
| "grad_norm": 0.622800350189209, |
| "learning_rate": 4.953600142539609e-05, |
| "loss": 5.301121520996094, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.5459057071960298, |
| "grad_norm": 0.7299189567565918, |
| "learning_rate": 4.9524693218275306e-05, |
| "loss": 5.29693603515625, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.5494505494505495, |
| "grad_norm": 0.7149349451065063, |
| "learning_rate": 4.951325018990258e-05, |
| "loss": 5.285152435302734, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.5529953917050692, |
| "grad_norm": 0.7443231344223022, |
| "learning_rate": 4.950167240318444e-05, |
| "loss": 5.3107250213623045, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.5565402339595888, |
| "grad_norm": 0.7101079821586609, |
| "learning_rate": 4.948995992176824e-05, |
| "loss": 5.288132858276367, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.5600850762141085, |
| "grad_norm": 0.6409624218940735, |
| "learning_rate": 4.94781128100418e-05, |
| "loss": 5.258818817138672, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.5636299184686282, |
| "grad_norm": 0.7422112226486206, |
| "learning_rate": 4.946613113313304e-05, |
| "loss": 5.310946273803711, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.5671747607231479, |
| "grad_norm": 0.7419252991676331, |
| "learning_rate": 4.9454014956909644e-05, |
| "loss": 5.281931686401367, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.5707196029776674, |
| "grad_norm": 0.6587216258049011, |
| "learning_rate": 4.944176434797869e-05, |
| "loss": 5.29292106628418, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.5742644452321871, |
| "grad_norm": 0.7236230373382568, |
| "learning_rate": 4.942937937368628e-05, |
| "loss": 5.291954803466797, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.5778092874867068, |
| "grad_norm": 0.8149150013923645, |
| "learning_rate": 4.941686010211715e-05, |
| "loss": 5.252371978759766, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.5813541297412265, |
| "grad_norm": 0.6907062530517578, |
| "learning_rate": 4.940420660209436e-05, |
| "loss": 5.272654342651367, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.5848989719957461, |
| "grad_norm": 0.7174138426780701, |
| "learning_rate": 4.9391418943178836e-05, |
| "loss": 5.294339752197265, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.5884438142502658, |
| "grad_norm": 0.798039972782135, |
| "learning_rate": 4.9378497195669036e-05, |
| "loss": 5.257662963867188, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.5919886565047855, |
| "grad_norm": 0.724810004234314, |
| "learning_rate": 4.936544143060058e-05, |
| "loss": 5.284110641479492, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.5955334987593052, |
| "grad_norm": 0.7931723594665527, |
| "learning_rate": 4.9352251719745774e-05, |
| "loss": 5.281097412109375, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.5990783410138248, |
| "grad_norm": 0.6884552836418152, |
| "learning_rate": 4.933892813561333e-05, |
| "loss": 5.27345085144043, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.6026231832683445, |
| "grad_norm": 0.6960254907608032, |
| "learning_rate": 4.9325470751447866e-05, |
| "loss": 5.2600860595703125, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.6061680255228642, |
| "grad_norm": 0.7212153077125549, |
| "learning_rate": 4.931187964122959e-05, |
| "loss": 5.293134307861328, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.6097128677773839, |
| "grad_norm": 0.7511916756629944, |
| "learning_rate": 4.929815487967382e-05, |
| "loss": 5.3018230438232425, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.6132577100319035, |
| "grad_norm": 0.7361993193626404, |
| "learning_rate": 4.9284296542230615e-05, |
| "loss": 5.259002685546875, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.6168025522864232, |
| "grad_norm": 0.6815996170043945, |
| "learning_rate": 4.927030470508434e-05, |
| "loss": 5.303490447998047, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.6203473945409429, |
| "grad_norm": 0.6869247555732727, |
| "learning_rate": 4.925617944515328e-05, |
| "loss": 5.268404388427735, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.6238922367954626, |
| "grad_norm": 0.712818443775177, |
| "learning_rate": 4.9241920840089176e-05, |
| "loss": 5.259001922607422, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.6274370790499822, |
| "grad_norm": 0.7157370448112488, |
| "learning_rate": 4.922752896827682e-05, |
| "loss": 5.2527915954589846, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.6309819213045019, |
| "grad_norm": 0.7796548008918762, |
| "learning_rate": 4.921300390883362e-05, |
| "loss": 5.244187545776367, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.6345267635590216, |
| "grad_norm": 0.6423467397689819, |
| "learning_rate": 4.919834574160916e-05, |
| "loss": 5.230907440185547, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.6380716058135413, |
| "grad_norm": 0.7811551094055176, |
| "learning_rate": 4.9183554547184784e-05, |
| "loss": 5.229999923706055, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.641616448068061, |
| "grad_norm": 0.6660889387130737, |
| "learning_rate": 4.916863040687312e-05, |
| "loss": 5.248212814331055, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.6451612903225806, |
| "grad_norm": 0.6456777453422546, |
| "learning_rate": 4.915357340271765e-05, |
| "loss": 5.257810592651367, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.6487061325771003, |
| "grad_norm": 0.6784165501594543, |
| "learning_rate": 4.9138383617492254e-05, |
| "loss": 5.257859420776367, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.65225097483162, |
| "grad_norm": 0.6905676126480103, |
| "learning_rate": 4.9123061134700774e-05, |
| "loss": 5.253431701660157, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.6557958170861397, |
| "grad_norm": 0.6898303031921387, |
| "learning_rate": 4.9107606038576523e-05, |
| "loss": 5.243818283081055, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.6593406593406593, |
| "grad_norm": 0.6445653438568115, |
| "learning_rate": 4.9092018414081854e-05, |
| "loss": 5.2768810272216795, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.662885501595179, |
| "grad_norm": 0.6441115140914917, |
| "learning_rate": 4.9076298346907654e-05, |
| "loss": 5.255257415771484, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.6664303438496987, |
| "grad_norm": 0.6179636716842651, |
| "learning_rate": 4.906044592347292e-05, |
| "loss": 5.236399078369141, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.6699751861042184, |
| "grad_norm": 0.6955509781837463, |
| "learning_rate": 4.904446123092424e-05, |
| "loss": 5.26529541015625, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.673520028358738, |
| "grad_norm": 0.716314971446991, |
| "learning_rate": 4.9028344357135355e-05, |
| "loss": 5.257140350341797, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.6770648706132577, |
| "grad_norm": 0.785479724407196, |
| "learning_rate": 4.9012095390706636e-05, |
| "loss": 5.23547477722168, |
| "step": 1910 |
| }, |
| { |
| "epoch": 0.6806097128677774, |
| "grad_norm": 0.619127094745636, |
| "learning_rate": 4.899571442096462e-05, |
| "loss": 5.233910751342774, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.6841545551222971, |
| "grad_norm": 0.6372919082641602, |
| "learning_rate": 4.897920153796153e-05, |
| "loss": 5.2188163757324215, |
| "step": 1930 |
| }, |
| { |
| "epoch": 0.6876993973768167, |
| "grad_norm": 0.6643814444541931, |
| "learning_rate": 4.896255683247474e-05, |
| "loss": 5.250975036621094, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.6912442396313364, |
| "grad_norm": 0.551176130771637, |
| "learning_rate": 4.894578039600633e-05, |
| "loss": 5.245917510986328, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.6947890818858561, |
| "grad_norm": 0.7061398029327393, |
| "learning_rate": 4.892887232078251e-05, |
| "loss": 5.223080062866211, |
| "step": 1960 |
| }, |
| { |
| "epoch": 0.6983339241403758, |
| "grad_norm": 0.6970100402832031, |
| "learning_rate": 4.8911832699753205e-05, |
| "loss": 5.245952606201172, |
| "step": 1970 |
| }, |
| { |
| "epoch": 0.7018787663948954, |
| "grad_norm": 0.6700912117958069, |
| "learning_rate": 4.8894661626591475e-05, |
| "loss": 5.273249816894531, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.7054236086494151, |
| "grad_norm": 0.5814111828804016, |
| "learning_rate": 4.8877359195693005e-05, |
| "loss": 5.207174682617188, |
| "step": 1990 |
| }, |
| { |
| "epoch": 0.7089684509039348, |
| "grad_norm": 0.7154501080513, |
| "learning_rate": 4.885992550217563e-05, |
| "loss": 5.2551219940185545, |
| "step": 2000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 14105, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.6845619367143014e+19, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|