{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7089684509039348, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003544842254519674, "grad_norm": 132.5248565673828, "learning_rate": 6.373937677053824e-07, "loss": 14.446830749511719, "step": 10 }, { "epoch": 0.007089684509039348, "grad_norm": 54.77531051635742, "learning_rate": 1.3456090651558075e-06, "loss": 13.234834289550781, "step": 20 }, { "epoch": 0.010634526763559022, "grad_norm": 6.283421993255615, "learning_rate": 2.0538243626062327e-06, "loss": 11.701119995117187, "step": 30 }, { "epoch": 0.014179369018078695, "grad_norm": 9.19540023803711, "learning_rate": 2.762039660056657e-06, "loss": 11.269725799560547, "step": 40 }, { "epoch": 0.01772421127259837, "grad_norm": 3.588550329208374, "learning_rate": 3.4702549575070827e-06, "loss": 11.082182312011719, "step": 50 }, { "epoch": 0.021269053527118043, "grad_norm": 4.2412614822387695, "learning_rate": 4.178470254957508e-06, "loss": 10.633071899414062, "step": 60 }, { "epoch": 0.02481389578163772, "grad_norm": 2.81349515914917, "learning_rate": 4.886685552407932e-06, "loss": 10.016432189941407, "step": 70 }, { "epoch": 0.02835873803615739, "grad_norm": 2.157428026199341, "learning_rate": 5.594900849858357e-06, "loss": 9.317854309082032, "step": 80 }, { "epoch": 0.03190358029067707, "grad_norm": 1.53337824344635, "learning_rate": 6.3031161473087825e-06, "loss": 8.773271179199218, "step": 90 }, { "epoch": 0.03544842254519674, "grad_norm": 1.2842110395431519, "learning_rate": 7.011331444759208e-06, "loss": 8.46307373046875, "step": 100 }, { "epoch": 0.03899326479971641, "grad_norm": 0.8940983414649963, "learning_rate": 7.719546742209632e-06, "loss": 8.362952423095702, "step": 110 }, { "epoch": 0.042538107054236086, "grad_norm": 1.2323634624481201, "learning_rate": 8.427762039660058e-06, "loss": 8.313992309570313, "step": 120 }, { "epoch": 0.04608294930875576, "grad_norm": 0.8698780536651611, "learning_rate": 9.135977337110482e-06, "loss": 8.313835906982423, "step": 130 }, { "epoch": 0.04962779156327544, "grad_norm": 0.6485510468482971, "learning_rate": 9.844192634560907e-06, "loss": 8.277175140380859, "step": 140 }, { "epoch": 0.053172633817795106, "grad_norm": 1.8881973028182983, "learning_rate": 1.0552407932011333e-05, "loss": 8.263162231445312, "step": 150 }, { "epoch": 0.05671747607231478, "grad_norm": 1.2178221940994263, "learning_rate": 1.1260623229461757e-05, "loss": 8.26162109375, "step": 160 }, { "epoch": 0.06026231832683446, "grad_norm": 2.23687481880188, "learning_rate": 1.1968838526912181e-05, "loss": 8.220828247070312, "step": 170 }, { "epoch": 0.06380716058135413, "grad_norm": 2.2403409481048584, "learning_rate": 1.2677053824362606e-05, "loss": 8.156272888183594, "step": 180 }, { "epoch": 0.06735200283587381, "grad_norm": 4.979323863983154, "learning_rate": 1.3385269121813032e-05, "loss": 8.121318054199218, "step": 190 }, { "epoch": 0.07089684509039348, "grad_norm": 3.5296337604522705, "learning_rate": 1.4093484419263456e-05, "loss": 8.086737823486327, "step": 200 }, { "epoch": 0.07444168734491315, "grad_norm": 2.324810743331909, "learning_rate": 1.4801699716713882e-05, "loss": 8.067035675048828, "step": 210 }, { "epoch": 0.07798652959943282, "grad_norm": 0.9995874166488647, "learning_rate": 1.5509915014164305e-05, "loss": 8.020206451416016, "step": 220 }, { "epoch": 0.0815313718539525, "grad_norm": 1.183273196220398, "learning_rate": 1.6218130311614733e-05, "loss": 7.9764549255371096, "step": 230 }, { "epoch": 0.08507621410847217, "grad_norm": 0.9968162775039673, "learning_rate": 1.6926345609065157e-05, "loss": 7.941542053222657, "step": 240 }, { "epoch": 0.08862105636299185, "grad_norm": 1.453206181526184, "learning_rate": 1.763456090651558e-05, "loss": 7.900083923339844, "step": 250 }, { "epoch": 0.09216589861751152, "grad_norm": 1.9150718450546265, "learning_rate": 1.8342776203966006e-05, "loss": 7.868098449707031, "step": 260 }, { "epoch": 0.0957107408720312, "grad_norm": 1.4079521894454956, "learning_rate": 1.9050991501416433e-05, "loss": 7.8084770202636715, "step": 270 }, { "epoch": 0.09925558312655088, "grad_norm": 2.327735424041748, "learning_rate": 1.9759206798866854e-05, "loss": 7.770416259765625, "step": 280 }, { "epoch": 0.10280042538107054, "grad_norm": 3.1308300495147705, "learning_rate": 2.0467422096317282e-05, "loss": 7.720582580566406, "step": 290 }, { "epoch": 0.10634526763559021, "grad_norm": 1.5730700492858887, "learning_rate": 2.1175637393767706e-05, "loss": 7.653485107421875, "step": 300 }, { "epoch": 0.10989010989010989, "grad_norm": 1.9993301630020142, "learning_rate": 2.188385269121813e-05, "loss": 7.584654998779297, "step": 310 }, { "epoch": 0.11343495214462956, "grad_norm": 3.556170701980591, "learning_rate": 2.2592067988668555e-05, "loss": 7.472293090820313, "step": 320 }, { "epoch": 0.11697979439914924, "grad_norm": 1.5170807838439941, "learning_rate": 2.3300283286118983e-05, "loss": 7.420698547363282, "step": 330 }, { "epoch": 0.12052463665366892, "grad_norm": 1.8956718444824219, "learning_rate": 2.4008498583569404e-05, "loss": 7.352853393554687, "step": 340 }, { "epoch": 0.12406947890818859, "grad_norm": 3.6424927711486816, "learning_rate": 2.471671388101983e-05, "loss": 7.289952087402344, "step": 350 }, { "epoch": 0.12761432116270827, "grad_norm": 3.950108528137207, "learning_rate": 2.542492917847026e-05, "loss": 7.2670539855957035, "step": 360 }, { "epoch": 0.13115916341722794, "grad_norm": 3.1376140117645264, "learning_rate": 2.613314447592068e-05, "loss": 7.219367980957031, "step": 370 }, { "epoch": 0.13470400567174762, "grad_norm": 2.8113393783569336, "learning_rate": 2.6841359773371104e-05, "loss": 7.1138450622558596, "step": 380 }, { "epoch": 0.1382488479262673, "grad_norm": 1.7290911674499512, "learning_rate": 2.7549575070821532e-05, "loss": 7.069696044921875, "step": 390 }, { "epoch": 0.14179369018078697, "grad_norm": 2.157928705215454, "learning_rate": 2.8257790368271957e-05, "loss": 7.032493591308594, "step": 400 }, { "epoch": 0.14533853243530662, "grad_norm": 2.0297610759735107, "learning_rate": 2.8966005665722377e-05, "loss": 6.974455261230469, "step": 410 }, { "epoch": 0.1488833746898263, "grad_norm": 2.0817925930023193, "learning_rate": 2.9674220963172805e-05, "loss": 6.954932403564453, "step": 420 }, { "epoch": 0.15242821694434597, "grad_norm": 2.302412271499634, "learning_rate": 3.0382436260623233e-05, "loss": 6.9087669372558596, "step": 430 }, { "epoch": 0.15597305919886564, "grad_norm": 2.4860918521881104, "learning_rate": 3.109065155807366e-05, "loss": 6.853008270263672, "step": 440 }, { "epoch": 0.15951790145338532, "grad_norm": 1.9176361560821533, "learning_rate": 3.179886685552408e-05, "loss": 6.821636199951172, "step": 450 }, { "epoch": 0.163062743707905, "grad_norm": 2.412122964859009, "learning_rate": 3.2507082152974506e-05, "loss": 6.7895256042480465, "step": 460 }, { "epoch": 0.16660758596242467, "grad_norm": 2.1598598957061768, "learning_rate": 3.3215297450424934e-05, "loss": 6.758164978027343, "step": 470 }, { "epoch": 0.17015242821694435, "grad_norm": 1.4905693531036377, "learning_rate": 3.3923512747875355e-05, "loss": 6.707500457763672, "step": 480 }, { "epoch": 0.17369727047146402, "grad_norm": 2.570793628692627, "learning_rate": 3.4631728045325776e-05, "loss": 6.675039672851563, "step": 490 }, { "epoch": 0.1772421127259837, "grad_norm": 1.7906855344772339, "learning_rate": 3.53399433427762e-05, "loss": 6.652853393554688, "step": 500 }, { "epoch": 0.18078695498050337, "grad_norm": 2.3503963947296143, "learning_rate": 3.604815864022663e-05, "loss": 6.630236053466797, "step": 510 }, { "epoch": 0.18433179723502305, "grad_norm": 2.17346453666687, "learning_rate": 3.675637393767706e-05, "loss": 6.607561492919922, "step": 520 }, { "epoch": 0.18787663948954272, "grad_norm": 2.7891957759857178, "learning_rate": 3.746458923512748e-05, "loss": 6.555358123779297, "step": 530 }, { "epoch": 0.1914214817440624, "grad_norm": 1.5042469501495361, "learning_rate": 3.817280453257791e-05, "loss": 6.542356872558594, "step": 540 }, { "epoch": 0.19496632399858208, "grad_norm": 2.329241991043091, "learning_rate": 3.888101983002833e-05, "loss": 6.541840362548828, "step": 550 }, { "epoch": 0.19851116625310175, "grad_norm": 2.9417874813079834, "learning_rate": 3.9589235127478756e-05, "loss": 6.506352233886719, "step": 560 }, { "epoch": 0.2020560085076214, "grad_norm": 1.765146255493164, "learning_rate": 4.029745042492918e-05, "loss": 6.494882202148437, "step": 570 }, { "epoch": 0.20560085076214107, "grad_norm": 1.5695878267288208, "learning_rate": 4.1005665722379605e-05, "loss": 6.457882690429687, "step": 580 }, { "epoch": 0.20914569301666075, "grad_norm": 2.1169416904449463, "learning_rate": 4.171388101983003e-05, "loss": 6.4466796875, "step": 590 }, { "epoch": 0.21269053527118043, "grad_norm": 2.836350679397583, "learning_rate": 4.242209631728046e-05, "loss": 6.394796371459961, "step": 600 }, { "epoch": 0.2162353775257001, "grad_norm": 2.057159423828125, "learning_rate": 4.313031161473088e-05, "loss": 6.309791564941406, "step": 610 }, { "epoch": 0.21978021978021978, "grad_norm": 1.9764236211776733, "learning_rate": 4.38385269121813e-05, "loss": 6.143642425537109, "step": 620 }, { "epoch": 0.22332506203473945, "grad_norm": 1.5374716520309448, "learning_rate": 4.454674220963173e-05, "loss": 6.059618377685547, "step": 630 }, { "epoch": 0.22686990428925913, "grad_norm": 1.504021406173706, "learning_rate": 4.525495750708216e-05, "loss": 5.966709899902344, "step": 640 }, { "epoch": 0.2304147465437788, "grad_norm": 1.8329981565475464, "learning_rate": 4.596317280453258e-05, "loss": 5.919542694091797, "step": 650 }, { "epoch": 0.23395958879829848, "grad_norm": 1.955461859703064, "learning_rate": 4.6671388101983006e-05, "loss": 5.863460540771484, "step": 660 }, { "epoch": 0.23750443105281815, "grad_norm": 1.6255284547805786, "learning_rate": 4.7379603399433434e-05, "loss": 5.847004318237305, "step": 670 }, { "epoch": 0.24104927330733783, "grad_norm": 1.6530513763427734, "learning_rate": 4.8087818696883855e-05, "loss": 5.811199569702149, "step": 680 }, { "epoch": 0.2445941155618575, "grad_norm": 1.6961824893951416, "learning_rate": 4.8796033994334276e-05, "loss": 5.7780517578125, "step": 690 }, { "epoch": 0.24813895781637718, "grad_norm": 1.508845567703247, "learning_rate": 4.9504249291784704e-05, "loss": 5.740897750854492, "step": 700 }, { "epoch": 0.25168380007089686, "grad_norm": 1.4073859453201294, "learning_rate": 4.999999381545897e-05, "loss": 5.743928146362305, "step": 710 }, { "epoch": 0.25522864232541653, "grad_norm": 1.2714122533798218, "learning_rate": 4.999988386814785e-05, "loss": 5.7146648406982425, "step": 720 }, { "epoch": 0.2587734845799362, "grad_norm": 1.105147361755371, "learning_rate": 4.999963648728715e-05, "loss": 5.703514862060547, "step": 730 }, { "epoch": 0.2623183268344559, "grad_norm": 1.281786561012268, "learning_rate": 4.99992516742368e-05, "loss": 5.659426879882813, "step": 740 }, { "epoch": 0.26586316908897556, "grad_norm": 1.4539729356765747, "learning_rate": 4.999872943111228e-05, "loss": 5.675305938720703, "step": 750 }, { "epoch": 0.26940801134349524, "grad_norm": 1.2878540754318237, "learning_rate": 4.9998069760784536e-05, "loss": 5.644029998779297, "step": 760 }, { "epoch": 0.2729528535980149, "grad_norm": 1.2199702262878418, "learning_rate": 4.9997272666880024e-05, "loss": 5.6056877136230465, "step": 770 }, { "epoch": 0.2764976958525346, "grad_norm": 1.2377008199691772, "learning_rate": 4.999633815378066e-05, "loss": 5.609469604492188, "step": 780 }, { "epoch": 0.28004253810705426, "grad_norm": 1.278878927230835, "learning_rate": 4.9995266226623807e-05, "loss": 5.626304626464844, "step": 790 }, { "epoch": 0.28358738036157394, "grad_norm": 1.3319532871246338, "learning_rate": 4.999405689130224e-05, "loss": 5.580442428588867, "step": 800 }, { "epoch": 0.28713222261609356, "grad_norm": 1.683655858039856, "learning_rate": 4.9992710154464116e-05, "loss": 5.579409790039063, "step": 810 }, { "epoch": 0.29067706487061323, "grad_norm": 1.1337031126022339, "learning_rate": 4.999122602351296e-05, "loss": 5.577402877807617, "step": 820 }, { "epoch": 0.2942219071251329, "grad_norm": 1.3304811716079712, "learning_rate": 4.9989604506607564e-05, "loss": 5.547556686401367, "step": 830 }, { "epoch": 0.2977667493796526, "grad_norm": 1.2711671590805054, "learning_rate": 4.998784561266201e-05, "loss": 5.548542404174805, "step": 840 }, { "epoch": 0.30131159163417226, "grad_norm": 1.3355711698532104, "learning_rate": 4.998594935134559e-05, "loss": 5.53816032409668, "step": 850 }, { "epoch": 0.30485643388869194, "grad_norm": 1.1520898342132568, "learning_rate": 4.998391573308275e-05, "loss": 5.541797637939453, "step": 860 }, { "epoch": 0.3084012761432116, "grad_norm": 1.3476178646087646, "learning_rate": 4.998174476905303e-05, "loss": 5.528475570678711, "step": 870 }, { "epoch": 0.3119461183977313, "grad_norm": 0.979820191860199, "learning_rate": 4.9979436471191015e-05, "loss": 5.501230621337891, "step": 880 }, { "epoch": 0.31549096065225096, "grad_norm": 0.9973044395446777, "learning_rate": 4.997699085218628e-05, "loss": 5.48109130859375, "step": 890 }, { "epoch": 0.31903580290677064, "grad_norm": 1.2147647142410278, "learning_rate": 4.9974407925483275e-05, "loss": 5.5053356170654295, "step": 900 }, { "epoch": 0.3225806451612903, "grad_norm": 1.174742579460144, "learning_rate": 4.9971687705281305e-05, "loss": 5.525197982788086, "step": 910 }, { "epoch": 0.32612548741581, "grad_norm": 1.0364092588424683, "learning_rate": 4.9968830206534426e-05, "loss": 5.507562255859375, "step": 920 }, { "epoch": 0.32967032967032966, "grad_norm": 1.1423817873001099, "learning_rate": 4.9965835444951345e-05, "loss": 5.50116081237793, "step": 930 }, { "epoch": 0.33321517192484934, "grad_norm": 1.063589096069336, "learning_rate": 4.996270343699539e-05, "loss": 5.483753967285156, "step": 940 }, { "epoch": 0.336760014179369, "grad_norm": 1.156726360321045, "learning_rate": 4.995943419988433e-05, "loss": 5.5090789794921875, "step": 950 }, { "epoch": 0.3403048564338887, "grad_norm": 1.3510740995407104, "learning_rate": 4.995602775159038e-05, "loss": 5.480130767822265, "step": 960 }, { "epoch": 0.34384969868840837, "grad_norm": 1.1639209985733032, "learning_rate": 4.995248411084004e-05, "loss": 5.487713623046875, "step": 970 }, { "epoch": 0.34739454094292804, "grad_norm": 1.0440890789031982, "learning_rate": 4.9948803297114e-05, "loss": 5.465737152099609, "step": 980 }, { "epoch": 0.3509393831974477, "grad_norm": 0.9426578283309937, "learning_rate": 4.9944985330647045e-05, "loss": 5.451010894775391, "step": 990 }, { "epoch": 0.3544842254519674, "grad_norm": 0.8856348991394043, "learning_rate": 4.9941030232427945e-05, "loss": 5.4333232879638675, "step": 1000 }, { "epoch": 0.35802906770648707, "grad_norm": 1.196390986442566, "learning_rate": 4.993693802419933e-05, "loss": 5.440399169921875, "step": 1010 }, { "epoch": 0.36157390996100675, "grad_norm": 0.9776602387428284, "learning_rate": 4.993270872845756e-05, "loss": 5.427825164794922, "step": 1020 }, { "epoch": 0.3651187522155264, "grad_norm": 0.776901125907898, "learning_rate": 4.992834236845264e-05, "loss": 5.455727386474609, "step": 1030 }, { "epoch": 0.3686635944700461, "grad_norm": 0.8997649550437927, "learning_rate": 4.992383896818805e-05, "loss": 5.438071823120117, "step": 1040 }, { "epoch": 0.37220843672456577, "grad_norm": 0.8250705003738403, "learning_rate": 4.991919855242065e-05, "loss": 5.4653472900390625, "step": 1050 }, { "epoch": 0.37575327897908545, "grad_norm": 0.8961722254753113, "learning_rate": 4.991442114666049e-05, "loss": 5.4244224548339846, "step": 1060 }, { "epoch": 0.3792981212336051, "grad_norm": 0.9243871569633484, "learning_rate": 4.990950677717073e-05, "loss": 5.4270378112792965, "step": 1070 }, { "epoch": 0.3828429634881248, "grad_norm": 0.9992454051971436, "learning_rate": 4.990445547096748e-05, "loss": 5.421928024291992, "step": 1080 }, { "epoch": 0.3863878057426445, "grad_norm": 0.8732923269271851, "learning_rate": 4.989926725581962e-05, "loss": 5.399656295776367, "step": 1090 }, { "epoch": 0.38993264799716415, "grad_norm": 1.0609304904937744, "learning_rate": 4.989394216024866e-05, "loss": 5.381362533569336, "step": 1100 }, { "epoch": 0.3934774902516838, "grad_norm": 0.8973801732063293, "learning_rate": 4.9888480213528624e-05, "loss": 5.3848522186279295, "step": 1110 }, { "epoch": 0.3970223325062035, "grad_norm": 0.8304575085639954, "learning_rate": 4.988288144568583e-05, "loss": 5.370730590820313, "step": 1120 }, { "epoch": 0.4005671747607231, "grad_norm": 1.0252659320831299, "learning_rate": 4.9877145887498774e-05, "loss": 5.375761032104492, "step": 1130 }, { "epoch": 0.4041120170152428, "grad_norm": 0.9789050221443176, "learning_rate": 4.9871273570497924e-05, "loss": 5.376107788085937, "step": 1140 }, { "epoch": 0.4076568592697625, "grad_norm": 0.7593803405761719, "learning_rate": 4.986526452696556e-05, "loss": 5.3734375, "step": 1150 }, { "epoch": 0.41120170152428215, "grad_norm": 0.7464333772659302, "learning_rate": 4.98591187899356e-05, "loss": 5.366069793701172, "step": 1160 }, { "epoch": 0.4147465437788018, "grad_norm": 0.7203890085220337, "learning_rate": 4.9852836393193436e-05, "loss": 5.3826652526855465, "step": 1170 }, { "epoch": 0.4182913860333215, "grad_norm": 0.9423730373382568, "learning_rate": 4.984641737127569e-05, "loss": 5.383267974853515, "step": 1180 }, { "epoch": 0.4218362282878412, "grad_norm": 0.8559587597846985, "learning_rate": 4.98398617594701e-05, "loss": 5.379903411865234, "step": 1190 }, { "epoch": 0.42538107054236085, "grad_norm": 0.7626580595970154, "learning_rate": 4.9833169593815264e-05, "loss": 5.348976135253906, "step": 1200 }, { "epoch": 0.4289259127968805, "grad_norm": 0.9729316830635071, "learning_rate": 4.9826340911100484e-05, "loss": 5.38103141784668, "step": 1210 }, { "epoch": 0.4324707550514002, "grad_norm": 0.9993200898170471, "learning_rate": 4.981937574886553e-05, "loss": 5.35406265258789, "step": 1220 }, { "epoch": 0.4360155973059199, "grad_norm": 0.8553763031959534, "learning_rate": 4.9812274145400476e-05, "loss": 5.344794845581054, "step": 1230 }, { "epoch": 0.43956043956043955, "grad_norm": 0.8764554262161255, "learning_rate": 4.980503613974542e-05, "loss": 5.346466064453125, "step": 1240 }, { "epoch": 0.44310528181495923, "grad_norm": 0.931968629360199, "learning_rate": 4.9797661771690355e-05, "loss": 5.383474349975586, "step": 1250 }, { "epoch": 0.4466501240694789, "grad_norm": 0.9033966660499573, "learning_rate": 4.9790151081774894e-05, "loss": 5.3460533142089846, "step": 1260 }, { "epoch": 0.4501949663239986, "grad_norm": 0.7806386947631836, "learning_rate": 4.978250411128805e-05, "loss": 5.311779022216797, "step": 1270 }, { "epoch": 0.45373980857851826, "grad_norm": 0.9154277443885803, "learning_rate": 4.9774720902268045e-05, "loss": 5.3727764129638675, "step": 1280 }, { "epoch": 0.45728465083303793, "grad_norm": 0.9599602222442627, "learning_rate": 4.9766801497502025e-05, "loss": 5.323087692260742, "step": 1290 }, { "epoch": 0.4608294930875576, "grad_norm": 0.7230603098869324, "learning_rate": 4.9758745940525874e-05, "loss": 5.366838073730468, "step": 1300 }, { "epoch": 0.4643743353420773, "grad_norm": 1.0107765197753906, "learning_rate": 4.975055427562396e-05, "loss": 5.308674240112305, "step": 1310 }, { "epoch": 0.46791917759659696, "grad_norm": 0.8170345425605774, "learning_rate": 4.974222654782885e-05, "loss": 5.34793701171875, "step": 1320 }, { "epoch": 0.47146401985111663, "grad_norm": 0.9455390572547913, "learning_rate": 4.973376280292115e-05, "loss": 5.278959274291992, "step": 1330 }, { "epoch": 0.4750088621056363, "grad_norm": 0.7414869666099548, "learning_rate": 4.9725163087429164e-05, "loss": 5.352619934082031, "step": 1340 }, { "epoch": 0.478553704360156, "grad_norm": 0.6697789430618286, "learning_rate": 4.971642744862869e-05, "loss": 5.344553375244141, "step": 1350 }, { "epoch": 0.48209854661467566, "grad_norm": 0.7350028157234192, "learning_rate": 4.9707555934542735e-05, "loss": 5.331580352783203, "step": 1360 }, { "epoch": 0.48564338886919534, "grad_norm": 0.7793420553207397, "learning_rate": 4.9698548593941295e-05, "loss": 5.325060272216797, "step": 1370 }, { "epoch": 0.489188231123715, "grad_norm": 0.9029025435447693, "learning_rate": 4.968940547634102e-05, "loss": 5.306049728393555, "step": 1380 }, { "epoch": 0.4927330733782347, "grad_norm": 0.8494910597801208, "learning_rate": 4.9680126632004984e-05, "loss": 5.327105331420898, "step": 1390 }, { "epoch": 0.49627791563275436, "grad_norm": 0.7679411172866821, "learning_rate": 4.967071211194241e-05, "loss": 5.330787277221679, "step": 1400 }, { "epoch": 0.49982275788727404, "grad_norm": 0.7874600291252136, "learning_rate": 4.966116196790836e-05, "loss": 5.342826461791992, "step": 1410 }, { "epoch": 0.5033676001417937, "grad_norm": 0.9020946025848389, "learning_rate": 4.965147625240351e-05, "loss": 5.3182518005371096, "step": 1420 }, { "epoch": 0.5069124423963134, "grad_norm": 0.7656190991401672, "learning_rate": 4.964165501867378e-05, "loss": 5.303837585449219, "step": 1430 }, { "epoch": 0.5104572846508331, "grad_norm": 0.7332035303115845, "learning_rate": 4.9631698320710115e-05, "loss": 5.324761962890625, "step": 1440 }, { "epoch": 0.5140021269053527, "grad_norm": 0.6765123009681702, "learning_rate": 4.962160621324813e-05, "loss": 5.318471527099609, "step": 1450 }, { "epoch": 0.5175469691598724, "grad_norm": 0.7399813532829285, "learning_rate": 4.9611378751767854e-05, "loss": 5.327352905273438, "step": 1460 }, { "epoch": 0.5210918114143921, "grad_norm": 0.9081066846847534, "learning_rate": 4.96010159924934e-05, "loss": 5.302130889892578, "step": 1470 }, { "epoch": 0.5246366536689118, "grad_norm": 0.8822699189186096, "learning_rate": 4.959051799239267e-05, "loss": 5.313935470581055, "step": 1480 }, { "epoch": 0.5281814959234314, "grad_norm": 0.7201648354530334, "learning_rate": 4.9579884809177024e-05, "loss": 5.290200805664062, "step": 1490 }, { "epoch": 0.5317263381779511, "grad_norm": 0.6942399740219116, "learning_rate": 4.956911650130098e-05, "loss": 5.262855148315429, "step": 1500 }, { "epoch": 0.5352711804324708, "grad_norm": 0.7676780819892883, "learning_rate": 4.955821312796188e-05, "loss": 5.308451461791992, "step": 1510 }, { "epoch": 0.5388160226869905, "grad_norm": 0.8454974889755249, "learning_rate": 4.954717474909958e-05, "loss": 5.288463592529297, "step": 1520 }, { "epoch": 0.5423608649415101, "grad_norm": 0.622800350189209, "learning_rate": 4.953600142539609e-05, "loss": 5.301121520996094, "step": 1530 }, { "epoch": 0.5459057071960298, "grad_norm": 0.7299189567565918, "learning_rate": 4.9524693218275306e-05, "loss": 5.29693603515625, "step": 1540 }, { "epoch": 0.5494505494505495, "grad_norm": 0.7149349451065063, "learning_rate": 4.951325018990258e-05, "loss": 5.285152435302734, "step": 1550 }, { "epoch": 0.5529953917050692, "grad_norm": 0.7443231344223022, "learning_rate": 4.950167240318444e-05, "loss": 5.3107250213623045, "step": 1560 }, { "epoch": 0.5565402339595888, "grad_norm": 0.7101079821586609, "learning_rate": 4.948995992176824e-05, "loss": 5.288132858276367, "step": 1570 }, { "epoch": 0.5600850762141085, "grad_norm": 0.6409624218940735, "learning_rate": 4.94781128100418e-05, "loss": 5.258818817138672, "step": 1580 }, { "epoch": 0.5636299184686282, "grad_norm": 0.7422112226486206, "learning_rate": 4.946613113313304e-05, "loss": 5.310946273803711, "step": 1590 }, { "epoch": 0.5671747607231479, "grad_norm": 0.7419252991676331, "learning_rate": 4.9454014956909644e-05, "loss": 5.281931686401367, "step": 1600 }, { "epoch": 0.5707196029776674, "grad_norm": 0.6587216258049011, "learning_rate": 4.944176434797869e-05, "loss": 5.29292106628418, "step": 1610 }, { "epoch": 0.5742644452321871, "grad_norm": 0.7236230373382568, "learning_rate": 4.942937937368628e-05, "loss": 5.291954803466797, "step": 1620 }, { "epoch": 0.5778092874867068, "grad_norm": 0.8149150013923645, "learning_rate": 4.941686010211715e-05, "loss": 5.252371978759766, "step": 1630 }, { "epoch": 0.5813541297412265, "grad_norm": 0.6907062530517578, "learning_rate": 4.940420660209436e-05, "loss": 5.272654342651367, "step": 1640 }, { "epoch": 0.5848989719957461, "grad_norm": 0.7174138426780701, "learning_rate": 4.9391418943178836e-05, "loss": 5.294339752197265, "step": 1650 }, { "epoch": 0.5884438142502658, "grad_norm": 0.798039972782135, "learning_rate": 4.9378497195669036e-05, "loss": 5.257662963867188, "step": 1660 }, { "epoch": 0.5919886565047855, "grad_norm": 0.724810004234314, "learning_rate": 4.936544143060058e-05, "loss": 5.284110641479492, "step": 1670 }, { "epoch": 0.5955334987593052, "grad_norm": 0.7931723594665527, "learning_rate": 4.9352251719745774e-05, "loss": 5.281097412109375, "step": 1680 }, { "epoch": 0.5990783410138248, "grad_norm": 0.6884552836418152, "learning_rate": 4.933892813561333e-05, "loss": 5.27345085144043, "step": 1690 }, { "epoch": 0.6026231832683445, "grad_norm": 0.6960254907608032, "learning_rate": 4.9325470751447866e-05, "loss": 5.2600860595703125, "step": 1700 }, { "epoch": 0.6061680255228642, "grad_norm": 0.7212153077125549, "learning_rate": 4.931187964122959e-05, "loss": 5.293134307861328, "step": 1710 }, { "epoch": 0.6097128677773839, "grad_norm": 0.7511916756629944, "learning_rate": 4.929815487967382e-05, "loss": 5.3018230438232425, "step": 1720 }, { "epoch": 0.6132577100319035, "grad_norm": 0.7361993193626404, "learning_rate": 4.9284296542230615e-05, "loss": 5.259002685546875, "step": 1730 }, { "epoch": 0.6168025522864232, "grad_norm": 0.6815996170043945, "learning_rate": 4.927030470508434e-05, "loss": 5.303490447998047, "step": 1740 }, { "epoch": 0.6203473945409429, "grad_norm": 0.6869247555732727, "learning_rate": 4.925617944515328e-05, "loss": 5.268404388427735, "step": 1750 }, { "epoch": 0.6238922367954626, "grad_norm": 0.712818443775177, "learning_rate": 4.9241920840089176e-05, "loss": 5.259001922607422, "step": 1760 }, { "epoch": 0.6274370790499822, "grad_norm": 0.7157370448112488, "learning_rate": 4.922752896827682e-05, "loss": 5.2527915954589846, "step": 1770 }, { "epoch": 0.6309819213045019, "grad_norm": 0.7796548008918762, "learning_rate": 4.921300390883362e-05, "loss": 5.244187545776367, "step": 1780 }, { "epoch": 0.6345267635590216, "grad_norm": 0.6423467397689819, "learning_rate": 4.919834574160916e-05, "loss": 5.230907440185547, "step": 1790 }, { "epoch": 0.6380716058135413, "grad_norm": 0.7811551094055176, "learning_rate": 4.9183554547184784e-05, "loss": 5.229999923706055, "step": 1800 }, { "epoch": 0.641616448068061, "grad_norm": 0.6660889387130737, "learning_rate": 4.916863040687312e-05, "loss": 5.248212814331055, "step": 1810 }, { "epoch": 0.6451612903225806, "grad_norm": 0.6456777453422546, "learning_rate": 4.915357340271765e-05, "loss": 5.257810592651367, "step": 1820 }, { "epoch": 0.6487061325771003, "grad_norm": 0.6784165501594543, "learning_rate": 4.9138383617492254e-05, "loss": 5.257859420776367, "step": 1830 }, { "epoch": 0.65225097483162, "grad_norm": 0.6905676126480103, "learning_rate": 4.9123061134700774e-05, "loss": 5.253431701660157, "step": 1840 }, { "epoch": 0.6557958170861397, "grad_norm": 0.6898303031921387, "learning_rate": 4.9107606038576523e-05, "loss": 5.243818283081055, "step": 1850 }, { "epoch": 0.6593406593406593, "grad_norm": 0.6445653438568115, "learning_rate": 4.9092018414081854e-05, "loss": 5.2768810272216795, "step": 1860 }, { "epoch": 0.662885501595179, "grad_norm": 0.6441115140914917, "learning_rate": 4.9076298346907654e-05, "loss": 5.255257415771484, "step": 1870 }, { "epoch": 0.6664303438496987, "grad_norm": 0.6179636716842651, "learning_rate": 4.906044592347292e-05, "loss": 5.236399078369141, "step": 1880 }, { "epoch": 0.6699751861042184, "grad_norm": 0.6955509781837463, "learning_rate": 4.904446123092424e-05, "loss": 5.26529541015625, "step": 1890 }, { "epoch": 0.673520028358738, "grad_norm": 0.716314971446991, "learning_rate": 4.9028344357135355e-05, "loss": 5.257140350341797, "step": 1900 }, { "epoch": 0.6770648706132577, "grad_norm": 0.785479724407196, "learning_rate": 4.9012095390706636e-05, "loss": 5.23547477722168, "step": 1910 }, { "epoch": 0.6806097128677774, "grad_norm": 0.619127094745636, "learning_rate": 4.899571442096462e-05, "loss": 5.233910751342774, "step": 1920 }, { "epoch": 0.6841545551222971, "grad_norm": 0.6372919082641602, "learning_rate": 4.897920153796153e-05, "loss": 5.2188163757324215, "step": 1930 }, { "epoch": 0.6876993973768167, "grad_norm": 0.6643814444541931, "learning_rate": 4.896255683247474e-05, "loss": 5.250975036621094, "step": 1940 }, { "epoch": 0.6912442396313364, "grad_norm": 0.551176130771637, "learning_rate": 4.894578039600633e-05, "loss": 5.245917510986328, "step": 1950 }, { "epoch": 0.6947890818858561, "grad_norm": 0.7061398029327393, "learning_rate": 4.892887232078251e-05, "loss": 5.223080062866211, "step": 1960 }, { "epoch": 0.6983339241403758, "grad_norm": 0.6970100402832031, "learning_rate": 4.8911832699753205e-05, "loss": 5.245952606201172, "step": 1970 }, { "epoch": 0.7018787663948954, "grad_norm": 0.6700912117958069, "learning_rate": 4.8894661626591475e-05, "loss": 5.273249816894531, "step": 1980 }, { "epoch": 0.7054236086494151, "grad_norm": 0.5814111828804016, "learning_rate": 4.8877359195693005e-05, "loss": 5.207174682617188, "step": 1990 }, { "epoch": 0.7089684509039348, "grad_norm": 0.7154501080513, "learning_rate": 4.885992550217563e-05, "loss": 5.2551219940185545, "step": 2000 } ], "logging_steps": 10, "max_steps": 14105, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.6845619367143014e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }