qwen35 / trainer_state.json
Endy2001's picture
Add files using upload-large-folder tool
8889525 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.7089684509039348,
"eval_steps": 500,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003544842254519674,
"grad_norm": 132.5248565673828,
"learning_rate": 6.373937677053824e-07,
"loss": 14.446830749511719,
"step": 10
},
{
"epoch": 0.007089684509039348,
"grad_norm": 54.77531051635742,
"learning_rate": 1.3456090651558075e-06,
"loss": 13.234834289550781,
"step": 20
},
{
"epoch": 0.010634526763559022,
"grad_norm": 6.283421993255615,
"learning_rate": 2.0538243626062327e-06,
"loss": 11.701119995117187,
"step": 30
},
{
"epoch": 0.014179369018078695,
"grad_norm": 9.19540023803711,
"learning_rate": 2.762039660056657e-06,
"loss": 11.269725799560547,
"step": 40
},
{
"epoch": 0.01772421127259837,
"grad_norm": 3.588550329208374,
"learning_rate": 3.4702549575070827e-06,
"loss": 11.082182312011719,
"step": 50
},
{
"epoch": 0.021269053527118043,
"grad_norm": 4.2412614822387695,
"learning_rate": 4.178470254957508e-06,
"loss": 10.633071899414062,
"step": 60
},
{
"epoch": 0.02481389578163772,
"grad_norm": 2.81349515914917,
"learning_rate": 4.886685552407932e-06,
"loss": 10.016432189941407,
"step": 70
},
{
"epoch": 0.02835873803615739,
"grad_norm": 2.157428026199341,
"learning_rate": 5.594900849858357e-06,
"loss": 9.317854309082032,
"step": 80
},
{
"epoch": 0.03190358029067707,
"grad_norm": 1.53337824344635,
"learning_rate": 6.3031161473087825e-06,
"loss": 8.773271179199218,
"step": 90
},
{
"epoch": 0.03544842254519674,
"grad_norm": 1.2842110395431519,
"learning_rate": 7.011331444759208e-06,
"loss": 8.46307373046875,
"step": 100
},
{
"epoch": 0.03899326479971641,
"grad_norm": 0.8940983414649963,
"learning_rate": 7.719546742209632e-06,
"loss": 8.362952423095702,
"step": 110
},
{
"epoch": 0.042538107054236086,
"grad_norm": 1.2323634624481201,
"learning_rate": 8.427762039660058e-06,
"loss": 8.313992309570313,
"step": 120
},
{
"epoch": 0.04608294930875576,
"grad_norm": 0.8698780536651611,
"learning_rate": 9.135977337110482e-06,
"loss": 8.313835906982423,
"step": 130
},
{
"epoch": 0.04962779156327544,
"grad_norm": 0.6485510468482971,
"learning_rate": 9.844192634560907e-06,
"loss": 8.277175140380859,
"step": 140
},
{
"epoch": 0.053172633817795106,
"grad_norm": 1.8881973028182983,
"learning_rate": 1.0552407932011333e-05,
"loss": 8.263162231445312,
"step": 150
},
{
"epoch": 0.05671747607231478,
"grad_norm": 1.2178221940994263,
"learning_rate": 1.1260623229461757e-05,
"loss": 8.26162109375,
"step": 160
},
{
"epoch": 0.06026231832683446,
"grad_norm": 2.23687481880188,
"learning_rate": 1.1968838526912181e-05,
"loss": 8.220828247070312,
"step": 170
},
{
"epoch": 0.06380716058135413,
"grad_norm": 2.2403409481048584,
"learning_rate": 1.2677053824362606e-05,
"loss": 8.156272888183594,
"step": 180
},
{
"epoch": 0.06735200283587381,
"grad_norm": 4.979323863983154,
"learning_rate": 1.3385269121813032e-05,
"loss": 8.121318054199218,
"step": 190
},
{
"epoch": 0.07089684509039348,
"grad_norm": 3.5296337604522705,
"learning_rate": 1.4093484419263456e-05,
"loss": 8.086737823486327,
"step": 200
},
{
"epoch": 0.07444168734491315,
"grad_norm": 2.324810743331909,
"learning_rate": 1.4801699716713882e-05,
"loss": 8.067035675048828,
"step": 210
},
{
"epoch": 0.07798652959943282,
"grad_norm": 0.9995874166488647,
"learning_rate": 1.5509915014164305e-05,
"loss": 8.020206451416016,
"step": 220
},
{
"epoch": 0.0815313718539525,
"grad_norm": 1.183273196220398,
"learning_rate": 1.6218130311614733e-05,
"loss": 7.9764549255371096,
"step": 230
},
{
"epoch": 0.08507621410847217,
"grad_norm": 0.9968162775039673,
"learning_rate": 1.6926345609065157e-05,
"loss": 7.941542053222657,
"step": 240
},
{
"epoch": 0.08862105636299185,
"grad_norm": 1.453206181526184,
"learning_rate": 1.763456090651558e-05,
"loss": 7.900083923339844,
"step": 250
},
{
"epoch": 0.09216589861751152,
"grad_norm": 1.9150718450546265,
"learning_rate": 1.8342776203966006e-05,
"loss": 7.868098449707031,
"step": 260
},
{
"epoch": 0.0957107408720312,
"grad_norm": 1.4079521894454956,
"learning_rate": 1.9050991501416433e-05,
"loss": 7.8084770202636715,
"step": 270
},
{
"epoch": 0.09925558312655088,
"grad_norm": 2.327735424041748,
"learning_rate": 1.9759206798866854e-05,
"loss": 7.770416259765625,
"step": 280
},
{
"epoch": 0.10280042538107054,
"grad_norm": 3.1308300495147705,
"learning_rate": 2.0467422096317282e-05,
"loss": 7.720582580566406,
"step": 290
},
{
"epoch": 0.10634526763559021,
"grad_norm": 1.5730700492858887,
"learning_rate": 2.1175637393767706e-05,
"loss": 7.653485107421875,
"step": 300
},
{
"epoch": 0.10989010989010989,
"grad_norm": 1.9993301630020142,
"learning_rate": 2.188385269121813e-05,
"loss": 7.584654998779297,
"step": 310
},
{
"epoch": 0.11343495214462956,
"grad_norm": 3.556170701980591,
"learning_rate": 2.2592067988668555e-05,
"loss": 7.472293090820313,
"step": 320
},
{
"epoch": 0.11697979439914924,
"grad_norm": 1.5170807838439941,
"learning_rate": 2.3300283286118983e-05,
"loss": 7.420698547363282,
"step": 330
},
{
"epoch": 0.12052463665366892,
"grad_norm": 1.8956718444824219,
"learning_rate": 2.4008498583569404e-05,
"loss": 7.352853393554687,
"step": 340
},
{
"epoch": 0.12406947890818859,
"grad_norm": 3.6424927711486816,
"learning_rate": 2.471671388101983e-05,
"loss": 7.289952087402344,
"step": 350
},
{
"epoch": 0.12761432116270827,
"grad_norm": 3.950108528137207,
"learning_rate": 2.542492917847026e-05,
"loss": 7.2670539855957035,
"step": 360
},
{
"epoch": 0.13115916341722794,
"grad_norm": 3.1376140117645264,
"learning_rate": 2.613314447592068e-05,
"loss": 7.219367980957031,
"step": 370
},
{
"epoch": 0.13470400567174762,
"grad_norm": 2.8113393783569336,
"learning_rate": 2.6841359773371104e-05,
"loss": 7.1138450622558596,
"step": 380
},
{
"epoch": 0.1382488479262673,
"grad_norm": 1.7290911674499512,
"learning_rate": 2.7549575070821532e-05,
"loss": 7.069696044921875,
"step": 390
},
{
"epoch": 0.14179369018078697,
"grad_norm": 2.157928705215454,
"learning_rate": 2.8257790368271957e-05,
"loss": 7.032493591308594,
"step": 400
},
{
"epoch": 0.14533853243530662,
"grad_norm": 2.0297610759735107,
"learning_rate": 2.8966005665722377e-05,
"loss": 6.974455261230469,
"step": 410
},
{
"epoch": 0.1488833746898263,
"grad_norm": 2.0817925930023193,
"learning_rate": 2.9674220963172805e-05,
"loss": 6.954932403564453,
"step": 420
},
{
"epoch": 0.15242821694434597,
"grad_norm": 2.302412271499634,
"learning_rate": 3.0382436260623233e-05,
"loss": 6.9087669372558596,
"step": 430
},
{
"epoch": 0.15597305919886564,
"grad_norm": 2.4860918521881104,
"learning_rate": 3.109065155807366e-05,
"loss": 6.853008270263672,
"step": 440
},
{
"epoch": 0.15951790145338532,
"grad_norm": 1.9176361560821533,
"learning_rate": 3.179886685552408e-05,
"loss": 6.821636199951172,
"step": 450
},
{
"epoch": 0.163062743707905,
"grad_norm": 2.412122964859009,
"learning_rate": 3.2507082152974506e-05,
"loss": 6.7895256042480465,
"step": 460
},
{
"epoch": 0.16660758596242467,
"grad_norm": 2.1598598957061768,
"learning_rate": 3.3215297450424934e-05,
"loss": 6.758164978027343,
"step": 470
},
{
"epoch": 0.17015242821694435,
"grad_norm": 1.4905693531036377,
"learning_rate": 3.3923512747875355e-05,
"loss": 6.707500457763672,
"step": 480
},
{
"epoch": 0.17369727047146402,
"grad_norm": 2.570793628692627,
"learning_rate": 3.4631728045325776e-05,
"loss": 6.675039672851563,
"step": 490
},
{
"epoch": 0.1772421127259837,
"grad_norm": 1.7906855344772339,
"learning_rate": 3.53399433427762e-05,
"loss": 6.652853393554688,
"step": 500
},
{
"epoch": 0.18078695498050337,
"grad_norm": 2.3503963947296143,
"learning_rate": 3.604815864022663e-05,
"loss": 6.630236053466797,
"step": 510
},
{
"epoch": 0.18433179723502305,
"grad_norm": 2.17346453666687,
"learning_rate": 3.675637393767706e-05,
"loss": 6.607561492919922,
"step": 520
},
{
"epoch": 0.18787663948954272,
"grad_norm": 2.7891957759857178,
"learning_rate": 3.746458923512748e-05,
"loss": 6.555358123779297,
"step": 530
},
{
"epoch": 0.1914214817440624,
"grad_norm": 1.5042469501495361,
"learning_rate": 3.817280453257791e-05,
"loss": 6.542356872558594,
"step": 540
},
{
"epoch": 0.19496632399858208,
"grad_norm": 2.329241991043091,
"learning_rate": 3.888101983002833e-05,
"loss": 6.541840362548828,
"step": 550
},
{
"epoch": 0.19851116625310175,
"grad_norm": 2.9417874813079834,
"learning_rate": 3.9589235127478756e-05,
"loss": 6.506352233886719,
"step": 560
},
{
"epoch": 0.2020560085076214,
"grad_norm": 1.765146255493164,
"learning_rate": 4.029745042492918e-05,
"loss": 6.494882202148437,
"step": 570
},
{
"epoch": 0.20560085076214107,
"grad_norm": 1.5695878267288208,
"learning_rate": 4.1005665722379605e-05,
"loss": 6.457882690429687,
"step": 580
},
{
"epoch": 0.20914569301666075,
"grad_norm": 2.1169416904449463,
"learning_rate": 4.171388101983003e-05,
"loss": 6.4466796875,
"step": 590
},
{
"epoch": 0.21269053527118043,
"grad_norm": 2.836350679397583,
"learning_rate": 4.242209631728046e-05,
"loss": 6.394796371459961,
"step": 600
},
{
"epoch": 0.2162353775257001,
"grad_norm": 2.057159423828125,
"learning_rate": 4.313031161473088e-05,
"loss": 6.309791564941406,
"step": 610
},
{
"epoch": 0.21978021978021978,
"grad_norm": 1.9764236211776733,
"learning_rate": 4.38385269121813e-05,
"loss": 6.143642425537109,
"step": 620
},
{
"epoch": 0.22332506203473945,
"grad_norm": 1.5374716520309448,
"learning_rate": 4.454674220963173e-05,
"loss": 6.059618377685547,
"step": 630
},
{
"epoch": 0.22686990428925913,
"grad_norm": 1.504021406173706,
"learning_rate": 4.525495750708216e-05,
"loss": 5.966709899902344,
"step": 640
},
{
"epoch": 0.2304147465437788,
"grad_norm": 1.8329981565475464,
"learning_rate": 4.596317280453258e-05,
"loss": 5.919542694091797,
"step": 650
},
{
"epoch": 0.23395958879829848,
"grad_norm": 1.955461859703064,
"learning_rate": 4.6671388101983006e-05,
"loss": 5.863460540771484,
"step": 660
},
{
"epoch": 0.23750443105281815,
"grad_norm": 1.6255284547805786,
"learning_rate": 4.7379603399433434e-05,
"loss": 5.847004318237305,
"step": 670
},
{
"epoch": 0.24104927330733783,
"grad_norm": 1.6530513763427734,
"learning_rate": 4.8087818696883855e-05,
"loss": 5.811199569702149,
"step": 680
},
{
"epoch": 0.2445941155618575,
"grad_norm": 1.6961824893951416,
"learning_rate": 4.8796033994334276e-05,
"loss": 5.7780517578125,
"step": 690
},
{
"epoch": 0.24813895781637718,
"grad_norm": 1.508845567703247,
"learning_rate": 4.9504249291784704e-05,
"loss": 5.740897750854492,
"step": 700
},
{
"epoch": 0.25168380007089686,
"grad_norm": 1.4073859453201294,
"learning_rate": 4.999999381545897e-05,
"loss": 5.743928146362305,
"step": 710
},
{
"epoch": 0.25522864232541653,
"grad_norm": 1.2714122533798218,
"learning_rate": 4.999988386814785e-05,
"loss": 5.7146648406982425,
"step": 720
},
{
"epoch": 0.2587734845799362,
"grad_norm": 1.105147361755371,
"learning_rate": 4.999963648728715e-05,
"loss": 5.703514862060547,
"step": 730
},
{
"epoch": 0.2623183268344559,
"grad_norm": 1.281786561012268,
"learning_rate": 4.99992516742368e-05,
"loss": 5.659426879882813,
"step": 740
},
{
"epoch": 0.26586316908897556,
"grad_norm": 1.4539729356765747,
"learning_rate": 4.999872943111228e-05,
"loss": 5.675305938720703,
"step": 750
},
{
"epoch": 0.26940801134349524,
"grad_norm": 1.2878540754318237,
"learning_rate": 4.9998069760784536e-05,
"loss": 5.644029998779297,
"step": 760
},
{
"epoch": 0.2729528535980149,
"grad_norm": 1.2199702262878418,
"learning_rate": 4.9997272666880024e-05,
"loss": 5.6056877136230465,
"step": 770
},
{
"epoch": 0.2764976958525346,
"grad_norm": 1.2377008199691772,
"learning_rate": 4.999633815378066e-05,
"loss": 5.609469604492188,
"step": 780
},
{
"epoch": 0.28004253810705426,
"grad_norm": 1.278878927230835,
"learning_rate": 4.9995266226623807e-05,
"loss": 5.626304626464844,
"step": 790
},
{
"epoch": 0.28358738036157394,
"grad_norm": 1.3319532871246338,
"learning_rate": 4.999405689130224e-05,
"loss": 5.580442428588867,
"step": 800
},
{
"epoch": 0.28713222261609356,
"grad_norm": 1.683655858039856,
"learning_rate": 4.9992710154464116e-05,
"loss": 5.579409790039063,
"step": 810
},
{
"epoch": 0.29067706487061323,
"grad_norm": 1.1337031126022339,
"learning_rate": 4.999122602351296e-05,
"loss": 5.577402877807617,
"step": 820
},
{
"epoch": 0.2942219071251329,
"grad_norm": 1.3304811716079712,
"learning_rate": 4.9989604506607564e-05,
"loss": 5.547556686401367,
"step": 830
},
{
"epoch": 0.2977667493796526,
"grad_norm": 1.2711671590805054,
"learning_rate": 4.998784561266201e-05,
"loss": 5.548542404174805,
"step": 840
},
{
"epoch": 0.30131159163417226,
"grad_norm": 1.3355711698532104,
"learning_rate": 4.998594935134559e-05,
"loss": 5.53816032409668,
"step": 850
},
{
"epoch": 0.30485643388869194,
"grad_norm": 1.1520898342132568,
"learning_rate": 4.998391573308275e-05,
"loss": 5.541797637939453,
"step": 860
},
{
"epoch": 0.3084012761432116,
"grad_norm": 1.3476178646087646,
"learning_rate": 4.998174476905303e-05,
"loss": 5.528475570678711,
"step": 870
},
{
"epoch": 0.3119461183977313,
"grad_norm": 0.979820191860199,
"learning_rate": 4.9979436471191015e-05,
"loss": 5.501230621337891,
"step": 880
},
{
"epoch": 0.31549096065225096,
"grad_norm": 0.9973044395446777,
"learning_rate": 4.997699085218628e-05,
"loss": 5.48109130859375,
"step": 890
},
{
"epoch": 0.31903580290677064,
"grad_norm": 1.2147647142410278,
"learning_rate": 4.9974407925483275e-05,
"loss": 5.5053356170654295,
"step": 900
},
{
"epoch": 0.3225806451612903,
"grad_norm": 1.174742579460144,
"learning_rate": 4.9971687705281305e-05,
"loss": 5.525197982788086,
"step": 910
},
{
"epoch": 0.32612548741581,
"grad_norm": 1.0364092588424683,
"learning_rate": 4.9968830206534426e-05,
"loss": 5.507562255859375,
"step": 920
},
{
"epoch": 0.32967032967032966,
"grad_norm": 1.1423817873001099,
"learning_rate": 4.9965835444951345e-05,
"loss": 5.50116081237793,
"step": 930
},
{
"epoch": 0.33321517192484934,
"grad_norm": 1.063589096069336,
"learning_rate": 4.996270343699539e-05,
"loss": 5.483753967285156,
"step": 940
},
{
"epoch": 0.336760014179369,
"grad_norm": 1.156726360321045,
"learning_rate": 4.995943419988433e-05,
"loss": 5.5090789794921875,
"step": 950
},
{
"epoch": 0.3403048564338887,
"grad_norm": 1.3510740995407104,
"learning_rate": 4.995602775159038e-05,
"loss": 5.480130767822265,
"step": 960
},
{
"epoch": 0.34384969868840837,
"grad_norm": 1.1639209985733032,
"learning_rate": 4.995248411084004e-05,
"loss": 5.487713623046875,
"step": 970
},
{
"epoch": 0.34739454094292804,
"grad_norm": 1.0440890789031982,
"learning_rate": 4.9948803297114e-05,
"loss": 5.465737152099609,
"step": 980
},
{
"epoch": 0.3509393831974477,
"grad_norm": 0.9426578283309937,
"learning_rate": 4.9944985330647045e-05,
"loss": 5.451010894775391,
"step": 990
},
{
"epoch": 0.3544842254519674,
"grad_norm": 0.8856348991394043,
"learning_rate": 4.9941030232427945e-05,
"loss": 5.4333232879638675,
"step": 1000
},
{
"epoch": 0.35802906770648707,
"grad_norm": 1.196390986442566,
"learning_rate": 4.993693802419933e-05,
"loss": 5.440399169921875,
"step": 1010
},
{
"epoch": 0.36157390996100675,
"grad_norm": 0.9776602387428284,
"learning_rate": 4.993270872845756e-05,
"loss": 5.427825164794922,
"step": 1020
},
{
"epoch": 0.3651187522155264,
"grad_norm": 0.776901125907898,
"learning_rate": 4.992834236845264e-05,
"loss": 5.455727386474609,
"step": 1030
},
{
"epoch": 0.3686635944700461,
"grad_norm": 0.8997649550437927,
"learning_rate": 4.992383896818805e-05,
"loss": 5.438071823120117,
"step": 1040
},
{
"epoch": 0.37220843672456577,
"grad_norm": 0.8250705003738403,
"learning_rate": 4.991919855242065e-05,
"loss": 5.4653472900390625,
"step": 1050
},
{
"epoch": 0.37575327897908545,
"grad_norm": 0.8961722254753113,
"learning_rate": 4.991442114666049e-05,
"loss": 5.4244224548339846,
"step": 1060
},
{
"epoch": 0.3792981212336051,
"grad_norm": 0.9243871569633484,
"learning_rate": 4.990950677717073e-05,
"loss": 5.4270378112792965,
"step": 1070
},
{
"epoch": 0.3828429634881248,
"grad_norm": 0.9992454051971436,
"learning_rate": 4.990445547096748e-05,
"loss": 5.421928024291992,
"step": 1080
},
{
"epoch": 0.3863878057426445,
"grad_norm": 0.8732923269271851,
"learning_rate": 4.989926725581962e-05,
"loss": 5.399656295776367,
"step": 1090
},
{
"epoch": 0.38993264799716415,
"grad_norm": 1.0609304904937744,
"learning_rate": 4.989394216024866e-05,
"loss": 5.381362533569336,
"step": 1100
},
{
"epoch": 0.3934774902516838,
"grad_norm": 0.8973801732063293,
"learning_rate": 4.9888480213528624e-05,
"loss": 5.3848522186279295,
"step": 1110
},
{
"epoch": 0.3970223325062035,
"grad_norm": 0.8304575085639954,
"learning_rate": 4.988288144568583e-05,
"loss": 5.370730590820313,
"step": 1120
},
{
"epoch": 0.4005671747607231,
"grad_norm": 1.0252659320831299,
"learning_rate": 4.9877145887498774e-05,
"loss": 5.375761032104492,
"step": 1130
},
{
"epoch": 0.4041120170152428,
"grad_norm": 0.9789050221443176,
"learning_rate": 4.9871273570497924e-05,
"loss": 5.376107788085937,
"step": 1140
},
{
"epoch": 0.4076568592697625,
"grad_norm": 0.7593803405761719,
"learning_rate": 4.986526452696556e-05,
"loss": 5.3734375,
"step": 1150
},
{
"epoch": 0.41120170152428215,
"grad_norm": 0.7464333772659302,
"learning_rate": 4.98591187899356e-05,
"loss": 5.366069793701172,
"step": 1160
},
{
"epoch": 0.4147465437788018,
"grad_norm": 0.7203890085220337,
"learning_rate": 4.9852836393193436e-05,
"loss": 5.3826652526855465,
"step": 1170
},
{
"epoch": 0.4182913860333215,
"grad_norm": 0.9423730373382568,
"learning_rate": 4.984641737127569e-05,
"loss": 5.383267974853515,
"step": 1180
},
{
"epoch": 0.4218362282878412,
"grad_norm": 0.8559587597846985,
"learning_rate": 4.98398617594701e-05,
"loss": 5.379903411865234,
"step": 1190
},
{
"epoch": 0.42538107054236085,
"grad_norm": 0.7626580595970154,
"learning_rate": 4.9833169593815264e-05,
"loss": 5.348976135253906,
"step": 1200
},
{
"epoch": 0.4289259127968805,
"grad_norm": 0.9729316830635071,
"learning_rate": 4.9826340911100484e-05,
"loss": 5.38103141784668,
"step": 1210
},
{
"epoch": 0.4324707550514002,
"grad_norm": 0.9993200898170471,
"learning_rate": 4.981937574886553e-05,
"loss": 5.35406265258789,
"step": 1220
},
{
"epoch": 0.4360155973059199,
"grad_norm": 0.8553763031959534,
"learning_rate": 4.9812274145400476e-05,
"loss": 5.344794845581054,
"step": 1230
},
{
"epoch": 0.43956043956043955,
"grad_norm": 0.8764554262161255,
"learning_rate": 4.980503613974542e-05,
"loss": 5.346466064453125,
"step": 1240
},
{
"epoch": 0.44310528181495923,
"grad_norm": 0.931968629360199,
"learning_rate": 4.9797661771690355e-05,
"loss": 5.383474349975586,
"step": 1250
},
{
"epoch": 0.4466501240694789,
"grad_norm": 0.9033966660499573,
"learning_rate": 4.9790151081774894e-05,
"loss": 5.3460533142089846,
"step": 1260
},
{
"epoch": 0.4501949663239986,
"grad_norm": 0.7806386947631836,
"learning_rate": 4.978250411128805e-05,
"loss": 5.311779022216797,
"step": 1270
},
{
"epoch": 0.45373980857851826,
"grad_norm": 0.9154277443885803,
"learning_rate": 4.9774720902268045e-05,
"loss": 5.3727764129638675,
"step": 1280
},
{
"epoch": 0.45728465083303793,
"grad_norm": 0.9599602222442627,
"learning_rate": 4.9766801497502025e-05,
"loss": 5.323087692260742,
"step": 1290
},
{
"epoch": 0.4608294930875576,
"grad_norm": 0.7230603098869324,
"learning_rate": 4.9758745940525874e-05,
"loss": 5.366838073730468,
"step": 1300
},
{
"epoch": 0.4643743353420773,
"grad_norm": 1.0107765197753906,
"learning_rate": 4.975055427562396e-05,
"loss": 5.308674240112305,
"step": 1310
},
{
"epoch": 0.46791917759659696,
"grad_norm": 0.8170345425605774,
"learning_rate": 4.974222654782885e-05,
"loss": 5.34793701171875,
"step": 1320
},
{
"epoch": 0.47146401985111663,
"grad_norm": 0.9455390572547913,
"learning_rate": 4.973376280292115e-05,
"loss": 5.278959274291992,
"step": 1330
},
{
"epoch": 0.4750088621056363,
"grad_norm": 0.7414869666099548,
"learning_rate": 4.9725163087429164e-05,
"loss": 5.352619934082031,
"step": 1340
},
{
"epoch": 0.478553704360156,
"grad_norm": 0.6697789430618286,
"learning_rate": 4.971642744862869e-05,
"loss": 5.344553375244141,
"step": 1350
},
{
"epoch": 0.48209854661467566,
"grad_norm": 0.7350028157234192,
"learning_rate": 4.9707555934542735e-05,
"loss": 5.331580352783203,
"step": 1360
},
{
"epoch": 0.48564338886919534,
"grad_norm": 0.7793420553207397,
"learning_rate": 4.9698548593941295e-05,
"loss": 5.325060272216797,
"step": 1370
},
{
"epoch": 0.489188231123715,
"grad_norm": 0.9029025435447693,
"learning_rate": 4.968940547634102e-05,
"loss": 5.306049728393555,
"step": 1380
},
{
"epoch": 0.4927330733782347,
"grad_norm": 0.8494910597801208,
"learning_rate": 4.9680126632004984e-05,
"loss": 5.327105331420898,
"step": 1390
},
{
"epoch": 0.49627791563275436,
"grad_norm": 0.7679411172866821,
"learning_rate": 4.967071211194241e-05,
"loss": 5.330787277221679,
"step": 1400
},
{
"epoch": 0.49982275788727404,
"grad_norm": 0.7874600291252136,
"learning_rate": 4.966116196790836e-05,
"loss": 5.342826461791992,
"step": 1410
},
{
"epoch": 0.5033676001417937,
"grad_norm": 0.9020946025848389,
"learning_rate": 4.965147625240351e-05,
"loss": 5.3182518005371096,
"step": 1420
},
{
"epoch": 0.5069124423963134,
"grad_norm": 0.7656190991401672,
"learning_rate": 4.964165501867378e-05,
"loss": 5.303837585449219,
"step": 1430
},
{
"epoch": 0.5104572846508331,
"grad_norm": 0.7332035303115845,
"learning_rate": 4.9631698320710115e-05,
"loss": 5.324761962890625,
"step": 1440
},
{
"epoch": 0.5140021269053527,
"grad_norm": 0.6765123009681702,
"learning_rate": 4.962160621324813e-05,
"loss": 5.318471527099609,
"step": 1450
},
{
"epoch": 0.5175469691598724,
"grad_norm": 0.7399813532829285,
"learning_rate": 4.9611378751767854e-05,
"loss": 5.327352905273438,
"step": 1460
},
{
"epoch": 0.5210918114143921,
"grad_norm": 0.9081066846847534,
"learning_rate": 4.96010159924934e-05,
"loss": 5.302130889892578,
"step": 1470
},
{
"epoch": 0.5246366536689118,
"grad_norm": 0.8822699189186096,
"learning_rate": 4.959051799239267e-05,
"loss": 5.313935470581055,
"step": 1480
},
{
"epoch": 0.5281814959234314,
"grad_norm": 0.7201648354530334,
"learning_rate": 4.9579884809177024e-05,
"loss": 5.290200805664062,
"step": 1490
},
{
"epoch": 0.5317263381779511,
"grad_norm": 0.6942399740219116,
"learning_rate": 4.956911650130098e-05,
"loss": 5.262855148315429,
"step": 1500
},
{
"epoch": 0.5352711804324708,
"grad_norm": 0.7676780819892883,
"learning_rate": 4.955821312796188e-05,
"loss": 5.308451461791992,
"step": 1510
},
{
"epoch": 0.5388160226869905,
"grad_norm": 0.8454974889755249,
"learning_rate": 4.954717474909958e-05,
"loss": 5.288463592529297,
"step": 1520
},
{
"epoch": 0.5423608649415101,
"grad_norm": 0.622800350189209,
"learning_rate": 4.953600142539609e-05,
"loss": 5.301121520996094,
"step": 1530
},
{
"epoch": 0.5459057071960298,
"grad_norm": 0.7299189567565918,
"learning_rate": 4.9524693218275306e-05,
"loss": 5.29693603515625,
"step": 1540
},
{
"epoch": 0.5494505494505495,
"grad_norm": 0.7149349451065063,
"learning_rate": 4.951325018990258e-05,
"loss": 5.285152435302734,
"step": 1550
},
{
"epoch": 0.5529953917050692,
"grad_norm": 0.7443231344223022,
"learning_rate": 4.950167240318444e-05,
"loss": 5.3107250213623045,
"step": 1560
},
{
"epoch": 0.5565402339595888,
"grad_norm": 0.7101079821586609,
"learning_rate": 4.948995992176824e-05,
"loss": 5.288132858276367,
"step": 1570
},
{
"epoch": 0.5600850762141085,
"grad_norm": 0.6409624218940735,
"learning_rate": 4.94781128100418e-05,
"loss": 5.258818817138672,
"step": 1580
},
{
"epoch": 0.5636299184686282,
"grad_norm": 0.7422112226486206,
"learning_rate": 4.946613113313304e-05,
"loss": 5.310946273803711,
"step": 1590
},
{
"epoch": 0.5671747607231479,
"grad_norm": 0.7419252991676331,
"learning_rate": 4.9454014956909644e-05,
"loss": 5.281931686401367,
"step": 1600
},
{
"epoch": 0.5707196029776674,
"grad_norm": 0.6587216258049011,
"learning_rate": 4.944176434797869e-05,
"loss": 5.29292106628418,
"step": 1610
},
{
"epoch": 0.5742644452321871,
"grad_norm": 0.7236230373382568,
"learning_rate": 4.942937937368628e-05,
"loss": 5.291954803466797,
"step": 1620
},
{
"epoch": 0.5778092874867068,
"grad_norm": 0.8149150013923645,
"learning_rate": 4.941686010211715e-05,
"loss": 5.252371978759766,
"step": 1630
},
{
"epoch": 0.5813541297412265,
"grad_norm": 0.6907062530517578,
"learning_rate": 4.940420660209436e-05,
"loss": 5.272654342651367,
"step": 1640
},
{
"epoch": 0.5848989719957461,
"grad_norm": 0.7174138426780701,
"learning_rate": 4.9391418943178836e-05,
"loss": 5.294339752197265,
"step": 1650
},
{
"epoch": 0.5884438142502658,
"grad_norm": 0.798039972782135,
"learning_rate": 4.9378497195669036e-05,
"loss": 5.257662963867188,
"step": 1660
},
{
"epoch": 0.5919886565047855,
"grad_norm": 0.724810004234314,
"learning_rate": 4.936544143060058e-05,
"loss": 5.284110641479492,
"step": 1670
},
{
"epoch": 0.5955334987593052,
"grad_norm": 0.7931723594665527,
"learning_rate": 4.9352251719745774e-05,
"loss": 5.281097412109375,
"step": 1680
},
{
"epoch": 0.5990783410138248,
"grad_norm": 0.6884552836418152,
"learning_rate": 4.933892813561333e-05,
"loss": 5.27345085144043,
"step": 1690
},
{
"epoch": 0.6026231832683445,
"grad_norm": 0.6960254907608032,
"learning_rate": 4.9325470751447866e-05,
"loss": 5.2600860595703125,
"step": 1700
},
{
"epoch": 0.6061680255228642,
"grad_norm": 0.7212153077125549,
"learning_rate": 4.931187964122959e-05,
"loss": 5.293134307861328,
"step": 1710
},
{
"epoch": 0.6097128677773839,
"grad_norm": 0.7511916756629944,
"learning_rate": 4.929815487967382e-05,
"loss": 5.3018230438232425,
"step": 1720
},
{
"epoch": 0.6132577100319035,
"grad_norm": 0.7361993193626404,
"learning_rate": 4.9284296542230615e-05,
"loss": 5.259002685546875,
"step": 1730
},
{
"epoch": 0.6168025522864232,
"grad_norm": 0.6815996170043945,
"learning_rate": 4.927030470508434e-05,
"loss": 5.303490447998047,
"step": 1740
},
{
"epoch": 0.6203473945409429,
"grad_norm": 0.6869247555732727,
"learning_rate": 4.925617944515328e-05,
"loss": 5.268404388427735,
"step": 1750
},
{
"epoch": 0.6238922367954626,
"grad_norm": 0.712818443775177,
"learning_rate": 4.9241920840089176e-05,
"loss": 5.259001922607422,
"step": 1760
},
{
"epoch": 0.6274370790499822,
"grad_norm": 0.7157370448112488,
"learning_rate": 4.922752896827682e-05,
"loss": 5.2527915954589846,
"step": 1770
},
{
"epoch": 0.6309819213045019,
"grad_norm": 0.7796548008918762,
"learning_rate": 4.921300390883362e-05,
"loss": 5.244187545776367,
"step": 1780
},
{
"epoch": 0.6345267635590216,
"grad_norm": 0.6423467397689819,
"learning_rate": 4.919834574160916e-05,
"loss": 5.230907440185547,
"step": 1790
},
{
"epoch": 0.6380716058135413,
"grad_norm": 0.7811551094055176,
"learning_rate": 4.9183554547184784e-05,
"loss": 5.229999923706055,
"step": 1800
},
{
"epoch": 0.641616448068061,
"grad_norm": 0.6660889387130737,
"learning_rate": 4.916863040687312e-05,
"loss": 5.248212814331055,
"step": 1810
},
{
"epoch": 0.6451612903225806,
"grad_norm": 0.6456777453422546,
"learning_rate": 4.915357340271765e-05,
"loss": 5.257810592651367,
"step": 1820
},
{
"epoch": 0.6487061325771003,
"grad_norm": 0.6784165501594543,
"learning_rate": 4.9138383617492254e-05,
"loss": 5.257859420776367,
"step": 1830
},
{
"epoch": 0.65225097483162,
"grad_norm": 0.6905676126480103,
"learning_rate": 4.9123061134700774e-05,
"loss": 5.253431701660157,
"step": 1840
},
{
"epoch": 0.6557958170861397,
"grad_norm": 0.6898303031921387,
"learning_rate": 4.9107606038576523e-05,
"loss": 5.243818283081055,
"step": 1850
},
{
"epoch": 0.6593406593406593,
"grad_norm": 0.6445653438568115,
"learning_rate": 4.9092018414081854e-05,
"loss": 5.2768810272216795,
"step": 1860
},
{
"epoch": 0.662885501595179,
"grad_norm": 0.6441115140914917,
"learning_rate": 4.9076298346907654e-05,
"loss": 5.255257415771484,
"step": 1870
},
{
"epoch": 0.6664303438496987,
"grad_norm": 0.6179636716842651,
"learning_rate": 4.906044592347292e-05,
"loss": 5.236399078369141,
"step": 1880
},
{
"epoch": 0.6699751861042184,
"grad_norm": 0.6955509781837463,
"learning_rate": 4.904446123092424e-05,
"loss": 5.26529541015625,
"step": 1890
},
{
"epoch": 0.673520028358738,
"grad_norm": 0.716314971446991,
"learning_rate": 4.9028344357135355e-05,
"loss": 5.257140350341797,
"step": 1900
},
{
"epoch": 0.6770648706132577,
"grad_norm": 0.785479724407196,
"learning_rate": 4.9012095390706636e-05,
"loss": 5.23547477722168,
"step": 1910
},
{
"epoch": 0.6806097128677774,
"grad_norm": 0.619127094745636,
"learning_rate": 4.899571442096462e-05,
"loss": 5.233910751342774,
"step": 1920
},
{
"epoch": 0.6841545551222971,
"grad_norm": 0.6372919082641602,
"learning_rate": 4.897920153796153e-05,
"loss": 5.2188163757324215,
"step": 1930
},
{
"epoch": 0.6876993973768167,
"grad_norm": 0.6643814444541931,
"learning_rate": 4.896255683247474e-05,
"loss": 5.250975036621094,
"step": 1940
},
{
"epoch": 0.6912442396313364,
"grad_norm": 0.551176130771637,
"learning_rate": 4.894578039600633e-05,
"loss": 5.245917510986328,
"step": 1950
},
{
"epoch": 0.6947890818858561,
"grad_norm": 0.7061398029327393,
"learning_rate": 4.892887232078251e-05,
"loss": 5.223080062866211,
"step": 1960
},
{
"epoch": 0.6983339241403758,
"grad_norm": 0.6970100402832031,
"learning_rate": 4.8911832699753205e-05,
"loss": 5.245952606201172,
"step": 1970
},
{
"epoch": 0.7018787663948954,
"grad_norm": 0.6700912117958069,
"learning_rate": 4.8894661626591475e-05,
"loss": 5.273249816894531,
"step": 1980
},
{
"epoch": 0.7054236086494151,
"grad_norm": 0.5814111828804016,
"learning_rate": 4.8877359195693005e-05,
"loss": 5.207174682617188,
"step": 1990
},
{
"epoch": 0.7089684509039348,
"grad_norm": 0.7154501080513,
"learning_rate": 4.885992550217563e-05,
"loss": 5.2551219940185545,
"step": 2000
}
],
"logging_steps": 10,
"max_steps": 14105,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.6845619367143014e+19,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}