{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 2015,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004962779156327543,
"grad_norm": null,
"learning_rate": 1.9982133995037223e-05,
"loss": 4.0715,
"step": 10
},
{
"epoch": 0.009925558312655087,
"grad_norm": 7.404272079467773,
"learning_rate": 1.996228287841191e-05,
"loss": 2.8811,
"step": 20
},
{
"epoch": 0.01488833746898263,
"grad_norm": 13.565997123718262,
"learning_rate": 1.99424317617866e-05,
"loss": 3.1689,
"step": 30
},
{
"epoch": 0.019851116625310174,
"grad_norm": 15.281591415405273,
"learning_rate": 1.9922580645161292e-05,
"loss": 3.0516,
"step": 40
},
{
"epoch": 0.02481389578163772,
"grad_norm": 7.509647369384766,
"learning_rate": 1.9902729528535983e-05,
"loss": 2.9276,
"step": 50
},
{
"epoch": 0.02977667493796526,
"grad_norm": 7.238119602203369,
"learning_rate": 1.988287841191067e-05,
"loss": 2.0641,
"step": 60
},
{
"epoch": 0.034739454094292806,
"grad_norm": 8.467841148376465,
"learning_rate": 1.9863027295285362e-05,
"loss": 1.8259,
"step": 70
},
{
"epoch": 0.03970223325062035,
"grad_norm": 7.360463619232178,
"learning_rate": 1.984317617866005e-05,
"loss": 1.7238,
"step": 80
},
{
"epoch": 0.04466501240694789,
"grad_norm": 6.133955478668213,
"learning_rate": 1.982332506203474e-05,
"loss": 1.3318,
"step": 90
},
{
"epoch": 0.04962779156327544,
"grad_norm": 6.2159295082092285,
"learning_rate": 1.980347394540943e-05,
"loss": 1.1172,
"step": 100
},
{
"epoch": 0.05459057071960298,
"grad_norm": 2.4206221103668213,
"learning_rate": 1.9783622828784122e-05,
"loss": 1.033,
"step": 110
},
{
"epoch": 0.05955334987593052,
"grad_norm": 5.191859245300293,
"learning_rate": 1.976377171215881e-05,
"loss": 0.98,
"step": 120
},
{
"epoch": 0.06451612903225806,
"grad_norm": 2.742246150970459,
"learning_rate": 1.97439205955335e-05,
"loss": 0.7428,
"step": 130
},
{
"epoch": 0.06947890818858561,
"grad_norm": 4.614717960357666,
"learning_rate": 1.972406947890819e-05,
"loss": 0.9041,
"step": 140
},
{
"epoch": 0.07444168734491315,
"grad_norm": 3.7935266494750977,
"learning_rate": 1.9704218362282882e-05,
"loss": 0.5919,
"step": 150
},
{
"epoch": 0.0794044665012407,
"grad_norm": 3.5459160804748535,
"learning_rate": 1.968436724565757e-05,
"loss": 0.7932,
"step": 160
},
{
"epoch": 0.08436724565756824,
"grad_norm": 1.8404773473739624,
"learning_rate": 1.966451612903226e-05,
"loss": 0.7619,
"step": 170
},
{
"epoch": 0.08933002481389578,
"grad_norm": 2.2088851928710938,
"learning_rate": 1.964466501240695e-05,
"loss": 0.6204,
"step": 180
},
{
"epoch": 0.09429280397022333,
"grad_norm": 1.566972017288208,
"learning_rate": 1.962481389578164e-05,
"loss": 0.7081,
"step": 190
},
{
"epoch": 0.09925558312655088,
"grad_norm": 1.225921630859375,
"learning_rate": 1.960496277915633e-05,
"loss": 0.5577,
"step": 200
},
{
"epoch": 0.10421836228287841,
"grad_norm": 2.0241658687591553,
"learning_rate": 1.958511166253102e-05,
"loss": 0.5101,
"step": 210
},
{
"epoch": 0.10918114143920596,
"grad_norm": 2.1363158226013184,
"learning_rate": 1.956526054590571e-05,
"loss": 0.4952,
"step": 220
},
{
"epoch": 0.1141439205955335,
"grad_norm": 2.7994675636291504,
"learning_rate": 1.9545409429280396e-05,
"loss": 0.5103,
"step": 230
},
{
"epoch": 0.11910669975186104,
"grad_norm": 2.4477338790893555,
"learning_rate": 1.9525558312655087e-05,
"loss": 0.3801,
"step": 240
},
{
"epoch": 0.12406947890818859,
"grad_norm": 2.0796492099761963,
"learning_rate": 1.9505707196029778e-05,
"loss": 0.4258,
"step": 250
},
{
"epoch": 0.12903225806451613,
"grad_norm": 1.3138341903686523,
"learning_rate": 1.948585607940447e-05,
"loss": 0.4838,
"step": 260
},
{
"epoch": 0.13399503722084366,
"grad_norm": 1.5272380113601685,
"learning_rate": 1.9466004962779156e-05,
"loss": 0.366,
"step": 270
},
{
"epoch": 0.13895781637717122,
"grad_norm": 3.6165964603424072,
"learning_rate": 1.9446153846153847e-05,
"loss": 0.4171,
"step": 280
},
{
"epoch": 0.14392059553349876,
"grad_norm": 2.2593328952789307,
"learning_rate": 1.9426302729528538e-05,
"loss": 0.3028,
"step": 290
},
{
"epoch": 0.1488833746898263,
"grad_norm": 1.5910056829452515,
"learning_rate": 1.940645161290323e-05,
"loss": 0.3047,
"step": 300
},
{
"epoch": 0.15384615384615385,
"grad_norm": 1.767904281616211,
"learning_rate": 1.9386600496277917e-05,
"loss": 0.4552,
"step": 310
},
{
"epoch": 0.1588089330024814,
"grad_norm": 1.7085593938827515,
"learning_rate": 1.9366749379652608e-05,
"loss": 0.2556,
"step": 320
},
{
"epoch": 0.16377171215880892,
"grad_norm": 1.355582594871521,
"learning_rate": 1.9346898263027295e-05,
"loss": 0.4337,
"step": 330
},
{
"epoch": 0.1687344913151365,
"grad_norm": 0.8261783123016357,
"learning_rate": 1.9327047146401986e-05,
"loss": 0.3328,
"step": 340
},
{
"epoch": 0.17369727047146402,
"grad_norm": 0.8605831861495972,
"learning_rate": 1.9307196029776677e-05,
"loss": 0.2587,
"step": 350
},
{
"epoch": 0.17866004962779156,
"grad_norm": 2.7887861728668213,
"learning_rate": 1.9287344913151368e-05,
"loss": 0.3559,
"step": 360
},
{
"epoch": 0.18362282878411912,
"grad_norm": 1.0830411911010742,
"learning_rate": 1.9267493796526055e-05,
"loss": 0.1826,
"step": 370
},
{
"epoch": 0.18858560794044665,
"grad_norm": 3.8424558639526367,
"learning_rate": 1.9247642679900746e-05,
"loss": 0.2587,
"step": 380
},
{
"epoch": 0.1935483870967742,
"grad_norm": 3.894951343536377,
"learning_rate": 1.9227791563275434e-05,
"loss": 0.2034,
"step": 390
},
{
"epoch": 0.19851116625310175,
"grad_norm": 3.139410972595215,
"learning_rate": 1.9207940446650125e-05,
"loss": 0.2387,
"step": 400
},
{
"epoch": 0.20347394540942929,
"grad_norm": 1.7462635040283203,
"learning_rate": 1.9188089330024816e-05,
"loss": 0.2626,
"step": 410
},
{
"epoch": 0.20843672456575682,
"grad_norm": 4.002986907958984,
"learning_rate": 1.9168238213399507e-05,
"loss": 0.1782,
"step": 420
},
{
"epoch": 0.21339950372208435,
"grad_norm": 2.251654624938965,
"learning_rate": 1.9148387096774194e-05,
"loss": 0.1621,
"step": 430
},
{
"epoch": 0.21836228287841192,
"grad_norm": 0.6951057314872742,
"learning_rate": 1.9128535980148885e-05,
"loss": 0.1937,
"step": 440
},
{
"epoch": 0.22332506203473945,
"grad_norm": 0.8540646433830261,
"learning_rate": 1.9108684863523576e-05,
"loss": 0.1913,
"step": 450
},
{
"epoch": 0.228287841191067,
"grad_norm": 0.9929500222206116,
"learning_rate": 1.9088833746898267e-05,
"loss": 0.2433,
"step": 460
},
{
"epoch": 0.23325062034739455,
"grad_norm": 3.8289272785186768,
"learning_rate": 1.9068982630272954e-05,
"loss": 0.1903,
"step": 470
},
{
"epoch": 0.23821339950372208,
"grad_norm": 4.537591934204102,
"learning_rate": 1.9049131513647645e-05,
"loss": 0.2136,
"step": 480
},
{
"epoch": 0.24317617866004962,
"grad_norm": 0.20700696110725403,
"learning_rate": 1.9029280397022333e-05,
"loss": 0.1888,
"step": 490
},
{
"epoch": 0.24813895781637718,
"grad_norm": 0.4332411289215088,
"learning_rate": 1.9009429280397024e-05,
"loss": 0.2358,
"step": 500
},
{
"epoch": 0.2531017369727047,
"grad_norm": 1.050528645515442,
"learning_rate": 1.8989578163771715e-05,
"loss": 0.0832,
"step": 510
},
{
"epoch": 0.25806451612903225,
"grad_norm": 0.22720620036125183,
"learning_rate": 1.8969727047146406e-05,
"loss": 0.1272,
"step": 520
},
{
"epoch": 0.2630272952853598,
"grad_norm": 3.576136350631714,
"learning_rate": 1.8949875930521093e-05,
"loss": 0.2066,
"step": 530
},
{
"epoch": 0.2679900744416873,
"grad_norm": 0.33265048265457153,
"learning_rate": 1.8930024813895784e-05,
"loss": 0.0556,
"step": 540
},
{
"epoch": 0.2729528535980149,
"grad_norm": 0.6722800731658936,
"learning_rate": 1.891017369727047e-05,
"loss": 0.0807,
"step": 550
},
{
"epoch": 0.27791563275434245,
"grad_norm": 1.2628779411315918,
"learning_rate": 1.8890322580645163e-05,
"loss": 0.24,
"step": 560
},
{
"epoch": 0.28287841191067,
"grad_norm": 0.27271902561187744,
"learning_rate": 1.8870471464019853e-05,
"loss": 0.077,
"step": 570
},
{
"epoch": 0.2878411910669975,
"grad_norm": 3.738210916519165,
"learning_rate": 1.885062034739454e-05,
"loss": 0.395,
"step": 580
},
{
"epoch": 0.29280397022332505,
"grad_norm": 0.6131235361099243,
"learning_rate": 1.8830769230769232e-05,
"loss": 0.1971,
"step": 590
},
{
"epoch": 0.2977667493796526,
"grad_norm": 0.8646567463874817,
"learning_rate": 1.8810918114143923e-05,
"loss": 0.0669,
"step": 600
},
{
"epoch": 0.3027295285359802,
"grad_norm": 4.964555263519287,
"learning_rate": 1.8791066997518614e-05,
"loss": 0.2324,
"step": 610
},
{
"epoch": 0.3076923076923077,
"grad_norm": 0.5028713941574097,
"learning_rate": 1.87712158808933e-05,
"loss": 0.1819,
"step": 620
},
{
"epoch": 0.31265508684863524,
"grad_norm": 0.31731173396110535,
"learning_rate": 1.8751364764267992e-05,
"loss": 0.1463,
"step": 630
},
{
"epoch": 0.3176178660049628,
"grad_norm": 0.23767873644828796,
"learning_rate": 1.873151364764268e-05,
"loss": 0.0874,
"step": 640
},
{
"epoch": 0.3225806451612903,
"grad_norm": 0.5757867693901062,
"learning_rate": 1.871166253101737e-05,
"loss": 0.1267,
"step": 650
},
{
"epoch": 0.32754342431761785,
"grad_norm": 2.5683581829071045,
"learning_rate": 1.869181141439206e-05,
"loss": 0.0823,
"step": 660
},
{
"epoch": 0.3325062034739454,
"grad_norm": 0.22655190527439117,
"learning_rate": 1.8671960297766752e-05,
"loss": 0.2087,
"step": 670
},
{
"epoch": 0.337468982630273,
"grad_norm": 2.4306397438049316,
"learning_rate": 1.865210918114144e-05,
"loss": 0.1034,
"step": 680
},
{
"epoch": 0.3424317617866005,
"grad_norm": 0.11680830270051956,
"learning_rate": 1.863225806451613e-05,
"loss": 0.0316,
"step": 690
},
{
"epoch": 0.34739454094292804,
"grad_norm": 0.27029949426651,
"learning_rate": 1.861240694789082e-05,
"loss": 0.131,
"step": 700
},
{
"epoch": 0.3523573200992556,
"grad_norm": 0.31934475898742676,
"learning_rate": 1.859255583126551e-05,
"loss": 0.1961,
"step": 710
},
{
"epoch": 0.3573200992555831,
"grad_norm": 6.141274929046631,
"learning_rate": 1.85727047146402e-05,
"loss": 0.2118,
"step": 720
},
{
"epoch": 0.36228287841191065,
"grad_norm": 0.8305376172065735,
"learning_rate": 1.855285359801489e-05,
"loss": 0.0207,
"step": 730
},
{
"epoch": 0.36724565756823824,
"grad_norm": 1.411669135093689,
"learning_rate": 1.853300248138958e-05,
"loss": 0.0639,
"step": 740
},
{
"epoch": 0.37220843672456577,
"grad_norm": 2.125790596008301,
"learning_rate": 1.851315136476427e-05,
"loss": 0.2443,
"step": 750
},
{
"epoch": 0.3771712158808933,
"grad_norm": 0.9730265736579895,
"learning_rate": 1.849330024813896e-05,
"loss": 0.0786,
"step": 760
},
{
"epoch": 0.38213399503722084,
"grad_norm": 3.10215163230896,
"learning_rate": 1.847344913151365e-05,
"loss": 0.3467,
"step": 770
},
{
"epoch": 0.3870967741935484,
"grad_norm": 3.7202343940734863,
"learning_rate": 1.845359801488834e-05,
"loss": 0.2285,
"step": 780
},
{
"epoch": 0.3920595533498759,
"grad_norm": 0.5115141272544861,
"learning_rate": 1.843374689826303e-05,
"loss": 0.1705,
"step": 790
},
{
"epoch": 0.3970223325062035,
"grad_norm": 0.4430786669254303,
"learning_rate": 1.8413895781637717e-05,
"loss": 0.0631,
"step": 800
},
{
"epoch": 0.40198511166253104,
"grad_norm": 4.2651143074035645,
"learning_rate": 1.8394044665012408e-05,
"loss": 0.1615,
"step": 810
},
{
"epoch": 0.40694789081885857,
"grad_norm": 5.431081295013428,
"learning_rate": 1.83741935483871e-05,
"loss": 0.0325,
"step": 820
},
{
"epoch": 0.4119106699751861,
"grad_norm": 1.7246816158294678,
"learning_rate": 1.835434243176179e-05,
"loss": 0.0818,
"step": 830
},
{
"epoch": 0.41687344913151364,
"grad_norm": 0.11597002297639847,
"learning_rate": 1.8334491315136478e-05,
"loss": 0.1661,
"step": 840
},
{
"epoch": 0.4218362282878412,
"grad_norm": 5.003543853759766,
"learning_rate": 1.831464019851117e-05,
"loss": 0.0775,
"step": 850
},
{
"epoch": 0.4267990074441687,
"grad_norm": 6.160369396209717,
"learning_rate": 1.8294789081885856e-05,
"loss": 0.0777,
"step": 860
},
{
"epoch": 0.4317617866004963,
"grad_norm": 0.4120528995990753,
"learning_rate": 1.8274937965260547e-05,
"loss": 0.0515,
"step": 870
},
{
"epoch": 0.43672456575682383,
"grad_norm": 0.1204703077673912,
"learning_rate": 1.8255086848635238e-05,
"loss": 0.0356,
"step": 880
},
{
"epoch": 0.44168734491315137,
"grad_norm": 0.09946464747190475,
"learning_rate": 1.8235235732009925e-05,
"loss": 0.1525,
"step": 890
},
{
"epoch": 0.4466501240694789,
"grad_norm": 1.1095144748687744,
"learning_rate": 1.8215384615384616e-05,
"loss": 0.098,
"step": 900
},
{
"epoch": 0.45161290322580644,
"grad_norm": 0.15330803394317627,
"learning_rate": 1.8195533498759307e-05,
"loss": 0.0986,
"step": 910
},
{
"epoch": 0.456575682382134,
"grad_norm": 0.231792613863945,
"learning_rate": 1.8175682382133998e-05,
"loss": 0.1464,
"step": 920
},
{
"epoch": 0.46153846153846156,
"grad_norm": 0.15597473084926605,
"learning_rate": 1.8155831265508686e-05,
"loss": 0.166,
"step": 930
},
{
"epoch": 0.4665012406947891,
"grad_norm": 1.0127346515655518,
"learning_rate": 1.8135980148883377e-05,
"loss": 0.1167,
"step": 940
},
{
"epoch": 0.47146401985111663,
"grad_norm": 2.508481740951538,
"learning_rate": 1.8116129032258064e-05,
"loss": 0.0913,
"step": 950
},
{
"epoch": 0.47642679900744417,
"grad_norm": 0.06334420293569565,
"learning_rate": 1.8096277915632755e-05,
"loss": 0.0285,
"step": 960
},
{
"epoch": 0.4813895781637717,
"grad_norm": 0.1302240639925003,
"learning_rate": 1.8076426799007446e-05,
"loss": 0.123,
"step": 970
},
{
"epoch": 0.48635235732009924,
"grad_norm": 1.0986523628234863,
"learning_rate": 1.8056575682382137e-05,
"loss": 0.0978,
"step": 980
},
{
"epoch": 0.4913151364764268,
"grad_norm": 0.08456585556268692,
"learning_rate": 1.8036724565756824e-05,
"loss": 0.253,
"step": 990
},
{
"epoch": 0.49627791563275436,
"grad_norm": 3.1052098274230957,
"learning_rate": 1.8016873449131515e-05,
"loss": 0.0929,
"step": 1000
},
{
"epoch": 0.5012406947890818,
"grad_norm": 2.032149314880371,
"learning_rate": 1.7997022332506203e-05,
"loss": 0.2004,
"step": 1010
},
{
"epoch": 0.5062034739454094,
"grad_norm": 0.11190329492092133,
"learning_rate": 1.7977171215880894e-05,
"loss": 0.0892,
"step": 1020
},
{
"epoch": 0.511166253101737,
"grad_norm": 3.9147121906280518,
"learning_rate": 1.7957320099255585e-05,
"loss": 0.1345,
"step": 1030
},
{
"epoch": 0.5161290322580645,
"grad_norm": 0.1115802675485611,
"learning_rate": 1.7937468982630276e-05,
"loss": 0.0167,
"step": 1040
},
{
"epoch": 0.5210918114143921,
"grad_norm": 0.09744033217430115,
"learning_rate": 1.7917617866004963e-05,
"loss": 0.0726,
"step": 1050
},
{
"epoch": 0.5260545905707196,
"grad_norm": 8.279594421386719,
"learning_rate": 1.7897766749379654e-05,
"loss": 0.1851,
"step": 1060
},
{
"epoch": 0.5310173697270472,
"grad_norm": 0.12151824682950974,
"learning_rate": 1.7877915632754345e-05,
"loss": 0.1491,
"step": 1070
},
{
"epoch": 0.5359801488833746,
"grad_norm": 0.8291270136833191,
"learning_rate": 1.7858064516129036e-05,
"loss": 0.0114,
"step": 1080
},
{
"epoch": 0.5409429280397022,
"grad_norm": 3.0179080963134766,
"learning_rate": 1.7838213399503723e-05,
"loss": 0.1433,
"step": 1090
},
{
"epoch": 0.5459057071960298,
"grad_norm": 3.6688222885131836,
"learning_rate": 1.7818362282878414e-05,
"loss": 0.2806,
"step": 1100
},
{
"epoch": 0.5508684863523573,
"grad_norm": 0.08318022638559341,
"learning_rate": 1.7798511166253102e-05,
"loss": 0.0321,
"step": 1110
},
{
"epoch": 0.5558312655086849,
"grad_norm": 0.16207154095172882,
"learning_rate": 1.7778660049627793e-05,
"loss": 0.0684,
"step": 1120
},
{
"epoch": 0.5607940446650124,
"grad_norm": 7.777888774871826,
"learning_rate": 1.7758808933002484e-05,
"loss": 0.1594,
"step": 1130
},
{
"epoch": 0.56575682382134,
"grad_norm": 0.07727096229791641,
"learning_rate": 1.7738957816377175e-05,
"loss": 0.0785,
"step": 1140
},
{
"epoch": 0.5707196029776674,
"grad_norm": 0.2622874975204468,
"learning_rate": 1.7719106699751862e-05,
"loss": 0.1995,
"step": 1150
},
{
"epoch": 0.575682382133995,
"grad_norm": 3.662970542907715,
"learning_rate": 1.7699255583126553e-05,
"loss": 0.1673,
"step": 1160
},
{
"epoch": 0.5806451612903226,
"grad_norm": 7.050316333770752,
"learning_rate": 1.767940446650124e-05,
"loss": 0.2565,
"step": 1170
},
{
"epoch": 0.5856079404466501,
"grad_norm": 0.09787000715732574,
"learning_rate": 1.765955334987593e-05,
"loss": 0.1664,
"step": 1180
},
{
"epoch": 0.5905707196029777,
"grad_norm": 0.08483708649873734,
"learning_rate": 1.7639702233250622e-05,
"loss": 0.1026,
"step": 1190
},
{
"epoch": 0.5955334987593052,
"grad_norm": 2.9109203815460205,
"learning_rate": 1.761985111662531e-05,
"loss": 0.1508,
"step": 1200
},
{
"epoch": 0.6004962779156328,
"grad_norm": 0.09153315424919128,
"learning_rate": 1.76e-05,
"loss": 0.0846,
"step": 1210
},
{
"epoch": 0.6054590570719603,
"grad_norm": 0.5280864834785461,
"learning_rate": 1.7580148883374692e-05,
"loss": 0.1233,
"step": 1220
},
{
"epoch": 0.6104218362282878,
"grad_norm": 0.20713864266872406,
"learning_rate": 1.7560297766749383e-05,
"loss": 0.0837,
"step": 1230
},
{
"epoch": 0.6153846153846154,
"grad_norm": 0.20489178597927094,
"learning_rate": 1.754044665012407e-05,
"loss": 0.0938,
"step": 1240
},
{
"epoch": 0.6203473945409429,
"grad_norm": 0.0775533989071846,
"learning_rate": 1.752059553349876e-05,
"loss": 0.0832,
"step": 1250
},
{
"epoch": 0.6253101736972705,
"grad_norm": 0.16079996526241302,
"learning_rate": 1.750074441687345e-05,
"loss": 0.1211,
"step": 1260
},
{
"epoch": 0.630272952853598,
"grad_norm": 0.06369619816541672,
"learning_rate": 1.748089330024814e-05,
"loss": 0.0561,
"step": 1270
},
{
"epoch": 0.6352357320099256,
"grad_norm": 2.8441195487976074,
"learning_rate": 1.746104218362283e-05,
"loss": 0.1201,
"step": 1280
},
{
"epoch": 0.6401985111662531,
"grad_norm": 6.837566375732422,
"learning_rate": 1.744119106699752e-05,
"loss": 0.1761,
"step": 1290
},
{
"epoch": 0.6451612903225806,
"grad_norm": 0.06762542575597763,
"learning_rate": 1.742133995037221e-05,
"loss": 0.1454,
"step": 1300
},
{
"epoch": 0.6501240694789082,
"grad_norm": 0.10768511891365051,
"learning_rate": 1.74014888337469e-05,
"loss": 0.1109,
"step": 1310
},
{
"epoch": 0.6550868486352357,
"grad_norm": 0.14378197491168976,
"learning_rate": 1.7381637717121587e-05,
"loss": 0.0964,
"step": 1320
},
{
"epoch": 0.6600496277915633,
"grad_norm": 5.744873523712158,
"learning_rate": 1.736178660049628e-05,
"loss": 0.1388,
"step": 1330
},
{
"epoch": 0.6650124069478908,
"grad_norm": 1.2244545221328735,
"learning_rate": 1.734193548387097e-05,
"loss": 0.0129,
"step": 1340
},
{
"epoch": 0.6699751861042184,
"grad_norm": 6.384188175201416,
"learning_rate": 1.732208436724566e-05,
"loss": 0.3897,
"step": 1350
},
{
"epoch": 0.674937965260546,
"grad_norm": 11.079994201660156,
"learning_rate": 1.7302233250620348e-05,
"loss": 0.1177,
"step": 1360
},
{
"epoch": 0.6799007444168734,
"grad_norm": 0.9940871000289917,
"learning_rate": 1.728238213399504e-05,
"loss": 0.0522,
"step": 1370
},
{
"epoch": 0.684863523573201,
"grad_norm": 4.725884914398193,
"learning_rate": 1.726253101736973e-05,
"loss": 0.2414,
"step": 1380
},
{
"epoch": 0.6898263027295285,
"grad_norm": 8.218222618103027,
"learning_rate": 1.724267990074442e-05,
"loss": 0.0943,
"step": 1390
},
{
"epoch": 0.6947890818858561,
"grad_norm": 0.0562286414206028,
"learning_rate": 1.7222828784119108e-05,
"loss": 0.0122,
"step": 1400
},
{
"epoch": 0.6997518610421837,
"grad_norm": 0.0633021891117096,
"learning_rate": 1.72029776674938e-05,
"loss": 0.0148,
"step": 1410
},
{
"epoch": 0.7047146401985112,
"grad_norm": 0.23889249563217163,
"learning_rate": 1.7183126550868486e-05,
"loss": 0.0958,
"step": 1420
},
{
"epoch": 0.7096774193548387,
"grad_norm": 3.1089024543762207,
"learning_rate": 1.7163275434243177e-05,
"loss": 0.2354,
"step": 1430
},
{
"epoch": 0.7146401985111662,
"grad_norm": 0.04075319692492485,
"learning_rate": 1.7143424317617868e-05,
"loss": 0.1398,
"step": 1440
},
{
"epoch": 0.7196029776674938,
"grad_norm": 0.2210625857114792,
"learning_rate": 1.712357320099256e-05,
"loss": 0.0855,
"step": 1450
},
{
"epoch": 0.7245657568238213,
"grad_norm": 6.384485244750977,
"learning_rate": 1.7103722084367247e-05,
"loss": 0.2229,
"step": 1460
},
{
"epoch": 0.7295285359801489,
"grad_norm": 0.10448583960533142,
"learning_rate": 1.7083870967741938e-05,
"loss": 0.0054,
"step": 1470
},
{
"epoch": 0.7344913151364765,
"grad_norm": 0.06005546450614929,
"learning_rate": 1.7064019851116625e-05,
"loss": 0.0955,
"step": 1480
},
{
"epoch": 0.739454094292804,
"grad_norm": 0.09095001220703125,
"learning_rate": 1.7044168734491316e-05,
"loss": 0.1075,
"step": 1490
},
{
"epoch": 0.7444168734491315,
"grad_norm": 0.05384785681962967,
"learning_rate": 1.7024317617866007e-05,
"loss": 0.1336,
"step": 1500
},
{
"epoch": 0.749379652605459,
"grad_norm": 3.4298574924468994,
"learning_rate": 1.7004466501240694e-05,
"loss": 0.0192,
"step": 1510
},
{
"epoch": 0.7543424317617866,
"grad_norm": 0.11529748886823654,
"learning_rate": 1.6984615384615385e-05,
"loss": 0.1036,
"step": 1520
},
{
"epoch": 0.7593052109181141,
"grad_norm": 13.056891441345215,
"learning_rate": 1.6964764267990076e-05,
"loss": 0.0958,
"step": 1530
},
{
"epoch": 0.7642679900744417,
"grad_norm": 0.05889654532074928,
"learning_rate": 1.6944913151364767e-05,
"loss": 0.1754,
"step": 1540
},
{
"epoch": 0.7692307692307693,
"grad_norm": 5.287586688995361,
"learning_rate": 1.6925062034739455e-05,
"loss": 0.1773,
"step": 1550
},
{
"epoch": 0.7741935483870968,
"grad_norm": 4.1205878257751465,
"learning_rate": 1.6905210918114146e-05,
"loss": 0.2493,
"step": 1560
},
{
"epoch": 0.7791563275434243,
"grad_norm": 0.19854427874088287,
"learning_rate": 1.6885359801488833e-05,
"loss": 0.1686,
"step": 1570
},
{
"epoch": 0.7841191066997518,
"grad_norm": 0.05931422859430313,
"learning_rate": 1.6865508684863524e-05,
"loss": 0.0397,
"step": 1580
},
{
"epoch": 0.7890818858560794,
"grad_norm": 0.10935617238283157,
"learning_rate": 1.6845657568238215e-05,
"loss": 0.0285,
"step": 1590
},
{
"epoch": 0.794044665012407,
"grad_norm": 0.10486113280057907,
"learning_rate": 1.6825806451612906e-05,
"loss": 0.0781,
"step": 1600
},
{
"epoch": 0.7990074441687345,
"grad_norm": 0.7170459032058716,
"learning_rate": 1.6805955334987593e-05,
"loss": 0.0829,
"step": 1610
},
{
"epoch": 0.8039702233250621,
"grad_norm": 0.3470781445503235,
"learning_rate": 1.6786104218362284e-05,
"loss": 0.053,
"step": 1620
},
{
"epoch": 0.8089330024813896,
"grad_norm": 4.012697219848633,
"learning_rate": 1.6766253101736972e-05,
"loss": 0.1086,
"step": 1630
},
{
"epoch": 0.8138957816377171,
"grad_norm": 0.302435040473938,
"learning_rate": 1.6746401985111663e-05,
"loss": 0.0111,
"step": 1640
},
{
"epoch": 0.8188585607940446,
"grad_norm": 0.06520848721265793,
"learning_rate": 1.6726550868486354e-05,
"loss": 0.0726,
"step": 1650
},
{
"epoch": 0.8238213399503722,
"grad_norm": 0.09845948964357376,
"learning_rate": 1.6706699751861045e-05,
"loss": 0.0094,
"step": 1660
},
{
"epoch": 0.8287841191066998,
"grad_norm": 0.03756513074040413,
"learning_rate": 1.6686848635235732e-05,
"loss": 0.23,
"step": 1670
},
{
"epoch": 0.8337468982630273,
"grad_norm": 0.39116331934928894,
"learning_rate": 1.6666997518610423e-05,
"loss": 0.034,
"step": 1680
},
{
"epoch": 0.8387096774193549,
"grad_norm": 0.8433384299278259,
"learning_rate": 1.6647146401985114e-05,
"loss": 0.0264,
"step": 1690
},
{
"epoch": 0.8436724565756824,
"grad_norm": 0.05700427293777466,
"learning_rate": 1.6627295285359805e-05,
"loss": 0.2487,
"step": 1700
},
{
"epoch": 0.8486352357320099,
"grad_norm": 8.650578498840332,
"learning_rate": 1.6607444168734492e-05,
"loss": 0.2098,
"step": 1710
},
{
"epoch": 0.8535980148883374,
"grad_norm": 0.07815321534872055,
"learning_rate": 1.6587593052109183e-05,
"loss": 0.0113,
"step": 1720
},
{
"epoch": 0.858560794044665,
"grad_norm": 0.11031467467546463,
"learning_rate": 1.656774193548387e-05,
"loss": 0.0744,
"step": 1730
},
{
"epoch": 0.8635235732009926,
"grad_norm": 0.7240772247314453,
"learning_rate": 1.6547890818858562e-05,
"loss": 0.1165,
"step": 1740
},
{
"epoch": 0.8684863523573201,
"grad_norm": 0.03234798088669777,
"learning_rate": 1.6528039702233253e-05,
"loss": 0.0678,
"step": 1750
},
{
"epoch": 0.8734491315136477,
"grad_norm": 0.024767836555838585,
"learning_rate": 1.6508188585607944e-05,
"loss": 0.0519,
"step": 1760
},
{
"epoch": 0.8784119106699751,
"grad_norm": 0.03122510015964508,
"learning_rate": 1.648833746898263e-05,
"loss": 0.1386,
"step": 1770
},
{
"epoch": 0.8833746898263027,
"grad_norm": 0.824370265007019,
"learning_rate": 1.6468486352357322e-05,
"loss": 0.164,
"step": 1780
},
{
"epoch": 0.8883374689826302,
"grad_norm": 5.419961452484131,
"learning_rate": 1.644863523573201e-05,
"loss": 0.1668,
"step": 1790
},
{
"epoch": 0.8933002481389578,
"grad_norm": 0.03851868212223053,
"learning_rate": 1.64287841191067e-05,
"loss": 0.0614,
"step": 1800
},
{
"epoch": 0.8982630272952854,
"grad_norm": 0.06279598921537399,
"learning_rate": 1.640893300248139e-05,
"loss": 0.0031,
"step": 1810
},
{
"epoch": 0.9032258064516129,
"grad_norm": 0.04613855481147766,
"learning_rate": 1.6389081885856082e-05,
"loss": 0.1774,
"step": 1820
},
{
"epoch": 0.9081885856079405,
"grad_norm": 0.6312947869300842,
"learning_rate": 1.636923076923077e-05,
"loss": 0.0815,
"step": 1830
},
{
"epoch": 0.913151364764268,
"grad_norm": 0.161106139421463,
"learning_rate": 1.634937965260546e-05,
"loss": 0.1977,
"step": 1840
},
{
"epoch": 0.9181141439205955,
"grad_norm": 0.03545048087835312,
"learning_rate": 1.6329528535980152e-05,
"loss": 0.0353,
"step": 1850
},
{
"epoch": 0.9230769230769231,
"grad_norm": 0.1279628425836563,
"learning_rate": 1.630967741935484e-05,
"loss": 0.0503,
"step": 1860
},
{
"epoch": 0.9280397022332506,
"grad_norm": 8.406203269958496,
"learning_rate": 1.628982630272953e-05,
"loss": 0.0463,
"step": 1870
},
{
"epoch": 0.9330024813895782,
"grad_norm": 10.595025062561035,
"learning_rate": 1.6269975186104218e-05,
"loss": 0.0173,
"step": 1880
},
{
"epoch": 0.9379652605459057,
"grad_norm": 0.7097483277320862,
"learning_rate": 1.625012406947891e-05,
"loss": 0.0596,
"step": 1890
},
{
"epoch": 0.9429280397022333,
"grad_norm": 7.379908561706543,
"learning_rate": 1.62302729528536e-05,
"loss": 0.2075,
"step": 1900
},
{
"epoch": 0.9478908188585607,
"grad_norm": 0.1548600196838379,
"learning_rate": 1.621042183622829e-05,
"loss": 0.2226,
"step": 1910
},
{
"epoch": 0.9528535980148883,
"grad_norm": 0.026123059913516045,
"learning_rate": 1.6190570719602978e-05,
"loss": 0.1507,
"step": 1920
},
{
"epoch": 0.9578163771712159,
"grad_norm": 0.08188609778881073,
"learning_rate": 1.617071960297767e-05,
"loss": 0.1326,
"step": 1930
},
{
"epoch": 0.9627791563275434,
"grad_norm": 0.3817611634731293,
"learning_rate": 1.6150868486352356e-05,
"loss": 0.2298,
"step": 1940
},
{
"epoch": 0.967741935483871,
"grad_norm": 0.048138681799173355,
"learning_rate": 1.6131017369727047e-05,
"loss": 0.0676,
"step": 1950
},
{
"epoch": 0.9727047146401985,
"grad_norm": 0.06823063641786575,
"learning_rate": 1.6111166253101738e-05,
"loss": 0.0727,
"step": 1960
},
{
"epoch": 0.9776674937965261,
"grad_norm": 0.031212667003273964,
"learning_rate": 1.609131513647643e-05,
"loss": 0.0943,
"step": 1970
},
{
"epoch": 0.9826302729528535,
"grad_norm": 7.244811534881592,
"learning_rate": 1.6071464019851117e-05,
"loss": 0.1229,
"step": 1980
},
{
"epoch": 0.9875930521091811,
"grad_norm": 0.06972146779298782,
"learning_rate": 1.6051612903225808e-05,
"loss": 0.3056,
"step": 1990
},
{
"epoch": 0.9925558312655087,
"grad_norm": 0.026395201683044434,
"learning_rate": 1.60317617866005e-05,
"loss": 0.0214,
"step": 2000
},
{
"epoch": 0.9975186104218362,
"grad_norm": 1.8401563167572021,
"learning_rate": 1.601191066997519e-05,
"loss": 0.077,
"step": 2010
}
],
"logging_steps": 10,
"max_steps": 10075,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2178492222720000.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}