{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.8201261132418904,
"eval_steps": 500,
"global_step": 3500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00520054605733602,
"grad_norm": 16.05304718017578,
"learning_rate": 1.5570934256055363e-06,
"loss": 1.3826,
"step": 10
},
{
"epoch": 0.01040109211467204,
"grad_norm": 13.638071060180664,
"learning_rate": 3.2871972318339097e-06,
"loss": 1.2516,
"step": 20
},
{
"epoch": 0.015601638172008062,
"grad_norm": 17.292724609375,
"learning_rate": 5.017301038062284e-06,
"loss": 1.1593,
"step": 30
},
{
"epoch": 0.02080218422934408,
"grad_norm": 2.1085638999938965,
"learning_rate": 6.747404844290659e-06,
"loss": 0.5496,
"step": 40
},
{
"epoch": 0.0260027302866801,
"grad_norm": 1.403855562210083,
"learning_rate": 8.477508650519032e-06,
"loss": 0.369,
"step": 50
},
{
"epoch": 0.031203276344016123,
"grad_norm": 2.6436541080474854,
"learning_rate": 1.0207612456747406e-05,
"loss": 0.2841,
"step": 60
},
{
"epoch": 0.03640382240135214,
"grad_norm": 1.9844225645065308,
"learning_rate": 1.193771626297578e-05,
"loss": 0.4041,
"step": 70
},
{
"epoch": 0.04160436845868816,
"grad_norm": 2.442768096923828,
"learning_rate": 1.3667820069204155e-05,
"loss": 0.2672,
"step": 80
},
{
"epoch": 0.046804914516024185,
"grad_norm": 2.2531392574310303,
"learning_rate": 1.5397923875432525e-05,
"loss": 0.3032,
"step": 90
},
{
"epoch": 0.0520054605733602,
"grad_norm": 1.9605236053466797,
"learning_rate": 1.7128027681660898e-05,
"loss": 0.2662,
"step": 100
},
{
"epoch": 0.05720600663069622,
"grad_norm": 2.8307950496673584,
"learning_rate": 1.8858131487889273e-05,
"loss": 0.2989,
"step": 110
},
{
"epoch": 0.062406552688032246,
"grad_norm": 2.2501089572906494,
"learning_rate": 2.058823529411765e-05,
"loss": 0.2533,
"step": 120
},
{
"epoch": 0.06760709874536826,
"grad_norm": 3.275035858154297,
"learning_rate": 2.231833910034602e-05,
"loss": 0.2911,
"step": 130
},
{
"epoch": 0.07280764480270428,
"grad_norm": 3.080817699432373,
"learning_rate": 2.4048442906574396e-05,
"loss": 0.3063,
"step": 140
},
{
"epoch": 0.07800819086004031,
"grad_norm": 5.425448894500732,
"learning_rate": 2.5778546712802772e-05,
"loss": 0.3093,
"step": 150
},
{
"epoch": 0.08320873691737632,
"grad_norm": 1.7119687795639038,
"learning_rate": 2.7508650519031144e-05,
"loss": 0.2612,
"step": 160
},
{
"epoch": 0.08840928297471234,
"grad_norm": 4.50128173828125,
"learning_rate": 2.9238754325259516e-05,
"loss": 0.2895,
"step": 170
},
{
"epoch": 0.09360982903204837,
"grad_norm": 1.802933692932129,
"learning_rate": 3.096885813148789e-05,
"loss": 0.2085,
"step": 180
},
{
"epoch": 0.09881037508938438,
"grad_norm": 2.629002571105957,
"learning_rate": 3.269896193771627e-05,
"loss": 0.2971,
"step": 190
},
{
"epoch": 0.1040109211467204,
"grad_norm": 1.411960244178772,
"learning_rate": 3.4429065743944636e-05,
"loss": 0.2985,
"step": 200
},
{
"epoch": 0.10921146720405643,
"grad_norm": 1.9607653617858887,
"learning_rate": 3.615916955017301e-05,
"loss": 0.2282,
"step": 210
},
{
"epoch": 0.11441201326139244,
"grad_norm": 3.4696173667907715,
"learning_rate": 3.788927335640138e-05,
"loss": 0.259,
"step": 220
},
{
"epoch": 0.11961255931872847,
"grad_norm": 3.1414554119110107,
"learning_rate": 3.961937716262976e-05,
"loss": 0.2353,
"step": 230
},
{
"epoch": 0.12481310537606449,
"grad_norm": 1.7068389654159546,
"learning_rate": 4.134948096885813e-05,
"loss": 0.2279,
"step": 240
},
{
"epoch": 0.13001365143340052,
"grad_norm": 2.7408318519592285,
"learning_rate": 4.307958477508651e-05,
"loss": 0.2809,
"step": 250
},
{
"epoch": 0.13521419749073652,
"grad_norm": 3.036931276321411,
"learning_rate": 4.480968858131488e-05,
"loss": 0.253,
"step": 260
},
{
"epoch": 0.14041474354807254,
"grad_norm": 2.8552465438842773,
"learning_rate": 4.653979238754326e-05,
"loss": 0.2454,
"step": 270
},
{
"epoch": 0.14561528960540857,
"grad_norm": 3.6416499614715576,
"learning_rate": 4.826989619377163e-05,
"loss": 0.244,
"step": 280
},
{
"epoch": 0.1508158356627446,
"grad_norm": 3.5004782676696777,
"learning_rate": 5e-05,
"loss": 0.2909,
"step": 290
},
{
"epoch": 0.15601638172008062,
"grad_norm": 1.2734322547912598,
"learning_rate": 4.999958918390321e-05,
"loss": 0.33,
"step": 300
},
{
"epoch": 0.16121692777741664,
"grad_norm": 5.017611026763916,
"learning_rate": 4.999835674911443e-05,
"loss": 0.2723,
"step": 310
},
{
"epoch": 0.16641747383475264,
"grad_norm": 2.2255094051361084,
"learning_rate": 4.999630273613799e-05,
"loss": 0.2993,
"step": 320
},
{
"epoch": 0.17161801989208866,
"grad_norm": 1.4226183891296387,
"learning_rate": 4.9993427212479606e-05,
"loss": 0.2749,
"step": 330
},
{
"epoch": 0.1768185659494247,
"grad_norm": 1.295336127281189,
"learning_rate": 4.998973027264419e-05,
"loss": 0.2618,
"step": 340
},
{
"epoch": 0.1820191120067607,
"grad_norm": 1.9380894899368286,
"learning_rate": 4.998521203813274e-05,
"loss": 0.2595,
"step": 350
},
{
"epoch": 0.18721965806409674,
"grad_norm": 2.218477964401245,
"learning_rate": 4.997987265743834e-05,
"loss": 0.2305,
"step": 360
},
{
"epoch": 0.19242020412143276,
"grad_norm": 2.5676722526550293,
"learning_rate": 4.9973712306041256e-05,
"loss": 0.2259,
"step": 370
},
{
"epoch": 0.19762075017876876,
"grad_norm": 1.3287098407745361,
"learning_rate": 4.996673118640323e-05,
"loss": 0.2082,
"step": 380
},
{
"epoch": 0.2028212962361048,
"grad_norm": 1.6181299686431885,
"learning_rate": 4.995892952796074e-05,
"loss": 0.2422,
"step": 390
},
{
"epoch": 0.2080218422934408,
"grad_norm": 2.0212459564208984,
"learning_rate": 4.995030758711756e-05,
"loss": 0.296,
"step": 400
},
{
"epoch": 0.21322238835077684,
"grad_norm": 1.0081758499145508,
"learning_rate": 4.994086564723626e-05,
"loss": 0.2289,
"step": 410
},
{
"epoch": 0.21842293440811286,
"grad_norm": 1.7539795637130737,
"learning_rate": 4.993060401862888e-05,
"loss": 0.2118,
"step": 420
},
{
"epoch": 0.22362348046544886,
"grad_norm": 1.8362935781478882,
"learning_rate": 4.991952303854682e-05,
"loss": 0.2198,
"step": 430
},
{
"epoch": 0.22882402652278488,
"grad_norm": 3.820204734802246,
"learning_rate": 4.9907623071169686e-05,
"loss": 0.2744,
"step": 440
},
{
"epoch": 0.2340245725801209,
"grad_norm": 2.739043951034546,
"learning_rate": 4.9894904507593316e-05,
"loss": 0.1887,
"step": 450
},
{
"epoch": 0.23922511863745693,
"grad_norm": 1.1746188402175903,
"learning_rate": 4.988136776581696e-05,
"loss": 0.2105,
"step": 460
},
{
"epoch": 0.24442566469479296,
"grad_norm": 4.921947956085205,
"learning_rate": 4.9867013290729535e-05,
"loss": 0.2306,
"step": 470
},
{
"epoch": 0.24962621075212899,
"grad_norm": 4.337555408477783,
"learning_rate": 4.9851841554095e-05,
"loss": 0.2564,
"step": 480
},
{
"epoch": 0.254826756809465,
"grad_norm": 3.977388620376587,
"learning_rate": 4.9835853054536846e-05,
"loss": 0.2106,
"step": 490
},
{
"epoch": 0.26002730286680104,
"grad_norm": 1.2592873573303223,
"learning_rate": 4.981904831752171e-05,
"loss": 0.3106,
"step": 500
},
{
"epoch": 0.26002730286680104,
"eval_loss": 0.2771838307380676,
"eval_runtime": 136.7574,
"eval_samples_per_second": 14.054,
"eval_steps_per_second": 14.054,
"step": 500
},
{
"epoch": 0.26522784892413703,
"grad_norm": 1.807676911354065,
"learning_rate": 4.98014278953421e-05,
"loss": 0.2341,
"step": 510
},
{
"epoch": 0.27042839498147303,
"grad_norm": 2.764136791229248,
"learning_rate": 4.978299236709826e-05,
"loss": 0.339,
"step": 520
},
{
"epoch": 0.2756289410388091,
"grad_norm": 1.2402966022491455,
"learning_rate": 4.9763742338679145e-05,
"loss": 0.2754,
"step": 530
},
{
"epoch": 0.2808294870961451,
"grad_norm": 1.5016759634017944,
"learning_rate": 4.974367844274248e-05,
"loss": 0.2544,
"step": 540
},
{
"epoch": 0.28603003315348113,
"grad_norm": 2.2027359008789062,
"learning_rate": 4.972280133869396e-05,
"loss": 0.232,
"step": 550
},
{
"epoch": 0.29123057921081713,
"grad_norm": 0.8741855621337891,
"learning_rate": 4.9701111712665625e-05,
"loss": 0.2665,
"step": 560
},
{
"epoch": 0.29643112526815313,
"grad_norm": 2.105534315109253,
"learning_rate": 4.9678610277493275e-05,
"loss": 0.2719,
"step": 570
},
{
"epoch": 0.3016316713254892,
"grad_norm": 2.820169687271118,
"learning_rate": 4.965529777269306e-05,
"loss": 0.2875,
"step": 580
},
{
"epoch": 0.3068322173828252,
"grad_norm": 1.968910813331604,
"learning_rate": 4.963117496443715e-05,
"loss": 0.2547,
"step": 590
},
{
"epoch": 0.31203276344016123,
"grad_norm": 1.4258973598480225,
"learning_rate": 4.960624264552858e-05,
"loss": 0.3096,
"step": 600
},
{
"epoch": 0.31723330949749723,
"grad_norm": 0.6942580342292786,
"learning_rate": 4.958050163537519e-05,
"loss": 0.2271,
"step": 610
},
{
"epoch": 0.3224338555548333,
"grad_norm": 2.4023945331573486,
"learning_rate": 4.955395277996268e-05,
"loss": 0.2973,
"step": 620
},
{
"epoch": 0.3276344016121693,
"grad_norm": 0.890560507774353,
"learning_rate": 4.9526596951826824e-05,
"loss": 0.2368,
"step": 630
},
{
"epoch": 0.3328349476695053,
"grad_norm": 1.4097232818603516,
"learning_rate": 4.949843505002477e-05,
"loss": 0.1829,
"step": 640
},
{
"epoch": 0.33803549372684133,
"grad_norm": 1.28754723072052,
"learning_rate": 4.946946800010556e-05,
"loss": 0.3505,
"step": 650
},
{
"epoch": 0.3432360397841773,
"grad_norm": 0.8762970566749573,
"learning_rate": 4.9439696754079595e-05,
"loss": 0.2356,
"step": 660
},
{
"epoch": 0.3484365858415134,
"grad_norm": 2.1406095027923584,
"learning_rate": 4.940912229038745e-05,
"loss": 0.2232,
"step": 670
},
{
"epoch": 0.3536371318988494,
"grad_norm": 1.4764164686203003,
"learning_rate": 4.937774561386768e-05,
"loss": 0.2281,
"step": 680
},
{
"epoch": 0.3588376779561854,
"grad_norm": 1.5396536588668823,
"learning_rate": 4.934556775572377e-05,
"loss": 0.2875,
"step": 690
},
{
"epoch": 0.3640382240135214,
"grad_norm": 1.0842628479003906,
"learning_rate": 4.9312589773490304e-05,
"loss": 0.2562,
"step": 700
},
{
"epoch": 0.3692387700708574,
"grad_norm": 1.8963087797164917,
"learning_rate": 4.927881275099815e-05,
"loss": 0.2413,
"step": 710
},
{
"epoch": 0.3744393161281935,
"grad_norm": 1.5899958610534668,
"learning_rate": 4.9244237798338866e-05,
"loss": 0.2979,
"step": 720
},
{
"epoch": 0.3796398621855295,
"grad_norm": 0.8220577836036682,
"learning_rate": 4.920886605182823e-05,
"loss": 0.2868,
"step": 730
},
{
"epoch": 0.38484040824286553,
"grad_norm": 1.0545523166656494,
"learning_rate": 4.917269867396886e-05,
"loss": 0.194,
"step": 740
},
{
"epoch": 0.3900409543002015,
"grad_norm": 1.3721591234207153,
"learning_rate": 4.913573685341205e-05,
"loss": 0.2109,
"step": 750
},
{
"epoch": 0.3952415003575375,
"grad_norm": 0.9382643699645996,
"learning_rate": 4.909798180491865e-05,
"loss": 0.2194,
"step": 760
},
{
"epoch": 0.4004420464148736,
"grad_norm": 0.6716025471687317,
"learning_rate": 4.9059434769319205e-05,
"loss": 0.2021,
"step": 770
},
{
"epoch": 0.4056425924722096,
"grad_norm": 2.405698537826538,
"learning_rate": 4.902009701347313e-05,
"loss": 0.2933,
"step": 780
},
{
"epoch": 0.4108431385295456,
"grad_norm": 1.7277915477752686,
"learning_rate": 4.8979969830227086e-05,
"loss": 0.2376,
"step": 790
},
{
"epoch": 0.4160436845868816,
"grad_norm": 1.790748119354248,
"learning_rate": 4.8939054538372496e-05,
"loss": 0.2227,
"step": 800
},
{
"epoch": 0.4212442306442176,
"grad_norm": 1.2813634872436523,
"learning_rate": 4.889735248260221e-05,
"loss": 0.2544,
"step": 810
},
{
"epoch": 0.4264447767015537,
"grad_norm": 0.9295778870582581,
"learning_rate": 4.8854865033466275e-05,
"loss": 0.1625,
"step": 820
},
{
"epoch": 0.43164532275888967,
"grad_norm": 1.9681141376495361,
"learning_rate": 4.881159358732694e-05,
"loss": 0.2262,
"step": 830
},
{
"epoch": 0.4368458688162257,
"grad_norm": 1.1844898462295532,
"learning_rate": 4.8767539566312734e-05,
"loss": 0.2683,
"step": 840
},
{
"epoch": 0.4420464148735617,
"grad_norm": 1.1099355220794678,
"learning_rate": 4.8722704418271745e-05,
"loss": 0.2281,
"step": 850
},
{
"epoch": 0.4472469609308977,
"grad_norm": 1.4917421340942383,
"learning_rate": 4.867708961672399e-05,
"loss": 0.3092,
"step": 860
},
{
"epoch": 0.45244750698823377,
"grad_norm": 1.1806445121765137,
"learning_rate": 4.863069666081307e-05,
"loss": 0.2272,
"step": 870
},
{
"epoch": 0.45764805304556977,
"grad_norm": 1.3496099710464478,
"learning_rate": 4.8583527075256804e-05,
"loss": 0.2299,
"step": 880
},
{
"epoch": 0.4628485991029058,
"grad_norm": 2.9580721855163574,
"learning_rate": 4.853558241029723e-05,
"loss": 0.2648,
"step": 890
},
{
"epoch": 0.4680491451602418,
"grad_norm": 0.47517985105514526,
"learning_rate": 4.848686424164953e-05,
"loss": 0.2166,
"step": 900
},
{
"epoch": 0.4732496912175779,
"grad_norm": 1.1966201066970825,
"learning_rate": 4.8437374170450344e-05,
"loss": 0.2499,
"step": 910
},
{
"epoch": 0.47845023727491387,
"grad_norm": 1.4806653261184692,
"learning_rate": 4.8387113823205096e-05,
"loss": 0.2532,
"step": 920
},
{
"epoch": 0.48365078333224987,
"grad_norm": 1.9070792198181152,
"learning_rate": 4.833608485173457e-05,
"loss": 0.2721,
"step": 930
},
{
"epoch": 0.4888513293895859,
"grad_norm": 1.1496449708938599,
"learning_rate": 4.8284288933120594e-05,
"loss": 0.2181,
"step": 940
},
{
"epoch": 0.4940518754469219,
"grad_norm": 1.1686209440231323,
"learning_rate": 4.823172776965094e-05,
"loss": 0.2084,
"step": 950
},
{
"epoch": 0.49925242150425797,
"grad_norm": 1.7963812351226807,
"learning_rate": 4.8178403088763355e-05,
"loss": 0.2436,
"step": 960
},
{
"epoch": 0.504452967561594,
"grad_norm": 1.3361034393310547,
"learning_rate": 4.812431664298883e-05,
"loss": 0.1777,
"step": 970
},
{
"epoch": 0.50965351361893,
"grad_norm": 0.7462561726570129,
"learning_rate": 4.8069470209893974e-05,
"loss": 0.2749,
"step": 980
},
{
"epoch": 0.514854059676266,
"grad_norm": 1.4435970783233643,
"learning_rate": 4.801386559202259e-05,
"loss": 0.2099,
"step": 990
},
{
"epoch": 0.5200546057336021,
"grad_norm": 1.6081739664077759,
"learning_rate": 4.795750461683644e-05,
"loss": 0.2594,
"step": 1000
},
{
"epoch": 0.5200546057336021,
"eval_loss": 0.24056576192378998,
"eval_runtime": 134.5423,
"eval_samples_per_second": 14.285,
"eval_steps_per_second": 14.285,
"step": 1000
},
{
"epoch": 0.5252551517909381,
"grad_norm": 0.9048750996589661,
"learning_rate": 4.790038913665519e-05,
"loss": 0.2459,
"step": 1010
},
{
"epoch": 0.5304556978482741,
"grad_norm": 1.2910796403884888,
"learning_rate": 4.7842521028595526e-05,
"loss": 0.2357,
"step": 1020
},
{
"epoch": 0.5356562439056101,
"grad_norm": 1.6829766035079956,
"learning_rate": 4.778390219450949e-05,
"loss": 0.2348,
"step": 1030
},
{
"epoch": 0.5408567899629461,
"grad_norm": 2.526048421859741,
"learning_rate": 4.772453456092191e-05,
"loss": 0.2503,
"step": 1040
},
{
"epoch": 0.5460573360202822,
"grad_norm": 0.8338559865951538,
"learning_rate": 4.766442007896715e-05,
"loss": 0.1851,
"step": 1050
},
{
"epoch": 0.5512578820776182,
"grad_norm": 2.0072736740112305,
"learning_rate": 4.760356072432498e-05,
"loss": 0.3063,
"step": 1060
},
{
"epoch": 0.5564584281349542,
"grad_norm": 2.7068746089935303,
"learning_rate": 4.754195849715557e-05,
"loss": 0.2264,
"step": 1070
},
{
"epoch": 0.5616589741922902,
"grad_norm": 1.7025487422943115,
"learning_rate": 4.747961542203386e-05,
"loss": 0.1975,
"step": 1080
},
{
"epoch": 0.5668595202496262,
"grad_norm": 1.6216896772384644,
"learning_rate": 4.741653354788295e-05,
"loss": 0.232,
"step": 1090
},
{
"epoch": 0.5720600663069623,
"grad_norm": 1.5931206941604614,
"learning_rate": 4.735271494790678e-05,
"loss": 0.2607,
"step": 1100
},
{
"epoch": 0.5772606123642983,
"grad_norm": 1.2996855974197388,
"learning_rate": 4.7288161719522016e-05,
"loss": 0.2148,
"step": 1110
},
{
"epoch": 0.5824611584216343,
"grad_norm": 1.3389666080474854,
"learning_rate": 4.722287598428907e-05,
"loss": 0.2831,
"step": 1120
},
{
"epoch": 0.5876617044789703,
"grad_norm": 2.0776829719543457,
"learning_rate": 4.7156859887842416e-05,
"loss": 0.3034,
"step": 1130
},
{
"epoch": 0.5928622505363063,
"grad_norm": 0.8629754781723022,
"learning_rate": 4.709011559982006e-05,
"loss": 0.2287,
"step": 1140
},
{
"epoch": 0.5980627965936424,
"grad_norm": 1.2654669284820557,
"learning_rate": 4.7022645313792235e-05,
"loss": 0.2223,
"step": 1150
},
{
"epoch": 0.6032633426509784,
"grad_norm": 1.1408824920654297,
"learning_rate": 4.695445124718931e-05,
"loss": 0.1832,
"step": 1160
},
{
"epoch": 0.6084638887083144,
"grad_norm": 1.0831233263015747,
"learning_rate": 4.6885535641228904e-05,
"loss": 0.2787,
"step": 1170
},
{
"epoch": 0.6136644347656504,
"grad_norm": 1.243690848350525,
"learning_rate": 4.6815900760842236e-05,
"loss": 0.2505,
"step": 1180
},
{
"epoch": 0.6188649808229865,
"grad_norm": 2.173030138015747,
"learning_rate": 4.674554889459968e-05,
"loss": 0.2526,
"step": 1190
},
{
"epoch": 0.6240655268803225,
"grad_norm": 1.0949965715408325,
"learning_rate": 4.667448235463557e-05,
"loss": 0.233,
"step": 1200
},
{
"epoch": 0.6292660729376585,
"grad_norm": 2.3284902572631836,
"learning_rate": 4.660270347657219e-05,
"loss": 0.2447,
"step": 1210
},
{
"epoch": 0.6344666189949945,
"grad_norm": 1.0869665145874023,
"learning_rate": 4.6530214619443037e-05,
"loss": 0.2217,
"step": 1220
},
{
"epoch": 0.6396671650523305,
"grad_norm": 1.639493465423584,
"learning_rate": 4.645701816561523e-05,
"loss": 0.2322,
"step": 1230
},
{
"epoch": 0.6448677111096666,
"grad_norm": 1.2198299169540405,
"learning_rate": 4.63831165207113e-05,
"loss": 0.1883,
"step": 1240
},
{
"epoch": 0.6500682571670026,
"grad_norm": 1.4124974012374878,
"learning_rate": 4.630851211353007e-05,
"loss": 0.2559,
"step": 1250
},
{
"epoch": 0.6552688032243386,
"grad_norm": 1.7080676555633545,
"learning_rate": 4.623320739596685e-05,
"loss": 0.2219,
"step": 1260
},
{
"epoch": 0.6604693492816746,
"grad_norm": 2.443284511566162,
"learning_rate": 4.615720484293286e-05,
"loss": 0.2324,
"step": 1270
},
{
"epoch": 0.6656698953390106,
"grad_norm": 0.6745538115501404,
"learning_rate": 4.608050695227385e-05,
"loss": 0.2877,
"step": 1280
},
{
"epoch": 0.6708704413963467,
"grad_norm": 1.1423040628433228,
"learning_rate": 4.60031162446881e-05,
"loss": 0.2469,
"step": 1290
},
{
"epoch": 0.6760709874536827,
"grad_norm": 1.5825380086898804,
"learning_rate": 4.5925035263643444e-05,
"loss": 0.2699,
"step": 1300
},
{
"epoch": 0.6812715335110187,
"grad_norm": 1.138910174369812,
"learning_rate": 4.5846266575293816e-05,
"loss": 0.2457,
"step": 1310
},
{
"epoch": 0.6864720795683547,
"grad_norm": 1.3718457221984863,
"learning_rate": 4.576681276839483e-05,
"loss": 0.2485,
"step": 1320
},
{
"epoch": 0.6916726256256907,
"grad_norm": 1.4293012619018555,
"learning_rate": 4.56866764542187e-05,
"loss": 0.2458,
"step": 1330
},
{
"epoch": 0.6968731716830268,
"grad_norm": 1.009885311126709,
"learning_rate": 4.560586026646845e-05,
"loss": 0.2077,
"step": 1340
},
{
"epoch": 0.7020737177403628,
"grad_norm": 0.6243613362312317,
"learning_rate": 4.552436686119134e-05,
"loss": 0.2204,
"step": 1350
},
{
"epoch": 0.7072742637976988,
"grad_norm": 1.6868172883987427,
"learning_rate": 4.54421989166916e-05,
"loss": 0.2372,
"step": 1360
},
{
"epoch": 0.7124748098550348,
"grad_norm": 1.7123680114746094,
"learning_rate": 4.5359359133442356e-05,
"loss": 0.2613,
"step": 1370
},
{
"epoch": 0.7176753559123707,
"grad_norm": 0.856176495552063,
"learning_rate": 4.5275850233996925e-05,
"loss": 0.2438,
"step": 1380
},
{
"epoch": 0.7228759019697069,
"grad_norm": 1.1216453313827515,
"learning_rate": 4.5191674962899314e-05,
"loss": 0.2029,
"step": 1390
},
{
"epoch": 0.7280764480270429,
"grad_norm": 1.8667545318603516,
"learning_rate": 4.510683608659403e-05,
"loss": 0.1938,
"step": 1400
},
{
"epoch": 0.7332769940843789,
"grad_norm": 1.677372932434082,
"learning_rate": 4.502133639333516e-05,
"loss": 0.2053,
"step": 1410
},
{
"epoch": 0.7384775401417148,
"grad_norm": 1.217119574546814,
"learning_rate": 4.4935178693094714e-05,
"loss": 0.1992,
"step": 1420
},
{
"epoch": 0.7436780861990508,
"grad_norm": 2.1485345363616943,
"learning_rate": 4.484836581747032e-05,
"loss": 0.2454,
"step": 1430
},
{
"epoch": 0.748878632256387,
"grad_norm": 1.3972569704055786,
"learning_rate": 4.4760900619592085e-05,
"loss": 0.1673,
"step": 1440
},
{
"epoch": 0.754079178313723,
"grad_norm": 1.4621198177337646,
"learning_rate": 4.467278597402894e-05,
"loss": 0.2137,
"step": 1450
},
{
"epoch": 0.759279724371059,
"grad_norm": 1.6665892601013184,
"learning_rate": 4.4584024776694035e-05,
"loss": 0.1556,
"step": 1460
},
{
"epoch": 0.764480270428395,
"grad_norm": 1.4974132776260376,
"learning_rate": 4.449461994474968e-05,
"loss": 0.278,
"step": 1470
},
{
"epoch": 0.7696808164857311,
"grad_norm": 0.9022512435913086,
"learning_rate": 4.440457441651139e-05,
"loss": 0.1929,
"step": 1480
},
{
"epoch": 0.774881362543067,
"grad_norm": 1.8019062280654907,
"learning_rate": 4.4313891151351375e-05,
"loss": 0.2594,
"step": 1490
},
{
"epoch": 0.780081908600403,
"grad_norm": 1.0030608177185059,
"learning_rate": 4.422257312960123e-05,
"loss": 0.1938,
"step": 1500
},
{
"epoch": 0.780081908600403,
"eval_loss": 0.2387997955083847,
"eval_runtime": 136.4254,
"eval_samples_per_second": 14.088,
"eval_steps_per_second": 14.088,
"step": 1500
},
{
"epoch": 0.785282454657739,
"grad_norm": 1.8986437320709229,
"learning_rate": 4.413062335245402e-05,
"loss": 0.2154,
"step": 1510
},
{
"epoch": 0.790483000715075,
"grad_norm": 1.5987744331359863,
"learning_rate": 4.4038044841865614e-05,
"loss": 0.2624,
"step": 1520
},
{
"epoch": 0.7956835467724112,
"grad_norm": 1.032251000404358,
"learning_rate": 4.394484064045542e-05,
"loss": 0.2311,
"step": 1530
},
{
"epoch": 0.8008840928297472,
"grad_norm": 1.9166332483291626,
"learning_rate": 4.385101381140633e-05,
"loss": 0.2384,
"step": 1540
},
{
"epoch": 0.8060846388870831,
"grad_norm": 0.6986478567123413,
"learning_rate": 4.375656743836407e-05,
"loss": 0.1841,
"step": 1550
},
{
"epoch": 0.8112851849444191,
"grad_norm": 0.631565511226654,
"learning_rate": 4.366150462533588e-05,
"loss": 0.2398,
"step": 1560
},
{
"epoch": 0.8164857310017551,
"grad_norm": 1.0940667390823364,
"learning_rate": 4.356582849658845e-05,
"loss": 0.1876,
"step": 1570
},
{
"epoch": 0.8216862770590913,
"grad_norm": 0.7327963709831238,
"learning_rate": 4.34695421965453e-05,
"loss": 0.2551,
"step": 1580
},
{
"epoch": 0.8268868231164272,
"grad_norm": 1.5531721115112305,
"learning_rate": 4.3372648889683364e-05,
"loss": 0.1719,
"step": 1590
},
{
"epoch": 0.8320873691737632,
"grad_norm": 0.8876403570175171,
"learning_rate": 4.3275151760429075e-05,
"loss": 0.2152,
"step": 1600
},
{
"epoch": 0.8372879152310992,
"grad_norm": 2.079756259918213,
"learning_rate": 4.317705401305362e-05,
"loss": 0.2369,
"step": 1610
},
{
"epoch": 0.8424884612884352,
"grad_norm": 1.2363635301589966,
"learning_rate": 4.3078358871567706e-05,
"loss": 0.2718,
"step": 1620
},
{
"epoch": 0.8476890073457714,
"grad_norm": 1.3667513132095337,
"learning_rate": 4.2979069579615564e-05,
"loss": 0.2221,
"step": 1630
},
{
"epoch": 0.8528895534031073,
"grad_norm": 1.1651591062545776,
"learning_rate": 4.2879189400368314e-05,
"loss": 0.2858,
"step": 1640
},
{
"epoch": 0.8580900994604433,
"grad_norm": 0.9213271141052246,
"learning_rate": 4.277872161641682e-05,
"loss": 0.2187,
"step": 1650
},
{
"epoch": 0.8632906455177793,
"grad_norm": 0.8052433133125305,
"learning_rate": 4.267766952966369e-05,
"loss": 0.2695,
"step": 1660
},
{
"epoch": 0.8684911915751153,
"grad_norm": 1.9036948680877686,
"learning_rate": 4.257603646121484e-05,
"loss": 0.2253,
"step": 1670
},
{
"epoch": 0.8736917376324514,
"grad_norm": 0.8116464018821716,
"learning_rate": 4.247382575127031e-05,
"loss": 0.2417,
"step": 1680
},
{
"epoch": 0.8788922836897874,
"grad_norm": 1.7750636339187622,
"learning_rate": 4.237104075901449e-05,
"loss": 0.2438,
"step": 1690
},
{
"epoch": 0.8840928297471234,
"grad_norm": 0.9960026144981384,
"learning_rate": 4.226768486250572e-05,
"loss": 0.2928,
"step": 1700
},
{
"epoch": 0.8892933758044594,
"grad_norm": 1.5663594007492065,
"learning_rate": 4.216376145856529e-05,
"loss": 0.249,
"step": 1710
},
{
"epoch": 0.8944939218617954,
"grad_norm": 2.8207902908325195,
"learning_rate": 4.205927396266577e-05,
"loss": 0.233,
"step": 1720
},
{
"epoch": 0.8996944679191315,
"grad_norm": 0.683710515499115,
"learning_rate": 4.195422580881878e-05,
"loss": 0.1886,
"step": 1730
},
{
"epoch": 0.9048950139764675,
"grad_norm": 1.2048577070236206,
"learning_rate": 4.1848620449462115e-05,
"loss": 0.205,
"step": 1740
},
{
"epoch": 0.9100955600338035,
"grad_norm": 1.833343505859375,
"learning_rate": 4.17424613553463e-05,
"loss": 0.2846,
"step": 1750
},
{
"epoch": 0.9152961060911395,
"grad_norm": 1.2163664102554321,
"learning_rate": 4.163575201542052e-05,
"loss": 0.2269,
"step": 1760
},
{
"epoch": 0.9204966521484755,
"grad_norm": 0.7797666788101196,
"learning_rate": 4.152849593671793e-05,
"loss": 0.1856,
"step": 1770
},
{
"epoch": 0.9256971982058116,
"grad_norm": 1.4620978832244873,
"learning_rate": 4.142069664424041e-05,
"loss": 0.2599,
"step": 1780
},
{
"epoch": 0.9308977442631476,
"grad_norm": 0.480034202337265,
"learning_rate": 4.1312357680842735e-05,
"loss": 0.2485,
"step": 1790
},
{
"epoch": 0.9360982903204836,
"grad_norm": 1.0644006729125977,
"learning_rate": 4.120348260711611e-05,
"loss": 0.2576,
"step": 1800
},
{
"epoch": 0.9412988363778196,
"grad_norm": 1.8595833778381348,
"learning_rate": 4.109407500127116e-05,
"loss": 0.2438,
"step": 1810
},
{
"epoch": 0.9464993824351557,
"grad_norm": 0.9909834861755371,
"learning_rate": 4.098413845902033e-05,
"loss": 0.241,
"step": 1820
},
{
"epoch": 0.9516999284924917,
"grad_norm": 1.157691478729248,
"learning_rate": 4.0873676593459725e-05,
"loss": 0.2383,
"step": 1830
},
{
"epoch": 0.9569004745498277,
"grad_norm": 1.2096604108810425,
"learning_rate": 4.076269303495033e-05,
"loss": 0.2554,
"step": 1840
},
{
"epoch": 0.9621010206071637,
"grad_norm": 0.8286678194999695,
"learning_rate": 4.065119143099874e-05,
"loss": 0.1894,
"step": 1850
},
{
"epoch": 0.9673015666644997,
"grad_norm": 0.9873716235160828,
"learning_rate": 4.053917544613723e-05,
"loss": 0.2311,
"step": 1860
},
{
"epoch": 0.9725021127218358,
"grad_norm": 0.9408676028251648,
"learning_rate": 4.042664876180341e-05,
"loss": 0.2386,
"step": 1870
},
{
"epoch": 0.9777026587791718,
"grad_norm": 0.6958754062652588,
"learning_rate": 4.031361507621911e-05,
"loss": 0.2468,
"step": 1880
},
{
"epoch": 0.9829032048365078,
"grad_norm": 0.8920957446098328,
"learning_rate": 4.0200078104268944e-05,
"loss": 0.2584,
"step": 1890
},
{
"epoch": 0.9881037508938438,
"grad_norm": 1.3254570960998535,
"learning_rate": 4.0086041577378166e-05,
"loss": 0.2755,
"step": 1900
},
{
"epoch": 0.9933042969511798,
"grad_norm": 1.2101293802261353,
"learning_rate": 3.9971509243390025e-05,
"loss": 0.2417,
"step": 1910
},
{
"epoch": 0.9985048430085159,
"grad_norm": 0.42130109667778015,
"learning_rate": 3.985648486644267e-05,
"loss": 0.1982,
"step": 1920
},
{
"epoch": 1.0036403822401352,
"grad_norm": 2.4333481788635254,
"learning_rate": 3.974097222684532e-05,
"loss": 0.2277,
"step": 1930
},
{
"epoch": 1.0088409282974713,
"grad_norm": 1.6568609476089478,
"learning_rate": 3.962497512095412e-05,
"loss": 0.1901,
"step": 1940
},
{
"epoch": 1.0140414743548072,
"grad_norm": 1.0351656675338745,
"learning_rate": 3.9508497361047334e-05,
"loss": 0.2923,
"step": 1950
},
{
"epoch": 1.0192420204121433,
"grad_norm": 0.8283625245094299,
"learning_rate": 3.939154277520006e-05,
"loss": 0.2245,
"step": 1960
},
{
"epoch": 1.0244425664694794,
"grad_norm": 0.6887472867965698,
"learning_rate": 3.92741152071584e-05,
"loss": 0.1447,
"step": 1970
},
{
"epoch": 1.0296431125268153,
"grad_norm": 2.1077232360839844,
"learning_rate": 3.915621851621318e-05,
"loss": 0.2368,
"step": 1980
},
{
"epoch": 1.0348436585841514,
"grad_norm": 0.7262524366378784,
"learning_rate": 3.903785657707307e-05,
"loss": 0.2153,
"step": 1990
},
{
"epoch": 1.0400442046414873,
"grad_norm": 0.6093840003013611,
"learning_rate": 3.8919033279737274e-05,
"loss": 0.1695,
"step": 2000
},
{
"epoch": 1.0400442046414873,
"eval_loss": 0.24628731608390808,
"eval_runtime": 134.8334,
"eval_samples_per_second": 14.255,
"eval_steps_per_second": 14.255,
"step": 2000
},
{
"epoch": 1.0452447506988234,
"grad_norm": 1.6017835140228271,
"learning_rate": 3.879975252936761e-05,
"loss": 0.202,
"step": 2010
},
{
"epoch": 1.0504452967561595,
"grad_norm": 1.7225841283798218,
"learning_rate": 3.8680018246160295e-05,
"loss": 0.1952,
"step": 2020
},
{
"epoch": 1.0556458428134954,
"grad_norm": 2.1085808277130127,
"learning_rate": 3.855983436521699e-05,
"loss": 0.2721,
"step": 2030
},
{
"epoch": 1.0608463888708315,
"grad_norm": 0.8755818605422974,
"learning_rate": 3.843920483641551e-05,
"loss": 0.2199,
"step": 2040
},
{
"epoch": 1.0660469349281674,
"grad_norm": 0.6190668344497681,
"learning_rate": 3.831813362428005e-05,
"loss": 0.1944,
"step": 2050
},
{
"epoch": 1.0712474809855035,
"grad_norm": 0.6328080296516418,
"learning_rate": 3.819662470785082e-05,
"loss": 0.2687,
"step": 2060
},
{
"epoch": 1.0764480270428396,
"grad_norm": 1.3243086338043213,
"learning_rate": 3.8074682080553335e-05,
"loss": 0.1866,
"step": 2070
},
{
"epoch": 1.0816485731001755,
"grad_norm": 1.4289870262145996,
"learning_rate": 3.795230975006712e-05,
"loss": 0.1979,
"step": 2080
},
{
"epoch": 1.0868491191575116,
"grad_norm": 1.1440227031707764,
"learning_rate": 3.782951173819403e-05,
"loss": 0.2097,
"step": 2090
},
{
"epoch": 1.0920496652148475,
"grad_norm": 0.7256899476051331,
"learning_rate": 3.7706292080726055e-05,
"loss": 0.2522,
"step": 2100
},
{
"epoch": 1.0972502112721836,
"grad_norm": 1.0164716243743896,
"learning_rate": 3.75826548273127e-05,
"loss": 0.2312,
"step": 2110
},
{
"epoch": 1.1024507573295197,
"grad_norm": 1.053582787513733,
"learning_rate": 3.7458604041327874e-05,
"loss": 0.1406,
"step": 2120
},
{
"epoch": 1.1076513033868556,
"grad_norm": 1.578212022781372,
"learning_rate": 3.733414379973635e-05,
"loss": 0.1913,
"step": 2130
},
{
"epoch": 1.1128518494441917,
"grad_norm": 1.1891608238220215,
"learning_rate": 3.720927819295979e-05,
"loss": 0.2298,
"step": 2140
},
{
"epoch": 1.1180523955015276,
"grad_norm": 0.4603135585784912,
"learning_rate": 3.708401132474228e-05,
"loss": 0.2261,
"step": 2150
},
{
"epoch": 1.1232529415588637,
"grad_norm": 2.1462292671203613,
"learning_rate": 3.695834731201548e-05,
"loss": 0.2354,
"step": 2160
},
{
"epoch": 1.1284534876161998,
"grad_norm": 1.139315128326416,
"learning_rate": 3.683229028476334e-05,
"loss": 0.1615,
"step": 2170
},
{
"epoch": 1.1336540336735357,
"grad_norm": 1.1548924446105957,
"learning_rate": 3.6705844385886334e-05,
"loss": 0.1705,
"step": 2180
},
{
"epoch": 1.1388545797308718,
"grad_norm": 1.0922483205795288,
"learning_rate": 3.6579013771065305e-05,
"loss": 0.1906,
"step": 2190
},
{
"epoch": 1.1440551257882077,
"grad_norm": 0.8926368951797485,
"learning_rate": 3.645180260862492e-05,
"loss": 0.1744,
"step": 2200
},
{
"epoch": 1.1492556718455438,
"grad_norm": 1.1546534299850464,
"learning_rate": 3.632421507939661e-05,
"loss": 0.2112,
"step": 2210
},
{
"epoch": 1.1544562179028799,
"grad_norm": 1.9052295684814453,
"learning_rate": 3.6196255376581254e-05,
"loss": 0.2351,
"step": 2220
},
{
"epoch": 1.1596567639602158,
"grad_norm": 0.9189292788505554,
"learning_rate": 3.6067927705611304e-05,
"loss": 0.2165,
"step": 2230
},
{
"epoch": 1.1648573100175519,
"grad_norm": 0.5956322550773621,
"learning_rate": 3.593923628401259e-05,
"loss": 0.2127,
"step": 2240
},
{
"epoch": 1.1700578560748878,
"grad_norm": 2.0540506839752197,
"learning_rate": 3.581018534126571e-05,
"loss": 0.2175,
"step": 2250
},
{
"epoch": 1.1752584021322239,
"grad_norm": 0.8053009510040283,
"learning_rate": 3.568077911866703e-05,
"loss": 0.2046,
"step": 2260
},
{
"epoch": 1.18045894818956,
"grad_norm": 1.437412142753601,
"learning_rate": 3.5551021869189286e-05,
"loss": 0.2297,
"step": 2270
},
{
"epoch": 1.1856594942468959,
"grad_norm": 0.7657543420791626,
"learning_rate": 3.542091785734184e-05,
"loss": 0.1784,
"step": 2280
},
{
"epoch": 1.190860040304232,
"grad_norm": 1.170629620552063,
"learning_rate": 3.529047135903045e-05,
"loss": 0.1824,
"step": 2290
},
{
"epoch": 1.1960605863615679,
"grad_norm": 1.3208539485931396,
"learning_rate": 3.5159686661416834e-05,
"loss": 0.1682,
"step": 2300
},
{
"epoch": 1.201261132418904,
"grad_norm": 0.5824002027511597,
"learning_rate": 3.502856806277773e-05,
"loss": 0.1631,
"step": 2310
},
{
"epoch": 1.20646167847624,
"grad_norm": 2.711642265319824,
"learning_rate": 3.489711987236357e-05,
"loss": 0.1973,
"step": 2320
},
{
"epoch": 1.211662224533576,
"grad_norm": 0.9232580661773682,
"learning_rate": 3.476534641025698e-05,
"loss": 0.246,
"step": 2330
},
{
"epoch": 1.216862770590912,
"grad_norm": 1.4809739589691162,
"learning_rate": 3.463325200723071e-05,
"loss": 0.2476,
"step": 2340
},
{
"epoch": 1.222063316648248,
"grad_norm": 1.0022258758544922,
"learning_rate": 3.4500841004605324e-05,
"loss": 0.1629,
"step": 2350
},
{
"epoch": 1.227263862705584,
"grad_norm": 0.6187863945960999,
"learning_rate": 3.436811775410651e-05,
"loss": 0.2049,
"step": 2360
},
{
"epoch": 1.2324644087629202,
"grad_norm": 1.0579588413238525,
"learning_rate": 3.42350866177221e-05,
"loss": 0.1923,
"step": 2370
},
{
"epoch": 1.237664954820256,
"grad_norm": 0.8715612888336182,
"learning_rate": 3.410175196755866e-05,
"loss": 0.1777,
"step": 2380
},
{
"epoch": 1.2428655008775922,
"grad_norm": 1.0652248859405518,
"learning_rate": 3.396811818569785e-05,
"loss": 0.258,
"step": 2390
},
{
"epoch": 1.248066046934928,
"grad_norm": 1.5773491859436035,
"learning_rate": 3.383418966405234e-05,
"loss": 0.2021,
"step": 2400
},
{
"epoch": 1.2532665929922642,
"grad_norm": 1.5874974727630615,
"learning_rate": 3.369997080422155e-05,
"loss": 0.2206,
"step": 2410
},
{
"epoch": 1.2584671390496003,
"grad_norm": 1.1131178140640259,
"learning_rate": 3.356546601734692e-05,
"loss": 0.2099,
"step": 2420
},
{
"epoch": 1.2636676851069362,
"grad_norm": 1.019285798072815,
"learning_rate": 3.3430679723966976e-05,
"loss": 0.2599,
"step": 2430
},
{
"epoch": 1.2688682311642723,
"grad_norm": 1.3517482280731201,
"learning_rate": 3.3295616353872026e-05,
"loss": 0.1706,
"step": 2440
},
{
"epoch": 1.2740687772216082,
"grad_norm": 1.2477843761444092,
"learning_rate": 3.3160280345958614e-05,
"loss": 0.2172,
"step": 2450
},
{
"epoch": 1.2792693232789443,
"grad_norm": 0.7591115236282349,
"learning_rate": 3.3024676148083555e-05,
"loss": 0.2201,
"step": 2460
},
{
"epoch": 1.2844698693362804,
"grad_norm": 1.461832046508789,
"learning_rate": 3.288880821691785e-05,
"loss": 0.1695,
"step": 2470
},
{
"epoch": 1.2896704153936163,
"grad_norm": 1.8396881818771362,
"learning_rate": 3.2752681017800144e-05,
"loss": 0.175,
"step": 2480
},
{
"epoch": 1.2948709614509524,
"grad_norm": 1.3018438816070557,
"learning_rate": 3.261629902459e-05,
"loss": 0.2071,
"step": 2490
},
{
"epoch": 1.3000715075082883,
"grad_norm": 1.120477557182312,
"learning_rate": 3.2479666719520886e-05,
"loss": 0.1841,
"step": 2500
},
{
"epoch": 1.3000715075082883,
"eval_loss": 0.23911671340465546,
"eval_runtime": 135.2893,
"eval_samples_per_second": 14.207,
"eval_steps_per_second": 14.207,
"step": 2500
},
{
"epoch": 1.3052720535656244,
"grad_norm": 2.349160671234131,
"learning_rate": 3.23427885930528e-05,
"loss": 0.1993,
"step": 2510
},
{
"epoch": 1.3104725996229605,
"grad_norm": 0.9985238313674927,
"learning_rate": 3.220566914372477e-05,
"loss": 0.1448,
"step": 2520
},
{
"epoch": 1.3156731456802964,
"grad_norm": 1.038683295249939,
"learning_rate": 3.2068312878006955e-05,
"loss": 0.1793,
"step": 2530
},
{
"epoch": 1.3208736917376325,
"grad_norm": 1.3996448516845703,
"learning_rate": 3.193072431015254e-05,
"loss": 0.1495,
"step": 2540
},
{
"epoch": 1.3260742377949684,
"grad_norm": 1.8597303628921509,
"learning_rate": 3.17929079620494e-05,
"loss": 0.1746,
"step": 2550
},
{
"epoch": 1.3312747838523045,
"grad_norm": 0.5454281568527222,
"learning_rate": 3.1654868363071484e-05,
"loss": 0.1633,
"step": 2560
},
{
"epoch": 1.3364753299096406,
"grad_norm": 2.386983871459961,
"learning_rate": 3.151661004992992e-05,
"loss": 0.2391,
"step": 2570
},
{
"epoch": 1.3416758759669765,
"grad_norm": 1.90854811668396,
"learning_rate": 3.137813756652395e-05,
"loss": 0.1816,
"step": 2580
},
{
"epoch": 1.3468764220243126,
"grad_norm": 0.8159545063972473,
"learning_rate": 3.12394554637916e-05,
"loss": 0.235,
"step": 2590
},
{
"epoch": 1.3520769680816485,
"grad_norm": 1.6975359916687012,
"learning_rate": 3.110056829956006e-05,
"loss": 0.1799,
"step": 2600
},
{
"epoch": 1.3572775141389846,
"grad_norm": 1.2948479652404785,
"learning_rate": 3.096148063839596e-05,
"loss": 0.1747,
"step": 2610
},
{
"epoch": 1.3624780601963207,
"grad_norm": 1.0926662683486938,
"learning_rate": 3.08221970514553e-05,
"loss": 0.1946,
"step": 2620
},
{
"epoch": 1.3676786062536566,
"grad_norm": 2.317523956298828,
"learning_rate": 3.068272211633326e-05,
"loss": 0.2677,
"step": 2630
},
{
"epoch": 1.3728791523109927,
"grad_norm": 1.379921555519104,
"learning_rate": 3.0543060416913696e-05,
"loss": 0.2897,
"step": 2640
},
{
"epoch": 1.3780796983683286,
"grad_norm": 1.2815351486206055,
"learning_rate": 3.0403216543218547e-05,
"loss": 0.205,
"step": 2650
},
{
"epoch": 1.3832802444256647,
"grad_norm": 1.7982994318008423,
"learning_rate": 3.026319509125697e-05,
"loss": 0.1774,
"step": 2660
},
{
"epoch": 1.3884807904830008,
"grad_norm": 2.2039549350738525,
"learning_rate": 3.0123000662874272e-05,
"loss": 0.1811,
"step": 2670
},
{
"epoch": 1.3936813365403367,
"grad_norm": 1.7380796670913696,
"learning_rate": 2.9982637865600683e-05,
"loss": 0.2688,
"step": 2680
},
{
"epoch": 1.3988818825976728,
"grad_norm": 0.9833778738975525,
"learning_rate": 2.9842111312499914e-05,
"loss": 0.1609,
"step": 2690
},
{
"epoch": 1.4040824286550087,
"grad_norm": 2.575516939163208,
"learning_rate": 2.9701425622017583e-05,
"loss": 0.1734,
"step": 2700
},
{
"epoch": 1.4092829747123448,
"grad_norm": 3.007417678833008,
"learning_rate": 2.9560585417829368e-05,
"loss": 0.2598,
"step": 2710
},
{
"epoch": 1.4144835207696809,
"grad_norm": 1.1851876974105835,
"learning_rate": 2.9419595328689138e-05,
"loss": 0.1271,
"step": 2720
},
{
"epoch": 1.4196840668270168,
"grad_norm": 2.1141178607940674,
"learning_rate": 2.9278459988276703e-05,
"loss": 0.1752,
"step": 2730
},
{
"epoch": 1.4248846128843529,
"grad_norm": 1.5198488235473633,
"learning_rate": 2.913718403504567e-05,
"loss": 0.2225,
"step": 2740
},
{
"epoch": 1.4300851589416887,
"grad_norm": 0.9600934386253357,
"learning_rate": 2.899577211207087e-05,
"loss": 0.2169,
"step": 2750
},
{
"epoch": 1.4352857049990249,
"grad_norm": 1.3893183469772339,
"learning_rate": 2.8854228866895855e-05,
"loss": 0.2257,
"step": 2760
},
{
"epoch": 1.440486251056361,
"grad_norm": 1.2468478679656982,
"learning_rate": 2.8712558951380097e-05,
"loss": 0.221,
"step": 2770
},
{
"epoch": 1.445686797113697,
"grad_norm": 0.7069809436798096,
"learning_rate": 2.857076702154614e-05,
"loss": 0.1912,
"step": 2780
},
{
"epoch": 1.450887343171033,
"grad_norm": 1.5114367008209229,
"learning_rate": 2.8428857737426556e-05,
"loss": 0.2006,
"step": 2790
},
{
"epoch": 1.4560878892283688,
"grad_norm": 0.9951623678207397,
"learning_rate": 2.8286835762910803e-05,
"loss": 0.1765,
"step": 2800
},
{
"epoch": 1.461288435285705,
"grad_norm": 0.7911898493766785,
"learning_rate": 2.8144705765591938e-05,
"loss": 0.1737,
"step": 2810
},
{
"epoch": 1.466488981343041,
"grad_norm": 0.7575000524520874,
"learning_rate": 2.800247241661321e-05,
"loss": 0.2185,
"step": 2820
},
{
"epoch": 1.4716895274003772,
"grad_norm": 1.342424988746643,
"learning_rate": 2.7860140390514583e-05,
"loss": 0.2083,
"step": 2830
},
{
"epoch": 1.476890073457713,
"grad_norm": 2.5245749950408936,
"learning_rate": 2.771771436507903e-05,
"loss": 0.1811,
"step": 2840
},
{
"epoch": 1.482090619515049,
"grad_norm": 2.4802660942077637,
"learning_rate": 2.757519902117886e-05,
"loss": 0.1575,
"step": 2850
},
{
"epoch": 1.487291165572385,
"grad_norm": 1.177516222000122,
"learning_rate": 2.743259904262187e-05,
"loss": 0.2133,
"step": 2860
},
{
"epoch": 1.4924917116297212,
"grad_norm": 1.1934640407562256,
"learning_rate": 2.7289919115997374e-05,
"loss": 0.23,
"step": 2870
},
{
"epoch": 1.4976922576870573,
"grad_norm": 1.5221962928771973,
"learning_rate": 2.714716393052223e-05,
"loss": 0.2154,
"step": 2880
},
{
"epoch": 1.5028928037443932,
"grad_norm": 2.0732405185699463,
"learning_rate": 2.7004338177886672e-05,
"loss": 0.1759,
"step": 2890
},
{
"epoch": 1.508093349801729,
"grad_norm": 0.8759207129478455,
"learning_rate": 2.686144655210016e-05,
"loss": 0.2008,
"step": 2900
},
{
"epoch": 1.5132938958590652,
"grad_norm": 0.9305397868156433,
"learning_rate": 2.6718493749337105e-05,
"loss": 0.1785,
"step": 2910
},
{
"epoch": 1.5184944419164013,
"grad_norm": 0.9819073677062988,
"learning_rate": 2.6575484467782486e-05,
"loss": 0.2719,
"step": 2920
},
{
"epoch": 1.5236949879737374,
"grad_norm": 2.144178628921509,
"learning_rate": 2.6432423407477496e-05,
"loss": 0.1598,
"step": 2930
},
{
"epoch": 1.5288955340310733,
"grad_norm": 2.3962485790252686,
"learning_rate": 2.6289315270165062e-05,
"loss": 0.2127,
"step": 2940
},
{
"epoch": 1.5340960800884091,
"grad_norm": 1.1640074253082275,
"learning_rate": 2.6146164759135266e-05,
"loss": 0.1784,
"step": 2950
},
{
"epoch": 1.5392966261457453,
"grad_norm": 1.0884958505630493,
"learning_rate": 2.6002976579070872e-05,
"loss": 0.1717,
"step": 2960
},
{
"epoch": 1.5444971722030814,
"grad_norm": 1.471543312072754,
"learning_rate": 2.5859755435892597e-05,
"loss": 0.1892,
"step": 2970
},
{
"epoch": 1.5496977182604175,
"grad_norm": 1.1566507816314697,
"learning_rate": 2.5716506036604542e-05,
"loss": 0.2027,
"step": 2980
},
{
"epoch": 1.5548982643177534,
"grad_norm": 1.8999615907669067,
"learning_rate": 2.557323308913942e-05,
"loss": 0.2162,
"step": 2990
},
{
"epoch": 1.5600988103750892,
"grad_norm": 1.2542750835418701,
"learning_rate": 2.542994130220388e-05,
"loss": 0.1548,
"step": 3000
},
{
"epoch": 1.5600988103750892,
"eval_loss": 0.24241599440574646,
"eval_runtime": 135.9654,
"eval_samples_per_second": 14.136,
"eval_steps_per_second": 14.136,
"step": 3000
},
{
"epoch": 1.5652993564324253,
"grad_norm": 2.8087780475616455,
"learning_rate": 2.5286635385123725e-05,
"loss": 0.24,
"step": 3010
},
{
"epoch": 1.5704999024897615,
"grad_norm": 1.2270337343215942,
"learning_rate": 2.5143320047689173e-05,
"loss": 0.1968,
"step": 3020
},
{
"epoch": 1.5757004485470976,
"grad_norm": 1.490675926208496,
"learning_rate": 2.5e-05,
"loss": 0.167,
"step": 3030
},
{
"epoch": 1.5809009946044335,
"grad_norm": 0.7937414646148682,
"learning_rate": 2.485667995231084e-05,
"loss": 0.1436,
"step": 3040
},
{
"epoch": 1.5861015406617693,
"grad_norm": 1.8276423215866089,
"learning_rate": 2.4713364614876274e-05,
"loss": 0.2169,
"step": 3050
},
{
"epoch": 1.5913020867191054,
"grad_norm": 2.1891725063323975,
"learning_rate": 2.4570058697796125e-05,
"loss": 0.2003,
"step": 3060
},
{
"epoch": 1.5965026327764416,
"grad_norm": 1.920414686203003,
"learning_rate": 2.4426766910860585e-05,
"loss": 0.224,
"step": 3070
},
{
"epoch": 1.6017031788337777,
"grad_norm": 1.974658727645874,
"learning_rate": 2.428349396339547e-05,
"loss": 0.1934,
"step": 3080
},
{
"epoch": 1.6069037248911135,
"grad_norm": 2.3854596614837646,
"learning_rate": 2.4140244564107402e-05,
"loss": 0.2128,
"step": 3090
},
{
"epoch": 1.6121042709484494,
"grad_norm": 1.476598858833313,
"learning_rate": 2.3997023420929137e-05,
"loss": 0.1819,
"step": 3100
},
{
"epoch": 1.6173048170057855,
"grad_norm": 1.3164430856704712,
"learning_rate": 2.3853835240864743e-05,
"loss": 0.222,
"step": 3110
},
{
"epoch": 1.6225053630631217,
"grad_norm": 1.467546820640564,
"learning_rate": 2.3710684729834954e-05,
"loss": 0.173,
"step": 3120
},
{
"epoch": 1.6277059091204578,
"grad_norm": 0.9425441026687622,
"learning_rate": 2.3567576592522507e-05,
"loss": 0.2174,
"step": 3130
},
{
"epoch": 1.6329064551777936,
"grad_norm": 1.062456488609314,
"learning_rate": 2.342451553221752e-05,
"loss": 0.1934,
"step": 3140
},
{
"epoch": 1.6381070012351295,
"grad_norm": 1.7149615287780762,
"learning_rate": 2.32815062506629e-05,
"loss": 0.2479,
"step": 3150
},
{
"epoch": 1.6433075472924656,
"grad_norm": 0.6858556866645813,
"learning_rate": 2.3138553447899835e-05,
"loss": 0.1825,
"step": 3160
},
{
"epoch": 1.6485080933498018,
"grad_norm": 0.9924718737602234,
"learning_rate": 2.299566182211333e-05,
"loss": 0.155,
"step": 3170
},
{
"epoch": 1.6537086394071379,
"grad_norm": 2.138089656829834,
"learning_rate": 2.2852836069477773e-05,
"loss": 0.2105,
"step": 3180
},
{
"epoch": 1.6589091854644737,
"grad_norm": 1.5541861057281494,
"learning_rate": 2.2710080884002632e-05,
"loss": 0.2087,
"step": 3190
},
{
"epoch": 1.6641097315218096,
"grad_norm": 1.846656084060669,
"learning_rate": 2.2567400957378132e-05,
"loss": 0.1669,
"step": 3200
},
{
"epoch": 1.6693102775791457,
"grad_norm": 2.2019214630126953,
"learning_rate": 2.2424800978821146e-05,
"loss": 0.1955,
"step": 3210
},
{
"epoch": 1.6745108236364818,
"grad_norm": 0.8931058645248413,
"learning_rate": 2.228228563492098e-05,
"loss": 0.1679,
"step": 3220
},
{
"epoch": 1.679711369693818,
"grad_norm": 1.5306602716445923,
"learning_rate": 2.2139859609485426e-05,
"loss": 0.1887,
"step": 3230
},
{
"epoch": 1.6849119157511538,
"grad_norm": 0.7173328399658203,
"learning_rate": 2.199752758338679e-05,
"loss": 0.1744,
"step": 3240
},
{
"epoch": 1.6901124618084897,
"grad_norm": 2.8038320541381836,
"learning_rate": 2.1855294234408068e-05,
"loss": 0.2108,
"step": 3250
},
{
"epoch": 1.6953130078658258,
"grad_norm": 1.2980599403381348,
"learning_rate": 2.1713164237089203e-05,
"loss": 0.1721,
"step": 3260
},
{
"epoch": 1.700513553923162,
"grad_norm": 1.4280049800872803,
"learning_rate": 2.1571142262573457e-05,
"loss": 0.1959,
"step": 3270
},
{
"epoch": 1.705714099980498,
"grad_norm": 2.656005382537842,
"learning_rate": 2.1429232978453862e-05,
"loss": 0.2284,
"step": 3280
},
{
"epoch": 1.710914646037834,
"grad_norm": 0.8656441569328308,
"learning_rate": 2.128744104861991e-05,
"loss": 0.2159,
"step": 3290
},
{
"epoch": 1.7161151920951698,
"grad_norm": 1.6419271230697632,
"learning_rate": 2.1145771133104157e-05,
"loss": 0.1671,
"step": 3300
},
{
"epoch": 1.721315738152506,
"grad_norm": 1.286908507347107,
"learning_rate": 2.1004227887929133e-05,
"loss": 0.1683,
"step": 3310
},
{
"epoch": 1.726516284209842,
"grad_norm": 3.205409288406372,
"learning_rate": 2.086281596495434e-05,
"loss": 0.1585,
"step": 3320
},
{
"epoch": 1.7317168302671782,
"grad_norm": 0.6113395094871521,
"learning_rate": 2.07215400117233e-05,
"loss": 0.1593,
"step": 3330
},
{
"epoch": 1.736917376324514,
"grad_norm": 1.3752492666244507,
"learning_rate": 2.0580404671310878e-05,
"loss": 0.2058,
"step": 3340
},
{
"epoch": 1.7421179223818501,
"grad_norm": 0.68391352891922,
"learning_rate": 2.0439414582170628e-05,
"loss": 0.1796,
"step": 3350
},
{
"epoch": 1.747318468439186,
"grad_norm": 1.9185495376586914,
"learning_rate": 2.0298574377982427e-05,
"loss": 0.2212,
"step": 3360
},
{
"epoch": 1.7525190144965221,
"grad_norm": 1.3910088539123535,
"learning_rate": 2.015788868750009e-05,
"loss": 0.1488,
"step": 3370
},
{
"epoch": 1.7577195605538583,
"grad_norm": 0.8257030248641968,
"learning_rate": 2.001736213439933e-05,
"loss": 0.1957,
"step": 3380
},
{
"epoch": 1.7629201066111941,
"grad_norm": 0.8184394240379333,
"learning_rate": 1.987699933712573e-05,
"loss": 0.2042,
"step": 3390
},
{
"epoch": 1.7681206526685302,
"grad_norm": 0.9625434875488281,
"learning_rate": 1.9736804908743033e-05,
"loss": 0.1953,
"step": 3400
},
{
"epoch": 1.7733211987258661,
"grad_norm": 2.588742256164551,
"learning_rate": 1.959678345678146e-05,
"loss": 0.2007,
"step": 3410
},
{
"epoch": 1.7785217447832022,
"grad_norm": 1.6495355367660522,
"learning_rate": 1.9456939583086303e-05,
"loss": 0.1823,
"step": 3420
},
{
"epoch": 1.7837222908405383,
"grad_norm": 1.325899600982666,
"learning_rate": 1.9317277883666745e-05,
"loss": 0.2144,
"step": 3430
},
{
"epoch": 1.7889228368978742,
"grad_norm": 1.2811932563781738,
"learning_rate": 1.91778029485447e-05,
"loss": 0.2244,
"step": 3440
},
{
"epoch": 1.7941233829552103,
"grad_norm": 1.6615418195724487,
"learning_rate": 1.9038519361604046e-05,
"loss": 0.1965,
"step": 3450
},
{
"epoch": 1.7993239290125462,
"grad_norm": 1.7860767841339111,
"learning_rate": 1.8899431700439946e-05,
"loss": 0.206,
"step": 3460
},
{
"epoch": 1.8045244750698823,
"grad_norm": 1.323864221572876,
"learning_rate": 1.876054453620841e-05,
"loss": 0.1507,
"step": 3470
},
{
"epoch": 1.8097250211272184,
"grad_norm": 1.264664649963379,
"learning_rate": 1.8621862433476054e-05,
"loss": 0.1847,
"step": 3480
},
{
"epoch": 1.8149255671845543,
"grad_norm": 2.377115249633789,
"learning_rate": 1.8483389950070097e-05,
"loss": 0.2117,
"step": 3490
},
{
"epoch": 1.8201261132418904,
"grad_norm": 1.387811541557312,
"learning_rate": 1.8345131636928518e-05,
"loss": 0.2048,
"step": 3500
},
{
"epoch": 1.8201261132418904,
"eval_loss": 0.2365516871213913,
"eval_runtime": 134.9588,
"eval_samples_per_second": 14.241,
"eval_steps_per_second": 14.241,
"step": 3500
}
],
"logging_steps": 10,
"max_steps": 5769,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.2327670832608051e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}