final_score0.25 / trainer_state.json
LHL3341's picture
upload
c63717c verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 3765,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007971303308090873,
"grad_norm": 2.7439849590205014,
"learning_rate": 2.387267904509284e-07,
"loss": 0.4952,
"step": 10
},
{
"epoch": 0.015942606616181746,
"grad_norm": 1.8269416958822857,
"learning_rate": 5.039787798408489e-07,
"loss": 0.4898,
"step": 20
},
{
"epoch": 0.023913909924272617,
"grad_norm": 1.373955440239158,
"learning_rate": 7.692307692307694e-07,
"loss": 0.4661,
"step": 30
},
{
"epoch": 0.03188521323236349,
"grad_norm": 0.7072675856563305,
"learning_rate": 1.0344827586206898e-06,
"loss": 0.4337,
"step": 40
},
{
"epoch": 0.03985651654045436,
"grad_norm": 0.7236913112540133,
"learning_rate": 1.29973474801061e-06,
"loss": 0.4091,
"step": 50
},
{
"epoch": 0.047827819848545235,
"grad_norm": 0.42830095771838445,
"learning_rate": 1.5649867374005307e-06,
"loss": 0.391,
"step": 60
},
{
"epoch": 0.05579912315663611,
"grad_norm": 0.24985255941378587,
"learning_rate": 1.830238726790451e-06,
"loss": 0.3755,
"step": 70
},
{
"epoch": 0.06377042646472698,
"grad_norm": 0.22053904249111797,
"learning_rate": 2.0954907161803713e-06,
"loss": 0.3646,
"step": 80
},
{
"epoch": 0.07174172977281786,
"grad_norm": 0.1871454926752817,
"learning_rate": 2.360742705570292e-06,
"loss": 0.3585,
"step": 90
},
{
"epoch": 0.07971303308090873,
"grad_norm": 0.16957059006933034,
"learning_rate": 2.625994694960212e-06,
"loss": 0.3537,
"step": 100
},
{
"epoch": 0.0876843363889996,
"grad_norm": 0.15807877454450825,
"learning_rate": 2.891246684350133e-06,
"loss": 0.3481,
"step": 110
},
{
"epoch": 0.09565563969709047,
"grad_norm": 0.1696204667980688,
"learning_rate": 3.1564986737400535e-06,
"loss": 0.3455,
"step": 120
},
{
"epoch": 0.10362694300518134,
"grad_norm": 0.18296589316110073,
"learning_rate": 3.4217506631299737e-06,
"loss": 0.3374,
"step": 130
},
{
"epoch": 0.11159824631327223,
"grad_norm": 0.1589874660304547,
"learning_rate": 3.6870026525198943e-06,
"loss": 0.3353,
"step": 140
},
{
"epoch": 0.1195695496213631,
"grad_norm": 0.16713671478912018,
"learning_rate": 3.9522546419098145e-06,
"loss": 0.3311,
"step": 150
},
{
"epoch": 0.12754085292945397,
"grad_norm": 0.19094654720327028,
"learning_rate": 4.217506631299735e-06,
"loss": 0.3298,
"step": 160
},
{
"epoch": 0.13551215623754484,
"grad_norm": 0.17173127985031086,
"learning_rate": 4.482758620689656e-06,
"loss": 0.3297,
"step": 170
},
{
"epoch": 0.1434834595456357,
"grad_norm": 0.16743742436159792,
"learning_rate": 4.748010610079576e-06,
"loss": 0.3255,
"step": 180
},
{
"epoch": 0.15145476285372658,
"grad_norm": 0.20163674324120764,
"learning_rate": 5.013262599469496e-06,
"loss": 0.325,
"step": 190
},
{
"epoch": 0.15942606616181745,
"grad_norm": 0.16037098448631043,
"learning_rate": 5.278514588859417e-06,
"loss": 0.3197,
"step": 200
},
{
"epoch": 0.16739736946990832,
"grad_norm": 0.21353712697354443,
"learning_rate": 5.5437665782493376e-06,
"loss": 0.319,
"step": 210
},
{
"epoch": 0.1753686727779992,
"grad_norm": 0.19869628289479196,
"learning_rate": 5.809018567639257e-06,
"loss": 0.3156,
"step": 220
},
{
"epoch": 0.18333997608609007,
"grad_norm": 0.1975130616223235,
"learning_rate": 6.074270557029178e-06,
"loss": 0.3172,
"step": 230
},
{
"epoch": 0.19131127939418094,
"grad_norm": 0.1902414828411451,
"learning_rate": 6.339522546419099e-06,
"loss": 0.3122,
"step": 240
},
{
"epoch": 0.1992825827022718,
"grad_norm": 0.22225170253654492,
"learning_rate": 6.6047745358090184e-06,
"loss": 0.3073,
"step": 250
},
{
"epoch": 0.20725388601036268,
"grad_norm": 0.19343451278128465,
"learning_rate": 6.87002652519894e-06,
"loss": 0.309,
"step": 260
},
{
"epoch": 0.21522518931845358,
"grad_norm": 0.21183101802781412,
"learning_rate": 7.1352785145888606e-06,
"loss": 0.3124,
"step": 270
},
{
"epoch": 0.22319649262654445,
"grad_norm": 0.26603427067291613,
"learning_rate": 7.40053050397878e-06,
"loss": 0.3059,
"step": 280
},
{
"epoch": 0.23116779593463532,
"grad_norm": 0.22261379017016011,
"learning_rate": 7.6657824933687e-06,
"loss": 0.3094,
"step": 290
},
{
"epoch": 0.2391390992427262,
"grad_norm": 0.2024824115097555,
"learning_rate": 7.93103448275862e-06,
"loss": 0.3084,
"step": 300
},
{
"epoch": 0.24711040255081707,
"grad_norm": 0.2042942185636207,
"learning_rate": 8.196286472148541e-06,
"loss": 0.3057,
"step": 310
},
{
"epoch": 0.25508170585890794,
"grad_norm": 0.18736233349181933,
"learning_rate": 8.461538461538462e-06,
"loss": 0.304,
"step": 320
},
{
"epoch": 0.2630530091669988,
"grad_norm": 0.21102640951635082,
"learning_rate": 8.726790450928383e-06,
"loss": 0.3033,
"step": 330
},
{
"epoch": 0.2710243124750897,
"grad_norm": 0.21618009137316338,
"learning_rate": 8.992042440318303e-06,
"loss": 0.3021,
"step": 340
},
{
"epoch": 0.27899561578318055,
"grad_norm": 0.19515018580578244,
"learning_rate": 9.257294429708224e-06,
"loss": 0.2978,
"step": 350
},
{
"epoch": 0.2869669190912714,
"grad_norm": 0.24140786468138617,
"learning_rate": 9.522546419098145e-06,
"loss": 0.3012,
"step": 360
},
{
"epoch": 0.2949382223993623,
"grad_norm": 0.27497175793322665,
"learning_rate": 9.787798408488064e-06,
"loss": 0.3013,
"step": 370
},
{
"epoch": 0.30290952570745316,
"grad_norm": 0.23551643286267343,
"learning_rate": 9.99999140169557e-06,
"loss": 0.3002,
"step": 380
},
{
"epoch": 0.31088082901554404,
"grad_norm": 0.24946640275348875,
"learning_rate": 9.99969046414561e-06,
"loss": 0.2955,
"step": 390
},
{
"epoch": 0.3188521323236349,
"grad_norm": 0.2424205054036822,
"learning_rate": 9.998959640946033e-06,
"loss": 0.2976,
"step": 400
},
{
"epoch": 0.3268234356317258,
"grad_norm": 0.2844168535727217,
"learning_rate": 9.997798994934812e-06,
"loss": 0.2977,
"step": 410
},
{
"epoch": 0.33479473893981665,
"grad_norm": 0.25483658348181026,
"learning_rate": 9.99620862590714e-06,
"loss": 0.2951,
"step": 420
},
{
"epoch": 0.3427660422479075,
"grad_norm": 0.23711613447438198,
"learning_rate": 9.994188670606845e-06,
"loss": 0.2952,
"step": 430
},
{
"epoch": 0.3507373455559984,
"grad_norm": 0.2028196656708933,
"learning_rate": 9.99173930271464e-06,
"loss": 0.2972,
"step": 440
},
{
"epoch": 0.35870864886408926,
"grad_norm": 0.23753065276887678,
"learning_rate": 9.988860732833183e-06,
"loss": 0.294,
"step": 450
},
{
"epoch": 0.36667995217218013,
"grad_norm": 0.21519789088821045,
"learning_rate": 9.98555320846897e-06,
"loss": 0.2949,
"step": 460
},
{
"epoch": 0.374651255480271,
"grad_norm": 0.24780874257317498,
"learning_rate": 9.981817014011066e-06,
"loss": 0.2911,
"step": 470
},
{
"epoch": 0.3826225587883619,
"grad_norm": 0.2419346606109402,
"learning_rate": 9.977652470706629e-06,
"loss": 0.2923,
"step": 480
},
{
"epoch": 0.39059386209645275,
"grad_norm": 0.22080537805937434,
"learning_rate": 9.973059936633308e-06,
"loss": 0.2908,
"step": 490
},
{
"epoch": 0.3985651654045436,
"grad_norm": 0.22915945812817434,
"learning_rate": 9.968039806668448e-06,
"loss": 0.2934,
"step": 500
},
{
"epoch": 0.4065364687126345,
"grad_norm": 0.25832882794920076,
"learning_rate": 9.96259251245514e-06,
"loss": 0.2892,
"step": 510
},
{
"epoch": 0.41450777202072536,
"grad_norm": 0.20890334095429705,
"learning_rate": 9.956718522365098e-06,
"loss": 0.29,
"step": 520
},
{
"epoch": 0.42247907532881623,
"grad_norm": 0.19222031712823726,
"learning_rate": 9.950418341458398e-06,
"loss": 0.2936,
"step": 530
},
{
"epoch": 0.43045037863690716,
"grad_norm": 0.229251945332095,
"learning_rate": 9.943692511440051e-06,
"loss": 0.2903,
"step": 540
},
{
"epoch": 0.43842168194499803,
"grad_norm": 0.19910329695969214,
"learning_rate": 9.936541610613417e-06,
"loss": 0.2882,
"step": 550
},
{
"epoch": 0.4463929852530889,
"grad_norm": 0.1980158753040197,
"learning_rate": 9.928966253830492e-06,
"loss": 0.288,
"step": 560
},
{
"epoch": 0.4543642885611798,
"grad_norm": 0.21294837161238345,
"learning_rate": 9.920967092439028e-06,
"loss": 0.2901,
"step": 570
},
{
"epoch": 0.46233559186927065,
"grad_norm": 0.2345212892254059,
"learning_rate": 9.912544814226547e-06,
"loss": 0.2889,
"step": 580
},
{
"epoch": 0.4703068951773615,
"grad_norm": 0.2653674551152453,
"learning_rate": 9.903700143361185e-06,
"loss": 0.2884,
"step": 590
},
{
"epoch": 0.4782781984854524,
"grad_norm": 0.22315589547019074,
"learning_rate": 9.894433840329442e-06,
"loss": 0.288,
"step": 600
},
{
"epoch": 0.48624950179354326,
"grad_norm": 0.27243118100516245,
"learning_rate": 9.884746701870778e-06,
"loss": 0.2876,
"step": 610
},
{
"epoch": 0.49422080510163413,
"grad_norm": 0.2642805191794303,
"learning_rate": 9.874639560909118e-06,
"loss": 0.2855,
"step": 620
},
{
"epoch": 0.502192108409725,
"grad_norm": 0.21570422679762521,
"learning_rate": 9.864113286481237e-06,
"loss": 0.2848,
"step": 630
},
{
"epoch": 0.5101634117178159,
"grad_norm": 0.20704843013033902,
"learning_rate": 9.853168783662028e-06,
"loss": 0.2873,
"step": 640
},
{
"epoch": 0.5181347150259067,
"grad_norm": 0.18056431665581077,
"learning_rate": 9.841806993486686e-06,
"loss": 0.2839,
"step": 650
},
{
"epoch": 0.5261060183339976,
"grad_norm": 0.2105787281096281,
"learning_rate": 9.830028892869804e-06,
"loss": 0.2813,
"step": 660
},
{
"epoch": 0.5340773216420884,
"grad_norm": 0.21257758348049322,
"learning_rate": 9.81783549452136e-06,
"loss": 0.2846,
"step": 670
},
{
"epoch": 0.5420486249501794,
"grad_norm": 0.21961213318400993,
"learning_rate": 9.805227846859652e-06,
"loss": 0.2829,
"step": 680
},
{
"epoch": 0.5500199282582702,
"grad_norm": 0.25199803254623604,
"learning_rate": 9.792207033921152e-06,
"loss": 0.2883,
"step": 690
},
{
"epoch": 0.5579912315663611,
"grad_norm": 0.20221809778648672,
"learning_rate": 9.778774175267294e-06,
"loss": 0.2842,
"step": 700
},
{
"epoch": 0.565962534874452,
"grad_norm": 0.1898841298780325,
"learning_rate": 9.764930425888216e-06,
"loss": 0.282,
"step": 710
},
{
"epoch": 0.5739338381825428,
"grad_norm": 0.20117780815968267,
"learning_rate": 9.750676976103444e-06,
"loss": 0.2839,
"step": 720
},
{
"epoch": 0.5819051414906338,
"grad_norm": 0.21755712645273997,
"learning_rate": 9.736015051459551e-06,
"loss": 0.2819,
"step": 730
},
{
"epoch": 0.5898764447987246,
"grad_norm": 0.20291021122674352,
"learning_rate": 9.720945912624783e-06,
"loss": 0.2836,
"step": 740
},
{
"epoch": 0.5978477481068155,
"grad_norm": 0.20194444409505907,
"learning_rate": 9.705470855280661e-06,
"loss": 0.2833,
"step": 750
},
{
"epoch": 0.6058190514149063,
"grad_norm": 0.20304767059054613,
"learning_rate": 9.689591210010572e-06,
"loss": 0.2825,
"step": 760
},
{
"epoch": 0.6137903547229973,
"grad_norm": 0.2648940083523449,
"learning_rate": 9.673308342185366e-06,
"loss": 0.282,
"step": 770
},
{
"epoch": 0.6217616580310881,
"grad_norm": 0.21769630697418604,
"learning_rate": 9.65662365184596e-06,
"loss": 0.2774,
"step": 780
},
{
"epoch": 0.629732961339179,
"grad_norm": 0.392188226969322,
"learning_rate": 9.639538573582952e-06,
"loss": 0.2819,
"step": 790
},
{
"epoch": 0.6377042646472698,
"grad_norm": 0.20739255958270508,
"learning_rate": 9.62205457641328e-06,
"loss": 0.2814,
"step": 800
},
{
"epoch": 0.6456755679553607,
"grad_norm": 0.19797948805471874,
"learning_rate": 9.604173163653906e-06,
"loss": 0.2807,
"step": 810
},
{
"epoch": 0.6536468712634516,
"grad_norm": 0.22378074781887625,
"learning_rate": 9.58589587279256e-06,
"loss": 0.2779,
"step": 820
},
{
"epoch": 0.6616181745715425,
"grad_norm": 0.19730401928407432,
"learning_rate": 9.567224275355538e-06,
"loss": 0.2807,
"step": 830
},
{
"epoch": 0.6695894778796333,
"grad_norm": 0.19232291589840167,
"learning_rate": 9.548159976772593e-06,
"loss": 0.2803,
"step": 840
},
{
"epoch": 0.6775607811877242,
"grad_norm": 0.23171459344457662,
"learning_rate": 9.528704616238875e-06,
"loss": 0.2794,
"step": 850
},
{
"epoch": 0.685532084495815,
"grad_norm": 0.2316345630782031,
"learning_rate": 9.508859866574003e-06,
"loss": 0.2802,
"step": 860
},
{
"epoch": 0.693503387803906,
"grad_norm": 0.20634122518883097,
"learning_rate": 9.488627434078232e-06,
"loss": 0.2814,
"step": 870
},
{
"epoch": 0.7014746911119968,
"grad_norm": 0.28423540381597584,
"learning_rate": 9.468009058385735e-06,
"loss": 0.277,
"step": 880
},
{
"epoch": 0.7094459944200877,
"grad_norm": 0.19838823753708112,
"learning_rate": 9.447006512315025e-06,
"loss": 0.2775,
"step": 890
},
{
"epoch": 0.7174172977281785,
"grad_norm": 0.1988655247541702,
"learning_rate": 9.425621601716531e-06,
"loss": 0.278,
"step": 900
},
{
"epoch": 0.7253886010362695,
"grad_norm": 0.18011113329739104,
"learning_rate": 9.403856165317322e-06,
"loss": 0.2786,
"step": 910
},
{
"epoch": 0.7333599043443603,
"grad_norm": 0.1844142249739523,
"learning_rate": 9.381712074563006e-06,
"loss": 0.2785,
"step": 920
},
{
"epoch": 0.7413312076524512,
"grad_norm": 0.2148813967605786,
"learning_rate": 9.359191233456821e-06,
"loss": 0.2785,
"step": 930
},
{
"epoch": 0.749302510960542,
"grad_norm": 0.1883568911372003,
"learning_rate": 9.336295578395927e-06,
"loss": 0.2789,
"step": 940
},
{
"epoch": 0.7572738142686329,
"grad_norm": 0.2340210607240215,
"learning_rate": 9.313027078004903e-06,
"loss": 0.2789,
"step": 950
},
{
"epoch": 0.7652451175767238,
"grad_norm": 0.19429300696808027,
"learning_rate": 9.289387732966492e-06,
"loss": 0.2788,
"step": 960
},
{
"epoch": 0.7732164208848147,
"grad_norm": 0.1933521060098035,
"learning_rate": 9.265379575849561e-06,
"loss": 0.2743,
"step": 970
},
{
"epoch": 0.7811877241929055,
"grad_norm": 0.2008930847687583,
"learning_rate": 9.241004670934348e-06,
"loss": 0.2746,
"step": 980
},
{
"epoch": 0.7891590275009964,
"grad_norm": 0.20419942608243757,
"learning_rate": 9.216265114034964e-06,
"loss": 0.2761,
"step": 990
},
{
"epoch": 0.7971303308090872,
"grad_norm": 0.20885521941941515,
"learning_rate": 9.191163032319198e-06,
"loss": 0.2799,
"step": 1000
},
{
"epoch": 0.8051016341171782,
"grad_norm": 0.20033581634243633,
"learning_rate": 9.1657005841256e-06,
"loss": 0.2773,
"step": 1010
},
{
"epoch": 0.813072937425269,
"grad_norm": 0.22147195992757276,
"learning_rate": 9.139879958777931e-06,
"loss": 0.275,
"step": 1020
},
{
"epoch": 0.8210442407333599,
"grad_norm": 0.20166804562350057,
"learning_rate": 9.113703376396885e-06,
"loss": 0.2755,
"step": 1030
},
{
"epoch": 0.8290155440414507,
"grad_norm": 0.19260698449209812,
"learning_rate": 9.087173087709226e-06,
"loss": 0.2742,
"step": 1040
},
{
"epoch": 0.8369868473495417,
"grad_norm": 0.2116198417344192,
"learning_rate": 9.060291373854252e-06,
"loss": 0.2749,
"step": 1050
},
{
"epoch": 0.8449581506576325,
"grad_norm": 0.27126785391794345,
"learning_rate": 9.033060546187651e-06,
"loss": 0.2774,
"step": 1060
},
{
"epoch": 0.8529294539657234,
"grad_norm": 0.2361830324015449,
"learning_rate": 9.005482946082784e-06,
"loss": 0.2724,
"step": 1070
},
{
"epoch": 0.8609007572738143,
"grad_norm": 0.23551093980233517,
"learning_rate": 8.97756094472935e-06,
"loss": 0.2762,
"step": 1080
},
{
"epoch": 0.8688720605819051,
"grad_norm": 0.21298565207438697,
"learning_rate": 8.949296942929515e-06,
"loss": 0.2753,
"step": 1090
},
{
"epoch": 0.8768433638899961,
"grad_norm": 0.20749916709788005,
"learning_rate": 8.92069337089148e-06,
"loss": 0.278,
"step": 1100
},
{
"epoch": 0.8848146671980869,
"grad_norm": 0.17956706004286632,
"learning_rate": 8.891752688020532e-06,
"loss": 0.2775,
"step": 1110
},
{
"epoch": 0.8927859705061778,
"grad_norm": 0.1953798521876003,
"learning_rate": 8.862477382707569e-06,
"loss": 0.2741,
"step": 1120
},
{
"epoch": 0.9007572738142686,
"grad_norm": 0.2294774184802095,
"learning_rate": 8.832869972115148e-06,
"loss": 0.2736,
"step": 1130
},
{
"epoch": 0.9087285771223595,
"grad_norm": 0.18067067247270632,
"learning_rate": 8.802933001961058e-06,
"loss": 0.2737,
"step": 1140
},
{
"epoch": 0.9166998804304504,
"grad_norm": 0.2146301016081478,
"learning_rate": 8.77266904629942e-06,
"loss": 0.2733,
"step": 1150
},
{
"epoch": 0.9246711837385413,
"grad_norm": 0.18563813225207978,
"learning_rate": 8.742080707299381e-06,
"loss": 0.2734,
"step": 1160
},
{
"epoch": 0.9326424870466321,
"grad_norm": 0.19921223017975606,
"learning_rate": 8.71117061502135e-06,
"loss": 0.2762,
"step": 1170
},
{
"epoch": 0.940613790354723,
"grad_norm": 0.19733160963975976,
"learning_rate": 8.679941427190884e-06,
"loss": 0.275,
"step": 1180
},
{
"epoch": 0.9485850936628138,
"grad_norm": 0.20922845272298418,
"learning_rate": 8.64839582897015e-06,
"loss": 0.2712,
"step": 1190
},
{
"epoch": 0.9565563969709048,
"grad_norm": 0.17679207911764114,
"learning_rate": 8.616536532727062e-06,
"loss": 0.2716,
"step": 1200
},
{
"epoch": 0.9645277002789956,
"grad_norm": 0.1871171741554302,
"learning_rate": 8.584366277802057e-06,
"loss": 0.271,
"step": 1210
},
{
"epoch": 0.9724990035870865,
"grad_norm": 0.20200784391473173,
"learning_rate": 8.55188783027256e-06,
"loss": 0.2722,
"step": 1220
},
{
"epoch": 0.9804703068951773,
"grad_norm": 0.2060568121760763,
"learning_rate": 8.519103982715158e-06,
"loss": 0.2695,
"step": 1230
},
{
"epoch": 0.9884416102032683,
"grad_norm": 0.19358577088132986,
"learning_rate": 8.486017553965475e-06,
"loss": 0.2701,
"step": 1240
},
{
"epoch": 0.9964129135113591,
"grad_norm": 0.2434707357861983,
"learning_rate": 8.452631388875814e-06,
"loss": 0.2731,
"step": 1250
},
{
"epoch": 1.0039856516540455,
"grad_norm": 0.2616385036392448,
"learning_rate": 8.418948358070535e-06,
"loss": 0.2665,
"step": 1260
},
{
"epoch": 1.0119569549621363,
"grad_norm": 0.2115757131765118,
"learning_rate": 8.384971357699255e-06,
"loss": 0.2627,
"step": 1270
},
{
"epoch": 1.0199282582702272,
"grad_norm": 0.20239812309837896,
"learning_rate": 8.3507033091878e-06,
"loss": 0.2662,
"step": 1280
},
{
"epoch": 1.027899561578318,
"grad_norm": 0.20274705613036875,
"learning_rate": 8.316147158987036e-06,
"loss": 0.2637,
"step": 1290
},
{
"epoch": 1.035870864886409,
"grad_norm": 0.1854664643809721,
"learning_rate": 8.281305878319519e-06,
"loss": 0.2627,
"step": 1300
},
{
"epoch": 1.0438421681944998,
"grad_norm": 0.20204039521748993,
"learning_rate": 8.246182462924022e-06,
"loss": 0.2625,
"step": 1310
},
{
"epoch": 1.0518134715025906,
"grad_norm": 0.21618960118522135,
"learning_rate": 8.210779932797954e-06,
"loss": 0.2693,
"step": 1320
},
{
"epoch": 1.0597847748106815,
"grad_norm": 0.178361480629425,
"learning_rate": 8.175101331937692e-06,
"loss": 0.261,
"step": 1330
},
{
"epoch": 1.0677560781187725,
"grad_norm": 0.25878248216098365,
"learning_rate": 8.139149728076852e-06,
"loss": 0.2634,
"step": 1340
},
{
"epoch": 1.0757273814268633,
"grad_norm": 0.2033185860677756,
"learning_rate": 8.102928212422519e-06,
"loss": 0.2646,
"step": 1350
},
{
"epoch": 1.0836986847349541,
"grad_norm": 0.2143942468103035,
"learning_rate": 8.066439899389451e-06,
"loss": 0.264,
"step": 1360
},
{
"epoch": 1.091669988043045,
"grad_norm": 0.18331709491785228,
"learning_rate": 8.02968792633231e-06,
"loss": 0.2646,
"step": 1370
},
{
"epoch": 1.099641291351136,
"grad_norm": 0.19827283155969086,
"learning_rate": 7.99267545327588e-06,
"loss": 0.2648,
"step": 1380
},
{
"epoch": 1.1076125946592268,
"grad_norm": 0.24364988811035662,
"learning_rate": 7.955405662643384e-06,
"loss": 0.2601,
"step": 1390
},
{
"epoch": 1.1155838979673176,
"grad_norm": 0.19923690656335546,
"learning_rate": 7.917881758982838e-06,
"loss": 0.2638,
"step": 1400
},
{
"epoch": 1.1235552012754084,
"grad_norm": 0.19790386629961096,
"learning_rate": 7.880106968691516e-06,
"loss": 0.2647,
"step": 1410
},
{
"epoch": 1.1315265045834995,
"grad_norm": 0.209982262643357,
"learning_rate": 7.842084539738547e-06,
"loss": 0.2629,
"step": 1420
},
{
"epoch": 1.1394978078915903,
"grad_norm": 0.1865353794893146,
"learning_rate": 7.803817741385636e-06,
"loss": 0.2622,
"step": 1430
},
{
"epoch": 1.147469111199681,
"grad_norm": 0.18591402004381719,
"learning_rate": 7.765309863905965e-06,
"loss": 0.2638,
"step": 1440
},
{
"epoch": 1.1554404145077721,
"grad_norm": 0.22287948038133532,
"learning_rate": 7.726564218301298e-06,
"loss": 0.2658,
"step": 1450
},
{
"epoch": 1.163411717815863,
"grad_norm": 0.18704477851949775,
"learning_rate": 7.68758413601728e-06,
"loss": 0.2632,
"step": 1460
},
{
"epoch": 1.1713830211239538,
"grad_norm": 0.18630909945867757,
"learning_rate": 7.648372968656995e-06,
"loss": 0.2629,
"step": 1470
},
{
"epoch": 1.1793543244320446,
"grad_norm": 0.5386080569158398,
"learning_rate": 7.608934087692794e-06,
"loss": 0.2612,
"step": 1480
},
{
"epoch": 1.1873256277401354,
"grad_norm": 0.1746043720739328,
"learning_rate": 7.569270884176401e-06,
"loss": 0.2609,
"step": 1490
},
{
"epoch": 1.1952969310482264,
"grad_norm": 0.19541388116589287,
"learning_rate": 7.529386768447342e-06,
"loss": 0.2642,
"step": 1500
},
{
"epoch": 1.2032682343563172,
"grad_norm": 0.2015022076748336,
"learning_rate": 7.4892851698397174e-06,
"loss": 0.2638,
"step": 1510
},
{
"epoch": 1.211239537664408,
"grad_norm": 0.18974402819968988,
"learning_rate": 7.448969536387339e-06,
"loss": 0.2617,
"step": 1520
},
{
"epoch": 1.219210840972499,
"grad_norm": 0.19142664444724963,
"learning_rate": 7.408443334527257e-06,
"loss": 0.2644,
"step": 1530
},
{
"epoch": 1.22718214428059,
"grad_norm": 0.18678354662021696,
"learning_rate": 7.367710048801715e-06,
"loss": 0.26,
"step": 1540
},
{
"epoch": 1.2351534475886807,
"grad_norm": 0.20449677931821042,
"learning_rate": 7.326773181558532e-06,
"loss": 0.2593,
"step": 1550
},
{
"epoch": 1.2431247508967715,
"grad_norm": 0.17656320872206302,
"learning_rate": 7.285636252649965e-06,
"loss": 0.2629,
"step": 1560
},
{
"epoch": 1.2510960542048624,
"grad_norm": 0.18132917187985256,
"learning_rate": 7.244302799130064e-06,
"loss": 0.2632,
"step": 1570
},
{
"epoch": 1.2590673575129534,
"grad_norm": 0.19148004299294089,
"learning_rate": 7.202776374950549e-06,
"loss": 0.262,
"step": 1580
},
{
"epoch": 1.2670386608210442,
"grad_norm": 0.17407075104632905,
"learning_rate": 7.161060550655227e-06,
"loss": 0.2614,
"step": 1590
},
{
"epoch": 1.275009964129135,
"grad_norm": 0.1805208863871377,
"learning_rate": 7.119158913072996e-06,
"loss": 0.2602,
"step": 1600
},
{
"epoch": 1.282981267437226,
"grad_norm": 0.1700912662455128,
"learning_rate": 7.0770750650094335e-06,
"loss": 0.2601,
"step": 1610
},
{
"epoch": 1.2909525707453169,
"grad_norm": 0.20872692606059245,
"learning_rate": 7.03481262493702e-06,
"loss": 0.2623,
"step": 1620
},
{
"epoch": 1.2989238740534077,
"grad_norm": 0.18231553621037966,
"learning_rate": 6.992375226684016e-06,
"loss": 0.2612,
"step": 1630
},
{
"epoch": 1.3068951773614987,
"grad_norm": 0.3260893903118034,
"learning_rate": 6.949766519122021e-06,
"loss": 0.2593,
"step": 1640
},
{
"epoch": 1.3148664806695896,
"grad_norm": 0.1837719103909305,
"learning_rate": 6.906990165852218e-06,
"loss": 0.2631,
"step": 1650
},
{
"epoch": 1.3228377839776804,
"grad_norm": 0.17385746180137812,
"learning_rate": 6.864049844890389e-06,
"loss": 0.2601,
"step": 1660
},
{
"epoch": 1.3308090872857712,
"grad_norm": 0.18767118916567374,
"learning_rate": 6.820949248350653e-06,
"loss": 0.2599,
"step": 1670
},
{
"epoch": 1.338780390593862,
"grad_norm": 0.18523703670706165,
"learning_rate": 6.777692082128024e-06,
"loss": 0.2611,
"step": 1680
},
{
"epoch": 1.346751693901953,
"grad_norm": 0.17761446472146364,
"learning_rate": 6.734282065579757e-06,
"loss": 0.2596,
"step": 1690
},
{
"epoch": 1.3547229972100439,
"grad_norm": 0.20263313570108268,
"learning_rate": 6.690722931205551e-06,
"loss": 0.2579,
"step": 1700
},
{
"epoch": 1.3626943005181347,
"grad_norm": 0.19596135949066837,
"learning_rate": 6.6470184243266235e-06,
"loss": 0.2594,
"step": 1710
},
{
"epoch": 1.3706656038262257,
"grad_norm": 0.17389328177855742,
"learning_rate": 6.6031723027636775e-06,
"loss": 0.2601,
"step": 1720
},
{
"epoch": 1.3786369071343165,
"grad_norm": 0.17594519271532788,
"learning_rate": 6.559188336513794e-06,
"loss": 0.2609,
"step": 1730
},
{
"epoch": 1.3866082104424073,
"grad_norm": 0.17131298098330294,
"learning_rate": 6.515070307426279e-06,
"loss": 0.2639,
"step": 1740
},
{
"epoch": 1.3945795137504982,
"grad_norm": 0.18366655818427982,
"learning_rate": 6.470822008877482e-06,
"loss": 0.2643,
"step": 1750
},
{
"epoch": 1.402550817058589,
"grad_norm": 0.16290637917636525,
"learning_rate": 6.4264472454446535e-06,
"loss": 0.2589,
"step": 1760
},
{
"epoch": 1.41052212036668,
"grad_norm": 0.1923343830812872,
"learning_rate": 6.381949832578796e-06,
"loss": 0.2597,
"step": 1770
},
{
"epoch": 1.4184934236747708,
"grad_norm": 0.16772495714207045,
"learning_rate": 6.337333596276613e-06,
"loss": 0.2605,
"step": 1780
},
{
"epoch": 1.4264647269828616,
"grad_norm": 0.18790213961542337,
"learning_rate": 6.292602372751536e-06,
"loss": 0.2604,
"step": 1790
},
{
"epoch": 1.4344360302909527,
"grad_norm": 0.18406526632785894,
"learning_rate": 6.247760008103889e-06,
"loss": 0.2607,
"step": 1800
},
{
"epoch": 1.4424073335990435,
"grad_norm": 0.18222410408713163,
"learning_rate": 6.2028103579901725e-06,
"loss": 0.2615,
"step": 1810
},
{
"epoch": 1.4503786369071343,
"grad_norm": 0.18828145882782565,
"learning_rate": 6.157757287291557e-06,
"loss": 0.2614,
"step": 1820
},
{
"epoch": 1.4583499402152251,
"grad_norm": 0.16998195784557607,
"learning_rate": 6.112604669781572e-06,
"loss": 0.2581,
"step": 1830
},
{
"epoch": 1.466321243523316,
"grad_norm": 0.16771798799179155,
"learning_rate": 6.0673563877930244e-06,
"loss": 0.259,
"step": 1840
},
{
"epoch": 1.474292546831407,
"grad_norm": 0.17650338257347983,
"learning_rate": 6.022016331884185e-06,
"loss": 0.2611,
"step": 1850
},
{
"epoch": 1.4822638501394978,
"grad_norm": 0.1883582677936213,
"learning_rate": 5.9765884005042725e-06,
"loss": 0.2577,
"step": 1860
},
{
"epoch": 1.4902351534475886,
"grad_norm": 0.1903059355062031,
"learning_rate": 5.931076499658258e-06,
"loss": 0.2561,
"step": 1870
},
{
"epoch": 1.4982064567556796,
"grad_norm": 0.170261118553718,
"learning_rate": 5.8854845425710085e-06,
"loss": 0.2574,
"step": 1880
},
{
"epoch": 1.5061777600637705,
"grad_norm": 0.1693099595303142,
"learning_rate": 5.839816449350824e-06,
"loss": 0.2603,
"step": 1890
},
{
"epoch": 1.5141490633718613,
"grad_norm": 0.1818861367650445,
"learning_rate": 5.7940761466523795e-06,
"loss": 0.2648,
"step": 1900
},
{
"epoch": 1.5221203666799523,
"grad_norm": 0.21748921810474134,
"learning_rate": 5.748267567339093e-06,
"loss": 0.2555,
"step": 1910
},
{
"epoch": 1.530091669988043,
"grad_norm": 0.1903700684723453,
"learning_rate": 5.702394650144975e-06,
"loss": 0.2602,
"step": 1920
},
{
"epoch": 1.538062973296134,
"grad_norm": 0.17836978417633012,
"learning_rate": 5.656461339335968e-06,
"loss": 0.2577,
"step": 1930
},
{
"epoch": 1.5460342766042248,
"grad_norm": 0.1627820959396175,
"learning_rate": 5.6104715843708e-06,
"loss": 0.2611,
"step": 1940
},
{
"epoch": 1.5540055799123156,
"grad_norm": 0.15537918892349203,
"learning_rate": 5.564429339561411e-06,
"loss": 0.2592,
"step": 1950
},
{
"epoch": 1.5619768832204066,
"grad_norm": 0.1692865509471413,
"learning_rate": 5.518338563732945e-06,
"loss": 0.2557,
"step": 1960
},
{
"epoch": 1.5699481865284974,
"grad_norm": 0.18872166066900145,
"learning_rate": 5.4722032198833595e-06,
"loss": 0.2597,
"step": 1970
},
{
"epoch": 1.5779194898365883,
"grad_norm": 0.16793201436135893,
"learning_rate": 5.426027274842683e-06,
"loss": 0.2612,
"step": 1980
},
{
"epoch": 1.5858907931446793,
"grad_norm": 0.1614692177614517,
"learning_rate": 5.379814698931935e-06,
"loss": 0.257,
"step": 1990
},
{
"epoch": 1.5938620964527699,
"grad_norm": 0.17158275465453685,
"learning_rate": 5.3335694656217405e-06,
"loss": 0.2604,
"step": 2000
},
{
"epoch": 1.601833399760861,
"grad_norm": 0.16178053730940672,
"learning_rate": 5.2872955511906974e-06,
"loss": 0.258,
"step": 2010
},
{
"epoch": 1.6098047030689517,
"grad_norm": 0.17144071756850604,
"learning_rate": 5.2409969343834675e-06,
"loss": 0.2596,
"step": 2020
},
{
"epoch": 1.6177760063770426,
"grad_norm": 0.16798857484611837,
"learning_rate": 5.194677596068689e-06,
"loss": 0.2598,
"step": 2030
},
{
"epoch": 1.6257473096851336,
"grad_norm": 0.16245877571502684,
"learning_rate": 5.1483415188966855e-06,
"loss": 0.2621,
"step": 2040
},
{
"epoch": 1.6337186129932244,
"grad_norm": 0.22338180692114515,
"learning_rate": 5.101992686957028e-06,
"loss": 0.2579,
"step": 2050
},
{
"epoch": 1.6416899163013152,
"grad_norm": 0.1777225738766675,
"learning_rate": 5.055635085435972e-06,
"loss": 0.2559,
"step": 2060
},
{
"epoch": 1.6496612196094063,
"grad_norm": 0.17571885043922136,
"learning_rate": 5.009272700273804e-06,
"loss": 0.2598,
"step": 2070
},
{
"epoch": 1.6576325229174969,
"grad_norm": 0.16140144516975566,
"learning_rate": 4.962909517822125e-06,
"loss": 0.2555,
"step": 2080
},
{
"epoch": 1.665603826225588,
"grad_norm": 0.18160320235099942,
"learning_rate": 4.91654952450108e-06,
"loss": 0.2559,
"step": 2090
},
{
"epoch": 1.6735751295336787,
"grad_norm": 0.18372880830782032,
"learning_rate": 4.870196706456609e-06,
"loss": 0.262,
"step": 2100
},
{
"epoch": 1.6815464328417695,
"grad_norm": 0.16897256178696948,
"learning_rate": 4.8238550492177065e-06,
"loss": 0.2566,
"step": 2110
},
{
"epoch": 1.6895177361498606,
"grad_norm": 0.16383359612759563,
"learning_rate": 4.777528537353729e-06,
"loss": 0.258,
"step": 2120
},
{
"epoch": 1.6974890394579514,
"grad_norm": 0.16504846683572935,
"learning_rate": 4.7312211541318e-06,
"loss": 0.258,
"step": 2130
},
{
"epoch": 1.7054603427660422,
"grad_norm": 0.18461915132331697,
"learning_rate": 4.684936881174314e-06,
"loss": 0.259,
"step": 2140
},
{
"epoch": 1.7134316460741332,
"grad_norm": 0.18262043780620987,
"learning_rate": 4.638679698116588e-06,
"loss": 0.2597,
"step": 2150
},
{
"epoch": 1.721402949382224,
"grad_norm": 0.17541591405265763,
"learning_rate": 4.592453582264684e-06,
"loss": 0.2554,
"step": 2160
},
{
"epoch": 1.7293742526903149,
"grad_norm": 0.1653040886084589,
"learning_rate": 4.546262508253429e-06,
"loss": 0.2584,
"step": 2170
},
{
"epoch": 1.737345555998406,
"grad_norm": 0.1585102987285458,
"learning_rate": 4.500110447704666e-06,
"loss": 0.2593,
"step": 2180
},
{
"epoch": 1.7453168593064965,
"grad_norm": 0.16189117020291785,
"learning_rate": 4.454001368885764e-06,
"loss": 0.2568,
"step": 2190
},
{
"epoch": 1.7532881626145875,
"grad_norm": 0.1754043030670264,
"learning_rate": 4.40793923636842e-06,
"loss": 0.2557,
"step": 2200
},
{
"epoch": 1.7612594659226783,
"grad_norm": 0.1798179829068267,
"learning_rate": 4.3619280106877716e-06,
"loss": 0.2572,
"step": 2210
},
{
"epoch": 1.7692307692307692,
"grad_norm": 0.16313092179418315,
"learning_rate": 4.315971648001861e-06,
"loss": 0.2556,
"step": 2220
},
{
"epoch": 1.7772020725388602,
"grad_norm": 0.177395532354502,
"learning_rate": 4.270074099751478e-06,
"loss": 0.2542,
"step": 2230
},
{
"epoch": 1.785173375846951,
"grad_norm": 0.16407314616761093,
"learning_rate": 4.224239312320399e-06,
"loss": 0.257,
"step": 2240
},
{
"epoch": 1.7931446791550418,
"grad_norm": 0.1611929306688441,
"learning_rate": 4.178471226696073e-06,
"loss": 0.2572,
"step": 2250
},
{
"epoch": 1.8011159824631329,
"grad_norm": 0.15344135774159312,
"learning_rate": 4.132773778130766e-06,
"loss": 0.2551,
"step": 2260
},
{
"epoch": 1.8090872857712235,
"grad_norm": 0.18018747885039685,
"learning_rate": 4.087150895803192e-06,
"loss": 0.2562,
"step": 2270
},
{
"epoch": 1.8170585890793145,
"grad_norm": 0.16109719534481134,
"learning_rate": 4.041606502480684e-06,
"loss": 0.2544,
"step": 2280
},
{
"epoch": 1.8250298923874053,
"grad_norm": 0.15229386523298538,
"learning_rate": 3.996144514181891e-06,
"loss": 0.254,
"step": 2290
},
{
"epoch": 1.8330011956954961,
"grad_norm": 0.1732832205396586,
"learning_rate": 3.950768839840079e-06,
"loss": 0.2568,
"step": 2300
},
{
"epoch": 1.8409724990035872,
"grad_norm": 0.17030412533612874,
"learning_rate": 3.905483380967027e-06,
"loss": 0.2559,
"step": 2310
},
{
"epoch": 1.848943802311678,
"grad_norm": 0.1646050569595746,
"learning_rate": 3.8602920313175684e-06,
"loss": 0.2584,
"step": 2320
},
{
"epoch": 1.8569151056197688,
"grad_norm": 0.18112487950883321,
"learning_rate": 3.815198676554794e-06,
"loss": 0.2577,
"step": 2330
},
{
"epoch": 1.8648864089278598,
"grad_norm": 0.15970698627462004,
"learning_rate": 3.7702071939159535e-06,
"loss": 0.2574,
"step": 2340
},
{
"epoch": 1.8728577122359504,
"grad_norm": 0.16592497935881761,
"learning_rate": 3.7253214518790814e-06,
"loss": 0.2528,
"step": 2350
},
{
"epoch": 1.8808290155440415,
"grad_norm": 0.1907236587733275,
"learning_rate": 3.6805453098303757e-06,
"loss": 0.2592,
"step": 2360
},
{
"epoch": 1.8888003188521323,
"grad_norm": 0.1636603572774249,
"learning_rate": 3.63588261773236e-06,
"loss": 0.2602,
"step": 2370
},
{
"epoch": 1.896771622160223,
"grad_norm": 0.15180772024085912,
"learning_rate": 3.5913372157928515e-06,
"loss": 0.2563,
"step": 2380
},
{
"epoch": 1.9047429254683141,
"grad_norm": 0.1597945224171933,
"learning_rate": 3.546912934134773e-06,
"loss": 0.2549,
"step": 2390
},
{
"epoch": 1.912714228776405,
"grad_norm": 0.1548083962863071,
"learning_rate": 3.502613592466826e-06,
"loss": 0.2572,
"step": 2400
},
{
"epoch": 1.9206855320844958,
"grad_norm": 0.1481441665132999,
"learning_rate": 3.4584429997550685e-06,
"loss": 0.2575,
"step": 2410
},
{
"epoch": 1.9286568353925868,
"grad_norm": 0.15555262051404722,
"learning_rate": 3.414404953895406e-06,
"loss": 0.2552,
"step": 2420
},
{
"epoch": 1.9366281387006774,
"grad_norm": 0.15133232327459284,
"learning_rate": 3.3705032413870402e-06,
"loss": 0.2539,
"step": 2430
},
{
"epoch": 1.9445994420087684,
"grad_norm": 0.15739551231138196,
"learning_rate": 3.326741637006896e-06,
"loss": 0.2546,
"step": 2440
},
{
"epoch": 1.9525707453168593,
"grad_norm": 0.17129713309093755,
"learning_rate": 3.2831239034850593e-06,
"loss": 0.254,
"step": 2450
},
{
"epoch": 1.96054204862495,
"grad_norm": 0.17872233586244216,
"learning_rate": 3.2396537911812454e-06,
"loss": 0.2535,
"step": 2460
},
{
"epoch": 1.9685133519330411,
"grad_norm": 0.15620956109745782,
"learning_rate": 3.196335037762337e-06,
"loss": 0.2566,
"step": 2470
},
{
"epoch": 1.976484655241132,
"grad_norm": 0.1442172315199126,
"learning_rate": 3.1531713678810076e-06,
"loss": 0.2529,
"step": 2480
},
{
"epoch": 1.9844559585492227,
"grad_norm": 0.15796499558853128,
"learning_rate": 3.110166492855468e-06,
"loss": 0.2551,
"step": 2490
},
{
"epoch": 1.9924272618573138,
"grad_norm": 0.15453646599732415,
"learning_rate": 3.0673241103503572e-06,
"loss": 0.2515,
"step": 2500
},
{
"epoch": 2.0,
"grad_norm": 0.1972849604645947,
"learning_rate": 3.0246479040588077e-06,
"loss": 0.2551,
"step": 2510
},
{
"epoch": 2.007971303308091,
"grad_norm": 0.1644857365812511,
"learning_rate": 2.9821415433857174e-06,
"loss": 0.2503,
"step": 2520
},
{
"epoch": 2.0159426066161816,
"grad_norm": 0.15167361754961786,
"learning_rate": 2.939808683132238e-06,
"loss": 0.248,
"step": 2530
},
{
"epoch": 2.0239139099242727,
"grad_norm": 0.15143886743143137,
"learning_rate": 2.897652963181529e-06,
"loss": 0.2475,
"step": 2540
},
{
"epoch": 2.0318852132323637,
"grad_norm": 0.15557561304873513,
"learning_rate": 2.8556780081857966e-06,
"loss": 0.2502,
"step": 2550
},
{
"epoch": 2.0398565165404543,
"grad_norm": 0.14833684638790712,
"learning_rate": 2.813887427254626e-06,
"loss": 0.247,
"step": 2560
},
{
"epoch": 2.0478278198485453,
"grad_norm": 0.15160958104161015,
"learning_rate": 2.772284813644675e-06,
"loss": 0.2485,
"step": 2570
},
{
"epoch": 2.055799123156636,
"grad_norm": 0.1463790587181485,
"learning_rate": 2.7308737444507037e-06,
"loss": 0.2464,
"step": 2580
},
{
"epoch": 2.063770426464727,
"grad_norm": 0.14689650797158363,
"learning_rate": 2.689657780298019e-06,
"loss": 0.2491,
"step": 2590
},
{
"epoch": 2.071741729772818,
"grad_norm": 0.15966093074503412,
"learning_rate": 2.648640465036316e-06,
"loss": 0.2485,
"step": 2600
},
{
"epoch": 2.0797130330809086,
"grad_norm": 0.2092933023915233,
"learning_rate": 2.6078253254349706e-06,
"loss": 0.2468,
"step": 2610
},
{
"epoch": 2.0876843363889996,
"grad_norm": 0.15161101989978912,
"learning_rate": 2.5672158708797953e-06,
"loss": 0.2474,
"step": 2620
},
{
"epoch": 2.0956556396970907,
"grad_norm": 0.13872835429015348,
"learning_rate": 2.526815593071306e-06,
"loss": 0.2479,
"step": 2630
},
{
"epoch": 2.1036269430051813,
"grad_norm": 0.15338466332562303,
"learning_rate": 2.486627965724482e-06,
"loss": 0.2498,
"step": 2640
},
{
"epoch": 2.1115982463132723,
"grad_norm": 0.14140026219879168,
"learning_rate": 2.4466564442700974e-06,
"loss": 0.2498,
"step": 2650
},
{
"epoch": 2.119569549621363,
"grad_norm": 0.15970341507801283,
"learning_rate": 2.406904465557614e-06,
"loss": 0.2466,
"step": 2660
},
{
"epoch": 2.127540852929454,
"grad_norm": 0.1519435398581199,
"learning_rate": 2.3673754475596634e-06,
"loss": 0.2472,
"step": 2670
},
{
"epoch": 2.135512156237545,
"grad_norm": 0.14371921030590182,
"learning_rate": 2.3280727890781753e-06,
"loss": 0.2471,
"step": 2680
},
{
"epoch": 2.1434834595456356,
"grad_norm": 0.1480883729791422,
"learning_rate": 2.2889998694521257e-06,
"loss": 0.2486,
"step": 2690
},
{
"epoch": 2.1514547628537266,
"grad_norm": 0.1635322392602264,
"learning_rate": 2.2501600482669865e-06,
"loss": 0.2503,
"step": 2700
},
{
"epoch": 2.1594260661618176,
"grad_norm": 0.17232127721119667,
"learning_rate": 2.211556665065854e-06,
"loss": 0.2484,
"step": 2710
},
{
"epoch": 2.1673973694699082,
"grad_norm": 0.1480271607060395,
"learning_rate": 2.173193039062299e-06,
"loss": 0.2507,
"step": 2720
},
{
"epoch": 2.1753686727779993,
"grad_norm": 0.1602014965172441,
"learning_rate": 2.1350724688549906e-06,
"loss": 0.2514,
"step": 2730
},
{
"epoch": 2.18333997608609,
"grad_norm": 0.1414641476617348,
"learning_rate": 2.0971982321440553e-06,
"loss": 0.248,
"step": 2740
},
{
"epoch": 2.191311279394181,
"grad_norm": 0.15546919413747778,
"learning_rate": 2.0595735854492675e-06,
"loss": 0.2487,
"step": 2750
},
{
"epoch": 2.199282582702272,
"grad_norm": 0.14859695582253002,
"learning_rate": 2.0222017638300394e-06,
"loss": 0.2469,
"step": 2760
},
{
"epoch": 2.2072538860103625,
"grad_norm": 0.15297411724937537,
"learning_rate": 1.9850859806072576e-06,
"loss": 0.2449,
"step": 2770
},
{
"epoch": 2.2152251893184536,
"grad_norm": 0.3819615108389116,
"learning_rate": 1.9482294270870055e-06,
"loss": 0.2469,
"step": 2780
},
{
"epoch": 2.2231964926265446,
"grad_norm": 0.15939599378059569,
"learning_rate": 1.9116352722861596e-06,
"loss": 0.2472,
"step": 2790
},
{
"epoch": 2.231167795934635,
"grad_norm": 0.1555518437281259,
"learning_rate": 1.8753066626599086e-06,
"loss": 0.2508,
"step": 2800
},
{
"epoch": 2.2391390992427262,
"grad_norm": 0.1420441078873591,
"learning_rate": 1.839246721831215e-06,
"loss": 0.2484,
"step": 2810
},
{
"epoch": 2.247110402550817,
"grad_norm": 0.1436721984643807,
"learning_rate": 1.8034585503222441e-06,
"loss": 0.2469,
"step": 2820
},
{
"epoch": 2.255081705858908,
"grad_norm": 0.14135560041109888,
"learning_rate": 1.7679452252877622e-06,
"loss": 0.2465,
"step": 2830
},
{
"epoch": 2.263053009166999,
"grad_norm": 0.1504613140361942,
"learning_rate": 1.7327098002505681e-06,
"loss": 0.2444,
"step": 2840
},
{
"epoch": 2.2710243124750895,
"grad_norm": 0.17027410047278219,
"learning_rate": 1.6977553048389306e-06,
"loss": 0.2461,
"step": 2850
},
{
"epoch": 2.2789956157831806,
"grad_norm": 0.1715439427086149,
"learning_rate": 1.663084744526105e-06,
"loss": 0.2481,
"step": 2860
},
{
"epoch": 2.2869669190912716,
"grad_norm": 0.147378563017796,
"learning_rate": 1.6287011003719105e-06,
"loss": 0.2452,
"step": 2870
},
{
"epoch": 2.294938222399362,
"grad_norm": 0.13827676791752133,
"learning_rate": 1.5946073287664065e-06,
"loss": 0.2492,
"step": 2880
},
{
"epoch": 2.302909525707453,
"grad_norm": 0.15079331024865258,
"learning_rate": 1.5608063611757058e-06,
"loss": 0.249,
"step": 2890
},
{
"epoch": 2.3108808290155443,
"grad_norm": 0.14708063904393764,
"learning_rate": 1.5273011038899066e-06,
"loss": 0.2514,
"step": 2900
},
{
"epoch": 2.318852132323635,
"grad_norm": 0.14020201643369284,
"learning_rate": 1.4940944377732168e-06,
"loss": 0.2477,
"step": 2910
},
{
"epoch": 2.326823435631726,
"grad_norm": 0.1528037921936168,
"learning_rate": 1.4611892180162407e-06,
"loss": 0.2469,
"step": 2920
},
{
"epoch": 2.3347947389398165,
"grad_norm": 0.13915360250390427,
"learning_rate": 1.4285882738904822e-06,
"loss": 0.2468,
"step": 2930
},
{
"epoch": 2.3427660422479075,
"grad_norm": 0.14250210180453843,
"learning_rate": 1.3962944085050833e-06,
"loss": 0.248,
"step": 2940
},
{
"epoch": 2.3507373455559986,
"grad_norm": 0.15103609439825436,
"learning_rate": 1.3643103985658047e-06,
"loss": 0.2471,
"step": 2950
},
{
"epoch": 2.358708648864089,
"grad_norm": 0.1343269921202141,
"learning_rate": 1.332638994136269e-06,
"loss": 0.2446,
"step": 2960
},
{
"epoch": 2.36667995217218,
"grad_norm": 0.15449698277173599,
"learning_rate": 1.301282918401518e-06,
"loss": 0.2444,
"step": 2970
},
{
"epoch": 2.374651255480271,
"grad_norm": 0.1445514536177799,
"learning_rate": 1.270244867433853e-06,
"loss": 0.2512,
"step": 2980
},
{
"epoch": 2.382622558788362,
"grad_norm": 0.15118149213707782,
"learning_rate": 1.2395275099610272e-06,
"loss": 0.2495,
"step": 2990
},
{
"epoch": 2.390593862096453,
"grad_norm": 0.13886439394677316,
"learning_rate": 1.2091334871367838e-06,
"loss": 0.246,
"step": 3000
},
{
"epoch": 2.3985651654045435,
"grad_norm": 0.13698163877896719,
"learning_rate": 1.1790654123137552e-06,
"loss": 0.2487,
"step": 3010
},
{
"epoch": 2.4065364687126345,
"grad_norm": 0.13611193490421775,
"learning_rate": 1.1493258708187677e-06,
"loss": 0.2462,
"step": 3020
},
{
"epoch": 2.4145077720207255,
"grad_norm": 0.13752762254360268,
"learning_rate": 1.1199174197305473e-06,
"loss": 0.2464,
"step": 3030
},
{
"epoch": 2.422479075328816,
"grad_norm": 0.13534708608338716,
"learning_rate": 1.0908425876598512e-06,
"loss": 0.2471,
"step": 3040
},
{
"epoch": 2.430450378636907,
"grad_norm": 0.14072307465294032,
"learning_rate": 1.0621038745320579e-06,
"loss": 0.2507,
"step": 3050
},
{
"epoch": 2.438421681944998,
"grad_norm": 0.18149670536468343,
"learning_rate": 1.0337037513722154e-06,
"loss": 0.2468,
"step": 3060
},
{
"epoch": 2.446392985253089,
"grad_norm": 0.13777290158405223,
"learning_rate": 1.0056446600925718e-06,
"loss": 0.2467,
"step": 3070
},
{
"epoch": 2.45436428856118,
"grad_norm": 0.13677789276728541,
"learning_rate": 9.779290132826224e-07,
"loss": 0.2481,
"step": 3080
},
{
"epoch": 2.4623355918692704,
"grad_norm": 0.14535155203152114,
"learning_rate": 9.505591940016601e-07,
"loss": 0.246,
"step": 3090
},
{
"epoch": 2.4703068951773615,
"grad_norm": 0.13976318624658268,
"learning_rate": 9.235375555738824e-07,
"loss": 0.2463,
"step": 3100
},
{
"epoch": 2.4782781984854525,
"grad_norm": 0.13568166041286991,
"learning_rate": 8.968664213860417e-07,
"loss": 0.2472,
"step": 3110
},
{
"epoch": 2.486249501793543,
"grad_norm": 0.13510001777472666,
"learning_rate": 8.705480846876746e-07,
"loss": 0.2453,
"step": 3120
},
{
"epoch": 2.494220805101634,
"grad_norm": 0.14225142264009744,
"learning_rate": 8.445848083939267e-07,
"loss": 0.2466,
"step": 3130
},
{
"epoch": 2.5021921084097247,
"grad_norm": 0.13719258420894617,
"learning_rate": 8.189788248909763e-07,
"loss": 0.246,
"step": 3140
},
{
"epoch": 2.5101634117178158,
"grad_norm": 0.13873842823619842,
"learning_rate": 7.937323358440935e-07,
"loss": 0.2466,
"step": 3150
},
{
"epoch": 2.518134715025907,
"grad_norm": 0.15089712483314988,
"learning_rate": 7.688475120083349e-07,
"loss": 0.2477,
"step": 3160
},
{
"epoch": 2.526106018333998,
"grad_norm": 0.13447979411026342,
"learning_rate": 7.443264930418886e-07,
"loss": 0.2463,
"step": 3170
},
{
"epoch": 2.5340773216420884,
"grad_norm": 0.1380057513415994,
"learning_rate": 7.201713873221134e-07,
"loss": 0.2495,
"step": 3180
},
{
"epoch": 2.5420486249501795,
"grad_norm": 0.14450360624810707,
"learning_rate": 6.963842717642488e-07,
"loss": 0.2499,
"step": 3190
},
{
"epoch": 2.55001992825827,
"grad_norm": 0.2060655057691808,
"learning_rate": 6.72967191642836e-07,
"loss": 0.247,
"step": 3200
},
{
"epoch": 2.557991231566361,
"grad_norm": 0.12851695195695914,
"learning_rate": 6.499221604158623e-07,
"loss": 0.246,
"step": 3210
},
{
"epoch": 2.565962534874452,
"grad_norm": 0.16068250540689558,
"learning_rate": 6.2725115955164e-07,
"loss": 0.2457,
"step": 3220
},
{
"epoch": 2.5739338381825427,
"grad_norm": 0.1362948462489877,
"learning_rate": 6.049561383584301e-07,
"loss": 0.2485,
"step": 3230
},
{
"epoch": 2.5819051414906338,
"grad_norm": 0.14318195107122192,
"learning_rate": 5.830390138168435e-07,
"loss": 0.2457,
"step": 3240
},
{
"epoch": 2.5898764447987244,
"grad_norm": 0.15160361757805285,
"learning_rate": 5.615016704150056e-07,
"loss": 0.2437,
"step": 3250
},
{
"epoch": 2.5978477481068154,
"grad_norm": 0.14208542489599402,
"learning_rate": 5.403459599865307e-07,
"loss": 0.2456,
"step": 3260
},
{
"epoch": 2.6058190514149064,
"grad_norm": 0.1342083762390786,
"learning_rate": 5.195737015512947e-07,
"loss": 0.2449,
"step": 3270
},
{
"epoch": 2.6137903547229975,
"grad_norm": 0.12996044889937128,
"learning_rate": 4.991866811590268e-07,
"loss": 0.2482,
"step": 3280
},
{
"epoch": 2.621761658031088,
"grad_norm": 0.13276419525758598,
"learning_rate": 4.791866517357491e-07,
"loss": 0.2478,
"step": 3290
},
{
"epoch": 2.629732961339179,
"grad_norm": 0.13410510712928858,
"learning_rate": 4.5957533293304655e-07,
"loss": 0.2465,
"step": 3300
},
{
"epoch": 2.6377042646472697,
"grad_norm": 0.13836647678174321,
"learning_rate": 4.403544109802144e-07,
"loss": 0.2446,
"step": 3310
},
{
"epoch": 2.6456755679553607,
"grad_norm": 0.14173844811223874,
"learning_rate": 4.2152553853926914e-07,
"loss": 0.2467,
"step": 3320
},
{
"epoch": 2.653646871263452,
"grad_norm": 0.12963202838388832,
"learning_rate": 4.0309033456284565e-07,
"loss": 0.2461,
"step": 3330
},
{
"epoch": 2.6616181745715424,
"grad_norm": 0.12985066342865778,
"learning_rate": 3.850503841550024e-07,
"loss": 0.2441,
"step": 3340
},
{
"epoch": 2.6695894778796334,
"grad_norm": 0.13686704000194505,
"learning_rate": 3.674072384349242e-07,
"loss": 0.2494,
"step": 3350
},
{
"epoch": 2.677560781187724,
"grad_norm": 0.13198096936683557,
"learning_rate": 3.501624144035559e-07,
"loss": 0.2466,
"step": 3360
},
{
"epoch": 2.685532084495815,
"grad_norm": 0.13285862104009824,
"learning_rate": 3.333173948131663e-07,
"loss": 0.249,
"step": 3370
},
{
"epoch": 2.693503387803906,
"grad_norm": 0.13123763580997755,
"learning_rate": 3.1687362803985987e-07,
"loss": 0.2485,
"step": 3380
},
{
"epoch": 2.7014746911119967,
"grad_norm": 0.130635888923708,
"learning_rate": 3.008325279590357e-07,
"loss": 0.2464,
"step": 3390
},
{
"epoch": 2.7094459944200877,
"grad_norm": 0.12903560375175332,
"learning_rate": 2.851954738238277e-07,
"loss": 0.2464,
"step": 3400
},
{
"epoch": 2.7174172977281783,
"grad_norm": 0.13068172271012624,
"learning_rate": 2.6996381014650353e-07,
"loss": 0.2477,
"step": 3410
},
{
"epoch": 2.7253886010362693,
"grad_norm": 0.12948155434817346,
"learning_rate": 2.5513884658286745e-07,
"loss": 0.2454,
"step": 3420
},
{
"epoch": 2.7333599043443604,
"grad_norm": 0.13478087793899063,
"learning_rate": 2.407218578196524e-07,
"loss": 0.2446,
"step": 3430
},
{
"epoch": 2.7413312076524514,
"grad_norm": 0.14200513684004365,
"learning_rate": 2.267140834649123e-07,
"loss": 0.2475,
"step": 3440
},
{
"epoch": 2.749302510960542,
"grad_norm": 0.1388036591016315,
"learning_rate": 2.13116727941447e-07,
"loss": 0.2458,
"step": 3450
},
{
"epoch": 2.757273814268633,
"grad_norm": 0.13274417766749047,
"learning_rate": 1.9993096038323556e-07,
"loss": 0.2478,
"step": 3460
},
{
"epoch": 2.7652451175767236,
"grad_norm": 0.1309898112310216,
"learning_rate": 1.8715791453491562e-07,
"loss": 0.2473,
"step": 3470
},
{
"epoch": 2.7732164208848147,
"grad_norm": 0.12855050002466045,
"learning_rate": 1.7479868865430072e-07,
"loss": 0.246,
"step": 3480
},
{
"epoch": 2.7811877241929057,
"grad_norm": 0.1297939051422684,
"learning_rate": 1.6285434541794598e-07,
"loss": 0.2454,
"step": 3490
},
{
"epoch": 2.7891590275009963,
"grad_norm": 0.1360452914842712,
"learning_rate": 1.5132591182978107e-07,
"loss": 0.2478,
"step": 3500
},
{
"epoch": 2.7971303308090874,
"grad_norm": 0.1303778236512888,
"learning_rate": 1.4021437913280366e-07,
"loss": 0.2473,
"step": 3510
},
{
"epoch": 2.805101634117178,
"grad_norm": 0.12494111165590116,
"learning_rate": 1.2952070272384986e-07,
"loss": 0.2472,
"step": 3520
},
{
"epoch": 2.813072937425269,
"grad_norm": 0.13077601667810976,
"learning_rate": 1.192458020714482e-07,
"loss": 0.2447,
"step": 3530
},
{
"epoch": 2.82104424073336,
"grad_norm": 0.13202872119004214,
"learning_rate": 1.0939056063675846e-07,
"loss": 0.2467,
"step": 3540
},
{
"epoch": 2.8290155440414506,
"grad_norm": 0.1282605177841322,
"learning_rate": 9.995582579761243e-08,
"loss": 0.2479,
"step": 3550
},
{
"epoch": 2.8369868473495417,
"grad_norm": 0.12641719729667697,
"learning_rate": 9.094240877565441e-08,
"loss": 0.2473,
"step": 3560
},
{
"epoch": 2.8449581506576322,
"grad_norm": 0.133124543261481,
"learning_rate": 8.235108456658814e-08,
"loss": 0.2456,
"step": 3570
},
{
"epoch": 2.8529294539657233,
"grad_norm": 0.126398551708617,
"learning_rate": 7.418259187354227e-08,
"loss": 0.248,
"step": 3580
},
{
"epoch": 2.8609007572738143,
"grad_norm": 0.12941811506345907,
"learning_rate": 6.643763304355566e-08,
"loss": 0.2465,
"step": 3590
},
{
"epoch": 2.8688720605819054,
"grad_norm": 0.12633613236354346,
"learning_rate": 5.911687400718458e-08,
"loss": 0.2484,
"step": 3600
},
{
"epoch": 2.876843363889996,
"grad_norm": 0.13027882462021592,
"learning_rate": 5.222094422124846e-08,
"loss": 0.2478,
"step": 3610
},
{
"epoch": 2.884814667198087,
"grad_norm": 0.130436804547978,
"learning_rate": 4.57504366147038e-08,
"loss": 0.2502,
"step": 3620
},
{
"epoch": 2.8927859705061776,
"grad_norm": 0.13485396495421867,
"learning_rate": 3.970590753766712e-08,
"loss": 0.2475,
"step": 3630
},
{
"epoch": 2.9007572738142686,
"grad_norm": 0.1357506612910079,
"learning_rate": 3.408787671357494e-08,
"loss": 0.2485,
"step": 3640
},
{
"epoch": 2.9087285771223597,
"grad_norm": 0.1324337960159484,
"learning_rate": 2.8896827194496713e-08,
"loss": 0.2507,
"step": 3650
},
{
"epoch": 2.9166998804304503,
"grad_norm": 0.1384258646412873,
"learning_rate": 2.4133205319603614e-08,
"loss": 0.2471,
"step": 3660
},
{
"epoch": 2.9246711837385413,
"grad_norm": 0.12538463682333836,
"learning_rate": 1.9797420676788692e-08,
"loss": 0.2465,
"step": 3670
},
{
"epoch": 2.932642487046632,
"grad_norm": 0.13488887387214157,
"learning_rate": 1.5889846067450586e-08,
"loss": 0.2473,
"step": 3680
},
{
"epoch": 2.940613790354723,
"grad_norm": 0.1414038209388306,
"learning_rate": 1.241081747443862e-08,
"loss": 0.2481,
"step": 3690
},
{
"epoch": 2.948585093662814,
"grad_norm": 0.13040100044435768,
"learning_rate": 9.36063403316534e-09,
"loss": 0.2463,
"step": 3700
},
{
"epoch": 2.956556396970905,
"grad_norm": 0.12638246739069472,
"learning_rate": 6.739558005884883e-09,
"loss": 0.2465,
"step": 3710
},
{
"epoch": 2.9645277002789956,
"grad_norm": 0.13324839473704572,
"learning_rate": 4.547814759142122e-09,
"loss": 0.2505,
"step": 3720
},
{
"epoch": 2.9724990035870866,
"grad_norm": 0.1305883337577235,
"learning_rate": 2.785592744398713e-09,
"loss": 0.2478,
"step": 3730
},
{
"epoch": 2.9804703068951772,
"grad_norm": 0.13393956028092843,
"learning_rate": 1.453043481824401e-09,
"loss": 0.2459,
"step": 3740
},
{
"epoch": 2.9884416102032683,
"grad_norm": 0.13005754003113282,
"learning_rate": 5.50281547275211e-10,
"loss": 0.2488,
"step": 3750
},
{
"epoch": 2.9964129135113593,
"grad_norm": 0.13834661200938075,
"learning_rate": 7.738456243466808e-11,
"loss": 0.2472,
"step": 3760
}
],
"logging_steps": 10,
"max_steps": 3765,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 10000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.647275000450253e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}