wood_codeonly_rt_add2_cp3000 / trainer_state.json
timhua's picture
Upload folder using huggingface_hub
23fcad3 verified
{
"best_global_step": 3000,
"best_metric": 1.1457551717758179,
"best_model_checkpoint": "/workspace/woodcode_2/checkpoint-3000",
"epoch": 0.6942837306179125,
"eval_steps": 500,
"global_step": 3000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0023142791020597086,
"grad_norm": 1.0788178443908691,
"learning_rate": 6.923076923076923e-06,
"loss": 2.405,
"step": 10
},
{
"epoch": 0.004628558204119417,
"grad_norm": 0.4521988034248352,
"learning_rate": 1.4615384615384617e-05,
"loss": 2.2017,
"step": 20
},
{
"epoch": 0.006942837306179125,
"grad_norm": 0.41120657324790955,
"learning_rate": 2.230769230769231e-05,
"loss": 1.9363,
"step": 30
},
{
"epoch": 0.009257116408238834,
"grad_norm": 0.28640317916870117,
"learning_rate": 3e-05,
"loss": 1.748,
"step": 40
},
{
"epoch": 0.011571395510298541,
"grad_norm": 0.26409590244293213,
"learning_rate": 3.769230769230769e-05,
"loss": 1.6238,
"step": 50
},
{
"epoch": 0.01388567461235825,
"grad_norm": 0.26786214113235474,
"learning_rate": 4.538461538461539e-05,
"loss": 1.5538,
"step": 60
},
{
"epoch": 0.01619995371441796,
"grad_norm": 0.2931516766548157,
"learning_rate": 5.3076923076923076e-05,
"loss": 1.4984,
"step": 70
},
{
"epoch": 0.01851423281647767,
"grad_norm": 0.29799872636795044,
"learning_rate": 6.0769230769230765e-05,
"loss": 1.4658,
"step": 80
},
{
"epoch": 0.020828511918537376,
"grad_norm": 0.29269853234291077,
"learning_rate": 6.846153846153847e-05,
"loss": 1.4265,
"step": 90
},
{
"epoch": 0.023142791020597082,
"grad_norm": 0.3255228102207184,
"learning_rate": 7.615384615384616e-05,
"loss": 1.4194,
"step": 100
},
{
"epoch": 0.025457070122656793,
"grad_norm": 0.3054940104484558,
"learning_rate": 8.384615384615386e-05,
"loss": 1.3852,
"step": 110
},
{
"epoch": 0.0277713492247165,
"grad_norm": 0.2838137149810791,
"learning_rate": 9.153846153846155e-05,
"loss": 1.3885,
"step": 120
},
{
"epoch": 0.03008562832677621,
"grad_norm": 0.2860707938671112,
"learning_rate": 9.923076923076923e-05,
"loss": 1.3693,
"step": 130
},
{
"epoch": 0.03239990742883592,
"grad_norm": 0.2725190818309784,
"learning_rate": 9.999886214268966e-05,
"loss": 1.3606,
"step": 140
},
{
"epoch": 0.03471418653089563,
"grad_norm": 0.2650243937969208,
"learning_rate": 9.999492887526629e-05,
"loss": 1.3414,
"step": 150
},
{
"epoch": 0.03702846563295534,
"grad_norm": 0.2538904845714569,
"learning_rate": 9.998818637106816e-05,
"loss": 1.3495,
"step": 160
},
{
"epoch": 0.03934274473501504,
"grad_norm": 0.25812944769859314,
"learning_rate": 9.99786350089595e-05,
"loss": 1.3419,
"step": 170
},
{
"epoch": 0.04165702383707475,
"grad_norm": 0.23412390053272247,
"learning_rate": 9.996627532563551e-05,
"loss": 1.3314,
"step": 180
},
{
"epoch": 0.04397130293913446,
"grad_norm": 0.256795197725296,
"learning_rate": 9.995110801559215e-05,
"loss": 1.3326,
"step": 190
},
{
"epoch": 0.046285582041194165,
"grad_norm": 0.24870288372039795,
"learning_rate": 9.993313393108719e-05,
"loss": 1.328,
"step": 200
},
{
"epoch": 0.048599861143253875,
"grad_norm": 0.2265051007270813,
"learning_rate": 9.991235408209221e-05,
"loss": 1.3271,
"step": 210
},
{
"epoch": 0.050914140245313586,
"grad_norm": 0.23642291128635406,
"learning_rate": 9.988876963623597e-05,
"loss": 1.3268,
"step": 220
},
{
"epoch": 0.053228419347373296,
"grad_norm": 0.2279822677373886,
"learning_rate": 9.986238191873874e-05,
"loss": 1.3058,
"step": 230
},
{
"epoch": 0.055542698449433,
"grad_norm": 0.23465260863304138,
"learning_rate": 9.983319241233782e-05,
"loss": 1.3057,
"step": 240
},
{
"epoch": 0.05785697755149271,
"grad_norm": 0.22599244117736816,
"learning_rate": 9.980120275720424e-05,
"loss": 1.313,
"step": 250
},
{
"epoch": 0.06017125665355242,
"grad_norm": 0.23743176460266113,
"learning_rate": 9.976641475085067e-05,
"loss": 1.3004,
"step": 260
},
{
"epoch": 0.06248553575561213,
"grad_norm": 0.22628776729106903,
"learning_rate": 9.972883034803025e-05,
"loss": 1.3059,
"step": 270
},
{
"epoch": 0.06479981485767183,
"grad_norm": 0.24011753499507904,
"learning_rate": 9.968845166062692e-05,
"loss": 1.2905,
"step": 280
},
{
"epoch": 0.06711409395973154,
"grad_norm": 0.231357604265213,
"learning_rate": 9.96452809575367e-05,
"loss": 1.2971,
"step": 290
},
{
"epoch": 0.06942837306179125,
"grad_norm": 0.23093485832214355,
"learning_rate": 9.959932066454008e-05,
"loss": 1.2977,
"step": 300
},
{
"epoch": 0.07174265216385096,
"grad_norm": 0.22505411505699158,
"learning_rate": 9.955057336416597e-05,
"loss": 1.2746,
"step": 310
},
{
"epoch": 0.07405693126591067,
"grad_norm": 0.2227935642004013,
"learning_rate": 9.949904179554632e-05,
"loss": 1.273,
"step": 320
},
{
"epoch": 0.07637121036797037,
"grad_norm": 0.23704655468463898,
"learning_rate": 9.944472885426235e-05,
"loss": 1.2909,
"step": 330
},
{
"epoch": 0.07868548947003008,
"grad_norm": 0.2225590944290161,
"learning_rate": 9.938763759218185e-05,
"loss": 1.2846,
"step": 340
},
{
"epoch": 0.08099976857208979,
"grad_norm": 0.24287723004817963,
"learning_rate": 9.932777121728763e-05,
"loss": 1.2989,
"step": 350
},
{
"epoch": 0.0833140476741495,
"grad_norm": 0.22559230029582977,
"learning_rate": 9.926513309349732e-05,
"loss": 1.2803,
"step": 360
},
{
"epoch": 0.08562832677620921,
"grad_norm": 0.22119103372097015,
"learning_rate": 9.919972674047429e-05,
"loss": 1.269,
"step": 370
},
{
"epoch": 0.08794260587826892,
"grad_norm": 0.2336379736661911,
"learning_rate": 9.913155583342994e-05,
"loss": 1.2775,
"step": 380
},
{
"epoch": 0.09025688498032863,
"grad_norm": 0.2086753100156784,
"learning_rate": 9.906062420291715e-05,
"loss": 1.2868,
"step": 390
},
{
"epoch": 0.09257116408238833,
"grad_norm": 0.24568480253219604,
"learning_rate": 9.898693583461507e-05,
"loss": 1.2746,
"step": 400
},
{
"epoch": 0.09488544318444804,
"grad_norm": 0.22321555018424988,
"learning_rate": 9.891049486910511e-05,
"loss": 1.2682,
"step": 410
},
{
"epoch": 0.09719972228650775,
"grad_norm": 0.22601205110549927,
"learning_rate": 9.883130560163837e-05,
"loss": 1.27,
"step": 420
},
{
"epoch": 0.09951400138856746,
"grad_norm": 0.20481973886489868,
"learning_rate": 9.874937248189415e-05,
"loss": 1.275,
"step": 430
},
{
"epoch": 0.10182828049062717,
"grad_norm": 0.2164992243051529,
"learning_rate": 9.866470011373008e-05,
"loss": 1.2661,
"step": 440
},
{
"epoch": 0.10414255959268688,
"grad_norm": 0.20576460659503937,
"learning_rate": 9.857729325492329e-05,
"loss": 1.2626,
"step": 450
},
{
"epoch": 0.10645683869474659,
"grad_norm": 0.22202594578266144,
"learning_rate": 9.848715681690317e-05,
"loss": 1.2488,
"step": 460
},
{
"epoch": 0.10877111779680629,
"grad_norm": 0.20930485427379608,
"learning_rate": 9.839429586447533e-05,
"loss": 1.2623,
"step": 470
},
{
"epoch": 0.111085396898866,
"grad_norm": 0.23361071944236755,
"learning_rate": 9.829871561553702e-05,
"loss": 1.2546,
"step": 480
},
{
"epoch": 0.11339967600092571,
"grad_norm": 0.211343452334404,
"learning_rate": 9.820042144078397e-05,
"loss": 1.2538,
"step": 490
},
{
"epoch": 0.11571395510298542,
"grad_norm": 0.20587043464183807,
"learning_rate": 9.809941886340854e-05,
"loss": 1.2719,
"step": 500
},
{
"epoch": 0.11571395510298542,
"eval_loss": 1.2470530271530151,
"eval_runtime": 23.9969,
"eval_samples_per_second": 16.002,
"eval_steps_per_second": 0.5,
"step": 500
},
{
"epoch": 0.11802823420504513,
"grad_norm": 0.20054572820663452,
"learning_rate": 9.799571355878947e-05,
"loss": 1.2563,
"step": 510
},
{
"epoch": 0.12034251330710484,
"grad_norm": 0.21044516563415527,
"learning_rate": 9.788931135417287e-05,
"loss": 1.2517,
"step": 520
},
{
"epoch": 0.12265679240916455,
"grad_norm": 0.21391618251800537,
"learning_rate": 9.778021822834485e-05,
"loss": 1.2491,
"step": 530
},
{
"epoch": 0.12497107151122426,
"grad_norm": 0.2132970243692398,
"learning_rate": 9.766844031129552e-05,
"loss": 1.2472,
"step": 540
},
{
"epoch": 0.12728535061328397,
"grad_norm": 0.2169645130634308,
"learning_rate": 9.755398388387462e-05,
"loss": 1.2596,
"step": 550
},
{
"epoch": 0.12959962971534367,
"grad_norm": 0.20134727656841278,
"learning_rate": 9.743685537743856e-05,
"loss": 1.257,
"step": 560
},
{
"epoch": 0.1319139088174034,
"grad_norm": 0.21121706068515778,
"learning_rate": 9.731706137348898e-05,
"loss": 1.2616,
"step": 570
},
{
"epoch": 0.1342281879194631,
"grad_norm": 0.21253220736980438,
"learning_rate": 9.7194608603303e-05,
"loss": 1.2355,
"step": 580
},
{
"epoch": 0.13654246702152278,
"grad_norm": 0.22279760241508484,
"learning_rate": 9.706950394755501e-05,
"loss": 1.256,
"step": 590
},
{
"epoch": 0.1388567461235825,
"grad_norm": 0.191938579082489,
"learning_rate": 9.694175443592993e-05,
"loss": 1.2408,
"step": 600
},
{
"epoch": 0.1411710252256422,
"grad_norm": 0.2211560308933258,
"learning_rate": 9.681136724672835e-05,
"loss": 1.2563,
"step": 610
},
{
"epoch": 0.14348530432770193,
"grad_norm": 0.2078508883714676,
"learning_rate": 9.667834970646307e-05,
"loss": 1.2323,
"step": 620
},
{
"epoch": 0.14579958342976163,
"grad_norm": 0.21866025030612946,
"learning_rate": 9.65427092894475e-05,
"loss": 1.261,
"step": 630
},
{
"epoch": 0.14811386253182135,
"grad_norm": 0.20903170108795166,
"learning_rate": 9.640445361737556e-05,
"loss": 1.2476,
"step": 640
},
{
"epoch": 0.15042814163388105,
"grad_norm": 0.20698365569114685,
"learning_rate": 9.626359045889355e-05,
"loss": 1.2354,
"step": 650
},
{
"epoch": 0.15274242073594074,
"grad_norm": 0.21057769656181335,
"learning_rate": 9.612012772916353e-05,
"loss": 1.2527,
"step": 660
},
{
"epoch": 0.15505669983800047,
"grad_norm": 0.2073555439710617,
"learning_rate": 9.597407348941865e-05,
"loss": 1.2338,
"step": 670
},
{
"epoch": 0.15737097894006016,
"grad_norm": 0.20362691581249237,
"learning_rate": 9.582543594651005e-05,
"loss": 1.2548,
"step": 680
},
{
"epoch": 0.1596852580421199,
"grad_norm": 0.18878686428070068,
"learning_rate": 9.56742234524459e-05,
"loss": 1.2399,
"step": 690
},
{
"epoch": 0.16199953714417958,
"grad_norm": 0.21003399789333344,
"learning_rate": 9.552044450392189e-05,
"loss": 1.2366,
"step": 700
},
{
"epoch": 0.1643138162462393,
"grad_norm": 0.21605387330055237,
"learning_rate": 9.536410774184396e-05,
"loss": 1.2419,
"step": 710
},
{
"epoch": 0.166628095348299,
"grad_norm": 0.21591876447200775,
"learning_rate": 9.520522195084274e-05,
"loss": 1.2412,
"step": 720
},
{
"epoch": 0.1689423744503587,
"grad_norm": 0.2058115005493164,
"learning_rate": 9.504379605877979e-05,
"loss": 1.233,
"step": 730
},
{
"epoch": 0.17125665355241843,
"grad_norm": 0.224104642868042,
"learning_rate": 9.487983913624615e-05,
"loss": 1.2272,
"step": 740
},
{
"epoch": 0.17357093265447812,
"grad_norm": 0.20306575298309326,
"learning_rate": 9.471336039605255e-05,
"loss": 1.2278,
"step": 750
},
{
"epoch": 0.17588521175653785,
"grad_norm": 0.1998828798532486,
"learning_rate": 9.454436919271169e-05,
"loss": 1.2344,
"step": 760
},
{
"epoch": 0.17819949085859754,
"grad_norm": 0.1913456916809082,
"learning_rate": 9.437287502191274e-05,
"loss": 1.2376,
"step": 770
},
{
"epoch": 0.18051376996065727,
"grad_norm": 0.20067718625068665,
"learning_rate": 9.419888751998767e-05,
"loss": 1.2586,
"step": 780
},
{
"epoch": 0.18282804906271696,
"grad_norm": 0.1913948804140091,
"learning_rate": 9.402241646336977e-05,
"loss": 1.2414,
"step": 790
},
{
"epoch": 0.18514232816477666,
"grad_norm": 0.20469442009925842,
"learning_rate": 9.38434717680444e-05,
"loss": 1.2395,
"step": 800
},
{
"epoch": 0.18745660726683638,
"grad_norm": 0.20488658547401428,
"learning_rate": 9.366206348899177e-05,
"loss": 1.2259,
"step": 810
},
{
"epoch": 0.18977088636889608,
"grad_norm": 0.21545757353305817,
"learning_rate": 9.347820181962185e-05,
"loss": 1.2267,
"step": 820
},
{
"epoch": 0.1920851654709558,
"grad_norm": 0.20461086928844452,
"learning_rate": 9.329189709120174e-05,
"loss": 1.2482,
"step": 830
},
{
"epoch": 0.1943994445730155,
"grad_norm": 0.21476367115974426,
"learning_rate": 9.310315977227509e-05,
"loss": 1.2321,
"step": 840
},
{
"epoch": 0.19671372367507522,
"grad_norm": 0.20833474397659302,
"learning_rate": 9.291200046807382e-05,
"loss": 1.22,
"step": 850
},
{
"epoch": 0.19902800277713492,
"grad_norm": 0.19986377656459808,
"learning_rate": 9.27184299199223e-05,
"loss": 1.2423,
"step": 860
},
{
"epoch": 0.20134228187919462,
"grad_norm": 0.22088144719600677,
"learning_rate": 9.252245900463373e-05,
"loss": 1.232,
"step": 870
},
{
"epoch": 0.20365656098125434,
"grad_norm": 0.20136182010173798,
"learning_rate": 9.2324098733899e-05,
"loss": 1.2229,
"step": 880
},
{
"epoch": 0.20597084008331404,
"grad_norm": 0.1938410848379135,
"learning_rate": 9.212336025366788e-05,
"loss": 1.2227,
"step": 890
},
{
"epoch": 0.20828511918537376,
"grad_norm": 0.202079638838768,
"learning_rate": 9.19202548435228e-05,
"loss": 1.2197,
"step": 900
},
{
"epoch": 0.21059939828743346,
"grad_norm": 0.20484083890914917,
"learning_rate": 9.1714793916045e-05,
"loss": 1.2089,
"step": 910
},
{
"epoch": 0.21291367738949318,
"grad_norm": 0.21105819940567017,
"learning_rate": 9.150698901617327e-05,
"loss": 1.2315,
"step": 920
},
{
"epoch": 0.21522795649155288,
"grad_norm": 0.19875198602676392,
"learning_rate": 9.129685182055519e-05,
"loss": 1.2233,
"step": 930
},
{
"epoch": 0.21754223559361258,
"grad_norm": 0.201791912317276,
"learning_rate": 9.10843941368911e-05,
"loss": 1.2324,
"step": 940
},
{
"epoch": 0.2198565146956723,
"grad_norm": 0.20584046840667725,
"learning_rate": 9.086962790327056e-05,
"loss": 1.2167,
"step": 950
},
{
"epoch": 0.222170793797732,
"grad_norm": 0.19981129467487335,
"learning_rate": 9.065256518750154e-05,
"loss": 1.2178,
"step": 960
},
{
"epoch": 0.22448507289979172,
"grad_norm": 0.19994951784610748,
"learning_rate": 9.043321818643233e-05,
"loss": 1.2158,
"step": 970
},
{
"epoch": 0.22679935200185142,
"grad_norm": 0.2023075968027115,
"learning_rate": 9.021159922526623e-05,
"loss": 1.2353,
"step": 980
},
{
"epoch": 0.22911363110391114,
"grad_norm": 0.1981421411037445,
"learning_rate": 8.998772075686896e-05,
"loss": 1.2396,
"step": 990
},
{
"epoch": 0.23142791020597084,
"grad_norm": 0.2052128165960312,
"learning_rate": 8.976159536106894e-05,
"loss": 1.2137,
"step": 1000
},
{
"epoch": 0.23142791020597084,
"eval_loss": 1.208183765411377,
"eval_runtime": 21.6303,
"eval_samples_per_second": 17.753,
"eval_steps_per_second": 0.555,
"step": 1000
},
{
"epoch": 0.23374218930803053,
"grad_norm": 0.20763066411018372,
"learning_rate": 8.953323574395037e-05,
"loss": 1.2247,
"step": 1010
},
{
"epoch": 0.23605646841009026,
"grad_norm": 0.19439862668514252,
"learning_rate": 8.930265473713938e-05,
"loss": 1.2239,
"step": 1020
},
{
"epoch": 0.23837074751214996,
"grad_norm": 0.188704714179039,
"learning_rate": 8.90698652970829e-05,
"loss": 1.2331,
"step": 1030
},
{
"epoch": 0.24068502661420968,
"grad_norm": 0.2066233903169632,
"learning_rate": 8.883488050432074e-05,
"loss": 1.2178,
"step": 1040
},
{
"epoch": 0.24299930571626938,
"grad_norm": 0.20683979988098145,
"learning_rate": 8.859771356275046e-05,
"loss": 1.2222,
"step": 1050
},
{
"epoch": 0.2453135848183291,
"grad_norm": 0.21290378272533417,
"learning_rate": 8.835837779888557e-05,
"loss": 1.2162,
"step": 1060
},
{
"epoch": 0.2476278639203888,
"grad_norm": 0.19746707379817963,
"learning_rate": 8.811688666110662e-05,
"loss": 1.2239,
"step": 1070
},
{
"epoch": 0.24994214302244852,
"grad_norm": 0.19365474581718445,
"learning_rate": 8.787325371890558e-05,
"loss": 1.2187,
"step": 1080
},
{
"epoch": 0.2522564221245082,
"grad_norm": 0.20299233496189117,
"learning_rate": 8.76274926621233e-05,
"loss": 1.2075,
"step": 1090
},
{
"epoch": 0.25457070122656794,
"grad_norm": 0.20049187541007996,
"learning_rate": 8.737961730018034e-05,
"loss": 1.2114,
"step": 1100
},
{
"epoch": 0.2568849803286276,
"grad_norm": 0.19873455166816711,
"learning_rate": 8.712964156130099e-05,
"loss": 1.2247,
"step": 1110
},
{
"epoch": 0.25919925943068733,
"grad_norm": 0.1992412507534027,
"learning_rate": 8.687757949173063e-05,
"loss": 1.2164,
"step": 1120
},
{
"epoch": 0.26151353853274706,
"grad_norm": 0.2137262374162674,
"learning_rate": 8.662344525494644e-05,
"loss": 1.2083,
"step": 1130
},
{
"epoch": 0.2638278176348068,
"grad_norm": 0.20104128122329712,
"learning_rate": 8.636725313086162e-05,
"loss": 1.2125,
"step": 1140
},
{
"epoch": 0.26614209673686645,
"grad_norm": 0.2062898725271225,
"learning_rate": 8.610901751502292e-05,
"loss": 1.235,
"step": 1150
},
{
"epoch": 0.2684563758389262,
"grad_norm": 0.20116354525089264,
"learning_rate": 8.584875291780178e-05,
"loss": 1.217,
"step": 1160
},
{
"epoch": 0.2707706549409859,
"grad_norm": 0.20894746482372284,
"learning_rate": 8.558647396357901e-05,
"loss": 1.2173,
"step": 1170
},
{
"epoch": 0.27308493404304557,
"grad_norm": 0.19359129667282104,
"learning_rate": 8.532219538992301e-05,
"loss": 1.2082,
"step": 1180
},
{
"epoch": 0.2753992131451053,
"grad_norm": 0.1946392059326172,
"learning_rate": 8.505593204676162e-05,
"loss": 1.2161,
"step": 1190
},
{
"epoch": 0.277713492247165,
"grad_norm": 0.2131495177745819,
"learning_rate": 8.478769889554781e-05,
"loss": 1.2046,
"step": 1200
},
{
"epoch": 0.28002777134922474,
"grad_norm": 0.21192453801631927,
"learning_rate": 8.451751100841887e-05,
"loss": 1.2174,
"step": 1210
},
{
"epoch": 0.2823420504512844,
"grad_norm": 0.1986854523420334,
"learning_rate": 8.424538356734957e-05,
"loss": 1.2124,
"step": 1220
},
{
"epoch": 0.28465632955334413,
"grad_norm": 0.19923637807369232,
"learning_rate": 8.397133186329903e-05,
"loss": 1.2168,
"step": 1230
},
{
"epoch": 0.28697060865540386,
"grad_norm": 0.19468043744564056,
"learning_rate": 8.36953712953516e-05,
"loss": 1.2067,
"step": 1240
},
{
"epoch": 0.2892848877574635,
"grad_norm": 0.19150374829769135,
"learning_rate": 8.34175173698515e-05,
"loss": 1.2118,
"step": 1250
},
{
"epoch": 0.29159916685952325,
"grad_norm": 0.19914792478084564,
"learning_rate": 8.31377856995315e-05,
"loss": 1.2018,
"step": 1260
},
{
"epoch": 0.293913445961583,
"grad_norm": 0.19311580061912537,
"learning_rate": 8.285619200263567e-05,
"loss": 1.2001,
"step": 1270
},
{
"epoch": 0.2962277250636427,
"grad_norm": 0.20415401458740234,
"learning_rate": 8.257275210203622e-05,
"loss": 1.2156,
"step": 1280
},
{
"epoch": 0.29854200416570237,
"grad_norm": 0.1939728707075119,
"learning_rate": 8.228748192434428e-05,
"loss": 1.2035,
"step": 1290
},
{
"epoch": 0.3008562832677621,
"grad_norm": 0.1993534117937088,
"learning_rate": 8.200039749901511e-05,
"loss": 1.1971,
"step": 1300
},
{
"epoch": 0.3031705623698218,
"grad_norm": 0.19424191117286682,
"learning_rate": 8.171151495744727e-05,
"loss": 1.1923,
"step": 1310
},
{
"epoch": 0.3054848414718815,
"grad_norm": 0.19882912933826447,
"learning_rate": 8.142085053207629e-05,
"loss": 1.1998,
"step": 1320
},
{
"epoch": 0.3077991205739412,
"grad_norm": 0.1941244751214981,
"learning_rate": 8.112842055546252e-05,
"loss": 1.2152,
"step": 1330
},
{
"epoch": 0.31011339967600093,
"grad_norm": 0.20408713817596436,
"learning_rate": 8.083424145937339e-05,
"loss": 1.2202,
"step": 1340
},
{
"epoch": 0.31242767877806066,
"grad_norm": 0.19065722823143005,
"learning_rate": 8.053832977386015e-05,
"loss": 1.2123,
"step": 1350
},
{
"epoch": 0.3147419578801203,
"grad_norm": 0.20365293323993683,
"learning_rate": 8.024070212632892e-05,
"loss": 1.1972,
"step": 1360
},
{
"epoch": 0.31705623698218005,
"grad_norm": 0.20200444757938385,
"learning_rate": 7.994137524060656e-05,
"loss": 1.202,
"step": 1370
},
{
"epoch": 0.3193705160842398,
"grad_norm": 0.19926463067531586,
"learning_rate": 7.964036593600084e-05,
"loss": 1.1989,
"step": 1380
},
{
"epoch": 0.32168479518629944,
"grad_norm": 0.19380785524845123,
"learning_rate": 7.933769112635534e-05,
"loss": 1.203,
"step": 1390
},
{
"epoch": 0.32399907428835917,
"grad_norm": 0.19268542528152466,
"learning_rate": 7.903336781909911e-05,
"loss": 1.2019,
"step": 1400
},
{
"epoch": 0.3263133533904189,
"grad_norm": 0.20773714780807495,
"learning_rate": 7.872741311429103e-05,
"loss": 1.1995,
"step": 1410
},
{
"epoch": 0.3286276324924786,
"grad_norm": 0.19505122303962708,
"learning_rate": 7.841984420365888e-05,
"loss": 1.2028,
"step": 1420
},
{
"epoch": 0.3309419115945383,
"grad_norm": 0.19330574572086334,
"learning_rate": 7.811067836963337e-05,
"loss": 1.2002,
"step": 1430
},
{
"epoch": 0.333256190696598,
"grad_norm": 0.21044421195983887,
"learning_rate": 7.779993298437704e-05,
"loss": 1.1985,
"step": 1440
},
{
"epoch": 0.33557046979865773,
"grad_norm": 0.20081642270088196,
"learning_rate": 7.74876255088081e-05,
"loss": 1.2131,
"step": 1450
},
{
"epoch": 0.3378847489007174,
"grad_norm": 0.1973022222518921,
"learning_rate": 7.71737734916193e-05,
"loss": 1.1997,
"step": 1460
},
{
"epoch": 0.3401990280027771,
"grad_norm": 0.19213716685771942,
"learning_rate": 7.685839456829183e-05,
"loss": 1.201,
"step": 1470
},
{
"epoch": 0.34251330710483685,
"grad_norm": 0.19389280676841736,
"learning_rate": 7.65415064601044e-05,
"loss": 1.2078,
"step": 1480
},
{
"epoch": 0.3448275862068966,
"grad_norm": 0.20220617949962616,
"learning_rate": 7.622312697313754e-05,
"loss": 1.2013,
"step": 1490
},
{
"epoch": 0.34714186530895624,
"grad_norm": 0.2051166296005249,
"learning_rate": 7.59032739972729e-05,
"loss": 1.2183,
"step": 1500
},
{
"epoch": 0.34714186530895624,
"eval_loss": 1.1873364448547363,
"eval_runtime": 21.6444,
"eval_samples_per_second": 17.741,
"eval_steps_per_second": 0.554,
"step": 1500
},
{
"epoch": 0.34945614441101597,
"grad_norm": 0.19153615832328796,
"learning_rate": 7.558196550518818e-05,
"loss": 1.1948,
"step": 1510
},
{
"epoch": 0.3517704235130757,
"grad_norm": 0.1992039531469345,
"learning_rate": 7.525921955134713e-05,
"loss": 1.1868,
"step": 1520
},
{
"epoch": 0.35408470261513536,
"grad_norm": 0.20605571568012238,
"learning_rate": 7.493505427098517e-05,
"loss": 1.199,
"step": 1530
},
{
"epoch": 0.3563989817171951,
"grad_norm": 0.17926311492919922,
"learning_rate": 7.460948787909017e-05,
"loss": 1.194,
"step": 1540
},
{
"epoch": 0.3587132608192548,
"grad_norm": 0.20658712089061737,
"learning_rate": 7.428253866937918e-05,
"loss": 1.2012,
"step": 1550
},
{
"epoch": 0.36102753992131453,
"grad_norm": 0.21082770824432373,
"learning_rate": 7.395422501327036e-05,
"loss": 1.2004,
"step": 1560
},
{
"epoch": 0.3633418190233742,
"grad_norm": 0.20247185230255127,
"learning_rate": 7.362456535885066e-05,
"loss": 1.1878,
"step": 1570
},
{
"epoch": 0.3656560981254339,
"grad_norm": 0.20155729353427887,
"learning_rate": 7.329357822983929e-05,
"loss": 1.1796,
"step": 1580
},
{
"epoch": 0.36797037722749365,
"grad_norm": 0.1960991472005844,
"learning_rate": 7.296128222454686e-05,
"loss": 1.2043,
"step": 1590
},
{
"epoch": 0.3702846563295533,
"grad_norm": 0.19188149273395538,
"learning_rate": 7.262769601483024e-05,
"loss": 1.2037,
"step": 1600
},
{
"epoch": 0.37259893543161304,
"grad_norm": 0.2052951157093048,
"learning_rate": 7.229283834504351e-05,
"loss": 1.1985,
"step": 1610
},
{
"epoch": 0.37491321453367277,
"grad_norm": 0.18684880435466766,
"learning_rate": 7.195672803098463e-05,
"loss": 1.2023,
"step": 1620
},
{
"epoch": 0.3772274936357325,
"grad_norm": 0.20104870200157166,
"learning_rate": 7.161938395883815e-05,
"loss": 1.1892,
"step": 1630
},
{
"epoch": 0.37954177273779216,
"grad_norm": 0.19793595373630524,
"learning_rate": 7.128082508411406e-05,
"loss": 1.1992,
"step": 1640
},
{
"epoch": 0.3818560518398519,
"grad_norm": 0.20280171930789948,
"learning_rate": 7.094107043058264e-05,
"loss": 1.2076,
"step": 1650
},
{
"epoch": 0.3841703309419116,
"grad_norm": 0.20379236340522766,
"learning_rate": 7.060013908920548e-05,
"loss": 1.1987,
"step": 1660
},
{
"epoch": 0.3864846100439713,
"grad_norm": 0.19275911152362823,
"learning_rate": 7.025805021706276e-05,
"loss": 1.1983,
"step": 1670
},
{
"epoch": 0.388798889146031,
"grad_norm": 0.20220735669136047,
"learning_rate": 6.991482303627685e-05,
"loss": 1.1992,
"step": 1680
},
{
"epoch": 0.3911131682480907,
"grad_norm": 0.2047668844461441,
"learning_rate": 6.957047683293215e-05,
"loss": 1.2086,
"step": 1690
},
{
"epoch": 0.39342744735015045,
"grad_norm": 0.19045311212539673,
"learning_rate": 6.922503095599142e-05,
"loss": 1.1926,
"step": 1700
},
{
"epoch": 0.3957417264522101,
"grad_norm": 0.2014586180448532,
"learning_rate": 6.887850481620858e-05,
"loss": 1.1973,
"step": 1710
},
{
"epoch": 0.39805600555426984,
"grad_norm": 0.18599851429462433,
"learning_rate": 6.853091788503802e-05,
"loss": 1.1956,
"step": 1720
},
{
"epoch": 0.40037028465632957,
"grad_norm": 0.2029285877943039,
"learning_rate": 6.818228969354037e-05,
"loss": 1.2114,
"step": 1730
},
{
"epoch": 0.40268456375838924,
"grad_norm": 0.19286784529685974,
"learning_rate": 6.783263983128519e-05,
"loss": 1.1761,
"step": 1740
},
{
"epoch": 0.40499884286044896,
"grad_norm": 0.19630247354507446,
"learning_rate": 6.748198794525016e-05,
"loss": 1.188,
"step": 1750
},
{
"epoch": 0.4073131219625087,
"grad_norm": 0.19817174971103668,
"learning_rate": 6.71303537387171e-05,
"loss": 1.1885,
"step": 1760
},
{
"epoch": 0.4096274010645684,
"grad_norm": 0.19006091356277466,
"learning_rate": 6.677775697016484e-05,
"loss": 1.1915,
"step": 1770
},
{
"epoch": 0.4119416801666281,
"grad_norm": 0.1849374771118164,
"learning_rate": 6.642421745215901e-05,
"loss": 1.1853,
"step": 1780
},
{
"epoch": 0.4142559592686878,
"grad_norm": 0.2032414823770523,
"learning_rate": 6.606975505023873e-05,
"loss": 1.197,
"step": 1790
},
{
"epoch": 0.4165702383707475,
"grad_norm": 0.1908976286649704,
"learning_rate": 6.571438968180035e-05,
"loss": 1.1937,
"step": 1800
},
{
"epoch": 0.4188845174728072,
"grad_norm": 0.19852004945278168,
"learning_rate": 6.535814131497833e-05,
"loss": 1.1837,
"step": 1810
},
{
"epoch": 0.4211987965748669,
"grad_norm": 0.19003674387931824,
"learning_rate": 6.50010299675232e-05,
"loss": 1.1959,
"step": 1820
},
{
"epoch": 0.42351307567692664,
"grad_norm": 0.2054755687713623,
"learning_rate": 6.46430757056767e-05,
"loss": 1.1943,
"step": 1830
},
{
"epoch": 0.42582735477898637,
"grad_norm": 0.19895458221435547,
"learning_rate": 6.428429864304432e-05,
"loss": 1.1871,
"step": 1840
},
{
"epoch": 0.42814163388104604,
"grad_norm": 0.19693517684936523,
"learning_rate": 6.39247189394651e-05,
"loss": 1.185,
"step": 1850
},
{
"epoch": 0.43045591298310576,
"grad_norm": 0.19280746579170227,
"learning_rate": 6.356435679987882e-05,
"loss": 1.1817,
"step": 1860
},
{
"epoch": 0.4327701920851655,
"grad_norm": 0.19104084372520447,
"learning_rate": 6.320323247319064e-05,
"loss": 1.186,
"step": 1870
},
{
"epoch": 0.43508447118722515,
"grad_norm": 0.19598130881786346,
"learning_rate": 6.28413662511334e-05,
"loss": 1.1946,
"step": 1880
},
{
"epoch": 0.4373987502892849,
"grad_norm": 0.2072417438030243,
"learning_rate": 6.247877846712734e-05,
"loss": 1.1921,
"step": 1890
},
{
"epoch": 0.4397130293913446,
"grad_norm": 0.19743064045906067,
"learning_rate": 6.211548949513756e-05,
"loss": 1.1825,
"step": 1900
},
{
"epoch": 0.4420273084934043,
"grad_norm": 0.19049686193466187,
"learning_rate": 6.175151974852923e-05,
"loss": 1.1893,
"step": 1910
},
{
"epoch": 0.444341587595464,
"grad_norm": 0.18704815208911896,
"learning_rate": 6.138688967892055e-05,
"loss": 1.1851,
"step": 1920
},
{
"epoch": 0.4466558666975237,
"grad_norm": 0.2007189691066742,
"learning_rate": 6.102161977503358e-05,
"loss": 1.1791,
"step": 1930
},
{
"epoch": 0.44897014579958344,
"grad_norm": 0.19694744050502777,
"learning_rate": 6.065573056154289e-05,
"loss": 1.1797,
"step": 1940
},
{
"epoch": 0.4512844249016431,
"grad_norm": 0.1945074051618576,
"learning_rate": 6.028924259792235e-05,
"loss": 1.1842,
"step": 1950
},
{
"epoch": 0.45359870400370284,
"grad_norm": 0.19543199241161346,
"learning_rate": 5.9922176477289874e-05,
"loss": 1.1897,
"step": 1960
},
{
"epoch": 0.45591298310576256,
"grad_norm": 0.20255213975906372,
"learning_rate": 5.9554552825250264e-05,
"loss": 1.1912,
"step": 1970
},
{
"epoch": 0.4582272622078223,
"grad_norm": 0.19501863420009613,
"learning_rate": 5.918639229873624e-05,
"loss": 1.1821,
"step": 1980
},
{
"epoch": 0.46054154130988195,
"grad_norm": 0.19646863639354706,
"learning_rate": 5.881771558484774e-05,
"loss": 1.1756,
"step": 1990
},
{
"epoch": 0.4628558204119417,
"grad_norm": 0.2014242708683014,
"learning_rate": 5.844854339968952e-05,
"loss": 1.1853,
"step": 2000
},
{
"epoch": 0.4628558204119417,
"eval_loss": 1.1698839664459229,
"eval_runtime": 21.5892,
"eval_samples_per_second": 17.787,
"eval_steps_per_second": 0.556,
"step": 2000
},
{
"epoch": 0.4651700995140014,
"grad_norm": 0.19205763936042786,
"learning_rate": 5.8078896487207015e-05,
"loss": 1.1883,
"step": 2010
},
{
"epoch": 0.46748437861606107,
"grad_norm": 0.19423869252204895,
"learning_rate": 5.770879561802087e-05,
"loss": 1.1777,
"step": 2020
},
{
"epoch": 0.4697986577181208,
"grad_norm": 0.19925445318222046,
"learning_rate": 5.7338261588259726e-05,
"loss": 1.1843,
"step": 2030
},
{
"epoch": 0.4721129368201805,
"grad_norm": 0.18574309349060059,
"learning_rate": 5.696731521839167e-05,
"loss": 1.1763,
"step": 2040
},
{
"epoch": 0.47442721592224024,
"grad_norm": 0.19058012962341309,
"learning_rate": 5.6595977352054407e-05,
"loss": 1.1797,
"step": 2050
},
{
"epoch": 0.4767414950242999,
"grad_norm": 0.1849735677242279,
"learning_rate": 5.6224268854883996e-05,
"loss": 1.1808,
"step": 2060
},
{
"epoch": 0.47905577412635963,
"grad_norm": 0.1923811137676239,
"learning_rate": 5.585221061334236e-05,
"loss": 1.1744,
"step": 2070
},
{
"epoch": 0.48137005322841936,
"grad_norm": 0.1943860650062561,
"learning_rate": 5.547982353354376e-05,
"loss": 1.1833,
"step": 2080
},
{
"epoch": 0.4836843323304791,
"grad_norm": 0.20127460360527039,
"learning_rate": 5.510712854008001e-05,
"loss": 1.1798,
"step": 2090
},
{
"epoch": 0.48599861143253875,
"grad_norm": 0.18425202369689941,
"learning_rate": 5.473414657484468e-05,
"loss": 1.1969,
"step": 2100
},
{
"epoch": 0.4883128905345985,
"grad_norm": 0.19612173736095428,
"learning_rate": 5.436089859585648e-05,
"loss": 1.1707,
"step": 2110
},
{
"epoch": 0.4906271696366582,
"grad_norm": 0.18944087624549866,
"learning_rate": 5.3987405576081505e-05,
"loss": 1.1822,
"step": 2120
},
{
"epoch": 0.49294144873871787,
"grad_norm": 0.19573846459388733,
"learning_rate": 5.361368850225479e-05,
"loss": 1.1831,
"step": 2130
},
{
"epoch": 0.4952557278407776,
"grad_norm": 0.18912994861602783,
"learning_rate": 5.32397683737011e-05,
"loss": 1.1859,
"step": 2140
},
{
"epoch": 0.4975700069428373,
"grad_norm": 0.19357813894748688,
"learning_rate": 5.286566620115493e-05,
"loss": 1.1701,
"step": 2150
},
{
"epoch": 0.49988428604489704,
"grad_norm": 0.18788059055805206,
"learning_rate": 5.249140300557985e-05,
"loss": 1.1764,
"step": 2160
},
{
"epoch": 0.5021985651469567,
"grad_norm": 0.19492246210575104,
"learning_rate": 5.211699981698747e-05,
"loss": 1.1898,
"step": 2170
},
{
"epoch": 0.5045128442490164,
"grad_norm": 0.21048106253147125,
"learning_rate": 5.17424776732556e-05,
"loss": 1.1768,
"step": 2180
},
{
"epoch": 0.5068271233510762,
"grad_norm": 0.1978602409362793,
"learning_rate": 5.1367857618946194e-05,
"loss": 1.1791,
"step": 2190
},
{
"epoch": 0.5091414024531359,
"grad_norm": 0.19546453654766083,
"learning_rate": 5.09931607041229e-05,
"loss": 1.1821,
"step": 2200
},
{
"epoch": 0.5114556815551956,
"grad_norm": 0.19739992916584015,
"learning_rate": 5.0618407983168146e-05,
"loss": 1.1754,
"step": 2210
},
{
"epoch": 0.5137699606572552,
"grad_norm": 0.19072087109088898,
"learning_rate": 5.0243620513600145e-05,
"loss": 1.1826,
"step": 2220
},
{
"epoch": 0.5160842397593149,
"grad_norm": 0.1789073944091797,
"learning_rate": 4.9868819354889625e-05,
"loss": 1.1731,
"step": 2230
},
{
"epoch": 0.5183985188613747,
"grad_norm": 0.19865523278713226,
"learning_rate": 4.9494025567276544e-05,
"loss": 1.1796,
"step": 2240
},
{
"epoch": 0.5207127979634344,
"grad_norm": 0.1872965544462204,
"learning_rate": 4.9119260210586695e-05,
"loss": 1.176,
"step": 2250
},
{
"epoch": 0.5230270770654941,
"grad_norm": 0.1958765685558319,
"learning_rate": 4.874454434304824e-05,
"loss": 1.1712,
"step": 2260
},
{
"epoch": 0.5253413561675538,
"grad_norm": 0.19132095575332642,
"learning_rate": 4.8369899020108626e-05,
"loss": 1.1786,
"step": 2270
},
{
"epoch": 0.5276556352696136,
"grad_norm": 0.19474317133426666,
"learning_rate": 4.7995345293251284e-05,
"loss": 1.1869,
"step": 2280
},
{
"epoch": 0.5299699143716732,
"grad_norm": 0.19309870898723602,
"learning_rate": 4.762090420881289e-05,
"loss": 1.1802,
"step": 2290
},
{
"epoch": 0.5322841934737329,
"grad_norm": 0.2047063410282135,
"learning_rate": 4.7246596806800636e-05,
"loss": 1.1689,
"step": 2300
},
{
"epoch": 0.5345984725757926,
"grad_norm": 0.19408148527145386,
"learning_rate": 4.687244411971009e-05,
"loss": 1.1715,
"step": 2310
},
{
"epoch": 0.5369127516778524,
"grad_norm": 0.21102771162986755,
"learning_rate": 4.649846717134327e-05,
"loss": 1.1868,
"step": 2320
},
{
"epoch": 0.5392270307799121,
"grad_norm": 0.20618434250354767,
"learning_rate": 4.612468697562741e-05,
"loss": 1.1688,
"step": 2330
},
{
"epoch": 0.5415413098819718,
"grad_norm": 0.19679012894630432,
"learning_rate": 4.575112453543408e-05,
"loss": 1.1758,
"step": 2340
},
{
"epoch": 0.5438555889840315,
"grad_norm": 0.20675049722194672,
"learning_rate": 4.537780084139913e-05,
"loss": 1.1605,
"step": 2350
},
{
"epoch": 0.5461698680860911,
"grad_norm": 0.18863654136657715,
"learning_rate": 4.500473687074309e-05,
"loss": 1.1742,
"step": 2360
},
{
"epoch": 0.5484841471881509,
"grad_norm": 0.19098520278930664,
"learning_rate": 4.463195358609258e-05,
"loss": 1.1652,
"step": 2370
},
{
"epoch": 0.5507984262902106,
"grad_norm": 0.19034633040428162,
"learning_rate": 4.4259471934302324e-05,
"loss": 1.1716,
"step": 2380
},
{
"epoch": 0.5531127053922703,
"grad_norm": 0.19565701484680176,
"learning_rate": 4.388731284527816e-05,
"loss": 1.1503,
"step": 2390
},
{
"epoch": 0.55542698449433,
"grad_norm": 0.19067049026489258,
"learning_rate": 4.351549723080097e-05,
"loss": 1.1772,
"step": 2400
},
{
"epoch": 0.5577412635963898,
"grad_norm": 0.19726891815662384,
"learning_rate": 4.3144045983351735e-05,
"loss": 1.187,
"step": 2410
},
{
"epoch": 0.5600555426984495,
"grad_norm": 0.19251461327075958,
"learning_rate": 4.277297997493737e-05,
"loss": 1.1734,
"step": 2420
},
{
"epoch": 0.5623698218005091,
"grad_norm": 0.19542944431304932,
"learning_rate": 4.2402320055918154e-05,
"loss": 1.1717,
"step": 2430
},
{
"epoch": 0.5646841009025688,
"grad_norm": 0.19372211396694183,
"learning_rate": 4.203208705383594e-05,
"loss": 1.1859,
"step": 2440
},
{
"epoch": 0.5669983800046285,
"grad_norm": 0.19102297723293304,
"learning_rate": 4.1662301772243996e-05,
"loss": 1.1609,
"step": 2450
},
{
"epoch": 0.5693126591066883,
"grad_norm": 0.1865842044353485,
"learning_rate": 4.129298498953792e-05,
"loss": 1.1898,
"step": 2460
},
{
"epoch": 0.571626938208748,
"grad_norm": 0.19476434588432312,
"learning_rate": 4.0924157457788226e-05,
"loss": 1.1726,
"step": 2470
},
{
"epoch": 0.5739412173108077,
"grad_norm": 0.19208824634552002,
"learning_rate": 4.055583990157416e-05,
"loss": 1.1777,
"step": 2480
},
{
"epoch": 0.5762554964128674,
"grad_norm": 0.1908976435661316,
"learning_rate": 4.01880530168192e-05,
"loss": 1.1668,
"step": 2490
},
{
"epoch": 0.578569775514927,
"grad_norm": 0.19089365005493164,
"learning_rate": 3.982081746962826e-05,
"loss": 1.1794,
"step": 2500
},
{
"epoch": 0.578569775514927,
"eval_loss": 1.1556445360183716,
"eval_runtime": 21.5393,
"eval_samples_per_second": 17.828,
"eval_steps_per_second": 0.557,
"step": 2500
},
{
"epoch": 0.5808840546169868,
"grad_norm": 0.21013890206813812,
"learning_rate": 3.94541538951262e-05,
"loss": 1.157,
"step": 2510
},
{
"epoch": 0.5831983337190465,
"grad_norm": 0.20332397520542145,
"learning_rate": 3.908808289629865e-05,
"loss": 1.1709,
"step": 2520
},
{
"epoch": 0.5855126128211062,
"grad_norm": 0.19428518414497375,
"learning_rate": 3.8722625042834025e-05,
"loss": 1.1783,
"step": 2530
},
{
"epoch": 0.587826891923166,
"grad_norm": 0.19970852136611938,
"learning_rate": 3.835780086996794e-05,
"loss": 1.1687,
"step": 2540
},
{
"epoch": 0.5901411710252257,
"grad_norm": 0.18801788985729218,
"learning_rate": 3.7993630877329124e-05,
"loss": 1.1715,
"step": 2550
},
{
"epoch": 0.5924554501272854,
"grad_norm": 0.2128693163394928,
"learning_rate": 3.763013552778774e-05,
"loss": 1.179,
"step": 2560
},
{
"epoch": 0.594769729229345,
"grad_norm": 0.19203241169452667,
"learning_rate": 3.726733524630535e-05,
"loss": 1.1838,
"step": 2570
},
{
"epoch": 0.5970840083314047,
"grad_norm": 0.20480811595916748,
"learning_rate": 3.690525041878743e-05,
"loss": 1.1616,
"step": 2580
},
{
"epoch": 0.5993982874334645,
"grad_norm": 0.19616341590881348,
"learning_rate": 3.6543901390937754e-05,
"loss": 1.1416,
"step": 2590
},
{
"epoch": 0.6017125665355242,
"grad_norm": 0.1999153196811676,
"learning_rate": 3.6183308467115175e-05,
"loss": 1.1659,
"step": 2600
},
{
"epoch": 0.6040268456375839,
"grad_norm": 0.19980020821094513,
"learning_rate": 3.582349190919275e-05,
"loss": 1.1657,
"step": 2610
},
{
"epoch": 0.6063411247396436,
"grad_norm": 0.19510309398174286,
"learning_rate": 3.546447193541922e-05,
"loss": 1.1701,
"step": 2620
},
{
"epoch": 0.6086554038417034,
"grad_norm": 0.18956266343593597,
"learning_rate": 3.510626871928287e-05,
"loss": 1.1663,
"step": 2630
},
{
"epoch": 0.610969682943763,
"grad_norm": 0.18637120723724365,
"learning_rate": 3.474890238837806e-05,
"loss": 1.1731,
"step": 2640
},
{
"epoch": 0.6132839620458227,
"grad_norm": 0.19002896547317505,
"learning_rate": 3.439239302327417e-05,
"loss": 1.1683,
"step": 2650
},
{
"epoch": 0.6155982411478824,
"grad_norm": 0.19537580013275146,
"learning_rate": 3.403676065638735e-05,
"loss": 1.1652,
"step": 2660
},
{
"epoch": 0.6179125202499421,
"grad_norm": 0.1950923502445221,
"learning_rate": 3.368202527085476e-05,
"loss": 1.1778,
"step": 2670
},
{
"epoch": 0.6202267993520019,
"grad_norm": 0.19736339151859283,
"learning_rate": 3.332820679941186e-05,
"loss": 1.179,
"step": 2680
},
{
"epoch": 0.6225410784540616,
"grad_norm": 0.19073107838630676,
"learning_rate": 3.297532512327231e-05,
"loss": 1.162,
"step": 2690
},
{
"epoch": 0.6248553575561213,
"grad_norm": 0.1941593438386917,
"learning_rate": 3.262340007101076e-05,
"loss": 1.1592,
"step": 2700
},
{
"epoch": 0.6271696366581809,
"grad_norm": 0.1990540772676468,
"learning_rate": 3.227245141744882e-05,
"loss": 1.1571,
"step": 2710
},
{
"epoch": 0.6294839157602407,
"grad_norm": 0.19624970853328705,
"learning_rate": 3.192249888254381e-05,
"loss": 1.1582,
"step": 2720
},
{
"epoch": 0.6317981948623004,
"grad_norm": 0.18591170012950897,
"learning_rate": 3.157356213028072e-05,
"loss": 1.1518,
"step": 2730
},
{
"epoch": 0.6341124739643601,
"grad_norm": 0.1997700184583664,
"learning_rate": 3.122566076756724e-05,
"loss": 1.1689,
"step": 2740
},
{
"epoch": 0.6364267530664198,
"grad_norm": 0.18985426425933838,
"learning_rate": 3.087881434313212e-05,
"loss": 1.1693,
"step": 2750
},
{
"epoch": 0.6387410321684796,
"grad_norm": 0.19050361216068268,
"learning_rate": 3.053304234642661e-05,
"loss": 1.1651,
"step": 2760
},
{
"epoch": 0.6410553112705393,
"grad_norm": 0.19750134646892548,
"learning_rate": 3.0188364206529467e-05,
"loss": 1.1657,
"step": 2770
},
{
"epoch": 0.6433695903725989,
"grad_norm": 0.18156161904335022,
"learning_rate": 2.9844799291055083e-05,
"loss": 1.1792,
"step": 2780
},
{
"epoch": 0.6456838694746586,
"grad_norm": 0.19130638241767883,
"learning_rate": 2.950236690506537e-05,
"loss": 1.1623,
"step": 2790
},
{
"epoch": 0.6479981485767183,
"grad_norm": 0.20145711302757263,
"learning_rate": 2.916108628998484e-05,
"loss": 1.162,
"step": 2800
},
{
"epoch": 0.6503124276787781,
"grad_norm": 0.18573534488677979,
"learning_rate": 2.8820976622519558e-05,
"loss": 1.1724,
"step": 2810
},
{
"epoch": 0.6526267067808378,
"grad_norm": 0.19041724503040314,
"learning_rate": 2.84820570135795e-05,
"loss": 1.1567,
"step": 2820
},
{
"epoch": 0.6549409858828975,
"grad_norm": 0.19331230223178864,
"learning_rate": 2.8144346507204728e-05,
"loss": 1.1722,
"step": 2830
},
{
"epoch": 0.6572552649849572,
"grad_norm": 0.19323979318141937,
"learning_rate": 2.7807864079495306e-05,
"loss": 1.1637,
"step": 2840
},
{
"epoch": 0.6595695440870168,
"grad_norm": 0.18545861542224884,
"learning_rate": 2.7472628637545082e-05,
"loss": 1.1634,
"step": 2850
},
{
"epoch": 0.6618838231890766,
"grad_norm": 0.19878439605236053,
"learning_rate": 2.7138659018379144e-05,
"loss": 1.169,
"step": 2860
},
{
"epoch": 0.6641981022911363,
"grad_norm": 0.19122658669948578,
"learning_rate": 2.680597398789554e-05,
"loss": 1.1779,
"step": 2870
},
{
"epoch": 0.666512381393196,
"grad_norm": 0.19803395867347717,
"learning_rate": 2.647459223981064e-05,
"loss": 1.1523,
"step": 2880
},
{
"epoch": 0.6688266604952557,
"grad_norm": 0.19722239673137665,
"learning_rate": 2.614453239460884e-05,
"loss": 1.1596,
"step": 2890
},
{
"epoch": 0.6711409395973155,
"grad_norm": 0.2012769877910614,
"learning_rate": 2.581581299849627e-05,
"loss": 1.1675,
"step": 2900
},
{
"epoch": 0.6734552186993752,
"grad_norm": 0.19577769935131073,
"learning_rate": 2.5488452522358585e-05,
"loss": 1.167,
"step": 2910
},
{
"epoch": 0.6757694978014348,
"grad_norm": 0.19656306505203247,
"learning_rate": 2.5162469360723208e-05,
"loss": 1.1737,
"step": 2920
},
{
"epoch": 0.6780837769034945,
"grad_norm": 0.19946037232875824,
"learning_rate": 2.4837881830725584e-05,
"loss": 1.1509,
"step": 2930
},
{
"epoch": 0.6803980560055543,
"grad_norm": 0.1928076148033142,
"learning_rate": 2.451470817108007e-05,
"loss": 1.1595,
"step": 2940
},
{
"epoch": 0.682712335107614,
"grad_norm": 0.19065019488334656,
"learning_rate": 2.4192966541054977e-05,
"loss": 1.1651,
"step": 2950
},
{
"epoch": 0.6850266142096737,
"grad_norm": 0.198676198720932,
"learning_rate": 2.387267501945233e-05,
"loss": 1.1487,
"step": 2960
},
{
"epoch": 0.6873408933117334,
"grad_norm": 0.19842711091041565,
"learning_rate": 2.3553851603591837e-05,
"loss": 1.1606,
"step": 2970
},
{
"epoch": 0.6896551724137931,
"grad_norm": 0.20439574122428894,
"learning_rate": 2.3236514208299796e-05,
"loss": 1.1464,
"step": 2980
},
{
"epoch": 0.6919694515158528,
"grad_norm": 0.18908947706222534,
"learning_rate": 2.2920680664902304e-05,
"loss": 1.1608,
"step": 2990
},
{
"epoch": 0.6942837306179125,
"grad_norm": 0.20464125275611877,
"learning_rate": 2.260636872022339e-05,
"loss": 1.1482,
"step": 3000
},
{
"epoch": 0.6942837306179125,
"eval_loss": 1.1457551717758179,
"eval_runtime": 21.6541,
"eval_samples_per_second": 17.733,
"eval_steps_per_second": 0.554,
"step": 3000
}
],
"logging_steps": 10,
"max_steps": 4321,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.914805497032868e+19,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}