{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 4455,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.011231222799382283,
"grad_norm": 7.254988670349121,
"learning_rate": 0.00010799999999999998,
"loss": 65.2664,
"step": 10
},
{
"epoch": 0.022462445598764567,
"grad_norm": 10.046869277954102,
"learning_rate": 0.00022799999999999999,
"loss": 53.1308,
"step": 20
},
{
"epoch": 0.033693668398146845,
"grad_norm": 6.69377326965332,
"learning_rate": 0.00034799999999999995,
"loss": 42.5968,
"step": 30
},
{
"epoch": 0.04492489119752913,
"grad_norm": 2.674748182296753,
"learning_rate": 0.000468,
"loss": 36.9814,
"step": 40
},
{
"epoch": 0.056156113996911415,
"grad_norm": 1.836259126663208,
"learning_rate": 0.000588,
"loss": 33.8187,
"step": 50
},
{
"epoch": 0.06738733679629369,
"grad_norm": 1.980311632156372,
"learning_rate": 0.0005987741203178206,
"loss": 31.7549,
"step": 60
},
{
"epoch": 0.07861855959567599,
"grad_norm": 1.9080690145492554,
"learning_rate": 0.0005974120317820658,
"loss": 30.3116,
"step": 70
},
{
"epoch": 0.08984978239505827,
"grad_norm": 2.006067991256714,
"learning_rate": 0.000596049943246311,
"loss": 29.1807,
"step": 80
},
{
"epoch": 0.10108100519444055,
"grad_norm": 1.9598348140716553,
"learning_rate": 0.0005946878547105561,
"loss": 28.136,
"step": 90
},
{
"epoch": 0.11231222799382283,
"grad_norm": 1.4707179069519043,
"learning_rate": 0.0005933257661748014,
"loss": 27.1126,
"step": 100
},
{
"epoch": 0.12354345079320511,
"grad_norm": 1.9835516214370728,
"learning_rate": 0.0005919636776390465,
"loss": 26.144,
"step": 110
},
{
"epoch": 0.13477467359258738,
"grad_norm": 1.9193854331970215,
"learning_rate": 0.0005906015891032917,
"loss": 25.4067,
"step": 120
},
{
"epoch": 0.14600589639196968,
"grad_norm": 2.0830602645874023,
"learning_rate": 0.0005892395005675368,
"loss": 24.6646,
"step": 130
},
{
"epoch": 0.15723711919135197,
"grad_norm": 2.2983345985412598,
"learning_rate": 0.0005878774120317821,
"loss": 23.997,
"step": 140
},
{
"epoch": 0.16846834199073424,
"grad_norm": 1.7569265365600586,
"learning_rate": 0.0005865153234960272,
"loss": 23.4708,
"step": 150
},
{
"epoch": 0.17969956479011653,
"grad_norm": 3.213463068008423,
"learning_rate": 0.0005851532349602724,
"loss": 22.9803,
"step": 160
},
{
"epoch": 0.1909307875894988,
"grad_norm": 2.6588027477264404,
"learning_rate": 0.0005837911464245175,
"loss": 22.5322,
"step": 170
},
{
"epoch": 0.2021620103888811,
"grad_norm": 2.5820348262786865,
"learning_rate": 0.0005824290578887628,
"loss": 22.0044,
"step": 180
},
{
"epoch": 0.21339323318826336,
"grad_norm": 2.5082194805145264,
"learning_rate": 0.0005810669693530079,
"loss": 21.6131,
"step": 190
},
{
"epoch": 0.22462445598764566,
"grad_norm": 2.2405030727386475,
"learning_rate": 0.0005797048808172531,
"loss": 21.2934,
"step": 200
},
{
"epoch": 0.23585567878702793,
"grad_norm": 2.5789482593536377,
"learning_rate": 0.0005783427922814982,
"loss": 20.8536,
"step": 210
},
{
"epoch": 0.24708690158641022,
"grad_norm": 2.514025926589966,
"learning_rate": 0.0005769807037457435,
"loss": 20.531,
"step": 220
},
{
"epoch": 0.2583181243857925,
"grad_norm": 2.224000930786133,
"learning_rate": 0.0005756186152099886,
"loss": 20.1819,
"step": 230
},
{
"epoch": 0.26954934718517476,
"grad_norm": 2.81223201751709,
"learning_rate": 0.0005742565266742338,
"loss": 19.9192,
"step": 240
},
{
"epoch": 0.2807805699845571,
"grad_norm": 2.691889524459839,
"learning_rate": 0.000572894438138479,
"loss": 19.629,
"step": 250
},
{
"epoch": 0.29201179278393935,
"grad_norm": 2.8096940517425537,
"learning_rate": 0.0005715323496027242,
"loss": 19.3428,
"step": 260
},
{
"epoch": 0.3032430155833216,
"grad_norm": 2.537062406539917,
"learning_rate": 0.0005701702610669693,
"loss": 19.0967,
"step": 270
},
{
"epoch": 0.31447423838270394,
"grad_norm": 2.8304593563079834,
"learning_rate": 0.0005688081725312145,
"loss": 18.7822,
"step": 280
},
{
"epoch": 0.3257054611820862,
"grad_norm": 3.3747401237487793,
"learning_rate": 0.0005674460839954597,
"loss": 18.6202,
"step": 290
},
{
"epoch": 0.3369366839814685,
"grad_norm": 3.1069352626800537,
"learning_rate": 0.0005660839954597049,
"loss": 18.3328,
"step": 300
},
{
"epoch": 0.34816790678085074,
"grad_norm": 3.197521209716797,
"learning_rate": 0.00056472190692395,
"loss": 18.1676,
"step": 310
},
{
"epoch": 0.35939912958023307,
"grad_norm": 2.2514944076538086,
"learning_rate": 0.0005633598183881951,
"loss": 17.9463,
"step": 320
},
{
"epoch": 0.37063035237961534,
"grad_norm": 2.492508888244629,
"learning_rate": 0.0005619977298524404,
"loss": 17.7745,
"step": 330
},
{
"epoch": 0.3818615751789976,
"grad_norm": 2.636312484741211,
"learning_rate": 0.0005606356413166855,
"loss": 17.5753,
"step": 340
},
{
"epoch": 0.39309279797837987,
"grad_norm": 2.8358511924743652,
"learning_rate": 0.0005592735527809307,
"loss": 17.4287,
"step": 350
},
{
"epoch": 0.4043240207777622,
"grad_norm": 2.478410005569458,
"learning_rate": 0.0005579114642451758,
"loss": 17.2377,
"step": 360
},
{
"epoch": 0.41555524357714446,
"grad_norm": 3.003167152404785,
"learning_rate": 0.0005565493757094211,
"loss": 17.0597,
"step": 370
},
{
"epoch": 0.42678646637652673,
"grad_norm": 2.51339054107666,
"learning_rate": 0.0005551872871736662,
"loss": 16.8974,
"step": 380
},
{
"epoch": 0.43801768917590905,
"grad_norm": 2.4287829399108887,
"learning_rate": 0.0005538251986379114,
"loss": 16.7779,
"step": 390
},
{
"epoch": 0.4492489119752913,
"grad_norm": 2.6313817501068115,
"learning_rate": 0.0005524631101021566,
"loss": 16.6392,
"step": 400
},
{
"epoch": 0.4604801347746736,
"grad_norm": 2.333446502685547,
"learning_rate": 0.0005511010215664018,
"loss": 16.523,
"step": 410
},
{
"epoch": 0.47171135757405586,
"grad_norm": 2.979137420654297,
"learning_rate": 0.0005497389330306469,
"loss": 16.4291,
"step": 420
},
{
"epoch": 0.4829425803734382,
"grad_norm": 2.530829668045044,
"learning_rate": 0.0005483768444948921,
"loss": 16.2809,
"step": 430
},
{
"epoch": 0.49417380317282045,
"grad_norm": 2.572939157485962,
"learning_rate": 0.0005470147559591373,
"loss": 16.1444,
"step": 440
},
{
"epoch": 0.5054050259722027,
"grad_norm": 2.281402349472046,
"learning_rate": 0.0005456526674233825,
"loss": 16.1148,
"step": 450
},
{
"epoch": 0.516636248771585,
"grad_norm": 2.429608106613159,
"learning_rate": 0.0005442905788876276,
"loss": 15.9973,
"step": 460
},
{
"epoch": 0.5278674715709673,
"grad_norm": 3.2124414443969727,
"learning_rate": 0.0005429284903518729,
"loss": 15.7978,
"step": 470
},
{
"epoch": 0.5390986943703495,
"grad_norm": 2.6288387775421143,
"learning_rate": 0.000541566401816118,
"loss": 15.7305,
"step": 480
},
{
"epoch": 0.5503299171697319,
"grad_norm": 2.923504114151001,
"learning_rate": 0.0005402043132803632,
"loss": 15.649,
"step": 490
},
{
"epoch": 0.5615611399691142,
"grad_norm": 2.313035249710083,
"learning_rate": 0.0005388422247446083,
"loss": 15.5597,
"step": 500
},
{
"epoch": 0.5727923627684964,
"grad_norm": 2.209913492202759,
"learning_rate": 0.0005374801362088536,
"loss": 15.4301,
"step": 510
},
{
"epoch": 0.5840235855678787,
"grad_norm": 2.5013203620910645,
"learning_rate": 0.0005361180476730987,
"loss": 15.4357,
"step": 520
},
{
"epoch": 0.595254808367261,
"grad_norm": 2.2074005603790283,
"learning_rate": 0.0005347559591373438,
"loss": 15.2997,
"step": 530
},
{
"epoch": 0.6064860311666432,
"grad_norm": 2.186513900756836,
"learning_rate": 0.000533393870601589,
"loss": 15.1875,
"step": 540
},
{
"epoch": 0.6177172539660255,
"grad_norm": 2.2040984630584717,
"learning_rate": 0.0005320317820658342,
"loss": 15.1292,
"step": 550
},
{
"epoch": 0.6289484767654079,
"grad_norm": 2.560344696044922,
"learning_rate": 0.0005306696935300794,
"loss": 15.0857,
"step": 560
},
{
"epoch": 0.6401796995647902,
"grad_norm": 2.844914436340332,
"learning_rate": 0.0005293076049943245,
"loss": 15.0146,
"step": 570
},
{
"epoch": 0.6514109223641724,
"grad_norm": 2.1173508167266846,
"learning_rate": 0.0005279455164585697,
"loss": 14.9086,
"step": 580
},
{
"epoch": 0.6626421451635547,
"grad_norm": 2.6725828647613525,
"learning_rate": 0.0005265834279228149,
"loss": 14.8688,
"step": 590
},
{
"epoch": 0.673873367962937,
"grad_norm": 2.2795088291168213,
"learning_rate": 0.0005252213393870601,
"loss": 14.7697,
"step": 600
},
{
"epoch": 0.6851045907623192,
"grad_norm": 2.5514473915100098,
"learning_rate": 0.0005238592508513052,
"loss": 14.7712,
"step": 610
},
{
"epoch": 0.6963358135617015,
"grad_norm": 2.7515954971313477,
"learning_rate": 0.0005224971623155505,
"loss": 14.6388,
"step": 620
},
{
"epoch": 0.7075670363610839,
"grad_norm": 2.690708637237549,
"learning_rate": 0.0005211350737797956,
"loss": 14.5995,
"step": 630
},
{
"epoch": 0.7187982591604661,
"grad_norm": 2.461747407913208,
"learning_rate": 0.0005197729852440408,
"loss": 14.6307,
"step": 640
},
{
"epoch": 0.7300294819598484,
"grad_norm": 2.346754550933838,
"learning_rate": 0.0005184108967082859,
"loss": 14.5505,
"step": 650
},
{
"epoch": 0.7412607047592307,
"grad_norm": 2.376952886581421,
"learning_rate": 0.0005170488081725312,
"loss": 14.4045,
"step": 660
},
{
"epoch": 0.7524919275586129,
"grad_norm": 2.421809673309326,
"learning_rate": 0.0005156867196367763,
"loss": 14.4317,
"step": 670
},
{
"epoch": 0.7637231503579952,
"grad_norm": 2.1633081436157227,
"learning_rate": 0.0005143246311010215,
"loss": 14.39,
"step": 680
},
{
"epoch": 0.7749543731573775,
"grad_norm": 2.3786990642547607,
"learning_rate": 0.0005129625425652666,
"loss": 14.3502,
"step": 690
},
{
"epoch": 0.7861855959567597,
"grad_norm": 2.113426685333252,
"learning_rate": 0.0005116004540295119,
"loss": 14.3406,
"step": 700
},
{
"epoch": 0.7974168187561421,
"grad_norm": 2.3934221267700195,
"learning_rate": 0.000510238365493757,
"loss": 14.1971,
"step": 710
},
{
"epoch": 0.8086480415555244,
"grad_norm": 2.2229974269866943,
"learning_rate": 0.0005088762769580022,
"loss": 14.1345,
"step": 720
},
{
"epoch": 0.8198792643549067,
"grad_norm": 2.368398427963257,
"learning_rate": 0.0005075141884222473,
"loss": 14.1697,
"step": 730
},
{
"epoch": 0.8311104871542889,
"grad_norm": 2.0480265617370605,
"learning_rate": 0.0005061520998864926,
"loss": 14.109,
"step": 740
},
{
"epoch": 0.8423417099536712,
"grad_norm": 2.5201494693756104,
"learning_rate": 0.0005047900113507377,
"loss": 14.0595,
"step": 750
},
{
"epoch": 0.8535729327530535,
"grad_norm": 2.2030587196350098,
"learning_rate": 0.0005034279228149829,
"loss": 14.0706,
"step": 760
},
{
"epoch": 0.8648041555524357,
"grad_norm": 2.3631434440612793,
"learning_rate": 0.0005020658342792281,
"loss": 13.9638,
"step": 770
},
{
"epoch": 0.8760353783518181,
"grad_norm": 2.200186252593994,
"learning_rate": 0.0005007037457434733,
"loss": 13.9289,
"step": 780
},
{
"epoch": 0.8872666011512004,
"grad_norm": 1.9684851169586182,
"learning_rate": 0.0004993416572077184,
"loss": 13.7841,
"step": 790
},
{
"epoch": 0.8984978239505826,
"grad_norm": 2.311785936355591,
"learning_rate": 0.0004979795686719636,
"loss": 13.8482,
"step": 800
},
{
"epoch": 0.9097290467499649,
"grad_norm": 2.169306755065918,
"learning_rate": 0.0004966174801362088,
"loss": 13.7793,
"step": 810
},
{
"epoch": 0.9209602695493472,
"grad_norm": 2.2637839317321777,
"learning_rate": 0.000495255391600454,
"loss": 13.8025,
"step": 820
},
{
"epoch": 0.9321914923487294,
"grad_norm": 2.4545624256134033,
"learning_rate": 0.0004938933030646991,
"loss": 13.7582,
"step": 830
},
{
"epoch": 0.9434227151481117,
"grad_norm": 1.94866144657135,
"learning_rate": 0.0004925312145289444,
"loss": 13.6945,
"step": 840
},
{
"epoch": 0.954653937947494,
"grad_norm": 2.5071635246276855,
"learning_rate": 0.0004911691259931895,
"loss": 13.7015,
"step": 850
},
{
"epoch": 0.9658851607468764,
"grad_norm": 2.1163992881774902,
"learning_rate": 0.0004898070374574347,
"loss": 13.6065,
"step": 860
},
{
"epoch": 0.9771163835462586,
"grad_norm": 2.0031838417053223,
"learning_rate": 0.0004884449489216798,
"loss": 13.6221,
"step": 870
},
{
"epoch": 0.9883476063456409,
"grad_norm": 1.9967576265335083,
"learning_rate": 0.00048708286038592506,
"loss": 13.5372,
"step": 880
},
{
"epoch": 0.9995788291450232,
"grad_norm": 1.9578522443771362,
"learning_rate": 0.0004857207718501702,
"loss": 13.479,
"step": 890
},
{
"epoch": 1.010108100519444,
"grad_norm": 2.088886022567749,
"learning_rate": 0.0004843586833144154,
"loss": 12.5586,
"step": 900
},
{
"epoch": 1.0213393233188264,
"grad_norm": 1.9549593925476074,
"learning_rate": 0.00048299659477866054,
"loss": 13.3791,
"step": 910
},
{
"epoch": 1.0325705461182086,
"grad_norm": 1.9387295246124268,
"learning_rate": 0.00048163450624290577,
"loss": 13.4606,
"step": 920
},
{
"epoch": 1.043801768917591,
"grad_norm": 1.9079780578613281,
"learning_rate": 0.0004802724177071509,
"loss": 13.3116,
"step": 930
},
{
"epoch": 1.0550329917169732,
"grad_norm": 2.125767230987549,
"learning_rate": 0.0004789103291713961,
"loss": 13.3492,
"step": 940
},
{
"epoch": 1.0662642145163554,
"grad_norm": 1.894142508506775,
"learning_rate": 0.00047754824063564124,
"loss": 13.3045,
"step": 950
},
{
"epoch": 1.0774954373157377,
"grad_norm": 2.1038496494293213,
"learning_rate": 0.0004761861520998865,
"loss": 13.2968,
"step": 960
},
{
"epoch": 1.08872666011512,
"grad_norm": 2.0055999755859375,
"learning_rate": 0.0004748240635641316,
"loss": 13.2548,
"step": 970
},
{
"epoch": 1.0999578829145022,
"grad_norm": 1.8585240840911865,
"learning_rate": 0.00047346197502837683,
"loss": 13.1873,
"step": 980
},
{
"epoch": 1.1111891057138845,
"grad_norm": 1.9325159788131714,
"learning_rate": 0.00047209988649262195,
"loss": 13.2134,
"step": 990
},
{
"epoch": 1.1224203285132668,
"grad_norm": 1.9729729890823364,
"learning_rate": 0.0004707377979568672,
"loss": 13.2012,
"step": 1000
},
{
"epoch": 1.1336515513126493,
"grad_norm": 1.8888603448867798,
"learning_rate": 0.0004693757094211123,
"loss": 13.1582,
"step": 1010
},
{
"epoch": 1.1448827741120315,
"grad_norm": 2.0408947467803955,
"learning_rate": 0.00046801362088535754,
"loss": 13.1612,
"step": 1020
},
{
"epoch": 1.1561139969114138,
"grad_norm": 1.85500168800354,
"learning_rate": 0.00046665153234960266,
"loss": 13.1639,
"step": 1030
},
{
"epoch": 1.167345219710796,
"grad_norm": 1.9741461277008057,
"learning_rate": 0.0004652894438138479,
"loss": 13.1511,
"step": 1040
},
{
"epoch": 1.1785764425101783,
"grad_norm": 1.9243193864822388,
"learning_rate": 0.000463927355278093,
"loss": 13.0421,
"step": 1050
},
{
"epoch": 1.1898076653095606,
"grad_norm": 1.907875657081604,
"learning_rate": 0.00046256526674233825,
"loss": 13.0895,
"step": 1060
},
{
"epoch": 1.2010388881089429,
"grad_norm": 2.068942070007324,
"learning_rate": 0.00046120317820658337,
"loss": 12.9636,
"step": 1070
},
{
"epoch": 1.2122701109083251,
"grad_norm": 1.7963610887527466,
"learning_rate": 0.0004598410896708286,
"loss": 13.0476,
"step": 1080
},
{
"epoch": 1.2235013337077074,
"grad_norm": 2.0973403453826904,
"learning_rate": 0.0004584790011350737,
"loss": 13.017,
"step": 1090
},
{
"epoch": 1.2347325565070897,
"grad_norm": 1.749234676361084,
"learning_rate": 0.00045711691259931895,
"loss": 12.9918,
"step": 1100
},
{
"epoch": 1.245963779306472,
"grad_norm": 2.021134853363037,
"learning_rate": 0.0004557548240635641,
"loss": 12.9339,
"step": 1110
},
{
"epoch": 1.2571950021058542,
"grad_norm": 1.840922236442566,
"learning_rate": 0.00045439273552780925,
"loss": 12.9677,
"step": 1120
},
{
"epoch": 1.2684262249052365,
"grad_norm": 1.932059645652771,
"learning_rate": 0.00045303064699205443,
"loss": 12.9547,
"step": 1130
},
{
"epoch": 1.279657447704619,
"grad_norm": 2.177213668823242,
"learning_rate": 0.0004516685584562996,
"loss": 12.8664,
"step": 1140
},
{
"epoch": 1.290888670504001,
"grad_norm": 2.070005178451538,
"learning_rate": 0.0004503064699205448,
"loss": 12.8585,
"step": 1150
},
{
"epoch": 1.3021198933033835,
"grad_norm": 1.832366943359375,
"learning_rate": 0.00044894438138478996,
"loss": 12.8719,
"step": 1160
},
{
"epoch": 1.3133511161027656,
"grad_norm": 1.9845136404037476,
"learning_rate": 0.00044758229284903514,
"loss": 12.8424,
"step": 1170
},
{
"epoch": 1.324582338902148,
"grad_norm": 1.8147252798080444,
"learning_rate": 0.0004462202043132803,
"loss": 12.83,
"step": 1180
},
{
"epoch": 1.3358135617015303,
"grad_norm": 2.107008695602417,
"learning_rate": 0.0004448581157775255,
"loss": 12.7765,
"step": 1190
},
{
"epoch": 1.3470447845009126,
"grad_norm": 1.7466599941253662,
"learning_rate": 0.00044349602724177067,
"loss": 12.7204,
"step": 1200
},
{
"epoch": 1.3582760073002949,
"grad_norm": 1.7873570919036865,
"learning_rate": 0.00044213393870601585,
"loss": 12.7782,
"step": 1210
},
{
"epoch": 1.3695072300996771,
"grad_norm": 2.079206943511963,
"learning_rate": 0.000440771850170261,
"loss": 12.7326,
"step": 1220
},
{
"epoch": 1.3807384528990594,
"grad_norm": 1.7911771535873413,
"learning_rate": 0.0004394097616345062,
"loss": 12.7422,
"step": 1230
},
{
"epoch": 1.3919696756984417,
"grad_norm": 1.901097297668457,
"learning_rate": 0.0004380476730987514,
"loss": 12.7185,
"step": 1240
},
{
"epoch": 1.403200898497824,
"grad_norm": 1.826768398284912,
"learning_rate": 0.00043668558456299656,
"loss": 12.7017,
"step": 1250
},
{
"epoch": 1.4144321212972062,
"grad_norm": 1.7345402240753174,
"learning_rate": 0.00043532349602724173,
"loss": 12.6704,
"step": 1260
},
{
"epoch": 1.4256633440965885,
"grad_norm": 1.9229341745376587,
"learning_rate": 0.0004339614074914869,
"loss": 12.683,
"step": 1270
},
{
"epoch": 1.4368945668959707,
"grad_norm": 1.6749757528305054,
"learning_rate": 0.0004325993189557321,
"loss": 12.6501,
"step": 1280
},
{
"epoch": 1.4481257896953532,
"grad_norm": 1.812778353691101,
"learning_rate": 0.00043123723041997727,
"loss": 12.6529,
"step": 1290
},
{
"epoch": 1.4593570124947353,
"grad_norm": 1.7584545612335205,
"learning_rate": 0.00042987514188422244,
"loss": 12.6065,
"step": 1300
},
{
"epoch": 1.4705882352941178,
"grad_norm": 1.7893540859222412,
"learning_rate": 0.0004285130533484676,
"loss": 12.6011,
"step": 1310
},
{
"epoch": 1.4818194580934998,
"grad_norm": 1.8022955656051636,
"learning_rate": 0.0004271509648127128,
"loss": 12.5883,
"step": 1320
},
{
"epoch": 1.4930506808928823,
"grad_norm": 2.1660964488983154,
"learning_rate": 0.0004257888762769579,
"loss": 12.578,
"step": 1330
},
{
"epoch": 1.5042819036922643,
"grad_norm": 2.0054919719696045,
"learning_rate": 0.00042442678774120315,
"loss": 12.4797,
"step": 1340
},
{
"epoch": 1.5155131264916468,
"grad_norm": 1.6194941997528076,
"learning_rate": 0.0004230646992054483,
"loss": 12.475,
"step": 1350
},
{
"epoch": 1.526744349291029,
"grad_norm": 1.7826400995254517,
"learning_rate": 0.0004217026106696935,
"loss": 12.4979,
"step": 1360
},
{
"epoch": 1.5379755720904114,
"grad_norm": 1.7084046602249146,
"learning_rate": 0.00042034052213393863,
"loss": 12.5252,
"step": 1370
},
{
"epoch": 1.5492067948897936,
"grad_norm": 1.7256839275360107,
"learning_rate": 0.00041897843359818386,
"loss": 12.4617,
"step": 1380
},
{
"epoch": 1.560438017689176,
"grad_norm": 1.8508225679397583,
"learning_rate": 0.000417616345062429,
"loss": 12.5004,
"step": 1390
},
{
"epoch": 1.5716692404885582,
"grad_norm": 1.6339643001556396,
"learning_rate": 0.0004162542565266742,
"loss": 12.4315,
"step": 1400
},
{
"epoch": 1.5829004632879404,
"grad_norm": 1.8081163167953491,
"learning_rate": 0.00041489216799091934,
"loss": 12.4165,
"step": 1410
},
{
"epoch": 1.594131686087323,
"grad_norm": 1.8224244117736816,
"learning_rate": 0.00041353007945516457,
"loss": 12.4136,
"step": 1420
},
{
"epoch": 1.605362908886705,
"grad_norm": 1.8123489618301392,
"learning_rate": 0.0004121679909194097,
"loss": 12.4247,
"step": 1430
},
{
"epoch": 1.6165941316860875,
"grad_norm": 2.0198097229003906,
"learning_rate": 0.0004108059023836549,
"loss": 12.4179,
"step": 1440
},
{
"epoch": 1.6278253544854695,
"grad_norm": 1.759125828742981,
"learning_rate": 0.00040944381384790005,
"loss": 12.393,
"step": 1450
},
{
"epoch": 1.639056577284852,
"grad_norm": 1.768356442451477,
"learning_rate": 0.0004080817253121453,
"loss": 12.3846,
"step": 1460
},
{
"epoch": 1.650287800084234,
"grad_norm": 1.7623116970062256,
"learning_rate": 0.0004067196367763904,
"loss": 12.3867,
"step": 1470
},
{
"epoch": 1.6615190228836165,
"grad_norm": 1.7670810222625732,
"learning_rate": 0.00040535754824063563,
"loss": 12.3861,
"step": 1480
},
{
"epoch": 1.6727502456829986,
"grad_norm": 1.7134902477264404,
"learning_rate": 0.00040399545970488075,
"loss": 12.3066,
"step": 1490
},
{
"epoch": 1.683981468482381,
"grad_norm": 1.7663081884384155,
"learning_rate": 0.000402633371169126,
"loss": 12.3453,
"step": 1500
},
{
"epoch": 1.6952126912817633,
"grad_norm": 2.0105090141296387,
"learning_rate": 0.0004012712826333711,
"loss": 12.3071,
"step": 1510
},
{
"epoch": 1.7064439140811456,
"grad_norm": 1.7803105115890503,
"learning_rate": 0.00039990919409761634,
"loss": 12.3537,
"step": 1520
},
{
"epoch": 1.7176751368805279,
"grad_norm": 1.676762342453003,
"learning_rate": 0.00039854710556186146,
"loss": 12.3329,
"step": 1530
},
{
"epoch": 1.7289063596799101,
"grad_norm": 1.678467035293579,
"learning_rate": 0.0003971850170261067,
"loss": 12.2744,
"step": 1540
},
{
"epoch": 1.7401375824792924,
"grad_norm": 1.5558372735977173,
"learning_rate": 0.0003958229284903518,
"loss": 12.2728,
"step": 1550
},
{
"epoch": 1.7513688052786747,
"grad_norm": 1.8932992219924927,
"learning_rate": 0.00039446083995459705,
"loss": 12.244,
"step": 1560
},
{
"epoch": 1.7626000280780572,
"grad_norm": 1.5952329635620117,
"learning_rate": 0.00039309875141884217,
"loss": 12.2526,
"step": 1570
},
{
"epoch": 1.7738312508774392,
"grad_norm": 1.7214975357055664,
"learning_rate": 0.0003917366628830874,
"loss": 12.2495,
"step": 1580
},
{
"epoch": 1.7850624736768217,
"grad_norm": 1.686092495918274,
"learning_rate": 0.0003903745743473325,
"loss": 12.2539,
"step": 1590
},
{
"epoch": 1.7962936964762037,
"grad_norm": 1.7045249938964844,
"learning_rate": 0.00038901248581157776,
"loss": 12.229,
"step": 1600
},
{
"epoch": 1.8075249192755862,
"grad_norm": 1.591776728630066,
"learning_rate": 0.0003876503972758229,
"loss": 12.2094,
"step": 1610
},
{
"epoch": 1.8187561420749683,
"grad_norm": 1.7864668369293213,
"learning_rate": 0.0003862883087400681,
"loss": 12.1677,
"step": 1620
},
{
"epoch": 1.8299873648743508,
"grad_norm": 1.6069233417510986,
"learning_rate": 0.00038492622020431323,
"loss": 12.2168,
"step": 1630
},
{
"epoch": 1.841218587673733,
"grad_norm": 1.5101128816604614,
"learning_rate": 0.00038356413166855846,
"loss": 12.2165,
"step": 1640
},
{
"epoch": 1.8524498104731153,
"grad_norm": 1.675972819328308,
"learning_rate": 0.0003822020431328036,
"loss": 12.1509,
"step": 1650
},
{
"epoch": 1.8636810332724976,
"grad_norm": 1.5653135776519775,
"learning_rate": 0.00038083995459704876,
"loss": 12.1905,
"step": 1660
},
{
"epoch": 1.8749122560718798,
"grad_norm": 1.6895561218261719,
"learning_rate": 0.00037947786606129394,
"loss": 12.1049,
"step": 1670
},
{
"epoch": 1.886143478871262,
"grad_norm": 1.6613564491271973,
"learning_rate": 0.0003781157775255391,
"loss": 12.1453,
"step": 1680
},
{
"epoch": 1.8973747016706444,
"grad_norm": 1.5011565685272217,
"learning_rate": 0.0003767536889897843,
"loss": 12.1299,
"step": 1690
},
{
"epoch": 1.9086059244700266,
"grad_norm": 1.6737213134765625,
"learning_rate": 0.00037539160045402947,
"loss": 12.1812,
"step": 1700
},
{
"epoch": 1.919837147269409,
"grad_norm": 1.6121188402175903,
"learning_rate": 0.00037402951191827465,
"loss": 12.1656,
"step": 1710
},
{
"epoch": 1.9310683700687914,
"grad_norm": 1.527930736541748,
"learning_rate": 0.0003726674233825198,
"loss": 12.0239,
"step": 1720
},
{
"epoch": 1.9422995928681734,
"grad_norm": 1.6483533382415771,
"learning_rate": 0.000371305334846765,
"loss": 12.1018,
"step": 1730
},
{
"epoch": 1.953530815667556,
"grad_norm": 1.6136051416397095,
"learning_rate": 0.0003699432463110102,
"loss": 12.0595,
"step": 1740
},
{
"epoch": 1.964762038466938,
"grad_norm": 1.7923524379730225,
"learning_rate": 0.00036858115777525536,
"loss": 12.0446,
"step": 1750
},
{
"epoch": 1.9759932612663205,
"grad_norm": 1.732296109199524,
"learning_rate": 0.00036721906923950054,
"loss": 12.1205,
"step": 1760
},
{
"epoch": 1.9872244840657025,
"grad_norm": 1.595475435256958,
"learning_rate": 0.0003658569807037457,
"loss": 12.0134,
"step": 1770
},
{
"epoch": 1.998455706865085,
"grad_norm": 1.620702862739563,
"learning_rate": 0.0003644948921679909,
"loss": 12.0375,
"step": 1780
},
{
"epoch": 2.0089849782395057,
"grad_norm": 1.9543509483337402,
"learning_rate": 0.00036313280363223607,
"loss": 11.248,
"step": 1790
},
{
"epoch": 2.020216201038888,
"grad_norm": 1.726154088973999,
"learning_rate": 0.00036177071509648124,
"loss": 11.8804,
"step": 1800
},
{
"epoch": 2.0314474238382703,
"grad_norm": 1.61319899559021,
"learning_rate": 0.0003604086265607264,
"loss": 11.8473,
"step": 1810
},
{
"epoch": 2.0426786466376528,
"grad_norm": 1.5679258108139038,
"learning_rate": 0.0003590465380249716,
"loss": 11.8905,
"step": 1820
},
{
"epoch": 2.053909869437035,
"grad_norm": 1.5252978801727295,
"learning_rate": 0.0003576844494892168,
"loss": 11.9025,
"step": 1830
},
{
"epoch": 2.0651410922364173,
"grad_norm": 1.866320252418518,
"learning_rate": 0.00035632236095346195,
"loss": 11.8789,
"step": 1840
},
{
"epoch": 2.0763723150357993,
"grad_norm": 1.656232476234436,
"learning_rate": 0.00035496027241770713,
"loss": 11.8793,
"step": 1850
},
{
"epoch": 2.087603537835182,
"grad_norm": 1.6428873538970947,
"learning_rate": 0.0003535981838819523,
"loss": 11.9116,
"step": 1860
},
{
"epoch": 2.0988347606345643,
"grad_norm": 1.5620014667510986,
"learning_rate": 0.00035223609534619743,
"loss": 11.7622,
"step": 1870
},
{
"epoch": 2.1100659834339464,
"grad_norm": 1.6203725337982178,
"learning_rate": 0.00035087400681044266,
"loss": 11.8528,
"step": 1880
},
{
"epoch": 2.121297206233329,
"grad_norm": 1.610343337059021,
"learning_rate": 0.0003495119182746878,
"loss": 11.8451,
"step": 1890
},
{
"epoch": 2.132528429032711,
"grad_norm": 1.7108615636825562,
"learning_rate": 0.000348149829738933,
"loss": 11.8686,
"step": 1900
},
{
"epoch": 2.1437596518320934,
"grad_norm": 1.6075197458267212,
"learning_rate": 0.00034678774120317814,
"loss": 11.8373,
"step": 1910
},
{
"epoch": 2.1549908746314754,
"grad_norm": 1.6820300817489624,
"learning_rate": 0.00034542565266742337,
"loss": 11.8662,
"step": 1920
},
{
"epoch": 2.166222097430858,
"grad_norm": 1.7435542345046997,
"learning_rate": 0.0003440635641316685,
"loss": 11.7815,
"step": 1930
},
{
"epoch": 2.17745332023024,
"grad_norm": 1.6679948568344116,
"learning_rate": 0.0003427014755959137,
"loss": 11.8364,
"step": 1940
},
{
"epoch": 2.1886845430296225,
"grad_norm": 1.6212760210037231,
"learning_rate": 0.00034133938706015885,
"loss": 11.7716,
"step": 1950
},
{
"epoch": 2.1999157658290045,
"grad_norm": 1.5176148414611816,
"learning_rate": 0.0003399772985244041,
"loss": 11.7938,
"step": 1960
},
{
"epoch": 2.211146988628387,
"grad_norm": 1.5583223104476929,
"learning_rate": 0.0003386152099886492,
"loss": 11.7711,
"step": 1970
},
{
"epoch": 2.222378211427769,
"grad_norm": 1.768452763557434,
"learning_rate": 0.00033725312145289443,
"loss": 11.816,
"step": 1980
},
{
"epoch": 2.2336094342271515,
"grad_norm": 1.5718501806259155,
"learning_rate": 0.00033589103291713955,
"loss": 11.8198,
"step": 1990
},
{
"epoch": 2.2448406570265336,
"grad_norm": 1.5841343402862549,
"learning_rate": 0.0003345289443813848,
"loss": 11.7875,
"step": 2000
},
{
"epoch": 2.256071879825916,
"grad_norm": 1.8200368881225586,
"learning_rate": 0.0003331668558456299,
"loss": 11.7819,
"step": 2010
},
{
"epoch": 2.2673031026252985,
"grad_norm": 1.549752116203308,
"learning_rate": 0.00033180476730987514,
"loss": 11.751,
"step": 2020
},
{
"epoch": 2.2785343254246806,
"grad_norm": 1.4725509881973267,
"learning_rate": 0.00033044267877412026,
"loss": 11.6703,
"step": 2030
},
{
"epoch": 2.289765548224063,
"grad_norm": 1.7251125574111938,
"learning_rate": 0.0003290805902383655,
"loss": 11.7971,
"step": 2040
},
{
"epoch": 2.300996771023445,
"grad_norm": 1.8287060260772705,
"learning_rate": 0.0003277185017026106,
"loss": 11.7656,
"step": 2050
},
{
"epoch": 2.3122279938228276,
"grad_norm": 1.6441352367401123,
"learning_rate": 0.00032635641316685585,
"loss": 11.7444,
"step": 2060
},
{
"epoch": 2.3234592166222097,
"grad_norm": 1.675087809562683,
"learning_rate": 0.00032499432463110097,
"loss": 11.7359,
"step": 2070
},
{
"epoch": 2.334690439421592,
"grad_norm": 1.5940701961517334,
"learning_rate": 0.0003236322360953462,
"loss": 11.7006,
"step": 2080
},
{
"epoch": 2.345921662220974,
"grad_norm": 1.616356372833252,
"learning_rate": 0.0003222701475595913,
"loss": 11.7267,
"step": 2090
},
{
"epoch": 2.3571528850203567,
"grad_norm": 1.6495846509933472,
"learning_rate": 0.00032090805902383656,
"loss": 11.6705,
"step": 2100
},
{
"epoch": 2.3683841078197387,
"grad_norm": 1.666407585144043,
"learning_rate": 0.0003195459704880817,
"loss": 11.7987,
"step": 2110
},
{
"epoch": 2.3796153306191212,
"grad_norm": 1.4373139142990112,
"learning_rate": 0.0003181838819523269,
"loss": 11.7374,
"step": 2120
},
{
"epoch": 2.3908465534185033,
"grad_norm": 1.5677517652511597,
"learning_rate": 0.00031682179341657203,
"loss": 11.7455,
"step": 2130
},
{
"epoch": 2.4020777762178858,
"grad_norm": 1.6688272953033447,
"learning_rate": 0.00031545970488081727,
"loss": 11.6604,
"step": 2140
},
{
"epoch": 2.413308999017268,
"grad_norm": 1.5364110469818115,
"learning_rate": 0.0003140976163450624,
"loss": 11.702,
"step": 2150
},
{
"epoch": 2.4245402218166503,
"grad_norm": 1.6387137174606323,
"learning_rate": 0.0003127355278093076,
"loss": 11.6858,
"step": 2160
},
{
"epoch": 2.435771444616033,
"grad_norm": 1.5507831573486328,
"learning_rate": 0.00031137343927355274,
"loss": 11.6422,
"step": 2170
},
{
"epoch": 2.447002667415415,
"grad_norm": 1.4561623334884644,
"learning_rate": 0.000310011350737798,
"loss": 11.6273,
"step": 2180
},
{
"epoch": 2.4582338902147973,
"grad_norm": 1.5597074031829834,
"learning_rate": 0.0003086492622020431,
"loss": 11.6417,
"step": 2190
},
{
"epoch": 2.4694651130141794,
"grad_norm": 1.5203100442886353,
"learning_rate": 0.0003072871736662883,
"loss": 11.6372,
"step": 2200
},
{
"epoch": 2.480696335813562,
"grad_norm": 1.50716233253479,
"learning_rate": 0.00030592508513053345,
"loss": 11.6546,
"step": 2210
},
{
"epoch": 2.491927558612944,
"grad_norm": 1.615830898284912,
"learning_rate": 0.00030456299659477863,
"loss": 11.6139,
"step": 2220
},
{
"epoch": 2.5031587814123264,
"grad_norm": 1.479765772819519,
"learning_rate": 0.0003032009080590238,
"loss": 11.6222,
"step": 2230
},
{
"epoch": 2.5143900042117084,
"grad_norm": 1.5284301042556763,
"learning_rate": 0.000301838819523269,
"loss": 11.6529,
"step": 2240
},
{
"epoch": 2.525621227011091,
"grad_norm": 1.5398808717727661,
"learning_rate": 0.00030047673098751416,
"loss": 11.651,
"step": 2250
},
{
"epoch": 2.536852449810473,
"grad_norm": 1.526878833770752,
"learning_rate": 0.00029911464245175934,
"loss": 11.5892,
"step": 2260
},
{
"epoch": 2.5480836726098555,
"grad_norm": 1.5637303590774536,
"learning_rate": 0.0002977525539160045,
"loss": 11.6498,
"step": 2270
},
{
"epoch": 2.559314895409238,
"grad_norm": 1.711832046508789,
"learning_rate": 0.0002963904653802497,
"loss": 11.551,
"step": 2280
},
{
"epoch": 2.57054611820862,
"grad_norm": 1.5033766031265259,
"learning_rate": 0.00029502837684449487,
"loss": 11.6127,
"step": 2290
},
{
"epoch": 2.581777341008002,
"grad_norm": 1.455689787864685,
"learning_rate": 0.00029366628830874004,
"loss": 11.5994,
"step": 2300
},
{
"epoch": 2.5930085638073845,
"grad_norm": 1.6426947116851807,
"learning_rate": 0.0002923041997729852,
"loss": 11.6214,
"step": 2310
},
{
"epoch": 2.604239786606767,
"grad_norm": 1.5379929542541504,
"learning_rate": 0.0002909421112372304,
"loss": 11.5727,
"step": 2320
},
{
"epoch": 2.615471009406149,
"grad_norm": 1.5243467092514038,
"learning_rate": 0.0002895800227014756,
"loss": 11.576,
"step": 2330
},
{
"epoch": 2.626702232205531,
"grad_norm": 1.562593936920166,
"learning_rate": 0.00028821793416572075,
"loss": 11.641,
"step": 2340
},
{
"epoch": 2.6379334550049136,
"grad_norm": 1.5288798809051514,
"learning_rate": 0.00028685584562996593,
"loss": 11.5637,
"step": 2350
},
{
"epoch": 2.649164677804296,
"grad_norm": 1.8018691539764404,
"learning_rate": 0.0002854937570942111,
"loss": 11.4962,
"step": 2360
},
{
"epoch": 2.660395900603678,
"grad_norm": 1.77366304397583,
"learning_rate": 0.0002841316685584563,
"loss": 11.5311,
"step": 2370
},
{
"epoch": 2.6716271234030606,
"grad_norm": 1.500279188156128,
"learning_rate": 0.00028276958002270146,
"loss": 11.5499,
"step": 2380
},
{
"epoch": 2.6828583462024427,
"grad_norm": 1.6834344863891602,
"learning_rate": 0.00028140749148694664,
"loss": 11.5187,
"step": 2390
},
{
"epoch": 2.694089569001825,
"grad_norm": 1.4962844848632812,
"learning_rate": 0.0002800454029511918,
"loss": 11.5505,
"step": 2400
},
{
"epoch": 2.705320791801207,
"grad_norm": 1.4639493227005005,
"learning_rate": 0.00027868331441543694,
"loss": 11.5381,
"step": 2410
},
{
"epoch": 2.7165520146005897,
"grad_norm": 1.5657707452774048,
"learning_rate": 0.0002773212258796821,
"loss": 11.5474,
"step": 2420
},
{
"epoch": 2.727783237399972,
"grad_norm": 1.5155985355377197,
"learning_rate": 0.0002759591373439273,
"loss": 11.5487,
"step": 2430
},
{
"epoch": 2.7390144601993542,
"grad_norm": 1.6336228847503662,
"learning_rate": 0.00027459704880817247,
"loss": 11.5191,
"step": 2440
},
{
"epoch": 2.7502456829987363,
"grad_norm": 1.4850879907608032,
"learning_rate": 0.00027323496027241765,
"loss": 11.4788,
"step": 2450
},
{
"epoch": 2.7614769057981188,
"grad_norm": 1.5942481756210327,
"learning_rate": 0.0002718728717366628,
"loss": 11.5141,
"step": 2460
},
{
"epoch": 2.7727081285975013,
"grad_norm": 1.5158549547195435,
"learning_rate": 0.000270510783200908,
"loss": 11.4485,
"step": 2470
},
{
"epoch": 2.7839393513968833,
"grad_norm": 1.6702396869659424,
"learning_rate": 0.0002691486946651532,
"loss": 11.5334,
"step": 2480
},
{
"epoch": 2.7951705741962654,
"grad_norm": 1.5319764614105225,
"learning_rate": 0.00026778660612939836,
"loss": 11.4425,
"step": 2490
},
{
"epoch": 2.806401796995648,
"grad_norm": 1.463173270225525,
"learning_rate": 0.00026642451759364353,
"loss": 11.4739,
"step": 2500
},
{
"epoch": 2.8176330197950303,
"grad_norm": 1.7062861919403076,
"learning_rate": 0.0002650624290578887,
"loss": 11.4848,
"step": 2510
},
{
"epoch": 2.8288642425944124,
"grad_norm": 1.5613621473312378,
"learning_rate": 0.0002637003405221339,
"loss": 11.5269,
"step": 2520
},
{
"epoch": 2.840095465393795,
"grad_norm": 1.6942760944366455,
"learning_rate": 0.00026233825198637906,
"loss": 11.5173,
"step": 2530
},
{
"epoch": 2.851326688193177,
"grad_norm": 1.4133695363998413,
"learning_rate": 0.00026097616345062424,
"loss": 11.4846,
"step": 2540
},
{
"epoch": 2.8625579109925594,
"grad_norm": 1.4533722400665283,
"learning_rate": 0.0002596140749148694,
"loss": 11.5119,
"step": 2550
},
{
"epoch": 2.8737891337919415,
"grad_norm": 1.407906174659729,
"learning_rate": 0.0002582519863791146,
"loss": 11.4717,
"step": 2560
},
{
"epoch": 2.885020356591324,
"grad_norm": 1.4731615781784058,
"learning_rate": 0.00025688989784335977,
"loss": 11.434,
"step": 2570
},
{
"epoch": 2.8962515793907064,
"grad_norm": 1.702810287475586,
"learning_rate": 0.00025552780930760495,
"loss": 11.4225,
"step": 2580
},
{
"epoch": 2.9074828021900885,
"grad_norm": 1.5021584033966064,
"learning_rate": 0.0002541657207718501,
"loss": 11.5014,
"step": 2590
},
{
"epoch": 2.9187140249894705,
"grad_norm": 1.52582848072052,
"learning_rate": 0.0002528036322360953,
"loss": 11.5105,
"step": 2600
},
{
"epoch": 2.929945247788853,
"grad_norm": 1.552182912826538,
"learning_rate": 0.0002514415437003405,
"loss": 11.3703,
"step": 2610
},
{
"epoch": 2.9411764705882355,
"grad_norm": 1.5947136878967285,
"learning_rate": 0.00025007945516458566,
"loss": 11.4672,
"step": 2620
},
{
"epoch": 2.9524076933876175,
"grad_norm": 1.451735496520996,
"learning_rate": 0.00024871736662883083,
"loss": 11.3825,
"step": 2630
},
{
"epoch": 2.9636389161869996,
"grad_norm": 1.5744386911392212,
"learning_rate": 0.000247355278093076,
"loss": 11.3662,
"step": 2640
},
{
"epoch": 2.974870138986382,
"grad_norm": 1.52872633934021,
"learning_rate": 0.0002459931895573212,
"loss": 11.3914,
"step": 2650
},
{
"epoch": 2.9861013617857646,
"grad_norm": 1.5340864658355713,
"learning_rate": 0.00024463110102156637,
"loss": 11.3522,
"step": 2660
},
{
"epoch": 2.9973325845851466,
"grad_norm": 1.5574575662612915,
"learning_rate": 0.00024326901248581154,
"loss": 11.3726,
"step": 2670
},
{
"epoch": 3.007861855959568,
"grad_norm": 1.4836827516555786,
"learning_rate": 0.00024190692395005672,
"loss": 10.6008,
"step": 2680
},
{
"epoch": 3.01909307875895,
"grad_norm": 1.4343681335449219,
"learning_rate": 0.0002405448354143019,
"loss": 11.2479,
"step": 2690
},
{
"epoch": 3.0303243015583323,
"grad_norm": 1.5032552480697632,
"learning_rate": 0.00023918274687854707,
"loss": 11.2719,
"step": 2700
},
{
"epoch": 3.0415555243577144,
"grad_norm": 1.6779489517211914,
"learning_rate": 0.00023782065834279225,
"loss": 11.2446,
"step": 2710
},
{
"epoch": 3.052786747157097,
"grad_norm": 1.5352118015289307,
"learning_rate": 0.00023645856980703743,
"loss": 11.2898,
"step": 2720
},
{
"epoch": 3.064017969956479,
"grad_norm": 1.4970052242279053,
"learning_rate": 0.0002350964812712826,
"loss": 11.2933,
"step": 2730
},
{
"epoch": 3.0752491927558614,
"grad_norm": 1.4887489080429077,
"learning_rate": 0.00023373439273552778,
"loss": 11.2404,
"step": 2740
},
{
"epoch": 3.0864804155552434,
"grad_norm": 1.4684367179870605,
"learning_rate": 0.00023237230419977296,
"loss": 11.251,
"step": 2750
},
{
"epoch": 3.097711638354626,
"grad_norm": 1.5003316402435303,
"learning_rate": 0.00023101021566401814,
"loss": 11.2742,
"step": 2760
},
{
"epoch": 3.108942861154008,
"grad_norm": 1.5525890588760376,
"learning_rate": 0.00022964812712826331,
"loss": 11.2605,
"step": 2770
},
{
"epoch": 3.1201740839533905,
"grad_norm": 1.5025476217269897,
"learning_rate": 0.0002282860385925085,
"loss": 11.2734,
"step": 2780
},
{
"epoch": 3.1314053067527725,
"grad_norm": 1.5809471607208252,
"learning_rate": 0.00022692395005675367,
"loss": 11.2549,
"step": 2790
},
{
"epoch": 3.142636529552155,
"grad_norm": 1.6025090217590332,
"learning_rate": 0.00022556186152099885,
"loss": 11.2199,
"step": 2800
},
{
"epoch": 3.1538677523515375,
"grad_norm": 1.5650960206985474,
"learning_rate": 0.000224199772985244,
"loss": 11.2323,
"step": 2810
},
{
"epoch": 3.1650989751509195,
"grad_norm": 1.5035734176635742,
"learning_rate": 0.00022283768444948917,
"loss": 11.2694,
"step": 2820
},
{
"epoch": 3.176330197950302,
"grad_norm": 1.5358186960220337,
"learning_rate": 0.00022147559591373435,
"loss": 11.229,
"step": 2830
},
{
"epoch": 3.187561420749684,
"grad_norm": 1.5240596532821655,
"learning_rate": 0.00022011350737797953,
"loss": 11.2474,
"step": 2840
},
{
"epoch": 3.1987926435490666,
"grad_norm": 1.4946931600570679,
"learning_rate": 0.0002187514188422247,
"loss": 11.3082,
"step": 2850
},
{
"epoch": 3.2100238663484486,
"grad_norm": 1.5998846292495728,
"learning_rate": 0.00021738933030646988,
"loss": 11.1555,
"step": 2860
},
{
"epoch": 3.221255089147831,
"grad_norm": 1.527463436126709,
"learning_rate": 0.00021602724177071506,
"loss": 11.2718,
"step": 2870
},
{
"epoch": 3.232486311947213,
"grad_norm": 1.4795818328857422,
"learning_rate": 0.00021466515323496024,
"loss": 11.1977,
"step": 2880
},
{
"epoch": 3.2437175347465956,
"grad_norm": 1.4611546993255615,
"learning_rate": 0.0002133030646992054,
"loss": 11.2381,
"step": 2890
},
{
"epoch": 3.2549487575459777,
"grad_norm": 1.5139747858047485,
"learning_rate": 0.0002119409761634506,
"loss": 11.197,
"step": 2900
},
{
"epoch": 3.26617998034536,
"grad_norm": 1.6371299028396606,
"learning_rate": 0.00021057888762769577,
"loss": 11.264,
"step": 2910
},
{
"epoch": 3.277411203144742,
"grad_norm": 1.4956345558166504,
"learning_rate": 0.00020921679909194094,
"loss": 11.2839,
"step": 2920
},
{
"epoch": 3.2886424259441247,
"grad_norm": 1.6143215894699097,
"learning_rate": 0.00020785471055618612,
"loss": 11.2053,
"step": 2930
},
{
"epoch": 3.2998736487435067,
"grad_norm": 1.6429616212844849,
"learning_rate": 0.0002064926220204313,
"loss": 11.1887,
"step": 2940
},
{
"epoch": 3.3111048715428892,
"grad_norm": 1.50111985206604,
"learning_rate": 0.00020513053348467647,
"loss": 11.2054,
"step": 2950
},
{
"epoch": 3.3223360943422717,
"grad_norm": 1.4351112842559814,
"learning_rate": 0.00020376844494892165,
"loss": 11.2221,
"step": 2960
},
{
"epoch": 3.3335673171416538,
"grad_norm": 1.4160490036010742,
"learning_rate": 0.00020240635641316683,
"loss": 11.2436,
"step": 2970
},
{
"epoch": 3.3447985399410363,
"grad_norm": 1.5771642923355103,
"learning_rate": 0.000201044267877412,
"loss": 11.1831,
"step": 2980
},
{
"epoch": 3.3560297627404183,
"grad_norm": 1.5144894123077393,
"learning_rate": 0.00019968217934165718,
"loss": 11.1581,
"step": 2990
},
{
"epoch": 3.367260985539801,
"grad_norm": 1.456102967262268,
"learning_rate": 0.00019832009080590236,
"loss": 11.1989,
"step": 3000
},
{
"epoch": 3.378492208339183,
"grad_norm": 1.446110486984253,
"learning_rate": 0.00019695800227014754,
"loss": 11.1712,
"step": 3010
},
{
"epoch": 3.3897234311385653,
"grad_norm": 1.4775264263153076,
"learning_rate": 0.00019559591373439271,
"loss": 11.1628,
"step": 3020
},
{
"epoch": 3.4009546539379474,
"grad_norm": 1.4535621404647827,
"learning_rate": 0.0001942338251986379,
"loss": 11.092,
"step": 3030
},
{
"epoch": 3.41218587673733,
"grad_norm": 1.5154165029525757,
"learning_rate": 0.00019287173666288307,
"loss": 11.1513,
"step": 3040
},
{
"epoch": 3.423417099536712,
"grad_norm": 1.476539134979248,
"learning_rate": 0.00019150964812712825,
"loss": 11.2126,
"step": 3050
},
{
"epoch": 3.4346483223360944,
"grad_norm": 1.4598060846328735,
"learning_rate": 0.00019014755959137342,
"loss": 11.1632,
"step": 3060
},
{
"epoch": 3.4458795451354765,
"grad_norm": 1.4783436059951782,
"learning_rate": 0.0001887854710556186,
"loss": 11.1238,
"step": 3070
},
{
"epoch": 3.457110767934859,
"grad_norm": 1.545137643814087,
"learning_rate": 0.00018742338251986375,
"loss": 11.1628,
"step": 3080
},
{
"epoch": 3.468341990734241,
"grad_norm": 1.4730881452560425,
"learning_rate": 0.00018606129398410893,
"loss": 11.1644,
"step": 3090
},
{
"epoch": 3.4795732135336235,
"grad_norm": 1.48605215549469,
"learning_rate": 0.0001846992054483541,
"loss": 11.1871,
"step": 3100
},
{
"epoch": 3.490804436333006,
"grad_norm": 1.4472525119781494,
"learning_rate": 0.00018333711691259928,
"loss": 11.207,
"step": 3110
},
{
"epoch": 3.502035659132388,
"grad_norm": 1.3672376871109009,
"learning_rate": 0.00018197502837684446,
"loss": 11.1861,
"step": 3120
},
{
"epoch": 3.51326688193177,
"grad_norm": 1.5072065591812134,
"learning_rate": 0.00018061293984108964,
"loss": 11.1347,
"step": 3130
},
{
"epoch": 3.5244981047311525,
"grad_norm": 1.5243936777114868,
"learning_rate": 0.0001792508513053348,
"loss": 11.1585,
"step": 3140
},
{
"epoch": 3.535729327530535,
"grad_norm": 1.5613657236099243,
"learning_rate": 0.00017788876276958,
"loss": 11.2059,
"step": 3150
},
{
"epoch": 3.546960550329917,
"grad_norm": 1.4457815885543823,
"learning_rate": 0.00017652667423382517,
"loss": 11.0973,
"step": 3160
},
{
"epoch": 3.5581917731292996,
"grad_norm": 1.3698965311050415,
"learning_rate": 0.00017516458569807034,
"loss": 11.0931,
"step": 3170
},
{
"epoch": 3.5694229959286816,
"grad_norm": 1.5461801290512085,
"learning_rate": 0.00017380249716231552,
"loss": 11.1319,
"step": 3180
},
{
"epoch": 3.580654218728064,
"grad_norm": 1.5740439891815186,
"learning_rate": 0.0001724404086265607,
"loss": 11.2028,
"step": 3190
},
{
"epoch": 3.591885441527446,
"grad_norm": 1.3753931522369385,
"learning_rate": 0.00017107832009080588,
"loss": 11.1389,
"step": 3200
},
{
"epoch": 3.6031166643268286,
"grad_norm": 1.5603852272033691,
"learning_rate": 0.00016971623155505105,
"loss": 11.1247,
"step": 3210
},
{
"epoch": 3.614347887126211,
"grad_norm": 1.5574864149093628,
"learning_rate": 0.00016835414301929623,
"loss": 11.1183,
"step": 3220
},
{
"epoch": 3.625579109925593,
"grad_norm": 1.51617431640625,
"learning_rate": 0.0001669920544835414,
"loss": 11.1271,
"step": 3230
},
{
"epoch": 3.6368103327249752,
"grad_norm": 1.517800211906433,
"learning_rate": 0.00016562996594778658,
"loss": 11.1425,
"step": 3240
},
{
"epoch": 3.6480415555243577,
"grad_norm": 1.5872033834457397,
"learning_rate": 0.00016426787741203176,
"loss": 11.1319,
"step": 3250
},
{
"epoch": 3.65927277832374,
"grad_norm": 1.4323076009750366,
"learning_rate": 0.00016290578887627694,
"loss": 11.0476,
"step": 3260
},
{
"epoch": 3.6705040011231223,
"grad_norm": 1.4947340488433838,
"learning_rate": 0.00016154370034052211,
"loss": 11.0651,
"step": 3270
},
{
"epoch": 3.6817352239225043,
"grad_norm": 1.4352633953094482,
"learning_rate": 0.0001601816118047673,
"loss": 11.0978,
"step": 3280
},
{
"epoch": 3.692966446721887,
"grad_norm": 1.6209876537322998,
"learning_rate": 0.00015881952326901247,
"loss": 11.0699,
"step": 3290
},
{
"epoch": 3.7041976695212693,
"grad_norm": 1.4265753030776978,
"learning_rate": 0.00015745743473325765,
"loss": 11.0495,
"step": 3300
},
{
"epoch": 3.7154288923206513,
"grad_norm": 1.5374298095703125,
"learning_rate": 0.00015609534619750282,
"loss": 11.0698,
"step": 3310
},
{
"epoch": 3.726660115120034,
"grad_norm": 1.5547951459884644,
"learning_rate": 0.000154733257661748,
"loss": 11.105,
"step": 3320
},
{
"epoch": 3.737891337919416,
"grad_norm": 1.4317471981048584,
"learning_rate": 0.00015337116912599318,
"loss": 11.0474,
"step": 3330
},
{
"epoch": 3.7491225607187983,
"grad_norm": 1.4616270065307617,
"learning_rate": 0.00015200908059023835,
"loss": 11.0472,
"step": 3340
},
{
"epoch": 3.7603537835181804,
"grad_norm": 1.4159753322601318,
"learning_rate": 0.0001506469920544835,
"loss": 11.0644,
"step": 3350
},
{
"epoch": 3.771585006317563,
"grad_norm": 1.5320748090744019,
"learning_rate": 0.0001492849035187287,
"loss": 11.092,
"step": 3360
},
{
"epoch": 3.7828162291169454,
"grad_norm": 1.4254796504974365,
"learning_rate": 0.00014792281498297389,
"loss": 11.0266,
"step": 3370
},
{
"epoch": 3.7940474519163274,
"grad_norm": 1.5224483013153076,
"learning_rate": 0.00014656072644721906,
"loss": 11.0699,
"step": 3380
},
{
"epoch": 3.8052786747157095,
"grad_norm": 1.3961817026138306,
"learning_rate": 0.00014519863791146424,
"loss": 11.0638,
"step": 3390
},
{
"epoch": 3.816509897515092,
"grad_norm": 1.4628331661224365,
"learning_rate": 0.00014383654937570942,
"loss": 11.0998,
"step": 3400
},
{
"epoch": 3.8277411203144744,
"grad_norm": 1.469596266746521,
"learning_rate": 0.0001424744608399546,
"loss": 11.0611,
"step": 3410
},
{
"epoch": 3.8389723431138565,
"grad_norm": 1.4373725652694702,
"learning_rate": 0.00014111237230419977,
"loss": 11.0567,
"step": 3420
},
{
"epoch": 3.8502035659132385,
"grad_norm": 1.4011608362197876,
"learning_rate": 0.00013975028376844495,
"loss": 11.0241,
"step": 3430
},
{
"epoch": 3.861434788712621,
"grad_norm": 1.5255424976348877,
"learning_rate": 0.00013838819523269013,
"loss": 11.023,
"step": 3440
},
{
"epoch": 3.8726660115120035,
"grad_norm": 1.4086155891418457,
"learning_rate": 0.0001370261066969353,
"loss": 11.0914,
"step": 3450
},
{
"epoch": 3.8838972343113856,
"grad_norm": 1.4729487895965576,
"learning_rate": 0.00013566401816118048,
"loss": 11.0527,
"step": 3460
},
{
"epoch": 3.895128457110768,
"grad_norm": 1.4167112112045288,
"learning_rate": 0.00013430192962542566,
"loss": 11.0646,
"step": 3470
},
{
"epoch": 3.90635967991015,
"grad_norm": 1.4228618144989014,
"learning_rate": 0.00013293984108967083,
"loss": 11.0896,
"step": 3480
},
{
"epoch": 3.9175909027095326,
"grad_norm": 1.3969998359680176,
"learning_rate": 0.000131577752553916,
"loss": 11.0844,
"step": 3490
},
{
"epoch": 3.9288221255089146,
"grad_norm": 1.3993597030639648,
"learning_rate": 0.0001302156640181612,
"loss": 11.0194,
"step": 3500
},
{
"epoch": 3.940053348308297,
"grad_norm": 1.4945032596588135,
"learning_rate": 0.00012885357548240634,
"loss": 11.0404,
"step": 3510
},
{
"epoch": 3.9512845711076796,
"grad_norm": 1.4512568712234497,
"learning_rate": 0.00012749148694665152,
"loss": 11.0251,
"step": 3520
},
{
"epoch": 3.9625157939070617,
"grad_norm": 1.4356528520584106,
"learning_rate": 0.0001261293984108967,
"loss": 11.0682,
"step": 3530
},
{
"epoch": 3.9737470167064437,
"grad_norm": 1.4650399684906006,
"learning_rate": 0.00012476730987514187,
"loss": 11.042,
"step": 3540
},
{
"epoch": 3.984978239505826,
"grad_norm": 1.482191562652588,
"learning_rate": 0.00012340522133938705,
"loss": 11.0361,
"step": 3550
},
{
"epoch": 3.9962094623052087,
"grad_norm": 1.4656471014022827,
"learning_rate": 0.00012204313280363224,
"loss": 10.9494,
"step": 3560
},
{
"epoch": 4.006738733679629,
"grad_norm": 1.5250879526138306,
"learning_rate": 0.00012068104426787741,
"loss": 10.3207,
"step": 3570
},
{
"epoch": 4.0179699564790115,
"grad_norm": 1.4742364883422852,
"learning_rate": 0.00011931895573212258,
"loss": 10.9206,
"step": 3580
},
{
"epoch": 4.0292011792783935,
"grad_norm": 1.4390878677368164,
"learning_rate": 0.00011795686719636776,
"loss": 10.9298,
"step": 3590
},
{
"epoch": 4.040432402077776,
"grad_norm": 1.4271754026412964,
"learning_rate": 0.00011659477866061293,
"loss": 10.9401,
"step": 3600
},
{
"epoch": 4.0516636248771585,
"grad_norm": 1.4713624715805054,
"learning_rate": 0.00011523269012485811,
"loss": 10.9334,
"step": 3610
},
{
"epoch": 4.0628948476765405,
"grad_norm": 1.416982650756836,
"learning_rate": 0.00011387060158910329,
"loss": 10.9586,
"step": 3620
},
{
"epoch": 4.0741260704759235,
"grad_norm": 1.4600762128829956,
"learning_rate": 0.00011250851305334846,
"loss": 10.8739,
"step": 3630
},
{
"epoch": 4.0853572932753055,
"grad_norm": 1.3993488550186157,
"learning_rate": 0.00011114642451759364,
"loss": 10.9006,
"step": 3640
},
{
"epoch": 4.0965885160746875,
"grad_norm": 1.3790336847305298,
"learning_rate": 0.00010978433598183882,
"loss": 10.9069,
"step": 3650
},
{
"epoch": 4.10781973887407,
"grad_norm": 1.4517656564712524,
"learning_rate": 0.000108422247446084,
"loss": 10.8724,
"step": 3660
},
{
"epoch": 4.1190509616734525,
"grad_norm": 1.5568796396255493,
"learning_rate": 0.00010706015891032917,
"loss": 10.8934,
"step": 3670
},
{
"epoch": 4.130282184472835,
"grad_norm": 1.4685585498809814,
"learning_rate": 0.00010569807037457435,
"loss": 10.899,
"step": 3680
},
{
"epoch": 4.141513407272217,
"grad_norm": 1.4848984479904175,
"learning_rate": 0.00010433598183881953,
"loss": 10.8616,
"step": 3690
},
{
"epoch": 4.152744630071599,
"grad_norm": 1.4731574058532715,
"learning_rate": 0.0001029738933030647,
"loss": 10.8857,
"step": 3700
},
{
"epoch": 4.163975852870982,
"grad_norm": 1.447550892829895,
"learning_rate": 0.00010161180476730987,
"loss": 10.8534,
"step": 3710
},
{
"epoch": 4.175207075670364,
"grad_norm": 1.4320396184921265,
"learning_rate": 0.00010024971623155504,
"loss": 10.8808,
"step": 3720
},
{
"epoch": 4.186438298469746,
"grad_norm": 1.4548975229263306,
"learning_rate": 9.888762769580022e-05,
"loss": 10.8875,
"step": 3730
},
{
"epoch": 4.197669521269129,
"grad_norm": 1.405381441116333,
"learning_rate": 9.75255391600454e-05,
"loss": 10.8869,
"step": 3740
},
{
"epoch": 4.208900744068511,
"grad_norm": 1.394189476966858,
"learning_rate": 9.616345062429058e-05,
"loss": 10.8952,
"step": 3750
},
{
"epoch": 4.220131966867893,
"grad_norm": 1.3617000579833984,
"learning_rate": 9.480136208853575e-05,
"loss": 10.8854,
"step": 3760
},
{
"epoch": 4.231363189667275,
"grad_norm": 1.4552937746047974,
"learning_rate": 9.343927355278093e-05,
"loss": 10.9343,
"step": 3770
},
{
"epoch": 4.242594412466658,
"grad_norm": 1.3947829008102417,
"learning_rate": 9.20771850170261e-05,
"loss": 10.8487,
"step": 3780
},
{
"epoch": 4.25382563526604,
"grad_norm": 1.4310104846954346,
"learning_rate": 9.071509648127128e-05,
"loss": 10.8277,
"step": 3790
},
{
"epoch": 4.265056858065422,
"grad_norm": 1.4186252355575562,
"learning_rate": 8.935300794551646e-05,
"loss": 10.9158,
"step": 3800
},
{
"epoch": 4.276288080864804,
"grad_norm": 1.3875808715820312,
"learning_rate": 8.799091940976164e-05,
"loss": 10.9288,
"step": 3810
},
{
"epoch": 4.287519303664187,
"grad_norm": 1.3773916959762573,
"learning_rate": 8.662883087400681e-05,
"loss": 10.8744,
"step": 3820
},
{
"epoch": 4.298750526463569,
"grad_norm": 1.5134872198104858,
"learning_rate": 8.526674233825199e-05,
"loss": 10.8303,
"step": 3830
},
{
"epoch": 4.309981749262951,
"grad_norm": 1.525930643081665,
"learning_rate": 8.390465380249717e-05,
"loss": 10.8502,
"step": 3840
},
{
"epoch": 4.321212972062333,
"grad_norm": 1.4034548997879028,
"learning_rate": 8.254256526674233e-05,
"loss": 10.8712,
"step": 3850
},
{
"epoch": 4.332444194861716,
"grad_norm": 1.431321144104004,
"learning_rate": 8.118047673098751e-05,
"loss": 10.9037,
"step": 3860
},
{
"epoch": 4.343675417661098,
"grad_norm": 1.4377447366714478,
"learning_rate": 7.981838819523269e-05,
"loss": 10.9046,
"step": 3870
},
{
"epoch": 4.35490664046048,
"grad_norm": 1.4167426824569702,
"learning_rate": 7.845629965947786e-05,
"loss": 10.8576,
"step": 3880
},
{
"epoch": 4.366137863259862,
"grad_norm": 1.3977469205856323,
"learning_rate": 7.709421112372304e-05,
"loss": 10.8286,
"step": 3890
},
{
"epoch": 4.377369086059245,
"grad_norm": 1.4152209758758545,
"learning_rate": 7.573212258796822e-05,
"loss": 10.9304,
"step": 3900
},
{
"epoch": 4.388600308858627,
"grad_norm": 1.4083077907562256,
"learning_rate": 7.437003405221338e-05,
"loss": 10.8174,
"step": 3910
},
{
"epoch": 4.399831531658009,
"grad_norm": 1.3758295774459839,
"learning_rate": 7.300794551645856e-05,
"loss": 10.8231,
"step": 3920
},
{
"epoch": 4.411062754457392,
"grad_norm": 1.3821783065795898,
"learning_rate": 7.164585698070374e-05,
"loss": 10.8213,
"step": 3930
},
{
"epoch": 4.422293977256774,
"grad_norm": 1.4114271402359009,
"learning_rate": 7.028376844494891e-05,
"loss": 10.8755,
"step": 3940
},
{
"epoch": 4.433525200056156,
"grad_norm": 1.474861741065979,
"learning_rate": 6.892167990919409e-05,
"loss": 10.8719,
"step": 3950
},
{
"epoch": 4.444756422855538,
"grad_norm": 1.455072283744812,
"learning_rate": 6.755959137343927e-05,
"loss": 10.8429,
"step": 3960
},
{
"epoch": 4.455987645654921,
"grad_norm": 1.4050872325897217,
"learning_rate": 6.619750283768444e-05,
"loss": 10.8245,
"step": 3970
},
{
"epoch": 4.467218868454303,
"grad_norm": 1.415117621421814,
"learning_rate": 6.483541430192962e-05,
"loss": 10.8377,
"step": 3980
},
{
"epoch": 4.478450091253685,
"grad_norm": 1.4472249746322632,
"learning_rate": 6.34733257661748e-05,
"loss": 10.8068,
"step": 3990
},
{
"epoch": 4.489681314053067,
"grad_norm": 1.411483645439148,
"learning_rate": 6.211123723041998e-05,
"loss": 10.8779,
"step": 4000
},
{
"epoch": 4.50091253685245,
"grad_norm": 1.3772433996200562,
"learning_rate": 6.0749148694665146e-05,
"loss": 10.8594,
"step": 4010
},
{
"epoch": 4.512143759651832,
"grad_norm": 1.4007872343063354,
"learning_rate": 5.938706015891032e-05,
"loss": 10.8399,
"step": 4020
},
{
"epoch": 4.523374982451214,
"grad_norm": 1.3770571947097778,
"learning_rate": 5.80249716231555e-05,
"loss": 10.7955,
"step": 4030
},
{
"epoch": 4.534606205250597,
"grad_norm": 1.4038739204406738,
"learning_rate": 5.666288308740068e-05,
"loss": 10.8186,
"step": 4040
},
{
"epoch": 4.545837428049979,
"grad_norm": 1.3469901084899902,
"learning_rate": 5.5300794551645854e-05,
"loss": 10.7957,
"step": 4050
},
{
"epoch": 4.557068650849361,
"grad_norm": 1.358163833618164,
"learning_rate": 5.393870601589103e-05,
"loss": 10.8953,
"step": 4060
},
{
"epoch": 4.568299873648743,
"grad_norm": 1.360987901687622,
"learning_rate": 5.25766174801362e-05,
"loss": 10.8231,
"step": 4070
},
{
"epoch": 4.579531096448126,
"grad_norm": 1.3678861856460571,
"learning_rate": 5.121452894438138e-05,
"loss": 10.8246,
"step": 4080
},
{
"epoch": 4.590762319247508,
"grad_norm": 1.3450872898101807,
"learning_rate": 4.9852440408626556e-05,
"loss": 10.8162,
"step": 4090
},
{
"epoch": 4.60199354204689,
"grad_norm": 1.371072769165039,
"learning_rate": 4.849035187287173e-05,
"loss": 10.7912,
"step": 4100
},
{
"epoch": 4.613224764846272,
"grad_norm": 1.3714163303375244,
"learning_rate": 4.712826333711691e-05,
"loss": 10.7708,
"step": 4110
},
{
"epoch": 4.624455987645655,
"grad_norm": 1.371570348739624,
"learning_rate": 4.576617480136209e-05,
"loss": 10.7985,
"step": 4120
},
{
"epoch": 4.635687210445037,
"grad_norm": 1.4231752157211304,
"learning_rate": 4.440408626560726e-05,
"loss": 10.827,
"step": 4130
},
{
"epoch": 4.646918433244419,
"grad_norm": 1.3889615535736084,
"learning_rate": 4.3041997729852435e-05,
"loss": 10.837,
"step": 4140
},
{
"epoch": 4.658149656043802,
"grad_norm": 1.3703919649124146,
"learning_rate": 4.167990919409761e-05,
"loss": 10.8179,
"step": 4150
},
{
"epoch": 4.669380878843184,
"grad_norm": 1.3489488363265991,
"learning_rate": 4.031782065834279e-05,
"loss": 10.8593,
"step": 4160
},
{
"epoch": 4.680612101642566,
"grad_norm": 1.3783739805221558,
"learning_rate": 3.8955732122587966e-05,
"loss": 10.7853,
"step": 4170
},
{
"epoch": 4.691843324441948,
"grad_norm": 1.3344446420669556,
"learning_rate": 3.759364358683314e-05,
"loss": 10.8021,
"step": 4180
},
{
"epoch": 4.7030745472413304,
"grad_norm": 1.3575279712677002,
"learning_rate": 3.623155505107832e-05,
"loss": 10.7695,
"step": 4190
},
{
"epoch": 4.714305770040713,
"grad_norm": 1.3288753032684326,
"learning_rate": 3.486946651532349e-05,
"loss": 10.8142,
"step": 4200
},
{
"epoch": 4.725536992840095,
"grad_norm": 1.3467031717300415,
"learning_rate": 3.350737797956867e-05,
"loss": 10.809,
"step": 4210
},
{
"epoch": 4.7367682156394775,
"grad_norm": 1.3722003698349,
"learning_rate": 3.2145289443813845e-05,
"loss": 10.7404,
"step": 4220
},
{
"epoch": 4.74799943843886,
"grad_norm": 1.3655105829238892,
"learning_rate": 3.078320090805902e-05,
"loss": 10.8864,
"step": 4230
},
{
"epoch": 4.7592306612382425,
"grad_norm": 1.3268227577209473,
"learning_rate": 2.94211123723042e-05,
"loss": 10.7649,
"step": 4240
},
{
"epoch": 4.7704618840376245,
"grad_norm": 1.3199375867843628,
"learning_rate": 2.8059023836549373e-05,
"loss": 10.7571,
"step": 4250
},
{
"epoch": 4.7816931068370065,
"grad_norm": 1.3348361253738403,
"learning_rate": 2.669693530079455e-05,
"loss": 10.7514,
"step": 4260
},
{
"epoch": 4.7929243296363895,
"grad_norm": 1.3305360078811646,
"learning_rate": 2.5334846765039727e-05,
"loss": 10.7446,
"step": 4270
},
{
"epoch": 4.8041555524357715,
"grad_norm": 1.322024941444397,
"learning_rate": 2.3972758229284904e-05,
"loss": 10.816,
"step": 4280
},
{
"epoch": 4.815386775235154,
"grad_norm": 1.3302651643753052,
"learning_rate": 2.2610669693530078e-05,
"loss": 10.7709,
"step": 4290
},
{
"epoch": 4.826617998034536,
"grad_norm": 1.3159739971160889,
"learning_rate": 2.1248581157775255e-05,
"loss": 10.7594,
"step": 4300
},
{
"epoch": 4.8378492208339186,
"grad_norm": 1.3260287046432495,
"learning_rate": 1.9886492622020432e-05,
"loss": 10.8124,
"step": 4310
},
{
"epoch": 4.849080443633301,
"grad_norm": 1.3304709196090698,
"learning_rate": 1.8524404086265606e-05,
"loss": 10.8415,
"step": 4320
},
{
"epoch": 4.860311666432683,
"grad_norm": 1.342634916305542,
"learning_rate": 1.716231555051078e-05,
"loss": 10.7881,
"step": 4330
},
{
"epoch": 4.871542889232066,
"grad_norm": 1.333287000656128,
"learning_rate": 1.5800227014755956e-05,
"loss": 10.757,
"step": 4340
},
{
"epoch": 4.882774112031448,
"grad_norm": 1.3149112462997437,
"learning_rate": 1.4438138479001134e-05,
"loss": 10.773,
"step": 4350
},
{
"epoch": 4.89400533483083,
"grad_norm": 1.306639552116394,
"learning_rate": 1.3076049943246309e-05,
"loss": 10.781,
"step": 4360
},
{
"epoch": 4.905236557630212,
"grad_norm": 1.3133597373962402,
"learning_rate": 1.1713961407491484e-05,
"loss": 10.7741,
"step": 4370
},
{
"epoch": 4.916467780429595,
"grad_norm": 1.3073337078094482,
"learning_rate": 1.0351872871736661e-05,
"loss": 10.8127,
"step": 4380
},
{
"epoch": 4.927699003228977,
"grad_norm": 1.3013139963150024,
"learning_rate": 8.989784335981839e-06,
"loss": 10.7386,
"step": 4390
},
{
"epoch": 4.938930226028359,
"grad_norm": 1.2934496402740479,
"learning_rate": 7.627695800227015e-06,
"loss": 10.7845,
"step": 4400
},
{
"epoch": 4.950161448827741,
"grad_norm": 1.3016905784606934,
"learning_rate": 6.26560726447219e-06,
"loss": 10.7222,
"step": 4410
},
{
"epoch": 4.961392671627124,
"grad_norm": 1.2927377223968506,
"learning_rate": 4.9035187287173665e-06,
"loss": 10.7998,
"step": 4420
},
{
"epoch": 4.972623894426506,
"grad_norm": 1.277617335319519,
"learning_rate": 3.5414301929625423e-06,
"loss": 10.8386,
"step": 4430
},
{
"epoch": 4.983855117225888,
"grad_norm": 1.2841224670410156,
"learning_rate": 2.179341657207718e-06,
"loss": 10.7737,
"step": 4440
},
{
"epoch": 4.995086340025271,
"grad_norm": 1.2789523601531982,
"learning_rate": 8.172531214528943e-07,
"loss": 10.7885,
"step": 4450
}
],
"logging_steps": 10,
"max_steps": 4455,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.8215997838065664e+17,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}