naturavision / trainer_state.json
kiolPL's picture
Upload NaturaVision QLoRA adapter
28fac09
{
"best_global_step": 1359,
"best_metric": 0.05789753,
"best_model_checkpoint": "/home/kiol/runs-v2-full-clean-r1/qwen35-qlora-forest/v0-20260424-170332/checkpoint-1359",
"epoch": 3.0,
"eval_steps": 200,
"global_step": 1359,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002210708117443869,
"grad_norm": 2.3902769088745117,
"learning_rate": 2.173913043478261e-06,
"loss": 0.5411382913589478,
"step": 1,
"token_acc": 0.8289156626506025
},
{
"epoch": 0.011053540587219343,
"grad_norm": 2.6440868377685547,
"learning_rate": 1.0869565217391305e-05,
"loss": 0.5509852766990662,
"step": 5,
"token_acc": 0.8427947598253275
},
{
"epoch": 0.022107081174438686,
"grad_norm": 2.8517158031463623,
"learning_rate": 2.173913043478261e-05,
"loss": 0.5423528671264648,
"step": 10,
"token_acc": 0.8413485374318295
},
{
"epoch": 0.03316062176165803,
"grad_norm": 3.078794479370117,
"learning_rate": 3.260869565217392e-05,
"loss": 0.4979440689086914,
"step": 15,
"token_acc": 0.8463063956370848
},
{
"epoch": 0.04421416234887737,
"grad_norm": 3.6568212509155273,
"learning_rate": 4.347826086956522e-05,
"loss": 0.4140318393707275,
"step": 20,
"token_acc": 0.8631159780985564
},
{
"epoch": 0.055267702936096716,
"grad_norm": 2.4268248081207275,
"learning_rate": 4.9997331144187255e-05,
"loss": 0.292206335067749,
"step": 25,
"token_acc": 0.9111776447105788
},
{
"epoch": 0.06632124352331606,
"grad_norm": 3.583557605743408,
"learning_rate": 4.996731305997416e-05,
"loss": 0.2570572137832642,
"step": 30,
"token_acc": 0.9214214214214215
},
{
"epoch": 0.0773747841105354,
"grad_norm": 3.932553768157959,
"learning_rate": 4.990398100856367e-05,
"loss": 0.20562427043914794,
"step": 35,
"token_acc": 0.9317507418397626
},
{
"epoch": 0.08842832469775475,
"grad_norm": 2.240527391433716,
"learning_rate": 4.980741949411839e-05,
"loss": 0.21206333637237548,
"step": 40,
"token_acc": 0.932
},
{
"epoch": 0.09948186528497409,
"grad_norm": 2.624908447265625,
"learning_rate": 4.967775735898179e-05,
"loss": 0.21093401908874512,
"step": 45,
"token_acc": 0.9330024813895782
},
{
"epoch": 0.11053540587219343,
"grad_norm": 1.4830845594406128,
"learning_rate": 4.9515167611763434e-05,
"loss": 0.1953817367553711,
"step": 50,
"token_acc": 0.9338308457711443
},
{
"epoch": 0.12158894645941278,
"grad_norm": 2.913989305496216,
"learning_rate": 4.931986719649299e-05,
"loss": 0.16938740015029907,
"step": 55,
"token_acc": 0.9406234537357744
},
{
"epoch": 0.13264248704663212,
"grad_norm": 2.571464776992798,
"learning_rate": 4.909211670315114e-05,
"loss": 0.18647342920303345,
"step": 60,
"token_acc": 0.9361914257228315
},
{
"epoch": 0.14369602763385148,
"grad_norm": 2.3652737140655518,
"learning_rate": 4.8832220019963514e-05,
"loss": 0.16005475521087648,
"step": 65,
"token_acc": 0.9426719840478565
},
{
"epoch": 0.1547495682210708,
"grad_norm": 2.1498990058898926,
"learning_rate": 4.8540523927921616e-05,
"loss": 0.17245489358901978,
"step": 70,
"token_acc": 0.9344672336168084
},
{
"epoch": 0.16580310880829016,
"grad_norm": 2.2657954692840576,
"learning_rate": 4.821741763807186e-05,
"loss": 0.1709640145301819,
"step": 75,
"token_acc": 0.9368473396320238
},
{
"epoch": 0.1768566493955095,
"grad_norm": 2.3392601013183594,
"learning_rate": 4.786333227218995e-05,
"loss": 0.16215839385986328,
"step": 80,
"token_acc": 0.9388667992047713
},
{
"epoch": 0.18791018998272885,
"grad_norm": 3.259490489959717,
"learning_rate": 4.747874028753375e-05,
"loss": 0.16813175678253173,
"step": 85,
"token_acc": 0.9411177644710579
},
{
"epoch": 0.19896373056994818,
"grad_norm": 2.805907726287842,
"learning_rate": 4.706415484644195e-05,
"loss": 0.16006848812103272,
"step": 90,
"token_acc": 0.944666001994018
},
{
"epoch": 0.21001727115716753,
"grad_norm": 2.46066951751709,
"learning_rate": 4.662012913161997e-05,
"loss": 0.15026205778121948,
"step": 95,
"token_acc": 0.9417910447761194
},
{
"epoch": 0.22107081174438686,
"grad_norm": 2.5938236713409424,
"learning_rate": 4.6147255608026394e-05,
"loss": 0.15484832525253295,
"step": 100,
"token_acc": 0.9400099157164105
},
{
"epoch": 0.23212435233160622,
"grad_norm": 2.2375521659851074,
"learning_rate": 4.564616523234511e-05,
"loss": 0.16213221549987794,
"step": 105,
"token_acc": 0.9409704852426213
},
{
"epoch": 0.24317789291882555,
"grad_norm": 2.779315233230591,
"learning_rate": 4.511752661109768e-05,
"loss": 0.15692013502120972,
"step": 110,
"token_acc": 0.9458519622454049
},
{
"epoch": 0.2542314335060449,
"grad_norm": 2.144108295440674,
"learning_rate": 4.4562045108519565e-05,
"loss": 0.1405424118041992,
"step": 115,
"token_acc": 0.9512437810945273
},
{
"epoch": 0.26528497409326424,
"grad_norm": 2.1729953289031982,
"learning_rate": 4.398046190539025e-05,
"loss": 0.13323441743850709,
"step": 120,
"token_acc": 0.9441953163926258
},
{
"epoch": 0.2763385146804836,
"grad_norm": 2.5072147846221924,
"learning_rate": 4.3373553010073355e-05,
"loss": 0.1478448271751404,
"step": 125,
"token_acc": 0.9433681073025335
},
{
"epoch": 0.28739205526770295,
"grad_norm": 3.7264955043792725,
"learning_rate": 4.274212822308612e-05,
"loss": 0.13619284629821776,
"step": 130,
"token_acc": 0.9491778774289985
},
{
"epoch": 0.29844559585492225,
"grad_norm": 3.988476276397705,
"learning_rate": 4.208703005657999e-05,
"loss": 0.1341521382331848,
"step": 135,
"token_acc": 0.9512922465208747
},
{
"epoch": 0.3094991364421416,
"grad_norm": 3.7429981231689453,
"learning_rate": 4.140913261017382e-05,
"loss": 0.13466038703918456,
"step": 140,
"token_acc": 0.9512074913750616
},
{
"epoch": 0.32055267702936097,
"grad_norm": 2.5042636394500732,
"learning_rate": 4.070934040463998e-05,
"loss": 0.13430129289627074,
"step": 145,
"token_acc": 0.9491525423728814
},
{
"epoch": 0.3316062176165803,
"grad_norm": 3.5047497749328613,
"learning_rate": 3.998858717499931e-05,
"loss": 0.1267208695411682,
"step": 150,
"token_acc": 0.9575636545182227
},
{
"epoch": 0.3426597582037997,
"grad_norm": 3.953479528427124,
"learning_rate": 3.924783462463541e-05,
"loss": 0.11603262424468994,
"step": 155,
"token_acc": 0.9582297364495276
},
{
"epoch": 0.353713298791019,
"grad_norm": 4.1243791580200195,
"learning_rate": 3.848807114209074e-05,
"loss": 0.1214677095413208,
"step": 160,
"token_acc": 0.9600591715976331
},
{
"epoch": 0.36476683937823834,
"grad_norm": 4.218201160430908,
"learning_rate": 3.7710310482256526e-05,
"loss": 0.12736471891403198,
"step": 165,
"token_acc": 0.9536390827517448
},
{
"epoch": 0.3758203799654577,
"grad_norm": 5.623844623565674,
"learning_rate": 3.691559041371631e-05,
"loss": 0.11791330575942993,
"step": 170,
"token_acc": 0.9556109725685785
},
{
"epoch": 0.38687392055267705,
"grad_norm": 2.831035852432251,
"learning_rate": 3.6104971334047956e-05,
"loss": 0.10760444402694702,
"step": 175,
"token_acc": 0.9595461272816971
},
{
"epoch": 0.39792746113989635,
"grad_norm": 3.888122797012329,
"learning_rate": 3.527953485493168e-05,
"loss": 0.11927105188369751,
"step": 180,
"token_acc": 0.956737941322725
},
{
"epoch": 0.4089810017271157,
"grad_norm": 3.259575128555298,
"learning_rate": 3.444038235895212e-05,
"loss": 0.12609773874282837,
"step": 185,
"token_acc": 0.955
},
{
"epoch": 0.42003454231433507,
"grad_norm": 4.910781383514404,
"learning_rate": 3.358863353001987e-05,
"loss": 0.12669839859008789,
"step": 190,
"token_acc": 0.9573412698412699
},
{
"epoch": 0.4310880829015544,
"grad_norm": 2.9438295364379883,
"learning_rate": 3.272542485937369e-05,
"loss": 0.11617603302001953,
"step": 195,
"token_acc": 0.958958958958959
},
{
"epoch": 0.4421416234887737,
"grad_norm": 5.1633195877075195,
"learning_rate": 3.185190812915646e-05,
"loss": 0.10707591772079468,
"step": 200,
"token_acc": 0.959479739869935
},
{
"epoch": 0.4421416234887737,
"eval_loss": 0.11586810648441315,
"eval_runtime": 1992.0002,
"eval_samples_per_second": 1.178,
"eval_steps_per_second": 1.178,
"eval_token_acc": 0.9600378980137381,
"step": 200
},
{
"epoch": 0.4531951640759931,
"grad_norm": 4.371393203735352,
"learning_rate": 3.096924887558855e-05,
"loss": 0.11527643203735352,
"step": 205,
"token_acc": 0.9623202776400595
},
{
"epoch": 0.46424870466321244,
"grad_norm": 5.050312519073486,
"learning_rate": 3.007862483378906e-05,
"loss": 0.12530215978622436,
"step": 210,
"token_acc": 0.9602583209140586
},
{
"epoch": 0.4753022452504318,
"grad_norm": 2.541715145111084,
"learning_rate": 2.9181224366319947e-05,
"loss": 0.10488080978393555,
"step": 215,
"token_acc": 0.9631474103585658
},
{
"epoch": 0.4863557858376511,
"grad_norm": 5.020773410797119,
"learning_rate": 2.827824487755007e-05,
"loss": 0.13031703233718872,
"step": 220,
"token_acc": 0.9523809523809523
},
{
"epoch": 0.49740932642487046,
"grad_norm": 5.226169586181641,
"learning_rate": 2.7370891215954568e-05,
"loss": 0.13065972328186035,
"step": 225,
"token_acc": 0.9522150323544052
},
{
"epoch": 0.5084628670120898,
"grad_norm": 3.837796211242676,
"learning_rate": 2.646037406648165e-05,
"loss": 0.11477760076522828,
"step": 230,
"token_acc": 0.9575848303393214
},
{
"epoch": 0.5195164075993092,
"grad_norm": 3.0600061416625977,
"learning_rate": 2.5547908335131704e-05,
"loss": 0.1025763750076294,
"step": 235,
"token_acc": 0.9629446640316206
},
{
"epoch": 0.5305699481865285,
"grad_norm": 4.423151016235352,
"learning_rate": 2.4634711527904272e-05,
"loss": 0.11243470907211303,
"step": 240,
"token_acc": 0.9575212393803099
},
{
"epoch": 0.5416234887737479,
"grad_norm": 3.2377634048461914,
"learning_rate": 2.3722002126275824e-05,
"loss": 0.11171900033950806,
"step": 245,
"token_acc": 0.9635182408795602
},
{
"epoch": 0.5526770293609672,
"grad_norm": 3.354611396789551,
"learning_rate": 2.281099796137594e-05,
"loss": 0.10156643390655518,
"step": 250,
"token_acc": 0.965965965965966
},
{
"epoch": 0.5637305699481865,
"grad_norm": 4.366595268249512,
"learning_rate": 2.19029145890313e-05,
"loss": 0.09984329342842102,
"step": 255,
"token_acc": 0.9646061814556331
},
{
"epoch": 0.5747841105354059,
"grad_norm": 4.832893371582031,
"learning_rate": 2.0998963667845535e-05,
"loss": 0.08920307159423828,
"step": 260,
"token_acc": 0.9660847880299251
},
{
"epoch": 0.5858376511226252,
"grad_norm": 10.402474403381348,
"learning_rate": 2.0100351342479216e-05,
"loss": 0.10913920402526855,
"step": 265,
"token_acc": 0.9623389494549058
},
{
"epoch": 0.5968911917098445,
"grad_norm": 5.610108852386475,
"learning_rate": 1.9208276634287143e-05,
"loss": 0.11966934204101562,
"step": 270,
"token_acc": 0.9542060726729716
},
{
"epoch": 0.6079447322970639,
"grad_norm": 5.160745143890381,
"learning_rate": 1.832392984146018e-05,
"loss": 0.12343072891235352,
"step": 275,
"token_acc": 0.9561752988047809
},
{
"epoch": 0.6189982728842832,
"grad_norm": 4.599337100982666,
"learning_rate": 1.7448490950806552e-05,
"loss": 0.09566409587860107,
"step": 280,
"token_acc": 0.9677579365079365
},
{
"epoch": 0.6300518134715026,
"grad_norm": 4.375885486602783,
"learning_rate": 1.6583128063291576e-05,
"loss": 0.12313523292541503,
"step": 285,
"token_acc": 0.9544328875681031
},
{
"epoch": 0.6411053540587219,
"grad_norm": 6.371300220489502,
"learning_rate": 1.572899583543671e-05,
"loss": 0.10101137161254883,
"step": 290,
"token_acc": 0.96250616674889
},
{
"epoch": 0.6521588946459412,
"grad_norm": 4.738733291625977,
"learning_rate": 1.488723393865766e-05,
"loss": 0.10386931896209717,
"step": 295,
"token_acc": 0.963681592039801
},
{
"epoch": 0.6632124352331606,
"grad_norm": 6.841185569763184,
"learning_rate": 1.4058965538597033e-05,
"loss": 0.1303364634513855,
"step": 300,
"token_acc": 0.9556772908366534
},
{
"epoch": 0.67426597582038,
"grad_norm": 14.118032455444336,
"learning_rate": 1.3245295796480789e-05,
"loss": 0.12113407850265503,
"step": 305,
"token_acc": 0.9616342800199302
},
{
"epoch": 0.6853195164075994,
"grad_norm": 3.676441192626953,
"learning_rate": 1.2447310394498019e-05,
"loss": 0.09884743690490723,
"step": 310,
"token_acc": 0.9665518937530743
},
{
"epoch": 0.6963730569948187,
"grad_norm": 5.252943992614746,
"learning_rate": 1.1666074087171627e-05,
"loss": 0.1010090470314026,
"step": 315,
"token_acc": 0.9651741293532339
},
{
"epoch": 0.707426597582038,
"grad_norm": 4.592480182647705,
"learning_rate": 1.0902629280652931e-05,
"loss": 0.10196793079376221,
"step": 320,
"token_acc": 0.9611166500498505
},
{
"epoch": 0.7184801381692574,
"grad_norm": 6.885665416717529,
"learning_rate": 1.0157994641835736e-05,
"loss": 0.11955760717391968,
"step": 325,
"token_acc": 0.95773247140726
},
{
"epoch": 0.7295336787564767,
"grad_norm": 5.868223667144775,
"learning_rate": 9.433163739145773e-06,
"loss": 0.09359505772590637,
"step": 330,
"token_acc": 0.9641434262948207
},
{
"epoch": 0.740587219343696,
"grad_norm": 6.838636875152588,
"learning_rate": 8.729103716819112e-06,
"loss": 0.09510601162910462,
"step": 335,
"token_acc": 0.9697870232788509
},
{
"epoch": 0.7516407599309154,
"grad_norm": 5.597217559814453,
"learning_rate": 8.046754004438429e-06,
"loss": 0.0947553813457489,
"step": 340,
"token_acc": 0.9647992067426872
},
{
"epoch": 0.7626943005181347,
"grad_norm": 4.449385643005371,
"learning_rate": 7.387025063449082e-06,
"loss": 0.11492215394973755,
"step": 345,
"token_acc": 0.9548834903321765
},
{
"epoch": 0.7737478411053541,
"grad_norm": 5.535495758056641,
"learning_rate": 6.750797172327442e-06,
"loss": 0.10709909200668336,
"step": 350,
"token_acc": 0.9646590343454455
},
{
"epoch": 0.7848013816925734,
"grad_norm": 3.9693377017974854,
"learning_rate": 6.138919252022435e-06,
"loss": 0.09377566576004029,
"step": 355,
"token_acc": 0.963220675944334
},
{
"epoch": 0.7958549222797927,
"grad_norm": 4.245842456817627,
"learning_rate": 5.5522077332375436e-06,
"loss": 0.11761529445648193,
"step": 360,
"token_acc": 0.9603371343579573
},
{
"epoch": 0.8069084628670121,
"grad_norm": 5.86805534362793,
"learning_rate": 4.99144546706469e-06,
"loss": 0.11018631458282471,
"step": 365,
"token_acc": 0.9619000494804553
},
{
"epoch": 0.8179620034542314,
"grad_norm": 3.946427583694458,
"learning_rate": 4.457380680423434e-06,
"loss": 0.09712615013122558,
"step": 370,
"token_acc": 0.9691695673794132
},
{
"epoch": 0.8290155440414507,
"grad_norm": 3.8146259784698486,
"learning_rate": 3.950725977699396e-06,
"loss": 0.10309855937957764,
"step": 375,
"token_acc": 0.9674837418709354
},
{
"epoch": 0.8400690846286701,
"grad_norm": 6.905686855316162,
"learning_rate": 3.4721573899138743e-06,
"loss": 0.12204140424728394,
"step": 380,
"token_acc": 0.9587064676616915
},
{
"epoch": 0.8511226252158894,
"grad_norm": 5.100115776062012,
"learning_rate": 3.0223134726934472e-06,
"loss": 0.10104206800460816,
"step": 385,
"token_acc": 0.9645885286783042
},
{
"epoch": 0.8621761658031089,
"grad_norm": 6.091329574584961,
"learning_rate": 2.6017944542431393e-06,
"loss": 0.09742544889450074,
"step": 390,
"token_acc": 0.9626307922272048
},
{
"epoch": 0.8732297063903282,
"grad_norm": 5.460122585296631,
"learning_rate": 2.2111614344599683e-06,
"loss": 0.10580202341079711,
"step": 395,
"token_acc": 0.9581673306772909
},
{
"epoch": 0.8842832469775475,
"grad_norm": 3.7578561305999756,
"learning_rate": 1.8509356362554963e-06,
"loss": 0.09627346396446228,
"step": 400,
"token_acc": 0.9677898909811695
},
{
"epoch": 0.8842832469775475,
"eval_loss": 0.10459936410188675,
"eval_runtime": 2136.0319,
"eval_samples_per_second": 1.099,
"eval_steps_per_second": 1.099,
"eval_token_acc": 0.9643014245592664,
"step": 400
},
{
"epoch": 0.8953367875647669,
"grad_norm": 4.357903480529785,
"learning_rate": 1.5215977100864392e-06,
"loss": 0.11347759962081909,
"step": 405,
"token_acc": 0.9596814335490294
},
{
"epoch": 0.9063903281519862,
"grad_norm": 2.9337828159332275,
"learning_rate": 1.2235870926211619e-06,
"loss": 0.11455904245376587,
"step": 410,
"token_acc": 0.9613095238095238
},
{
"epoch": 0.9174438687392056,
"grad_norm": 2.6067817211151123,
"learning_rate": 9.573014203979242e-07,
"loss": 0.0897371768951416,
"step": 415,
"token_acc": 0.9702970297029703
},
{
"epoch": 0.9284974093264249,
"grad_norm": 4.809932708740234,
"learning_rate": 7.230959992571368e-07,
"loss": 0.10621033906936646,
"step": 420,
"token_acc": 0.9666001994017946
},
{
"epoch": 0.9395509499136442,
"grad_norm": 6.890130996704102,
"learning_rate": 5.212833302556258e-07,
"loss": 0.10602649450302123,
"step": 425,
"token_acc": 0.9638076351016361
},
{
"epoch": 0.9506044905008636,
"grad_norm": 4.578585147857666,
"learning_rate": 3.521326926954532e-07,
"loss": 0.08783534765243531,
"step": 430,
"token_acc": 0.9670822942643391
},
{
"epoch": 0.9616580310880829,
"grad_norm": 4.462143898010254,
"learning_rate": 2.158697848236607e-07,
"loss": 0.08878173232078553,
"step": 435,
"token_acc": 0.9683950617283951
},
{
"epoch": 0.9727115716753022,
"grad_norm": 3.5091371536254883,
"learning_rate": 1.1267642268238121e-07,
"loss": 0.11377729177474975,
"step": 440,
"token_acc": 0.9592647789369101
},
{
"epoch": 0.9837651122625216,
"grad_norm": 5.187692165374756,
"learning_rate": 4.26902975110749e-08,
"loss": 0.08824495673179626,
"step": 445,
"token_acc": 0.9681750372948782
},
{
"epoch": 0.9948186528497409,
"grad_norm": 4.813872337341309,
"learning_rate": 6.004792024680295e-09,
"loss": 0.12380951642990112,
"step": 450,
"token_acc": 0.9557213930348258
},
{
"epoch": 1.0,
"eval_loss": 0.10443862527608871,
"eval_runtime": 2140.8222,
"eval_samples_per_second": 1.096,
"eval_steps_per_second": 1.096,
"eval_token_acc": 0.9642337495347342,
"step": 453
},
{
"epoch": 1.0044214162348877,
"grad_norm": 2.683554172515869,
"learning_rate": 2.9411764705882355e-06,
"loss": 0.08228109031915665,
"step": 455,
"token_acc": 0.9726708074534162
},
{
"epoch": 1.015474956822107,
"grad_norm": 3.1228041648864746,
"learning_rate": 1.0294117647058824e-05,
"loss": 0.08381627202033996,
"step": 460,
"token_acc": 0.9706905116741182
},
{
"epoch": 1.0265284974093265,
"grad_norm": 4.306940078735352,
"learning_rate": 1.7647058823529414e-05,
"loss": 0.09802506566047668,
"step": 465,
"token_acc": 0.9675810473815462
},
{
"epoch": 1.0375820379965457,
"grad_norm": 3.2758865356445312,
"learning_rate": 2.5e-05,
"loss": 0.08836065530776978,
"step": 470,
"token_acc": 0.9721531576330183
},
{
"epoch": 1.048635578583765,
"grad_norm": 5.335510730743408,
"learning_rate": 3.235294117647059e-05,
"loss": 0.09924425482749939,
"step": 475,
"token_acc": 0.963220675944334
},
{
"epoch": 1.0596891191709845,
"grad_norm": 6.677077770233154,
"learning_rate": 3.970588235294117e-05,
"loss": 0.09869781732559205,
"step": 480,
"token_acc": 0.9626307922272048
},
{
"epoch": 1.0707426597582037,
"grad_norm": 7.235245227813721,
"learning_rate": 4.705882352941177e-05,
"loss": 0.09256232976913452,
"step": 485,
"token_acc": 0.967869500741473
},
{
"epoch": 1.0817962003454231,
"grad_norm": 2.617497682571411,
"learning_rate": 5.441176470588235e-05,
"loss": 0.11165302991867065,
"step": 490,
"token_acc": 0.9609804902451226
},
{
"epoch": 1.0928497409326425,
"grad_norm": 4.969885349273682,
"learning_rate": 6.176470588235295e-05,
"loss": 0.10181330442428589,
"step": 495,
"token_acc": 0.9626865671641791
},
{
"epoch": 1.103903281519862,
"grad_norm": 3.402235746383667,
"learning_rate": 6.911764705882354e-05,
"loss": 0.09162335991859435,
"step": 500,
"token_acc": 0.9699062654168722
},
{
"epoch": 1.1149568221070811,
"grad_norm": 3.815495729446411,
"learning_rate": 7.647058823529411e-05,
"loss": 0.11382390260696411,
"step": 505,
"token_acc": 0.954228855721393
},
{
"epoch": 1.1260103626943005,
"grad_norm": 4.899413108825684,
"learning_rate": 8.382352941176471e-05,
"loss": 0.1054335355758667,
"step": 510,
"token_acc": 0.9592850049652433
},
{
"epoch": 1.1370639032815197,
"grad_norm": 4.382949352264404,
"learning_rate": 9.11764705882353e-05,
"loss": 0.0767556071281433,
"step": 515,
"token_acc": 0.9732540861812778
},
{
"epoch": 1.1481174438687392,
"grad_norm": 4.010786533355713,
"learning_rate": 9.852941176470589e-05,
"loss": 0.0905315101146698,
"step": 520,
"token_acc": 0.9681274900398407
},
{
"epoch": 1.1591709844559586,
"grad_norm": 8.248323440551758,
"learning_rate": 9.99976313340166e-05,
"loss": 0.14047706127166748,
"step": 525,
"token_acc": 0.9527127924340468
},
{
"epoch": 1.170224525043178,
"grad_norm": 6.752266883850098,
"learning_rate": 9.998800901308916e-05,
"loss": 0.1028751015663147,
"step": 530,
"token_acc": 0.9620947630922694
},
{
"epoch": 1.1812780656303972,
"grad_norm": 4.091604709625244,
"learning_rate": 9.997098641899562e-05,
"loss": 0.1167901635169983,
"step": 535,
"token_acc": 0.9626679940268791
},
{
"epoch": 1.1923316062176166,
"grad_norm": 3.13606858253479,
"learning_rate": 9.994656607177722e-05,
"loss": 0.11653541326522827,
"step": 540,
"token_acc": 0.955050505050505
},
{
"epoch": 1.203385146804836,
"grad_norm": 4.286221027374268,
"learning_rate": 9.991475158664578e-05,
"loss": 0.09516905546188355,
"step": 545,
"token_acc": 0.968031968031968
},
{
"epoch": 1.2144386873920552,
"grad_norm": 5.616212368011475,
"learning_rate": 9.987554767344845e-05,
"loss": 0.12145495414733887,
"step": 550,
"token_acc": 0.9621890547263682
},
{
"epoch": 1.2254922279792746,
"grad_norm": 5.72205114364624,
"learning_rate": 9.982896013597038e-05,
"loss": 0.12007032632827759,
"step": 555,
"token_acc": 0.9582089552238806
},
{
"epoch": 1.236545768566494,
"grad_norm": 3.8363678455352783,
"learning_rate": 9.977499587107569e-05,
"loss": 0.08600590825080871,
"step": 560,
"token_acc": 0.967196819085487
},
{
"epoch": 1.2475993091537134,
"grad_norm": 2.206713914871216,
"learning_rate": 9.971366286768629e-05,
"loss": 0.11412311792373657,
"step": 565,
"token_acc": 0.961824491819534
},
{
"epoch": 1.2586528497409326,
"grad_norm": 2.48056697845459,
"learning_rate": 9.964497020559926e-05,
"loss": 0.09548119902610779,
"step": 570,
"token_acc": 0.968222442899702
},
{
"epoch": 1.269706390328152,
"grad_norm": 4.197746276855469,
"learning_rate": 9.956892805414272e-05,
"loss": 0.11575849056243896,
"step": 575,
"token_acc": 0.9597215315763302
},
{
"epoch": 1.2807599309153712,
"grad_norm": 3.9247732162475586,
"learning_rate": 9.948554767067025e-05,
"loss": 0.09247745275497436,
"step": 580,
"token_acc": 0.9692001987083955
},
{
"epoch": 1.2918134715025906,
"grad_norm": 3.7010436058044434,
"learning_rate": 9.93948413988944e-05,
"loss": 0.11627188920974732,
"step": 585,
"token_acc": 0.9602780536246276
},
{
"epoch": 1.30286701208981,
"grad_norm": 6.0411858558654785,
"learning_rate": 9.92968226670593e-05,
"loss": 0.09203023314476014,
"step": 590,
"token_acc": 0.9705882352941176
},
{
"epoch": 1.3139205526770295,
"grad_norm": 4.776832103729248,
"learning_rate": 9.919150598595276e-05,
"loss": 0.07992117404937744,
"step": 595,
"token_acc": 0.9711155378486056
},
{
"epoch": 1.3249740932642486,
"grad_norm": 2.1442465782165527,
"learning_rate": 9.907890694675803e-05,
"loss": 0.08411768078804016,
"step": 600,
"token_acc": 0.971301335972291
},
{
"epoch": 1.3249740932642486,
"eval_loss": 0.10072976350784302,
"eval_runtime": 2790.9554,
"eval_samples_per_second": 0.841,
"eval_steps_per_second": 0.841,
"eval_token_acc": 0.9663655128074984,
"step": 600
},
{
"epoch": 1.336027633851468,
"grad_norm": 3.444389820098877,
"learning_rate": 9.89590422187457e-05,
"loss": 0.0943886399269104,
"step": 605,
"token_acc": 0.969261279127417
},
{
"epoch": 1.3470811744386875,
"grad_norm": 3.3243448734283447,
"learning_rate": 9.883192954680593e-05,
"loss": 0.07429519295692444,
"step": 610,
"token_acc": 0.9723046488625123
},
{
"epoch": 1.3581347150259067,
"grad_norm": 3.897686243057251,
"learning_rate": 9.869758774882154e-05,
"loss": 0.10087257623672485,
"step": 615,
"token_acc": 0.9645
},
{
"epoch": 1.369188255613126,
"grad_norm": 3.0886733531951904,
"learning_rate": 9.855603671288215e-05,
"loss": 0.0957147240638733,
"step": 620,
"token_acc": 0.9651394422310757
},
{
"epoch": 1.3802417962003455,
"grad_norm": 5.4413604736328125,
"learning_rate": 9.840729739433992e-05,
"loss": 0.0882586419582367,
"step": 625,
"token_acc": 0.9705441837244134
},
{
"epoch": 1.391295336787565,
"grad_norm": 3.051844358444214,
"learning_rate": 9.82513918127073e-05,
"loss": 0.09511439800262451,
"step": 630,
"token_acc": 0.9663532904502722
},
{
"epoch": 1.402348877374784,
"grad_norm": 2.9811363220214844,
"learning_rate": 9.808834304839729e-05,
"loss": 0.10007621049880981,
"step": 635,
"token_acc": 0.9672943508424182
},
{
"epoch": 1.4134024179620035,
"grad_norm": 3.030879497528076,
"learning_rate": 9.791817523930653e-05,
"loss": 0.08152814507484436,
"step": 640,
"token_acc": 0.9720837487537388
},
{
"epoch": 1.4244559585492227,
"grad_norm": 2.544180154800415,
"learning_rate": 9.774091357724196e-05,
"loss": 0.07389838099479676,
"step": 645,
"token_acc": 0.9755854509217738
},
{
"epoch": 1.435509499136442,
"grad_norm": 5.591005802154541,
"learning_rate": 9.755658430419132e-05,
"loss": 0.09485760927200318,
"step": 650,
"token_acc": 0.9637357178340785
},
{
"epoch": 1.4465630397236615,
"grad_norm": 4.964202880859375,
"learning_rate": 9.736521470843838e-05,
"loss": 0.08160382509231567,
"step": 655,
"token_acc": 0.9705441837244134
},
{
"epoch": 1.457616580310881,
"grad_norm": 4.984673500061035,
"learning_rate": 9.7166833120523e-05,
"loss": 0.08802146315574647,
"step": 660,
"token_acc": 0.9641076769690927
},
{
"epoch": 1.4686701208981001,
"grad_norm": 2.584303140640259,
"learning_rate": 9.696146890904722e-05,
"loss": 0.09760611653327941,
"step": 665,
"token_acc": 0.9701343952215032
},
{
"epoch": 1.4797236614853195,
"grad_norm": 2.9796664714813232,
"learning_rate": 9.674915247632739e-05,
"loss": 0.09098277688026428,
"step": 670,
"token_acc": 0.9660678642714571
},
{
"epoch": 1.490777202072539,
"grad_norm": 10.275652885437012,
"learning_rate": 9.652991525389337e-05,
"loss": 0.08257744312286378,
"step": 675,
"token_acc": 0.9722084367245658
},
{
"epoch": 1.5018307426597581,
"grad_norm": 4.8155131340026855,
"learning_rate": 9.630378969783547e-05,
"loss": 0.07055800557136535,
"step": 680,
"token_acc": 0.974090682610862
},
{
"epoch": 1.5128842832469775,
"grad_norm": 3.6014211177825928,
"learning_rate": 9.607080928399958e-05,
"loss": 0.09370391964912414,
"step": 685,
"token_acc": 0.9658584858980702
},
{
"epoch": 1.523937823834197,
"grad_norm": 2.675119400024414,
"learning_rate": 9.58310085030313e-05,
"loss": 0.09670426845550537,
"step": 690,
"token_acc": 0.9641969169567379
},
{
"epoch": 1.5349913644214164,
"grad_norm": 3.324349880218506,
"learning_rate": 9.558442285527e-05,
"loss": 0.08441510200500488,
"step": 695,
"token_acc": 0.9696819085487077
},
{
"epoch": 1.5460449050086356,
"grad_norm": 3.4498190879821777,
"learning_rate": 9.533108884549333e-05,
"loss": 0.06717776656150817,
"step": 700,
"token_acc": 0.975597609561753
},
{
"epoch": 1.557098445595855,
"grad_norm": 2.959386110305786,
"learning_rate": 9.50710439775129e-05,
"loss": 0.08139981031417846,
"step": 705,
"token_acc": 0.9747023809523809
},
{
"epoch": 1.5681519861830742,
"grad_norm": 2.3681604862213135,
"learning_rate": 9.480432674862232e-05,
"loss": 0.07764554619789124,
"step": 710,
"token_acc": 0.9675810473815462
},
{
"epoch": 1.5792055267702936,
"grad_norm": 2.840590715408325,
"learning_rate": 9.453097664389789e-05,
"loss": 0.08232161402702332,
"step": 715,
"token_acc": 0.9701789264413518
},
{
"epoch": 1.590259067357513,
"grad_norm": 2.655217409133911,
"learning_rate": 9.425103413035335e-05,
"loss": 0.0968110740184784,
"step": 720,
"token_acc": 0.9689534301452178
},
{
"epoch": 1.6013126079447324,
"grad_norm": 2.4610400199890137,
"learning_rate": 9.396454065094891e-05,
"loss": 0.09739276766777039,
"step": 725,
"token_acc": 0.964729259811227
},
{
"epoch": 1.6123661485319516,
"grad_norm": 4.132114887237549,
"learning_rate": 9.367153861845617e-05,
"loss": 0.08105069994926453,
"step": 730,
"token_acc": 0.9716981132075472
},
{
"epoch": 1.623419689119171,
"grad_norm": 3.3622491359710693,
"learning_rate": 9.337207140917919e-05,
"loss": 0.09018557667732238,
"step": 735,
"token_acc": 0.9642324888226528
},
{
"epoch": 1.6344732297063902,
"grad_norm": 2.985978364944458,
"learning_rate": 9.306618335653307e-05,
"loss": 0.08649082779884339,
"step": 740,
"token_acc": 0.9683011391778108
},
{
"epoch": 1.6455267702936096,
"grad_norm": 3.509003162384033,
"learning_rate": 9.275391974448076e-05,
"loss": 0.0744367241859436,
"step": 745,
"token_acc": 0.9770687936191426
},
{
"epoch": 1.656580310880829,
"grad_norm": 2.937560796737671,
"learning_rate": 9.243532680082915e-05,
"loss": 0.07034647464752197,
"step": 750,
"token_acc": 0.9767211490837048
},
{
"epoch": 1.6676338514680484,
"grad_norm": 3.8314759731292725,
"learning_rate": 9.211045169038554e-05,
"loss": 0.07900274395942689,
"step": 755,
"token_acc": 0.9711729622266402
},
{
"epoch": 1.6786873920552678,
"grad_norm": 5.002687931060791,
"learning_rate": 9.17793425079753e-05,
"loss": 0.07675303220748901,
"step": 760,
"token_acc": 0.9689349112426036
},
{
"epoch": 1.689740932642487,
"grad_norm": 1.7825733423233032,
"learning_rate": 9.144204827132175e-05,
"loss": 0.08085300326347351,
"step": 765,
"token_acc": 0.9716981132075472
},
{
"epoch": 1.7007944732297062,
"grad_norm": 1.8953182697296143,
"learning_rate": 9.10986189137897e-05,
"loss": 0.07259147167205811,
"step": 770,
"token_acc": 0.9760479041916168
},
{
"epoch": 1.7118480138169256,
"grad_norm": 3.7877821922302246,
"learning_rate": 9.074910527699313e-05,
"loss": 0.08823164105415345,
"step": 775,
"token_acc": 0.972139303482587
},
{
"epoch": 1.722901554404145,
"grad_norm": 3.614501714706421,
"learning_rate": 9.039355910326863e-05,
"loss": 0.10905979871749878,
"step": 780,
"token_acc": 0.9652087475149106
},
{
"epoch": 1.7339550949913645,
"grad_norm": 3.0243847370147705,
"learning_rate": 9.00320330280154e-05,
"loss": 0.07965280413627625,
"step": 785,
"token_acc": 0.9723593287265548
},
{
"epoch": 1.7450086355785839,
"grad_norm": 4.025770664215088,
"learning_rate": 8.966458057190301e-05,
"loss": 0.07108275294303894,
"step": 790,
"token_acc": 0.9760956175298805
},
{
"epoch": 1.756062176165803,
"grad_norm": 2.9816761016845703,
"learning_rate": 8.92912561329482e-05,
"loss": 0.0776334285736084,
"step": 795,
"token_acc": 0.9706467661691542
},
{
"epoch": 1.7671157167530225,
"grad_norm": 3.0242762565612793,
"learning_rate": 8.891211497846171e-05,
"loss": 0.07837628722190856,
"step": 800,
"token_acc": 0.974155069582505
},
{
"epoch": 1.7671157167530225,
"eval_loss": 0.08094792068004608,
"eval_runtime": 2286.4128,
"eval_samples_per_second": 1.026,
"eval_steps_per_second": 1.026,
"eval_token_acc": 0.9727608026257909,
"step": 800
},
{
"epoch": 1.7781692573402417,
"grad_norm": 3.771763324737549,
"learning_rate": 8.852721323686648e-05,
"loss": 0.08302398324012757,
"step": 805,
"token_acc": 0.9725411882176734
},
{
"epoch": 1.789222797927461,
"grad_norm": 3.6321651935577393,
"learning_rate": 8.813660788938833e-05,
"loss": 0.06937822699546814,
"step": 810,
"token_acc": 0.9772839506172839
},
{
"epoch": 1.8002763385146805,
"grad_norm": 6.7241644859313965,
"learning_rate": 8.774035676162043e-05,
"loss": 0.05159105062484741,
"step": 815,
"token_acc": 0.9829488465396189
},
{
"epoch": 1.8113298791019,
"grad_norm": 4.365586757659912,
"learning_rate": 8.733851851496268e-05,
"loss": 0.08399490118026734,
"step": 820,
"token_acc": 0.9721254355400697
},
{
"epoch": 1.8223834196891193,
"grad_norm": 3.0722031593322754,
"learning_rate": 8.693115263793747e-05,
"loss": 0.07215502858161926,
"step": 825,
"token_acc": 0.9740648379052369
},
{
"epoch": 1.8334369602763385,
"grad_norm": 4.519292831420898,
"learning_rate": 8.651831943738296e-05,
"loss": 0.06996339559555054,
"step": 830,
"token_acc": 0.9770459081836327
},
{
"epoch": 1.8444905008635577,
"grad_norm": 4.166755199432373,
"learning_rate": 8.610008002952513e-05,
"loss": 0.07142719030380248,
"step": 835,
"token_acc": 0.9767211490837048
},
{
"epoch": 1.8555440414507771,
"grad_norm": 4.686975002288818,
"learning_rate": 8.567649633093016e-05,
"loss": 0.06802060008049012,
"step": 840,
"token_acc": 0.9775
},
{
"epoch": 1.8665975820379965,
"grad_norm": 3.0628044605255127,
"learning_rate": 8.524763104933816e-05,
"loss": 0.06818159222602845,
"step": 845,
"token_acc": 0.973644952759821
},
{
"epoch": 1.877651122625216,
"grad_norm": 3.939176321029663,
"learning_rate": 8.481354767437988e-05,
"loss": 0.07347306013107299,
"step": 850,
"token_acc": 0.972568578553616
},
{
"epoch": 1.8887046632124354,
"grad_norm": 2.8834567070007324,
"learning_rate": 8.437431046817769e-05,
"loss": 0.06994418501853943,
"step": 855,
"token_acc": 0.971712158808933
},
{
"epoch": 1.8997582037996545,
"grad_norm": 3.4866409301757812,
"learning_rate": 8.392998445583212e-05,
"loss": 0.07565975189208984,
"step": 860,
"token_acc": 0.9760956175298805
},
{
"epoch": 1.910811744386874,
"grad_norm": 3.4372429847717285,
"learning_rate": 8.348063541579545e-05,
"loss": 0.07984944581985473,
"step": 865,
"token_acc": 0.9727452923686819
},
{
"epoch": 1.9218652849740931,
"grad_norm": 4.127252578735352,
"learning_rate": 8.302632987013388e-05,
"loss": 0.07774015665054321,
"step": 870,
"token_acc": 0.9744872436218109
},
{
"epoch": 1.9329188255613126,
"grad_norm": 3.960955858230591,
"learning_rate": 8.256713507467941e-05,
"loss": 0.08486457467079163,
"step": 875,
"token_acc": 0.9731743666169895
},
{
"epoch": 1.943972366148532,
"grad_norm": 3.087674617767334,
"learning_rate": 8.210311900907339e-05,
"loss": 0.07507517337799072,
"step": 880,
"token_acc": 0.9787023278850916
},
{
"epoch": 1.9550259067357514,
"grad_norm": 2.5197293758392334,
"learning_rate": 8.163435036670261e-05,
"loss": 0.08100587725639344,
"step": 885,
"token_acc": 0.9724724724724725
},
{
"epoch": 1.9660794473229708,
"grad_norm": 1.6736986637115479,
"learning_rate": 8.116089854452995e-05,
"loss": 0.07375568151473999,
"step": 890,
"token_acc": 0.9772727272727273
},
{
"epoch": 1.97713298791019,
"grad_norm": 3.322634220123291,
"learning_rate": 8.068283363282074e-05,
"loss": 0.07798144817352295,
"step": 895,
"token_acc": 0.9738400789733465
},
{
"epoch": 1.9881865284974092,
"grad_norm": 1.9556145668029785,
"learning_rate": 8.020022640476654e-05,
"loss": 0.06203848123550415,
"step": 900,
"token_acc": 0.9791666666666666
},
{
"epoch": 1.9992400690846286,
"grad_norm": 3.805701494216919,
"learning_rate": 7.971314830600783e-05,
"loss": 0.06745657324790955,
"step": 905,
"token_acc": 0.9751491053677932
},
{
"epoch": 2.0088428324697754,
"grad_norm": 2.904778003692627,
"learning_rate": 7.922167144405706e-05,
"loss": 0.06268702149391174,
"step": 910,
"token_acc": 0.9777777777777777
},
{
"epoch": 2.0198963730569948,
"grad_norm": 3.050584554672241,
"learning_rate": 7.87258685776239e-05,
"loss": 0.07978938817977906,
"step": 915,
"token_acc": 0.9681116093672147
},
{
"epoch": 2.030949913644214,
"grad_norm": 3.2201292514801025,
"learning_rate": 7.822581310584388e-05,
"loss": 0.07445316910743713,
"step": 920,
"token_acc": 0.9754509018036072
},
{
"epoch": 2.0420034542314336,
"grad_norm": 4.6090545654296875,
"learning_rate": 7.772157905741231e-05,
"loss": 0.06728174090385437,
"step": 925,
"token_acc": 0.977205153617443
},
{
"epoch": 2.053056994818653,
"grad_norm": 3.3497800827026367,
"learning_rate": 7.721324107962506e-05,
"loss": 0.06557589173316955,
"step": 930,
"token_acc": 0.9775784753363229
},
{
"epoch": 2.064110535405872,
"grad_norm": 2.1937358379364014,
"learning_rate": 7.670087442732763e-05,
"loss": 0.05688057541847229,
"step": 935,
"token_acc": 0.981028457314029
},
{
"epoch": 2.0751640759930914,
"grad_norm": 5.671027660369873,
"learning_rate": 7.618455495177445e-05,
"loss": 0.08629457950592041,
"step": 940,
"token_acc": 0.9695304695304695
},
{
"epoch": 2.086217616580311,
"grad_norm": 3.635037899017334,
"learning_rate": 7.566435908939967e-05,
"loss": 0.0566463053226471,
"step": 945,
"token_acc": 0.9820717131474104
},
{
"epoch": 2.09727115716753,
"grad_norm": 3.0799365043640137,
"learning_rate": 7.514036385050147e-05,
"loss": 0.06808796525001526,
"step": 950,
"token_acc": 0.9766052762568442
},
{
"epoch": 2.1083246977547496,
"grad_norm": 5.38563871383667,
"learning_rate": 7.461264680784151e-05,
"loss": 0.07369622588157654,
"step": 955,
"token_acc": 0.9737363726461843
},
{
"epoch": 2.119378238341969,
"grad_norm": 3.824101448059082,
"learning_rate": 7.408128608516077e-05,
"loss": 0.06465582847595215,
"step": 960,
"token_acc": 0.9786917740336968
},
{
"epoch": 2.1304317789291884,
"grad_norm": 3.410547971725464,
"learning_rate": 7.354636034561418e-05,
"loss": 0.051229971647262576,
"step": 965,
"token_acc": 0.9821428571428571
},
{
"epoch": 2.1414853195164074,
"grad_norm": 3.721679210662842,
"learning_rate": 7.30079487801252e-05,
"loss": 0.06965258717536926,
"step": 970,
"token_acc": 0.9755854509217738
},
{
"epoch": 2.152538860103627,
"grad_norm": 2.798208236694336,
"learning_rate": 7.246613109566238e-05,
"loss": 0.07870134711265564,
"step": 975,
"token_acc": 0.9751243781094527
},
{
"epoch": 2.1635924006908462,
"grad_norm": 1.8357700109481812,
"learning_rate": 7.192098750343935e-05,
"loss": 0.0715235412120819,
"step": 980,
"token_acc": 0.97675568743818
},
{
"epoch": 2.1746459412780657,
"grad_norm": 3.544813632965088,
"learning_rate": 7.137259870704036e-05,
"loss": 0.055529987812042235,
"step": 985,
"token_acc": 0.9841112214498511
},
{
"epoch": 2.185699481865285,
"grad_norm": 2.2707366943359375,
"learning_rate": 7.082104589047285e-05,
"loss": 0.05665128231048584,
"step": 990,
"token_acc": 0.9795102448775612
},
{
"epoch": 2.1967530224525045,
"grad_norm": 4.324965476989746,
"learning_rate": 7.026641070614884e-05,
"loss": 0.06112373471260071,
"step": 995,
"token_acc": 0.9775112443778111
},
{
"epoch": 2.207806563039724,
"grad_norm": 3.2108352184295654,
"learning_rate": 6.970877526279702e-05,
"loss": 0.061422485113143924,
"step": 1000,
"token_acc": 0.9781854238968766
},
{
"epoch": 2.207806563039724,
"eval_loss": 0.07163181900978088,
"eval_runtime": 2173.3435,
"eval_samples_per_second": 1.08,
"eval_steps_per_second": 1.08,
"eval_token_acc": 0.9747910533617569,
"step": 1000
},
{
"epoch": 2.218860103626943,
"grad_norm": 4.137533187866211,
"learning_rate": 6.914822211330742e-05,
"loss": 0.06986818313598633,
"step": 1005,
"token_acc": 0.9751367478866235
},
{
"epoch": 2.2299136442141623,
"grad_norm": 1.7135125398635864,
"learning_rate": 6.858483424251001e-05,
"loss": 0.0670811414718628,
"step": 1010,
"token_acc": 0.9776341948310139
},
{
"epoch": 2.2409671848013817,
"grad_norm": 2.10587739944458,
"learning_rate": 6.801869505488969e-05,
"loss": 0.06850314140319824,
"step": 1015,
"token_acc": 0.9761312779711586
},
{
"epoch": 2.252020725388601,
"grad_norm": 3.5003225803375244,
"learning_rate": 6.744988836223893e-05,
"loss": 0.06779593229293823,
"step": 1020,
"token_acc": 0.9794692038057086
},
{
"epoch": 2.2630742659758205,
"grad_norm": 2.2275619506835938,
"learning_rate": 6.687849837125027e-05,
"loss": 0.05072577595710755,
"step": 1025,
"token_acc": 0.9836309523809523
},
{
"epoch": 2.2741278065630395,
"grad_norm": 2.6360952854156494,
"learning_rate": 6.630460967105018e-05,
"loss": 0.05415867567062378,
"step": 1030,
"token_acc": 0.9781746031746031
},
{
"epoch": 2.285181347150259,
"grad_norm": 1.4451478719711304,
"learning_rate": 6.572830722067653e-05,
"loss": 0.055239105224609376,
"step": 1035,
"token_acc": 0.981094527363184
},
{
"epoch": 2.2962348877374783,
"grad_norm": 1.9783443212509155,
"learning_rate": 6.5149676336501e-05,
"loss": 0.05858151912689209,
"step": 1040,
"token_acc": 0.9850224663005491
},
{
"epoch": 2.3072884283246977,
"grad_norm": 2.043297529220581,
"learning_rate": 6.456880267959894e-05,
"loss": 0.05577117800712585,
"step": 1045,
"token_acc": 0.9800697558545092
},
{
"epoch": 2.318341968911917,
"grad_norm": 4.235545635223389,
"learning_rate": 6.39857722430679e-05,
"loss": 0.05815597772598267,
"step": 1050,
"token_acc": 0.9801587301587301
},
{
"epoch": 2.3293955094991365,
"grad_norm": 5.21077823638916,
"learning_rate": 6.340067133929719e-05,
"loss": 0.054069459438323975,
"step": 1055,
"token_acc": 0.9801291604570294
},
{
"epoch": 2.340449050086356,
"grad_norm": 2.011702537536621,
"learning_rate": 6.281358658719011e-05,
"loss": 0.07169802188873291,
"step": 1060,
"token_acc": 0.9752107089737233
},
{
"epoch": 2.351502590673575,
"grad_norm": 2.3686680793762207,
"learning_rate": 6.22246048993407e-05,
"loss": 0.05615015029907226,
"step": 1065,
"token_acc": 0.9800299550673989
},
{
"epoch": 2.3625561312607943,
"grad_norm": 7.002840995788574,
"learning_rate": 6.163381346916732e-05,
"loss": 0.06114639043807983,
"step": 1070,
"token_acc": 0.9760239760239761
},
{
"epoch": 2.3736096718480137,
"grad_norm": 4.902541160583496,
"learning_rate": 6.104129975800427e-05,
"loss": 0.07762741446495056,
"step": 1075,
"token_acc": 0.9730807577268196
},
{
"epoch": 2.384663212435233,
"grad_norm": 4.661471843719482,
"learning_rate": 6.0447151482153955e-05,
"loss": 0.06509206891059875,
"step": 1080,
"token_acc": 0.9771144278606965
},
{
"epoch": 2.3957167530224526,
"grad_norm": 1.999237298965454,
"learning_rate": 5.985145659990138e-05,
"loss": 0.06380823254585266,
"step": 1085,
"token_acc": 0.9796626984126984
},
{
"epoch": 2.406770293609672,
"grad_norm": 2.382763385772705,
"learning_rate": 5.925430329849264e-05,
"loss": 0.05442737936973572,
"step": 1090,
"token_acc": 0.9821073558648111
},
{
"epoch": 2.4178238341968914,
"grad_norm": 1.8456308841705322,
"learning_rate": 5.865577998107961e-05,
"loss": 0.04835757613182068,
"step": 1095,
"token_acc": 0.9831013916500994
},
{
"epoch": 2.4288773747841104,
"grad_norm": 4.092327117919922,
"learning_rate": 5.805597525363263e-05,
"loss": 0.07175707817077637,
"step": 1100,
"token_acc": 0.977589641434263
},
{
"epoch": 2.43993091537133,
"grad_norm": 2.878070831298828,
"learning_rate": 5.745497791182325e-05,
"loss": 0.054905033111572264,
"step": 1105,
"token_acc": 0.9791459781529295
},
{
"epoch": 2.450984455958549,
"grad_norm": 3.2568767070770264,
"learning_rate": 5.685287692787883e-05,
"loss": 0.060244417190551756,
"step": 1110,
"token_acc": 0.9787549407114624
},
{
"epoch": 2.4620379965457686,
"grad_norm": 3.9391286373138428,
"learning_rate": 5.6249761437410895e-05,
"loss": 0.07208690047264099,
"step": 1115,
"token_acc": 0.9765234765234765
},
{
"epoch": 2.473091537132988,
"grad_norm": 1.7733020782470703,
"learning_rate": 5.5645720726219584e-05,
"loss": 0.05974746346473694,
"step": 1120,
"token_acc": 0.9806066633515664
},
{
"epoch": 2.4841450777202074,
"grad_norm": 4.430456161499023,
"learning_rate": 5.504084421707555e-05,
"loss": 0.0642861008644104,
"step": 1125,
"token_acc": 0.9781529294935452
},
{
"epoch": 2.495198618307427,
"grad_norm": 2.7552433013916016,
"learning_rate": 5.443522145648181e-05,
"loss": 0.06047917008399963,
"step": 1130,
"token_acc": 0.9830677290836654
},
{
"epoch": 2.506252158894646,
"grad_norm": 1.864843487739563,
"learning_rate": 5.3828942101417136e-05,
"loss": 0.044628658890724184,
"step": 1135,
"token_acc": 0.9856719367588933
},
{
"epoch": 2.5173056994818652,
"grad_norm": 4.15530252456665,
"learning_rate": 5.322209590606323e-05,
"loss": 0.0662376880645752,
"step": 1140,
"token_acc": 0.9765234765234765
},
{
"epoch": 2.5283592400690846,
"grad_norm": 3.1273319721221924,
"learning_rate": 5.2614772708517324e-05,
"loss": 0.06211344003677368,
"step": 1145,
"token_acc": 0.9807217004448838
},
{
"epoch": 2.539412780656304,
"grad_norm": 2.2132909297943115,
"learning_rate": 5.200706241749257e-05,
"loss": 0.05160966515541077,
"step": 1150,
"token_acc": 0.9830423940149626
},
{
"epoch": 2.5504663212435235,
"grad_norm": 2.4676833152770996,
"learning_rate": 5.1399054999007756e-05,
"loss": 0.05153646469116211,
"step": 1155,
"token_acc": 0.9795816733067729
},
{
"epoch": 2.5615198618307424,
"grad_norm": 2.5266482830047607,
"learning_rate": 5.079084046306877e-05,
"loss": 0.05694507360458374,
"step": 1160,
"token_acc": 0.9826474962816063
},
{
"epoch": 2.572573402417962,
"grad_norm": 3.3381104469299316,
"learning_rate": 5.018250885034328e-05,
"loss": 0.056800955533981325,
"step": 1165,
"token_acc": 0.9816377171215881
},
{
"epoch": 2.5836269430051813,
"grad_norm": 3.0276429653167725,
"learning_rate": 4.957415021883121e-05,
"loss": 0.061768895387649535,
"step": 1170,
"token_acc": 0.9804413239719157
},
{
"epoch": 2.5946804835924007,
"grad_norm": 1.4948302507400513,
"learning_rate": 4.89658546305323e-05,
"loss": 0.048909342288970946,
"step": 1175,
"token_acc": 0.9846306395637084
},
{
"epoch": 2.60573402417962,
"grad_norm": 2.0935652256011963,
"learning_rate": 4.835771213811336e-05,
"loss": 0.05250586867332459,
"step": 1180,
"token_acc": 0.9846534653465346
},
{
"epoch": 2.6167875647668395,
"grad_norm": 2.5400516986846924,
"learning_rate": 4.774981277157673e-05,
"loss": 0.05397605299949646,
"step": 1185,
"token_acc": 0.9816740960871718
},
{
"epoch": 2.627841105354059,
"grad_norm": 1.8448779582977295,
"learning_rate": 4.714224652493212e-05,
"loss": 0.0550678551197052,
"step": 1190,
"token_acc": 0.9800299550673989
},
{
"epoch": 2.638894645941278,
"grad_norm": 2.3702774047851562,
"learning_rate": 4.6535103342873885e-05,
"loss": 0.055988776683807376,
"step": 1195,
"token_acc": 0.9795511221945137
},
{
"epoch": 2.6499481865284973,
"grad_norm": 3.8877556324005127,
"learning_rate": 4.592847310746549e-05,
"loss": 0.054580336809158324,
"step": 1200,
"token_acc": 0.9802078179119248
},
{
"epoch": 2.6499481865284973,
"eval_loss": 0.06643825024366379,
"eval_runtime": 2174.0597,
"eval_samples_per_second": 1.08,
"eval_steps_per_second": 1.08,
"eval_token_acc": 0.9772273542449159,
"step": 1200
},
{
"epoch": 2.6610017271157167,
"grad_norm": 4.951501846313477,
"learning_rate": 4.5322445624833255e-05,
"loss": 0.05614232420921326,
"step": 1205,
"token_acc": 0.979571499750872
},
{
"epoch": 2.672055267702936,
"grad_norm": 1.8844807147979736,
"learning_rate": 4.471711061187144e-05,
"loss": 0.05867302417755127,
"step": 1210,
"token_acc": 0.9790523690773068
},
{
"epoch": 2.6831088082901555,
"grad_norm": 2.765587329864502,
"learning_rate": 4.411255768296038e-05,
"loss": 0.05516909956932068,
"step": 1215,
"token_acc": 0.9800697558545092
},
{
"epoch": 2.694162348877375,
"grad_norm": 3.7630977630615234,
"learning_rate": 4.3508876336699974e-05,
"loss": 0.05011783838272095,
"step": 1220,
"token_acc": 0.981555333998006
},
{
"epoch": 2.7052158894645943,
"grad_norm": 3.2775371074676514,
"learning_rate": 4.290615594266013e-05,
"loss": 0.04247501492500305,
"step": 1225,
"token_acc": 0.9855
},
{
"epoch": 2.7162694300518133,
"grad_norm": 2.6794540882110596,
"learning_rate": 4.230448572815053e-05,
"loss": 0.04988014101982117,
"step": 1230,
"token_acc": 0.9826130153999006
},
{
"epoch": 2.7273229706390327,
"grad_norm": 2.045193910598755,
"learning_rate": 4.170395476501119e-05,
"loss": 0.04342162907123566,
"step": 1235,
"token_acc": 0.9841269841269841
},
{
"epoch": 2.738376511226252,
"grad_norm": 2.58183217048645,
"learning_rate": 4.1104651956426296e-05,
"loss": 0.04766501486301422,
"step": 1240,
"token_acc": 0.9831097863884749
},
{
"epoch": 2.7494300518134716,
"grad_norm": 3.195136785507202,
"learning_rate": 4.050666602376287e-05,
"loss": 0.05665205121040344,
"step": 1245,
"token_acc": 0.9766749379652605
},
{
"epoch": 2.760483592400691,
"grad_norm": 3.2019379138946533,
"learning_rate": 3.991008549343626e-05,
"loss": 0.07038918733596802,
"step": 1250,
"token_acc": 0.9775449101796407
},
{
"epoch": 2.77153713298791,
"grad_norm": 4.3877854347229,
"learning_rate": 3.931499868380482e-05,
"loss": 0.06642740964889526,
"step": 1255,
"token_acc": 0.9761904761904762
},
{
"epoch": 2.78259067357513,
"grad_norm": 1.3834000825881958,
"learning_rate": 3.872149369209491e-05,
"loss": 0.0616798460483551,
"step": 1260,
"token_acc": 0.979571499750872
},
{
"epoch": 2.7936442141623488,
"grad_norm": 4.06312370300293,
"learning_rate": 3.8129658381359156e-05,
"loss": 0.07107862830162048,
"step": 1265,
"token_acc": 0.9755244755244755
},
{
"epoch": 2.804697754749568,
"grad_norm": 3.0957045555114746,
"learning_rate": 3.753958036746894e-05,
"loss": 0.0476302444934845,
"step": 1270,
"token_acc": 0.9815645241654211
},
{
"epoch": 2.8157512953367876,
"grad_norm": 4.347245693206787,
"learning_rate": 3.695134700614372e-05,
"loss": 0.06514678001403809,
"step": 1275,
"token_acc": 0.9790836653386454
},
{
"epoch": 2.826804835924007,
"grad_norm": 3.311340808868408,
"learning_rate": 3.636504538001882e-05,
"loss": 0.0548922598361969,
"step": 1280,
"token_acc": 0.9831432821021319
},
{
"epoch": 2.8378583765112264,
"grad_norm": 1.910610556602478,
"learning_rate": 3.5780762285753616e-05,
"loss": 0.04039471745491028,
"step": 1285,
"token_acc": 0.9861454725383474
},
{
"epoch": 2.8489119170984454,
"grad_norm": 3.0203163623809814,
"learning_rate": 3.519858422118206e-05,
"loss": 0.06901986002922059,
"step": 1290,
"token_acc": 0.9781312127236581
},
{
"epoch": 2.859965457685665,
"grad_norm": 7.763772487640381,
"learning_rate": 3.461859737250752e-05,
"loss": 0.042749127745628356,
"step": 1295,
"token_acc": 0.9840637450199203
},
{
"epoch": 2.871018998272884,
"grad_norm": 4.355055332183838,
"learning_rate": 3.4040887601543574e-05,
"loss": 0.06063474416732788,
"step": 1300,
"token_acc": 0.9815277084373439
},
{
"epoch": 2.8820725388601036,
"grad_norm": 3.254500389099121,
"learning_rate": 3.346554043300308e-05,
"loss": 0.058100783824920656,
"step": 1305,
"token_acc": 0.9791976225854383
},
{
"epoch": 2.893126079447323,
"grad_norm": 4.576897621154785,
"learning_rate": 3.289264104183691e-05,
"loss": 0.05097652673721313,
"step": 1310,
"token_acc": 0.983201581027668
},
{
"epoch": 2.9041796200345424,
"grad_norm": 4.716442584991455,
"learning_rate": 3.232227424062464e-05,
"loss": 0.05045266747474671,
"step": 1315,
"token_acc": 0.9829059829059829
},
{
"epoch": 2.915233160621762,
"grad_norm": 3.8253066539764404,
"learning_rate": 3.175452446701873e-05,
"loss": 0.05482856035232544,
"step": 1320,
"token_acc": 0.9791356184798807
},
{
"epoch": 2.926286701208981,
"grad_norm": 4.78739595413208,
"learning_rate": 3.118947577124439e-05,
"loss": 0.056392842531204225,
"step": 1325,
"token_acc": 0.9797130133597229
},
{
"epoch": 2.9373402417962002,
"grad_norm": 2.2553937435150146,
"learning_rate": 3.062721180365669e-05,
"loss": 0.05316250324249268,
"step": 1330,
"token_acc": 0.9816831683168317
},
{
"epoch": 2.9483937823834196,
"grad_norm": 3.2007408142089844,
"learning_rate": 3.0067815802356714e-05,
"loss": 0.055870598554611205,
"step": 1335,
"token_acc": 0.9796526054590571
},
{
"epoch": 2.959447322970639,
"grad_norm": 1.5691827535629272,
"learning_rate": 2.9511370580869213e-05,
"loss": 0.04847137331962585,
"step": 1340,
"token_acc": 0.980635551142006
},
{
"epoch": 2.9705008635578585,
"grad_norm": 4.0064826011657715,
"learning_rate": 2.895795851588252e-05,
"loss": 0.061286211013793945,
"step": 1345,
"token_acc": 0.9805583250249252
},
{
"epoch": 2.981554404145078,
"grad_norm": 3.6499500274658203,
"learning_rate": 2.8407661535053588e-05,
"loss": 0.0678468644618988,
"step": 1350,
"token_acc": 0.9766401590457257
},
{
"epoch": 2.9926079447322973,
"grad_norm": 3.093632936477661,
"learning_rate": 2.7860561104879357e-05,
"loss": 0.04808221161365509,
"step": 1355,
"token_acc": 0.9815920398009951
},
{
"epoch": 3.0,
"eval_loss": 0.057897526770830154,
"eval_runtime": 2238.3251,
"eval_samples_per_second": 1.049,
"eval_steps_per_second": 1.049,
"eval_token_acc": 0.9798328426894055,
"step": 1359
}
],
"logging_steps": 5,
"max_steps": 1359,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.83259479920255e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}