saves_7b_1_3_6 / trainer_state.json
chancharikm's picture
End of training
a287ed3 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.9961261759822913,
"eval_steps": 500,
"global_step": 3612,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.011068068622025456,
"grad_norm": 111.91503449772634,
"learning_rate": 9.94475138121547e-07,
"loss": 3.4549,
"step": 10
},
{
"epoch": 0.02213613724405091,
"grad_norm": 7.922807222950788,
"learning_rate": 2.0994475138121547e-06,
"loss": 1.1747,
"step": 20
},
{
"epoch": 0.03320420586607637,
"grad_norm": 4.5465846973535,
"learning_rate": 3.204419889502763e-06,
"loss": 0.6454,
"step": 30
},
{
"epoch": 0.04427227448810182,
"grad_norm": 5.530354729245049,
"learning_rate": 4.309392265193371e-06,
"loss": 0.5239,
"step": 40
},
{
"epoch": 0.05534034311012728,
"grad_norm": 5.077908329619468,
"learning_rate": 5.414364640883978e-06,
"loss": 0.4425,
"step": 50
},
{
"epoch": 0.06640841173215274,
"grad_norm": 9.192826009944866,
"learning_rate": 6.5193370165745865e-06,
"loss": 0.3716,
"step": 60
},
{
"epoch": 0.0774764803541782,
"grad_norm": 3.003413039290293,
"learning_rate": 7.624309392265194e-06,
"loss": 0.3749,
"step": 70
},
{
"epoch": 0.08854454897620365,
"grad_norm": 3.723093767166122,
"learning_rate": 8.729281767955802e-06,
"loss": 0.3347,
"step": 80
},
{
"epoch": 0.0996126175982291,
"grad_norm": 2.9657326866531983,
"learning_rate": 9.834254143646411e-06,
"loss": 0.2678,
"step": 90
},
{
"epoch": 0.11068068622025456,
"grad_norm": 5.900212741354982,
"learning_rate": 1.0939226519337018e-05,
"loss": 0.2791,
"step": 100
},
{
"epoch": 0.12174875484228002,
"grad_norm": 2.69943519276576,
"learning_rate": 1.2044198895027625e-05,
"loss": 0.2175,
"step": 110
},
{
"epoch": 0.13281682346430548,
"grad_norm": 2.734146918784431,
"learning_rate": 1.3149171270718234e-05,
"loss": 0.2148,
"step": 120
},
{
"epoch": 0.14388489208633093,
"grad_norm": 4.568726147434186,
"learning_rate": 1.425414364640884e-05,
"loss": 0.1885,
"step": 130
},
{
"epoch": 0.1549529607083564,
"grad_norm": 2.826200663681499,
"learning_rate": 1.535911602209945e-05,
"loss": 0.1842,
"step": 140
},
{
"epoch": 0.16602102933038185,
"grad_norm": 4.656857528648145,
"learning_rate": 1.6464088397790058e-05,
"loss": 0.1586,
"step": 150
},
{
"epoch": 0.1770890979524073,
"grad_norm": 2.6844969949298103,
"learning_rate": 1.7569060773480663e-05,
"loss": 0.1412,
"step": 160
},
{
"epoch": 0.18815716657443277,
"grad_norm": 3.213817850319435,
"learning_rate": 1.8674033149171272e-05,
"loss": 0.1398,
"step": 170
},
{
"epoch": 0.1992252351964582,
"grad_norm": 2.1465861431244138,
"learning_rate": 1.977900552486188e-05,
"loss": 0.1242,
"step": 180
},
{
"epoch": 0.21029330381848368,
"grad_norm": 2.6793701438567465,
"learning_rate": 1.9999731708850868e-05,
"loss": 0.1232,
"step": 190
},
{
"epoch": 0.22136137244050913,
"grad_norm": 1.5667080473228716,
"learning_rate": 1.999864180073034e-05,
"loss": 0.1174,
"step": 200
},
{
"epoch": 0.23242944106253458,
"grad_norm": 2.037240256627066,
"learning_rate": 1.9996713598750047e-05,
"loss": 0.1067,
"step": 210
},
{
"epoch": 0.24349750968456005,
"grad_norm": 1.8670470335951264,
"learning_rate": 1.999394726457193e-05,
"loss": 0.1005,
"step": 220
},
{
"epoch": 0.2545655783065855,
"grad_norm": 4.718021586405129,
"learning_rate": 1.9990343030127588e-05,
"loss": 0.0966,
"step": 230
},
{
"epoch": 0.26563364692861097,
"grad_norm": 1.5167562544095952,
"learning_rate": 1.998590119759882e-05,
"loss": 0.084,
"step": 240
},
{
"epoch": 0.27670171555063644,
"grad_norm": 1.474380476616429,
"learning_rate": 1.998062213939231e-05,
"loss": 0.0843,
"step": 250
},
{
"epoch": 0.28776978417266186,
"grad_norm": 1.7009367999004055,
"learning_rate": 1.997450629810837e-05,
"loss": 0.077,
"step": 260
},
{
"epoch": 0.29883785279468733,
"grad_norm": 1.3054232841373299,
"learning_rate": 1.9967554186503864e-05,
"loss": 0.0645,
"step": 270
},
{
"epoch": 0.3099059214167128,
"grad_norm": 1.4759142904818798,
"learning_rate": 1.9959766387449203e-05,
"loss": 0.0652,
"step": 280
},
{
"epoch": 0.3209739900387382,
"grad_norm": 3.1917585407150324,
"learning_rate": 1.995114355387949e-05,
"loss": 0.0646,
"step": 290
},
{
"epoch": 0.3320420586607637,
"grad_norm": 1.897149045890919,
"learning_rate": 1.9941686408739748e-05,
"loss": 0.0636,
"step": 300
},
{
"epoch": 0.34311012728278917,
"grad_norm": 1.2297497325088935,
"learning_rate": 1.9931395744924345e-05,
"loss": 0.0573,
"step": 310
},
{
"epoch": 0.3541781959048146,
"grad_norm": 2.120351393194103,
"learning_rate": 1.992027242521049e-05,
"loss": 0.0552,
"step": 320
},
{
"epoch": 0.36524626452684006,
"grad_norm": 1.1544320003426745,
"learning_rate": 1.9908317382185904e-05,
"loss": 0.0509,
"step": 330
},
{
"epoch": 0.37631433314886553,
"grad_norm": 1.5719324649495832,
"learning_rate": 1.9895531618170646e-05,
"loss": 0.055,
"step": 340
},
{
"epoch": 0.387382401770891,
"grad_norm": 2.088937323356875,
"learning_rate": 1.9881916205133057e-05,
"loss": 0.0507,
"step": 350
},
{
"epoch": 0.3984504703929164,
"grad_norm": 1.0024237634001862,
"learning_rate": 1.9867472284599888e-05,
"loss": 0.0458,
"step": 360
},
{
"epoch": 0.4095185390149419,
"grad_norm": 0.8620888697564176,
"learning_rate": 1.9852201067560607e-05,
"loss": 0.0432,
"step": 370
},
{
"epoch": 0.42058660763696737,
"grad_norm": 0.6606971753051816,
"learning_rate": 1.9836103834365864e-05,
"loss": 0.0444,
"step": 380
},
{
"epoch": 0.4316546762589928,
"grad_norm": 0.9387070037035307,
"learning_rate": 1.9819181934620128e-05,
"loss": 0.0403,
"step": 390
},
{
"epoch": 0.44272274488101826,
"grad_norm": 0.9093244429730171,
"learning_rate": 1.9801436787068563e-05,
"loss": 0.0372,
"step": 400
},
{
"epoch": 0.45379081350304373,
"grad_norm": 1.2531686851495056,
"learning_rate": 1.9782869879478058e-05,
"loss": 0.0415,
"step": 410
},
{
"epoch": 0.46485888212506915,
"grad_norm": 0.907356018111985,
"learning_rate": 1.9763482768512507e-05,
"loss": 0.0374,
"step": 420
},
{
"epoch": 0.4759269507470946,
"grad_norm": 0.5914417873894128,
"learning_rate": 1.974327707960228e-05,
"loss": 0.0383,
"step": 430
},
{
"epoch": 0.4869950193691201,
"grad_norm": 0.9164172705484467,
"learning_rate": 1.972225450680796e-05,
"loss": 0.03,
"step": 440
},
{
"epoch": 0.49806308799114557,
"grad_norm": 1.1027419481663168,
"learning_rate": 1.9700416812678303e-05,
"loss": 0.0349,
"step": 450
},
{
"epoch": 0.509131156613171,
"grad_norm": 0.8422050935800688,
"learning_rate": 1.9677765828102477e-05,
"loss": 0.0346,
"step": 460
},
{
"epoch": 0.5201992252351965,
"grad_norm": 0.6238402717183468,
"learning_rate": 1.9654303452156535e-05,
"loss": 0.0315,
"step": 470
},
{
"epoch": 0.5312672938572219,
"grad_norm": 1.2101485253397948,
"learning_rate": 1.963003165194422e-05,
"loss": 0.0348,
"step": 480
},
{
"epoch": 0.5423353624792474,
"grad_norm": 0.697551293992083,
"learning_rate": 1.9604952462432032e-05,
"loss": 0.0325,
"step": 490
},
{
"epoch": 0.5534034311012729,
"grad_norm": 0.5906516672972566,
"learning_rate": 1.957906798627861e-05,
"loss": 0.0239,
"step": 500
},
{
"epoch": 0.5644714997232982,
"grad_norm": 1.117787904345762,
"learning_rate": 1.955238039365845e-05,
"loss": 0.0286,
"step": 510
},
{
"epoch": 0.5755395683453237,
"grad_norm": 0.7157423697495711,
"learning_rate": 1.952489192207995e-05,
"loss": 0.0342,
"step": 520
},
{
"epoch": 0.5866076369673492,
"grad_norm": 1.0572948484139497,
"learning_rate": 1.9496604876197826e-05,
"loss": 0.0272,
"step": 530
},
{
"epoch": 0.5976757055893747,
"grad_norm": 1.031127851727682,
"learning_rate": 1.9467521627619874e-05,
"loss": 0.0266,
"step": 540
},
{
"epoch": 0.6087437742114001,
"grad_norm": 0.6403187680270812,
"learning_rate": 1.9437644614708143e-05,
"loss": 0.0272,
"step": 550
},
{
"epoch": 0.6198118428334256,
"grad_norm": 1.062190185802606,
"learning_rate": 1.94069763423745e-05,
"loss": 0.0298,
"step": 560
},
{
"epoch": 0.630879911455451,
"grad_norm": 0.7801274055237428,
"learning_rate": 1.9375519381870608e-05,
"loss": 0.0242,
"step": 570
},
{
"epoch": 0.6419479800774764,
"grad_norm": 1.2015873253294118,
"learning_rate": 1.9343276370572357e-05,
"loss": 0.0227,
"step": 580
},
{
"epoch": 0.6530160486995019,
"grad_norm": 0.6941953274977626,
"learning_rate": 1.9310250011758752e-05,
"loss": 0.0235,
"step": 590
},
{
"epoch": 0.6640841173215274,
"grad_norm": 0.7439604839898615,
"learning_rate": 1.9276443074385246e-05,
"loss": 0.0225,
"step": 600
},
{
"epoch": 0.6751521859435529,
"grad_norm": 0.4090201825493306,
"learning_rate": 1.9241858392851612e-05,
"loss": 0.0171,
"step": 610
},
{
"epoch": 0.6862202545655783,
"grad_norm": 0.86586702597483,
"learning_rate": 1.920649886676429e-05,
"loss": 0.0204,
"step": 620
},
{
"epoch": 0.6972883231876038,
"grad_norm": 0.5684391358174806,
"learning_rate": 1.917036746069329e-05,
"loss": 0.0193,
"step": 630
},
{
"epoch": 0.7083563918096292,
"grad_norm": 1.2006453142034454,
"learning_rate": 1.913346720392363e-05,
"loss": 0.0183,
"step": 640
},
{
"epoch": 0.7194244604316546,
"grad_norm": 0.5095199603894148,
"learning_rate": 1.909580119020138e-05,
"loss": 0.0171,
"step": 650
},
{
"epoch": 0.7304925290536801,
"grad_norm": 0.6222791362666046,
"learning_rate": 1.9057372577474244e-05,
"loss": 0.0203,
"step": 660
},
{
"epoch": 0.7415605976757056,
"grad_norm": 0.5784262806138188,
"learning_rate": 1.901818458762683e-05,
"loss": 0.0187,
"step": 670
},
{
"epoch": 0.7526286662977311,
"grad_norm": 0.3384795920975776,
"learning_rate": 1.897824050621051e-05,
"loss": 0.0205,
"step": 680
},
{
"epoch": 0.7636967349197565,
"grad_norm": 0.7403008730689107,
"learning_rate": 1.893754368216796e-05,
"loss": 0.0144,
"step": 690
},
{
"epoch": 0.774764803541782,
"grad_norm": 0.6886290123638243,
"learning_rate": 1.8896097527552362e-05,
"loss": 0.018,
"step": 700
},
{
"epoch": 0.7858328721638074,
"grad_norm": 0.7094773397852386,
"learning_rate": 1.8853905517241384e-05,
"loss": 0.019,
"step": 710
},
{
"epoch": 0.7969009407858328,
"grad_norm": 0.37028282697220893,
"learning_rate": 1.8810971188645775e-05,
"loss": 0.016,
"step": 720
},
{
"epoch": 0.8079690094078583,
"grad_norm": 0.5783298917051538,
"learning_rate": 1.876729814141286e-05,
"loss": 0.0155,
"step": 730
},
{
"epoch": 0.8190370780298838,
"grad_norm": 0.817934182382838,
"learning_rate": 1.8722890037124674e-05,
"loss": 0.0181,
"step": 740
},
{
"epoch": 0.8301051466519093,
"grad_norm": 0.42757623486015045,
"learning_rate": 1.8677750598991023e-05,
"loss": 0.0121,
"step": 750
},
{
"epoch": 0.8411732152739347,
"grad_norm": 0.6738036479530188,
"learning_rate": 1.863188361153731e-05,
"loss": 0.0168,
"step": 760
},
{
"epoch": 0.8522412838959601,
"grad_norm": 0.4240964815744989,
"learning_rate": 1.8585292920287217e-05,
"loss": 0.0121,
"step": 770
},
{
"epoch": 0.8633093525179856,
"grad_norm": 0.6974893779227371,
"learning_rate": 1.8537982431440333e-05,
"loss": 0.0134,
"step": 780
},
{
"epoch": 0.874377421140011,
"grad_norm": 0.6497026126998792,
"learning_rate": 1.8489956111544624e-05,
"loss": 0.0117,
"step": 790
},
{
"epoch": 0.8854454897620365,
"grad_norm": 0.6623978615404418,
"learning_rate": 1.8441217987163874e-05,
"loss": 0.0123,
"step": 800
},
{
"epoch": 0.896513558384062,
"grad_norm": 0.5858726137972703,
"learning_rate": 1.8391772144540127e-05,
"loss": 0.0171,
"step": 810
},
{
"epoch": 0.9075816270060875,
"grad_norm": 1.103716010768291,
"learning_rate": 1.8341622729251062e-05,
"loss": 0.0142,
"step": 820
},
{
"epoch": 0.9186496956281129,
"grad_norm": 0.5032363986196893,
"learning_rate": 1.8290773945862428e-05,
"loss": 0.0149,
"step": 830
},
{
"epoch": 0.9297177642501383,
"grad_norm": 0.5622836058996923,
"learning_rate": 1.8239230057575542e-05,
"loss": 0.0136,
"step": 840
},
{
"epoch": 0.9407858328721638,
"grad_norm": 0.6104096448589974,
"learning_rate": 1.8186995385869857e-05,
"loss": 0.0127,
"step": 850
},
{
"epoch": 0.9518539014941892,
"grad_norm": 0.7843382280116427,
"learning_rate": 1.8134074310140638e-05,
"loss": 0.0139,
"step": 860
},
{
"epoch": 0.9629219701162147,
"grad_norm": 0.5746764736583982,
"learning_rate": 1.8080471267331792e-05,
"loss": 0.0125,
"step": 870
},
{
"epoch": 0.9739900387382402,
"grad_norm": 0.43873748930902157,
"learning_rate": 1.8026190751563874e-05,
"loss": 0.0127,
"step": 880
},
{
"epoch": 0.9850581073602657,
"grad_norm": 0.9328533127381619,
"learning_rate": 1.79712373137573e-05,
"loss": 0.0097,
"step": 890
},
{
"epoch": 0.9961261759822911,
"grad_norm": 0.684178143244021,
"learning_rate": 1.7915615561250783e-05,
"loss": 0.0085,
"step": 900
},
{
"epoch": 1.0066408411732153,
"grad_norm": 0.21673732820638353,
"learning_rate": 1.7859330157415065e-05,
"loss": 0.0092,
"step": 910
},
{
"epoch": 1.0177089097952408,
"grad_norm": 0.27867910779840166,
"learning_rate": 1.7802385821261922e-05,
"loss": 0.0096,
"step": 920
},
{
"epoch": 1.0287769784172662,
"grad_norm": 0.11162636675632644,
"learning_rate": 1.7744787327048533e-05,
"loss": 0.0084,
"step": 930
},
{
"epoch": 1.0398450470392917,
"grad_norm": 0.22048120465803261,
"learning_rate": 1.768653950387718e-05,
"loss": 0.0078,
"step": 940
},
{
"epoch": 1.0509131156613172,
"grad_norm": 0.6765205253168616,
"learning_rate": 1.7627647235290407e-05,
"loss": 0.0068,
"step": 950
},
{
"epoch": 1.0619811842833426,
"grad_norm": 0.4689878039036384,
"learning_rate": 1.7568115458861542e-05,
"loss": 0.0074,
"step": 960
},
{
"epoch": 1.073049252905368,
"grad_norm": 0.5676023150735331,
"learning_rate": 1.7507949165780753e-05,
"loss": 0.007,
"step": 970
},
{
"epoch": 1.0841173215273934,
"grad_norm": 0.7911642935038967,
"learning_rate": 1.7447153400436577e-05,
"loss": 0.0088,
"step": 980
},
{
"epoch": 1.0951853901494188,
"grad_norm": 0.5077417518483057,
"learning_rate": 1.738573325999299e-05,
"loss": 0.0076,
"step": 990
},
{
"epoch": 1.1062534587714443,
"grad_norm": 0.5625579004448811,
"learning_rate": 1.7323693893962055e-05,
"loss": 0.0089,
"step": 1000
},
{
"epoch": 1.1173215273934698,
"grad_norm": 0.39598238596354546,
"learning_rate": 1.7261040503772187e-05,
"loss": 0.008,
"step": 1010
},
{
"epoch": 1.1283895960154953,
"grad_norm": 0.39572470853097774,
"learning_rate": 1.7197778342332075e-05,
"loss": 0.0068,
"step": 1020
},
{
"epoch": 1.1394576646375207,
"grad_norm": 0.6313738026150261,
"learning_rate": 1.7133912713590243e-05,
"loss": 0.0122,
"step": 1030
},
{
"epoch": 1.1505257332595462,
"grad_norm": 0.6255276960016989,
"learning_rate": 1.7069448972090387e-05,
"loss": 0.0085,
"step": 1040
},
{
"epoch": 1.1615938018815717,
"grad_norm": 0.4559474668836657,
"learning_rate": 1.700439252252244e-05,
"loss": 0.0069,
"step": 1050
},
{
"epoch": 1.1726618705035972,
"grad_norm": 0.2959471392332271,
"learning_rate": 1.6938748819269436e-05,
"loss": 0.0082,
"step": 1060
},
{
"epoch": 1.1837299391256226,
"grad_norm": 0.4089461613308411,
"learning_rate": 1.6872523365950218e-05,
"loss": 0.0081,
"step": 1070
},
{
"epoch": 1.194798007747648,
"grad_norm": 0.1656790995662235,
"learning_rate": 1.6805721714957995e-05,
"loss": 0.006,
"step": 1080
},
{
"epoch": 1.2058660763696736,
"grad_norm": 0.9361330580444202,
"learning_rate": 1.6738349466994837e-05,
"loss": 0.0064,
"step": 1090
},
{
"epoch": 1.2169341449916988,
"grad_norm": 0.21493739963899747,
"learning_rate": 1.6670412270602115e-05,
"loss": 0.0071,
"step": 1100
},
{
"epoch": 1.2280022136137245,
"grad_norm": 0.558379730610169,
"learning_rate": 1.6601915821686895e-05,
"loss": 0.0056,
"step": 1110
},
{
"epoch": 1.2390702822357498,
"grad_norm": 0.4220154829452254,
"learning_rate": 1.6532865863044424e-05,
"loss": 0.0074,
"step": 1120
},
{
"epoch": 1.2501383508577752,
"grad_norm": 0.08715888601622518,
"learning_rate": 1.6463268183876627e-05,
"loss": 0.0095,
"step": 1130
},
{
"epoch": 1.2612064194798007,
"grad_norm": 0.9325238058616875,
"learning_rate": 1.6393128619306734e-05,
"loss": 0.0087,
"step": 1140
},
{
"epoch": 1.2722744881018262,
"grad_norm": 0.3113643778329001,
"learning_rate": 1.6322453049890078e-05,
"loss": 0.0073,
"step": 1150
},
{
"epoch": 1.2833425567238517,
"grad_norm": 0.2476875728900236,
"learning_rate": 1.625124740112104e-05,
"loss": 0.0062,
"step": 1160
},
{
"epoch": 1.2944106253458771,
"grad_norm": 0.21826634116425128,
"learning_rate": 1.617951764293628e-05,
"loss": 0.008,
"step": 1170
},
{
"epoch": 1.3054786939679026,
"grad_norm": 0.48793358414259447,
"learning_rate": 1.610726978921418e-05,
"loss": 0.0085,
"step": 1180
},
{
"epoch": 1.316546762589928,
"grad_norm": 0.4618692190051027,
"learning_rate": 1.603450989727066e-05,
"loss": 0.0079,
"step": 1190
},
{
"epoch": 1.3276148312119536,
"grad_norm": 0.5286134233635146,
"learning_rate": 1.5961244067351326e-05,
"loss": 0.0072,
"step": 1200
},
{
"epoch": 1.338682899833979,
"grad_norm": 0.16659699644844692,
"learning_rate": 1.5887478442120007e-05,
"loss": 0.0074,
"step": 1210
},
{
"epoch": 1.3497509684560045,
"grad_norm": 0.48249847302009274,
"learning_rate": 1.5813219206143755e-05,
"loss": 0.0076,
"step": 1220
},
{
"epoch": 1.3608190370780298,
"grad_norm": 0.3524591713873095,
"learning_rate": 1.5738472585374334e-05,
"loss": 0.0058,
"step": 1230
},
{
"epoch": 1.3718871057000555,
"grad_norm": 0.08214795303298088,
"learning_rate": 1.566324484662624e-05,
"loss": 0.0061,
"step": 1240
},
{
"epoch": 1.3829551743220807,
"grad_norm": 0.4243354207285399,
"learning_rate": 1.5587542297051233e-05,
"loss": 0.0082,
"step": 1250
},
{
"epoch": 1.3940232429441062,
"grad_norm": 0.2496071649771549,
"learning_rate": 1.5511371283609622e-05,
"loss": 0.0058,
"step": 1260
},
{
"epoch": 1.4050913115661317,
"grad_norm": 0.1955901791520104,
"learning_rate": 1.5434738192538067e-05,
"loss": 0.0041,
"step": 1270
},
{
"epoch": 1.4161593801881571,
"grad_norm": 0.5300487568144705,
"learning_rate": 1.5357649448814177e-05,
"loss": 0.0111,
"step": 1280
},
{
"epoch": 1.4272274488101826,
"grad_norm": 0.39956657844001175,
"learning_rate": 1.5280111515617835e-05,
"loss": 0.0095,
"step": 1290
},
{
"epoch": 1.438295517432208,
"grad_norm": 0.5470821515861973,
"learning_rate": 1.520213089378931e-05,
"loss": 0.0079,
"step": 1300
},
{
"epoch": 1.4493635860542335,
"grad_norm": 0.31965121329752255,
"learning_rate": 1.512371412128424e-05,
"loss": 0.0063,
"step": 1310
},
{
"epoch": 1.460431654676259,
"grad_norm": 0.2941816190422621,
"learning_rate": 1.5044867772625455e-05,
"loss": 0.0061,
"step": 1320
},
{
"epoch": 1.4714997232982845,
"grad_norm": 0.3564346350090648,
"learning_rate": 1.4965598458351797e-05,
"loss": 0.0057,
"step": 1330
},
{
"epoch": 1.48256779192031,
"grad_norm": 0.28234215930894024,
"learning_rate": 1.4885912824463875e-05,
"loss": 0.0059,
"step": 1340
},
{
"epoch": 1.4936358605423354,
"grad_norm": 0.8066370524907985,
"learning_rate": 1.4805817551866839e-05,
"loss": 0.0148,
"step": 1350
},
{
"epoch": 1.5047039291643607,
"grad_norm": 0.5977966209875002,
"learning_rate": 1.4725319355810282e-05,
"loss": 0.0218,
"step": 1360
},
{
"epoch": 1.5157719977863864,
"grad_norm": 0.6001415723968081,
"learning_rate": 1.4644424985325198e-05,
"loss": 0.019,
"step": 1370
},
{
"epoch": 1.5268400664084116,
"grad_norm": 0.4494259779868367,
"learning_rate": 1.4563141222658163e-05,
"loss": 0.0107,
"step": 1380
},
{
"epoch": 1.5379081350304373,
"grad_norm": 0.6458187453604811,
"learning_rate": 1.4481474882702688e-05,
"loss": 0.0135,
"step": 1390
},
{
"epoch": 1.5489762036524626,
"grad_norm": 0.45245571994179906,
"learning_rate": 1.4399432812427862e-05,
"loss": 0.0134,
"step": 1400
},
{
"epoch": 1.560044272274488,
"grad_norm": 0.4600542174327164,
"learning_rate": 1.4317021890304294e-05,
"loss": 0.0072,
"step": 1410
},
{
"epoch": 1.5711123408965135,
"grad_norm": 0.31360771518664865,
"learning_rate": 1.4234249025727419e-05,
"loss": 0.0088,
"step": 1420
},
{
"epoch": 1.582180409518539,
"grad_norm": 0.3841495404827664,
"learning_rate": 1.4151121158438195e-05,
"loss": 0.0056,
"step": 1430
},
{
"epoch": 1.5932484781405645,
"grad_norm": 0.46969355836910825,
"learning_rate": 1.4067645257941308e-05,
"loss": 0.0054,
"step": 1440
},
{
"epoch": 1.60431654676259,
"grad_norm": 0.3163537210189324,
"learning_rate": 1.3983828322920786e-05,
"loss": 0.005,
"step": 1450
},
{
"epoch": 1.6153846153846154,
"grad_norm": 0.15773797744824763,
"learning_rate": 1.3899677380653276e-05,
"loss": 0.0035,
"step": 1460
},
{
"epoch": 1.626452684006641,
"grad_norm": 0.49536783037004123,
"learning_rate": 1.3815199486418851e-05,
"loss": 0.0045,
"step": 1470
},
{
"epoch": 1.6375207526286664,
"grad_norm": 0.10176197013762307,
"learning_rate": 1.3730401722909479e-05,
"loss": 0.0032,
"step": 1480
},
{
"epoch": 1.6485888212506916,
"grad_norm": 0.6691956653694718,
"learning_rate": 1.3645291199635218e-05,
"loss": 0.0045,
"step": 1490
},
{
"epoch": 1.6596568898727173,
"grad_norm": 0.3794166881995434,
"learning_rate": 1.355987505232815e-05,
"loss": 0.0076,
"step": 1500
},
{
"epoch": 1.6707249584947426,
"grad_norm": 0.2887141429200375,
"learning_rate": 1.3474160442344118e-05,
"loss": 0.0065,
"step": 1510
},
{
"epoch": 1.6817930271167683,
"grad_norm": 0.38976801790866195,
"learning_rate": 1.3388154556062292e-05,
"loss": 0.0037,
"step": 1520
},
{
"epoch": 1.6928610957387935,
"grad_norm": 0.2228416804146198,
"learning_rate": 1.330186460428268e-05,
"loss": 0.0032,
"step": 1530
},
{
"epoch": 1.703929164360819,
"grad_norm": 0.09090228950977586,
"learning_rate": 1.3215297821621565e-05,
"loss": 0.0037,
"step": 1540
},
{
"epoch": 1.7149972329828445,
"grad_norm": 0.6251199746615973,
"learning_rate": 1.3128461465904938e-05,
"loss": 0.0037,
"step": 1550
},
{
"epoch": 1.72606530160487,
"grad_norm": 0.2501159476688824,
"learning_rate": 1.3041362817560007e-05,
"loss": 0.0033,
"step": 1560
},
{
"epoch": 1.7371333702268954,
"grad_norm": 0.32559821371482117,
"learning_rate": 1.2954009179004794e-05,
"loss": 0.0025,
"step": 1570
},
{
"epoch": 1.7482014388489209,
"grad_norm": 0.3658704749959391,
"learning_rate": 1.2866407874035904e-05,
"loss": 0.0051,
"step": 1580
},
{
"epoch": 1.7592695074709463,
"grad_norm": 0.17147399439018965,
"learning_rate": 1.2778566247214474e-05,
"loss": 0.0045,
"step": 1590
},
{
"epoch": 1.7703375760929718,
"grad_norm": 0.1551464032194942,
"learning_rate": 1.2690491663250428e-05,
"loss": 0.0018,
"step": 1600
},
{
"epoch": 1.7814056447149973,
"grad_norm": 0.5301425104181171,
"learning_rate": 1.260219150638498e-05,
"loss": 0.0044,
"step": 1610
},
{
"epoch": 1.7924737133370225,
"grad_norm": 0.44158919846599415,
"learning_rate": 1.2513673179771555e-05,
"loss": 0.0062,
"step": 1620
},
{
"epoch": 1.8035417819590482,
"grad_norm": 0.1730696100779873,
"learning_rate": 1.2424944104855107e-05,
"loss": 0.0032,
"step": 1630
},
{
"epoch": 1.8146098505810735,
"grad_norm": 0.05809913714960146,
"learning_rate": 1.2336011720749881e-05,
"loss": 0.0032,
"step": 1640
},
{
"epoch": 1.8256779192030992,
"grad_norm": 0.15220878659480627,
"learning_rate": 1.2246883483615731e-05,
"loss": 0.0024,
"step": 1650
},
{
"epoch": 1.8367459878251244,
"grad_norm": 0.44313561326203266,
"learning_rate": 1.215756686603296e-05,
"loss": 0.0034,
"step": 1660
},
{
"epoch": 1.8478140564471501,
"grad_norm": 1.1214134555289978,
"learning_rate": 1.2068069356375864e-05,
"loss": 0.0044,
"step": 1670
},
{
"epoch": 1.8588821250691754,
"grad_norm": 0.37158504715550505,
"learning_rate": 1.1978398458184848e-05,
"loss": 0.0063,
"step": 1680
},
{
"epoch": 1.8699501936912009,
"grad_norm": 0.050902224376628516,
"learning_rate": 1.188856168953735e-05,
"loss": 0.0026,
"step": 1690
},
{
"epoch": 1.8810182623132263,
"grad_norm": 0.16578056285472928,
"learning_rate": 1.1798566582417521e-05,
"loss": 0.0038,
"step": 1700
},
{
"epoch": 1.8920863309352518,
"grad_norm": 0.29387428387180203,
"learning_rate": 1.1708420682084722e-05,
"loss": 0.0032,
"step": 1710
},
{
"epoch": 1.9031543995572773,
"grad_norm": 0.3729500071630355,
"learning_rate": 1.1618131546440949e-05,
"loss": 0.0033,
"step": 1720
},
{
"epoch": 1.9142224681793027,
"grad_norm": 0.8756632832474044,
"learning_rate": 1.1527706745397143e-05,
"loss": 0.0021,
"step": 1730
},
{
"epoch": 1.9252905368013282,
"grad_norm": 0.29073014043576567,
"learning_rate": 1.1437153860238541e-05,
"loss": 0.0041,
"step": 1740
},
{
"epoch": 1.9363586054233535,
"grad_norm": 0.2886330704551084,
"learning_rate": 1.1346480482989055e-05,
"loss": 0.003,
"step": 1750
},
{
"epoch": 1.9474266740453792,
"grad_norm": 0.08054022503195961,
"learning_rate": 1.1255694215774743e-05,
"loss": 0.0033,
"step": 1760
},
{
"epoch": 1.9584947426674044,
"grad_norm": 0.34933906854298136,
"learning_rate": 1.1164802670186448e-05,
"loss": 0.0042,
"step": 1770
},
{
"epoch": 1.9695628112894301,
"grad_norm": 0.26977238441720924,
"learning_rate": 1.1073813466641633e-05,
"loss": 0.0037,
"step": 1780
},
{
"epoch": 1.9806308799114554,
"grad_norm": 0.24400676151661493,
"learning_rate": 1.0982734233745473e-05,
"loss": 0.0024,
"step": 1790
},
{
"epoch": 1.991698948533481,
"grad_norm": 0.32209501603512203,
"learning_rate": 1.0891572607651281e-05,
"loss": 0.0031,
"step": 1800
},
{
"epoch": 2.002213613724405,
"grad_norm": 0.04360741823732105,
"learning_rate": 1.0800336231420278e-05,
"loss": 0.0029,
"step": 1810
},
{
"epoch": 2.0132816823464306,
"grad_norm": 0.16855758165616114,
"learning_rate": 1.0709032754380797e-05,
"loss": 0.0025,
"step": 1820
},
{
"epoch": 2.024349750968456,
"grad_norm": 0.15666932651188484,
"learning_rate": 1.0617669831486944e-05,
"loss": 0.0013,
"step": 1830
},
{
"epoch": 2.0354178195904815,
"grad_norm": 0.20181179745149389,
"learning_rate": 1.0526255122676823e-05,
"loss": 0.0026,
"step": 1840
},
{
"epoch": 2.0464858882125068,
"grad_norm": 0.15003685273332168,
"learning_rate": 1.0434796292230303e-05,
"loss": 0.0008,
"step": 1850
},
{
"epoch": 2.0575539568345325,
"grad_norm": 0.00804358006410917,
"learning_rate": 1.0343301008126447e-05,
"loss": 0.0016,
"step": 1860
},
{
"epoch": 2.0686220254565577,
"grad_norm": 0.16774600983369314,
"learning_rate": 1.025177694140062e-05,
"loss": 0.0016,
"step": 1870
},
{
"epoch": 2.0796900940785834,
"grad_norm": 0.20346726127119819,
"learning_rate": 1.0160231765501345e-05,
"loss": 0.0019,
"step": 1880
},
{
"epoch": 2.0907581627006087,
"grad_norm": 0.08982622627628309,
"learning_rate": 1.006867315564696e-05,
"loss": 0.0012,
"step": 1890
},
{
"epoch": 2.1018262313226344,
"grad_norm": 0.12775435302757068,
"learning_rate": 9.977108788182104e-06,
"loss": 0.0017,
"step": 1900
},
{
"epoch": 2.1128942999446596,
"grad_norm": 0.2643672894526569,
"learning_rate": 9.885546339934145e-06,
"loss": 0.0016,
"step": 1910
},
{
"epoch": 2.1239623685666853,
"grad_norm": 0.03906918847071518,
"learning_rate": 9.793993487569544e-06,
"loss": 0.0023,
"step": 1920
},
{
"epoch": 2.1350304371887106,
"grad_norm": 0.01975929518029572,
"learning_rate": 9.702457906950235e-06,
"loss": 0.0006,
"step": 1930
},
{
"epoch": 2.146098505810736,
"grad_norm": 0.015510480606972247,
"learning_rate": 9.610947272490077e-06,
"loss": 0.0025,
"step": 1940
},
{
"epoch": 2.1571665744327615,
"grad_norm": 0.3546534218940891,
"learning_rate": 9.519469256511415e-06,
"loss": 0.0023,
"step": 1950
},
{
"epoch": 2.1682346430547867,
"grad_norm": 0.01060473919463944,
"learning_rate": 9.428031528601846e-06,
"loss": 0.0004,
"step": 1960
},
{
"epoch": 2.1793027116768124,
"grad_norm": 0.22104920814353005,
"learning_rate": 9.336641754971183e-06,
"loss": 0.0014,
"step": 1970
},
{
"epoch": 2.1903707802988377,
"grad_norm": 0.08689911675197709,
"learning_rate": 9.245307597808702e-06,
"loss": 0.0005,
"step": 1980
},
{
"epoch": 2.2014388489208634,
"grad_norm": 0.0883619792463627,
"learning_rate": 9.154036714640768e-06,
"loss": 0.0007,
"step": 1990
},
{
"epoch": 2.2125069175428886,
"grad_norm": 0.15082305658963324,
"learning_rate": 9.0628367576888e-06,
"loss": 0.0012,
"step": 2000
},
{
"epoch": 2.2235749861649143,
"grad_norm": 0.07102549960011285,
"learning_rate": 8.971715373227704e-06,
"loss": 0.0023,
"step": 2010
},
{
"epoch": 2.2346430547869396,
"grad_norm": 0.27718384861440826,
"learning_rate": 8.880680200944812e-06,
"loss": 0.001,
"step": 2020
},
{
"epoch": 2.2457111234089653,
"grad_norm": 0.39784384927805344,
"learning_rate": 8.789738873299356e-06,
"loss": 0.0014,
"step": 2030
},
{
"epoch": 2.2567791920309905,
"grad_norm": 0.10575032505942293,
"learning_rate": 8.698899014882572e-06,
"loss": 0.0007,
"step": 2040
},
{
"epoch": 2.2678472606530162,
"grad_norm": 0.015437632384868543,
"learning_rate": 8.60816824177842e-06,
"loss": 0.0004,
"step": 2050
},
{
"epoch": 2.2789153292750415,
"grad_norm": 0.013827548095505848,
"learning_rate": 8.517554160925073e-06,
"loss": 0.0013,
"step": 2060
},
{
"epoch": 2.2899833978970667,
"grad_norm": 0.1548641691681526,
"learning_rate": 8.42706436947714e-06,
"loss": 0.0009,
"step": 2070
},
{
"epoch": 2.3010514665190924,
"grad_norm": 0.1347709020457703,
"learning_rate": 8.336706454168701e-06,
"loss": 0.0012,
"step": 2080
},
{
"epoch": 2.312119535141118,
"grad_norm": 0.008104450008335772,
"learning_rate": 8.246487990677242e-06,
"loss": 0.0008,
"step": 2090
},
{
"epoch": 2.3231876037631434,
"grad_norm": 0.0069746001251187765,
"learning_rate": 8.156416542988505e-06,
"loss": 0.0005,
"step": 2100
},
{
"epoch": 2.3342556723851686,
"grad_norm": 0.06413164810520909,
"learning_rate": 8.066499662762312e-06,
"loss": 0.0017,
"step": 2110
},
{
"epoch": 2.3453237410071943,
"grad_norm": 0.27922715524820757,
"learning_rate": 7.976744888699416e-06,
"loss": 0.0005,
"step": 2120
},
{
"epoch": 2.3563918096292196,
"grad_norm": 0.1170676822125543,
"learning_rate": 7.887159745909484e-06,
"loss": 0.0023,
"step": 2130
},
{
"epoch": 2.3674598782512453,
"grad_norm": 0.09899495190658357,
"learning_rate": 7.797751745280153e-06,
"loss": 0.0012,
"step": 2140
},
{
"epoch": 2.3785279468732705,
"grad_norm": 0.1669162943718525,
"learning_rate": 7.708528382847333e-06,
"loss": 0.0017,
"step": 2150
},
{
"epoch": 2.389596015495296,
"grad_norm": 0.04531028241647312,
"learning_rate": 7.6194971391667126e-06,
"loss": 0.0012,
"step": 2160
},
{
"epoch": 2.4006640841173215,
"grad_norm": 0.024530073059923975,
"learning_rate": 7.530665478686613e-06,
"loss": 0.0006,
"step": 2170
},
{
"epoch": 2.411732152739347,
"grad_norm": 0.1393568845665947,
"learning_rate": 7.442040849122127e-06,
"loss": 0.0007,
"step": 2180
},
{
"epoch": 2.4228002213613724,
"grad_norm": 0.11837513907578925,
"learning_rate": 7.3536306808307256e-06,
"loss": 0.0008,
"step": 2190
},
{
"epoch": 2.4338682899833977,
"grad_norm": 0.2239767609538588,
"learning_rate": 7.265442386189281e-06,
"loss": 0.0006,
"step": 2200
},
{
"epoch": 2.4449363586054234,
"grad_norm": 0.03502224363169152,
"learning_rate": 7.177483358972607e-06,
"loss": 0.0007,
"step": 2210
},
{
"epoch": 2.456004427227449,
"grad_norm": 0.004121217145321139,
"learning_rate": 7.089760973733553e-06,
"loss": 0.0005,
"step": 2220
},
{
"epoch": 2.4670724958494743,
"grad_norm": 0.0080795156182359,
"learning_rate": 7.002282585184731e-06,
"loss": 0.0015,
"step": 2230
},
{
"epoch": 2.4781405644714996,
"grad_norm": 0.018145033752889977,
"learning_rate": 6.915055527581878e-06,
"loss": 0.0006,
"step": 2240
},
{
"epoch": 2.4892086330935252,
"grad_norm": 0.019850233754552235,
"learning_rate": 6.8280871141089415e-06,
"loss": 0.0008,
"step": 2250
},
{
"epoch": 2.5002767017155505,
"grad_norm": 0.1741781894645272,
"learning_rate": 6.741384636264961e-06,
"loss": 0.0031,
"step": 2260
},
{
"epoch": 2.511344770337576,
"grad_norm": 0.04992280498804918,
"learning_rate": 6.6549553632527154e-06,
"loss": 0.0006,
"step": 2270
},
{
"epoch": 2.5224128389596014,
"grad_norm": 0.01759604589558905,
"learning_rate": 6.568806541369287e-06,
"loss": 0.0004,
"step": 2280
},
{
"epoch": 2.533480907581627,
"grad_norm": 0.005607314564319438,
"learning_rate": 6.4829453933985096e-06,
"loss": 0.0003,
"step": 2290
},
{
"epoch": 2.5445489762036524,
"grad_norm": 0.08607954611806186,
"learning_rate": 6.397379118005423e-06,
"loss": 0.0008,
"step": 2300
},
{
"epoch": 2.555617044825678,
"grad_norm": 0.2820276784438115,
"learning_rate": 6.312114889132721e-06,
"loss": 0.0005,
"step": 2310
},
{
"epoch": 2.5666851134477033,
"grad_norm": 0.006054427468933993,
"learning_rate": 6.227159855399276e-06,
"loss": 0.0004,
"step": 2320
},
{
"epoch": 2.5777531820697286,
"grad_norm": 0.020395979259911268,
"learning_rate": 6.142521139500803e-06,
"loss": 0.0005,
"step": 2330
},
{
"epoch": 2.5888212506917543,
"grad_norm": 0.005378706050493182,
"learning_rate": 6.058205837612694e-06,
"loss": 0.0017,
"step": 2340
},
{
"epoch": 2.59988931931378,
"grad_norm": 0.6929959495629392,
"learning_rate": 5.974221018795048e-06,
"loss": 0.0018,
"step": 2350
},
{
"epoch": 2.6109573879358052,
"grad_norm": 0.1430231752480323,
"learning_rate": 5.89057372440002e-06,
"loss": 0.0006,
"step": 2360
},
{
"epoch": 2.6220254565578305,
"grad_norm": 0.08736899606365109,
"learning_rate": 5.807270967481442e-06,
"loss": 0.0005,
"step": 2370
},
{
"epoch": 2.633093525179856,
"grad_norm": 0.20917966364677992,
"learning_rate": 5.724319732206878e-06,
"loss": 0.0003,
"step": 2380
},
{
"epoch": 2.6441615938018814,
"grad_norm": 0.020443284638463004,
"learning_rate": 5.6417269732720204e-06,
"loss": 0.0008,
"step": 2390
},
{
"epoch": 2.655229662423907,
"grad_norm": 0.006232574325332509,
"learning_rate": 5.559499615317652e-06,
"loss": 0.0015,
"step": 2400
},
{
"epoch": 2.6662977310459324,
"grad_norm": 0.09483140471952521,
"learning_rate": 5.477644552349033e-06,
"loss": 0.001,
"step": 2410
},
{
"epoch": 2.677365799667958,
"grad_norm": 0.01497083814614671,
"learning_rate": 5.396168647157942e-06,
"loss": 0.0001,
"step": 2420
},
{
"epoch": 2.6884338682899833,
"grad_norm": 0.062082528516403966,
"learning_rate": 5.315078730747268e-06,
"loss": 0.0014,
"step": 2430
},
{
"epoch": 2.699501936912009,
"grad_norm": 0.016768871806098665,
"learning_rate": 5.234381601758306e-06,
"loss": 0.0004,
"step": 2440
},
{
"epoch": 2.7105700055340343,
"grad_norm": 0.021142976199730865,
"learning_rate": 5.154084025900742e-06,
"loss": 0.0004,
"step": 2450
},
{
"epoch": 2.7216380741560595,
"grad_norm": 0.005663906697363443,
"learning_rate": 5.0741927353854305e-06,
"loss": 0.0001,
"step": 2460
},
{
"epoch": 2.732706142778085,
"grad_norm": 0.0015717000409481488,
"learning_rate": 4.994714428359936e-06,
"loss": 0.0007,
"step": 2470
},
{
"epoch": 2.743774211400111,
"grad_norm": 0.003253438286001577,
"learning_rate": 4.915655768346975e-06,
"loss": 0.0005,
"step": 2480
},
{
"epoch": 2.754842280022136,
"grad_norm": 0.30553434446246647,
"learning_rate": 4.837023383685736e-06,
"loss": 0.0004,
"step": 2490
},
{
"epoch": 2.7659103486441614,
"grad_norm": 0.0015060930301738903,
"learning_rate": 4.758823866976152e-06,
"loss": 0.0003,
"step": 2500
},
{
"epoch": 2.776978417266187,
"grad_norm": 0.0012735780749710956,
"learning_rate": 4.681063774526166e-06,
"loss": 0.0,
"step": 2510
},
{
"epoch": 2.7880464858882124,
"grad_norm": 0.09126349400935568,
"learning_rate": 4.603749625802051e-06,
"loss": 0.0003,
"step": 2520
},
{
"epoch": 2.799114554510238,
"grad_norm": 0.14258230843971909,
"learning_rate": 4.526887902881822e-06,
"loss": 0.0002,
"step": 2530
},
{
"epoch": 2.8101826231322633,
"grad_norm": 0.00165028758610408,
"learning_rate": 4.450485049911757e-06,
"loss": 0.0011,
"step": 2540
},
{
"epoch": 2.821250691754289,
"grad_norm": 0.0067956970685621836,
"learning_rate": 4.374547472566129e-06,
"loss": 0.0004,
"step": 2550
},
{
"epoch": 2.8323187603763142,
"grad_norm": 0.003682680710757902,
"learning_rate": 4.299081537510143e-06,
"loss": 0.0005,
"step": 2560
},
{
"epoch": 2.84338682899834,
"grad_norm": 0.2401069296675629,
"learning_rate": 4.2240935718661365e-06,
"loss": 0.0003,
"step": 2570
},
{
"epoch": 2.854454897620365,
"grad_norm": 0.005953507115694622,
"learning_rate": 4.149589862683141e-06,
"loss": 0.0002,
"step": 2580
},
{
"epoch": 2.8655229662423904,
"grad_norm": 0.0014731662378259323,
"learning_rate": 4.075576656409733e-06,
"loss": 0.0001,
"step": 2590
},
{
"epoch": 2.876591034864416,
"grad_norm": 0.0023579910574366416,
"learning_rate": 4.002060158370361e-06,
"loss": 0.0001,
"step": 2600
},
{
"epoch": 2.887659103486442,
"grad_norm": 0.0011945901783503858,
"learning_rate": 3.9290465322450685e-06,
"loss": 0.0004,
"step": 2610
},
{
"epoch": 2.898727172108467,
"grad_norm": 0.03407895985174983,
"learning_rate": 3.8565418995527185e-06,
"loss": 0.0001,
"step": 2620
},
{
"epoch": 2.9097952407304923,
"grad_norm": 0.004566715125346021,
"learning_rate": 3.7845523391377815e-06,
"loss": 0.0005,
"step": 2630
},
{
"epoch": 2.920863309352518,
"grad_norm": 0.005795874642656273,
"learning_rate": 3.7130838866606665e-06,
"loss": 0.0004,
"step": 2640
},
{
"epoch": 2.9319313779745433,
"grad_norm": 0.008770525441123975,
"learning_rate": 3.642142534091695e-06,
"loss": 0.0001,
"step": 2650
},
{
"epoch": 2.942999446596569,
"grad_norm": 0.0013301081718034352,
"learning_rate": 3.571734229208712e-06,
"loss": 0.0002,
"step": 2660
},
{
"epoch": 2.9540675152185942,
"grad_norm": 0.022671106846233965,
"learning_rate": 3.5018648750984473e-06,
"loss": 0.0,
"step": 2670
},
{
"epoch": 2.96513558384062,
"grad_norm": 0.0019084527889065164,
"learning_rate": 3.4325403296615677e-06,
"loss": 0.0001,
"step": 2680
},
{
"epoch": 2.976203652462645,
"grad_norm": 0.0012586049007444003,
"learning_rate": 3.3637664051215703e-06,
"loss": 0.0002,
"step": 2690
},
{
"epoch": 2.987271721084671,
"grad_norm": 0.008633880712666318,
"learning_rate": 3.2955488675374635e-06,
"loss": 0.0005,
"step": 2700
},
{
"epoch": 2.998339789706696,
"grad_norm": 0.006829643276901058,
"learning_rate": 3.227893436320353e-06,
"loss": 0.0004,
"step": 2710
},
{
"epoch": 3.0088544548976204,
"grad_norm": 0.0856770046438516,
"learning_rate": 3.1608057837538976e-06,
"loss": 0.0001,
"step": 2720
},
{
"epoch": 3.0199225235196456,
"grad_norm": 0.0009597305762044255,
"learning_rate": 3.0942915345187617e-06,
"loss": 0.0001,
"step": 2730
},
{
"epoch": 3.0309905921416713,
"grad_norm": 0.016142733928253637,
"learning_rate": 3.028356265221033e-06,
"loss": 0.0001,
"step": 2740
},
{
"epoch": 3.0420586607636966,
"grad_norm": 0.0007246525915734167,
"learning_rate": 2.963005503924674e-06,
"loss": 0.0,
"step": 2750
},
{
"epoch": 3.0531267293857223,
"grad_norm": 0.0042929326717048525,
"learning_rate": 2.8982447296880423e-06,
"loss": 0.0001,
"step": 2760
},
{
"epoch": 3.0641947980077475,
"grad_norm": 0.0030026445487718596,
"learning_rate": 2.8340793721045266e-06,
"loss": 0.0001,
"step": 2770
},
{
"epoch": 3.075262866629773,
"grad_norm": 0.0012849609984761843,
"learning_rate": 2.7705148108473177e-06,
"loss": 0.0002,
"step": 2780
},
{
"epoch": 3.0863309352517985,
"grad_norm": 0.008697418155296581,
"learning_rate": 2.70755637521838e-06,
"loss": 0.0,
"step": 2790
},
{
"epoch": 3.097399003873824,
"grad_norm": 0.007466710314150477,
"learning_rate": 2.645209343701638e-06,
"loss": 0.0,
"step": 2800
},
{
"epoch": 3.1084670724958494,
"grad_norm": 0.000836180974578036,
"learning_rate": 2.5834789435204245e-06,
"loss": 0.0002,
"step": 2810
},
{
"epoch": 3.119535141117875,
"grad_norm": 0.10328226368751944,
"learning_rate": 2.5223703501992234e-06,
"loss": 0.0005,
"step": 2820
},
{
"epoch": 3.1306032097399004,
"grad_norm": 0.0014833894480815533,
"learning_rate": 2.4618886871297454e-06,
"loss": 0.0,
"step": 2830
},
{
"epoch": 3.141671278361926,
"grad_norm": 0.0012426287285033058,
"learning_rate": 2.4020390251413893e-06,
"loss": 0.0002,
"step": 2840
},
{
"epoch": 3.1527393469839513,
"grad_norm": 0.0012341856887167653,
"learning_rate": 2.342826382076098e-06,
"loss": 0.0001,
"step": 2850
},
{
"epoch": 3.1638074156059766,
"grad_norm": 0.0012545195151238747,
"learning_rate": 2.284255722367643e-06,
"loss": 0.0002,
"step": 2860
},
{
"epoch": 3.1748754842280023,
"grad_norm": 0.028964927877798387,
"learning_rate": 2.226331956625427e-06,
"loss": 0.0,
"step": 2870
},
{
"epoch": 3.1859435528500275,
"grad_norm": 0.0021281137508232786,
"learning_rate": 2.16905994122276e-06,
"loss": 0.0,
"step": 2880
},
{
"epoch": 3.197011621472053,
"grad_norm": 0.017846974643844316,
"learning_rate": 2.1124444778896914e-06,
"loss": 0.0001,
"step": 2890
},
{
"epoch": 3.2080796900940785,
"grad_norm": 0.04653872502302951,
"learning_rate": 2.0564903133104474e-06,
"loss": 0.0001,
"step": 2900
},
{
"epoch": 3.219147758716104,
"grad_norm": 0.00975360262148922,
"learning_rate": 2.001202138725451e-06,
"loss": 0.0001,
"step": 2910
},
{
"epoch": 3.2302158273381294,
"grad_norm": 0.0011681464770861175,
"learning_rate": 1.946584589538013e-06,
"loss": 0.0001,
"step": 2920
},
{
"epoch": 3.241283895960155,
"grad_norm": 0.002047929485430651,
"learning_rate": 1.8926422449256842e-06,
"loss": 0.0001,
"step": 2930
},
{
"epoch": 3.2523519645821803,
"grad_norm": 0.0011644854590848986,
"learning_rate": 1.8393796274563458e-06,
"loss": 0.0011,
"step": 2940
},
{
"epoch": 3.263420033204206,
"grad_norm": 0.006586949100274725,
"learning_rate": 1.786801202709032e-06,
"loss": 0.0004,
"step": 2950
},
{
"epoch": 3.2744881018262313,
"grad_norm": 0.05963749406991675,
"learning_rate": 1.7349113788995288e-06,
"loss": 0.0001,
"step": 2960
},
{
"epoch": 3.285556170448257,
"grad_norm": 0.0008960384323782237,
"learning_rate": 1.6837145065107862e-06,
"loss": 0.0,
"step": 2970
},
{
"epoch": 3.2966242390702822,
"grad_norm": 0.0010892427464036952,
"learning_rate": 1.6332148779281765e-06,
"loss": 0.0,
"step": 2980
},
{
"epoch": 3.3076923076923075,
"grad_norm": 0.005552386026199114,
"learning_rate": 1.583416727079602e-06,
"loss": 0.0,
"step": 2990
},
{
"epoch": 3.318760376314333,
"grad_norm": 0.0009765106236611565,
"learning_rate": 1.5343242290805348e-06,
"loss": 0.0,
"step": 3000
},
{
"epoch": 3.3298284449363584,
"grad_norm": 0.022067256735230037,
"learning_rate": 1.4859414998839694e-06,
"loss": 0.0003,
"step": 3010
},
{
"epoch": 3.340896513558384,
"grad_norm": 0.0006437609301820168,
"learning_rate": 1.4382725959353305e-06,
"loss": 0.0,
"step": 3020
},
{
"epoch": 3.3519645821804094,
"grad_norm": 0.0020417629447496534,
"learning_rate": 1.3913215138323877e-06,
"loss": 0.0,
"step": 3030
},
{
"epoch": 3.363032650802435,
"grad_norm": 0.0005098328209815861,
"learning_rate": 1.3450921899901637e-06,
"loss": 0.0001,
"step": 3040
},
{
"epoch": 3.3741007194244603,
"grad_norm": 0.0014165659540379754,
"learning_rate": 1.2995885003109166e-06,
"loss": 0.0002,
"step": 3050
},
{
"epoch": 3.385168788046486,
"grad_norm": 0.02746560881484342,
"learning_rate": 1.254814259859175e-06,
"loss": 0.0001,
"step": 3060
},
{
"epoch": 3.3962368566685113,
"grad_norm": 0.01739320847725493,
"learning_rate": 1.2107732225418766e-06,
"loss": 0.0,
"step": 3070
},
{
"epoch": 3.407304925290537,
"grad_norm": 0.0006431886745766922,
"learning_rate": 1.167469080793645e-06,
"loss": 0.0,
"step": 3080
},
{
"epoch": 3.418372993912562,
"grad_norm": 0.003257490633909583,
"learning_rate": 1.1249054652672097e-06,
"loss": 0.0,
"step": 3090
},
{
"epoch": 3.429441062534588,
"grad_norm": 0.00043153195343080357,
"learning_rate": 1.0830859445290044e-06,
"loss": 0.0001,
"step": 3100
},
{
"epoch": 3.440509131156613,
"grad_norm": 0.0016745154217626835,
"learning_rate": 1.0420140247599842e-06,
"loss": 0.0,
"step": 3110
},
{
"epoch": 3.4515771997786384,
"grad_norm": 0.009523095826316882,
"learning_rate": 1.0016931494616644e-06,
"loss": 0.0001,
"step": 3120
},
{
"epoch": 3.462645268400664,
"grad_norm": 0.0025193648225743472,
"learning_rate": 9.621266991674017e-07,
"loss": 0.0,
"step": 3130
},
{
"epoch": 3.4737133370226894,
"grad_norm": 0.004360486555943711,
"learning_rate": 9.233179911589874e-07,
"loss": 0.0,
"step": 3140
},
{
"epoch": 3.484781405644715,
"grad_norm": 0.0011194325613279412,
"learning_rate": 8.852702791885048e-07,
"loss": 0.0,
"step": 3150
},
{
"epoch": 3.4958494742667403,
"grad_norm": 0.006186485269155249,
"learning_rate": 8.479867532055452e-07,
"loss": 0.0001,
"step": 3160
},
{
"epoch": 3.506917542888766,
"grad_norm": 0.0707236321603267,
"learning_rate": 8.114705390897581e-07,
"loss": 0.0001,
"step": 3170
},
{
"epoch": 3.5179856115107913,
"grad_norm": 0.0008315016846056175,
"learning_rate": 7.757246983887679e-07,
"loss": 0.0001,
"step": 3180
},
{
"epoch": 3.529053680132817,
"grad_norm": 0.003081278609407882,
"learning_rate": 7.40752228061502e-07,
"loss": 0.0,
"step": 3190
},
{
"epoch": 3.540121748754842,
"grad_norm": 0.03262274667027854,
"learning_rate": 7.065560602269106e-07,
"loss": 0.0001,
"step": 3200
},
{
"epoch": 3.551189817376868,
"grad_norm": 0.0049780417471171675,
"learning_rate": 6.731390619181466e-07,
"loss": 0.0,
"step": 3210
},
{
"epoch": 3.562257885998893,
"grad_norm": 0.0006067046828209429,
"learning_rate": 6.405040348421876e-07,
"loss": 0.0004,
"step": 3220
},
{
"epoch": 3.573325954620919,
"grad_norm": 0.10601414226734147,
"learning_rate": 6.08653715144939e-07,
"loss": 0.0001,
"step": 3230
},
{
"epoch": 3.584394023242944,
"grad_norm": 0.014565859239713545,
"learning_rate": 5.775907731818308e-07,
"loss": 0.0006,
"step": 3240
},
{
"epoch": 3.5954620918649693,
"grad_norm": 0.0006330087021193012,
"learning_rate": 5.47317813293935e-07,
"loss": 0.0,
"step": 3250
},
{
"epoch": 3.606530160486995,
"grad_norm": 0.0008371382231997046,
"learning_rate": 5.17837373589618e-07,
"loss": 0.0001,
"step": 3260
},
{
"epoch": 3.6175982291090207,
"grad_norm": 0.0004617966536746336,
"learning_rate": 4.891519257317379e-07,
"loss": 0.0002,
"step": 3270
},
{
"epoch": 3.628666297731046,
"grad_norm": 0.04963699258614127,
"learning_rate": 4.612638747304243e-07,
"loss": 0.0001,
"step": 3280
},
{
"epoch": 3.6397343663530712,
"grad_norm": 0.00045115875633578936,
"learning_rate": 4.3417555874143644e-07,
"loss": 0.0001,
"step": 3290
},
{
"epoch": 3.650802434975097,
"grad_norm": 0.0003660195467245479,
"learning_rate": 4.078892488701347e-07,
"loss": 0.0,
"step": 3300
},
{
"epoch": 3.661870503597122,
"grad_norm": 0.005736554465665363,
"learning_rate": 3.824071489810599e-07,
"loss": 0.0,
"step": 3310
},
{
"epoch": 3.672938572219148,
"grad_norm": 0.007943425544438845,
"learning_rate": 3.5773139551317226e-07,
"loss": 0.0,
"step": 3320
},
{
"epoch": 3.684006640841173,
"grad_norm": 0.004240017703104073,
"learning_rate": 3.3386405730072237e-07,
"loss": 0.0,
"step": 3330
},
{
"epoch": 3.695074709463199,
"grad_norm": 0.006848121458682678,
"learning_rate": 3.108071353997999e-07,
"loss": 0.0,
"step": 3340
},
{
"epoch": 3.706142778085224,
"grad_norm": 0.0008720424724852751,
"learning_rate": 2.8856256292056797e-07,
"loss": 0.0006,
"step": 3350
},
{
"epoch": 3.7172108467072498,
"grad_norm": 0.0009071607947124355,
"learning_rate": 2.671322048651781e-07,
"loss": 0.0,
"step": 3360
},
{
"epoch": 3.728278915329275,
"grad_norm": 0.0006526998309807914,
"learning_rate": 2.4651785797142447e-07,
"loss": 0.0,
"step": 3370
},
{
"epoch": 3.7393469839513003,
"grad_norm": 0.000525162417622788,
"learning_rate": 2.267212505620886e-07,
"loss": 0.0,
"step": 3380
},
{
"epoch": 3.750415052573326,
"grad_norm": 0.0007723423555957404,
"learning_rate": 2.0774404240004432e-07,
"loss": 0.0001,
"step": 3390
},
{
"epoch": 3.7614831211953517,
"grad_norm": 0.0006151742760236711,
"learning_rate": 1.8958782454909563e-07,
"loss": 0.0,
"step": 3400
},
{
"epoch": 3.772551189817377,
"grad_norm": 0.019346914707342473,
"learning_rate": 1.72254119240588e-07,
"loss": 0.0,
"step": 3410
},
{
"epoch": 3.783619258439402,
"grad_norm": 0.000574300771385019,
"learning_rate": 1.5574437974577473e-07,
"loss": 0.0001,
"step": 3420
},
{
"epoch": 3.794687327061428,
"grad_norm": 0.0012025102235216681,
"learning_rate": 1.4005999025398231e-07,
"loss": 0.0004,
"step": 3430
},
{
"epoch": 3.805755395683453,
"grad_norm": 0.0019927107249734836,
"learning_rate": 1.2520226575655325e-07,
"loss": 0.0,
"step": 3440
},
{
"epoch": 3.816823464305479,
"grad_norm": 0.0004585933553038856,
"learning_rate": 1.1117245193659864e-07,
"loss": 0.0003,
"step": 3450
},
{
"epoch": 3.827891532927504,
"grad_norm": 0.0008599742748381332,
"learning_rate": 9.79717250645551e-08,
"loss": 0.0,
"step": 3460
},
{
"epoch": 3.8389596015495298,
"grad_norm": 0.005158902732885002,
"learning_rate": 8.56011918995725e-08,
"loss": 0.0001,
"step": 3470
},
{
"epoch": 3.850027670171555,
"grad_norm": 0.0005943589755676227,
"learning_rate": 7.406188959671601e-08,
"loss": 0.0,
"step": 3480
},
{
"epoch": 3.8610957387935807,
"grad_norm": 0.013582756443605442,
"learning_rate": 6.33547856200134e-08,
"loss": 0.0,
"step": 3490
},
{
"epoch": 3.872163807415606,
"grad_norm": 0.0015272986821739895,
"learning_rate": 5.3480777661341077e-08,
"loss": 0.0001,
"step": 3500
},
{
"epoch": 3.883231876037631,
"grad_norm": 0.023645089836197807,
"learning_rate": 4.4440693565160895e-08,
"loss": 0.0001,
"step": 3510
},
{
"epoch": 3.894299944659657,
"grad_norm": 0.004636458141232424,
"learning_rate": 3.6235291259113516e-08,
"loss": 0.0001,
"step": 3520
},
{
"epoch": 3.9053680132816826,
"grad_norm": 0.0005499173157726086,
"learning_rate": 2.886525869047363e-08,
"loss": 0.0,
"step": 3530
},
{
"epoch": 3.916436081903708,
"grad_norm": 0.0006622306223321599,
"learning_rate": 2.2331213768468363e-08,
"loss": 0.0,
"step": 3540
},
{
"epoch": 3.927504150525733,
"grad_norm": 0.0008544205486865539,
"learning_rate": 1.6633704312478683e-08,
"loss": 0.0001,
"step": 3550
},
{
"epoch": 3.938572219147759,
"grad_norm": 0.0027443144589558884,
"learning_rate": 1.177320800610171e-08,
"loss": 0.0,
"step": 3560
},
{
"epoch": 3.949640287769784,
"grad_norm": 0.0005251118311918424,
"learning_rate": 7.750132357106089e-09,
"loss": 0.0001,
"step": 3570
},
{
"epoch": 3.9607083563918097,
"grad_norm": 0.0012279003225779102,
"learning_rate": 4.5648146632648605e-09,
"loss": 0.0002,
"step": 3580
},
{
"epoch": 3.971776425013835,
"grad_norm": 0.001971040506480486,
"learning_rate": 2.217521984076987e-09,
"loss": 0.0,
"step": 3590
},
{
"epoch": 3.9828444936358607,
"grad_norm": 0.0005086949695235262,
"learning_rate": 7.08451118375253e-10,
"loss": 0.0,
"step": 3600
},
{
"epoch": 3.993912562257886,
"grad_norm": 0.001967814841098194,
"learning_rate": 3.772858782724598e-11,
"loss": 0.0,
"step": 3610
},
{
"epoch": 3.9961261759822913,
"step": 3612,
"total_flos": 3601561862275072.0,
"train_loss": 0.03465571804643162,
"train_runtime": 106374.8788,
"train_samples_per_second": 8.697,
"train_steps_per_second": 0.034
}
],
"logging_steps": 10,
"max_steps": 3612,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 800,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3601561862275072.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}