{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.995203836930456,
"eval_steps": 500,
"global_step": 30000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.019984012789768184,
"grad_norm": 10.435233116149902,
"learning_rate": 4.9835131894484415e-05,
"loss": 14.0782,
"step": 100
},
{
"epoch": 0.03996802557953637,
"grad_norm": 19.157014846801758,
"learning_rate": 4.966859845456968e-05,
"loss": 3.596,
"step": 200
},
{
"epoch": 0.05995203836930456,
"grad_norm": 5.946423530578613,
"learning_rate": 4.950206501465495e-05,
"loss": 2.0558,
"step": 300
},
{
"epoch": 0.07993605115907274,
"grad_norm": 2.1558027267456055,
"learning_rate": 4.9335531574740214e-05,
"loss": 1.3957,
"step": 400
},
{
"epoch": 0.09992006394884093,
"grad_norm": 2.5541210174560547,
"learning_rate": 4.916899813482548e-05,
"loss": 1.252,
"step": 500
},
{
"epoch": 0.11990407673860912,
"grad_norm": 1.8417296409606934,
"learning_rate": 4.9002464694910735e-05,
"loss": 1.1695,
"step": 600
},
{
"epoch": 0.1398880895283773,
"grad_norm": 1.654639720916748,
"learning_rate": 4.8835931254996005e-05,
"loss": 1.1848,
"step": 700
},
{
"epoch": 0.15987210231814547,
"grad_norm": 2.093991756439209,
"learning_rate": 4.866939781508127e-05,
"loss": 1.1056,
"step": 800
},
{
"epoch": 0.17985611510791366,
"grad_norm": 1.9905765056610107,
"learning_rate": 4.850286437516653e-05,
"loss": 1.1186,
"step": 900
},
{
"epoch": 0.19984012789768185,
"grad_norm": 1.2864068746566772,
"learning_rate": 4.83363309352518e-05,
"loss": 1.1394,
"step": 1000
},
{
"epoch": 0.21982414068745004,
"grad_norm": 1.5142974853515625,
"learning_rate": 4.816979749533707e-05,
"loss": 1.1162,
"step": 1100
},
{
"epoch": 0.23980815347721823,
"grad_norm": 0.9597436189651489,
"learning_rate": 4.800326405542233e-05,
"loss": 1.0922,
"step": 1200
},
{
"epoch": 0.2597921662669864,
"grad_norm": 1.7915301322937012,
"learning_rate": 4.7836730615507595e-05,
"loss": 1.0711,
"step": 1300
},
{
"epoch": 0.2797761790567546,
"grad_norm": 1.1338952779769897,
"learning_rate": 4.767019717559286e-05,
"loss": 0.9945,
"step": 1400
},
{
"epoch": 0.2997601918465228,
"grad_norm": 1.3437297344207764,
"learning_rate": 4.750366373567813e-05,
"loss": 1.0322,
"step": 1500
},
{
"epoch": 0.31974420463629094,
"grad_norm": 1.2949973344802856,
"learning_rate": 4.733713029576339e-05,
"loss": 1.0613,
"step": 1600
},
{
"epoch": 0.33972821742605913,
"grad_norm": 1.1362179517745972,
"learning_rate": 4.717059685584866e-05,
"loss": 1.0122,
"step": 1700
},
{
"epoch": 0.3597122302158273,
"grad_norm": 1.1878063678741455,
"learning_rate": 4.700406341593392e-05,
"loss": 1.0068,
"step": 1800
},
{
"epoch": 0.3796962430055955,
"grad_norm": 1.3588361740112305,
"learning_rate": 4.6837529976019185e-05,
"loss": 0.955,
"step": 1900
},
{
"epoch": 0.3996802557953637,
"grad_norm": 1.1428577899932861,
"learning_rate": 4.667099653610445e-05,
"loss": 0.9863,
"step": 2000
},
{
"epoch": 0.4196642685851319,
"grad_norm": 1.6441487073898315,
"learning_rate": 4.650446309618971e-05,
"loss": 0.9532,
"step": 2100
},
{
"epoch": 0.4396482813749001,
"grad_norm": 2.607586145401001,
"learning_rate": 4.633792965627498e-05,
"loss": 0.9877,
"step": 2200
},
{
"epoch": 0.4596322941646683,
"grad_norm": 2.24434757232666,
"learning_rate": 4.617139621636025e-05,
"loss": 0.9956,
"step": 2300
},
{
"epoch": 0.47961630695443647,
"grad_norm": 1.8238356113433838,
"learning_rate": 4.600486277644551e-05,
"loss": 0.9769,
"step": 2400
},
{
"epoch": 0.49960031974420466,
"grad_norm": 2.0538158416748047,
"learning_rate": 4.5838329336530775e-05,
"loss": 0.9458,
"step": 2500
},
{
"epoch": 0.5195843325339728,
"grad_norm": 2.1354427337646484,
"learning_rate": 4.5671795896616045e-05,
"loss": 0.975,
"step": 2600
},
{
"epoch": 0.539568345323741,
"grad_norm": 1.3763636350631714,
"learning_rate": 4.550526245670131e-05,
"loss": 0.9464,
"step": 2700
},
{
"epoch": 0.5595523581135092,
"grad_norm": 2.6834394931793213,
"learning_rate": 4.533872901678657e-05,
"loss": 0.9437,
"step": 2800
},
{
"epoch": 0.5795363709032774,
"grad_norm": 1.3325830698013306,
"learning_rate": 4.517219557687184e-05,
"loss": 0.9617,
"step": 2900
},
{
"epoch": 0.5995203836930456,
"grad_norm": 1.841642141342163,
"learning_rate": 4.500566213695711e-05,
"loss": 0.9489,
"step": 3000
},
{
"epoch": 0.6195043964828137,
"grad_norm": 1.0305529832839966,
"learning_rate": 4.483912869704237e-05,
"loss": 0.9432,
"step": 3100
},
{
"epoch": 0.6394884092725819,
"grad_norm": 1.4249075651168823,
"learning_rate": 4.467259525712763e-05,
"loss": 0.952,
"step": 3200
},
{
"epoch": 0.6594724220623501,
"grad_norm": 1.2994813919067383,
"learning_rate": 4.45060618172129e-05,
"loss": 1.0247,
"step": 3300
},
{
"epoch": 0.6794564348521183,
"grad_norm": 1.537548303604126,
"learning_rate": 4.433952837729816e-05,
"loss": 0.928,
"step": 3400
},
{
"epoch": 0.6994404476418865,
"grad_norm": 1.646200180053711,
"learning_rate": 4.4172994937383427e-05,
"loss": 0.9817,
"step": 3500
},
{
"epoch": 0.7194244604316546,
"grad_norm": 1.2779592275619507,
"learning_rate": 4.400646149746869e-05,
"loss": 0.8974,
"step": 3600
},
{
"epoch": 0.7394084732214229,
"grad_norm": 1.7886369228363037,
"learning_rate": 4.383992805755396e-05,
"loss": 0.9351,
"step": 3700
},
{
"epoch": 0.759392486011191,
"grad_norm": 1.5072957277297974,
"learning_rate": 4.3673394617639225e-05,
"loss": 0.882,
"step": 3800
},
{
"epoch": 0.7793764988009593,
"grad_norm": 2.019421339035034,
"learning_rate": 4.350686117772449e-05,
"loss": 0.9324,
"step": 3900
},
{
"epoch": 0.7993605115907274,
"grad_norm": 1.7232331037521362,
"learning_rate": 4.334032773780975e-05,
"loss": 0.9094,
"step": 4000
},
{
"epoch": 0.8193445243804957,
"grad_norm": 1.7297419309616089,
"learning_rate": 4.317379429789502e-05,
"loss": 0.9658,
"step": 4100
},
{
"epoch": 0.8393285371702638,
"grad_norm": 1.560420274734497,
"learning_rate": 4.300726085798029e-05,
"loss": 0.9613,
"step": 4200
},
{
"epoch": 0.8593125499600319,
"grad_norm": 1.9014427661895752,
"learning_rate": 4.284072741806555e-05,
"loss": 0.9047,
"step": 4300
},
{
"epoch": 0.8792965627498002,
"grad_norm": 1.4741132259368896,
"learning_rate": 4.2674193978150815e-05,
"loss": 0.7914,
"step": 4400
},
{
"epoch": 0.8992805755395683,
"grad_norm": 0.8008555173873901,
"learning_rate": 4.250766053823608e-05,
"loss": 0.8678,
"step": 4500
},
{
"epoch": 0.9192645883293366,
"grad_norm": 1.5738321542739868,
"learning_rate": 4.234112709832134e-05,
"loss": 0.8379,
"step": 4600
},
{
"epoch": 0.9392486011191047,
"grad_norm": 1.7838175296783447,
"learning_rate": 4.2174593658406606e-05,
"loss": 0.9581,
"step": 4700
},
{
"epoch": 0.9592326139088729,
"grad_norm": 1.6761012077331543,
"learning_rate": 4.200806021849188e-05,
"loss": 0.8583,
"step": 4800
},
{
"epoch": 0.9792166266986411,
"grad_norm": 1.320033073425293,
"learning_rate": 4.184152677857714e-05,
"loss": 0.8709,
"step": 4900
},
{
"epoch": 0.9992006394884093,
"grad_norm": 2.0931804180145264,
"learning_rate": 4.1674993338662404e-05,
"loss": 0.9064,
"step": 5000
},
{
"epoch": 1.0191846522781776,
"grad_norm": 1.1682066917419434,
"learning_rate": 4.150845989874767e-05,
"loss": 0.8728,
"step": 5100
},
{
"epoch": 1.0391686650679457,
"grad_norm": 1.2569072246551514,
"learning_rate": 4.134192645883294e-05,
"loss": 0.8921,
"step": 5200
},
{
"epoch": 1.0591526778577138,
"grad_norm": 1.4215683937072754,
"learning_rate": 4.11753930189182e-05,
"loss": 0.8211,
"step": 5300
},
{
"epoch": 1.079136690647482,
"grad_norm": 1.6081187725067139,
"learning_rate": 4.1008859579003467e-05,
"loss": 0.9338,
"step": 5400
},
{
"epoch": 1.09912070343725,
"grad_norm": 1.4916200637817383,
"learning_rate": 4.084232613908873e-05,
"loss": 0.8142,
"step": 5500
},
{
"epoch": 1.1191047162270185,
"grad_norm": 1.8639625310897827,
"learning_rate": 4.0675792699174e-05,
"loss": 0.8746,
"step": 5600
},
{
"epoch": 1.1390887290167866,
"grad_norm": 1.1741764545440674,
"learning_rate": 4.0509259259259265e-05,
"loss": 0.8032,
"step": 5700
},
{
"epoch": 1.1590727418065547,
"grad_norm": 1.7627875804901123,
"learning_rate": 4.034272581934453e-05,
"loss": 0.8681,
"step": 5800
},
{
"epoch": 1.1790567545963229,
"grad_norm": 0.7432733178138733,
"learning_rate": 4.0176192379429786e-05,
"loss": 0.8968,
"step": 5900
},
{
"epoch": 1.1990407673860912,
"grad_norm": 1.5172642469406128,
"learning_rate": 4.0009658939515056e-05,
"loss": 0.9653,
"step": 6000
},
{
"epoch": 1.2190247801758594,
"grad_norm": 2.0822958946228027,
"learning_rate": 3.984312549960032e-05,
"loss": 0.8038,
"step": 6100
},
{
"epoch": 1.2390087929656275,
"grad_norm": 2.2852039337158203,
"learning_rate": 3.9676592059685584e-05,
"loss": 0.8041,
"step": 6200
},
{
"epoch": 1.2589928057553956,
"grad_norm": 1.214968204498291,
"learning_rate": 3.951005861977085e-05,
"loss": 0.8382,
"step": 6300
},
{
"epoch": 1.2789768185451638,
"grad_norm": 2.792722225189209,
"learning_rate": 3.934352517985612e-05,
"loss": 0.8534,
"step": 6400
},
{
"epoch": 1.2989608313349321,
"grad_norm": 1.6279624700546265,
"learning_rate": 3.917699173994138e-05,
"loss": 0.8387,
"step": 6500
},
{
"epoch": 1.3189448441247003,
"grad_norm": 1.57301664352417,
"learning_rate": 3.9010458300026646e-05,
"loss": 0.8583,
"step": 6600
},
{
"epoch": 1.3389288569144684,
"grad_norm": 1.2693675756454468,
"learning_rate": 3.884392486011191e-05,
"loss": 0.7893,
"step": 6700
},
{
"epoch": 1.3589128697042367,
"grad_norm": 1.1760280132293701,
"learning_rate": 3.867739142019718e-05,
"loss": 0.8204,
"step": 6800
},
{
"epoch": 1.3788968824940047,
"grad_norm": 1.8213127851486206,
"learning_rate": 3.8510857980282444e-05,
"loss": 0.9061,
"step": 6900
},
{
"epoch": 1.398880895283773,
"grad_norm": 1.2589592933654785,
"learning_rate": 3.834432454036771e-05,
"loss": 0.856,
"step": 7000
},
{
"epoch": 1.4188649080735412,
"grad_norm": 2.5817718505859375,
"learning_rate": 3.817779110045297e-05,
"loss": 0.8542,
"step": 7100
},
{
"epoch": 1.4388489208633093,
"grad_norm": 1.1825404167175293,
"learning_rate": 3.8011257660538236e-05,
"loss": 0.8298,
"step": 7200
},
{
"epoch": 1.4588329336530776,
"grad_norm": 1.6443575620651245,
"learning_rate": 3.78447242206235e-05,
"loss": 0.823,
"step": 7300
},
{
"epoch": 1.4788169464428458,
"grad_norm": 1.9887899160385132,
"learning_rate": 3.7678190780708764e-05,
"loss": 0.7936,
"step": 7400
},
{
"epoch": 1.498800959232614,
"grad_norm": 1.8799304962158203,
"learning_rate": 3.7511657340794034e-05,
"loss": 0.8755,
"step": 7500
},
{
"epoch": 1.518784972022382,
"grad_norm": 1.6680015325546265,
"learning_rate": 3.73451239008793e-05,
"loss": 0.8669,
"step": 7600
},
{
"epoch": 1.5387689848121502,
"grad_norm": 1.7756261825561523,
"learning_rate": 3.717859046096456e-05,
"loss": 0.8572,
"step": 7700
},
{
"epoch": 1.5587529976019185,
"grad_norm": 1.3951911926269531,
"learning_rate": 3.7012057021049826e-05,
"loss": 0.8422,
"step": 7800
},
{
"epoch": 1.5787370103916867,
"grad_norm": 1.8145322799682617,
"learning_rate": 3.6845523581135096e-05,
"loss": 0.783,
"step": 7900
},
{
"epoch": 1.5987210231814548,
"grad_norm": 1.4113447666168213,
"learning_rate": 3.667899014122036e-05,
"loss": 0.8368,
"step": 8000
},
{
"epoch": 1.6187050359712232,
"grad_norm": 1.5562957525253296,
"learning_rate": 3.6512456701305624e-05,
"loss": 0.8232,
"step": 8100
},
{
"epoch": 1.638689048760991,
"grad_norm": 2.0334463119506836,
"learning_rate": 3.634592326139089e-05,
"loss": 0.8272,
"step": 8200
},
{
"epoch": 1.6586730615507594,
"grad_norm": 2.305115222930908,
"learning_rate": 3.617938982147616e-05,
"loss": 0.8708,
"step": 8300
},
{
"epoch": 1.6786570743405276,
"grad_norm": 1.9576376676559448,
"learning_rate": 3.601285638156142e-05,
"loss": 0.8437,
"step": 8400
},
{
"epoch": 1.6986410871302957,
"grad_norm": 1.324064016342163,
"learning_rate": 3.584632294164668e-05,
"loss": 0.8197,
"step": 8500
},
{
"epoch": 1.718625099920064,
"grad_norm": 1.5594903230667114,
"learning_rate": 3.567978950173195e-05,
"loss": 0.8365,
"step": 8600
},
{
"epoch": 1.738609112709832,
"grad_norm": 1.853633999824524,
"learning_rate": 3.5513256061817214e-05,
"loss": 0.7914,
"step": 8700
},
{
"epoch": 1.7585931254996003,
"grad_norm": 1.839158296585083,
"learning_rate": 3.534672262190248e-05,
"loss": 0.9374,
"step": 8800
},
{
"epoch": 1.7785771382893685,
"grad_norm": 2.5038366317749023,
"learning_rate": 3.518018918198774e-05,
"loss": 0.8202,
"step": 8900
},
{
"epoch": 1.7985611510791366,
"grad_norm": 1.7603284120559692,
"learning_rate": 3.501365574207301e-05,
"loss": 0.8346,
"step": 9000
},
{
"epoch": 1.818545163868905,
"grad_norm": 1.9243416786193848,
"learning_rate": 3.4847122302158276e-05,
"loss": 0.8141,
"step": 9100
},
{
"epoch": 1.838529176658673,
"grad_norm": 1.6993930339813232,
"learning_rate": 3.468058886224354e-05,
"loss": 0.7974,
"step": 9200
},
{
"epoch": 1.8585131894484412,
"grad_norm": 1.9248780012130737,
"learning_rate": 3.4514055422328804e-05,
"loss": 0.8543,
"step": 9300
},
{
"epoch": 1.8784972022382096,
"grad_norm": 1.7247469425201416,
"learning_rate": 3.4347521982414074e-05,
"loss": 0.7968,
"step": 9400
},
{
"epoch": 1.8984812150279775,
"grad_norm": 1.165992259979248,
"learning_rate": 3.418098854249934e-05,
"loss": 0.7946,
"step": 9500
},
{
"epoch": 1.9184652278177459,
"grad_norm": 1.5617034435272217,
"learning_rate": 3.40144551025846e-05,
"loss": 0.8452,
"step": 9600
},
{
"epoch": 1.938449240607514,
"grad_norm": 1.9524955749511719,
"learning_rate": 3.3847921662669866e-05,
"loss": 0.821,
"step": 9700
},
{
"epoch": 1.9584332533972821,
"grad_norm": 1.201984167098999,
"learning_rate": 3.3681388222755136e-05,
"loss": 0.8244,
"step": 9800
},
{
"epoch": 1.9784172661870505,
"grad_norm": 1.5261083841323853,
"learning_rate": 3.3514854782840393e-05,
"loss": 0.8521,
"step": 9900
},
{
"epoch": 1.9984012789768184,
"grad_norm": 0.8879593014717102,
"learning_rate": 3.334832134292566e-05,
"loss": 0.7745,
"step": 10000
},
{
"epoch": 2.0183852917665868,
"grad_norm": 1.9460114240646362,
"learning_rate": 3.318178790301093e-05,
"loss": 0.7926,
"step": 10100
},
{
"epoch": 2.038369304556355,
"grad_norm": 2.0698747634887695,
"learning_rate": 3.301525446309619e-05,
"loss": 0.8668,
"step": 10200
},
{
"epoch": 2.058353317346123,
"grad_norm": 1.6188371181488037,
"learning_rate": 3.2848721023181456e-05,
"loss": 0.7743,
"step": 10300
},
{
"epoch": 2.0783373301358914,
"grad_norm": 1.4746142625808716,
"learning_rate": 3.268218758326672e-05,
"loss": 0.776,
"step": 10400
},
{
"epoch": 2.0983213429256593,
"grad_norm": 2.6285245418548584,
"learning_rate": 3.251565414335199e-05,
"loss": 0.7952,
"step": 10500
},
{
"epoch": 2.1183053557154277,
"grad_norm": 2.9462263584136963,
"learning_rate": 3.2349120703437254e-05,
"loss": 0.8492,
"step": 10600
},
{
"epoch": 2.138289368505196,
"grad_norm": 2.2768771648406982,
"learning_rate": 3.218258726352252e-05,
"loss": 0.7773,
"step": 10700
},
{
"epoch": 2.158273381294964,
"grad_norm": 2.4314112663269043,
"learning_rate": 3.201605382360778e-05,
"loss": 0.7789,
"step": 10800
},
{
"epoch": 2.1782573940847323,
"grad_norm": 2.631697654724121,
"learning_rate": 3.184952038369305e-05,
"loss": 0.7809,
"step": 10900
},
{
"epoch": 2.1982414068745,
"grad_norm": 2.0636370182037354,
"learning_rate": 3.1682986943778316e-05,
"loss": 0.7627,
"step": 11000
},
{
"epoch": 2.2182254196642686,
"grad_norm": 1.861494779586792,
"learning_rate": 3.151645350386358e-05,
"loss": 0.7454,
"step": 11100
},
{
"epoch": 2.238209432454037,
"grad_norm": 1.6431078910827637,
"learning_rate": 3.134992006394884e-05,
"loss": 0.8208,
"step": 11200
},
{
"epoch": 2.258193445243805,
"grad_norm": 1.1081715822219849,
"learning_rate": 3.118338662403411e-05,
"loss": 0.7963,
"step": 11300
},
{
"epoch": 2.278177458033573,
"grad_norm": 1.6696077585220337,
"learning_rate": 3.101685318411937e-05,
"loss": 0.7948,
"step": 11400
},
{
"epoch": 2.2981614708233415,
"grad_norm": 1.1712377071380615,
"learning_rate": 3.0850319744204635e-05,
"loss": 0.7907,
"step": 11500
},
{
"epoch": 2.3181454836131095,
"grad_norm": 1.28898024559021,
"learning_rate": 3.06837863042899e-05,
"loss": 0.7791,
"step": 11600
},
{
"epoch": 2.338129496402878,
"grad_norm": 1.3267985582351685,
"learning_rate": 3.0517252864375166e-05,
"loss": 0.7819,
"step": 11700
},
{
"epoch": 2.3581135091926457,
"grad_norm": 1.4074293375015259,
"learning_rate": 3.0350719424460434e-05,
"loss": 0.818,
"step": 11800
},
{
"epoch": 2.378097521982414,
"grad_norm": 0.9492627382278442,
"learning_rate": 3.0184185984545697e-05,
"loss": 0.7689,
"step": 11900
},
{
"epoch": 2.3980815347721824,
"grad_norm": 1.8090003728866577,
"learning_rate": 3.0017652544630965e-05,
"loss": 0.7845,
"step": 12000
},
{
"epoch": 2.4180655475619504,
"grad_norm": 1.899207353591919,
"learning_rate": 2.985111910471623e-05,
"loss": 0.7742,
"step": 12100
},
{
"epoch": 2.4380495603517187,
"grad_norm": 2.0821797847747803,
"learning_rate": 2.9684585664801496e-05,
"loss": 0.7696,
"step": 12200
},
{
"epoch": 2.4580335731414866,
"grad_norm": 1.089876651763916,
"learning_rate": 2.951805222488676e-05,
"loss": 0.7873,
"step": 12300
},
{
"epoch": 2.478017585931255,
"grad_norm": 1.265599250793457,
"learning_rate": 2.9351518784972027e-05,
"loss": 0.7504,
"step": 12400
},
{
"epoch": 2.4980015987210233,
"grad_norm": 2.7753829956054688,
"learning_rate": 2.9184985345057287e-05,
"loss": 0.7824,
"step": 12500
},
{
"epoch": 2.5179856115107913,
"grad_norm": 1.0310410261154175,
"learning_rate": 2.901845190514255e-05,
"loss": 0.7326,
"step": 12600
},
{
"epoch": 2.5379696243005596,
"grad_norm": 2.056279182434082,
"learning_rate": 2.8851918465227818e-05,
"loss": 0.8411,
"step": 12700
},
{
"epoch": 2.5579536370903275,
"grad_norm": 1.1815407276153564,
"learning_rate": 2.8685385025313082e-05,
"loss": 0.805,
"step": 12800
},
{
"epoch": 2.577937649880096,
"grad_norm": 1.6167210340499878,
"learning_rate": 2.851885158539835e-05,
"loss": 0.7311,
"step": 12900
},
{
"epoch": 2.5979216626698642,
"grad_norm": 1.488755226135254,
"learning_rate": 2.8352318145483613e-05,
"loss": 0.8335,
"step": 13000
},
{
"epoch": 2.617905675459632,
"grad_norm": 2.5013859272003174,
"learning_rate": 2.818578470556888e-05,
"loss": 0.7833,
"step": 13100
},
{
"epoch": 2.6378896882494005,
"grad_norm": 1.102152943611145,
"learning_rate": 2.8019251265654144e-05,
"loss": 0.7536,
"step": 13200
},
{
"epoch": 2.6578737010391684,
"grad_norm": 1.3805499076843262,
"learning_rate": 2.785271782573941e-05,
"loss": 0.775,
"step": 13300
},
{
"epoch": 2.677857713828937,
"grad_norm": 2.189347505569458,
"learning_rate": 2.7686184385824675e-05,
"loss": 0.8178,
"step": 13400
},
{
"epoch": 2.697841726618705,
"grad_norm": 0.9750763177871704,
"learning_rate": 2.7519650945909942e-05,
"loss": 0.7743,
"step": 13500
},
{
"epoch": 2.7178257394084735,
"grad_norm": 1.2844312191009521,
"learning_rate": 2.7353117505995206e-05,
"loss": 0.7631,
"step": 13600
},
{
"epoch": 2.7378097521982414,
"grad_norm": 1.4551914930343628,
"learning_rate": 2.7186584066080474e-05,
"loss": 0.76,
"step": 13700
},
{
"epoch": 2.7577937649880093,
"grad_norm": 0.8891064524650574,
"learning_rate": 2.7020050626165734e-05,
"loss": 0.8252,
"step": 13800
},
{
"epoch": 2.7777777777777777,
"grad_norm": 1.9776784181594849,
"learning_rate": 2.6853517186250998e-05,
"loss": 0.8037,
"step": 13900
},
{
"epoch": 2.797761790567546,
"grad_norm": 1.429692029953003,
"learning_rate": 2.6686983746336265e-05,
"loss": 0.7369,
"step": 14000
},
{
"epoch": 2.8177458033573144,
"grad_norm": 2.0837855339050293,
"learning_rate": 2.652045030642153e-05,
"loss": 0.742,
"step": 14100
},
{
"epoch": 2.8377298161470823,
"grad_norm": 1.2353509664535522,
"learning_rate": 2.6353916866506796e-05,
"loss": 0.7615,
"step": 14200
},
{
"epoch": 2.8577138289368507,
"grad_norm": 0.8735284209251404,
"learning_rate": 2.618738342659206e-05,
"loss": 0.7872,
"step": 14300
},
{
"epoch": 2.8776978417266186,
"grad_norm": 1.0889009237289429,
"learning_rate": 2.6020849986677327e-05,
"loss": 0.7455,
"step": 14400
},
{
"epoch": 2.897681854516387,
"grad_norm": 1.506787657737732,
"learning_rate": 2.585431654676259e-05,
"loss": 0.792,
"step": 14500
},
{
"epoch": 2.9176658673061553,
"grad_norm": 0.7630636096000671,
"learning_rate": 2.5687783106847858e-05,
"loss": 0.7918,
"step": 14600
},
{
"epoch": 2.937649880095923,
"grad_norm": 1.6361045837402344,
"learning_rate": 2.5521249666933122e-05,
"loss": 0.7357,
"step": 14700
},
{
"epoch": 2.9576338928856916,
"grad_norm": 2.248220920562744,
"learning_rate": 2.535471622701839e-05,
"loss": 0.7954,
"step": 14800
},
{
"epoch": 2.9776179056754595,
"grad_norm": 1.14662766456604,
"learning_rate": 2.5188182787103653e-05,
"loss": 0.7865,
"step": 14900
},
{
"epoch": 2.997601918465228,
"grad_norm": 1.3895844221115112,
"learning_rate": 2.502164934718892e-05,
"loss": 0.7364,
"step": 15000
},
{
"epoch": 3.017585931254996,
"grad_norm": 2.1330533027648926,
"learning_rate": 2.485511590727418e-05,
"loss": 0.7244,
"step": 15100
},
{
"epoch": 3.037569944044764,
"grad_norm": 1.384775996208191,
"learning_rate": 2.4688582467359448e-05,
"loss": 0.7393,
"step": 15200
},
{
"epoch": 3.0575539568345325,
"grad_norm": 0.9841705560684204,
"learning_rate": 2.4522049027444712e-05,
"loss": 0.8051,
"step": 15300
},
{
"epoch": 3.0775379696243004,
"grad_norm": 1.224924921989441,
"learning_rate": 2.435551558752998e-05,
"loss": 0.8004,
"step": 15400
},
{
"epoch": 3.0975219824140687,
"grad_norm": 2.2387399673461914,
"learning_rate": 2.418898214761524e-05,
"loss": 0.8051,
"step": 15500
},
{
"epoch": 3.117505995203837,
"grad_norm": 1.8771803379058838,
"learning_rate": 2.4022448707700507e-05,
"loss": 0.7903,
"step": 15600
},
{
"epoch": 3.137490007993605,
"grad_norm": 1.786600112915039,
"learning_rate": 2.385591526778577e-05,
"loss": 0.7796,
"step": 15700
},
{
"epoch": 3.1574740207833734,
"grad_norm": 1.0823020935058594,
"learning_rate": 2.3689381827871038e-05,
"loss": 0.7468,
"step": 15800
},
{
"epoch": 3.1774580335731413,
"grad_norm": 1.9462608098983765,
"learning_rate": 2.35228483879563e-05,
"loss": 0.7854,
"step": 15900
},
{
"epoch": 3.1974420463629096,
"grad_norm": 1.3235732316970825,
"learning_rate": 2.335631494804157e-05,
"loss": 0.7584,
"step": 16000
},
{
"epoch": 3.217426059152678,
"grad_norm": 1.5206961631774902,
"learning_rate": 2.3189781508126833e-05,
"loss": 0.8104,
"step": 16100
},
{
"epoch": 3.237410071942446,
"grad_norm": 1.4281466007232666,
"learning_rate": 2.3023248068212097e-05,
"loss": 0.7505,
"step": 16200
},
{
"epoch": 3.2573940847322143,
"grad_norm": 1.9032511711120605,
"learning_rate": 2.2856714628297364e-05,
"loss": 0.7813,
"step": 16300
},
{
"epoch": 3.277378097521982,
"grad_norm": 2.10361909866333,
"learning_rate": 2.2690181188382628e-05,
"loss": 0.7369,
"step": 16400
},
{
"epoch": 3.2973621103117505,
"grad_norm": 1.440158486366272,
"learning_rate": 2.2523647748467895e-05,
"loss": 0.7576,
"step": 16500
},
{
"epoch": 3.317346123101519,
"grad_norm": 1.8777798414230347,
"learning_rate": 2.235711430855316e-05,
"loss": 0.7317,
"step": 16600
},
{
"epoch": 3.337330135891287,
"grad_norm": 1.6413357257843018,
"learning_rate": 2.2190580868638426e-05,
"loss": 0.7802,
"step": 16700
},
{
"epoch": 3.357314148681055,
"grad_norm": 1.820087194442749,
"learning_rate": 2.2024047428723686e-05,
"loss": 0.7435,
"step": 16800
},
{
"epoch": 3.3772981614708235,
"grad_norm": 2.5140113830566406,
"learning_rate": 2.1857513988808954e-05,
"loss": 0.7532,
"step": 16900
},
{
"epoch": 3.3972821742605914,
"grad_norm": 1.7011070251464844,
"learning_rate": 2.1690980548894217e-05,
"loss": 0.728,
"step": 17000
},
{
"epoch": 3.41726618705036,
"grad_norm": 1.3051706552505493,
"learning_rate": 2.1524447108979485e-05,
"loss": 0.7493,
"step": 17100
},
{
"epoch": 3.437250199840128,
"grad_norm": 0.9745834469795227,
"learning_rate": 2.135791366906475e-05,
"loss": 0.7219,
"step": 17200
},
{
"epoch": 3.457234212629896,
"grad_norm": 1.3213515281677246,
"learning_rate": 2.1191380229150016e-05,
"loss": 0.7703,
"step": 17300
},
{
"epoch": 3.4772182254196644,
"grad_norm": 0.735060453414917,
"learning_rate": 2.102484678923528e-05,
"loss": 0.7342,
"step": 17400
},
{
"epoch": 3.4972022382094323,
"grad_norm": 1.073197603225708,
"learning_rate": 2.0858313349320543e-05,
"loss": 0.7023,
"step": 17500
},
{
"epoch": 3.5171862509992007,
"grad_norm": 1.797711730003357,
"learning_rate": 2.069177990940581e-05,
"loss": 0.7944,
"step": 17600
},
{
"epoch": 3.537170263788969,
"grad_norm": 1.3365331888198853,
"learning_rate": 2.0525246469491074e-05,
"loss": 0.7773,
"step": 17700
},
{
"epoch": 3.557154276578737,
"grad_norm": 1.451333999633789,
"learning_rate": 2.035871302957634e-05,
"loss": 0.7659,
"step": 17800
},
{
"epoch": 3.5771382893685053,
"grad_norm": 1.5622735023498535,
"learning_rate": 2.0192179589661606e-05,
"loss": 0.7676,
"step": 17900
},
{
"epoch": 3.597122302158273,
"grad_norm": 1.5826952457427979,
"learning_rate": 2.0025646149746873e-05,
"loss": 0.7393,
"step": 18000
},
{
"epoch": 3.6171063149480416,
"grad_norm": 0.7937633991241455,
"learning_rate": 1.9859112709832133e-05,
"loss": 0.7112,
"step": 18100
},
{
"epoch": 3.63709032773781,
"grad_norm": 1.8199377059936523,
"learning_rate": 1.96925792699174e-05,
"loss": 0.7722,
"step": 18200
},
{
"epoch": 3.657074340527578,
"grad_norm": 2.317171573638916,
"learning_rate": 1.9526045830002664e-05,
"loss": 0.7735,
"step": 18300
},
{
"epoch": 3.677058353317346,
"grad_norm": 0.9344459772109985,
"learning_rate": 1.935951239008793e-05,
"loss": 0.7168,
"step": 18400
},
{
"epoch": 3.697042366107114,
"grad_norm": 1.833892583847046,
"learning_rate": 1.9192978950173195e-05,
"loss": 0.6825,
"step": 18500
},
{
"epoch": 3.7170263788968825,
"grad_norm": 1.4093741178512573,
"learning_rate": 1.9026445510258463e-05,
"loss": 0.7087,
"step": 18600
},
{
"epoch": 3.737010391686651,
"grad_norm": 2.0284645557403564,
"learning_rate": 1.8859912070343726e-05,
"loss": 0.7579,
"step": 18700
},
{
"epoch": 3.7569944044764187,
"grad_norm": 1.8383668661117554,
"learning_rate": 1.869337863042899e-05,
"loss": 0.7603,
"step": 18800
},
{
"epoch": 3.776978417266187,
"grad_norm": 1.5985366106033325,
"learning_rate": 1.8526845190514254e-05,
"loss": 0.7166,
"step": 18900
},
{
"epoch": 3.796962430055955,
"grad_norm": 1.5089521408081055,
"learning_rate": 1.836031175059952e-05,
"loss": 0.7678,
"step": 19000
},
{
"epoch": 3.8169464428457234,
"grad_norm": 1.2770063877105713,
"learning_rate": 1.8193778310684785e-05,
"loss": 0.7213,
"step": 19100
},
{
"epoch": 3.8369304556354917,
"grad_norm": 2.4528274536132812,
"learning_rate": 1.8027244870770052e-05,
"loss": 0.7255,
"step": 19200
},
{
"epoch": 3.8569144684252596,
"grad_norm": 1.736755132675171,
"learning_rate": 1.7860711430855316e-05,
"loss": 0.6784,
"step": 19300
},
{
"epoch": 3.876898481215028,
"grad_norm": 1.719307780265808,
"learning_rate": 1.7694177990940583e-05,
"loss": 0.7795,
"step": 19400
},
{
"epoch": 3.896882494004796,
"grad_norm": 2.070528984069824,
"learning_rate": 1.7527644551025847e-05,
"loss": 0.7509,
"step": 19500
},
{
"epoch": 3.9168665067945643,
"grad_norm": 1.6482255458831787,
"learning_rate": 1.736111111111111e-05,
"loss": 0.7202,
"step": 19600
},
{
"epoch": 3.9368505195843326,
"grad_norm": 1.1660830974578857,
"learning_rate": 1.719457767119638e-05,
"loss": 0.7042,
"step": 19700
},
{
"epoch": 3.956834532374101,
"grad_norm": 1.0131560564041138,
"learning_rate": 1.7028044231281642e-05,
"loss": 0.7059,
"step": 19800
},
{
"epoch": 3.976818545163869,
"grad_norm": 1.1839569807052612,
"learning_rate": 1.686151079136691e-05,
"loss": 0.77,
"step": 19900
},
{
"epoch": 3.996802557953637,
"grad_norm": 1.736053705215454,
"learning_rate": 1.6694977351452173e-05,
"loss": 0.7703,
"step": 20000
},
{
"epoch": 4.016786570743405,
"grad_norm": 1.3700270652770996,
"learning_rate": 1.6528443911537437e-05,
"loss": 0.6643,
"step": 20100
},
{
"epoch": 4.0367705835331735,
"grad_norm": 1.347440481185913,
"learning_rate": 1.63619104716227e-05,
"loss": 0.7502,
"step": 20200
},
{
"epoch": 4.056754596322942,
"grad_norm": 1.9421720504760742,
"learning_rate": 1.6195377031707968e-05,
"loss": 0.7382,
"step": 20300
},
{
"epoch": 4.07673860911271,
"grad_norm": 0.9211772084236145,
"learning_rate": 1.6028843591793232e-05,
"loss": 0.7249,
"step": 20400
},
{
"epoch": 4.096722621902478,
"grad_norm": 2.1698520183563232,
"learning_rate": 1.58623101518785e-05,
"loss": 0.7339,
"step": 20500
},
{
"epoch": 4.116706634692246,
"grad_norm": 1.6852116584777832,
"learning_rate": 1.5695776711963763e-05,
"loss": 0.7525,
"step": 20600
},
{
"epoch": 4.136690647482014,
"grad_norm": 1.8582841157913208,
"learning_rate": 1.552924327204903e-05,
"loss": 0.7168,
"step": 20700
},
{
"epoch": 4.156674660271783,
"grad_norm": 1.3949832916259766,
"learning_rate": 1.536270983213429e-05,
"loss": 0.6835,
"step": 20800
},
{
"epoch": 4.176658673061551,
"grad_norm": 2.044853925704956,
"learning_rate": 1.5196176392219558e-05,
"loss": 0.7332,
"step": 20900
},
{
"epoch": 4.196642685851319,
"grad_norm": 1.3187381029129028,
"learning_rate": 1.5029642952304823e-05,
"loss": 0.7724,
"step": 21000
},
{
"epoch": 4.216626698641087,
"grad_norm": 1.18405020236969,
"learning_rate": 1.4863109512390089e-05,
"loss": 0.7677,
"step": 21100
},
{
"epoch": 4.236610711430855,
"grad_norm": 1.2868226766586304,
"learning_rate": 1.4696576072475355e-05,
"loss": 0.7168,
"step": 21200
},
{
"epoch": 4.256594724220624,
"grad_norm": 2.145659923553467,
"learning_rate": 1.453004263256062e-05,
"loss": 0.7574,
"step": 21300
},
{
"epoch": 4.276578737010392,
"grad_norm": 1.0491008758544922,
"learning_rate": 1.4363509192645886e-05,
"loss": 0.7274,
"step": 21400
},
{
"epoch": 4.2965627498001595,
"grad_norm": 1.9524632692337036,
"learning_rate": 1.4196975752731148e-05,
"loss": 0.7256,
"step": 21500
},
{
"epoch": 4.316546762589928,
"grad_norm": 1.6348446607589722,
"learning_rate": 1.4030442312816413e-05,
"loss": 0.6971,
"step": 21600
},
{
"epoch": 4.336530775379696,
"grad_norm": 1.6102409362792969,
"learning_rate": 1.3863908872901679e-05,
"loss": 0.7031,
"step": 21700
},
{
"epoch": 4.356514788169465,
"grad_norm": 1.4496809244155884,
"learning_rate": 1.3697375432986944e-05,
"loss": 0.76,
"step": 21800
},
{
"epoch": 4.376498800959233,
"grad_norm": 2.370002508163452,
"learning_rate": 1.353084199307221e-05,
"loss": 0.7098,
"step": 21900
},
{
"epoch": 4.396482813749,
"grad_norm": 1.1416559219360352,
"learning_rate": 1.3364308553157475e-05,
"loss": 0.7565,
"step": 22000
},
{
"epoch": 4.416466826538769,
"grad_norm": 1.6672168970108032,
"learning_rate": 1.319777511324274e-05,
"loss": 0.7939,
"step": 22100
},
{
"epoch": 4.436450839328537,
"grad_norm": 1.1106956005096436,
"learning_rate": 1.3031241673328005e-05,
"loss": 0.6645,
"step": 22200
},
{
"epoch": 4.4564348521183055,
"grad_norm": 1.4987940788269043,
"learning_rate": 1.286470823341327e-05,
"loss": 0.7117,
"step": 22300
},
{
"epoch": 4.476418864908074,
"grad_norm": 2.063014268875122,
"learning_rate": 1.2698174793498536e-05,
"loss": 0.767,
"step": 22400
},
{
"epoch": 4.496402877697841,
"grad_norm": 0.748756468296051,
"learning_rate": 1.2531641353583801e-05,
"loss": 0.7393,
"step": 22500
},
{
"epoch": 4.51638689048761,
"grad_norm": 1.3971226215362549,
"learning_rate": 1.2365107913669065e-05,
"loss": 0.7782,
"step": 22600
},
{
"epoch": 4.536370903277378,
"grad_norm": 1.5306447744369507,
"learning_rate": 1.219857447375433e-05,
"loss": 0.7299,
"step": 22700
},
{
"epoch": 4.556354916067146,
"grad_norm": 1.409225344657898,
"learning_rate": 1.2032041033839596e-05,
"loss": 0.6752,
"step": 22800
},
{
"epoch": 4.576338928856915,
"grad_norm": 1.396794080734253,
"learning_rate": 1.186550759392486e-05,
"loss": 0.6417,
"step": 22900
},
{
"epoch": 4.596322941646683,
"grad_norm": 1.6455470323562622,
"learning_rate": 1.1698974154010126e-05,
"loss": 0.7545,
"step": 23000
},
{
"epoch": 4.616306954436451,
"grad_norm": 1.4188311100006104,
"learning_rate": 1.1532440714095391e-05,
"loss": 0.7217,
"step": 23100
},
{
"epoch": 4.636290967226219,
"grad_norm": 1.1025303602218628,
"learning_rate": 1.1365907274180657e-05,
"loss": 0.7419,
"step": 23200
},
{
"epoch": 4.656274980015987,
"grad_norm": 1.0919783115386963,
"learning_rate": 1.119937383426592e-05,
"loss": 0.725,
"step": 23300
},
{
"epoch": 4.676258992805756,
"grad_norm": 2.179637908935547,
"learning_rate": 1.1032840394351186e-05,
"loss": 0.7052,
"step": 23400
},
{
"epoch": 4.696243005595523,
"grad_norm": 1.4243191480636597,
"learning_rate": 1.0866306954436452e-05,
"loss": 0.7437,
"step": 23500
},
{
"epoch": 4.7162270183852915,
"grad_norm": 1.6711329221725464,
"learning_rate": 1.0699773514521715e-05,
"loss": 0.7378,
"step": 23600
},
{
"epoch": 4.73621103117506,
"grad_norm": 1.2967829704284668,
"learning_rate": 1.0533240074606981e-05,
"loss": 0.7386,
"step": 23700
},
{
"epoch": 4.756195043964828,
"grad_norm": 1.737625002861023,
"learning_rate": 1.0366706634692246e-05,
"loss": 0.7012,
"step": 23800
},
{
"epoch": 4.7761790567545965,
"grad_norm": 1.062472939491272,
"learning_rate": 1.020017319477751e-05,
"loss": 0.6797,
"step": 23900
},
{
"epoch": 4.796163069544365,
"grad_norm": 1.044542908668518,
"learning_rate": 1.0033639754862776e-05,
"loss": 0.7285,
"step": 24000
},
{
"epoch": 4.816147082334132,
"grad_norm": 1.70567786693573,
"learning_rate": 9.867106314948041e-06,
"loss": 0.7777,
"step": 24100
},
{
"epoch": 4.836131095123901,
"grad_norm": 1.6937395334243774,
"learning_rate": 9.700572875033307e-06,
"loss": 0.7378,
"step": 24200
},
{
"epoch": 4.856115107913669,
"grad_norm": 2.7036936283111572,
"learning_rate": 9.534039435118572e-06,
"loss": 0.7813,
"step": 24300
},
{
"epoch": 4.876099120703437,
"grad_norm": 1.1682194471359253,
"learning_rate": 9.367505995203838e-06,
"loss": 0.7155,
"step": 24400
},
{
"epoch": 4.896083133493206,
"grad_norm": 1.2117973566055298,
"learning_rate": 9.200972555289104e-06,
"loss": 0.7273,
"step": 24500
},
{
"epoch": 4.916067146282973,
"grad_norm": 0.9339836239814758,
"learning_rate": 9.034439115374367e-06,
"loss": 0.7368,
"step": 24600
},
{
"epoch": 4.936051159072742,
"grad_norm": 1.3919428586959839,
"learning_rate": 8.867905675459633e-06,
"loss": 0.7059,
"step": 24700
},
{
"epoch": 4.95603517186251,
"grad_norm": 2.1438040733337402,
"learning_rate": 8.701372235544898e-06,
"loss": 0.7197,
"step": 24800
},
{
"epoch": 4.976019184652278,
"grad_norm": 1.892350435256958,
"learning_rate": 8.534838795630162e-06,
"loss": 0.7292,
"step": 24900
},
{
"epoch": 4.996003197442047,
"grad_norm": 2.050062656402588,
"learning_rate": 8.368305355715428e-06,
"loss": 0.7791,
"step": 25000
},
{
"epoch": 5.015987210231814,
"grad_norm": 2.285053014755249,
"learning_rate": 8.201771915800693e-06,
"loss": 0.6937,
"step": 25100
},
{
"epoch": 5.0359712230215825,
"grad_norm": 1.6725279092788696,
"learning_rate": 8.035238475885959e-06,
"loss": 0.7443,
"step": 25200
},
{
"epoch": 5.055955235811351,
"grad_norm": 1.590450644493103,
"learning_rate": 7.868705035971223e-06,
"loss": 0.7069,
"step": 25300
},
{
"epoch": 5.075939248601119,
"grad_norm": 0.7603669762611389,
"learning_rate": 7.702171596056488e-06,
"loss": 0.6778,
"step": 25400
},
{
"epoch": 5.095923261390888,
"grad_norm": 1.8916963338851929,
"learning_rate": 7.535638156141754e-06,
"loss": 0.769,
"step": 25500
},
{
"epoch": 5.115907274180655,
"grad_norm": 1.6110832691192627,
"learning_rate": 7.3691047162270184e-06,
"loss": 0.7027,
"step": 25600
},
{
"epoch": 5.135891286970423,
"grad_norm": 1.796796202659607,
"learning_rate": 7.202571276312284e-06,
"loss": 0.72,
"step": 25700
},
{
"epoch": 5.155875299760192,
"grad_norm": 1.8212794065475464,
"learning_rate": 7.0360378363975495e-06,
"loss": 0.7004,
"step": 25800
},
{
"epoch": 5.17585931254996,
"grad_norm": 1.0340906381607056,
"learning_rate": 6.869504396482813e-06,
"loss": 0.6687,
"step": 25900
},
{
"epoch": 5.1958433253397285,
"grad_norm": 1.8287034034729004,
"learning_rate": 6.702970956568079e-06,
"loss": 0.6774,
"step": 26000
},
{
"epoch": 5.215827338129497,
"grad_norm": 1.657259225845337,
"learning_rate": 6.536437516653344e-06,
"loss": 0.6995,
"step": 26100
},
{
"epoch": 5.235811350919264,
"grad_norm": 1.8235076665878296,
"learning_rate": 6.36990407673861e-06,
"loss": 0.7814,
"step": 26200
},
{
"epoch": 5.255795363709033,
"grad_norm": 1.6127688884735107,
"learning_rate": 6.203370636823875e-06,
"loss": 0.6797,
"step": 26300
},
{
"epoch": 5.275779376498801,
"grad_norm": 1.2275160551071167,
"learning_rate": 6.03683719690914e-06,
"loss": 0.7336,
"step": 26400
},
{
"epoch": 5.295763389288569,
"grad_norm": 1.6593281030654907,
"learning_rate": 5.870303756994405e-06,
"loss": 0.6988,
"step": 26500
},
{
"epoch": 5.315747402078338,
"grad_norm": 1.1069490909576416,
"learning_rate": 5.70377031707967e-06,
"loss": 0.801,
"step": 26600
},
{
"epoch": 5.335731414868105,
"grad_norm": 1.7498623132705688,
"learning_rate": 5.537236877164935e-06,
"loss": 0.715,
"step": 26700
},
{
"epoch": 5.355715427657874,
"grad_norm": 1.7322038412094116,
"learning_rate": 5.3707034372502e-06,
"loss": 0.7076,
"step": 26800
},
{
"epoch": 5.375699440447642,
"grad_norm": 1.2660248279571533,
"learning_rate": 5.204169997335465e-06,
"loss": 0.7494,
"step": 26900
},
{
"epoch": 5.39568345323741,
"grad_norm": 2.537752628326416,
"learning_rate": 5.03763655742073e-06,
"loss": 0.7326,
"step": 27000
},
{
"epoch": 5.415667466027179,
"grad_norm": 0.991534411907196,
"learning_rate": 4.8711031175059955e-06,
"loss": 0.6929,
"step": 27100
},
{
"epoch": 5.435651478816946,
"grad_norm": 2.0230729579925537,
"learning_rate": 4.70456967759126e-06,
"loss": 0.6787,
"step": 27200
},
{
"epoch": 5.4556354916067145,
"grad_norm": 1.5560120344161987,
"learning_rate": 4.538036237676526e-06,
"loss": 0.7251,
"step": 27300
},
{
"epoch": 5.475619504396483,
"grad_norm": 1.5086272954940796,
"learning_rate": 4.371502797761791e-06,
"loss": 0.7066,
"step": 27400
},
{
"epoch": 5.495603517186251,
"grad_norm": 1.6183174848556519,
"learning_rate": 4.204969357847056e-06,
"loss": 0.7161,
"step": 27500
},
{
"epoch": 5.5155875299760195,
"grad_norm": 1.1214239597320557,
"learning_rate": 4.0384359179323214e-06,
"loss": 0.7414,
"step": 27600
},
{
"epoch": 5.535571542765787,
"grad_norm": 1.4948476552963257,
"learning_rate": 3.871902478017586e-06,
"loss": 0.7303,
"step": 27700
},
{
"epoch": 5.555555555555555,
"grad_norm": 1.094460368156433,
"learning_rate": 3.705369038102851e-06,
"loss": 0.7214,
"step": 27800
},
{
"epoch": 5.575539568345324,
"grad_norm": 1.8006253242492676,
"learning_rate": 3.5388355981881163e-06,
"loss": 0.7382,
"step": 27900
},
{
"epoch": 5.595523581135092,
"grad_norm": 1.0595532655715942,
"learning_rate": 3.3723021582733815e-06,
"loss": 0.6835,
"step": 28000
},
{
"epoch": 5.61550759392486,
"grad_norm": 1.5675129890441895,
"learning_rate": 3.205768718358647e-06,
"loss": 0.7426,
"step": 28100
},
{
"epoch": 5.635491606714629,
"grad_norm": 1.543182134628296,
"learning_rate": 3.0392352784439117e-06,
"loss": 0.7393,
"step": 28200
},
{
"epoch": 5.655475619504396,
"grad_norm": 1.6735225915908813,
"learning_rate": 2.8727018385291768e-06,
"loss": 0.7106,
"step": 28300
},
{
"epoch": 5.675459632294165,
"grad_norm": 1.2037389278411865,
"learning_rate": 2.706168398614442e-06,
"loss": 0.6928,
"step": 28400
},
{
"epoch": 5.695443645083933,
"grad_norm": 1.957836627960205,
"learning_rate": 2.539634958699707e-06,
"loss": 0.8086,
"step": 28500
},
{
"epoch": 5.715427657873701,
"grad_norm": 2.085599899291992,
"learning_rate": 2.373101518784972e-06,
"loss": 0.7587,
"step": 28600
},
{
"epoch": 5.735411670663469,
"grad_norm": 1.3564984798431396,
"learning_rate": 2.206568078870237e-06,
"loss": 0.7197,
"step": 28700
},
{
"epoch": 5.755395683453237,
"grad_norm": 1.659226655960083,
"learning_rate": 2.0400346389555023e-06,
"loss": 0.728,
"step": 28800
},
{
"epoch": 5.7753796962430055,
"grad_norm": 1.3784935474395752,
"learning_rate": 1.8735011990407676e-06,
"loss": 0.7561,
"step": 28900
},
{
"epoch": 5.795363709032774,
"grad_norm": 1.4514496326446533,
"learning_rate": 1.7069677591260325e-06,
"loss": 0.7205,
"step": 29000
},
{
"epoch": 5.815347721822542,
"grad_norm": 1.7896771430969238,
"learning_rate": 1.5404343192112976e-06,
"loss": 0.667,
"step": 29100
},
{
"epoch": 5.835331734612311,
"grad_norm": 1.4074804782867432,
"learning_rate": 1.3739008792965628e-06,
"loss": 0.71,
"step": 29200
},
{
"epoch": 5.855315747402078,
"grad_norm": 1.33772873878479,
"learning_rate": 1.2073674393818279e-06,
"loss": 0.7688,
"step": 29300
},
{
"epoch": 5.875299760191846,
"grad_norm": 1.8295559883117676,
"learning_rate": 1.040833999467093e-06,
"loss": 0.7439,
"step": 29400
},
{
"epoch": 5.895283772981615,
"grad_norm": 0.9400151371955872,
"learning_rate": 8.743005595523582e-07,
"loss": 0.6891,
"step": 29500
},
{
"epoch": 5.915267785771383,
"grad_norm": 1.7990922927856445,
"learning_rate": 7.077671196376233e-07,
"loss": 0.6384,
"step": 29600
},
{
"epoch": 5.935251798561151,
"grad_norm": 1.74308180809021,
"learning_rate": 5.412336797228884e-07,
"loss": 0.6973,
"step": 29700
},
{
"epoch": 5.955235811350919,
"grad_norm": 1.1248557567596436,
"learning_rate": 3.747002398081535e-07,
"loss": 0.7265,
"step": 29800
},
{
"epoch": 5.975219824140687,
"grad_norm": 1.9805783033370972,
"learning_rate": 2.0816679989341861e-07,
"loss": 0.7107,
"step": 29900
},
{
"epoch": 5.995203836930456,
"grad_norm": 1.6576383113861084,
"learning_rate": 4.163335997868372e-08,
"loss": 0.7073,
"step": 30000
}
],
"logging_steps": 100,
"max_steps": 30024,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.230686056448e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}