{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.995203836930456, "eval_steps": 500, "global_step": 30000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.019984012789768184, "grad_norm": 10.435233116149902, "learning_rate": 4.9835131894484415e-05, "loss": 14.0782, "step": 100 }, { "epoch": 0.03996802557953637, "grad_norm": 19.157014846801758, "learning_rate": 4.966859845456968e-05, "loss": 3.596, "step": 200 }, { "epoch": 0.05995203836930456, "grad_norm": 5.946423530578613, "learning_rate": 4.950206501465495e-05, "loss": 2.0558, "step": 300 }, { "epoch": 0.07993605115907274, "grad_norm": 2.1558027267456055, "learning_rate": 4.9335531574740214e-05, "loss": 1.3957, "step": 400 }, { "epoch": 0.09992006394884093, "grad_norm": 2.5541210174560547, "learning_rate": 4.916899813482548e-05, "loss": 1.252, "step": 500 }, { "epoch": 0.11990407673860912, "grad_norm": 1.8417296409606934, "learning_rate": 4.9002464694910735e-05, "loss": 1.1695, "step": 600 }, { "epoch": 0.1398880895283773, "grad_norm": 1.654639720916748, "learning_rate": 4.8835931254996005e-05, "loss": 1.1848, "step": 700 }, { "epoch": 0.15987210231814547, "grad_norm": 2.093991756439209, "learning_rate": 4.866939781508127e-05, "loss": 1.1056, "step": 800 }, { "epoch": 0.17985611510791366, "grad_norm": 1.9905765056610107, "learning_rate": 4.850286437516653e-05, "loss": 1.1186, "step": 900 }, { "epoch": 0.19984012789768185, "grad_norm": 1.2864068746566772, "learning_rate": 4.83363309352518e-05, "loss": 1.1394, "step": 1000 }, { "epoch": 0.21982414068745004, "grad_norm": 1.5142974853515625, "learning_rate": 4.816979749533707e-05, "loss": 1.1162, "step": 1100 }, { "epoch": 0.23980815347721823, "grad_norm": 0.9597436189651489, "learning_rate": 4.800326405542233e-05, "loss": 1.0922, "step": 1200 }, { "epoch": 0.2597921662669864, "grad_norm": 1.7915301322937012, "learning_rate": 4.7836730615507595e-05, "loss": 1.0711, "step": 1300 }, { "epoch": 0.2797761790567546, "grad_norm": 1.1338952779769897, "learning_rate": 4.767019717559286e-05, "loss": 0.9945, "step": 1400 }, { "epoch": 0.2997601918465228, "grad_norm": 1.3437297344207764, "learning_rate": 4.750366373567813e-05, "loss": 1.0322, "step": 1500 }, { "epoch": 0.31974420463629094, "grad_norm": 1.2949973344802856, "learning_rate": 4.733713029576339e-05, "loss": 1.0613, "step": 1600 }, { "epoch": 0.33972821742605913, "grad_norm": 1.1362179517745972, "learning_rate": 4.717059685584866e-05, "loss": 1.0122, "step": 1700 }, { "epoch": 0.3597122302158273, "grad_norm": 1.1878063678741455, "learning_rate": 4.700406341593392e-05, "loss": 1.0068, "step": 1800 }, { "epoch": 0.3796962430055955, "grad_norm": 1.3588361740112305, "learning_rate": 4.6837529976019185e-05, "loss": 0.955, "step": 1900 }, { "epoch": 0.3996802557953637, "grad_norm": 1.1428577899932861, "learning_rate": 4.667099653610445e-05, "loss": 0.9863, "step": 2000 }, { "epoch": 0.4196642685851319, "grad_norm": 1.6441487073898315, "learning_rate": 4.650446309618971e-05, "loss": 0.9532, "step": 2100 }, { "epoch": 0.4396482813749001, "grad_norm": 2.607586145401001, "learning_rate": 4.633792965627498e-05, "loss": 0.9877, "step": 2200 }, { "epoch": 0.4596322941646683, "grad_norm": 2.24434757232666, "learning_rate": 4.617139621636025e-05, "loss": 0.9956, "step": 2300 }, { "epoch": 0.47961630695443647, "grad_norm": 1.8238356113433838, "learning_rate": 4.600486277644551e-05, "loss": 0.9769, "step": 2400 }, { "epoch": 0.49960031974420466, "grad_norm": 2.0538158416748047, "learning_rate": 4.5838329336530775e-05, "loss": 0.9458, "step": 2500 }, { "epoch": 0.5195843325339728, "grad_norm": 2.1354427337646484, "learning_rate": 4.5671795896616045e-05, "loss": 0.975, "step": 2600 }, { "epoch": 0.539568345323741, "grad_norm": 1.3763636350631714, "learning_rate": 4.550526245670131e-05, "loss": 0.9464, "step": 2700 }, { "epoch": 0.5595523581135092, "grad_norm": 2.6834394931793213, "learning_rate": 4.533872901678657e-05, "loss": 0.9437, "step": 2800 }, { "epoch": 0.5795363709032774, "grad_norm": 1.3325830698013306, "learning_rate": 4.517219557687184e-05, "loss": 0.9617, "step": 2900 }, { "epoch": 0.5995203836930456, "grad_norm": 1.841642141342163, "learning_rate": 4.500566213695711e-05, "loss": 0.9489, "step": 3000 }, { "epoch": 0.6195043964828137, "grad_norm": 1.0305529832839966, "learning_rate": 4.483912869704237e-05, "loss": 0.9432, "step": 3100 }, { "epoch": 0.6394884092725819, "grad_norm": 1.4249075651168823, "learning_rate": 4.467259525712763e-05, "loss": 0.952, "step": 3200 }, { "epoch": 0.6594724220623501, "grad_norm": 1.2994813919067383, "learning_rate": 4.45060618172129e-05, "loss": 1.0247, "step": 3300 }, { "epoch": 0.6794564348521183, "grad_norm": 1.537548303604126, "learning_rate": 4.433952837729816e-05, "loss": 0.928, "step": 3400 }, { "epoch": 0.6994404476418865, "grad_norm": 1.646200180053711, "learning_rate": 4.4172994937383427e-05, "loss": 0.9817, "step": 3500 }, { "epoch": 0.7194244604316546, "grad_norm": 1.2779592275619507, "learning_rate": 4.400646149746869e-05, "loss": 0.8974, "step": 3600 }, { "epoch": 0.7394084732214229, "grad_norm": 1.7886369228363037, "learning_rate": 4.383992805755396e-05, "loss": 0.9351, "step": 3700 }, { "epoch": 0.759392486011191, "grad_norm": 1.5072957277297974, "learning_rate": 4.3673394617639225e-05, "loss": 0.882, "step": 3800 }, { "epoch": 0.7793764988009593, "grad_norm": 2.019421339035034, "learning_rate": 4.350686117772449e-05, "loss": 0.9324, "step": 3900 }, { "epoch": 0.7993605115907274, "grad_norm": 1.7232331037521362, "learning_rate": 4.334032773780975e-05, "loss": 0.9094, "step": 4000 }, { "epoch": 0.8193445243804957, "grad_norm": 1.7297419309616089, "learning_rate": 4.317379429789502e-05, "loss": 0.9658, "step": 4100 }, { "epoch": 0.8393285371702638, "grad_norm": 1.560420274734497, "learning_rate": 4.300726085798029e-05, "loss": 0.9613, "step": 4200 }, { "epoch": 0.8593125499600319, "grad_norm": 1.9014427661895752, "learning_rate": 4.284072741806555e-05, "loss": 0.9047, "step": 4300 }, { "epoch": 0.8792965627498002, "grad_norm": 1.4741132259368896, "learning_rate": 4.2674193978150815e-05, "loss": 0.7914, "step": 4400 }, { "epoch": 0.8992805755395683, "grad_norm": 0.8008555173873901, "learning_rate": 4.250766053823608e-05, "loss": 0.8678, "step": 4500 }, { "epoch": 0.9192645883293366, "grad_norm": 1.5738321542739868, "learning_rate": 4.234112709832134e-05, "loss": 0.8379, "step": 4600 }, { "epoch": 0.9392486011191047, "grad_norm": 1.7838175296783447, "learning_rate": 4.2174593658406606e-05, "loss": 0.9581, "step": 4700 }, { "epoch": 0.9592326139088729, "grad_norm": 1.6761012077331543, "learning_rate": 4.200806021849188e-05, "loss": 0.8583, "step": 4800 }, { "epoch": 0.9792166266986411, "grad_norm": 1.320033073425293, "learning_rate": 4.184152677857714e-05, "loss": 0.8709, "step": 4900 }, { "epoch": 0.9992006394884093, "grad_norm": 2.0931804180145264, "learning_rate": 4.1674993338662404e-05, "loss": 0.9064, "step": 5000 }, { "epoch": 1.0191846522781776, "grad_norm": 1.1682066917419434, "learning_rate": 4.150845989874767e-05, "loss": 0.8728, "step": 5100 }, { "epoch": 1.0391686650679457, "grad_norm": 1.2569072246551514, "learning_rate": 4.134192645883294e-05, "loss": 0.8921, "step": 5200 }, { "epoch": 1.0591526778577138, "grad_norm": 1.4215683937072754, "learning_rate": 4.11753930189182e-05, "loss": 0.8211, "step": 5300 }, { "epoch": 1.079136690647482, "grad_norm": 1.6081187725067139, "learning_rate": 4.1008859579003467e-05, "loss": 0.9338, "step": 5400 }, { "epoch": 1.09912070343725, "grad_norm": 1.4916200637817383, "learning_rate": 4.084232613908873e-05, "loss": 0.8142, "step": 5500 }, { "epoch": 1.1191047162270185, "grad_norm": 1.8639625310897827, "learning_rate": 4.0675792699174e-05, "loss": 0.8746, "step": 5600 }, { "epoch": 1.1390887290167866, "grad_norm": 1.1741764545440674, "learning_rate": 4.0509259259259265e-05, "loss": 0.8032, "step": 5700 }, { "epoch": 1.1590727418065547, "grad_norm": 1.7627875804901123, "learning_rate": 4.034272581934453e-05, "loss": 0.8681, "step": 5800 }, { "epoch": 1.1790567545963229, "grad_norm": 0.7432733178138733, "learning_rate": 4.0176192379429786e-05, "loss": 0.8968, "step": 5900 }, { "epoch": 1.1990407673860912, "grad_norm": 1.5172642469406128, "learning_rate": 4.0009658939515056e-05, "loss": 0.9653, "step": 6000 }, { "epoch": 1.2190247801758594, "grad_norm": 2.0822958946228027, "learning_rate": 3.984312549960032e-05, "loss": 0.8038, "step": 6100 }, { "epoch": 1.2390087929656275, "grad_norm": 2.2852039337158203, "learning_rate": 3.9676592059685584e-05, "loss": 0.8041, "step": 6200 }, { "epoch": 1.2589928057553956, "grad_norm": 1.214968204498291, "learning_rate": 3.951005861977085e-05, "loss": 0.8382, "step": 6300 }, { "epoch": 1.2789768185451638, "grad_norm": 2.792722225189209, "learning_rate": 3.934352517985612e-05, "loss": 0.8534, "step": 6400 }, { "epoch": 1.2989608313349321, "grad_norm": 1.6279624700546265, "learning_rate": 3.917699173994138e-05, "loss": 0.8387, "step": 6500 }, { "epoch": 1.3189448441247003, "grad_norm": 1.57301664352417, "learning_rate": 3.9010458300026646e-05, "loss": 0.8583, "step": 6600 }, { "epoch": 1.3389288569144684, "grad_norm": 1.2693675756454468, "learning_rate": 3.884392486011191e-05, "loss": 0.7893, "step": 6700 }, { "epoch": 1.3589128697042367, "grad_norm": 1.1760280132293701, "learning_rate": 3.867739142019718e-05, "loss": 0.8204, "step": 6800 }, { "epoch": 1.3788968824940047, "grad_norm": 1.8213127851486206, "learning_rate": 3.8510857980282444e-05, "loss": 0.9061, "step": 6900 }, { "epoch": 1.398880895283773, "grad_norm": 1.2589592933654785, "learning_rate": 3.834432454036771e-05, "loss": 0.856, "step": 7000 }, { "epoch": 1.4188649080735412, "grad_norm": 2.5817718505859375, "learning_rate": 3.817779110045297e-05, "loss": 0.8542, "step": 7100 }, { "epoch": 1.4388489208633093, "grad_norm": 1.1825404167175293, "learning_rate": 3.8011257660538236e-05, "loss": 0.8298, "step": 7200 }, { "epoch": 1.4588329336530776, "grad_norm": 1.6443575620651245, "learning_rate": 3.78447242206235e-05, "loss": 0.823, "step": 7300 }, { "epoch": 1.4788169464428458, "grad_norm": 1.9887899160385132, "learning_rate": 3.7678190780708764e-05, "loss": 0.7936, "step": 7400 }, { "epoch": 1.498800959232614, "grad_norm": 1.8799304962158203, "learning_rate": 3.7511657340794034e-05, "loss": 0.8755, "step": 7500 }, { "epoch": 1.518784972022382, "grad_norm": 1.6680015325546265, "learning_rate": 3.73451239008793e-05, "loss": 0.8669, "step": 7600 }, { "epoch": 1.5387689848121502, "grad_norm": 1.7756261825561523, "learning_rate": 3.717859046096456e-05, "loss": 0.8572, "step": 7700 }, { "epoch": 1.5587529976019185, "grad_norm": 1.3951911926269531, "learning_rate": 3.7012057021049826e-05, "loss": 0.8422, "step": 7800 }, { "epoch": 1.5787370103916867, "grad_norm": 1.8145322799682617, "learning_rate": 3.6845523581135096e-05, "loss": 0.783, "step": 7900 }, { "epoch": 1.5987210231814548, "grad_norm": 1.4113447666168213, "learning_rate": 3.667899014122036e-05, "loss": 0.8368, "step": 8000 }, { "epoch": 1.6187050359712232, "grad_norm": 1.5562957525253296, "learning_rate": 3.6512456701305624e-05, "loss": 0.8232, "step": 8100 }, { "epoch": 1.638689048760991, "grad_norm": 2.0334463119506836, "learning_rate": 3.634592326139089e-05, "loss": 0.8272, "step": 8200 }, { "epoch": 1.6586730615507594, "grad_norm": 2.305115222930908, "learning_rate": 3.617938982147616e-05, "loss": 0.8708, "step": 8300 }, { "epoch": 1.6786570743405276, "grad_norm": 1.9576376676559448, "learning_rate": 3.601285638156142e-05, "loss": 0.8437, "step": 8400 }, { "epoch": 1.6986410871302957, "grad_norm": 1.324064016342163, "learning_rate": 3.584632294164668e-05, "loss": 0.8197, "step": 8500 }, { "epoch": 1.718625099920064, "grad_norm": 1.5594903230667114, "learning_rate": 3.567978950173195e-05, "loss": 0.8365, "step": 8600 }, { "epoch": 1.738609112709832, "grad_norm": 1.853633999824524, "learning_rate": 3.5513256061817214e-05, "loss": 0.7914, "step": 8700 }, { "epoch": 1.7585931254996003, "grad_norm": 1.839158296585083, "learning_rate": 3.534672262190248e-05, "loss": 0.9374, "step": 8800 }, { "epoch": 1.7785771382893685, "grad_norm": 2.5038366317749023, "learning_rate": 3.518018918198774e-05, "loss": 0.8202, "step": 8900 }, { "epoch": 1.7985611510791366, "grad_norm": 1.7603284120559692, "learning_rate": 3.501365574207301e-05, "loss": 0.8346, "step": 9000 }, { "epoch": 1.818545163868905, "grad_norm": 1.9243416786193848, "learning_rate": 3.4847122302158276e-05, "loss": 0.8141, "step": 9100 }, { "epoch": 1.838529176658673, "grad_norm": 1.6993930339813232, "learning_rate": 3.468058886224354e-05, "loss": 0.7974, "step": 9200 }, { "epoch": 1.8585131894484412, "grad_norm": 1.9248780012130737, "learning_rate": 3.4514055422328804e-05, "loss": 0.8543, "step": 9300 }, { "epoch": 1.8784972022382096, "grad_norm": 1.7247469425201416, "learning_rate": 3.4347521982414074e-05, "loss": 0.7968, "step": 9400 }, { "epoch": 1.8984812150279775, "grad_norm": 1.165992259979248, "learning_rate": 3.418098854249934e-05, "loss": 0.7946, "step": 9500 }, { "epoch": 1.9184652278177459, "grad_norm": 1.5617034435272217, "learning_rate": 3.40144551025846e-05, "loss": 0.8452, "step": 9600 }, { "epoch": 1.938449240607514, "grad_norm": 1.9524955749511719, "learning_rate": 3.3847921662669866e-05, "loss": 0.821, "step": 9700 }, { "epoch": 1.9584332533972821, "grad_norm": 1.201984167098999, "learning_rate": 3.3681388222755136e-05, "loss": 0.8244, "step": 9800 }, { "epoch": 1.9784172661870505, "grad_norm": 1.5261083841323853, "learning_rate": 3.3514854782840393e-05, "loss": 0.8521, "step": 9900 }, { "epoch": 1.9984012789768184, "grad_norm": 0.8879593014717102, "learning_rate": 3.334832134292566e-05, "loss": 0.7745, "step": 10000 }, { "epoch": 2.0183852917665868, "grad_norm": 1.9460114240646362, "learning_rate": 3.318178790301093e-05, "loss": 0.7926, "step": 10100 }, { "epoch": 2.038369304556355, "grad_norm": 2.0698747634887695, "learning_rate": 3.301525446309619e-05, "loss": 0.8668, "step": 10200 }, { "epoch": 2.058353317346123, "grad_norm": 1.6188371181488037, "learning_rate": 3.2848721023181456e-05, "loss": 0.7743, "step": 10300 }, { "epoch": 2.0783373301358914, "grad_norm": 1.4746142625808716, "learning_rate": 3.268218758326672e-05, "loss": 0.776, "step": 10400 }, { "epoch": 2.0983213429256593, "grad_norm": 2.6285245418548584, "learning_rate": 3.251565414335199e-05, "loss": 0.7952, "step": 10500 }, { "epoch": 2.1183053557154277, "grad_norm": 2.9462263584136963, "learning_rate": 3.2349120703437254e-05, "loss": 0.8492, "step": 10600 }, { "epoch": 2.138289368505196, "grad_norm": 2.2768771648406982, "learning_rate": 3.218258726352252e-05, "loss": 0.7773, "step": 10700 }, { "epoch": 2.158273381294964, "grad_norm": 2.4314112663269043, "learning_rate": 3.201605382360778e-05, "loss": 0.7789, "step": 10800 }, { "epoch": 2.1782573940847323, "grad_norm": 2.631697654724121, "learning_rate": 3.184952038369305e-05, "loss": 0.7809, "step": 10900 }, { "epoch": 2.1982414068745, "grad_norm": 2.0636370182037354, "learning_rate": 3.1682986943778316e-05, "loss": 0.7627, "step": 11000 }, { "epoch": 2.2182254196642686, "grad_norm": 1.861494779586792, "learning_rate": 3.151645350386358e-05, "loss": 0.7454, "step": 11100 }, { "epoch": 2.238209432454037, "grad_norm": 1.6431078910827637, "learning_rate": 3.134992006394884e-05, "loss": 0.8208, "step": 11200 }, { "epoch": 2.258193445243805, "grad_norm": 1.1081715822219849, "learning_rate": 3.118338662403411e-05, "loss": 0.7963, "step": 11300 }, { "epoch": 2.278177458033573, "grad_norm": 1.6696077585220337, "learning_rate": 3.101685318411937e-05, "loss": 0.7948, "step": 11400 }, { "epoch": 2.2981614708233415, "grad_norm": 1.1712377071380615, "learning_rate": 3.0850319744204635e-05, "loss": 0.7907, "step": 11500 }, { "epoch": 2.3181454836131095, "grad_norm": 1.28898024559021, "learning_rate": 3.06837863042899e-05, "loss": 0.7791, "step": 11600 }, { "epoch": 2.338129496402878, "grad_norm": 1.3267985582351685, "learning_rate": 3.0517252864375166e-05, "loss": 0.7819, "step": 11700 }, { "epoch": 2.3581135091926457, "grad_norm": 1.4074293375015259, "learning_rate": 3.0350719424460434e-05, "loss": 0.818, "step": 11800 }, { "epoch": 2.378097521982414, "grad_norm": 0.9492627382278442, "learning_rate": 3.0184185984545697e-05, "loss": 0.7689, "step": 11900 }, { "epoch": 2.3980815347721824, "grad_norm": 1.8090003728866577, "learning_rate": 3.0017652544630965e-05, "loss": 0.7845, "step": 12000 }, { "epoch": 2.4180655475619504, "grad_norm": 1.899207353591919, "learning_rate": 2.985111910471623e-05, "loss": 0.7742, "step": 12100 }, { "epoch": 2.4380495603517187, "grad_norm": 2.0821797847747803, "learning_rate": 2.9684585664801496e-05, "loss": 0.7696, "step": 12200 }, { "epoch": 2.4580335731414866, "grad_norm": 1.089876651763916, "learning_rate": 2.951805222488676e-05, "loss": 0.7873, "step": 12300 }, { "epoch": 2.478017585931255, "grad_norm": 1.265599250793457, "learning_rate": 2.9351518784972027e-05, "loss": 0.7504, "step": 12400 }, { "epoch": 2.4980015987210233, "grad_norm": 2.7753829956054688, "learning_rate": 2.9184985345057287e-05, "loss": 0.7824, "step": 12500 }, { "epoch": 2.5179856115107913, "grad_norm": 1.0310410261154175, "learning_rate": 2.901845190514255e-05, "loss": 0.7326, "step": 12600 }, { "epoch": 2.5379696243005596, "grad_norm": 2.056279182434082, "learning_rate": 2.8851918465227818e-05, "loss": 0.8411, "step": 12700 }, { "epoch": 2.5579536370903275, "grad_norm": 1.1815407276153564, "learning_rate": 2.8685385025313082e-05, "loss": 0.805, "step": 12800 }, { "epoch": 2.577937649880096, "grad_norm": 1.6167210340499878, "learning_rate": 2.851885158539835e-05, "loss": 0.7311, "step": 12900 }, { "epoch": 2.5979216626698642, "grad_norm": 1.488755226135254, "learning_rate": 2.8352318145483613e-05, "loss": 0.8335, "step": 13000 }, { "epoch": 2.617905675459632, "grad_norm": 2.5013859272003174, "learning_rate": 2.818578470556888e-05, "loss": 0.7833, "step": 13100 }, { "epoch": 2.6378896882494005, "grad_norm": 1.102152943611145, "learning_rate": 2.8019251265654144e-05, "loss": 0.7536, "step": 13200 }, { "epoch": 2.6578737010391684, "grad_norm": 1.3805499076843262, "learning_rate": 2.785271782573941e-05, "loss": 0.775, "step": 13300 }, { "epoch": 2.677857713828937, "grad_norm": 2.189347505569458, "learning_rate": 2.7686184385824675e-05, "loss": 0.8178, "step": 13400 }, { "epoch": 2.697841726618705, "grad_norm": 0.9750763177871704, "learning_rate": 2.7519650945909942e-05, "loss": 0.7743, "step": 13500 }, { "epoch": 2.7178257394084735, "grad_norm": 1.2844312191009521, "learning_rate": 2.7353117505995206e-05, "loss": 0.7631, "step": 13600 }, { "epoch": 2.7378097521982414, "grad_norm": 1.4551914930343628, "learning_rate": 2.7186584066080474e-05, "loss": 0.76, "step": 13700 }, { "epoch": 2.7577937649880093, "grad_norm": 0.8891064524650574, "learning_rate": 2.7020050626165734e-05, "loss": 0.8252, "step": 13800 }, { "epoch": 2.7777777777777777, "grad_norm": 1.9776784181594849, "learning_rate": 2.6853517186250998e-05, "loss": 0.8037, "step": 13900 }, { "epoch": 2.797761790567546, "grad_norm": 1.429692029953003, "learning_rate": 2.6686983746336265e-05, "loss": 0.7369, "step": 14000 }, { "epoch": 2.8177458033573144, "grad_norm": 2.0837855339050293, "learning_rate": 2.652045030642153e-05, "loss": 0.742, "step": 14100 }, { "epoch": 2.8377298161470823, "grad_norm": 1.2353509664535522, "learning_rate": 2.6353916866506796e-05, "loss": 0.7615, "step": 14200 }, { "epoch": 2.8577138289368507, "grad_norm": 0.8735284209251404, "learning_rate": 2.618738342659206e-05, "loss": 0.7872, "step": 14300 }, { "epoch": 2.8776978417266186, "grad_norm": 1.0889009237289429, "learning_rate": 2.6020849986677327e-05, "loss": 0.7455, "step": 14400 }, { "epoch": 2.897681854516387, "grad_norm": 1.506787657737732, "learning_rate": 2.585431654676259e-05, "loss": 0.792, "step": 14500 }, { "epoch": 2.9176658673061553, "grad_norm": 0.7630636096000671, "learning_rate": 2.5687783106847858e-05, "loss": 0.7918, "step": 14600 }, { "epoch": 2.937649880095923, "grad_norm": 1.6361045837402344, "learning_rate": 2.5521249666933122e-05, "loss": 0.7357, "step": 14700 }, { "epoch": 2.9576338928856916, "grad_norm": 2.248220920562744, "learning_rate": 2.535471622701839e-05, "loss": 0.7954, "step": 14800 }, { "epoch": 2.9776179056754595, "grad_norm": 1.14662766456604, "learning_rate": 2.5188182787103653e-05, "loss": 0.7865, "step": 14900 }, { "epoch": 2.997601918465228, "grad_norm": 1.3895844221115112, "learning_rate": 2.502164934718892e-05, "loss": 0.7364, "step": 15000 }, { "epoch": 3.017585931254996, "grad_norm": 2.1330533027648926, "learning_rate": 2.485511590727418e-05, "loss": 0.7244, "step": 15100 }, { "epoch": 3.037569944044764, "grad_norm": 1.384775996208191, "learning_rate": 2.4688582467359448e-05, "loss": 0.7393, "step": 15200 }, { "epoch": 3.0575539568345325, "grad_norm": 0.9841705560684204, "learning_rate": 2.4522049027444712e-05, "loss": 0.8051, "step": 15300 }, { "epoch": 3.0775379696243004, "grad_norm": 1.224924921989441, "learning_rate": 2.435551558752998e-05, "loss": 0.8004, "step": 15400 }, { "epoch": 3.0975219824140687, "grad_norm": 2.2387399673461914, "learning_rate": 2.418898214761524e-05, "loss": 0.8051, "step": 15500 }, { "epoch": 3.117505995203837, "grad_norm": 1.8771803379058838, "learning_rate": 2.4022448707700507e-05, "loss": 0.7903, "step": 15600 }, { "epoch": 3.137490007993605, "grad_norm": 1.786600112915039, "learning_rate": 2.385591526778577e-05, "loss": 0.7796, "step": 15700 }, { "epoch": 3.1574740207833734, "grad_norm": 1.0823020935058594, "learning_rate": 2.3689381827871038e-05, "loss": 0.7468, "step": 15800 }, { "epoch": 3.1774580335731413, "grad_norm": 1.9462608098983765, "learning_rate": 2.35228483879563e-05, "loss": 0.7854, "step": 15900 }, { "epoch": 3.1974420463629096, "grad_norm": 1.3235732316970825, "learning_rate": 2.335631494804157e-05, "loss": 0.7584, "step": 16000 }, { "epoch": 3.217426059152678, "grad_norm": 1.5206961631774902, "learning_rate": 2.3189781508126833e-05, "loss": 0.8104, "step": 16100 }, { "epoch": 3.237410071942446, "grad_norm": 1.4281466007232666, "learning_rate": 2.3023248068212097e-05, "loss": 0.7505, "step": 16200 }, { "epoch": 3.2573940847322143, "grad_norm": 1.9032511711120605, "learning_rate": 2.2856714628297364e-05, "loss": 0.7813, "step": 16300 }, { "epoch": 3.277378097521982, "grad_norm": 2.10361909866333, "learning_rate": 2.2690181188382628e-05, "loss": 0.7369, "step": 16400 }, { "epoch": 3.2973621103117505, "grad_norm": 1.440158486366272, "learning_rate": 2.2523647748467895e-05, "loss": 0.7576, "step": 16500 }, { "epoch": 3.317346123101519, "grad_norm": 1.8777798414230347, "learning_rate": 2.235711430855316e-05, "loss": 0.7317, "step": 16600 }, { "epoch": 3.337330135891287, "grad_norm": 1.6413357257843018, "learning_rate": 2.2190580868638426e-05, "loss": 0.7802, "step": 16700 }, { "epoch": 3.357314148681055, "grad_norm": 1.820087194442749, "learning_rate": 2.2024047428723686e-05, "loss": 0.7435, "step": 16800 }, { "epoch": 3.3772981614708235, "grad_norm": 2.5140113830566406, "learning_rate": 2.1857513988808954e-05, "loss": 0.7532, "step": 16900 }, { "epoch": 3.3972821742605914, "grad_norm": 1.7011070251464844, "learning_rate": 2.1690980548894217e-05, "loss": 0.728, "step": 17000 }, { "epoch": 3.41726618705036, "grad_norm": 1.3051706552505493, "learning_rate": 2.1524447108979485e-05, "loss": 0.7493, "step": 17100 }, { "epoch": 3.437250199840128, "grad_norm": 0.9745834469795227, "learning_rate": 2.135791366906475e-05, "loss": 0.7219, "step": 17200 }, { "epoch": 3.457234212629896, "grad_norm": 1.3213515281677246, "learning_rate": 2.1191380229150016e-05, "loss": 0.7703, "step": 17300 }, { "epoch": 3.4772182254196644, "grad_norm": 0.735060453414917, "learning_rate": 2.102484678923528e-05, "loss": 0.7342, "step": 17400 }, { "epoch": 3.4972022382094323, "grad_norm": 1.073197603225708, "learning_rate": 2.0858313349320543e-05, "loss": 0.7023, "step": 17500 }, { "epoch": 3.5171862509992007, "grad_norm": 1.797711730003357, "learning_rate": 2.069177990940581e-05, "loss": 0.7944, "step": 17600 }, { "epoch": 3.537170263788969, "grad_norm": 1.3365331888198853, "learning_rate": 2.0525246469491074e-05, "loss": 0.7773, "step": 17700 }, { "epoch": 3.557154276578737, "grad_norm": 1.451333999633789, "learning_rate": 2.035871302957634e-05, "loss": 0.7659, "step": 17800 }, { "epoch": 3.5771382893685053, "grad_norm": 1.5622735023498535, "learning_rate": 2.0192179589661606e-05, "loss": 0.7676, "step": 17900 }, { "epoch": 3.597122302158273, "grad_norm": 1.5826952457427979, "learning_rate": 2.0025646149746873e-05, "loss": 0.7393, "step": 18000 }, { "epoch": 3.6171063149480416, "grad_norm": 0.7937633991241455, "learning_rate": 1.9859112709832133e-05, "loss": 0.7112, "step": 18100 }, { "epoch": 3.63709032773781, "grad_norm": 1.8199377059936523, "learning_rate": 1.96925792699174e-05, "loss": 0.7722, "step": 18200 }, { "epoch": 3.657074340527578, "grad_norm": 2.317171573638916, "learning_rate": 1.9526045830002664e-05, "loss": 0.7735, "step": 18300 }, { "epoch": 3.677058353317346, "grad_norm": 0.9344459772109985, "learning_rate": 1.935951239008793e-05, "loss": 0.7168, "step": 18400 }, { "epoch": 3.697042366107114, "grad_norm": 1.833892583847046, "learning_rate": 1.9192978950173195e-05, "loss": 0.6825, "step": 18500 }, { "epoch": 3.7170263788968825, "grad_norm": 1.4093741178512573, "learning_rate": 1.9026445510258463e-05, "loss": 0.7087, "step": 18600 }, { "epoch": 3.737010391686651, "grad_norm": 2.0284645557403564, "learning_rate": 1.8859912070343726e-05, "loss": 0.7579, "step": 18700 }, { "epoch": 3.7569944044764187, "grad_norm": 1.8383668661117554, "learning_rate": 1.869337863042899e-05, "loss": 0.7603, "step": 18800 }, { "epoch": 3.776978417266187, "grad_norm": 1.5985366106033325, "learning_rate": 1.8526845190514254e-05, "loss": 0.7166, "step": 18900 }, { "epoch": 3.796962430055955, "grad_norm": 1.5089521408081055, "learning_rate": 1.836031175059952e-05, "loss": 0.7678, "step": 19000 }, { "epoch": 3.8169464428457234, "grad_norm": 1.2770063877105713, "learning_rate": 1.8193778310684785e-05, "loss": 0.7213, "step": 19100 }, { "epoch": 3.8369304556354917, "grad_norm": 2.4528274536132812, "learning_rate": 1.8027244870770052e-05, "loss": 0.7255, "step": 19200 }, { "epoch": 3.8569144684252596, "grad_norm": 1.736755132675171, "learning_rate": 1.7860711430855316e-05, "loss": 0.6784, "step": 19300 }, { "epoch": 3.876898481215028, "grad_norm": 1.719307780265808, "learning_rate": 1.7694177990940583e-05, "loss": 0.7795, "step": 19400 }, { "epoch": 3.896882494004796, "grad_norm": 2.070528984069824, "learning_rate": 1.7527644551025847e-05, "loss": 0.7509, "step": 19500 }, { "epoch": 3.9168665067945643, "grad_norm": 1.6482255458831787, "learning_rate": 1.736111111111111e-05, "loss": 0.7202, "step": 19600 }, { "epoch": 3.9368505195843326, "grad_norm": 1.1660830974578857, "learning_rate": 1.719457767119638e-05, "loss": 0.7042, "step": 19700 }, { "epoch": 3.956834532374101, "grad_norm": 1.0131560564041138, "learning_rate": 1.7028044231281642e-05, "loss": 0.7059, "step": 19800 }, { "epoch": 3.976818545163869, "grad_norm": 1.1839569807052612, "learning_rate": 1.686151079136691e-05, "loss": 0.77, "step": 19900 }, { "epoch": 3.996802557953637, "grad_norm": 1.736053705215454, "learning_rate": 1.6694977351452173e-05, "loss": 0.7703, "step": 20000 }, { "epoch": 4.016786570743405, "grad_norm": 1.3700270652770996, "learning_rate": 1.6528443911537437e-05, "loss": 0.6643, "step": 20100 }, { "epoch": 4.0367705835331735, "grad_norm": 1.347440481185913, "learning_rate": 1.63619104716227e-05, "loss": 0.7502, "step": 20200 }, { "epoch": 4.056754596322942, "grad_norm": 1.9421720504760742, "learning_rate": 1.6195377031707968e-05, "loss": 0.7382, "step": 20300 }, { "epoch": 4.07673860911271, "grad_norm": 0.9211772084236145, "learning_rate": 1.6028843591793232e-05, "loss": 0.7249, "step": 20400 }, { "epoch": 4.096722621902478, "grad_norm": 2.1698520183563232, "learning_rate": 1.58623101518785e-05, "loss": 0.7339, "step": 20500 }, { "epoch": 4.116706634692246, "grad_norm": 1.6852116584777832, "learning_rate": 1.5695776711963763e-05, "loss": 0.7525, "step": 20600 }, { "epoch": 4.136690647482014, "grad_norm": 1.8582841157913208, "learning_rate": 1.552924327204903e-05, "loss": 0.7168, "step": 20700 }, { "epoch": 4.156674660271783, "grad_norm": 1.3949832916259766, "learning_rate": 1.536270983213429e-05, "loss": 0.6835, "step": 20800 }, { "epoch": 4.176658673061551, "grad_norm": 2.044853925704956, "learning_rate": 1.5196176392219558e-05, "loss": 0.7332, "step": 20900 }, { "epoch": 4.196642685851319, "grad_norm": 1.3187381029129028, "learning_rate": 1.5029642952304823e-05, "loss": 0.7724, "step": 21000 }, { "epoch": 4.216626698641087, "grad_norm": 1.18405020236969, "learning_rate": 1.4863109512390089e-05, "loss": 0.7677, "step": 21100 }, { "epoch": 4.236610711430855, "grad_norm": 1.2868226766586304, "learning_rate": 1.4696576072475355e-05, "loss": 0.7168, "step": 21200 }, { "epoch": 4.256594724220624, "grad_norm": 2.145659923553467, "learning_rate": 1.453004263256062e-05, "loss": 0.7574, "step": 21300 }, { "epoch": 4.276578737010392, "grad_norm": 1.0491008758544922, "learning_rate": 1.4363509192645886e-05, "loss": 0.7274, "step": 21400 }, { "epoch": 4.2965627498001595, "grad_norm": 1.9524632692337036, "learning_rate": 1.4196975752731148e-05, "loss": 0.7256, "step": 21500 }, { "epoch": 4.316546762589928, "grad_norm": 1.6348446607589722, "learning_rate": 1.4030442312816413e-05, "loss": 0.6971, "step": 21600 }, { "epoch": 4.336530775379696, "grad_norm": 1.6102409362792969, "learning_rate": 1.3863908872901679e-05, "loss": 0.7031, "step": 21700 }, { "epoch": 4.356514788169465, "grad_norm": 1.4496809244155884, "learning_rate": 1.3697375432986944e-05, "loss": 0.76, "step": 21800 }, { "epoch": 4.376498800959233, "grad_norm": 2.370002508163452, "learning_rate": 1.353084199307221e-05, "loss": 0.7098, "step": 21900 }, { "epoch": 4.396482813749, "grad_norm": 1.1416559219360352, "learning_rate": 1.3364308553157475e-05, "loss": 0.7565, "step": 22000 }, { "epoch": 4.416466826538769, "grad_norm": 1.6672168970108032, "learning_rate": 1.319777511324274e-05, "loss": 0.7939, "step": 22100 }, { "epoch": 4.436450839328537, "grad_norm": 1.1106956005096436, "learning_rate": 1.3031241673328005e-05, "loss": 0.6645, "step": 22200 }, { "epoch": 4.4564348521183055, "grad_norm": 1.4987940788269043, "learning_rate": 1.286470823341327e-05, "loss": 0.7117, "step": 22300 }, { "epoch": 4.476418864908074, "grad_norm": 2.063014268875122, "learning_rate": 1.2698174793498536e-05, "loss": 0.767, "step": 22400 }, { "epoch": 4.496402877697841, "grad_norm": 0.748756468296051, "learning_rate": 1.2531641353583801e-05, "loss": 0.7393, "step": 22500 }, { "epoch": 4.51638689048761, "grad_norm": 1.3971226215362549, "learning_rate": 1.2365107913669065e-05, "loss": 0.7782, "step": 22600 }, { "epoch": 4.536370903277378, "grad_norm": 1.5306447744369507, "learning_rate": 1.219857447375433e-05, "loss": 0.7299, "step": 22700 }, { "epoch": 4.556354916067146, "grad_norm": 1.409225344657898, "learning_rate": 1.2032041033839596e-05, "loss": 0.6752, "step": 22800 }, { "epoch": 4.576338928856915, "grad_norm": 1.396794080734253, "learning_rate": 1.186550759392486e-05, "loss": 0.6417, "step": 22900 }, { "epoch": 4.596322941646683, "grad_norm": 1.6455470323562622, "learning_rate": 1.1698974154010126e-05, "loss": 0.7545, "step": 23000 }, { "epoch": 4.616306954436451, "grad_norm": 1.4188311100006104, "learning_rate": 1.1532440714095391e-05, "loss": 0.7217, "step": 23100 }, { "epoch": 4.636290967226219, "grad_norm": 1.1025303602218628, "learning_rate": 1.1365907274180657e-05, "loss": 0.7419, "step": 23200 }, { "epoch": 4.656274980015987, "grad_norm": 1.0919783115386963, "learning_rate": 1.119937383426592e-05, "loss": 0.725, "step": 23300 }, { "epoch": 4.676258992805756, "grad_norm": 2.179637908935547, "learning_rate": 1.1032840394351186e-05, "loss": 0.7052, "step": 23400 }, { "epoch": 4.696243005595523, "grad_norm": 1.4243191480636597, "learning_rate": 1.0866306954436452e-05, "loss": 0.7437, "step": 23500 }, { "epoch": 4.7162270183852915, "grad_norm": 1.6711329221725464, "learning_rate": 1.0699773514521715e-05, "loss": 0.7378, "step": 23600 }, { "epoch": 4.73621103117506, "grad_norm": 1.2967829704284668, "learning_rate": 1.0533240074606981e-05, "loss": 0.7386, "step": 23700 }, { "epoch": 4.756195043964828, "grad_norm": 1.737625002861023, "learning_rate": 1.0366706634692246e-05, "loss": 0.7012, "step": 23800 }, { "epoch": 4.7761790567545965, "grad_norm": 1.062472939491272, "learning_rate": 1.020017319477751e-05, "loss": 0.6797, "step": 23900 }, { "epoch": 4.796163069544365, "grad_norm": 1.044542908668518, "learning_rate": 1.0033639754862776e-05, "loss": 0.7285, "step": 24000 }, { "epoch": 4.816147082334132, "grad_norm": 1.70567786693573, "learning_rate": 9.867106314948041e-06, "loss": 0.7777, "step": 24100 }, { "epoch": 4.836131095123901, "grad_norm": 1.6937395334243774, "learning_rate": 9.700572875033307e-06, "loss": 0.7378, "step": 24200 }, { "epoch": 4.856115107913669, "grad_norm": 2.7036936283111572, "learning_rate": 9.534039435118572e-06, "loss": 0.7813, "step": 24300 }, { "epoch": 4.876099120703437, "grad_norm": 1.1682194471359253, "learning_rate": 9.367505995203838e-06, "loss": 0.7155, "step": 24400 }, { "epoch": 4.896083133493206, "grad_norm": 1.2117973566055298, "learning_rate": 9.200972555289104e-06, "loss": 0.7273, "step": 24500 }, { "epoch": 4.916067146282973, "grad_norm": 0.9339836239814758, "learning_rate": 9.034439115374367e-06, "loss": 0.7368, "step": 24600 }, { "epoch": 4.936051159072742, "grad_norm": 1.3919428586959839, "learning_rate": 8.867905675459633e-06, "loss": 0.7059, "step": 24700 }, { "epoch": 4.95603517186251, "grad_norm": 2.1438040733337402, "learning_rate": 8.701372235544898e-06, "loss": 0.7197, "step": 24800 }, { "epoch": 4.976019184652278, "grad_norm": 1.892350435256958, "learning_rate": 8.534838795630162e-06, "loss": 0.7292, "step": 24900 }, { "epoch": 4.996003197442047, "grad_norm": 2.050062656402588, "learning_rate": 8.368305355715428e-06, "loss": 0.7791, "step": 25000 }, { "epoch": 5.015987210231814, "grad_norm": 2.285053014755249, "learning_rate": 8.201771915800693e-06, "loss": 0.6937, "step": 25100 }, { "epoch": 5.0359712230215825, "grad_norm": 1.6725279092788696, "learning_rate": 8.035238475885959e-06, "loss": 0.7443, "step": 25200 }, { "epoch": 5.055955235811351, "grad_norm": 1.590450644493103, "learning_rate": 7.868705035971223e-06, "loss": 0.7069, "step": 25300 }, { "epoch": 5.075939248601119, "grad_norm": 0.7603669762611389, "learning_rate": 7.702171596056488e-06, "loss": 0.6778, "step": 25400 }, { "epoch": 5.095923261390888, "grad_norm": 1.8916963338851929, "learning_rate": 7.535638156141754e-06, "loss": 0.769, "step": 25500 }, { "epoch": 5.115907274180655, "grad_norm": 1.6110832691192627, "learning_rate": 7.3691047162270184e-06, "loss": 0.7027, "step": 25600 }, { "epoch": 5.135891286970423, "grad_norm": 1.796796202659607, "learning_rate": 7.202571276312284e-06, "loss": 0.72, "step": 25700 }, { "epoch": 5.155875299760192, "grad_norm": 1.8212794065475464, "learning_rate": 7.0360378363975495e-06, "loss": 0.7004, "step": 25800 }, { "epoch": 5.17585931254996, "grad_norm": 1.0340906381607056, "learning_rate": 6.869504396482813e-06, "loss": 0.6687, "step": 25900 }, { "epoch": 5.1958433253397285, "grad_norm": 1.8287034034729004, "learning_rate": 6.702970956568079e-06, "loss": 0.6774, "step": 26000 }, { "epoch": 5.215827338129497, "grad_norm": 1.657259225845337, "learning_rate": 6.536437516653344e-06, "loss": 0.6995, "step": 26100 }, { "epoch": 5.235811350919264, "grad_norm": 1.8235076665878296, "learning_rate": 6.36990407673861e-06, "loss": 0.7814, "step": 26200 }, { "epoch": 5.255795363709033, "grad_norm": 1.6127688884735107, "learning_rate": 6.203370636823875e-06, "loss": 0.6797, "step": 26300 }, { "epoch": 5.275779376498801, "grad_norm": 1.2275160551071167, "learning_rate": 6.03683719690914e-06, "loss": 0.7336, "step": 26400 }, { "epoch": 5.295763389288569, "grad_norm": 1.6593281030654907, "learning_rate": 5.870303756994405e-06, "loss": 0.6988, "step": 26500 }, { "epoch": 5.315747402078338, "grad_norm": 1.1069490909576416, "learning_rate": 5.70377031707967e-06, "loss": 0.801, "step": 26600 }, { "epoch": 5.335731414868105, "grad_norm": 1.7498623132705688, "learning_rate": 5.537236877164935e-06, "loss": 0.715, "step": 26700 }, { "epoch": 5.355715427657874, "grad_norm": 1.7322038412094116, "learning_rate": 5.3707034372502e-06, "loss": 0.7076, "step": 26800 }, { "epoch": 5.375699440447642, "grad_norm": 1.2660248279571533, "learning_rate": 5.204169997335465e-06, "loss": 0.7494, "step": 26900 }, { "epoch": 5.39568345323741, "grad_norm": 2.537752628326416, "learning_rate": 5.03763655742073e-06, "loss": 0.7326, "step": 27000 }, { "epoch": 5.415667466027179, "grad_norm": 0.991534411907196, "learning_rate": 4.8711031175059955e-06, "loss": 0.6929, "step": 27100 }, { "epoch": 5.435651478816946, "grad_norm": 2.0230729579925537, "learning_rate": 4.70456967759126e-06, "loss": 0.6787, "step": 27200 }, { "epoch": 5.4556354916067145, "grad_norm": 1.5560120344161987, "learning_rate": 4.538036237676526e-06, "loss": 0.7251, "step": 27300 }, { "epoch": 5.475619504396483, "grad_norm": 1.5086272954940796, "learning_rate": 4.371502797761791e-06, "loss": 0.7066, "step": 27400 }, { "epoch": 5.495603517186251, "grad_norm": 1.6183174848556519, "learning_rate": 4.204969357847056e-06, "loss": 0.7161, "step": 27500 }, { "epoch": 5.5155875299760195, "grad_norm": 1.1214239597320557, "learning_rate": 4.0384359179323214e-06, "loss": 0.7414, "step": 27600 }, { "epoch": 5.535571542765787, "grad_norm": 1.4948476552963257, "learning_rate": 3.871902478017586e-06, "loss": 0.7303, "step": 27700 }, { "epoch": 5.555555555555555, "grad_norm": 1.094460368156433, "learning_rate": 3.705369038102851e-06, "loss": 0.7214, "step": 27800 }, { "epoch": 5.575539568345324, "grad_norm": 1.8006253242492676, "learning_rate": 3.5388355981881163e-06, "loss": 0.7382, "step": 27900 }, { "epoch": 5.595523581135092, "grad_norm": 1.0595532655715942, "learning_rate": 3.3723021582733815e-06, "loss": 0.6835, "step": 28000 }, { "epoch": 5.61550759392486, "grad_norm": 1.5675129890441895, "learning_rate": 3.205768718358647e-06, "loss": 0.7426, "step": 28100 }, { "epoch": 5.635491606714629, "grad_norm": 1.543182134628296, "learning_rate": 3.0392352784439117e-06, "loss": 0.7393, "step": 28200 }, { "epoch": 5.655475619504396, "grad_norm": 1.6735225915908813, "learning_rate": 2.8727018385291768e-06, "loss": 0.7106, "step": 28300 }, { "epoch": 5.675459632294165, "grad_norm": 1.2037389278411865, "learning_rate": 2.706168398614442e-06, "loss": 0.6928, "step": 28400 }, { "epoch": 5.695443645083933, "grad_norm": 1.957836627960205, "learning_rate": 2.539634958699707e-06, "loss": 0.8086, "step": 28500 }, { "epoch": 5.715427657873701, "grad_norm": 2.085599899291992, "learning_rate": 2.373101518784972e-06, "loss": 0.7587, "step": 28600 }, { "epoch": 5.735411670663469, "grad_norm": 1.3564984798431396, "learning_rate": 2.206568078870237e-06, "loss": 0.7197, "step": 28700 }, { "epoch": 5.755395683453237, "grad_norm": 1.659226655960083, "learning_rate": 2.0400346389555023e-06, "loss": 0.728, "step": 28800 }, { "epoch": 5.7753796962430055, "grad_norm": 1.3784935474395752, "learning_rate": 1.8735011990407676e-06, "loss": 0.7561, "step": 28900 }, { "epoch": 5.795363709032774, "grad_norm": 1.4514496326446533, "learning_rate": 1.7069677591260325e-06, "loss": 0.7205, "step": 29000 }, { "epoch": 5.815347721822542, "grad_norm": 1.7896771430969238, "learning_rate": 1.5404343192112976e-06, "loss": 0.667, "step": 29100 }, { "epoch": 5.835331734612311, "grad_norm": 1.4074804782867432, "learning_rate": 1.3739008792965628e-06, "loss": 0.71, "step": 29200 }, { "epoch": 5.855315747402078, "grad_norm": 1.33772873878479, "learning_rate": 1.2073674393818279e-06, "loss": 0.7688, "step": 29300 }, { "epoch": 5.875299760191846, "grad_norm": 1.8295559883117676, "learning_rate": 1.040833999467093e-06, "loss": 0.7439, "step": 29400 }, { "epoch": 5.895283772981615, "grad_norm": 0.9400151371955872, "learning_rate": 8.743005595523582e-07, "loss": 0.6891, "step": 29500 }, { "epoch": 5.915267785771383, "grad_norm": 1.7990922927856445, "learning_rate": 7.077671196376233e-07, "loss": 0.6384, "step": 29600 }, { "epoch": 5.935251798561151, "grad_norm": 1.74308180809021, "learning_rate": 5.412336797228884e-07, "loss": 0.6973, "step": 29700 }, { "epoch": 5.955235811350919, "grad_norm": 1.1248557567596436, "learning_rate": 3.747002398081535e-07, "loss": 0.7265, "step": 29800 }, { "epoch": 5.975219824140687, "grad_norm": 1.9805783033370972, "learning_rate": 2.0816679989341861e-07, "loss": 0.7107, "step": 29900 }, { "epoch": 5.995203836930456, "grad_norm": 1.6576383113861084, "learning_rate": 4.163335997868372e-08, "loss": 0.7073, "step": 30000 } ], "logging_steps": 100, "max_steps": 30024, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.230686056448e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }