{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.995203836930456,
  "eval_steps": 500,
  "global_step": 30000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.019984012789768184, "grad_norm": 10.435233116149902, "learning_rate": 4.9835131894484415e-05, "loss": 14.0782, "step": 100 },
    { "epoch": 0.03996802557953637, "grad_norm": 19.157014846801758, "learning_rate": 4.966859845456968e-05, "loss": 3.596, "step": 200 },
    { "epoch": 0.05995203836930456, "grad_norm": 5.946423530578613, "learning_rate": 4.950206501465495e-05, "loss": 2.0558, "step": 300 },
    { "epoch": 0.07993605115907274, "grad_norm": 2.1558027267456055, "learning_rate": 4.9335531574740214e-05, "loss": 1.3957, "step": 400 },
    { "epoch": 0.09992006394884093, "grad_norm": 2.5541210174560547, "learning_rate": 4.916899813482548e-05, "loss": 1.252, "step": 500 },
    { "epoch": 0.11990407673860912, "grad_norm": 1.8417296409606934, "learning_rate": 4.9002464694910735e-05, "loss": 1.1695, "step": 600 },
    { "epoch": 0.1398880895283773, "grad_norm": 1.654639720916748, "learning_rate": 4.8835931254996005e-05, "loss": 1.1848, "step": 700 },
    { "epoch": 0.15987210231814547, "grad_norm": 2.093991756439209, "learning_rate": 4.866939781508127e-05, "loss": 1.1056, "step": 800 },
    { "epoch": 0.17985611510791366, "grad_norm": 1.9905765056610107, "learning_rate": 4.850286437516653e-05, "loss": 1.1186, "step": 900 },
    { "epoch": 0.19984012789768185, "grad_norm": 1.2864068746566772, "learning_rate": 4.83363309352518e-05, "loss": 1.1394, "step": 1000 },
    { "epoch": 0.21982414068745004, "grad_norm": 1.5142974853515625, "learning_rate": 4.816979749533707e-05, "loss": 1.1162, "step": 1100 },
    { "epoch": 0.23980815347721823, "grad_norm": 0.9597436189651489, "learning_rate": 4.800326405542233e-05, "loss": 1.0922, "step": 1200 },
    { "epoch": 0.2597921662669864, "grad_norm": 1.7915301322937012, "learning_rate": 4.7836730615507595e-05, "loss": 1.0711, "step": 1300 },
    { "epoch": 0.2797761790567546, "grad_norm": 1.1338952779769897, "learning_rate": 4.767019717559286e-05, "loss": 0.9945, "step": 1400 },
    { "epoch": 0.2997601918465228, "grad_norm": 1.3437297344207764, "learning_rate": 4.750366373567813e-05, "loss": 1.0322, "step": 1500 },
    { "epoch": 0.31974420463629094, "grad_norm": 1.2949973344802856, "learning_rate": 4.733713029576339e-05, "loss": 1.0613, "step": 1600 },
    { "epoch": 0.33972821742605913, "grad_norm": 1.1362179517745972, "learning_rate": 4.717059685584866e-05, "loss": 1.0122, "step": 1700 },
    { "epoch": 0.3597122302158273, "grad_norm": 1.1878063678741455, "learning_rate": 4.700406341593392e-05, "loss": 1.0068, "step": 1800 },
    { "epoch": 0.3796962430055955, "grad_norm": 1.3588361740112305, "learning_rate": 4.6837529976019185e-05, "loss": 0.955, "step": 1900 },
    { "epoch": 0.3996802557953637, "grad_norm": 1.1428577899932861, "learning_rate": 4.667099653610445e-05, "loss": 0.9863, "step": 2000 },
    { "epoch": 0.4196642685851319, "grad_norm": 1.6441487073898315, "learning_rate": 4.650446309618971e-05, "loss": 0.9532, "step": 2100 },
    { "epoch": 0.4396482813749001, "grad_norm": 2.607586145401001, "learning_rate": 4.633792965627498e-05, "loss": 0.9877, "step": 2200 },
    { "epoch": 0.4596322941646683, "grad_norm": 2.24434757232666, "learning_rate": 4.617139621636025e-05, "loss": 0.9956, "step": 2300 },
    { "epoch": 0.47961630695443647, "grad_norm": 1.8238356113433838, "learning_rate": 4.600486277644551e-05, "loss": 0.9769, "step": 2400 },
    { "epoch": 0.49960031974420466, "grad_norm": 2.0538158416748047, "learning_rate": 4.5838329336530775e-05, "loss": 0.9458, "step": 2500 },
    { "epoch": 0.5195843325339728, "grad_norm": 2.1354427337646484, "learning_rate": 4.5671795896616045e-05, "loss": 0.975, "step": 2600 },
    { "epoch": 0.539568345323741, "grad_norm": 1.3763636350631714, "learning_rate": 4.550526245670131e-05, "loss": 0.9464, "step": 2700 },
    { "epoch": 0.5595523581135092, "grad_norm": 2.6834394931793213, "learning_rate": 4.533872901678657e-05, "loss": 0.9437, "step": 2800 },
    { "epoch": 0.5795363709032774, "grad_norm": 1.3325830698013306, "learning_rate": 4.517219557687184e-05, "loss": 0.9617, "step": 2900 },
    { "epoch": 0.5995203836930456, "grad_norm": 1.841642141342163, "learning_rate": 4.500566213695711e-05, "loss": 0.9489, "step": 3000 },
    { "epoch": 0.6195043964828137, "grad_norm": 1.0305529832839966, "learning_rate": 4.483912869704237e-05, "loss": 0.9432, "step": 3100 },
    { "epoch": 0.6394884092725819, "grad_norm": 1.4249075651168823, "learning_rate": 4.467259525712763e-05, "loss": 0.952, "step": 3200 },
    { "epoch": 0.6594724220623501, "grad_norm": 1.2994813919067383, "learning_rate": 4.45060618172129e-05, "loss": 1.0247, "step": 3300 },
    { "epoch": 0.6794564348521183, "grad_norm": 1.537548303604126, "learning_rate": 4.433952837729816e-05, "loss": 0.928, "step": 3400 },
    { "epoch": 0.6994404476418865, "grad_norm": 1.646200180053711, "learning_rate": 4.4172994937383427e-05, "loss": 0.9817, "step": 3500 },
    { "epoch": 0.7194244604316546, "grad_norm": 1.2779592275619507, "learning_rate": 4.400646149746869e-05, "loss": 0.8974, "step": 3600 },
    { "epoch": 0.7394084732214229, "grad_norm": 1.7886369228363037, "learning_rate": 4.383992805755396e-05, "loss": 0.9351, "step": 3700 },
    { "epoch": 0.759392486011191, "grad_norm": 1.5072957277297974, "learning_rate": 4.3673394617639225e-05, "loss": 0.882, "step": 3800 },
    { "epoch": 0.7793764988009593, "grad_norm": 2.019421339035034, "learning_rate": 4.350686117772449e-05, "loss": 0.9324, "step": 3900 },
    { "epoch": 0.7993605115907274, "grad_norm": 1.7232331037521362, "learning_rate": 4.334032773780975e-05, "loss": 0.9094, "step": 4000 },
    { "epoch": 0.8193445243804957, "grad_norm": 1.7297419309616089, "learning_rate": 4.317379429789502e-05, "loss": 0.9658, "step": 4100 },
    { "epoch": 0.8393285371702638, "grad_norm": 1.560420274734497, "learning_rate": 4.300726085798029e-05, "loss": 0.9613, "step": 4200 },
    { "epoch": 0.8593125499600319, "grad_norm": 1.9014427661895752, "learning_rate": 4.284072741806555e-05, "loss": 0.9047, "step": 4300 },
    { "epoch": 0.8792965627498002, "grad_norm": 1.4741132259368896, "learning_rate": 4.2674193978150815e-05, "loss": 0.7914, "step": 4400 },
    { "epoch": 0.8992805755395683, "grad_norm": 0.8008555173873901, "learning_rate": 4.250766053823608e-05, "loss": 0.8678, "step": 4500 },
    { "epoch": 0.9192645883293366, "grad_norm": 1.5738321542739868, "learning_rate": 4.234112709832134e-05, "loss": 0.8379, "step": 4600 },
    { "epoch": 0.9392486011191047, "grad_norm": 1.7838175296783447, "learning_rate": 4.2174593658406606e-05, "loss": 0.9581, "step": 4700 },
    { "epoch": 0.9592326139088729, "grad_norm": 1.6761012077331543, "learning_rate": 4.200806021849188e-05, "loss": 0.8583, "step": 4800 },
    { "epoch": 0.9792166266986411, "grad_norm": 1.320033073425293, "learning_rate": 4.184152677857714e-05, "loss": 0.8709, "step": 4900 },
    { "epoch": 0.9992006394884093, "grad_norm": 2.0931804180145264, "learning_rate": 4.1674993338662404e-05, "loss": 0.9064, "step": 5000 },
    { "epoch": 1.0191846522781776, "grad_norm": 1.1682066917419434, "learning_rate": 4.150845989874767e-05, "loss": 0.8728, "step": 5100 },
    { "epoch": 1.0391686650679457, "grad_norm": 1.2569072246551514, "learning_rate": 4.134192645883294e-05, "loss": 0.8921, "step": 5200 },
    { "epoch": 1.0591526778577138, "grad_norm": 1.4215683937072754, "learning_rate": 4.11753930189182e-05, "loss": 0.8211, "step": 5300 },
    { "epoch": 1.079136690647482, "grad_norm": 1.6081187725067139, "learning_rate": 4.1008859579003467e-05, "loss": 0.9338, "step": 5400 },
    { "epoch": 1.09912070343725, "grad_norm": 1.4916200637817383, "learning_rate": 4.084232613908873e-05, "loss": 0.8142, "step": 5500 },
    { "epoch": 1.1191047162270185, "grad_norm": 1.8639625310897827, "learning_rate": 4.0675792699174e-05, "loss": 0.8746, "step": 5600 },
    { "epoch": 1.1390887290167866, "grad_norm": 1.1741764545440674, "learning_rate": 4.0509259259259265e-05, "loss": 0.8032, "step": 5700 },
    { "epoch": 1.1590727418065547, "grad_norm": 1.7627875804901123, "learning_rate": 4.034272581934453e-05, "loss": 0.8681, "step": 5800 },
    { "epoch": 1.1790567545963229, "grad_norm": 0.7432733178138733, "learning_rate": 4.0176192379429786e-05, "loss": 0.8968, "step": 5900 },
    { "epoch": 1.1990407673860912, "grad_norm": 1.5172642469406128, "learning_rate": 4.0009658939515056e-05, "loss": 0.9653, "step": 6000 },
    { "epoch": 1.2190247801758594, "grad_norm": 2.0822958946228027, "learning_rate": 3.984312549960032e-05, "loss": 0.8038, "step": 6100 },
    { "epoch": 1.2390087929656275, "grad_norm": 2.2852039337158203, "learning_rate": 3.9676592059685584e-05, "loss": 0.8041, "step": 6200 },
    { "epoch": 1.2589928057553956, "grad_norm": 1.214968204498291, "learning_rate": 3.951005861977085e-05, "loss": 0.8382, "step": 6300 },
    { "epoch": 1.2789768185451638, "grad_norm": 2.792722225189209, "learning_rate": 3.934352517985612e-05, "loss": 0.8534, "step": 6400 },
    { "epoch": 1.2989608313349321, "grad_norm": 1.6279624700546265, "learning_rate": 3.917699173994138e-05, "loss": 0.8387, "step": 6500 },
    { "epoch": 1.3189448441247003, "grad_norm": 1.57301664352417, "learning_rate": 3.9010458300026646e-05, "loss": 0.8583, "step": 6600 },
    { "epoch": 1.3389288569144684, "grad_norm": 1.2693675756454468, "learning_rate": 3.884392486011191e-05, "loss": 0.7893, "step": 6700 },
    { "epoch": 1.3589128697042367, "grad_norm": 1.1760280132293701, "learning_rate": 3.867739142019718e-05, "loss": 0.8204, "step": 6800 },
    { "epoch": 1.3788968824940047, "grad_norm": 1.8213127851486206, "learning_rate": 3.8510857980282444e-05, "loss": 0.9061, "step": 6900 },
    { "epoch": 1.398880895283773, "grad_norm": 1.2589592933654785, "learning_rate": 3.834432454036771e-05, "loss": 0.856, "step": 7000 },
    { "epoch": 1.4188649080735412, "grad_norm": 2.5817718505859375, "learning_rate": 3.817779110045297e-05, "loss": 0.8542, "step": 7100 },
    { "epoch": 1.4388489208633093, "grad_norm": 1.1825404167175293, "learning_rate": 3.8011257660538236e-05, "loss": 0.8298, "step": 7200 },
    { "epoch": 1.4588329336530776, "grad_norm": 1.6443575620651245, "learning_rate": 3.78447242206235e-05, "loss": 0.823, "step": 7300 },
    { "epoch": 1.4788169464428458, "grad_norm": 1.9887899160385132, "learning_rate": 3.7678190780708764e-05, "loss": 0.7936, "step": 7400 },
    { "epoch": 1.498800959232614, "grad_norm": 1.8799304962158203, "learning_rate": 3.7511657340794034e-05, "loss": 0.8755, "step": 7500 },
    { "epoch": 1.518784972022382, "grad_norm": 1.6680015325546265, "learning_rate": 3.73451239008793e-05, "loss": 0.8669, "step": 7600 },
    { "epoch": 1.5387689848121502, "grad_norm": 1.7756261825561523, "learning_rate": 3.717859046096456e-05, "loss": 0.8572, "step": 7700 },
    { "epoch": 1.5587529976019185, "grad_norm": 1.3951911926269531, "learning_rate": 3.7012057021049826e-05, "loss": 0.8422, "step": 7800 },
    { "epoch": 1.5787370103916867, "grad_norm": 1.8145322799682617, "learning_rate": 3.6845523581135096e-05, "loss": 0.783, "step": 7900 },
    { "epoch": 1.5987210231814548, "grad_norm": 1.4113447666168213, "learning_rate": 3.667899014122036e-05, "loss": 0.8368, "step": 8000 },
    { "epoch": 1.6187050359712232, "grad_norm": 1.5562957525253296, "learning_rate": 3.6512456701305624e-05, "loss": 0.8232, "step": 8100 },
    { "epoch": 1.638689048760991, "grad_norm": 2.0334463119506836, "learning_rate": 3.634592326139089e-05, "loss": 0.8272, "step": 8200 },
    { "epoch": 1.6586730615507594, "grad_norm": 2.305115222930908, "learning_rate": 3.617938982147616e-05, "loss": 0.8708, "step": 8300 },
    { "epoch": 1.6786570743405276, "grad_norm": 1.9576376676559448, "learning_rate": 3.601285638156142e-05, "loss": 0.8437, "step": 8400 },
    { "epoch": 1.6986410871302957, "grad_norm": 1.324064016342163, "learning_rate": 3.584632294164668e-05, "loss": 0.8197, "step": 8500 },
    { "epoch": 1.718625099920064, "grad_norm": 1.5594903230667114, "learning_rate": 3.567978950173195e-05, "loss": 0.8365, "step": 8600 },
    { "epoch": 1.738609112709832, "grad_norm": 1.853633999824524, "learning_rate": 3.5513256061817214e-05, "loss": 0.7914, "step": 8700 },
    { "epoch": 1.7585931254996003, "grad_norm": 1.839158296585083, "learning_rate": 3.534672262190248e-05, "loss": 0.9374, "step": 8800 },
    { "epoch": 1.7785771382893685, "grad_norm": 2.5038366317749023, "learning_rate": 3.518018918198774e-05, "loss": 0.8202, "step": 8900 },
    { "epoch": 1.7985611510791366, "grad_norm": 1.7603284120559692, "learning_rate": 3.501365574207301e-05, "loss": 0.8346, "step": 9000 },
    { "epoch": 1.818545163868905, "grad_norm": 1.9243416786193848, "learning_rate": 3.4847122302158276e-05, "loss": 0.8141, "step": 9100 },
    { "epoch": 1.838529176658673, "grad_norm": 1.6993930339813232, "learning_rate": 3.468058886224354e-05, "loss": 0.7974, "step": 9200 },
    { "epoch": 1.8585131894484412, "grad_norm": 1.9248780012130737, "learning_rate": 3.4514055422328804e-05, "loss": 0.8543, "step": 9300 },
    { "epoch": 1.8784972022382096, "grad_norm": 1.7247469425201416, "learning_rate": 3.4347521982414074e-05, "loss": 0.7968, "step": 9400 },
    { "epoch": 1.8984812150279775, "grad_norm": 1.165992259979248, "learning_rate": 3.418098854249934e-05, "loss": 0.7946, "step": 9500 },
    { "epoch": 1.9184652278177459, "grad_norm": 1.5617034435272217, "learning_rate": 3.40144551025846e-05, "loss": 0.8452, "step": 9600 },
    { "epoch": 1.938449240607514, "grad_norm": 1.9524955749511719, "learning_rate": 3.3847921662669866e-05, "loss": 0.821, "step": 9700 },
    { "epoch": 1.9584332533972821, "grad_norm": 1.201984167098999, "learning_rate": 3.3681388222755136e-05, "loss": 0.8244, "step": 9800 },
    { "epoch": 1.9784172661870505, "grad_norm": 1.5261083841323853, "learning_rate": 3.3514854782840393e-05, "loss": 0.8521, "step": 9900 },
    { "epoch": 1.9984012789768184, "grad_norm": 0.8879593014717102, "learning_rate": 3.334832134292566e-05, "loss": 0.7745, "step": 10000 },
    { "epoch": 2.0183852917665868, "grad_norm": 1.9460114240646362, "learning_rate": 3.318178790301093e-05, "loss": 0.7926, "step": 10100 },
    { "epoch": 2.038369304556355, "grad_norm": 2.0698747634887695, "learning_rate": 3.301525446309619e-05, "loss": 0.8668, "step": 10200 },
    { "epoch": 2.058353317346123, "grad_norm": 1.6188371181488037, "learning_rate": 3.2848721023181456e-05, "loss": 0.7743, "step": 10300 },
    { "epoch": 2.0783373301358914, "grad_norm": 1.4746142625808716, "learning_rate": 3.268218758326672e-05, "loss": 0.776, "step": 10400 },
    { "epoch": 2.0983213429256593, "grad_norm": 2.6285245418548584, "learning_rate": 3.251565414335199e-05, "loss": 0.7952, "step": 10500 },
    { "epoch": 2.1183053557154277, "grad_norm": 2.9462263584136963, "learning_rate": 3.2349120703437254e-05, "loss": 0.8492, "step": 10600 },
    { "epoch": 2.138289368505196, "grad_norm": 2.2768771648406982, "learning_rate": 3.218258726352252e-05, "loss": 0.7773, "step": 10700 },
    { "epoch": 2.158273381294964, "grad_norm": 2.4314112663269043, "learning_rate": 3.201605382360778e-05, "loss": 0.7789, "step": 10800 },
    { "epoch": 2.1782573940847323, "grad_norm": 2.631697654724121, "learning_rate": 3.184952038369305e-05, "loss": 0.7809, "step": 10900 },
    { "epoch": 2.1982414068745, "grad_norm": 2.0636370182037354, "learning_rate": 3.1682986943778316e-05, "loss": 0.7627, "step": 11000 },
    { "epoch": 2.2182254196642686, "grad_norm": 1.861494779586792, "learning_rate": 3.151645350386358e-05, "loss": 0.7454, "step": 11100 },
    { "epoch": 2.238209432454037, "grad_norm": 1.6431078910827637, "learning_rate": 3.134992006394884e-05, "loss": 0.8208, "step": 11200 },
    { "epoch": 2.258193445243805, "grad_norm": 1.1081715822219849, "learning_rate": 3.118338662403411e-05, "loss": 0.7963, "step": 11300 },
    { "epoch": 2.278177458033573, "grad_norm": 1.6696077585220337, "learning_rate": 3.101685318411937e-05, "loss": 0.7948, "step": 11400 },
    { "epoch": 2.2981614708233415, "grad_norm": 1.1712377071380615, "learning_rate": 3.0850319744204635e-05, "loss": 0.7907, "step": 11500 },
    { "epoch": 2.3181454836131095, "grad_norm": 1.28898024559021, "learning_rate": 3.06837863042899e-05, "loss": 0.7791, "step": 11600 },
    { "epoch": 2.338129496402878, "grad_norm": 1.3267985582351685, "learning_rate": 3.0517252864375166e-05, "loss": 0.7819, "step": 11700 },
    { "epoch": 2.3581135091926457, "grad_norm": 1.4074293375015259, "learning_rate": 3.0350719424460434e-05, "loss": 0.818, "step": 11800 },
    { "epoch": 2.378097521982414, "grad_norm": 0.9492627382278442, "learning_rate": 3.0184185984545697e-05, "loss": 0.7689, "step": 11900 },
    { "epoch": 2.3980815347721824, "grad_norm": 1.8090003728866577, "learning_rate": 3.0017652544630965e-05, "loss": 0.7845, "step": 12000 },
    { "epoch": 2.4180655475619504, "grad_norm": 1.899207353591919, "learning_rate": 2.985111910471623e-05, "loss": 0.7742, "step": 12100 },
    { "epoch": 2.4380495603517187, "grad_norm": 2.0821797847747803, "learning_rate": 2.9684585664801496e-05, "loss": 0.7696, "step": 12200 },
    { "epoch": 2.4580335731414866, "grad_norm": 1.089876651763916, "learning_rate": 2.951805222488676e-05, "loss": 0.7873, "step": 12300 },
    { "epoch": 2.478017585931255, "grad_norm": 1.265599250793457, "learning_rate": 2.9351518784972027e-05, "loss": 0.7504, "step": 12400 },
    { "epoch": 2.4980015987210233, "grad_norm": 2.7753829956054688, "learning_rate": 2.9184985345057287e-05, "loss": 0.7824, "step": 12500 },
    { "epoch": 2.5179856115107913, "grad_norm": 1.0310410261154175, "learning_rate": 2.901845190514255e-05, "loss": 0.7326, "step": 12600 },
    { "epoch": 2.5379696243005596, "grad_norm": 2.056279182434082, "learning_rate": 2.8851918465227818e-05, "loss": 0.8411, "step": 12700 },
    { "epoch": 2.5579536370903275, "grad_norm": 1.1815407276153564, "learning_rate": 2.8685385025313082e-05, "loss": 0.805, "step": 12800 },
    { "epoch": 2.577937649880096, "grad_norm": 1.6167210340499878, "learning_rate": 2.851885158539835e-05, "loss": 0.7311, "step": 12900 },
    { "epoch": 2.5979216626698642, "grad_norm": 1.488755226135254, "learning_rate": 2.8352318145483613e-05, "loss": 0.8335, "step": 13000 },
    { "epoch": 2.617905675459632, "grad_norm": 2.5013859272003174, "learning_rate": 2.818578470556888e-05, "loss": 0.7833, "step": 13100 },
    { "epoch": 2.6378896882494005, "grad_norm": 1.102152943611145, "learning_rate": 2.8019251265654144e-05, "loss": 0.7536, "step": 13200 },
    { "epoch": 2.6578737010391684, "grad_norm": 1.3805499076843262, "learning_rate": 2.785271782573941e-05, "loss": 0.775, "step": 13300 },
    { "epoch": 2.677857713828937, "grad_norm": 2.189347505569458, "learning_rate": 2.7686184385824675e-05, "loss": 0.8178, "step": 13400 },
    { "epoch": 2.697841726618705, "grad_norm": 0.9750763177871704, "learning_rate": 2.7519650945909942e-05, "loss": 0.7743, "step": 13500 },
    { "epoch": 2.7178257394084735, "grad_norm": 1.2844312191009521, "learning_rate": 2.7353117505995206e-05, "loss": 0.7631, "step": 13600 },
    { "epoch": 2.7378097521982414, "grad_norm": 1.4551914930343628, "learning_rate": 2.7186584066080474e-05, "loss": 0.76, "step": 13700 },
    { "epoch": 2.7577937649880093, "grad_norm": 0.8891064524650574, "learning_rate": 2.7020050626165734e-05, "loss": 0.8252, "step": 13800 },
    { "epoch": 2.7777777777777777, "grad_norm": 1.9776784181594849, "learning_rate": 2.6853517186250998e-05, "loss": 0.8037, "step": 13900 },
    { "epoch": 2.797761790567546, "grad_norm": 1.429692029953003, "learning_rate": 2.6686983746336265e-05, "loss": 0.7369, "step": 14000 },
    { "epoch": 2.8177458033573144, "grad_norm": 2.0837855339050293, "learning_rate": 2.652045030642153e-05, "loss": 0.742, "step": 14100 },
    { "epoch": 2.8377298161470823, "grad_norm": 1.2353509664535522, "learning_rate": 2.6353916866506796e-05, "loss": 0.7615, "step": 14200 },
    { "epoch": 2.8577138289368507, "grad_norm": 0.8735284209251404, "learning_rate": 2.618738342659206e-05, "loss": 0.7872, "step": 14300 },
    { "epoch": 2.8776978417266186, "grad_norm": 1.0889009237289429, "learning_rate": 2.6020849986677327e-05, "loss": 0.7455, "step": 14400 },
    { "epoch": 2.897681854516387, "grad_norm": 1.506787657737732, "learning_rate": 2.585431654676259e-05, "loss": 0.792, "step": 14500 },
    { "epoch": 2.9176658673061553, "grad_norm": 0.7630636096000671, "learning_rate": 2.5687783106847858e-05, "loss": 0.7918, "step": 14600 },
    { "epoch": 2.937649880095923, "grad_norm": 1.6361045837402344, "learning_rate": 2.5521249666933122e-05, "loss": 0.7357, "step": 14700 },
    { "epoch": 2.9576338928856916, "grad_norm": 2.248220920562744, "learning_rate": 2.535471622701839e-05, "loss": 0.7954, "step": 14800 },
    { "epoch": 2.9776179056754595, "grad_norm": 1.14662766456604, "learning_rate": 2.5188182787103653e-05, "loss": 0.7865, "step": 14900 },
    { "epoch": 2.997601918465228, "grad_norm": 1.3895844221115112, "learning_rate": 2.502164934718892e-05, "loss": 0.7364, "step": 15000 },
    { "epoch": 3.017585931254996, "grad_norm": 2.1330533027648926, "learning_rate": 2.485511590727418e-05, "loss": 0.7244, "step": 15100 },
    { "epoch": 3.037569944044764, "grad_norm": 1.384775996208191, "learning_rate": 2.4688582467359448e-05, "loss": 0.7393, "step": 15200 },
    { "epoch": 3.0575539568345325, "grad_norm": 0.9841705560684204, "learning_rate": 2.4522049027444712e-05, "loss": 0.8051, "step": 15300 },
    { "epoch": 3.0775379696243004, "grad_norm": 1.224924921989441, "learning_rate": 2.435551558752998e-05, "loss": 0.8004, "step": 15400 },
    { "epoch": 3.0975219824140687, "grad_norm": 2.2387399673461914, "learning_rate": 2.418898214761524e-05, "loss": 0.8051, "step": 15500 },
    { "epoch": 3.117505995203837, "grad_norm": 1.8771803379058838, "learning_rate": 2.4022448707700507e-05, "loss": 0.7903, "step": 15600 },
    { "epoch": 3.137490007993605, "grad_norm": 1.786600112915039, "learning_rate": 2.385591526778577e-05, "loss": 0.7796, "step": 15700 },
    { "epoch": 3.1574740207833734, "grad_norm": 1.0823020935058594, "learning_rate": 2.3689381827871038e-05, "loss": 0.7468, "step": 15800 },
    { "epoch": 3.1774580335731413, "grad_norm": 1.9462608098983765, "learning_rate": 2.35228483879563e-05, "loss": 0.7854, "step": 15900 },
    { "epoch": 3.1974420463629096, "grad_norm": 1.3235732316970825, "learning_rate": 2.335631494804157e-05, "loss": 0.7584, "step": 16000 },
    { "epoch": 3.217426059152678, "grad_norm": 1.5206961631774902, "learning_rate": 2.3189781508126833e-05, "loss": 0.8104, "step": 16100 },
    { "epoch": 3.237410071942446, "grad_norm": 1.4281466007232666, "learning_rate": 2.3023248068212097e-05, "loss": 0.7505, "step": 16200 },
    { "epoch": 3.2573940847322143, "grad_norm": 1.9032511711120605, "learning_rate": 2.2856714628297364e-05, "loss": 0.7813, "step": 16300 },
    { "epoch": 3.277378097521982, "grad_norm": 2.10361909866333, "learning_rate": 2.2690181188382628e-05, "loss": 0.7369, "step": 16400 },
    { "epoch": 3.2973621103117505, "grad_norm": 1.440158486366272, "learning_rate": 2.2523647748467895e-05, "loss": 0.7576, "step": 16500 },
    { "epoch": 3.317346123101519, "grad_norm": 1.8777798414230347, "learning_rate": 2.235711430855316e-05, "loss": 0.7317, "step": 16600 },
    { "epoch": 3.337330135891287, "grad_norm": 1.6413357257843018, "learning_rate": 2.2190580868638426e-05, "loss": 0.7802, "step": 16700 },
    { "epoch": 3.357314148681055, "grad_norm": 1.820087194442749, "learning_rate": 2.2024047428723686e-05, "loss": 0.7435, "step": 16800 },
    { "epoch": 3.3772981614708235, "grad_norm": 2.5140113830566406, "learning_rate": 2.1857513988808954e-05, "loss": 0.7532, "step": 16900 },
    { "epoch": 3.3972821742605914, "grad_norm": 1.7011070251464844, "learning_rate": 2.1690980548894217e-05, "loss": 0.728, "step": 17000 },
    { "epoch": 3.41726618705036, "grad_norm": 1.3051706552505493, "learning_rate": 2.1524447108979485e-05, "loss": 0.7493, "step": 17100 },
    { "epoch": 3.437250199840128, "grad_norm": 0.9745834469795227, "learning_rate": 2.135791366906475e-05, "loss": 0.7219, "step": 17200 },
    { "epoch": 3.457234212629896, "grad_norm": 1.3213515281677246, "learning_rate": 2.1191380229150016e-05, "loss": 0.7703, "step": 17300 },
    { "epoch": 3.4772182254196644, "grad_norm": 0.735060453414917, "learning_rate": 2.102484678923528e-05, "loss": 0.7342, "step": 17400 },
    { "epoch": 3.4972022382094323, "grad_norm": 1.073197603225708, "learning_rate": 2.0858313349320543e-05, "loss": 0.7023, "step": 17500 },
    { "epoch": 3.5171862509992007, "grad_norm": 1.797711730003357, "learning_rate": 2.069177990940581e-05, "loss": 0.7944, "step": 17600 },
    { "epoch": 3.537170263788969, "grad_norm": 1.3365331888198853, "learning_rate": 2.0525246469491074e-05, "loss": 0.7773, "step": 17700 },
    { "epoch": 3.557154276578737, "grad_norm": 1.451333999633789, "learning_rate": 2.035871302957634e-05, "loss": 0.7659, "step": 17800 },
    { "epoch": 3.5771382893685053, "grad_norm": 1.5622735023498535, "learning_rate": 2.0192179589661606e-05, "loss": 0.7676, "step": 17900 },
    { "epoch": 3.597122302158273, "grad_norm": 1.5826952457427979, "learning_rate": 2.0025646149746873e-05, "loss": 0.7393, "step": 18000 },
    { "epoch": 3.6171063149480416, "grad_norm": 0.7937633991241455, "learning_rate": 1.9859112709832133e-05, "loss": 0.7112, "step": 18100 },
    { "epoch": 3.63709032773781, "grad_norm": 1.8199377059936523, "learning_rate": 1.96925792699174e-05, "loss": 0.7722, "step": 18200 },
    { "epoch": 3.657074340527578, "grad_norm": 2.317171573638916, "learning_rate": 1.9526045830002664e-05, "loss": 0.7735, "step": 18300 },
    { "epoch": 3.677058353317346, "grad_norm": 0.9344459772109985, "learning_rate": 1.935951239008793e-05, "loss": 0.7168, "step": 18400 },
    { "epoch": 3.697042366107114, "grad_norm": 1.833892583847046, "learning_rate": 1.9192978950173195e-05, "loss": 0.6825, "step": 18500 },
    { "epoch": 3.7170263788968825, "grad_norm": 1.4093741178512573, "learning_rate": 1.9026445510258463e-05, "loss": 0.7087, "step": 18600 },
    { "epoch": 3.737010391686651, "grad_norm": 2.0284645557403564, "learning_rate": 1.8859912070343726e-05, "loss": 0.7579, "step": 18700 },
    { "epoch": 3.7569944044764187, "grad_norm": 1.8383668661117554, "learning_rate": 1.869337863042899e-05, "loss": 0.7603, "step": 18800 },
    { "epoch": 3.776978417266187, "grad_norm": 1.5985366106033325, "learning_rate": 1.8526845190514254e-05, "loss": 0.7166, "step": 18900 },
| { | |
| "epoch": 3.796962430055955, | |
| "grad_norm": 1.5089521408081055, | |
| "learning_rate": 1.836031175059952e-05, | |
| "loss": 0.7678, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 3.8169464428457234, | |
| "grad_norm": 1.2770063877105713, | |
| "learning_rate": 1.8193778310684785e-05, | |
| "loss": 0.7213, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 3.8369304556354917, | |
| "grad_norm": 2.4528274536132812, | |
| "learning_rate": 1.8027244870770052e-05, | |
| "loss": 0.7255, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 3.8569144684252596, | |
| "grad_norm": 1.736755132675171, | |
| "learning_rate": 1.7860711430855316e-05, | |
| "loss": 0.6784, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 3.876898481215028, | |
| "grad_norm": 1.719307780265808, | |
| "learning_rate": 1.7694177990940583e-05, | |
| "loss": 0.7795, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 3.896882494004796, | |
| "grad_norm": 2.070528984069824, | |
| "learning_rate": 1.7527644551025847e-05, | |
| "loss": 0.7509, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 3.9168665067945643, | |
| "grad_norm": 1.6482255458831787, | |
| "learning_rate": 1.736111111111111e-05, | |
| "loss": 0.7202, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 3.9368505195843326, | |
| "grad_norm": 1.1660830974578857, | |
| "learning_rate": 1.719457767119638e-05, | |
| "loss": 0.7042, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 3.956834532374101, | |
| "grad_norm": 1.0131560564041138, | |
| "learning_rate": 1.7028044231281642e-05, | |
| "loss": 0.7059, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 3.976818545163869, | |
| "grad_norm": 1.1839569807052612, | |
| "learning_rate": 1.686151079136691e-05, | |
| "loss": 0.77, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 3.996802557953637, | |
| "grad_norm": 1.736053705215454, | |
| "learning_rate": 1.6694977351452173e-05, | |
| "loss": 0.7703, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 4.016786570743405, | |
| "grad_norm": 1.3700270652770996, | |
| "learning_rate": 1.6528443911537437e-05, | |
| "loss": 0.6643, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 4.0367705835331735, | |
| "grad_norm": 1.347440481185913, | |
| "learning_rate": 1.63619104716227e-05, | |
| "loss": 0.7502, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 4.056754596322942, | |
| "grad_norm": 1.9421720504760742, | |
| "learning_rate": 1.6195377031707968e-05, | |
| "loss": 0.7382, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 4.07673860911271, | |
| "grad_norm": 0.9211772084236145, | |
| "learning_rate": 1.6028843591793232e-05, | |
| "loss": 0.7249, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 4.096722621902478, | |
| "grad_norm": 2.1698520183563232, | |
| "learning_rate": 1.58623101518785e-05, | |
| "loss": 0.7339, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 4.116706634692246, | |
| "grad_norm": 1.6852116584777832, | |
| "learning_rate": 1.5695776711963763e-05, | |
| "loss": 0.7525, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 4.136690647482014, | |
| "grad_norm": 1.8582841157913208, | |
| "learning_rate": 1.552924327204903e-05, | |
| "loss": 0.7168, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 4.156674660271783, | |
| "grad_norm": 1.3949832916259766, | |
| "learning_rate": 1.536270983213429e-05, | |
| "loss": 0.6835, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 4.176658673061551, | |
| "grad_norm": 2.044853925704956, | |
| "learning_rate": 1.5196176392219558e-05, | |
| "loss": 0.7332, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 4.196642685851319, | |
| "grad_norm": 1.3187381029129028, | |
| "learning_rate": 1.5029642952304823e-05, | |
| "loss": 0.7724, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 4.216626698641087, | |
| "grad_norm": 1.18405020236969, | |
| "learning_rate": 1.4863109512390089e-05, | |
| "loss": 0.7677, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 4.236610711430855, | |
| "grad_norm": 1.2868226766586304, | |
| "learning_rate": 1.4696576072475355e-05, | |
| "loss": 0.7168, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 4.256594724220624, | |
| "grad_norm": 2.145659923553467, | |
| "learning_rate": 1.453004263256062e-05, | |
| "loss": 0.7574, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 4.276578737010392, | |
| "grad_norm": 1.0491008758544922, | |
| "learning_rate": 1.4363509192645886e-05, | |
| "loss": 0.7274, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 4.2965627498001595, | |
| "grad_norm": 1.9524632692337036, | |
| "learning_rate": 1.4196975752731148e-05, | |
| "loss": 0.7256, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 4.316546762589928, | |
| "grad_norm": 1.6348446607589722, | |
| "learning_rate": 1.4030442312816413e-05, | |
| "loss": 0.6971, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 4.336530775379696, | |
| "grad_norm": 1.6102409362792969, | |
| "learning_rate": 1.3863908872901679e-05, | |
| "loss": 0.7031, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 4.356514788169465, | |
| "grad_norm": 1.4496809244155884, | |
| "learning_rate": 1.3697375432986944e-05, | |
| "loss": 0.76, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 4.376498800959233, | |
| "grad_norm": 2.370002508163452, | |
| "learning_rate": 1.353084199307221e-05, | |
| "loss": 0.7098, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 4.396482813749, | |
| "grad_norm": 1.1416559219360352, | |
| "learning_rate": 1.3364308553157475e-05, | |
| "loss": 0.7565, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 4.416466826538769, | |
| "grad_norm": 1.6672168970108032, | |
| "learning_rate": 1.319777511324274e-05, | |
| "loss": 0.7939, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 4.436450839328537, | |
| "grad_norm": 1.1106956005096436, | |
| "learning_rate": 1.3031241673328005e-05, | |
| "loss": 0.6645, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 4.4564348521183055, | |
| "grad_norm": 1.4987940788269043, | |
| "learning_rate": 1.286470823341327e-05, | |
| "loss": 0.7117, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 4.476418864908074, | |
| "grad_norm": 2.063014268875122, | |
| "learning_rate": 1.2698174793498536e-05, | |
| "loss": 0.767, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 4.496402877697841, | |
| "grad_norm": 0.748756468296051, | |
| "learning_rate": 1.2531641353583801e-05, | |
| "loss": 0.7393, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 4.51638689048761, | |
| "grad_norm": 1.3971226215362549, | |
| "learning_rate": 1.2365107913669065e-05, | |
| "loss": 0.7782, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 4.536370903277378, | |
| "grad_norm": 1.5306447744369507, | |
| "learning_rate": 1.219857447375433e-05, | |
| "loss": 0.7299, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 4.556354916067146, | |
| "grad_norm": 1.409225344657898, | |
| "learning_rate": 1.2032041033839596e-05, | |
| "loss": 0.6752, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 4.576338928856915, | |
| "grad_norm": 1.396794080734253, | |
| "learning_rate": 1.186550759392486e-05, | |
| "loss": 0.6417, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 4.596322941646683, | |
| "grad_norm": 1.6455470323562622, | |
| "learning_rate": 1.1698974154010126e-05, | |
| "loss": 0.7545, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 4.616306954436451, | |
| "grad_norm": 1.4188311100006104, | |
| "learning_rate": 1.1532440714095391e-05, | |
| "loss": 0.7217, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 4.636290967226219, | |
| "grad_norm": 1.1025303602218628, | |
| "learning_rate": 1.1365907274180657e-05, | |
| "loss": 0.7419, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 4.656274980015987, | |
| "grad_norm": 1.0919783115386963, | |
| "learning_rate": 1.119937383426592e-05, | |
| "loss": 0.725, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 4.676258992805756, | |
| "grad_norm": 2.179637908935547, | |
| "learning_rate": 1.1032840394351186e-05, | |
| "loss": 0.7052, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 4.696243005595523, | |
| "grad_norm": 1.4243191480636597, | |
| "learning_rate": 1.0866306954436452e-05, | |
| "loss": 0.7437, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 4.7162270183852915, | |
| "grad_norm": 1.6711329221725464, | |
| "learning_rate": 1.0699773514521715e-05, | |
| "loss": 0.7378, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 4.73621103117506, | |
| "grad_norm": 1.2967829704284668, | |
| "learning_rate": 1.0533240074606981e-05, | |
| "loss": 0.7386, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 4.756195043964828, | |
| "grad_norm": 1.737625002861023, | |
| "learning_rate": 1.0366706634692246e-05, | |
| "loss": 0.7012, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 4.7761790567545965, | |
| "grad_norm": 1.062472939491272, | |
| "learning_rate": 1.020017319477751e-05, | |
| "loss": 0.6797, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 4.796163069544365, | |
| "grad_norm": 1.044542908668518, | |
| "learning_rate": 1.0033639754862776e-05, | |
| "loss": 0.7285, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 4.816147082334132, | |
| "grad_norm": 1.70567786693573, | |
| "learning_rate": 9.867106314948041e-06, | |
| "loss": 0.7777, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 4.836131095123901, | |
| "grad_norm": 1.6937395334243774, | |
| "learning_rate": 9.700572875033307e-06, | |
| "loss": 0.7378, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 4.856115107913669, | |
| "grad_norm": 2.7036936283111572, | |
| "learning_rate": 9.534039435118572e-06, | |
| "loss": 0.7813, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 4.876099120703437, | |
| "grad_norm": 1.1682194471359253, | |
| "learning_rate": 9.367505995203838e-06, | |
| "loss": 0.7155, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 4.896083133493206, | |
| "grad_norm": 1.2117973566055298, | |
| "learning_rate": 9.200972555289104e-06, | |
| "loss": 0.7273, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 4.916067146282973, | |
| "grad_norm": 0.9339836239814758, | |
| "learning_rate": 9.034439115374367e-06, | |
| "loss": 0.7368, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 4.936051159072742, | |
| "grad_norm": 1.3919428586959839, | |
| "learning_rate": 8.867905675459633e-06, | |
| "loss": 0.7059, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 4.95603517186251, | |
| "grad_norm": 2.1438040733337402, | |
| "learning_rate": 8.701372235544898e-06, | |
| "loss": 0.7197, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 4.976019184652278, | |
| "grad_norm": 1.892350435256958, | |
| "learning_rate": 8.534838795630162e-06, | |
| "loss": 0.7292, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 4.996003197442047, | |
| "grad_norm": 2.050062656402588, | |
| "learning_rate": 8.368305355715428e-06, | |
| "loss": 0.7791, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 5.015987210231814, | |
| "grad_norm": 2.285053014755249, | |
| "learning_rate": 8.201771915800693e-06, | |
| "loss": 0.6937, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 5.0359712230215825, | |
| "grad_norm": 1.6725279092788696, | |
| "learning_rate": 8.035238475885959e-06, | |
| "loss": 0.7443, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 5.055955235811351, | |
| "grad_norm": 1.590450644493103, | |
| "learning_rate": 7.868705035971223e-06, | |
| "loss": 0.7069, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 5.075939248601119, | |
| "grad_norm": 0.7603669762611389, | |
| "learning_rate": 7.702171596056488e-06, | |
| "loss": 0.6778, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 5.095923261390888, | |
| "grad_norm": 1.8916963338851929, | |
| "learning_rate": 7.535638156141754e-06, | |
| "loss": 0.769, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 5.115907274180655, | |
| "grad_norm": 1.6110832691192627, | |
| "learning_rate": 7.3691047162270184e-06, | |
| "loss": 0.7027, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 5.135891286970423, | |
| "grad_norm": 1.796796202659607, | |
| "learning_rate": 7.202571276312284e-06, | |
| "loss": 0.72, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 5.155875299760192, | |
| "grad_norm": 1.8212794065475464, | |
| "learning_rate": 7.0360378363975495e-06, | |
| "loss": 0.7004, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 5.17585931254996, | |
| "grad_norm": 1.0340906381607056, | |
| "learning_rate": 6.869504396482813e-06, | |
| "loss": 0.6687, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 5.1958433253397285, | |
| "grad_norm": 1.8287034034729004, | |
| "learning_rate": 6.702970956568079e-06, | |
| "loss": 0.6774, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 5.215827338129497, | |
| "grad_norm": 1.657259225845337, | |
| "learning_rate": 6.536437516653344e-06, | |
| "loss": 0.6995, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 5.235811350919264, | |
| "grad_norm": 1.8235076665878296, | |
| "learning_rate": 6.36990407673861e-06, | |
| "loss": 0.7814, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 5.255795363709033, | |
| "grad_norm": 1.6127688884735107, | |
| "learning_rate": 6.203370636823875e-06, | |
| "loss": 0.6797, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 5.275779376498801, | |
| "grad_norm": 1.2275160551071167, | |
| "learning_rate": 6.03683719690914e-06, | |
| "loss": 0.7336, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 5.295763389288569, | |
| "grad_norm": 1.6593281030654907, | |
| "learning_rate": 5.870303756994405e-06, | |
| "loss": 0.6988, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 5.315747402078338, | |
| "grad_norm": 1.1069490909576416, | |
| "learning_rate": 5.70377031707967e-06, | |
| "loss": 0.801, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 5.335731414868105, | |
| "grad_norm": 1.7498623132705688, | |
| "learning_rate": 5.537236877164935e-06, | |
| "loss": 0.715, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 5.355715427657874, | |
| "grad_norm": 1.7322038412094116, | |
| "learning_rate": 5.3707034372502e-06, | |
| "loss": 0.7076, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 5.375699440447642, | |
| "grad_norm": 1.2660248279571533, | |
| "learning_rate": 5.204169997335465e-06, | |
| "loss": 0.7494, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 5.39568345323741, | |
| "grad_norm": 2.537752628326416, | |
| "learning_rate": 5.03763655742073e-06, | |
| "loss": 0.7326, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 5.415667466027179, | |
| "grad_norm": 0.991534411907196, | |
| "learning_rate": 4.8711031175059955e-06, | |
| "loss": 0.6929, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 5.435651478816946, | |
| "grad_norm": 2.0230729579925537, | |
| "learning_rate": 4.70456967759126e-06, | |
| "loss": 0.6787, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 5.4556354916067145, | |
| "grad_norm": 1.5560120344161987, | |
| "learning_rate": 4.538036237676526e-06, | |
| "loss": 0.7251, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 5.475619504396483, | |
| "grad_norm": 1.5086272954940796, | |
| "learning_rate": 4.371502797761791e-06, | |
| "loss": 0.7066, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 5.495603517186251, | |
| "grad_norm": 1.6183174848556519, | |
| "learning_rate": 4.204969357847056e-06, | |
| "loss": 0.7161, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 5.5155875299760195, | |
| "grad_norm": 1.1214239597320557, | |
| "learning_rate": 4.0384359179323214e-06, | |
| "loss": 0.7414, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 5.535571542765787, | |
| "grad_norm": 1.4948476552963257, | |
| "learning_rate": 3.871902478017586e-06, | |
| "loss": 0.7303, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 5.555555555555555, | |
| "grad_norm": 1.094460368156433, | |
| "learning_rate": 3.705369038102851e-06, | |
| "loss": 0.7214, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 5.575539568345324, | |
| "grad_norm": 1.8006253242492676, | |
| "learning_rate": 3.5388355981881163e-06, | |
| "loss": 0.7382, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 5.595523581135092, | |
| "grad_norm": 1.0595532655715942, | |
| "learning_rate": 3.3723021582733815e-06, | |
| "loss": 0.6835, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 5.61550759392486, | |
| "grad_norm": 1.5675129890441895, | |
| "learning_rate": 3.205768718358647e-06, | |
| "loss": 0.7426, | |
| "step": 28100 | |
| }, | |
| { | |
| "epoch": 5.635491606714629, | |
| "grad_norm": 1.543182134628296, | |
| "learning_rate": 3.0392352784439117e-06, | |
| "loss": 0.7393, | |
| "step": 28200 | |
| }, | |
| { | |
| "epoch": 5.655475619504396, | |
| "grad_norm": 1.6735225915908813, | |
| "learning_rate": 2.8727018385291768e-06, | |
| "loss": 0.7106, | |
| "step": 28300 | |
| }, | |
| { | |
| "epoch": 5.675459632294165, | |
| "grad_norm": 1.2037389278411865, | |
| "learning_rate": 2.706168398614442e-06, | |
| "loss": 0.6928, | |
| "step": 28400 | |
| }, | |
| { | |
| "epoch": 5.695443645083933, | |
| "grad_norm": 1.957836627960205, | |
| "learning_rate": 2.539634958699707e-06, | |
| "loss": 0.8086, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 5.715427657873701, | |
| "grad_norm": 2.085599899291992, | |
| "learning_rate": 2.373101518784972e-06, | |
| "loss": 0.7587, | |
| "step": 28600 | |
| }, | |
| { | |
| "epoch": 5.735411670663469, | |
| "grad_norm": 1.3564984798431396, | |
| "learning_rate": 2.206568078870237e-06, | |
| "loss": 0.7197, | |
| "step": 28700 | |
| }, | |
| { | |
| "epoch": 5.755395683453237, | |
| "grad_norm": 1.659226655960083, | |
| "learning_rate": 2.0400346389555023e-06, | |
| "loss": 0.728, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 5.7753796962430055, | |
| "grad_norm": 1.3784935474395752, | |
| "learning_rate": 1.8735011990407676e-06, | |
| "loss": 0.7561, | |
| "step": 28900 | |
| }, | |
| { | |
| "epoch": 5.795363709032774, | |
| "grad_norm": 1.4514496326446533, | |
| "learning_rate": 1.7069677591260325e-06, | |
| "loss": 0.7205, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 5.815347721822542, | |
| "grad_norm": 1.7896771430969238, | |
| "learning_rate": 1.5404343192112976e-06, | |
| "loss": 0.667, | |
| "step": 29100 | |
| }, | |
| { | |
| "epoch": 5.835331734612311, | |
| "grad_norm": 1.4074804782867432, | |
| "learning_rate": 1.3739008792965628e-06, | |
| "loss": 0.71, | |
| "step": 29200 | |
| }, | |
| { | |
| "epoch": 5.855315747402078, | |
| "grad_norm": 1.33772873878479, | |
| "learning_rate": 1.2073674393818279e-06, | |
| "loss": 0.7688, | |
| "step": 29300 | |
| }, | |
| { | |
| "epoch": 5.875299760191846, | |
| "grad_norm": 1.8295559883117676, | |
| "learning_rate": 1.040833999467093e-06, | |
| "loss": 0.7439, | |
| "step": 29400 | |
| }, | |
| { | |
| "epoch": 5.895283772981615, | |
| "grad_norm": 0.9400151371955872, | |
| "learning_rate": 8.743005595523582e-07, | |
| "loss": 0.6891, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 5.915267785771383, | |
| "grad_norm": 1.7990922927856445, | |
| "learning_rate": 7.077671196376233e-07, | |
| "loss": 0.6384, | |
| "step": 29600 | |
| }, | |
| { | |
| "epoch": 5.935251798561151, | |
| "grad_norm": 1.74308180809021, | |
| "learning_rate": 5.412336797228884e-07, | |
| "loss": 0.6973, | |
| "step": 29700 | |
| }, | |
| { | |
| "epoch": 5.955235811350919, | |
| "grad_norm": 1.1248557567596436, | |
| "learning_rate": 3.747002398081535e-07, | |
| "loss": 0.7265, | |
| "step": 29800 | |
| }, | |
| { | |
| "epoch": 5.975219824140687, | |
| "grad_norm": 1.9805783033370972, | |
| "learning_rate": 2.0816679989341861e-07, | |
| "loss": 0.7107, | |
| "step": 29900 | |
| }, | |
| { | |
| "epoch": 5.995203836930456, | |
| "grad_norm": 1.6576383113861084, | |
| "learning_rate": 4.163335997868372e-08, | |
| "loss": 0.7073, | |
| "step": 30000 | |
| } | |
| ], | |
| "logging_steps": 100, | |
| "max_steps": 30024, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 6, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.230686056448e+16, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |