Invalid JSON: Unexpected token 'I', ..."ad_norm": Infinity,
"... is not valid JSON
| { | |
| "best_global_step": 72000, | |
| "best_metric": 3.5296003818511963, | |
| "best_model_checkpoint": "/scratch/cl5625/exceptions/models/last_to_carry_frequency_2128/checkpoint-40000", | |
| "epoch": 29.129041654529566, | |
| "eval_steps": 1000, | |
| "global_step": 100000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.014564520827264784, | |
| "grad_norm": 0.8700253963470459, | |
| "learning_rate": 0.000294, | |
| "loss": 8.4648, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.029129041654529567, | |
| "grad_norm": 0.8772715330123901, | |
| "learning_rate": 0.0005939999999999999, | |
| "loss": 6.735, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.043693562481794346, | |
| "grad_norm": 0.5721131563186646, | |
| "learning_rate": 0.0005998286213931798, | |
| "loss": 6.3403, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.058258083309059135, | |
| "grad_norm": 0.42677509784698486, | |
| "learning_rate": 0.0005996537452637714, | |
| "loss": 6.1411, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.07282260413632391, | |
| "grad_norm": 0.477642685174942, | |
| "learning_rate": 0.0005994788691343632, | |
| "loss": 5.9982, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.08738712496358869, | |
| "grad_norm": 0.49641019105911255, | |
| "learning_rate": 0.0005993039930049548, | |
| "loss": 5.8842, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.10195164579085349, | |
| "grad_norm": 0.43664422631263733, | |
| "learning_rate": 0.0005991291168755465, | |
| "loss": 5.7548, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.11651616661811827, | |
| "grad_norm": 0.4933745861053467, | |
| "learning_rate": 0.0005989542407461382, | |
| "loss": 5.6359, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.13108068744538304, | |
| "grad_norm": 0.48850634694099426, | |
| "learning_rate": 0.0005987793646167297, | |
| "loss": 5.5233, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.14564520827264782, | |
| "grad_norm": 0.4590892493724823, | |
| "learning_rate": 0.0005986044884873214, | |
| "loss": 5.4179, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.1602097290999126, | |
| "grad_norm": 0.4702446162700653, | |
| "learning_rate": 0.0005984296123579131, | |
| "loss": 5.3419, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.17477424992717738, | |
| "grad_norm": 0.43737664818763733, | |
| "learning_rate": 0.0005982547362285047, | |
| "loss": 5.2567, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.18933877075444216, | |
| "grad_norm": 0.5059506893157959, | |
| "learning_rate": 0.0005980798600990964, | |
| "loss": 5.1911, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.20390329158170697, | |
| "grad_norm": 0.46494734287261963, | |
| "learning_rate": 0.0005979049839696881, | |
| "loss": 5.1499, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.21846781240897176, | |
| "grad_norm": 0.5374208688735962, | |
| "learning_rate": 0.0005977301078402798, | |
| "loss": 5.0895, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.23303233323623654, | |
| "grad_norm": 0.3784054219722748, | |
| "learning_rate": 0.0005975552317108715, | |
| "loss": 5.0231, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.24759685406350132, | |
| "grad_norm": 0.4124829173088074, | |
| "learning_rate": 0.0005973803555814631, | |
| "loss": 4.9945, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.2621613748907661, | |
| "grad_norm": 0.41545918583869934, | |
| "learning_rate": 0.0005972054794520547, | |
| "loss": 4.9248, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.27672589571803086, | |
| "grad_norm": 0.48763376474380493, | |
| "learning_rate": 0.0005970306033226464, | |
| "loss": 4.8882, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.29129041654529564, | |
| "grad_norm": 0.4686114192008972, | |
| "learning_rate": 0.0005968557271932381, | |
| "loss": 4.8401, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.29129041654529564, | |
| "eval_accuracy": 0.2521537178997339, | |
| "eval_loss": 4.771047115325928, | |
| "eval_runtime": 178.0535, | |
| "eval_samples_per_second": 93.461, | |
| "eval_steps_per_second": 5.847, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.3058549373725604, | |
| "grad_norm": 0.5042315721511841, | |
| "learning_rate": 0.0005966808510638297, | |
| "loss": 4.7859, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.3204194581998252, | |
| "grad_norm": 0.42784640192985535, | |
| "learning_rate": 0.0005965059749344214, | |
| "loss": 4.75, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.33498397902709, | |
| "grad_norm": 0.4303656816482544, | |
| "learning_rate": 0.0005963310988050131, | |
| "loss": 4.7097, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.34954849985435477, | |
| "grad_norm": 0.55189049243927, | |
| "learning_rate": 0.0005961562226756047, | |
| "loss": 4.6761, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.36411302068161955, | |
| "grad_norm": 0.45820021629333496, | |
| "learning_rate": 0.0005959813465461965, | |
| "loss": 4.6322, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.37867754150888433, | |
| "grad_norm": 0.45095351338386536, | |
| "learning_rate": 0.000595806470416788, | |
| "loss": 4.6107, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.39324206233614917, | |
| "grad_norm": 0.42550787329673767, | |
| "learning_rate": 0.0005956315942873797, | |
| "loss": 4.5928, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.40780658316341395, | |
| "grad_norm": 0.4250703454017639, | |
| "learning_rate": 0.0005954567181579714, | |
| "loss": 4.5615, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.42237110399067873, | |
| "grad_norm": 0.4031296670436859, | |
| "learning_rate": 0.000595281842028563, | |
| "loss": 4.5317, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.4369356248179435, | |
| "grad_norm": 0.4141838848590851, | |
| "learning_rate": 0.0005951069658991547, | |
| "loss": 4.5138, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.4515001456452083, | |
| "grad_norm": 0.3957858383655548, | |
| "learning_rate": 0.0005949320897697464, | |
| "loss": 4.492, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.4660646664724731, | |
| "grad_norm": 0.42144542932510376, | |
| "learning_rate": 0.0005947572136403381, | |
| "loss": 4.4686, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.48062918729973786, | |
| "grad_norm": 0.3965926766395569, | |
| "learning_rate": 0.0005945823375109297, | |
| "loss": 4.4541, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.49519370812700264, | |
| "grad_norm": 0.46729621291160583, | |
| "learning_rate": 0.0005944074613815215, | |
| "loss": 4.4285, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.5097582289542674, | |
| "grad_norm": 0.3873760998249054, | |
| "learning_rate": 0.000594232585252113, | |
| "loss": 4.4088, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.5243227497815321, | |
| "grad_norm": 0.475016325712204, | |
| "learning_rate": 0.0005940577091227047, | |
| "loss": 4.403, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.5388872706087969, | |
| "grad_norm": 0.407781183719635, | |
| "learning_rate": 0.0005938828329932964, | |
| "loss": 4.3943, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.5534517914360617, | |
| "grad_norm": 0.4060550332069397, | |
| "learning_rate": 0.000593707956863888, | |
| "loss": 4.3651, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.5680163122633265, | |
| "grad_norm": 0.4468280076980591, | |
| "learning_rate": 0.0005935330807344797, | |
| "loss": 4.359, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.5825808330905913, | |
| "grad_norm": 0.3966878652572632, | |
| "learning_rate": 0.0005933582046050714, | |
| "loss": 4.3331, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.5825808330905913, | |
| "eval_accuracy": 0.2995227523184138, | |
| "eval_loss": 4.283666133880615, | |
| "eval_runtime": 177.3551, | |
| "eval_samples_per_second": 93.829, | |
| "eval_steps_per_second": 5.87, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.5971453539178561, | |
| "grad_norm": 0.39190909266471863, | |
| "learning_rate": 0.000593183328475663, | |
| "loss": 4.3321, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.6117098747451208, | |
| "grad_norm": 0.38748636841773987, | |
| "learning_rate": 0.0005930084523462546, | |
| "loss": 4.3227, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.6262743955723856, | |
| "grad_norm": 0.351399302482605, | |
| "learning_rate": 0.0005928335762168463, | |
| "loss": 4.2934, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.6408389163996504, | |
| "grad_norm": 0.42470622062683105, | |
| "learning_rate": 0.000592658700087438, | |
| "loss": 4.2908, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.6554034372269152, | |
| "grad_norm": 0.3766157329082489, | |
| "learning_rate": 0.0005924838239580297, | |
| "loss": 4.2909, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.66996795805418, | |
| "grad_norm": 0.5269715189933777, | |
| "learning_rate": 0.0005923089478286214, | |
| "loss": 4.2651, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.6845324788814448, | |
| "grad_norm": 0.40443506836891174, | |
| "learning_rate": 0.000592134071699213, | |
| "loss": 4.2846, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.6990969997087095, | |
| "grad_norm": 0.3809632360935211, | |
| "learning_rate": 0.0005919591955698047, | |
| "loss": 4.2389, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.7136615205359743, | |
| "grad_norm": 0.3910612463951111, | |
| "learning_rate": 0.0005917843194403964, | |
| "loss": 4.2332, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.7282260413632391, | |
| "grad_norm": 0.3787578046321869, | |
| "learning_rate": 0.000591609443310988, | |
| "loss": 4.2352, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.7427905621905039, | |
| "grad_norm": 0.36433854699134827, | |
| "learning_rate": 0.0005914345671815796, | |
| "loss": 4.2098, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.7573550830177687, | |
| "grad_norm": 0.3667093515396118, | |
| "learning_rate": 0.0005912596910521713, | |
| "loss": 4.22, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.7719196038450336, | |
| "grad_norm": 0.36228933930397034, | |
| "learning_rate": 0.0005910848149227629, | |
| "loss": 4.2054, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.7864841246722983, | |
| "grad_norm": 0.3999609649181366, | |
| "learning_rate": 0.0005909099387933547, | |
| "loss": 4.2007, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.8010486454995631, | |
| "grad_norm": 0.3630334734916687, | |
| "learning_rate": 0.0005907350626639463, | |
| "loss": 4.1834, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.8156131663268279, | |
| "grad_norm": 0.3720364272594452, | |
| "learning_rate": 0.000590560186534538, | |
| "loss": 4.1863, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.8301776871540927, | |
| "grad_norm": 0.4026776850223541, | |
| "learning_rate": 0.0005903853104051297, | |
| "loss": 4.1729, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.8447422079813575, | |
| "grad_norm": 0.42495641112327576, | |
| "learning_rate": 0.0005902104342757214, | |
| "loss": 4.1673, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.8593067288086222, | |
| "grad_norm": 0.3550451397895813, | |
| "learning_rate": 0.000590035558146313, | |
| "loss": 4.1517, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.873871249635887, | |
| "grad_norm": 0.37605759501457214, | |
| "learning_rate": 0.0005898606820169046, | |
| "loss": 4.1526, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.873871249635887, | |
| "eval_accuracy": 0.31542069895270813, | |
| "eval_loss": 4.094066619873047, | |
| "eval_runtime": 177.1428, | |
| "eval_samples_per_second": 93.941, | |
| "eval_steps_per_second": 5.877, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.8884357704631518, | |
| "grad_norm": 0.34033265709877014, | |
| "learning_rate": 0.0005896858058874963, | |
| "loss": 4.1369, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.9030002912904166, | |
| "grad_norm": 0.3450336456298828, | |
| "learning_rate": 0.0005895109297580879, | |
| "loss": 4.1231, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.9175648121176814, | |
| "grad_norm": 0.35001909732818604, | |
| "learning_rate": 0.0005893360536286797, | |
| "loss": 4.1407, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.9321293329449462, | |
| "grad_norm": 0.365225225687027, | |
| "learning_rate": 0.0005891611774992713, | |
| "loss": 4.1225, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.9466938537722109, | |
| "grad_norm": 0.3506869077682495, | |
| "learning_rate": 0.000588986301369863, | |
| "loss": 4.1155, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.9612583745994757, | |
| "grad_norm": 0.3621656000614166, | |
| "learning_rate": 0.0005888114252404547, | |
| "loss": 4.1049, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.9758228954267405, | |
| "grad_norm": 0.33093249797821045, | |
| "learning_rate": 0.0005886365491110463, | |
| "loss": 4.0921, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.9903874162540053, | |
| "grad_norm": 0.3546348810195923, | |
| "learning_rate": 0.000588461672981638, | |
| "loss": 4.1038, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.00495193708127, | |
| "grad_norm": 0.33640843629837036, | |
| "learning_rate": 0.0005882867968522296, | |
| "loss": 4.0615, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 1.0195164579085347, | |
| "grad_norm": 0.33964186906814575, | |
| "learning_rate": 0.0005881119207228212, | |
| "loss": 4.0277, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.0340809787357996, | |
| "grad_norm": 0.34980398416519165, | |
| "learning_rate": 0.0005879370445934129, | |
| "loss": 4.003, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 1.0486454995630643, | |
| "grad_norm": 0.3941769599914551, | |
| "learning_rate": 0.0005877621684640046, | |
| "loss": 4.0155, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.0632100203903292, | |
| "grad_norm": 0.34226739406585693, | |
| "learning_rate": 0.0005875872923345963, | |
| "loss": 4.0147, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 1.0777745412175939, | |
| "grad_norm": 0.3588155508041382, | |
| "learning_rate": 0.000587412416205188, | |
| "loss": 4.0108, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.0923390620448588, | |
| "grad_norm": 0.403024286031723, | |
| "learning_rate": 0.0005872375400757797, | |
| "loss": 4.0115, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 1.1069035828721234, | |
| "grad_norm": 0.3496304154396057, | |
| "learning_rate": 0.0005870626639463713, | |
| "loss": 3.9869, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.1214681036993883, | |
| "grad_norm": 0.3462716042995453, | |
| "learning_rate": 0.0005868877878169629, | |
| "loss": 4.0138, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 1.136032624526653, | |
| "grad_norm": 0.3597177565097809, | |
| "learning_rate": 0.0005867129116875546, | |
| "loss": 3.9896, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.1505971453539179, | |
| "grad_norm": 0.3294402062892914, | |
| "learning_rate": 0.0005865380355581462, | |
| "loss": 3.9925, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 1.1651616661811826, | |
| "grad_norm": 0.34170660376548767, | |
| "learning_rate": 0.0005863631594287379, | |
| "loss": 4.0002, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.1651616661811826, | |
| "eval_accuracy": 0.32493390114318127, | |
| "eval_loss": 3.989036798477173, | |
| "eval_runtime": 177.4875, | |
| "eval_samples_per_second": 93.759, | |
| "eval_steps_per_second": 5.865, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.1797261870084474, | |
| "grad_norm": 0.35232222080230713, | |
| "learning_rate": 0.0005861882832993296, | |
| "loss": 3.9896, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 1.1942907078357121, | |
| "grad_norm": 0.3442701995372772, | |
| "learning_rate": 0.0005860134071699212, | |
| "loss": 3.9966, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.208855228662977, | |
| "grad_norm": 0.3539937436580658, | |
| "learning_rate": 0.000585838531040513, | |
| "loss": 3.9818, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 1.2234197494902417, | |
| "grad_norm": 0.36053037643432617, | |
| "learning_rate": 0.0005856636549111046, | |
| "loss": 3.9763, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.2379842703175066, | |
| "grad_norm": 0.35037800669670105, | |
| "learning_rate": 0.0005854887787816963, | |
| "loss": 3.978, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 1.2525487911447715, | |
| "grad_norm": 0.3158578872680664, | |
| "learning_rate": 0.0005853139026522879, | |
| "loss": 3.9718, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.2671133119720361, | |
| "grad_norm": 0.35837191343307495, | |
| "learning_rate": 0.0005851390265228796, | |
| "loss": 3.968, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 1.2816778327993008, | |
| "grad_norm": 0.3676842749118805, | |
| "learning_rate": 0.0005849641503934712, | |
| "loss": 3.9758, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.2962423536265657, | |
| "grad_norm": 0.3639284670352936, | |
| "learning_rate": 0.0005847892742640629, | |
| "loss": 3.9694, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 1.3108068744538306, | |
| "grad_norm": 0.35421234369277954, | |
| "learning_rate": 0.0005846143981346546, | |
| "loss": 3.9627, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.3253713952810953, | |
| "grad_norm": 0.3280850648880005, | |
| "learning_rate": 0.0005844395220052462, | |
| "loss": 3.9495, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 1.33993591610836, | |
| "grad_norm": 0.34968069195747375, | |
| "learning_rate": 0.000584264645875838, | |
| "loss": 3.9585, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.3545004369356248, | |
| "grad_norm": 0.36847731471061707, | |
| "learning_rate": 0.0005840897697464296, | |
| "loss": 3.9568, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 1.3690649577628897, | |
| "grad_norm": 0.3364753723144531, | |
| "learning_rate": 0.0005839148936170212, | |
| "loss": 3.9476, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 1.3836294785901544, | |
| "grad_norm": 0.330708384513855, | |
| "learning_rate": 0.0005837400174876129, | |
| "loss": 3.9338, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 1.398193999417419, | |
| "grad_norm": 0.34026676416397095, | |
| "learning_rate": 0.0005835651413582045, | |
| "loss": 3.9356, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.412758520244684, | |
| "grad_norm": 0.35521167516708374, | |
| "learning_rate": 0.0005833902652287962, | |
| "loss": 3.9481, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 1.4273230410719489, | |
| "grad_norm": 0.3294624388217926, | |
| "learning_rate": 0.0005832153890993879, | |
| "loss": 3.9449, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.4418875618992135, | |
| "grad_norm": 0.3308870792388916, | |
| "learning_rate": 0.0005830405129699796, | |
| "loss": 3.93, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 1.4564520827264782, | |
| "grad_norm": 0.3472626209259033, | |
| "learning_rate": 0.0005828656368405712, | |
| "loss": 3.9325, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.4564520827264782, | |
| "eval_accuracy": 0.3317897428968204, | |
| "eval_loss": 3.913301467895508, | |
| "eval_runtime": 182.7411, | |
| "eval_samples_per_second": 91.063, | |
| "eval_steps_per_second": 5.697, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.471016603553743, | |
| "grad_norm": 0.32565367221832275, | |
| "learning_rate": 0.0005826907607111629, | |
| "loss": 3.9295, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 1.485581124381008, | |
| "grad_norm": 0.32511332631111145, | |
| "learning_rate": 0.0005825158845817546, | |
| "loss": 3.9292, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 1.5001456452082726, | |
| "grad_norm": 0.3272392451763153, | |
| "learning_rate": 0.0005823410084523462, | |
| "loss": 3.9286, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 1.5147101660355373, | |
| "grad_norm": 0.35094988346099854, | |
| "learning_rate": 0.0005821661323229379, | |
| "loss": 3.9151, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 1.5292746868628022, | |
| "grad_norm": 0.3307012617588043, | |
| "learning_rate": 0.0005819912561935295, | |
| "loss": 3.9205, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 1.543839207690067, | |
| "grad_norm": 0.32005444169044495, | |
| "learning_rate": 0.0005818163800641212, | |
| "loss": 3.9183, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 1.5584037285173318, | |
| "grad_norm": 0.339083194732666, | |
| "learning_rate": 0.0005816415039347129, | |
| "loss": 3.9148, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 1.5729682493445964, | |
| "grad_norm": 0.33372458815574646, | |
| "learning_rate": 0.0005814666278053045, | |
| "loss": 3.9079, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 1.5875327701718613, | |
| "grad_norm": 0.31862950325012207, | |
| "learning_rate": 0.0005812917516758962, | |
| "loss": 3.9085, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 1.6020972909991262, | |
| "grad_norm": 0.3127332329750061, | |
| "learning_rate": 0.0005811168755464879, | |
| "loss": 3.9125, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.616661811826391, | |
| "grad_norm": 0.33630993962287903, | |
| "learning_rate": 0.0005809419994170794, | |
| "loss": 3.9024, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 1.6312263326536556, | |
| "grad_norm": 0.33405524492263794, | |
| "learning_rate": 0.0005807671232876712, | |
| "loss": 3.8963, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 1.6457908534809205, | |
| "grad_norm": 0.3463149070739746, | |
| "learning_rate": 0.0005805922471582628, | |
| "loss": 3.8897, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 1.6603553743081854, | |
| "grad_norm": 0.34034085273742676, | |
| "learning_rate": 0.0005804173710288545, | |
| "loss": 3.8894, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 1.67491989513545, | |
| "grad_norm": 0.32425183057785034, | |
| "learning_rate": 0.0005802424948994462, | |
| "loss": 3.8879, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 1.6894844159627147, | |
| "grad_norm": 0.3245879113674164, | |
| "learning_rate": 0.0005800676187700379, | |
| "loss": 3.8917, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 1.7040489367899796, | |
| "grad_norm": 0.3300783336162567, | |
| "learning_rate": 0.0005798927426406295, | |
| "loss": 3.8913, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 1.7186134576172445, | |
| "grad_norm": 0.32403889298439026, | |
| "learning_rate": 0.0005797178665112212, | |
| "loss": 3.8974, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 1.7331779784445092, | |
| "grad_norm": 0.3420887887477875, | |
| "learning_rate": 0.0005795429903818129, | |
| "loss": 3.8882, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 1.7477424992717738, | |
| "grad_norm": 0.3141533136367798, | |
| "learning_rate": 0.0005793681142524044, | |
| "loss": 3.8836, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.7477424992717738, | |
| "eval_accuracy": 0.3370036823439996, | |
| "eval_loss": 3.8556597232818604, | |
| "eval_runtime": 180.6605, | |
| "eval_samples_per_second": 92.112, | |
| "eval_steps_per_second": 5.762, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.7623070200990387, | |
| "grad_norm": 0.31102555990219116, | |
| "learning_rate": 0.0005791932381229961, | |
| "loss": 3.8717, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 1.7768715409263036, | |
| "grad_norm": 0.32457712292671204, | |
| "learning_rate": 0.0005790183619935878, | |
| "loss": 3.878, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 1.7914360617535683, | |
| "grad_norm": 0.3170512914657593, | |
| "learning_rate": 0.0005788434858641795, | |
| "loss": 3.8658, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 1.806000582580833, | |
| "grad_norm": 0.3156622350215912, | |
| "learning_rate": 0.0005786686097347712, | |
| "loss": 3.8749, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 1.8205651034080979, | |
| "grad_norm": 0.331449955701828, | |
| "learning_rate": 0.0005784937336053628, | |
| "loss": 3.8718, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 1.8351296242353627, | |
| "grad_norm": 0.33456477522850037, | |
| "learning_rate": 0.0005783188574759545, | |
| "loss": 3.8733, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 1.8496941450626274, | |
| "grad_norm": 0.34169068932533264, | |
| "learning_rate": 0.0005781439813465462, | |
| "loss": 3.8603, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 1.864258665889892, | |
| "grad_norm": 0.33886536955833435, | |
| "learning_rate": 0.0005779691052171379, | |
| "loss": 3.8503, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 1.878823186717157, | |
| "grad_norm": 0.32133013010025024, | |
| "learning_rate": 0.0005777942290877294, | |
| "loss": 3.8587, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 1.8933877075444219, | |
| "grad_norm": 0.3214126527309418, | |
| "learning_rate": 0.0005776193529583211, | |
| "loss": 3.8627, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.9079522283716865, | |
| "grad_norm": 0.3037171959877014, | |
| "learning_rate": 0.0005774444768289128, | |
| "loss": 3.8568, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 1.9225167491989512, | |
| "grad_norm": 0.3018639385700226, | |
| "learning_rate": 0.0005772696006995045, | |
| "loss": 3.8433, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 1.937081270026216, | |
| "grad_norm": 0.3456135094165802, | |
| "learning_rate": 0.0005770947245700962, | |
| "loss": 3.8554, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 1.951645790853481, | |
| "grad_norm": 0.30263465642929077, | |
| "learning_rate": 0.0005769198484406878, | |
| "loss": 3.8617, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 1.9662103116807457, | |
| "grad_norm": 0.33179566264152527, | |
| "learning_rate": 0.0005767449723112795, | |
| "loss": 3.845, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 1.9807748325080103, | |
| "grad_norm": 0.32268857955932617, | |
| "learning_rate": 0.0005765700961818712, | |
| "loss": 3.8517, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 1.9953393533352752, | |
| "grad_norm": 0.30953162908554077, | |
| "learning_rate": 0.0005763952200524627, | |
| "loss": 3.8508, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 2.00990387416254, | |
| "grad_norm": 0.3232676088809967, | |
| "learning_rate": 0.0005762203439230544, | |
| "loss": 3.775, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 2.024468394989805, | |
| "grad_norm": 0.33479437232017517, | |
| "learning_rate": 0.0005760454677936461, | |
| "loss": 3.7414, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 2.0390329158170695, | |
| "grad_norm": 0.3089365065097809, | |
| "learning_rate": 0.0005758705916642378, | |
| "loss": 3.7659, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.0390329158170695, | |
| "eval_accuracy": 0.3411992237125408, | |
| "eval_loss": 3.8152074813842773, | |
| "eval_runtime": 179.851, | |
| "eval_samples_per_second": 92.527, | |
| "eval_steps_per_second": 5.788, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.0535974366443344, | |
| "grad_norm": 0.32421961426734924, | |
| "learning_rate": 0.0005756957155348294, | |
| "loss": 3.7506, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 2.0681619574715993, | |
| "grad_norm": 0.31613728404045105, | |
| "learning_rate": 0.0005755208394054211, | |
| "loss": 3.747, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 2.082726478298864, | |
| "grad_norm": 0.34031516313552856, | |
| "learning_rate": 0.0005753459632760128, | |
| "loss": 3.7435, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 2.0972909991261286, | |
| "grad_norm": 0.33439844846725464, | |
| "learning_rate": 0.0005751710871466045, | |
| "loss": 3.7575, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 2.1118555199533935, | |
| "grad_norm": 0.30961179733276367, | |
| "learning_rate": 0.0005749962110171962, | |
| "loss": 3.7542, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 2.1264200407806584, | |
| "grad_norm": 0.340189665555954, | |
| "learning_rate": 0.0005748213348877877, | |
| "loss": 3.7555, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 2.1409845616079233, | |
| "grad_norm": 0.3085797131061554, | |
| "learning_rate": 0.0005746464587583794, | |
| "loss": 3.7475, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 2.1555490824351877, | |
| "grad_norm": 0.31862226128578186, | |
| "learning_rate": 0.0005744715826289711, | |
| "loss": 3.7449, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 2.1701136032624526, | |
| "grad_norm": 0.326460599899292, | |
| "learning_rate": 0.0005742967064995627, | |
| "loss": 3.7634, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 2.1846781240897175, | |
| "grad_norm": 0.32189229130744934, | |
| "learning_rate": 0.0005741218303701544, | |
| "loss": 3.7557, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 2.1992426449169824, | |
| "grad_norm": 0.33258241415023804, | |
| "learning_rate": 0.0005739469542407461, | |
| "loss": 3.7553, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 2.213807165744247, | |
| "grad_norm": 0.32828521728515625, | |
| "learning_rate": 0.0005737720781113378, | |
| "loss": 3.7561, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 2.2283716865715117, | |
| "grad_norm": 0.3204036355018616, | |
| "learning_rate": 0.0005735972019819295, | |
| "loss": 3.7508, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 2.2429362073987766, | |
| "grad_norm": 0.3126814365386963, | |
| "learning_rate": 0.000573422325852521, | |
| "loss": 3.7471, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 2.2575007282260415, | |
| "grad_norm": 0.3180292546749115, | |
| "learning_rate": 0.0005732474497231127, | |
| "loss": 3.7549, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 2.272065249053306, | |
| "grad_norm": 0.34320777654647827, | |
| "learning_rate": 0.0005730725735937044, | |
| "loss": 3.7495, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 2.286629769880571, | |
| "grad_norm": 0.2991611063480377, | |
| "learning_rate": 0.0005728976974642961, | |
| "loss": 3.7613, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 2.3011942907078358, | |
| "grad_norm": 0.3215004503726959, | |
| "learning_rate": 0.0005727228213348877, | |
| "loss": 3.7665, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 2.3157588115351007, | |
| "grad_norm": 0.3161200284957886, | |
| "learning_rate": 0.0005725479452054794, | |
| "loss": 3.7515, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 2.330323332362365, | |
| "grad_norm": 0.3195878267288208, | |
| "learning_rate": 0.0005723730690760711, | |
| "loss": 3.7532, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.330323332362365, | |
| "eval_accuracy": 0.3439830019247253, | |
| "eval_loss": 3.782982349395752, | |
| "eval_runtime": 181.9405, | |
| "eval_samples_per_second": 91.464, | |
| "eval_steps_per_second": 5.722, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.34488785318963, | |
| "grad_norm": 0.31494781374931335, | |
| "learning_rate": 0.0005721981929466627, | |
| "loss": 3.7528, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 2.359452374016895, | |
| "grad_norm": 0.3217615783214569, | |
| "learning_rate": 0.0005720233168172545, | |
| "loss": 3.7411, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 2.37401689484416, | |
| "grad_norm": 0.3436919152736664, | |
| "learning_rate": 0.000571848440687846, | |
| "loss": 3.7428, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 2.3885814156714242, | |
| "grad_norm": 0.33180883526802063, | |
| "learning_rate": 0.0005716735645584377, | |
| "loss": 3.7483, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 2.403145936498689, | |
| "grad_norm": 0.3267768919467926, | |
| "learning_rate": 0.0005714986884290294, | |
| "loss": 3.7527, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 2.417710457325954, | |
| "grad_norm": 0.3256717920303345, | |
| "learning_rate": 0.000571323812299621, | |
| "loss": 3.7446, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 2.432274978153219, | |
| "grad_norm": 0.29885855317115784, | |
| "learning_rate": 0.0005711489361702127, | |
| "loss": 3.7641, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 2.4468394989804834, | |
| "grad_norm": 0.33645039796829224, | |
| "learning_rate": 0.0005709740600408044, | |
| "loss": 3.7423, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 2.4614040198077483, | |
| "grad_norm": 0.34129101037979126, | |
| "learning_rate": 0.0005707991839113961, | |
| "loss": 3.7658, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 2.475968540635013, | |
| "grad_norm": 0.3134017884731293, | |
| "learning_rate": 0.0005706243077819877, | |
| "loss": 3.7417, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 2.490533061462278, | |
| "grad_norm": 0.34077146649360657, | |
| "learning_rate": 0.0005704494316525793, | |
| "loss": 3.7468, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 2.505097582289543, | |
| "grad_norm": 0.3154885470867157, | |
| "learning_rate": 0.000570274555523171, | |
| "loss": 3.7407, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 2.5196621031168074, | |
| "grad_norm": 0.30456680059432983, | |
| "learning_rate": 0.0005700996793937627, | |
| "loss": 3.7523, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 2.5342266239440723, | |
| "grad_norm": 0.3212231397628784, | |
| "learning_rate": 0.0005699248032643544, | |
| "loss": 3.7513, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 2.548791144771337, | |
| "grad_norm": 0.32085394859313965, | |
| "learning_rate": 0.000569749927134946, | |
| "loss": 3.7364, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 2.5633556655986016, | |
| "grad_norm": 0.32111606001853943, | |
| "learning_rate": 0.0005695750510055377, | |
| "loss": 3.7442, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 2.5779201864258665, | |
| "grad_norm": 0.3163037896156311, | |
| "learning_rate": 0.0005694001748761294, | |
| "loss": 3.7447, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 2.5924847072531314, | |
| "grad_norm": 0.3073652386665344, | |
| "learning_rate": 0.000569225298746721, | |
| "loss": 3.7537, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 2.6070492280803963, | |
| "grad_norm": 0.3179319500923157, | |
| "learning_rate": 0.0005690504226173127, | |
| "loss": 3.7442, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 2.621613748907661, | |
| "grad_norm": 0.32027360796928406, | |
| "learning_rate": 0.0005688755464879043, | |
| "loss": 3.7549, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 2.621613748907661, | |
| "eval_accuracy": 0.34702843553240287, | |
| "eval_loss": 3.7536637783050537, | |
| "eval_runtime": 182.0423, | |
| "eval_samples_per_second": 91.413, | |
| "eval_steps_per_second": 5.718, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 2.6361782697349256, | |
| "grad_norm": 0.31084582209587097, | |
| "learning_rate": 0.000568700670358496, | |
| "loss": 3.7455, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 2.6507427905621905, | |
| "grad_norm": 0.32904064655303955, | |
| "learning_rate": 0.0005685257942290877, | |
| "loss": 3.7334, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 2.6653073113894554, | |
| "grad_norm": 0.3119734227657318, | |
| "learning_rate": 0.0005683509180996793, | |
| "loss": 3.7421, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 2.67987183221672, | |
| "grad_norm": 0.3344007730484009, | |
| "learning_rate": 0.000568176041970271, | |
| "loss": 3.7414, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 2.6944363530439848, | |
| "grad_norm": 0.31935930252075195, | |
| "learning_rate": 0.0005680011658408627, | |
| "loss": 3.7334, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 2.7090008738712497, | |
| "grad_norm": 0.29859659075737, | |
| "learning_rate": 0.0005678262897114544, | |
| "loss": 3.73, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 2.7235653946985146, | |
| "grad_norm": 0.3266841769218445, | |
| "learning_rate": 0.000567651413582046, | |
| "loss": 3.7266, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 2.7381299155257794, | |
| "grad_norm": 0.31048107147216797, | |
| "learning_rate": 0.0005674765374526377, | |
| "loss": 3.7315, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 2.752694436353044, | |
| "grad_norm": 0.32837024331092834, | |
| "learning_rate": 0.0005673016613232293, | |
| "loss": 3.734, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 2.767258957180309, | |
| "grad_norm": 0.3099216818809509, | |
| "learning_rate": 0.0005671267851938209, | |
| "loss": 3.7351, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 2.7818234780075737, | |
| "grad_norm": 0.32197684049606323, | |
| "learning_rate": 0.0005669519090644127, | |
| "loss": 3.7335, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 2.796387998834838, | |
| "grad_norm": 0.32030460238456726, | |
| "learning_rate": 0.0005667770329350043, | |
| "loss": 3.7219, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 2.810952519662103, | |
| "grad_norm": 0.31780993938446045, | |
| "learning_rate": 0.000566602156805596, | |
| "loss": 3.719, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 2.825517040489368, | |
| "grad_norm": 0.3219031095504761, | |
| "learning_rate": 0.0005664272806761877, | |
| "loss": 3.7344, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 2.840081561316633, | |
| "grad_norm": 0.3055329918861389, | |
| "learning_rate": 0.0005662524045467793, | |
| "loss": 3.721, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 2.8546460821438977, | |
| "grad_norm": 0.32618221640586853, | |
| "learning_rate": 0.000566077528417371, | |
| "loss": 3.7255, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 2.869210602971162, | |
| "grad_norm": 0.30417823791503906, | |
| "learning_rate": 0.0005659026522879626, | |
| "loss": 3.7293, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 2.883775123798427, | |
| "grad_norm": 0.30620038509368896, | |
| "learning_rate": 0.0005657277761585543, | |
| "loss": 3.7166, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 2.898339644625692, | |
| "grad_norm": 0.29888835549354553, | |
| "learning_rate": 0.0005655529000291459, | |
| "loss": 3.7218, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 2.9129041654529564, | |
| "grad_norm": 0.3025401830673218, | |
| "learning_rate": 0.0005653780238997376, | |
| "loss": 3.7257, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 2.9129041654529564, | |
| "eval_accuracy": 0.3495604365752613, | |
| "eval_loss": 3.72790265083313, | |
| "eval_runtime": 179.1187, | |
| "eval_samples_per_second": 92.905, | |
| "eval_steps_per_second": 5.812, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 2.9274686862802213, | |
| "grad_norm": 0.300513356924057, | |
| "learning_rate": 0.0005652031477703293, | |
| "loss": 3.7143, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 2.942033207107486, | |
| "grad_norm": 0.3117719292640686, | |
| "learning_rate": 0.000565028271640921, | |
| "loss": 3.7157, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 2.956597727934751, | |
| "grad_norm": 0.30454131960868835, | |
| "learning_rate": 0.0005648533955115127, | |
| "loss": 3.7216, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 2.971162248762016, | |
| "grad_norm": 0.3001880943775177, | |
| "learning_rate": 0.0005646785193821043, | |
| "loss": 3.7289, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 2.9857267695892804, | |
| "grad_norm": 0.3038688898086548, | |
| "learning_rate": 0.000564503643252696, | |
| "loss": 3.7216, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 3.0002912904165453, | |
| "grad_norm": 0.3145761787891388, | |
| "learning_rate": 0.0005643287671232876, | |
| "loss": 3.7028, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 3.01485581124381, | |
| "grad_norm": 0.312094509601593, | |
| "learning_rate": 0.0005641538909938792, | |
| "loss": 3.6102, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 3.0294203320710746, | |
| "grad_norm": 0.32154324650764465, | |
| "learning_rate": 0.0005639790148644709, | |
| "loss": 3.6075, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 3.0439848528983395, | |
| "grad_norm": 0.37198418378829956, | |
| "learning_rate": 0.0005638041387350626, | |
| "loss": 3.6255, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 3.0585493737256044, | |
| "grad_norm": 0.3222768008708954, | |
| "learning_rate": 0.0005636292626056543, | |
| "loss": 3.6191, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 3.0731138945528693, | |
| "grad_norm": 0.3282540738582611, | |
| "learning_rate": 0.000563454386476246, | |
| "loss": 3.6232, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 3.087678415380134, | |
| "grad_norm": 0.31507760286331177, | |
| "learning_rate": 0.0005632795103468376, | |
| "loss": 3.6289, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 3.1022429362073987, | |
| "grad_norm": 0.32736721634864807, | |
| "learning_rate": 0.0005631046342174293, | |
| "loss": 3.6296, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 3.1168074570346636, | |
| "grad_norm": 0.30472615361213684, | |
| "learning_rate": 0.000562929758088021, | |
| "loss": 3.6449, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 3.1313719778619284, | |
| "grad_norm": 0.31732335686683655, | |
| "learning_rate": 0.0005627548819586126, | |
| "loss": 3.6378, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 3.145936498689193, | |
| "grad_norm": 0.3036065697669983, | |
| "learning_rate": 0.0005625800058292042, | |
| "loss": 3.6246, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 3.160501019516458, | |
| "grad_norm": 0.30610308051109314, | |
| "learning_rate": 0.0005624051296997959, | |
| "loss": 3.6416, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 3.1750655403437227, | |
| "grad_norm": 0.3098933696746826, | |
| "learning_rate": 0.0005622302535703876, | |
| "loss": 3.6409, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 3.1896300611709876, | |
| "grad_norm": 0.3232332170009613, | |
| "learning_rate": 0.0005620553774409792, | |
| "loss": 3.6316, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 3.2041945819982525, | |
| "grad_norm": 0.32577645778656006, | |
| "learning_rate": 0.000561880501311571, | |
| "loss": 3.631, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 3.2041945819982525, | |
| "eval_accuracy": 0.3517711600718335, | |
| "eval_loss": 3.7140092849731445, | |
| "eval_runtime": 177.1297, | |
| "eval_samples_per_second": 93.948, | |
| "eval_steps_per_second": 5.877, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 3.218759102825517, | |
| "grad_norm": 0.3147296607494354, | |
| "learning_rate": 0.0005617056251821626, | |
| "loss": 3.631, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 3.233323623652782, | |
| "grad_norm": 0.31632447242736816, | |
| "learning_rate": 0.0005615307490527543, | |
| "loss": 3.6358, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 3.2478881444800467, | |
| "grad_norm": 0.3221932649612427, | |
| "learning_rate": 0.000561355872923346, | |
| "loss": 3.6407, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 3.262452665307311, | |
| "grad_norm": 0.33630186319351196, | |
| "learning_rate": 0.0005611809967939375, | |
| "loss": 3.6311, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 3.277017186134576, | |
| "grad_norm": 0.3187567889690399, | |
| "learning_rate": 0.0005610061206645292, | |
| "loss": 3.6442, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 3.291581706961841, | |
| "grad_norm": 0.31802159547805786, | |
| "learning_rate": 0.0005608312445351209, | |
| "loss": 3.6407, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 3.306146227789106, | |
| "grad_norm": 0.3117535412311554, | |
| "learning_rate": 0.0005606563684057126, | |
| "loss": 3.6412, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 3.3207107486163707, | |
| "grad_norm": 0.3003256022930145, | |
| "learning_rate": 0.0005604814922763042, | |
| "loss": 3.6494, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 3.335275269443635, | |
| "grad_norm": 0.3591480255126953, | |
| "learning_rate": 0.0005603066161468959, | |
| "loss": 3.6408, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 3.3498397902709, | |
| "grad_norm": 0.3271133005619049, | |
| "learning_rate": 0.0005601317400174876, | |
| "loss": 3.6469, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 3.364404311098165, | |
| "grad_norm": 0.3171232342720032, | |
| "learning_rate": 0.0005599568638880793, | |
| "loss": 3.6423, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 3.3789688319254294, | |
| "grad_norm": 0.3135507106781006, | |
| "learning_rate": 0.0005597819877586709, | |
| "loss": 3.6472, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 3.3935333527526943, | |
| "grad_norm": 0.30243971943855286, | |
| "learning_rate": 0.0005596071116292625, | |
| "loss": 3.6556, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 3.408097873579959, | |
| "grad_norm": 0.3211742043495178, | |
| "learning_rate": 0.0005594322354998542, | |
| "loss": 3.636, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 3.422662394407224, | |
| "grad_norm": 0.3331124484539032, | |
| "learning_rate": 0.0005592573593704459, | |
| "loss": 3.6303, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 3.437226915234489, | |
| "grad_norm": 0.32489416003227234, | |
| "learning_rate": 0.0005590824832410375, | |
| "loss": 3.638, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 3.4517914360617534, | |
| "grad_norm": 0.31111645698547363, | |
| "learning_rate": 0.0005589076071116292, | |
| "loss": 3.644, | |
| "step": 11850 | |
| }, | |
| { | |
| "epoch": 3.4663559568890183, | |
| "grad_norm": 0.31853461265563965, | |
| "learning_rate": 0.0005587327309822209, | |
| "loss": 3.6332, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 3.480920477716283, | |
| "grad_norm": 0.31134849786758423, | |
| "learning_rate": 0.0005585578548528126, | |
| "loss": 3.6513, | |
| "step": 11950 | |
| }, | |
| { | |
| "epoch": 3.495484998543548, | |
| "grad_norm": 0.31197866797447205, | |
| "learning_rate": 0.0005583829787234043, | |
| "loss": 3.6437, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 3.495484998543548, | |
| "eval_accuracy": 0.35321032354601034, | |
| "eval_loss": 3.6977498531341553, | |
| "eval_runtime": 177.4665, | |
| "eval_samples_per_second": 93.77, | |
| "eval_steps_per_second": 5.866, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 3.5100495193708126, | |
| "grad_norm": 0.3007756173610687, | |
| "learning_rate": 0.0005582081025939958, | |
| "loss": 3.6602, | |
| "step": 12050 | |
| }, | |
| { | |
| "epoch": 3.5246140401980774, | |
| "grad_norm": 0.29941409826278687, | |
| "learning_rate": 0.0005580332264645875, | |
| "loss": 3.6426, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 3.5391785610253423, | |
| "grad_norm": 0.31950122117996216, | |
| "learning_rate": 0.0005578583503351792, | |
| "loss": 3.6384, | |
| "step": 12150 | |
| }, | |
| { | |
| "epoch": 3.5537430818526072, | |
| "grad_norm": 0.30377620458602905, | |
| "learning_rate": 0.0005576834742057709, | |
| "loss": 3.6427, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 3.5683076026798717, | |
| "grad_norm": 0.343955397605896, | |
| "learning_rate": 0.0005575085980763625, | |
| "loss": 3.6349, | |
| "step": 12250 | |
| }, | |
| { | |
| "epoch": 3.5828721235071366, | |
| "grad_norm": 0.3224445879459381, | |
| "learning_rate": 0.0005573337219469542, | |
| "loss": 3.6349, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 3.5974366443344015, | |
| "grad_norm": 0.3116309344768524, | |
| "learning_rate": 0.0005571588458175459, | |
| "loss": 3.6422, | |
| "step": 12350 | |
| }, | |
| { | |
| "epoch": 3.612001165161666, | |
| "grad_norm": 0.31105419993400574, | |
| "learning_rate": 0.0005569839696881374, | |
| "loss": 3.6441, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 3.626565685988931, | |
| "grad_norm": 0.3085189461708069, | |
| "learning_rate": 0.0005568090935587292, | |
| "loss": 3.6405, | |
| "step": 12450 | |
| }, | |
| { | |
| "epoch": 3.6411302068161957, | |
| "grad_norm": 0.29982301592826843, | |
| "learning_rate": 0.0005566342174293208, | |
| "loss": 3.6503, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 3.6556947276434606, | |
| "grad_norm": 0.31388601660728455, | |
| "learning_rate": 0.0005564593412999125, | |
| "loss": 3.6528, | |
| "step": 12550 | |
| }, | |
| { | |
| "epoch": 3.6702592484707255, | |
| "grad_norm": 0.305575430393219, | |
| "learning_rate": 0.0005562844651705042, | |
| "loss": 3.6421, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 3.68482376929799, | |
| "grad_norm": 0.32143914699554443, | |
| "learning_rate": 0.0005561095890410958, | |
| "loss": 3.646, | |
| "step": 12650 | |
| }, | |
| { | |
| "epoch": 3.699388290125255, | |
| "grad_norm": 0.3150655925273895, | |
| "learning_rate": 0.0005559347129116875, | |
| "loss": 3.637, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 3.7139528109525197, | |
| "grad_norm": 0.31396007537841797, | |
| "learning_rate": 0.0005557598367822792, | |
| "loss": 3.647, | |
| "step": 12750 | |
| }, | |
| { | |
| "epoch": 3.728517331779784, | |
| "grad_norm": 0.3067757189273834, | |
| "learning_rate": 0.0005555849606528709, | |
| "loss": 3.6471, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 3.743081852607049, | |
| "grad_norm": 0.313796728849411, | |
| "learning_rate": 0.0005554100845234624, | |
| "loss": 3.6308, | |
| "step": 12850 | |
| }, | |
| { | |
| "epoch": 3.757646373434314, | |
| "grad_norm": 0.3006753921508789, | |
| "learning_rate": 0.0005552352083940541, | |
| "loss": 3.649, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 3.772210894261579, | |
| "grad_norm": 0.3247009217739105, | |
| "learning_rate": 0.0005550603322646458, | |
| "loss": 3.633, | |
| "step": 12950 | |
| }, | |
| { | |
| "epoch": 3.7867754150888437, | |
| "grad_norm": 0.30989664793014526, | |
| "learning_rate": 0.0005548854561352375, | |
| "loss": 3.6449, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 3.7867754150888437, | |
| "eval_accuracy": 0.3550483792006422, | |
| "eval_loss": 3.677523612976074, | |
| "eval_runtime": 176.8362, | |
| "eval_samples_per_second": 94.104, | |
| "eval_steps_per_second": 5.887, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 3.8013399359161086, | |
| "grad_norm": 0.31539955735206604, | |
| "learning_rate": 0.0005547105800058292, | |
| "loss": 3.6357, | |
| "step": 13050 | |
| }, | |
| { | |
| "epoch": 3.815904456743373, | |
| "grad_norm": 0.2919875979423523, | |
| "learning_rate": 0.0005545357038764208, | |
| "loss": 3.6256, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 3.830468977570638, | |
| "grad_norm": 0.3210577070713043, | |
| "learning_rate": 0.0005543608277470125, | |
| "loss": 3.636, | |
| "step": 13150 | |
| }, | |
| { | |
| "epoch": 3.845033498397903, | |
| "grad_norm": 0.3022839426994324, | |
| "learning_rate": 0.0005541859516176042, | |
| "loss": 3.6284, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 3.8595980192251673, | |
| "grad_norm": 0.315828800201416, | |
| "learning_rate": 0.0005540110754881958, | |
| "loss": 3.6304, | |
| "step": 13250 | |
| }, | |
| { | |
| "epoch": 3.874162540052432, | |
| "grad_norm": 0.30515676736831665, | |
| "learning_rate": 0.0005538361993587874, | |
| "loss": 3.6442, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 3.888727060879697, | |
| "grad_norm": 0.3042634427547455, | |
| "learning_rate": 0.0005536613232293791, | |
| "loss": 3.6366, | |
| "step": 13350 | |
| }, | |
| { | |
| "epoch": 3.903291581706962, | |
| "grad_norm": 0.3206663727760315, | |
| "learning_rate": 0.0005534864470999708, | |
| "loss": 3.6317, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 3.917856102534227, | |
| "grad_norm": 0.3056301772594452, | |
| "learning_rate": 0.0005533115709705625, | |
| "loss": 3.6391, | |
| "step": 13450 | |
| }, | |
| { | |
| "epoch": 3.9324206233614913, | |
| "grad_norm": 0.3067208230495453, | |
| "learning_rate": 0.0005531366948411541, | |
| "loss": 3.6312, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 3.9469851441887562, | |
| "grad_norm": 0.3234356641769409, | |
| "learning_rate": 0.0005529618187117458, | |
| "loss": 3.6221, | |
| "step": 13550 | |
| }, | |
| { | |
| "epoch": 3.961549665016021, | |
| "grad_norm": 0.31787794828414917, | |
| "learning_rate": 0.0005527869425823375, | |
| "loss": 3.6323, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 3.9761141858432856, | |
| "grad_norm": 0.2994586229324341, | |
| "learning_rate": 0.0005526120664529292, | |
| "loss": 3.6276, | |
| "step": 13650 | |
| }, | |
| { | |
| "epoch": 3.9906787066705505, | |
| "grad_norm": 0.31581565737724304, | |
| "learning_rate": 0.0005524371903235207, | |
| "loss": 3.6349, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 4.005243227497815, | |
| "grad_norm": 0.32180607318878174, | |
| "learning_rate": 0.0005522623141941124, | |
| "loss": 3.5938, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 4.01980774832508, | |
| "grad_norm": 0.32863345742225647, | |
| "learning_rate": 0.0005520874380647041, | |
| "loss": 3.522, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 4.034372269152345, | |
| "grad_norm": 0.3115909695625305, | |
| "learning_rate": 0.0005519125619352957, | |
| "loss": 3.5133, | |
| "step": 13850 | |
| }, | |
| { | |
| "epoch": 4.04893678997961, | |
| "grad_norm": 0.3063027858734131, | |
| "learning_rate": 0.0005517376858058875, | |
| "loss": 3.5327, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 4.063501310806874, | |
| "grad_norm": 0.3200002908706665, | |
| "learning_rate": 0.0005515628096764791, | |
| "loss": 3.5327, | |
| "step": 13950 | |
| }, | |
| { | |
| "epoch": 4.078065831634139, | |
| "grad_norm": 0.32157978415489197, | |
| "learning_rate": 0.0005513879335470708, | |
| "loss": 3.5368, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 4.078065831634139, | |
| "eval_accuracy": 0.356187785549825, | |
| "eval_loss": 3.672633171081543, | |
| "eval_runtime": 179.4019, | |
| "eval_samples_per_second": 92.758, | |
| "eval_steps_per_second": 5.803, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 4.092630352461404, | |
| "grad_norm": 0.3131697475910187, | |
| "learning_rate": 0.0005512130574176625, | |
| "loss": 3.5373, | |
| "step": 14050 | |
| }, | |
| { | |
| "epoch": 4.107194873288669, | |
| "grad_norm": 0.3397645652294159, | |
| "learning_rate": 0.000551038181288254, | |
| "loss": 3.5416, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 4.121759394115934, | |
| "grad_norm": 0.31081974506378174, | |
| "learning_rate": 0.0005508633051588457, | |
| "loss": 3.5646, | |
| "step": 14150 | |
| }, | |
| { | |
| "epoch": 4.1363239149431985, | |
| "grad_norm": 0.3408530354499817, | |
| "learning_rate": 0.0005506884290294374, | |
| "loss": 3.5485, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 4.150888435770463, | |
| "grad_norm": 0.3124403655529022, | |
| "learning_rate": 0.0005505135529000291, | |
| "loss": 3.5518, | |
| "step": 14250 | |
| }, | |
| { | |
| "epoch": 4.165452956597728, | |
| "grad_norm": 0.3036765456199646, | |
| "learning_rate": 0.0005503386767706207, | |
| "loss": 3.5706, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 4.180017477424992, | |
| "grad_norm": 0.3298614025115967, | |
| "learning_rate": 0.0005501638006412124, | |
| "loss": 3.5419, | |
| "step": 14350 | |
| }, | |
| { | |
| "epoch": 4.194581998252257, | |
| "grad_norm": 0.3148026764392853, | |
| "learning_rate": 0.0005499889245118041, | |
| "loss": 3.5606, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 4.209146519079522, | |
| "grad_norm": 0.3329651653766632, | |
| "learning_rate": 0.0005498140483823958, | |
| "loss": 3.5666, | |
| "step": 14450 | |
| }, | |
| { | |
| "epoch": 4.223711039906787, | |
| "grad_norm": 0.3147095739841461, | |
| "learning_rate": 0.0005496391722529875, | |
| "loss": 3.5674, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 4.238275560734052, | |
| "grad_norm": 0.319242388010025, | |
| "learning_rate": 0.000549464296123579, | |
| "loss": 3.5579, | |
| "step": 14550 | |
| }, | |
| { | |
| "epoch": 4.252840081561317, | |
| "grad_norm": 0.3104586601257324, | |
| "learning_rate": 0.0005492894199941707, | |
| "loss": 3.5685, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 4.267404602388582, | |
| "grad_norm": 0.32091012597084045, | |
| "learning_rate": 0.0005491145438647624, | |
| "loss": 3.5587, | |
| "step": 14650 | |
| }, | |
| { | |
| "epoch": 4.2819691232158466, | |
| "grad_norm": 0.3283713161945343, | |
| "learning_rate": 0.000548939667735354, | |
| "loss": 3.5648, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 4.2965336440431106, | |
| "grad_norm": 0.3096083700656891, | |
| "learning_rate": 0.0005487647916059457, | |
| "loss": 3.5531, | |
| "step": 14750 | |
| }, | |
| { | |
| "epoch": 4.3110981648703754, | |
| "grad_norm": 0.3022560775279999, | |
| "learning_rate": 0.0005485899154765374, | |
| "loss": 3.5714, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 4.32566268569764, | |
| "grad_norm": 0.30840808153152466, | |
| "learning_rate": 0.0005484150393471291, | |
| "loss": 3.5691, | |
| "step": 14850 | |
| }, | |
| { | |
| "epoch": 4.340227206524905, | |
| "grad_norm": 0.3090154528617859, | |
| "learning_rate": 0.0005482401632177208, | |
| "loss": 3.5772, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 4.35479172735217, | |
| "grad_norm": 0.3259793221950531, | |
| "learning_rate": 0.0005480652870883124, | |
| "loss": 3.5707, | |
| "step": 14950 | |
| }, | |
| { | |
| "epoch": 4.369356248179435, | |
| "grad_norm": 0.29390519857406616, | |
| "learning_rate": 0.000547890410958904, | |
| "loss": 3.5699, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 4.369356248179435, | |
| "eval_accuracy": 0.3573489475161612, | |
| "eval_loss": 3.6574037075042725, | |
| "eval_runtime": 177.1658, | |
| "eval_samples_per_second": 93.929, | |
| "eval_steps_per_second": 5.876, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 4.3839207690067, | |
| "grad_norm": 0.3094480037689209, | |
| "learning_rate": 0.0005477155348294957, | |
| "loss": 3.563, | |
| "step": 15050 | |
| }, | |
| { | |
| "epoch": 4.398485289833965, | |
| "grad_norm": 0.31205040216445923, | |
| "learning_rate": 0.0005475406587000874, | |
| "loss": 3.5634, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 4.41304981066123, | |
| "grad_norm": 0.31506818532943726, | |
| "learning_rate": 0.000547365782570679, | |
| "loss": 3.5656, | |
| "step": 15150 | |
| }, | |
| { | |
| "epoch": 4.427614331488494, | |
| "grad_norm": 0.32489773631095886, | |
| "learning_rate": 0.0005471909064412707, | |
| "loss": 3.5617, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 4.442178852315759, | |
| "grad_norm": 0.30814430117607117, | |
| "learning_rate": 0.0005470160303118624, | |
| "loss": 3.5657, | |
| "step": 15250 | |
| }, | |
| { | |
| "epoch": 4.4567433731430235, | |
| "grad_norm": 0.32689669728279114, | |
| "learning_rate": 0.000546841154182454, | |
| "loss": 3.5538, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 4.471307893970288, | |
| "grad_norm": 0.29945799708366394, | |
| "learning_rate": 0.0005466662780530458, | |
| "loss": 3.5746, | |
| "step": 15350 | |
| }, | |
| { | |
| "epoch": 4.485872414797553, | |
| "grad_norm": 0.30545687675476074, | |
| "learning_rate": 0.0005464914019236374, | |
| "loss": 3.5712, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 4.500436935624818, | |
| "grad_norm": 0.3214097023010254, | |
| "learning_rate": 0.000546316525794229, | |
| "loss": 3.572, | |
| "step": 15450 | |
| }, | |
| { | |
| "epoch": 4.515001456452083, | |
| "grad_norm": 0.31878820061683655, | |
| "learning_rate": 0.0005461416496648207, | |
| "loss": 3.5719, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 4.529565977279347, | |
| "grad_norm": 0.30244311690330505, | |
| "learning_rate": 0.0005459667735354123, | |
| "loss": 3.5655, | |
| "step": 15550 | |
| }, | |
| { | |
| "epoch": 4.544130498106612, | |
| "grad_norm": 0.3169194161891937, | |
| "learning_rate": 0.000545791897406004, | |
| "loss": 3.5598, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 4.558695018933877, | |
| "grad_norm": 0.30807802081108093, | |
| "learning_rate": 0.0005456170212765957, | |
| "loss": 3.5783, | |
| "step": 15650 | |
| }, | |
| { | |
| "epoch": 4.573259539761142, | |
| "grad_norm": 0.32763341069221497, | |
| "learning_rate": 0.0005454421451471874, | |
| "loss": 3.5838, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 4.587824060588407, | |
| "grad_norm": 0.3086375892162323, | |
| "learning_rate": 0.000545267269017779, | |
| "loss": 3.5667, | |
| "step": 15750 | |
| }, | |
| { | |
| "epoch": 4.6023885814156715, | |
| "grad_norm": 0.3195725977420807, | |
| "learning_rate": 0.0005450923928883708, | |
| "loss": 3.5764, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 4.616953102242936, | |
| "grad_norm": 0.3060102164745331, | |
| "learning_rate": 0.0005449175167589623, | |
| "loss": 3.5688, | |
| "step": 15850 | |
| }, | |
| { | |
| "epoch": 4.631517623070201, | |
| "grad_norm": 0.317550390958786, | |
| "learning_rate": 0.000544742640629554, | |
| "loss": 3.5628, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 4.646082143897466, | |
| "grad_norm": 0.3243100047111511, | |
| "learning_rate": 0.0005445677645001457, | |
| "loss": 3.5821, | |
| "step": 15950 | |
| }, | |
| { | |
| "epoch": 4.66064666472473, | |
| "grad_norm": 0.34374845027923584, | |
| "learning_rate": 0.0005443928883707373, | |
| "loss": 3.5808, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 4.66064666472473, | |
| "eval_accuracy": 0.35801655096794266, | |
| "eval_loss": 3.6481385231018066, | |
| "eval_runtime": 178.0131, | |
| "eval_samples_per_second": 93.482, | |
| "eval_steps_per_second": 5.848, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 4.675211185551995, | |
| "grad_norm": 0.3434811532497406, | |
| "learning_rate": 0.000544218012241329, | |
| "loss": 3.5719, | |
| "step": 16050 | |
| }, | |
| { | |
| "epoch": 4.68977570637926, | |
| "grad_norm": 0.3100740313529968, | |
| "learning_rate": 0.0005440431361119207, | |
| "loss": 3.5787, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 4.704340227206525, | |
| "grad_norm": 0.3187655508518219, | |
| "learning_rate": 0.0005438682599825123, | |
| "loss": 3.5632, | |
| "step": 16150 | |
| }, | |
| { | |
| "epoch": 4.71890474803379, | |
| "grad_norm": 0.3257180452346802, | |
| "learning_rate": 0.000543693383853104, | |
| "loss": 3.5716, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 4.733469268861055, | |
| "grad_norm": 0.32113736867904663, | |
| "learning_rate": 0.0005435185077236957, | |
| "loss": 3.5782, | |
| "step": 16250 | |
| }, | |
| { | |
| "epoch": 4.74803378968832, | |
| "grad_norm": 0.31867560744285583, | |
| "learning_rate": 0.0005433436315942873, | |
| "loss": 3.5806, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 4.762598310515584, | |
| "grad_norm": 0.32009127736091614, | |
| "learning_rate": 0.000543168755464879, | |
| "loss": 3.5803, | |
| "step": 16350 | |
| }, | |
| { | |
| "epoch": 4.7771628313428485, | |
| "grad_norm": 0.31877410411834717, | |
| "learning_rate": 0.0005429938793354706, | |
| "loss": 3.5814, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 4.791727352170113, | |
| "grad_norm": 0.3448466360569, | |
| "learning_rate": 0.0005428190032060623, | |
| "loss": 3.5634, | |
| "step": 16450 | |
| }, | |
| { | |
| "epoch": 4.806291872997378, | |
| "grad_norm": 0.3082084059715271, | |
| "learning_rate": 0.000542644127076654, | |
| "loss": 3.5878, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 4.820856393824643, | |
| "grad_norm": 0.33161383867263794, | |
| "learning_rate": 0.0005424692509472457, | |
| "loss": 3.5829, | |
| "step": 16550 | |
| }, | |
| { | |
| "epoch": 4.835420914651908, | |
| "grad_norm": 0.3231264650821686, | |
| "learning_rate": 0.0005422943748178373, | |
| "loss": 3.5867, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 4.849985435479173, | |
| "grad_norm": 0.2988690137863159, | |
| "learning_rate": 0.000542119498688429, | |
| "loss": 3.5869, | |
| "step": 16650 | |
| }, | |
| { | |
| "epoch": 4.864549956306438, | |
| "grad_norm": 0.31382545828819275, | |
| "learning_rate": 0.0005419446225590207, | |
| "loss": 3.5744, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 4.879114477133703, | |
| "grad_norm": 0.3041090965270996, | |
| "learning_rate": 0.0005417697464296122, | |
| "loss": 3.5798, | |
| "step": 16750 | |
| }, | |
| { | |
| "epoch": 4.893678997960967, | |
| "grad_norm": 0.32487449049949646, | |
| "learning_rate": 0.000541594870300204, | |
| "loss": 3.5786, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 4.908243518788232, | |
| "grad_norm": 0.3080225884914398, | |
| "learning_rate": 0.0005414199941707956, | |
| "loss": 3.5735, | |
| "step": 16850 | |
| }, | |
| { | |
| "epoch": 4.9228080396154965, | |
| "grad_norm": 0.3145715594291687, | |
| "learning_rate": 0.0005412451180413873, | |
| "loss": 3.5699, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 4.937372560442761, | |
| "grad_norm": 0.29266849160194397, | |
| "learning_rate": 0.000541070241911979, | |
| "loss": 3.5733, | |
| "step": 16950 | |
| }, | |
| { | |
| "epoch": 4.951937081270026, | |
| "grad_norm": 0.3222334086894989, | |
| "learning_rate": 0.0005408953657825706, | |
| "loss": 3.5669, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 4.951937081270026, | |
| "eval_accuracy": 0.35965457254269423, | |
| "eval_loss": 3.634514808654785, | |
| "eval_runtime": 178.122, | |
| "eval_samples_per_second": 93.425, | |
| "eval_steps_per_second": 5.844, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 4.966501602097291, | |
| "grad_norm": 0.3042418360710144, | |
| "learning_rate": 0.0005407204896531623, | |
| "loss": 3.589, | |
| "step": 17050 | |
| }, | |
| { | |
| "epoch": 4.981066122924556, | |
| "grad_norm": 0.30115336179733276, | |
| "learning_rate": 0.000540545613523754, | |
| "loss": 3.5776, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 4.99563064375182, | |
| "grad_norm": 0.3397320508956909, | |
| "learning_rate": 0.0005403707373943456, | |
| "loss": 3.5842, | |
| "step": 17150 | |
| }, | |
| { | |
| "epoch": 5.010195164579085, | |
| "grad_norm": 0.3029438555240631, | |
| "learning_rate": 0.0005401958612649372, | |
| "loss": 3.51, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 5.02475968540635, | |
| "grad_norm": 0.31321457028388977, | |
| "learning_rate": 0.000540020985135529, | |
| "loss": 3.4687, | |
| "step": 17250 | |
| }, | |
| { | |
| "epoch": 5.039324206233615, | |
| "grad_norm": 0.31178945302963257, | |
| "learning_rate": 0.0005398461090061206, | |
| "loss": 3.4781, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 5.05388872706088, | |
| "grad_norm": 0.34495481848716736, | |
| "learning_rate": 0.0005396712328767123, | |
| "loss": 3.4682, | |
| "step": 17350 | |
| }, | |
| { | |
| "epoch": 5.0684532478881446, | |
| "grad_norm": 0.30439019203186035, | |
| "learning_rate": 0.000539496356747304, | |
| "loss": 3.4708, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 5.0830177687154094, | |
| "grad_norm": 0.33205610513687134, | |
| "learning_rate": 0.0005393214806178956, | |
| "loss": 3.4734, | |
| "step": 17450 | |
| }, | |
| { | |
| "epoch": 5.097582289542674, | |
| "grad_norm": 0.32609128952026367, | |
| "learning_rate": 0.0005391466044884873, | |
| "loss": 3.4818, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 5.112146810369939, | |
| "grad_norm": 0.31185203790664673, | |
| "learning_rate": 0.000538971728359079, | |
| "loss": 3.4836, | |
| "step": 17550 | |
| }, | |
| { | |
| "epoch": 5.126711331197203, | |
| "grad_norm": 0.32222872972488403, | |
| "learning_rate": 0.0005387968522296705, | |
| "loss": 3.4734, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 5.141275852024468, | |
| "grad_norm": 0.3166067898273468, | |
| "learning_rate": 0.0005386219761002622, | |
| "loss": 3.4808, | |
| "step": 17650 | |
| }, | |
| { | |
| "epoch": 5.155840372851733, | |
| "grad_norm": 0.3085167109966278, | |
| "learning_rate": 0.0005384470999708539, | |
| "loss": 3.4861, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 5.170404893678998, | |
| "grad_norm": 0.3201219439506531, | |
| "learning_rate": 0.0005382722238414456, | |
| "loss": 3.5014, | |
| "step": 17750 | |
| }, | |
| { | |
| "epoch": 5.184969414506263, | |
| "grad_norm": 0.3096485137939453, | |
| "learning_rate": 0.0005380973477120373, | |
| "loss": 3.5066, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 5.199533935333528, | |
| "grad_norm": 0.3200867474079132, | |
| "learning_rate": 0.000537922471582629, | |
| "loss": 3.4955, | |
| "step": 17850 | |
| }, | |
| { | |
| "epoch": 5.214098456160793, | |
| "grad_norm": 0.31873369216918945, | |
| "learning_rate": 0.0005377475954532206, | |
| "loss": 3.4958, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 5.2286629769880575, | |
| "grad_norm": 0.3236185312271118, | |
| "learning_rate": 0.0005375727193238123, | |
| "loss": 3.5043, | |
| "step": 17950 | |
| }, | |
| { | |
| "epoch": 5.2432274978153215, | |
| "grad_norm": 0.31578707695007324, | |
| "learning_rate": 0.000537397843194404, | |
| "loss": 3.4856, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 5.2432274978153215, | |
| "eval_accuracy": 0.3600538175169409, | |
| "eval_loss": 3.636204481124878, | |
| "eval_runtime": 178.2632, | |
| "eval_samples_per_second": 93.351, | |
| "eval_steps_per_second": 5.84, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 5.257792018642586, | |
| "grad_norm": 0.3020099997520447, | |
| "learning_rate": 0.0005372229670649955, | |
| "loss": 3.5085, | |
| "step": 18050 | |
| }, | |
| { | |
| "epoch": 5.272356539469851, | |
| "grad_norm": 0.33753785490989685, | |
| "learning_rate": 0.0005370480909355872, | |
| "loss": 3.5011, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 5.286921060297116, | |
| "grad_norm": 0.3076134920120239, | |
| "learning_rate": 0.0005368732148061789, | |
| "loss": 3.5104, | |
| "step": 18150 | |
| }, | |
| { | |
| "epoch": 5.301485581124381, | |
| "grad_norm": 0.3213876485824585, | |
| "learning_rate": 0.0005366983386767705, | |
| "loss": 3.5087, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 5.316050101951646, | |
| "grad_norm": 0.30576950311660767, | |
| "learning_rate": 0.0005365234625473623, | |
| "loss": 3.5067, | |
| "step": 18250 | |
| }, | |
| { | |
| "epoch": 5.330614622778911, | |
| "grad_norm": 0.3095839023590088, | |
| "learning_rate": 0.0005363485864179539, | |
| "loss": 3.5164, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 5.345179143606176, | |
| "grad_norm": 0.31150195002555847, | |
| "learning_rate": 0.0005361737102885456, | |
| "loss": 3.506, | |
| "step": 18350 | |
| }, | |
| { | |
| "epoch": 5.35974366443344, | |
| "grad_norm": 0.3282429575920105, | |
| "learning_rate": 0.0005359988341591373, | |
| "loss": 3.5064, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 5.374308185260705, | |
| "grad_norm": 0.32314422726631165, | |
| "learning_rate": 0.000535823958029729, | |
| "loss": 3.5114, | |
| "step": 18450 | |
| }, | |
| { | |
| "epoch": 5.3888727060879695, | |
| "grad_norm": 0.3226771652698517, | |
| "learning_rate": 0.0005356490819003205, | |
| "loss": 3.515, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 5.403437226915234, | |
| "grad_norm": 0.30987077951431274, | |
| "learning_rate": 0.0005354742057709122, | |
| "loss": 3.5127, | |
| "step": 18550 | |
| }, | |
| { | |
| "epoch": 5.418001747742499, | |
| "grad_norm": 0.32695272564888, | |
| "learning_rate": 0.0005352993296415039, | |
| "loss": 3.5208, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 5.432566268569764, | |
| "grad_norm": 0.32112646102905273, | |
| "learning_rate": 0.0005351244535120955, | |
| "loss": 3.5216, | |
| "step": 18650 | |
| }, | |
| { | |
| "epoch": 5.447130789397029, | |
| "grad_norm": 0.32543012499809265, | |
| "learning_rate": 0.0005349495773826873, | |
| "loss": 3.5198, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 5.461695310224294, | |
| "grad_norm": 0.31456059217453003, | |
| "learning_rate": 0.0005347747012532789, | |
| "loss": 3.5269, | |
| "step": 18750 | |
| }, | |
| { | |
| "epoch": 5.476259831051558, | |
| "grad_norm": 0.2968471050262451, | |
| "learning_rate": 0.0005345998251238706, | |
| "loss": 3.5041, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 5.490824351878823, | |
| "grad_norm": 0.2999701499938965, | |
| "learning_rate": 0.0005344249489944623, | |
| "loss": 3.5057, | |
| "step": 18850 | |
| }, | |
| { | |
| "epoch": 5.505388872706088, | |
| "grad_norm": 0.31823596358299255, | |
| "learning_rate": 0.0005342500728650538, | |
| "loss": 3.5208, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 5.519953393533353, | |
| "grad_norm": 0.3265931010246277, | |
| "learning_rate": 0.0005340751967356455, | |
| "loss": 3.5322, | |
| "step": 18950 | |
| }, | |
| { | |
| "epoch": 5.534517914360618, | |
| "grad_norm": 0.3369291424751282, | |
| "learning_rate": 0.0005339003206062372, | |
| "loss": 3.5372, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 5.534517914360618, | |
| "eval_accuracy": 0.3610927952334266, | |
| "eval_loss": 3.6238300800323486, | |
| "eval_runtime": 178.1149, | |
| "eval_samples_per_second": 93.428, | |
| "eval_steps_per_second": 5.845, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 5.5490824351878825, | |
| "grad_norm": 0.3080558776855469, | |
| "learning_rate": 0.0005337254444768288, | |
| "loss": 3.5148, | |
| "step": 19050 | |
| }, | |
| { | |
| "epoch": 5.563646956015147, | |
| "grad_norm": 0.3292733132839203, | |
| "learning_rate": 0.0005335505683474205, | |
| "loss": 3.5206, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 5.578211476842412, | |
| "grad_norm": 0.2844310700893402, | |
| "learning_rate": 0.0005333756922180122, | |
| "loss": 3.5198, | |
| "step": 19150 | |
| }, | |
| { | |
| "epoch": 5.592775997669676, | |
| "grad_norm": 0.32660531997680664, | |
| "learning_rate": 0.0005332008160886039, | |
| "loss": 3.5196, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 5.607340518496941, | |
| "grad_norm": 0.3126123249530792, | |
| "learning_rate": 0.0005330259399591956, | |
| "loss": 3.5265, | |
| "step": 19250 | |
| }, | |
| { | |
| "epoch": 5.621905039324206, | |
| "grad_norm": 0.32773616909980774, | |
| "learning_rate": 0.0005328510638297873, | |
| "loss": 3.5149, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 5.636469560151471, | |
| "grad_norm": 0.33990636467933655, | |
| "learning_rate": 0.0005326761877003788, | |
| "loss": 3.5284, | |
| "step": 19350 | |
| }, | |
| { | |
| "epoch": 5.651034080978736, | |
| "grad_norm": 0.310086190700531, | |
| "learning_rate": 0.0005325013115709705, | |
| "loss": 3.5261, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 5.665598601806001, | |
| "grad_norm": 0.3189842402935028, | |
| "learning_rate": 0.0005323264354415622, | |
| "loss": 3.5141, | |
| "step": 19450 | |
| }, | |
| { | |
| "epoch": 5.680163122633266, | |
| "grad_norm": 0.3437648117542267, | |
| "learning_rate": 0.0005321515593121538, | |
| "loss": 3.5307, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 5.6947276434605305, | |
| "grad_norm": 0.3275405168533325, | |
| "learning_rate": 0.0005319766831827455, | |
| "loss": 3.5203, | |
| "step": 19550 | |
| }, | |
| { | |
| "epoch": 5.709292164287795, | |
| "grad_norm": 0.33150243759155273, | |
| "learning_rate": 0.0005318018070533372, | |
| "loss": 3.5261, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 5.723856685115059, | |
| "grad_norm": 0.31431785225868225, | |
| "learning_rate": 0.0005316269309239288, | |
| "loss": 3.5317, | |
| "step": 19650 | |
| }, | |
| { | |
| "epoch": 5.738421205942324, | |
| "grad_norm": 0.310982882976532, | |
| "learning_rate": 0.0005314520547945206, | |
| "loss": 3.5258, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 5.752985726769589, | |
| "grad_norm": 0.3376849591732025, | |
| "learning_rate": 0.0005312771786651121, | |
| "loss": 3.5293, | |
| "step": 19750 | |
| }, | |
| { | |
| "epoch": 5.767550247596854, | |
| "grad_norm": 0.3104221522808075, | |
| "learning_rate": 0.0005311023025357038, | |
| "loss": 3.5352, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 5.782114768424119, | |
| "grad_norm": 0.3133756220340729, | |
| "learning_rate": 0.0005309274264062955, | |
| "loss": 3.5305, | |
| "step": 19850 | |
| }, | |
| { | |
| "epoch": 5.796679289251384, | |
| "grad_norm": 0.3189453184604645, | |
| "learning_rate": 0.0005307525502768872, | |
| "loss": 3.5303, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 5.811243810078649, | |
| "grad_norm": 0.3235698342323303, | |
| "learning_rate": 0.0005305776741474788, | |
| "loss": 3.5243, | |
| "step": 19950 | |
| }, | |
| { | |
| "epoch": 5.825808330905913, | |
| "grad_norm": 0.33478856086730957, | |
| "learning_rate": 0.0005304027980180705, | |
| "loss": 3.5283, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 5.825808330905913, | |
| "eval_accuracy": 0.36207003403636906, | |
| "eval_loss": 3.6125800609588623, | |
| "eval_runtime": 177.938, | |
| "eval_samples_per_second": 93.521, | |
| "eval_steps_per_second": 5.85, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 5.840372851733178, | |
| "grad_norm": 0.3260256052017212, | |
| "learning_rate": 0.0005302279218886622, | |
| "loss": 3.5366, | |
| "step": 20050 | |
| }, | |
| { | |
| "epoch": 5.8549373725604426, | |
| "grad_norm": 0.32015252113342285, | |
| "learning_rate": 0.0005300530457592538, | |
| "loss": 3.5379, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 5.8695018933877074, | |
| "grad_norm": 0.3291431665420532, | |
| "learning_rate": 0.0005298781696298456, | |
| "loss": 3.538, | |
| "step": 20150 | |
| }, | |
| { | |
| "epoch": 5.884066414214972, | |
| "grad_norm": 0.30976444482803345, | |
| "learning_rate": 0.0005297032935004371, | |
| "loss": 3.5296, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 5.898630935042237, | |
| "grad_norm": 0.315712034702301, | |
| "learning_rate": 0.0005295284173710288, | |
| "loss": 3.5213, | |
| "step": 20250 | |
| }, | |
| { | |
| "epoch": 5.913195455869502, | |
| "grad_norm": 0.32749900221824646, | |
| "learning_rate": 0.0005293535412416205, | |
| "loss": 3.5263, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 5.927759976696767, | |
| "grad_norm": 0.30342087149620056, | |
| "learning_rate": 0.0005291786651122121, | |
| "loss": 3.5256, | |
| "step": 20350 | |
| }, | |
| { | |
| "epoch": 5.942324497524032, | |
| "grad_norm": 0.2914330065250397, | |
| "learning_rate": 0.0005290037889828038, | |
| "loss": 3.5342, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 5.956889018351296, | |
| "grad_norm": 0.32604503631591797, | |
| "learning_rate": 0.0005288289128533955, | |
| "loss": 3.545, | |
| "step": 20450 | |
| }, | |
| { | |
| "epoch": 5.971453539178561, | |
| "grad_norm": 0.36326587200164795, | |
| "learning_rate": 0.0005286540367239872, | |
| "loss": 3.5389, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 5.986018060005826, | |
| "grad_norm": 0.31661051511764526, | |
| "learning_rate": 0.0005284791605945788, | |
| "loss": 3.529, | |
| "step": 20550 | |
| }, | |
| { | |
| "epoch": 6.000582580833091, | |
| "grad_norm": 0.30371156334877014, | |
| "learning_rate": 0.0005283042844651704, | |
| "loss": 3.5235, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 6.0151471016603555, | |
| "grad_norm": 0.31312742829322815, | |
| "learning_rate": 0.0005281294083357621, | |
| "loss": 3.4146, | |
| "step": 20650 | |
| }, | |
| { | |
| "epoch": 6.02971162248762, | |
| "grad_norm": 0.31279024481773376, | |
| "learning_rate": 0.0005279545322063538, | |
| "loss": 3.419, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 6.044276143314885, | |
| "grad_norm": 0.30267471075057983, | |
| "learning_rate": 0.0005277796560769455, | |
| "loss": 3.4232, | |
| "step": 20750 | |
| }, | |
| { | |
| "epoch": 6.058840664142149, | |
| "grad_norm": 0.3136891722679138, | |
| "learning_rate": 0.0005276047799475371, | |
| "loss": 3.4228, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 6.073405184969414, | |
| "grad_norm": 0.3055378198623657, | |
| "learning_rate": 0.0005274299038181288, | |
| "loss": 3.4312, | |
| "step": 20850 | |
| }, | |
| { | |
| "epoch": 6.087969705796679, | |
| "grad_norm": 0.32942959666252136, | |
| "learning_rate": 0.0005272550276887205, | |
| "loss": 3.4293, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 6.102534226623944, | |
| "grad_norm": 0.34652891755104065, | |
| "learning_rate": 0.0005270801515593121, | |
| "loss": 3.431, | |
| "step": 20950 | |
| }, | |
| { | |
| "epoch": 6.117098747451209, | |
| "grad_norm": 0.3244026005268097, | |
| "learning_rate": 0.0005269052754299037, | |
| "loss": 3.4499, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 6.117098747451209, | |
| "eval_accuracy": 0.3623327478132371, | |
| "eval_loss": 3.6183032989501953, | |
| "eval_runtime": 177.6206, | |
| "eval_samples_per_second": 93.688, | |
| "eval_steps_per_second": 5.861, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 6.131663268278474, | |
| "grad_norm": 0.31809812784194946, | |
| "learning_rate": 0.0005267303993004954, | |
| "loss": 3.4428, | |
| "step": 21050 | |
| }, | |
| { | |
| "epoch": 6.146227789105739, | |
| "grad_norm": 0.3030714988708496, | |
| "learning_rate": 0.000526555523171087, | |
| "loss": 3.4458, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 6.1607923099330035, | |
| "grad_norm": 0.34124982357025146, | |
| "learning_rate": 0.0005263806470416788, | |
| "loss": 3.4431, | |
| "step": 21150 | |
| }, | |
| { | |
| "epoch": 6.175356830760268, | |
| "grad_norm": 0.31598350405693054, | |
| "learning_rate": 0.0005262057709122704, | |
| "loss": 3.4564, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 6.189921351587532, | |
| "grad_norm": 0.3193134069442749, | |
| "learning_rate": 0.0005260308947828621, | |
| "loss": 3.4537, | |
| "step": 21250 | |
| }, | |
| { | |
| "epoch": 6.204485872414797, | |
| "grad_norm": 0.3082662522792816, | |
| "learning_rate": 0.0005258560186534538, | |
| "loss": 3.4516, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 6.219050393242062, | |
| "grad_norm": 0.3147899806499481, | |
| "learning_rate": 0.0005256811425240455, | |
| "loss": 3.4563, | |
| "step": 21350 | |
| }, | |
| { | |
| "epoch": 6.233614914069327, | |
| "grad_norm": 0.3170771300792694, | |
| "learning_rate": 0.0005255062663946371, | |
| "loss": 3.449, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 6.248179434896592, | |
| "grad_norm": 0.7409493923187256, | |
| "learning_rate": 0.0005253313902652287, | |
| "loss": 3.4565, | |
| "step": 21450 | |
| }, | |
| { | |
| "epoch": 6.262743955723857, | |
| "grad_norm": 0.3383581340312958, | |
| "learning_rate": 0.0005251565141358204, | |
| "loss": 3.4602, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 6.277308476551122, | |
| "grad_norm": 0.32042765617370605, | |
| "learning_rate": 0.000524981638006412, | |
| "loss": 3.4521, | |
| "step": 21550 | |
| }, | |
| { | |
| "epoch": 6.291872997378386, | |
| "grad_norm": 0.32029616832733154, | |
| "learning_rate": 0.0005248067618770038, | |
| "loss": 3.4656, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 6.306437518205651, | |
| "grad_norm": 0.3129471242427826, | |
| "learning_rate": 0.0005246318857475954, | |
| "loss": 3.4757, | |
| "step": 21650 | |
| }, | |
| { | |
| "epoch": 6.321002039032916, | |
| "grad_norm": 0.325960636138916, | |
| "learning_rate": 0.0005244570096181871, | |
| "loss": 3.4633, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 6.3355665598601805, | |
| "grad_norm": 0.3158135712146759, | |
| "learning_rate": 0.0005242821334887788, | |
| "loss": 3.4638, | |
| "step": 21750 | |
| }, | |
| { | |
| "epoch": 6.350131080687445, | |
| "grad_norm": 0.3112996220588684, | |
| "learning_rate": 0.0005241072573593704, | |
| "loss": 3.4754, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 6.36469560151471, | |
| "grad_norm": 0.32430002093315125, | |
| "learning_rate": 0.000523932381229962, | |
| "loss": 3.4659, | |
| "step": 21850 | |
| }, | |
| { | |
| "epoch": 6.379260122341975, | |
| "grad_norm": 0.34102514386177063, | |
| "learning_rate": 0.0005237575051005537, | |
| "loss": 3.4633, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 6.39382464316924, | |
| "grad_norm": 0.3184153437614441, | |
| "learning_rate": 0.0005235826289711454, | |
| "loss": 3.4729, | |
| "step": 21950 | |
| }, | |
| { | |
| "epoch": 6.408389163996505, | |
| "grad_norm": 0.31477609276771545, | |
| "learning_rate": 0.000523407752841737, | |
| "loss": 3.478, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 6.408389163996505, | |
| "eval_accuracy": 0.3631589908733422, | |
| "eval_loss": 3.611259937286377, | |
| "eval_runtime": 177.9074, | |
| "eval_samples_per_second": 93.537, | |
| "eval_steps_per_second": 5.851, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 6.422953684823769, | |
| "grad_norm": 0.30401411652565, | |
| "learning_rate": 0.0005232328767123287, | |
| "loss": 3.4673, | |
| "step": 22050 | |
| }, | |
| { | |
| "epoch": 6.437518205651034, | |
| "grad_norm": 0.32664841413497925, | |
| "learning_rate": 0.0005230580005829204, | |
| "loss": 3.4699, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 6.452082726478299, | |
| "grad_norm": 0.30075621604919434, | |
| "learning_rate": 0.0005228831244535121, | |
| "loss": 3.4883, | |
| "step": 22150 | |
| }, | |
| { | |
| "epoch": 6.466647247305564, | |
| "grad_norm": 0.3116818070411682, | |
| "learning_rate": 0.0005227082483241038, | |
| "loss": 3.4699, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 6.4812117681328285, | |
| "grad_norm": 0.33584606647491455, | |
| "learning_rate": 0.0005225333721946954, | |
| "loss": 3.4799, | |
| "step": 22250 | |
| }, | |
| { | |
| "epoch": 6.495776288960093, | |
| "grad_norm": 0.3176494240760803, | |
| "learning_rate": 0.000522358496065287, | |
| "loss": 3.4829, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 6.510340809787358, | |
| "grad_norm": 0.3094027042388916, | |
| "learning_rate": 0.0005221836199358787, | |
| "loss": 3.4793, | |
| "step": 22350 | |
| }, | |
| { | |
| "epoch": 6.524905330614622, | |
| "grad_norm": 0.29500812292099, | |
| "learning_rate": 0.0005220087438064703, | |
| "loss": 3.4881, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 6.539469851441887, | |
| "grad_norm": 0.32507067918777466, | |
| "learning_rate": 0.000521833867677062, | |
| "loss": 3.4813, | |
| "step": 22450 | |
| }, | |
| { | |
| "epoch": 6.554034372269152, | |
| "grad_norm": 0.300907164812088, | |
| "learning_rate": 0.0005216589915476537, | |
| "loss": 3.4818, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 6.568598893096417, | |
| "grad_norm": 0.3277975618839264, | |
| "learning_rate": 0.0005214841154182454, | |
| "loss": 3.4774, | |
| "step": 22550 | |
| }, | |
| { | |
| "epoch": 6.583163413923682, | |
| "grad_norm": 0.32927611470222473, | |
| "learning_rate": 0.0005213092392888371, | |
| "loss": 3.4826, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 6.597727934750947, | |
| "grad_norm": 0.3341529369354248, | |
| "learning_rate": 0.0005211343631594287, | |
| "loss": 3.4821, | |
| "step": 22650 | |
| }, | |
| { | |
| "epoch": 6.612292455578212, | |
| "grad_norm": 0.323186457157135, | |
| "learning_rate": 0.0005209594870300204, | |
| "loss": 3.4759, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 6.6268569764054766, | |
| "grad_norm": 0.3139779269695282, | |
| "learning_rate": 0.000520784610900612, | |
| "loss": 3.484, | |
| "step": 22750 | |
| }, | |
| { | |
| "epoch": 6.6414214972327414, | |
| "grad_norm": 0.31565406918525696, | |
| "learning_rate": 0.0005206097347712037, | |
| "loss": 3.4833, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 6.6559860180600054, | |
| "grad_norm": 0.31253114342689514, | |
| "learning_rate": 0.0005204348586417953, | |
| "loss": 3.4874, | |
| "step": 22850 | |
| }, | |
| { | |
| "epoch": 6.67055053888727, | |
| "grad_norm": 0.3426834046840668, | |
| "learning_rate": 0.000520259982512387, | |
| "loss": 3.484, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 6.685115059714535, | |
| "grad_norm": 0.3148155212402344, | |
| "learning_rate": 0.0005200851063829787, | |
| "loss": 3.4855, | |
| "step": 22950 | |
| }, | |
| { | |
| "epoch": 6.6996795805418, | |
| "grad_norm": 0.33372002840042114, | |
| "learning_rate": 0.0005199102302535703, | |
| "loss": 3.4924, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 6.6996795805418, | |
| "eval_accuracy": 0.36406990444344955, | |
| "eval_loss": 3.59806489944458, | |
| "eval_runtime": 177.7268, | |
| "eval_samples_per_second": 93.632, | |
| "eval_steps_per_second": 5.857, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 6.714244101369065, | |
| "grad_norm": 0.3200724720954895, | |
| "learning_rate": 0.0005197353541241621, | |
| "loss": 3.4794, | |
| "step": 23050 | |
| }, | |
| { | |
| "epoch": 6.72880862219633, | |
| "grad_norm": 0.3293537199497223, | |
| "learning_rate": 0.0005195604779947537, | |
| "loss": 3.4753, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 6.743373143023595, | |
| "grad_norm": 0.32551905512809753, | |
| "learning_rate": 0.0005193856018653454, | |
| "loss": 3.4896, | |
| "step": 23150 | |
| }, | |
| { | |
| "epoch": 6.757937663850859, | |
| "grad_norm": 0.3181687593460083, | |
| "learning_rate": 0.000519210725735937, | |
| "loss": 3.4845, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 6.772502184678124, | |
| "grad_norm": 0.3090181350708008, | |
| "learning_rate": 0.0005190358496065286, | |
| "loss": 3.4876, | |
| "step": 23250 | |
| }, | |
| { | |
| "epoch": 6.787066705505389, | |
| "grad_norm": 0.32364892959594727, | |
| "learning_rate": 0.0005188609734771203, | |
| "loss": 3.4979, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 6.8016312263326535, | |
| "grad_norm": 0.3206116855144501, | |
| "learning_rate": 0.000518686097347712, | |
| "loss": 3.4882, | |
| "step": 23350 | |
| }, | |
| { | |
| "epoch": 6.816195747159918, | |
| "grad_norm": 0.3306562006473541, | |
| "learning_rate": 0.0005185112212183037, | |
| "loss": 3.4947, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 6.830760267987183, | |
| "grad_norm": 0.3269500136375427, | |
| "learning_rate": 0.0005183363450888953, | |
| "loss": 3.4892, | |
| "step": 23450 | |
| }, | |
| { | |
| "epoch": 6.845324788814448, | |
| "grad_norm": 0.3314773142337799, | |
| "learning_rate": 0.000518161468959487, | |
| "loss": 3.4941, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 6.859889309641713, | |
| "grad_norm": 0.3544946014881134, | |
| "learning_rate": 0.0005179865928300787, | |
| "loss": 3.4873, | |
| "step": 23550 | |
| }, | |
| { | |
| "epoch": 6.874453830468978, | |
| "grad_norm": 0.34047919511795044, | |
| "learning_rate": 0.0005178117167006703, | |
| "loss": 3.4965, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 6.889018351296242, | |
| "grad_norm": 0.3339918553829193, | |
| "learning_rate": 0.000517636840571262, | |
| "loss": 3.498, | |
| "step": 23650 | |
| }, | |
| { | |
| "epoch": 6.903582872123507, | |
| "grad_norm": 0.33018672466278076, | |
| "learning_rate": 0.0005174619644418536, | |
| "loss": 3.4936, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 6.918147392950772, | |
| "grad_norm": 0.3264847993850708, | |
| "learning_rate": 0.0005172870883124453, | |
| "loss": 3.507, | |
| "step": 23750 | |
| }, | |
| { | |
| "epoch": 6.932711913778037, | |
| "grad_norm": 0.3399565815925598, | |
| "learning_rate": 0.000517112212183037, | |
| "loss": 3.4907, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 6.9472764346053015, | |
| "grad_norm": 0.30828312039375305, | |
| "learning_rate": 0.0005169373360536286, | |
| "loss": 3.4914, | |
| "step": 23850 | |
| }, | |
| { | |
| "epoch": 6.961840955432566, | |
| "grad_norm": 0.3247275948524475, | |
| "learning_rate": 0.0005167624599242203, | |
| "loss": 3.493, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 6.976405476259831, | |
| "grad_norm": 0.3123205304145813, | |
| "learning_rate": 0.000516587583794812, | |
| "loss": 3.4967, | |
| "step": 23950 | |
| }, | |
| { | |
| "epoch": 6.990969997087096, | |
| "grad_norm": 0.32256489992141724, | |
| "learning_rate": 0.0005164127076654037, | |
| "loss": 3.4865, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 6.990969997087096, | |
| "eval_accuracy": 0.3646406072004507, | |
| "eval_loss": 3.5887346267700195, | |
| "eval_runtime": 177.4869, | |
| "eval_samples_per_second": 93.759, | |
| "eval_steps_per_second": 5.865, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 7.00553451791436, | |
| "grad_norm": 0.3418632447719574, | |
| "learning_rate": 0.0005162378315359953, | |
| "loss": 3.458, | |
| "step": 24050 | |
| }, | |
| { | |
| "epoch": 7.020099038741625, | |
| "grad_norm": 0.30545857548713684, | |
| "learning_rate": 0.0005160629554065869, | |
| "loss": 3.3774, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 7.03466355956889, | |
| "grad_norm": 0.30984604358673096, | |
| "learning_rate": 0.0005158880792771786, | |
| "loss": 3.3816, | |
| "step": 24150 | |
| }, | |
| { | |
| "epoch": 7.049228080396155, | |
| "grad_norm": 0.31869596242904663, | |
| "learning_rate": 0.0005157132031477703, | |
| "loss": 3.3935, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 7.06379260122342, | |
| "grad_norm": 0.34166282415390015, | |
| "learning_rate": 0.000515538327018362, | |
| "loss": 3.3896, | |
| "step": 24250 | |
| }, | |
| { | |
| "epoch": 7.078357122050685, | |
| "grad_norm": 0.3672468364238739, | |
| "learning_rate": 0.0005153634508889536, | |
| "loss": 3.3978, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 7.09292164287795, | |
| "grad_norm": 0.3174588978290558, | |
| "learning_rate": 0.0005151885747595453, | |
| "loss": 3.4059, | |
| "step": 24350 | |
| }, | |
| { | |
| "epoch": 7.1074861637052145, | |
| "grad_norm": 0.30413857102394104, | |
| "learning_rate": 0.000515013698630137, | |
| "loss": 3.4103, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 7.1220506845324785, | |
| "grad_norm": 0.31710121035575867, | |
| "learning_rate": 0.0005148388225007285, | |
| "loss": 3.4017, | |
| "step": 24450 | |
| }, | |
| { | |
| "epoch": 7.136615205359743, | |
| "grad_norm": 0.33834975957870483, | |
| "learning_rate": 0.0005146639463713203, | |
| "loss": 3.4017, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 7.151179726187008, | |
| "grad_norm": 0.307821661233902, | |
| "learning_rate": 0.0005144890702419119, | |
| "loss": 3.4139, | |
| "step": 24550 | |
| }, | |
| { | |
| "epoch": 7.165744247014273, | |
| "grad_norm": 0.33020275831222534, | |
| "learning_rate": 0.0005143141941125036, | |
| "loss": 3.4018, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 7.180308767841538, | |
| "grad_norm": 0.319395512342453, | |
| "learning_rate": 0.0005141393179830953, | |
| "loss": 3.4067, | |
| "step": 24650 | |
| }, | |
| { | |
| "epoch": 7.194873288668803, | |
| "grad_norm": 0.3333781659603119, | |
| "learning_rate": 0.0005139644418536869, | |
| "loss": 3.4189, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 7.209437809496068, | |
| "grad_norm": 0.34063929319381714, | |
| "learning_rate": 0.0005137895657242786, | |
| "loss": 3.4158, | |
| "step": 24750 | |
| }, | |
| { | |
| "epoch": 7.224002330323333, | |
| "grad_norm": 0.3293705880641937, | |
| "learning_rate": 0.0005136146895948703, | |
| "loss": 3.4296, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 7.238566851150597, | |
| "grad_norm": 0.3184684216976166, | |
| "learning_rate": 0.000513439813465462, | |
| "loss": 3.425, | |
| "step": 24850 | |
| }, | |
| { | |
| "epoch": 7.253131371977862, | |
| "grad_norm": 0.33404359221458435, | |
| "learning_rate": 0.0005132649373360535, | |
| "loss": 3.4167, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 7.2676958928051265, | |
| "grad_norm": 0.33032795786857605, | |
| "learning_rate": 0.0005130900612066452, | |
| "loss": 3.4225, | |
| "step": 24950 | |
| }, | |
| { | |
| "epoch": 7.282260413632391, | |
| "grad_norm": 0.3188001215457916, | |
| "learning_rate": 0.0005129151850772369, | |
| "loss": 3.4371, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 7.282260413632391, | |
| "eval_accuracy": 0.36465624772521504, | |
| "eval_loss": 3.598834276199341, | |
| "eval_runtime": 177.8103, | |
| "eval_samples_per_second": 93.589, | |
| "eval_steps_per_second": 5.855, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 7.296824934459656, | |
| "grad_norm": 0.32110267877578735, | |
| "learning_rate": 0.0005127403089478286, | |
| "loss": 3.4326, | |
| "step": 25050 | |
| }, | |
| { | |
| "epoch": 7.311389455286921, | |
| "grad_norm": 0.3441673517227173, | |
| "learning_rate": 0.0005125654328184203, | |
| "loss": 3.418, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 7.325953976114186, | |
| "grad_norm": 0.3550114035606384, | |
| "learning_rate": 0.0005123905566890119, | |
| "loss": 3.4363, | |
| "step": 25150 | |
| }, | |
| { | |
| "epoch": 7.340518496941451, | |
| "grad_norm": 0.31895849108695984, | |
| "learning_rate": 0.0005122156805596036, | |
| "loss": 3.4401, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 7.355083017768715, | |
| "grad_norm": 0.3154217004776001, | |
| "learning_rate": 0.0005120408044301953, | |
| "loss": 3.419, | |
| "step": 25250 | |
| }, | |
| { | |
| "epoch": 7.36964753859598, | |
| "grad_norm": 0.35524681210517883, | |
| "learning_rate": 0.0005118659283007868, | |
| "loss": 3.4247, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 7.384212059423245, | |
| "grad_norm": 0.33160510659217834, | |
| "learning_rate": 0.0005116910521713785, | |
| "loss": 3.4234, | |
| "step": 25350 | |
| }, | |
| { | |
| "epoch": 7.39877658025051, | |
| "grad_norm": 0.32226237654685974, | |
| "learning_rate": 0.0005115161760419702, | |
| "loss": 3.4328, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 7.4133411010777746, | |
| "grad_norm": 0.3131123185157776, | |
| "learning_rate": 0.0005113412999125619, | |
| "loss": 3.4389, | |
| "step": 25450 | |
| }, | |
| { | |
| "epoch": 7.4279056219050394, | |
| "grad_norm": 0.3247349262237549, | |
| "learning_rate": 0.0005111664237831536, | |
| "loss": 3.4296, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 7.442470142732304, | |
| "grad_norm": 0.318561851978302, | |
| "learning_rate": 0.0005109915476537452, | |
| "loss": 3.4451, | |
| "step": 25550 | |
| }, | |
| { | |
| "epoch": 7.457034663559569, | |
| "grad_norm": 0.31211626529693604, | |
| "learning_rate": 0.0005108166715243369, | |
| "loss": 3.4448, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 7.471599184386834, | |
| "grad_norm": 0.32736170291900635, | |
| "learning_rate": 0.0005106417953949286, | |
| "loss": 3.4446, | |
| "step": 25650 | |
| }, | |
| { | |
| "epoch": 7.486163705214098, | |
| "grad_norm": 0.32811227440834045, | |
| "learning_rate": 0.0005104669192655203, | |
| "loss": 3.4383, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 7.500728226041363, | |
| "grad_norm": 0.3180325925350189, | |
| "learning_rate": 0.0005102920431361118, | |
| "loss": 3.4454, | |
| "step": 25750 | |
| }, | |
| { | |
| "epoch": 7.515292746868628, | |
| "grad_norm": 0.32189640402793884, | |
| "learning_rate": 0.0005101171670067035, | |
| "loss": 3.4465, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 7.529857267695893, | |
| "grad_norm": 0.32531917095184326, | |
| "learning_rate": 0.0005099422908772952, | |
| "loss": 3.4449, | |
| "step": 25850 | |
| }, | |
| { | |
| "epoch": 7.544421788523158, | |
| "grad_norm": 0.33265554904937744, | |
| "learning_rate": 0.0005097674147478868, | |
| "loss": 3.4514, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 7.558986309350423, | |
| "grad_norm": 0.32440370321273804, | |
| "learning_rate": 0.0005095925386184786, | |
| "loss": 3.459, | |
| "step": 25950 | |
| }, | |
| { | |
| "epoch": 7.5735508301776875, | |
| "grad_norm": 0.32886600494384766, | |
| "learning_rate": 0.0005094176624890702, | |
| "loss": 3.4552, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 7.5735508301776875, | |
| "eval_accuracy": 0.36528269190130097, | |
| "eval_loss": 3.5886290073394775, | |
| "eval_runtime": 177.9265, | |
| "eval_samples_per_second": 93.527, | |
| "eval_steps_per_second": 5.851, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 7.5881153510049515, | |
| "grad_norm": 0.3494354486465454, | |
| "learning_rate": 0.0005092427863596619, | |
| "loss": 3.4429, | |
| "step": 26050 | |
| }, | |
| { | |
| "epoch": 7.602679871832216, | |
| "grad_norm": 0.3267322778701782, | |
| "learning_rate": 0.0005090679102302536, | |
| "loss": 3.4374, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 7.617244392659481, | |
| "grad_norm": 0.3428941071033478, | |
| "learning_rate": 0.0005088930341008451, | |
| "loss": 3.4462, | |
| "step": 26150 | |
| }, | |
| { | |
| "epoch": 7.631808913486746, | |
| "grad_norm": 0.3226052522659302, | |
| "learning_rate": 0.0005087181579714368, | |
| "loss": 3.4491, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 7.646373434314011, | |
| "grad_norm": 0.32438158988952637, | |
| "learning_rate": 0.0005085432818420285, | |
| "loss": 3.454, | |
| "step": 26250 | |
| }, | |
| { | |
| "epoch": 7.660937955141276, | |
| "grad_norm": 0.2988736629486084, | |
| "learning_rate": 0.0005083684057126202, | |
| "loss": 3.4578, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 7.675502475968541, | |
| "grad_norm": 0.31402266025543213, | |
| "learning_rate": 0.0005081935295832118, | |
| "loss": 3.4428, | |
| "step": 26350 | |
| }, | |
| { | |
| "epoch": 7.690066996795806, | |
| "grad_norm": 0.34731653332710266, | |
| "learning_rate": 0.0005080186534538035, | |
| "loss": 3.4578, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 7.704631517623071, | |
| "grad_norm": 0.33528274297714233, | |
| "learning_rate": 0.0005078437773243952, | |
| "loss": 3.458, | |
| "step": 26450 | |
| }, | |
| { | |
| "epoch": 7.719196038450335, | |
| "grad_norm": 0.3237103223800659, | |
| "learning_rate": 0.0005076689011949869, | |
| "loss": 3.4564, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 7.7337605592775995, | |
| "grad_norm": 0.3397464454174042, | |
| "learning_rate": 0.0005074940250655786, | |
| "loss": 3.4604, | |
| "step": 26550 | |
| }, | |
| { | |
| "epoch": 7.748325080104864, | |
| "grad_norm": 0.3478679358959198, | |
| "learning_rate": 0.0005073191489361701, | |
| "loss": 3.4475, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 7.762889600932129, | |
| "grad_norm": 0.33171746134757996, | |
| "learning_rate": 0.0005071442728067618, | |
| "loss": 3.4538, | |
| "step": 26650 | |
| }, | |
| { | |
| "epoch": 7.777454121759394, | |
| "grad_norm": 0.309638112783432, | |
| "learning_rate": 0.0005069693966773535, | |
| "loss": 3.4554, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 7.792018642586659, | |
| "grad_norm": 0.3162028193473816, | |
| "learning_rate": 0.0005067945205479451, | |
| "loss": 3.4465, | |
| "step": 26750 | |
| }, | |
| { | |
| "epoch": 7.806583163413924, | |
| "grad_norm": 0.3107338845729828, | |
| "learning_rate": 0.0005066196444185368, | |
| "loss": 3.454, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 7.821147684241188, | |
| "grad_norm": 0.33051323890686035, | |
| "learning_rate": 0.0005064447682891285, | |
| "loss": 3.4553, | |
| "step": 26850 | |
| }, | |
| { | |
| "epoch": 7.835712205068453, | |
| "grad_norm": 0.33980193734169006, | |
| "learning_rate": 0.0005062698921597202, | |
| "loss": 3.4523, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 7.850276725895718, | |
| "grad_norm": 0.34411004185676575, | |
| "learning_rate": 0.0005060950160303119, | |
| "loss": 3.4671, | |
| "step": 26950 | |
| }, | |
| { | |
| "epoch": 7.864841246722983, | |
| "grad_norm": 0.3099103271961212, | |
| "learning_rate": 0.0005059201399009035, | |
| "loss": 3.4686, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 7.864841246722983, | |
| "eval_accuracy": 0.36589326035676156, | |
| "eval_loss": 3.5795626640319824, | |
| "eval_runtime": 177.1061, | |
| "eval_samples_per_second": 93.961, | |
| "eval_steps_per_second": 5.878, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 7.879405767550248, | |
| "grad_norm": 0.32725054025650024, | |
| "learning_rate": 0.0005057452637714951, | |
| "loss": 3.4652, | |
| "step": 27050 | |
| }, | |
| { | |
| "epoch": 7.8939702883775125, | |
| "grad_norm": 0.31909966468811035, | |
| "learning_rate": 0.0005055703876420868, | |
| "loss": 3.4615, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 7.908534809204777, | |
| "grad_norm": 0.33096399903297424, | |
| "learning_rate": 0.0005053955115126785, | |
| "loss": 3.4602, | |
| "step": 27150 | |
| }, | |
| { | |
| "epoch": 7.923099330032042, | |
| "grad_norm": 0.3407413959503174, | |
| "learning_rate": 0.0005052206353832701, | |
| "loss": 3.4593, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 7.937663850859307, | |
| "grad_norm": 0.33889928460121155, | |
| "learning_rate": 0.0005050457592538618, | |
| "loss": 3.4601, | |
| "step": 27250 | |
| }, | |
| { | |
| "epoch": 7.952228371686571, | |
| "grad_norm": 0.32174986600875854, | |
| "learning_rate": 0.0005048708831244535, | |
| "loss": 3.4621, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 7.966792892513836, | |
| "grad_norm": 0.31754982471466064, | |
| "learning_rate": 0.0005046960069950451, | |
| "loss": 3.4708, | |
| "step": 27350 | |
| }, | |
| { | |
| "epoch": 7.981357413341101, | |
| "grad_norm": 0.3301248252391815, | |
| "learning_rate": 0.0005045211308656369, | |
| "loss": 3.4638, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 7.995921934168366, | |
| "grad_norm": 0.3506939113140106, | |
| "learning_rate": 0.0005043462547362284, | |
| "loss": 3.46, | |
| "step": 27450 | |
| }, | |
| { | |
| "epoch": 8.01048645499563, | |
| "grad_norm": 0.32264554500579834, | |
| "learning_rate": 0.0005041713786068201, | |
| "loss": 3.3792, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 8.025050975822895, | |
| "grad_norm": 0.3109362721443176, | |
| "learning_rate": 0.0005039965024774118, | |
| "loss": 3.3501, | |
| "step": 27550 | |
| }, | |
| { | |
| "epoch": 8.03961549665016, | |
| "grad_norm": 0.3230056166648865, | |
| "learning_rate": 0.0005038216263480034, | |
| "loss": 3.3474, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 8.054180017477425, | |
| "grad_norm": 0.3607815206050873, | |
| "learning_rate": 0.0005036467502185951, | |
| "loss": 3.355, | |
| "step": 27650 | |
| }, | |
| { | |
| "epoch": 8.06874453830469, | |
| "grad_norm": 0.3454449772834778, | |
| "learning_rate": 0.0005034718740891868, | |
| "loss": 3.3632, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 8.083309059131954, | |
| "grad_norm": 0.3308209478855133, | |
| "learning_rate": 0.0005032969979597785, | |
| "loss": 3.3578, | |
| "step": 27750 | |
| }, | |
| { | |
| "epoch": 8.09787357995922, | |
| "grad_norm": 0.3335447907447815, | |
| "learning_rate": 0.0005031221218303701, | |
| "loss": 3.372, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 8.112438100786484, | |
| "grad_norm": 0.35093986988067627, | |
| "learning_rate": 0.0005029472457009618, | |
| "loss": 3.3682, | |
| "step": 27850 | |
| }, | |
| { | |
| "epoch": 8.127002621613748, | |
| "grad_norm": 0.3294939696788788, | |
| "learning_rate": 0.0005027723695715534, | |
| "loss": 3.3767, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 8.141567142441014, | |
| "grad_norm": 0.3499290645122528, | |
| "learning_rate": 0.0005025974934421451, | |
| "loss": 3.3834, | |
| "step": 27950 | |
| }, | |
| { | |
| "epoch": 8.156131663268278, | |
| "grad_norm": 0.3340826630592346, | |
| "learning_rate": 0.0005024226173127368, | |
| "loss": 3.3643, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 8.156131663268278, | |
| "eval_accuracy": 0.36620242531620023, | |
| "eval_loss": 3.586412191390991, | |
| "eval_runtime": 177.6212, | |
| "eval_samples_per_second": 93.688, | |
| "eval_steps_per_second": 5.861, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 8.170696184095544, | |
| "grad_norm": 0.3302430212497711, | |
| "learning_rate": 0.0005022477411833284, | |
| "loss": 3.3864, | |
| "step": 28050 | |
| }, | |
| { | |
| "epoch": 8.185260704922808, | |
| "grad_norm": 0.35755255818367004, | |
| "learning_rate": 0.0005020728650539201, | |
| "loss": 3.3841, | |
| "step": 28100 | |
| }, | |
| { | |
| "epoch": 8.199825225750073, | |
| "grad_norm": 0.3413600027561188, | |
| "learning_rate": 0.0005018979889245118, | |
| "loss": 3.3885, | |
| "step": 28150 | |
| }, | |
| { | |
| "epoch": 8.214389746577337, | |
| "grad_norm": 0.31416332721710205, | |
| "learning_rate": 0.0005017231127951034, | |
| "loss": 3.394, | |
| "step": 28200 | |
| }, | |
| { | |
| "epoch": 8.228954267404603, | |
| "grad_norm": 0.3258178234100342, | |
| "learning_rate": 0.0005015482366656951, | |
| "loss": 3.3826, | |
| "step": 28250 | |
| }, | |
| { | |
| "epoch": 8.243518788231867, | |
| "grad_norm": 0.3640033006668091, | |
| "learning_rate": 0.0005013733605362868, | |
| "loss": 3.4016, | |
| "step": 28300 | |
| }, | |
| { | |
| "epoch": 8.258083309059131, | |
| "grad_norm": 0.3285066485404968, | |
| "learning_rate": 0.0005011984844068784, | |
| "loss": 3.4005, | |
| "step": 28350 | |
| }, | |
| { | |
| "epoch": 8.272647829886397, | |
| "grad_norm": 0.357901394367218, | |
| "learning_rate": 0.0005010236082774701, | |
| "loss": 3.3969, | |
| "step": 28400 | |
| }, | |
| { | |
| "epoch": 8.287212350713661, | |
| "grad_norm": 0.34709328413009644, | |
| "learning_rate": 0.0005008487321480617, | |
| "loss": 3.3909, | |
| "step": 28450 | |
| }, | |
| { | |
| "epoch": 8.301776871540927, | |
| "grad_norm": 0.31995636224746704, | |
| "learning_rate": 0.0005006738560186534, | |
| "loss": 3.4128, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 8.31634139236819, | |
| "grad_norm": 0.315400630235672, | |
| "learning_rate": 0.0005004989798892451, | |
| "loss": 3.3999, | |
| "step": 28550 | |
| }, | |
| { | |
| "epoch": 8.330905913195457, | |
| "grad_norm": 0.3495756685733795, | |
| "learning_rate": 0.0005003241037598368, | |
| "loss": 3.3972, | |
| "step": 28600 | |
| }, | |
| { | |
| "epoch": 8.34547043402272, | |
| "grad_norm": 0.3411330282688141, | |
| "learning_rate": 0.0005001492276304284, | |
| "loss": 3.4064, | |
| "step": 28650 | |
| }, | |
| { | |
| "epoch": 8.360034954849985, | |
| "grad_norm": 0.3218335211277008, | |
| "learning_rate": 0.0004999743515010201, | |
| "loss": 3.3977, | |
| "step": 28700 | |
| }, | |
| { | |
| "epoch": 8.37459947567725, | |
| "grad_norm": 0.33580780029296875, | |
| "learning_rate": 0.0004997994753716117, | |
| "loss": 3.4069, | |
| "step": 28750 | |
| }, | |
| { | |
| "epoch": 8.389163996504514, | |
| "grad_norm": 0.3284832537174225, | |
| "learning_rate": 0.0004996245992422033, | |
| "loss": 3.4001, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 8.40372851733178, | |
| "grad_norm": 0.32526466250419617, | |
| "learning_rate": 0.0004994497231127951, | |
| "loss": 3.4016, | |
| "step": 28850 | |
| }, | |
| { | |
| "epoch": 8.418293038159044, | |
| "grad_norm": 0.3263266086578369, | |
| "learning_rate": 0.0004992748469833867, | |
| "loss": 3.4154, | |
| "step": 28900 | |
| }, | |
| { | |
| "epoch": 8.43285755898631, | |
| "grad_norm": 0.3042384386062622, | |
| "learning_rate": 0.0004990999708539784, | |
| "loss": 3.4174, | |
| "step": 28950 | |
| }, | |
| { | |
| "epoch": 8.447422079813574, | |
| "grad_norm": 0.3383604884147644, | |
| "learning_rate": 0.0004989250947245701, | |
| "loss": 3.4177, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 8.447422079813574, | |
| "eval_accuracy": 0.36640586973606676, | |
| "eval_loss": 3.581047534942627, | |
| "eval_runtime": 177.2332, | |
| "eval_samples_per_second": 93.893, | |
| "eval_steps_per_second": 5.874, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 8.46198660064084, | |
| "grad_norm": 0.3453238606452942, | |
| "learning_rate": 0.0004987502185951617, | |
| "loss": 3.4169, | |
| "step": 29050 | |
| }, | |
| { | |
| "epoch": 8.476551121468104, | |
| "grad_norm": 0.32911792397499084, | |
| "learning_rate": 0.0004985753424657534, | |
| "loss": 3.3998, | |
| "step": 29100 | |
| }, | |
| { | |
| "epoch": 8.491115642295368, | |
| "grad_norm": 0.34755417704582214, | |
| "learning_rate": 0.000498400466336345, | |
| "loss": 3.4153, | |
| "step": 29150 | |
| }, | |
| { | |
| "epoch": 8.505680163122634, | |
| "grad_norm": 0.3180427849292755, | |
| "learning_rate": 0.0004982255902069367, | |
| "loss": 3.406, | |
| "step": 29200 | |
| }, | |
| { | |
| "epoch": 8.520244683949898, | |
| "grad_norm": 0.34140604734420776, | |
| "learning_rate": 0.0004980507140775283, | |
| "loss": 3.4127, | |
| "step": 29250 | |
| }, | |
| { | |
| "epoch": 8.534809204777163, | |
| "grad_norm": 0.329262375831604, | |
| "learning_rate": 0.0004978758379481201, | |
| "loss": 3.4351, | |
| "step": 29300 | |
| }, | |
| { | |
| "epoch": 8.549373725604427, | |
| "grad_norm": 0.3097538948059082, | |
| "learning_rate": 0.0004977009618187117, | |
| "loss": 3.4126, | |
| "step": 29350 | |
| }, | |
| { | |
| "epoch": 8.563938246431693, | |
| "grad_norm": 0.32443442940711975, | |
| "learning_rate": 0.0004975260856893034, | |
| "loss": 3.4117, | |
| "step": 29400 | |
| }, | |
| { | |
| "epoch": 8.578502767258957, | |
| "grad_norm": 0.3432025909423828, | |
| "learning_rate": 0.0004973512095598951, | |
| "loss": 3.4247, | |
| "step": 29450 | |
| }, | |
| { | |
| "epoch": 8.593067288086221, | |
| "grad_norm": 0.3406945765018463, | |
| "learning_rate": 0.0004971763334304867, | |
| "loss": 3.4125, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 8.607631808913487, | |
| "grad_norm": 0.32931724190711975, | |
| "learning_rate": 0.0004970014573010784, | |
| "loss": 3.4215, | |
| "step": 29550 | |
| }, | |
| { | |
| "epoch": 8.622196329740751, | |
| "grad_norm": 0.33969646692276, | |
| "learning_rate": 0.00049682658117167, | |
| "loss": 3.4158, | |
| "step": 29600 | |
| }, | |
| { | |
| "epoch": 8.636760850568017, | |
| "grad_norm": 0.3158787190914154, | |
| "learning_rate": 0.0004966517050422616, | |
| "loss": 3.4155, | |
| "step": 29650 | |
| }, | |
| { | |
| "epoch": 8.65132537139528, | |
| "grad_norm": 0.32108256220817566, | |
| "learning_rate": 0.0004964768289128533, | |
| "loss": 3.4218, | |
| "step": 29700 | |
| }, | |
| { | |
| "epoch": 8.665889892222546, | |
| "grad_norm": 0.3323673903942108, | |
| "learning_rate": 0.000496301952783445, | |
| "loss": 3.4267, | |
| "step": 29750 | |
| }, | |
| { | |
| "epoch": 8.68045441304981, | |
| "grad_norm": 0.3276951014995575, | |
| "learning_rate": 0.0004961270766540367, | |
| "loss": 3.4214, | |
| "step": 29800 | |
| }, | |
| { | |
| "epoch": 8.695018933877076, | |
| "grad_norm": 0.33299991488456726, | |
| "learning_rate": 0.0004959522005246284, | |
| "loss": 3.4335, | |
| "step": 29850 | |
| }, | |
| { | |
| "epoch": 8.70958345470434, | |
| "grad_norm": 0.33059605956077576, | |
| "learning_rate": 0.00049577732439522, | |
| "loss": 3.4223, | |
| "step": 29900 | |
| }, | |
| { | |
| "epoch": 8.724147975531604, | |
| "grad_norm": 0.32752349972724915, | |
| "learning_rate": 0.0004956024482658117, | |
| "loss": 3.4305, | |
| "step": 29950 | |
| }, | |
| { | |
| "epoch": 8.73871249635887, | |
| "grad_norm": 0.31223592162132263, | |
| "learning_rate": 0.0004954275721364034, | |
| "loss": 3.4214, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 8.73871249635887, | |
| "eval_accuracy": 0.36692435901189985, | |
| "eval_loss": 3.5738139152526855, | |
| "eval_runtime": 177.4984, | |
| "eval_samples_per_second": 93.753, | |
| "eval_steps_per_second": 5.865, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 8.753277017186134, | |
| "grad_norm": 0.3213596045970917, | |
| "learning_rate": 0.000495252696006995, | |
| "loss": 3.4262, | |
| "step": 30050 | |
| }, | |
| { | |
| "epoch": 8.7678415380134, | |
| "grad_norm": 0.3408668041229248, | |
| "learning_rate": 0.0004950778198775866, | |
| "loss": 3.4239, | |
| "step": 30100 | |
| }, | |
| { | |
| "epoch": 8.782406058840664, | |
| "grad_norm": 0.32721853256225586, | |
| "learning_rate": 0.0004949029437481783, | |
| "loss": 3.4222, | |
| "step": 30150 | |
| }, | |
| { | |
| "epoch": 8.79697057966793, | |
| "grad_norm": 0.31464850902557373, | |
| "learning_rate": 0.00049472806761877, | |
| "loss": 3.4332, | |
| "step": 30200 | |
| }, | |
| { | |
| "epoch": 8.811535100495194, | |
| "grad_norm": 0.3212704658508301, | |
| "learning_rate": 0.0004945531914893616, | |
| "loss": 3.4302, | |
| "step": 30250 | |
| }, | |
| { | |
| "epoch": 8.82609962132246, | |
| "grad_norm": 0.3242170810699463, | |
| "learning_rate": 0.0004943783153599534, | |
| "loss": 3.446, | |
| "step": 30300 | |
| }, | |
| { | |
| "epoch": 8.840664142149723, | |
| "grad_norm": 0.3245343863964081, | |
| "learning_rate": 0.000494203439230545, | |
| "loss": 3.4213, | |
| "step": 30350 | |
| }, | |
| { | |
| "epoch": 8.855228662976987, | |
| "grad_norm": 0.32496681809425354, | |
| "learning_rate": 0.0004940285631011367, | |
| "loss": 3.4294, | |
| "step": 30400 | |
| }, | |
| { | |
| "epoch": 8.869793183804253, | |
| "grad_norm": 0.36191901564598083, | |
| "learning_rate": 0.0004938536869717284, | |
| "loss": 3.4359, | |
| "step": 30450 | |
| }, | |
| { | |
| "epoch": 8.884357704631517, | |
| "grad_norm": 0.3322804272174835, | |
| "learning_rate": 0.0004936788108423199, | |
| "loss": 3.4327, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 8.898922225458783, | |
| "grad_norm": 0.3395395576953888, | |
| "learning_rate": 0.0004935039347129116, | |
| "loss": 3.423, | |
| "step": 30550 | |
| }, | |
| { | |
| "epoch": 8.913486746286047, | |
| "grad_norm": 0.3019578754901886, | |
| "learning_rate": 0.0004933290585835033, | |
| "loss": 3.4376, | |
| "step": 30600 | |
| }, | |
| { | |
| "epoch": 8.928051267113313, | |
| "grad_norm": 0.32183343172073364, | |
| "learning_rate": 0.000493154182454095, | |
| "loss": 3.4407, | |
| "step": 30650 | |
| }, | |
| { | |
| "epoch": 8.942615787940577, | |
| "grad_norm": 0.3220561742782593, | |
| "learning_rate": 0.0004929793063246866, | |
| "loss": 3.4417, | |
| "step": 30700 | |
| }, | |
| { | |
| "epoch": 8.95718030876784, | |
| "grad_norm": 0.36304983496665955, | |
| "learning_rate": 0.0004928044301952783, | |
| "loss": 3.4312, | |
| "step": 30750 | |
| }, | |
| { | |
| "epoch": 8.971744829595107, | |
| "grad_norm": 0.33577391505241394, | |
| "learning_rate": 0.00049262955406587, | |
| "loss": 3.4396, | |
| "step": 30800 | |
| }, | |
| { | |
| "epoch": 8.98630935042237, | |
| "grad_norm": 0.3271602392196655, | |
| "learning_rate": 0.0004924546779364617, | |
| "loss": 3.426, | |
| "step": 30850 | |
| }, | |
| { | |
| "epoch": 9.000873871249636, | |
| "grad_norm": 0.33349940180778503, | |
| "learning_rate": 0.0004922798018070533, | |
| "loss": 3.4341, | |
| "step": 30900 | |
| }, | |
| { | |
| "epoch": 9.0154383920769, | |
| "grad_norm": 0.32157132029533386, | |
| "learning_rate": 0.0004921049256776449, | |
| "loss": 3.3276, | |
| "step": 30950 | |
| }, | |
| { | |
| "epoch": 9.030002912904166, | |
| "grad_norm": 0.3370971381664276, | |
| "learning_rate": 0.0004919300495482366, | |
| "loss": 3.3159, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 9.030002912904166, | |
| "eval_accuracy": 0.3673821677555647, | |
| "eval_loss": 3.579301595687866, | |
| "eval_runtime": 177.4898, | |
| "eval_samples_per_second": 93.757, | |
| "eval_steps_per_second": 5.865, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 9.04456743373143, | |
| "grad_norm": 0.33405694365501404, | |
| "learning_rate": 0.0004917551734188283, | |
| "loss": 3.3151, | |
| "step": 31050 | |
| }, | |
| { | |
| "epoch": 9.059131954558694, | |
| "grad_norm": 0.3237508237361908, | |
| "learning_rate": 0.0004915802972894199, | |
| "loss": 3.3404, | |
| "step": 31100 | |
| }, | |
| { | |
| "epoch": 9.07369647538596, | |
| "grad_norm": 0.35597681999206543, | |
| "learning_rate": 0.0004914054211600116, | |
| "loss": 3.3346, | |
| "step": 31150 | |
| }, | |
| { | |
| "epoch": 9.088260996213224, | |
| "grad_norm": 0.32660970091819763, | |
| "learning_rate": 0.0004912305450306033, | |
| "loss": 3.3228, | |
| "step": 31200 | |
| }, | |
| { | |
| "epoch": 9.10282551704049, | |
| "grad_norm": 0.3498513996601105, | |
| "learning_rate": 0.000491055668901195, | |
| "loss": 3.3412, | |
| "step": 31250 | |
| }, | |
| { | |
| "epoch": 9.117390037867754, | |
| "grad_norm": 0.3514196574687958, | |
| "learning_rate": 0.0004908807927717865, | |
| "loss": 3.3467, | |
| "step": 31300 | |
| }, | |
| { | |
| "epoch": 9.13195455869502, | |
| "grad_norm": 0.33097749948501587, | |
| "learning_rate": 0.0004907059166423783, | |
| "loss": 3.3572, | |
| "step": 31350 | |
| }, | |
| { | |
| "epoch": 9.146519079522283, | |
| "grad_norm": 0.3433399498462677, | |
| "learning_rate": 0.0004905310405129699, | |
| "loss": 3.339, | |
| "step": 31400 | |
| }, | |
| { | |
| "epoch": 9.16108360034955, | |
| "grad_norm": 0.327028751373291, | |
| "learning_rate": 0.0004903561643835616, | |
| "loss": 3.3498, | |
| "step": 31450 | |
| }, | |
| { | |
| "epoch": 9.175648121176813, | |
| "grad_norm": 0.340170681476593, | |
| "learning_rate": 0.0004901812882541533, | |
| "loss": 3.3463, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 9.190212642004077, | |
| "grad_norm": 0.3196989893913269, | |
| "learning_rate": 0.0004900064121247449, | |
| "loss": 3.3639, | |
| "step": 31550 | |
| }, | |
| { | |
| "epoch": 9.204777162831343, | |
| "grad_norm": 0.3304920196533203, | |
| "learning_rate": 0.0004898315359953366, | |
| "loss": 3.3434, | |
| "step": 31600 | |
| }, | |
| { | |
| "epoch": 9.219341683658607, | |
| "grad_norm": 0.3360269367694855, | |
| "learning_rate": 0.0004896566598659283, | |
| "loss": 3.3688, | |
| "step": 31650 | |
| }, | |
| { | |
| "epoch": 9.233906204485873, | |
| "grad_norm": 0.3214164674282074, | |
| "learning_rate": 0.0004894817837365199, | |
| "loss": 3.3743, | |
| "step": 31700 | |
| }, | |
| { | |
| "epoch": 9.248470725313137, | |
| "grad_norm": 0.3371962904930115, | |
| "learning_rate": 0.0004893069076071115, | |
| "loss": 3.369, | |
| "step": 31750 | |
| }, | |
| { | |
| "epoch": 9.263035246140403, | |
| "grad_norm": 0.33907076716423035, | |
| "learning_rate": 0.0004891320314777032, | |
| "loss": 3.3607, | |
| "step": 31800 | |
| }, | |
| { | |
| "epoch": 9.277599766967667, | |
| "grad_norm": 0.3246038854122162, | |
| "learning_rate": 0.0004889571553482949, | |
| "loss": 3.375, | |
| "step": 31850 | |
| }, | |
| { | |
| "epoch": 9.292164287794932, | |
| "grad_norm": 0.3564911186695099, | |
| "learning_rate": 0.0004887822792188866, | |
| "loss": 3.3753, | |
| "step": 31900 | |
| }, | |
| { | |
| "epoch": 9.306728808622196, | |
| "grad_norm": 0.34122955799102783, | |
| "learning_rate": 0.0004886074030894782, | |
| "loss": 3.3754, | |
| "step": 31950 | |
| }, | |
| { | |
| "epoch": 9.32129332944946, | |
| "grad_norm": 0.3213154375553131, | |
| "learning_rate": 0.0004884325269600699, | |
| "loss": 3.3668, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 9.32129332944946, | |
| "eval_accuracy": 0.3676478214806967, | |
| "eval_loss": 3.577354907989502, | |
| "eval_runtime": 177.8216, | |
| "eval_samples_per_second": 93.583, | |
| "eval_steps_per_second": 5.854, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 9.335857850276726, | |
| "grad_norm": 0.3175533711910248, | |
| "learning_rate": 0.0004882576508306615, | |
| "loss": 3.3725, | |
| "step": 32050 | |
| }, | |
| { | |
| "epoch": 9.35042237110399, | |
| "grad_norm": 0.32937026023864746, | |
| "learning_rate": 0.00048808277470125327, | |
| "loss": 3.365, | |
| "step": 32100 | |
| }, | |
| { | |
| "epoch": 9.364986891931256, | |
| "grad_norm": 0.34098199009895325, | |
| "learning_rate": 0.0004879078985718449, | |
| "loss": 3.3741, | |
| "step": 32150 | |
| }, | |
| { | |
| "epoch": 9.37955141275852, | |
| "grad_norm": 0.3347371816635132, | |
| "learning_rate": 0.0004877330224424366, | |
| "loss": 3.3783, | |
| "step": 32200 | |
| }, | |
| { | |
| "epoch": 9.394115933585786, | |
| "grad_norm": 0.3389059603214264, | |
| "learning_rate": 0.00048755814631302823, | |
| "loss": 3.3772, | |
| "step": 32250 | |
| }, | |
| { | |
| "epoch": 9.40868045441305, | |
| "grad_norm": 0.34368738532066345, | |
| "learning_rate": 0.00048738327018361987, | |
| "loss": 3.3875, | |
| "step": 32300 | |
| }, | |
| { | |
| "epoch": 9.423244975240314, | |
| "grad_norm": 0.34114500880241394, | |
| "learning_rate": 0.00048720839405421156, | |
| "loss": 3.3882, | |
| "step": 32350 | |
| }, | |
| { | |
| "epoch": 9.43780949606758, | |
| "grad_norm": 0.38217416405677795, | |
| "learning_rate": 0.0004870335179248032, | |
| "loss": 3.3811, | |
| "step": 32400 | |
| }, | |
| { | |
| "epoch": 9.452374016894844, | |
| "grad_norm": 0.3433772921562195, | |
| "learning_rate": 0.0004868586417953949, | |
| "loss": 3.3803, | |
| "step": 32450 | |
| }, | |
| { | |
| "epoch": 9.46693853772211, | |
| "grad_norm": 0.31574320793151855, | |
| "learning_rate": 0.0004866837656659865, | |
| "loss": 3.3872, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 9.481503058549373, | |
| "grad_norm": 0.36580872535705566, | |
| "learning_rate": 0.00048650888953657816, | |
| "loss": 3.3849, | |
| "step": 32550 | |
| }, | |
| { | |
| "epoch": 9.49606757937664, | |
| "grad_norm": 0.3435962200164795, | |
| "learning_rate": 0.0004863340134071699, | |
| "loss": 3.3961, | |
| "step": 32600 | |
| }, | |
| { | |
| "epoch": 9.510632100203903, | |
| "grad_norm": 0.3202696740627289, | |
| "learning_rate": 0.00048615913727776154, | |
| "loss": 3.399, | |
| "step": 32650 | |
| }, | |
| { | |
| "epoch": 9.525196621031167, | |
| "grad_norm": 0.3501302897930145, | |
| "learning_rate": 0.00048598426114835323, | |
| "loss": 3.393, | |
| "step": 32700 | |
| }, | |
| { | |
| "epoch": 9.539761141858433, | |
| "grad_norm": 0.36124247312545776, | |
| "learning_rate": 0.00048580938501894486, | |
| "loss": 3.3921, | |
| "step": 32750 | |
| }, | |
| { | |
| "epoch": 9.554325662685697, | |
| "grad_norm": 0.34877392649650574, | |
| "learning_rate": 0.00048563450888953655, | |
| "loss": 3.3981, | |
| "step": 32800 | |
| }, | |
| { | |
| "epoch": 9.568890183512963, | |
| "grad_norm": 0.340874582529068, | |
| "learning_rate": 0.0004854596327601282, | |
| "loss": 3.411, | |
| "step": 32850 | |
| }, | |
| { | |
| "epoch": 9.583454704340227, | |
| "grad_norm": 0.3228774666786194, | |
| "learning_rate": 0.0004852847566307198, | |
| "loss": 3.3866, | |
| "step": 32900 | |
| }, | |
| { | |
| "epoch": 9.598019225167493, | |
| "grad_norm": 0.33764421939849854, | |
| "learning_rate": 0.0004851098805013115, | |
| "loss": 3.3922, | |
| "step": 32950 | |
| }, | |
| { | |
| "epoch": 9.612583745994757, | |
| "grad_norm": 0.3215000331401825, | |
| "learning_rate": 0.00048493500437190315, | |
| "loss": 3.3972, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 9.612583745994757, | |
| "eval_accuracy": 0.36817630658062733, | |
| "eval_loss": 3.5689446926116943, | |
| "eval_runtime": 177.5448, | |
| "eval_samples_per_second": 93.728, | |
| "eval_steps_per_second": 5.863, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 9.627148266822022, | |
| "grad_norm": 0.3449784517288208, | |
| "learning_rate": 0.0004847601282424949, | |
| "loss": 3.3968, | |
| "step": 33050 | |
| }, | |
| { | |
| "epoch": 9.641712787649286, | |
| "grad_norm": 0.343000590801239, | |
| "learning_rate": 0.00048458525211308653, | |
| "loss": 3.3888, | |
| "step": 33100 | |
| }, | |
| { | |
| "epoch": 9.65627730847655, | |
| "grad_norm": 0.35578057169914246, | |
| "learning_rate": 0.00048441037598367817, | |
| "loss": 3.3971, | |
| "step": 33150 | |
| }, | |
| { | |
| "epoch": 9.670841829303816, | |
| "grad_norm": 0.3461526334285736, | |
| "learning_rate": 0.00048423549985426986, | |
| "loss": 3.4172, | |
| "step": 33200 | |
| }, | |
| { | |
| "epoch": 9.68540635013108, | |
| "grad_norm": 0.3729863166809082, | |
| "learning_rate": 0.0004840606237248615, | |
| "loss": 3.3911, | |
| "step": 33250 | |
| }, | |
| { | |
| "epoch": 9.699970870958346, | |
| "grad_norm": 0.3255765736103058, | |
| "learning_rate": 0.0004838857475954532, | |
| "loss": 3.3958, | |
| "step": 33300 | |
| }, | |
| { | |
| "epoch": 9.71453539178561, | |
| "grad_norm": 0.32752174139022827, | |
| "learning_rate": 0.0004837108714660448, | |
| "loss": 3.397, | |
| "step": 33350 | |
| }, | |
| { | |
| "epoch": 9.729099912612876, | |
| "grad_norm": 0.35780587792396545, | |
| "learning_rate": 0.0004835359953366365, | |
| "loss": 3.403, | |
| "step": 33400 | |
| }, | |
| { | |
| "epoch": 9.74366443344014, | |
| "grad_norm": 0.3370216190814972, | |
| "learning_rate": 0.00048336111920722815, | |
| "loss": 3.4004, | |
| "step": 33450 | |
| }, | |
| { | |
| "epoch": 9.758228954267405, | |
| "grad_norm": 0.3510589897632599, | |
| "learning_rate": 0.0004831862430778198, | |
| "loss": 3.4153, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 9.77279347509467, | |
| "grad_norm": 0.3223910629749298, | |
| "learning_rate": 0.00048301136694841153, | |
| "loss": 3.407, | |
| "step": 33550 | |
| }, | |
| { | |
| "epoch": 9.787357995921933, | |
| "grad_norm": 0.34690526127815247, | |
| "learning_rate": 0.00048283649081900317, | |
| "loss": 3.3977, | |
| "step": 33600 | |
| }, | |
| { | |
| "epoch": 9.8019225167492, | |
| "grad_norm": 0.33463695645332336, | |
| "learning_rate": 0.00048266161468959486, | |
| "loss": 3.4125, | |
| "step": 33650 | |
| }, | |
| { | |
| "epoch": 9.816487037576463, | |
| "grad_norm": 0.35382333397865295, | |
| "learning_rate": 0.0004824867385601865, | |
| "loss": 3.4114, | |
| "step": 33700 | |
| }, | |
| { | |
| "epoch": 9.831051558403729, | |
| "grad_norm": 0.3453107476234436, | |
| "learning_rate": 0.00048231186243077813, | |
| "loss": 3.4108, | |
| "step": 33750 | |
| }, | |
| { | |
| "epoch": 9.845616079230993, | |
| "grad_norm": 0.33159446716308594, | |
| "learning_rate": 0.0004821369863013698, | |
| "loss": 3.4144, | |
| "step": 33800 | |
| }, | |
| { | |
| "epoch": 9.860180600058259, | |
| "grad_norm": 0.35295242071151733, | |
| "learning_rate": 0.00048196211017196146, | |
| "loss": 3.4014, | |
| "step": 33850 | |
| }, | |
| { | |
| "epoch": 9.874745120885523, | |
| "grad_norm": 0.3284096419811249, | |
| "learning_rate": 0.00048178723404255315, | |
| "loss": 3.4039, | |
| "step": 33900 | |
| }, | |
| { | |
| "epoch": 9.889309641712789, | |
| "grad_norm": 0.33951765298843384, | |
| "learning_rate": 0.0004816123579131448, | |
| "loss": 3.4153, | |
| "step": 33950 | |
| }, | |
| { | |
| "epoch": 9.903874162540053, | |
| "grad_norm": 0.3084050416946411, | |
| "learning_rate": 0.0004814374817837364, | |
| "loss": 3.416, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 9.903874162540053, | |
| "eval_accuracy": 0.36868256567168234, | |
| "eval_loss": 3.560567617416382, | |
| "eval_runtime": 177.1807, | |
| "eval_samples_per_second": 93.921, | |
| "eval_steps_per_second": 5.875, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 9.918438683367317, | |
| "grad_norm": 0.347179114818573, | |
| "learning_rate": 0.00048126260565432816, | |
| "loss": 3.4238, | |
| "step": 34050 | |
| }, | |
| { | |
| "epoch": 9.933003204194582, | |
| "grad_norm": 0.32636216282844543, | |
| "learning_rate": 0.0004810877295249198, | |
| "loss": 3.42, | |
| "step": 34100 | |
| }, | |
| { | |
| "epoch": 9.947567725021846, | |
| "grad_norm": 0.33995139598846436, | |
| "learning_rate": 0.0004809128533955115, | |
| "loss": 3.408, | |
| "step": 34150 | |
| }, | |
| { | |
| "epoch": 9.962132245849112, | |
| "grad_norm": 0.3723163902759552, | |
| "learning_rate": 0.0004807379772661031, | |
| "loss": 3.4068, | |
| "step": 34200 | |
| }, | |
| { | |
| "epoch": 9.976696766676376, | |
| "grad_norm": 0.33295756578445435, | |
| "learning_rate": 0.0004805631011366948, | |
| "loss": 3.4085, | |
| "step": 34250 | |
| }, | |
| { | |
| "epoch": 9.991261287503642, | |
| "grad_norm": 0.34439772367477417, | |
| "learning_rate": 0.00048038822500728645, | |
| "loss": 3.4136, | |
| "step": 34300 | |
| }, | |
| { | |
| "epoch": 10.005825808330906, | |
| "grad_norm": 0.33056867122650146, | |
| "learning_rate": 0.0004802133488778781, | |
| "loss": 3.3779, | |
| "step": 34350 | |
| }, | |
| { | |
| "epoch": 10.02039032915817, | |
| "grad_norm": 0.3284781575202942, | |
| "learning_rate": 0.0004800384727484698, | |
| "loss": 3.2906, | |
| "step": 34400 | |
| }, | |
| { | |
| "epoch": 10.034954849985436, | |
| "grad_norm": 0.35688483715057373, | |
| "learning_rate": 0.0004798635966190614, | |
| "loss": 3.3066, | |
| "step": 34450 | |
| }, | |
| { | |
| "epoch": 10.0495193708127, | |
| "grad_norm": 0.3438079059123993, | |
| "learning_rate": 0.00047968872048965316, | |
| "loss": 3.3037, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 10.064083891639966, | |
| "grad_norm": 0.33732229471206665, | |
| "learning_rate": 0.0004795138443602448, | |
| "loss": 3.3133, | |
| "step": 34550 | |
| }, | |
| { | |
| "epoch": 10.07864841246723, | |
| "grad_norm": 0.35935407876968384, | |
| "learning_rate": 0.00047933896823083643, | |
| "loss": 3.3062, | |
| "step": 34600 | |
| }, | |
| { | |
| "epoch": 10.093212933294495, | |
| "grad_norm": 0.3332599103450775, | |
| "learning_rate": 0.0004791640921014281, | |
| "loss": 3.3235, | |
| "step": 34650 | |
| }, | |
| { | |
| "epoch": 10.10777745412176, | |
| "grad_norm": 0.3485143184661865, | |
| "learning_rate": 0.00047898921597201976, | |
| "loss": 3.3178, | |
| "step": 34700 | |
| }, | |
| { | |
| "epoch": 10.122341974949023, | |
| "grad_norm": 0.32437801361083984, | |
| "learning_rate": 0.00047881433984261145, | |
| "loss": 3.326, | |
| "step": 34750 | |
| }, | |
| { | |
| "epoch": 10.136906495776289, | |
| "grad_norm": 0.3398897647857666, | |
| "learning_rate": 0.0004786394637132031, | |
| "loss": 3.3193, | |
| "step": 34800 | |
| }, | |
| { | |
| "epoch": 10.151471016603553, | |
| "grad_norm": 0.35131925344467163, | |
| "learning_rate": 0.0004784645875837948, | |
| "loss": 3.3303, | |
| "step": 34850 | |
| }, | |
| { | |
| "epoch": 10.166035537430819, | |
| "grad_norm": 0.341104120016098, | |
| "learning_rate": 0.0004782897114543864, | |
| "loss": 3.3299, | |
| "step": 34900 | |
| }, | |
| { | |
| "epoch": 10.180600058258083, | |
| "grad_norm": 0.3273942768573761, | |
| "learning_rate": 0.00047811483532497805, | |
| "loss": 3.3342, | |
| "step": 34950 | |
| }, | |
| { | |
| "epoch": 10.195164579085349, | |
| "grad_norm": 0.35876280069351196, | |
| "learning_rate": 0.0004779399591955698, | |
| "loss": 3.3313, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 10.195164579085349, | |
| "eval_accuracy": 0.36803965778531816, | |
| "eval_loss": 3.5746335983276367, | |
| "eval_runtime": 177.7657, | |
| "eval_samples_per_second": 93.612, | |
| "eval_steps_per_second": 5.856, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 10.209729099912613, | |
| "grad_norm": 0.33633482456207275, | |
| "learning_rate": 0.00047776508306616143, | |
| "loss": 3.3403, | |
| "step": 35050 | |
| }, | |
| { | |
| "epoch": 10.224293620739878, | |
| "grad_norm": 0.35305869579315186, | |
| "learning_rate": 0.0004775902069367531, | |
| "loss": 3.3432, | |
| "step": 35100 | |
| }, | |
| { | |
| "epoch": 10.238858141567142, | |
| "grad_norm": 0.3344394564628601, | |
| "learning_rate": 0.00047741533080734476, | |
| "loss": 3.3395, | |
| "step": 35150 | |
| }, | |
| { | |
| "epoch": 10.253422662394406, | |
| "grad_norm": 0.3661476969718933, | |
| "learning_rate": 0.0004772404546779364, | |
| "loss": 3.344, | |
| "step": 35200 | |
| }, | |
| { | |
| "epoch": 10.267987183221672, | |
| "grad_norm": 0.3455207049846649, | |
| "learning_rate": 0.0004770655785485281, | |
| "loss": 3.3494, | |
| "step": 35250 | |
| }, | |
| { | |
| "epoch": 10.282551704048936, | |
| "grad_norm": 0.33272865414619446, | |
| "learning_rate": 0.0004768907024191197, | |
| "loss": 3.3453, | |
| "step": 35300 | |
| }, | |
| { | |
| "epoch": 10.297116224876202, | |
| "grad_norm": 0.3241981565952301, | |
| "learning_rate": 0.0004767158262897114, | |
| "loss": 3.348, | |
| "step": 35350 | |
| }, | |
| { | |
| "epoch": 10.311680745703466, | |
| "grad_norm": 0.35547518730163574, | |
| "learning_rate": 0.00047654095016030305, | |
| "loss": 3.3454, | |
| "step": 35400 | |
| }, | |
| { | |
| "epoch": 10.326245266530732, | |
| "grad_norm": 0.3643966019153595, | |
| "learning_rate": 0.0004763660740308948, | |
| "loss": 3.3509, | |
| "step": 35450 | |
| }, | |
| { | |
| "epoch": 10.340809787357996, | |
| "grad_norm": 0.3433777987957001, | |
| "learning_rate": 0.0004761911979014864, | |
| "loss": 3.337, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 10.355374308185262, | |
| "grad_norm": 0.33827000856399536, | |
| "learning_rate": 0.00047601632177207806, | |
| "loss": 3.3625, | |
| "step": 35550 | |
| }, | |
| { | |
| "epoch": 10.369938829012526, | |
| "grad_norm": 0.34222304821014404, | |
| "learning_rate": 0.00047584144564266975, | |
| "loss": 3.3584, | |
| "step": 35600 | |
| }, | |
| { | |
| "epoch": 10.38450334983979, | |
| "grad_norm": 0.3403022885322571, | |
| "learning_rate": 0.0004756665695132614, | |
| "loss": 3.3523, | |
| "step": 35650 | |
| }, | |
| { | |
| "epoch": 10.399067870667055, | |
| "grad_norm": 0.3352959156036377, | |
| "learning_rate": 0.0004754916933838531, | |
| "loss": 3.3585, | |
| "step": 35700 | |
| }, | |
| { | |
| "epoch": 10.41363239149432, | |
| "grad_norm": 0.3228458762168884, | |
| "learning_rate": 0.0004753168172544447, | |
| "loss": 3.3669, | |
| "step": 35750 | |
| }, | |
| { | |
| "epoch": 10.428196912321585, | |
| "grad_norm": 0.3480757474899292, | |
| "learning_rate": 0.00047514194112503635, | |
| "loss": 3.3639, | |
| "step": 35800 | |
| }, | |
| { | |
| "epoch": 10.44276143314885, | |
| "grad_norm": 0.3398342728614807, | |
| "learning_rate": 0.00047496706499562804, | |
| "loss": 3.3651, | |
| "step": 35850 | |
| }, | |
| { | |
| "epoch": 10.457325953976115, | |
| "grad_norm": 0.3550876975059509, | |
| "learning_rate": 0.0004747921888662197, | |
| "loss": 3.3634, | |
| "step": 35900 | |
| }, | |
| { | |
| "epoch": 10.471890474803379, | |
| "grad_norm": 0.35050809383392334, | |
| "learning_rate": 0.0004746173127368114, | |
| "loss": 3.3584, | |
| "step": 35950 | |
| }, | |
| { | |
| "epoch": 10.486454995630643, | |
| "grad_norm": 0.3375207781791687, | |
| "learning_rate": 0.00047444243660740306, | |
| "loss": 3.3625, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 10.486454995630643, | |
| "eval_accuracy": 0.3688550818358119, | |
| "eval_loss": 3.565290927886963, | |
| "eval_runtime": 177.5982, | |
| "eval_samples_per_second": 93.7, | |
| "eval_steps_per_second": 5.862, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 10.501019516457909, | |
| "grad_norm": 0.34734171628952026, | |
| "learning_rate": 0.0004742675604779947, | |
| "loss": 3.3803, | |
| "step": 36050 | |
| }, | |
| { | |
| "epoch": 10.515584037285173, | |
| "grad_norm": 0.34996819496154785, | |
| "learning_rate": 0.0004740926843485864, | |
| "loss": 3.3676, | |
| "step": 36100 | |
| }, | |
| { | |
| "epoch": 10.530148558112439, | |
| "grad_norm": 0.3279918432235718, | |
| "learning_rate": 0.000473917808219178, | |
| "loss": 3.3782, | |
| "step": 36150 | |
| }, | |
| { | |
| "epoch": 10.544713078939703, | |
| "grad_norm": 0.354153573513031, | |
| "learning_rate": 0.0004737429320897697, | |
| "loss": 3.3713, | |
| "step": 36200 | |
| }, | |
| { | |
| "epoch": 10.559277599766968, | |
| "grad_norm": 0.3350876569747925, | |
| "learning_rate": 0.00047356805596036135, | |
| "loss": 3.3706, | |
| "step": 36250 | |
| }, | |
| { | |
| "epoch": 10.573842120594232, | |
| "grad_norm": 0.3531542122364044, | |
| "learning_rate": 0.00047339317983095304, | |
| "loss": 3.3671, | |
| "step": 36300 | |
| }, | |
| { | |
| "epoch": 10.588406641421496, | |
| "grad_norm": 0.31363147497177124, | |
| "learning_rate": 0.0004732183037015447, | |
| "loss": 3.3689, | |
| "step": 36350 | |
| }, | |
| { | |
| "epoch": 10.602971162248762, | |
| "grad_norm": 0.3484424650669098, | |
| "learning_rate": 0.0004730434275721363, | |
| "loss": 3.3738, | |
| "step": 36400 | |
| }, | |
| { | |
| "epoch": 10.617535683076026, | |
| "grad_norm": 0.3363831341266632, | |
| "learning_rate": 0.00047286855144272806, | |
| "loss": 3.3781, | |
| "step": 36450 | |
| }, | |
| { | |
| "epoch": 10.632100203903292, | |
| "grad_norm": 0.3282327353954315, | |
| "learning_rate": 0.0004726936753133197, | |
| "loss": 3.3775, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 10.646664724730556, | |
| "grad_norm": 0.35747599601745605, | |
| "learning_rate": 0.0004725187991839114, | |
| "loss": 3.3739, | |
| "step": 36550 | |
| }, | |
| { | |
| "epoch": 10.661229245557822, | |
| "grad_norm": 0.3524666428565979, | |
| "learning_rate": 0.000472343923054503, | |
| "loss": 3.3857, | |
| "step": 36600 | |
| }, | |
| { | |
| "epoch": 10.675793766385086, | |
| "grad_norm": 0.34615790843963623, | |
| "learning_rate": 0.00047216904692509465, | |
| "loss": 3.374, | |
| "step": 36650 | |
| }, | |
| { | |
| "epoch": 10.690358287212351, | |
| "grad_norm": 0.32927921414375305, | |
| "learning_rate": 0.00047199417079568634, | |
| "loss": 3.3821, | |
| "step": 36700 | |
| }, | |
| { | |
| "epoch": 10.704922808039615, | |
| "grad_norm": 0.3339250981807709, | |
| "learning_rate": 0.000471819294666278, | |
| "loss": 3.3747, | |
| "step": 36750 | |
| }, | |
| { | |
| "epoch": 10.71948732886688, | |
| "grad_norm": 0.3413323760032654, | |
| "learning_rate": 0.00047164441853686967, | |
| "loss": 3.3821, | |
| "step": 36800 | |
| }, | |
| { | |
| "epoch": 10.734051849694145, | |
| "grad_norm": 0.31661462783813477, | |
| "learning_rate": 0.0004714695424074613, | |
| "loss": 3.3896, | |
| "step": 36850 | |
| }, | |
| { | |
| "epoch": 10.74861637052141, | |
| "grad_norm": 0.35339003801345825, | |
| "learning_rate": 0.00047129466627805305, | |
| "loss": 3.3677, | |
| "step": 36900 | |
| }, | |
| { | |
| "epoch": 10.763180891348675, | |
| "grad_norm": 0.3317316174507141, | |
| "learning_rate": 0.0004711197901486447, | |
| "loss": 3.3845, | |
| "step": 36950 | |
| }, | |
| { | |
| "epoch": 10.777745412175939, | |
| "grad_norm": 0.3493446707725525, | |
| "learning_rate": 0.0004709449140192363, | |
| "loss": 3.3663, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 10.777745412175939, | |
| "eval_accuracy": 0.36960982535413733, | |
| "eval_loss": 3.55883526802063, | |
| "eval_runtime": 177.6184, | |
| "eval_samples_per_second": 93.69, | |
| "eval_steps_per_second": 5.861, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 10.792309933003205, | |
| "grad_norm": 0.3213796019554138, | |
| "learning_rate": 0.000470770037889828, | |
| "loss": 3.3895, | |
| "step": 37050 | |
| }, | |
| { | |
| "epoch": 10.806874453830469, | |
| "grad_norm": 0.325885534286499, | |
| "learning_rate": 0.00047059516176041965, | |
| "loss": 3.3857, | |
| "step": 37100 | |
| }, | |
| { | |
| "epoch": 10.821438974657735, | |
| "grad_norm": 0.3278997540473938, | |
| "learning_rate": 0.00047042028563101134, | |
| "loss": 3.3964, | |
| "step": 37150 | |
| }, | |
| { | |
| "epoch": 10.836003495484999, | |
| "grad_norm": 0.31899142265319824, | |
| "learning_rate": 0.000470245409501603, | |
| "loss": 3.3808, | |
| "step": 37200 | |
| }, | |
| { | |
| "epoch": 10.850568016312263, | |
| "grad_norm": 0.3405511677265167, | |
| "learning_rate": 0.0004700705333721946, | |
| "loss": 3.3904, | |
| "step": 37250 | |
| }, | |
| { | |
| "epoch": 10.865132537139528, | |
| "grad_norm": 0.32967904210090637, | |
| "learning_rate": 0.0004698956572427863, | |
| "loss": 3.3937, | |
| "step": 37300 | |
| }, | |
| { | |
| "epoch": 10.879697057966792, | |
| "grad_norm": 0.368004709482193, | |
| "learning_rate": 0.00046972078111337794, | |
| "loss": 3.3859, | |
| "step": 37350 | |
| }, | |
| { | |
| "epoch": 10.894261578794058, | |
| "grad_norm": 0.34228798747062683, | |
| "learning_rate": 0.0004695459049839697, | |
| "loss": 3.3779, | |
| "step": 37400 | |
| }, | |
| { | |
| "epoch": 10.908826099621322, | |
| "grad_norm": 0.31284409761428833, | |
| "learning_rate": 0.0004693710288545613, | |
| "loss": 3.3925, | |
| "step": 37450 | |
| }, | |
| { | |
| "epoch": 10.923390620448588, | |
| "grad_norm": 0.3652680218219757, | |
| "learning_rate": 0.000469196152725153, | |
| "loss": 3.3866, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 10.937955141275852, | |
| "grad_norm": 0.3488202393054962, | |
| "learning_rate": 0.00046902127659574465, | |
| "loss": 3.4059, | |
| "step": 37550 | |
| }, | |
| { | |
| "epoch": 10.952519662103116, | |
| "grad_norm": 0.3468438982963562, | |
| "learning_rate": 0.0004688464004663363, | |
| "loss": 3.3932, | |
| "step": 37600 | |
| }, | |
| { | |
| "epoch": 10.967084182930382, | |
| "grad_norm": 0.3332480788230896, | |
| "learning_rate": 0.000468671524336928, | |
| "loss": 3.3879, | |
| "step": 37650 | |
| }, | |
| { | |
| "epoch": 10.981648703757646, | |
| "grad_norm": 0.3508252501487732, | |
| "learning_rate": 0.0004684966482075196, | |
| "loss": 3.3867, | |
| "step": 37700 | |
| }, | |
| { | |
| "epoch": 10.996213224584912, | |
| "grad_norm": 0.318619966506958, | |
| "learning_rate": 0.0004683217720781113, | |
| "loss": 3.3918, | |
| "step": 37750 | |
| }, | |
| { | |
| "epoch": 11.010777745412176, | |
| "grad_norm": 0.346383273601532, | |
| "learning_rate": 0.00046814689594870294, | |
| "loss": 3.3044, | |
| "step": 37800 | |
| }, | |
| { | |
| "epoch": 11.025342266239441, | |
| "grad_norm": 0.361944317817688, | |
| "learning_rate": 0.0004679720198192946, | |
| "loss": 3.2746, | |
| "step": 37850 | |
| }, | |
| { | |
| "epoch": 11.039906787066705, | |
| "grad_norm": 0.3629739284515381, | |
| "learning_rate": 0.0004677971436898863, | |
| "loss": 3.2872, | |
| "step": 37900 | |
| }, | |
| { | |
| "epoch": 11.054471307893971, | |
| "grad_norm": 0.343257874250412, | |
| "learning_rate": 0.00046762226756047795, | |
| "loss": 3.2868, | |
| "step": 37950 | |
| }, | |
| { | |
| "epoch": 11.069035828721235, | |
| "grad_norm": 0.3298133611679077, | |
| "learning_rate": 0.00046744739143106964, | |
| "loss": 3.2875, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 11.069035828721235, | |
| "eval_accuracy": 0.3689343428410084, | |
| "eval_loss": 3.569741725921631, | |
| "eval_runtime": 177.41, | |
| "eval_samples_per_second": 93.8, | |
| "eval_steps_per_second": 5.868, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 11.0836003495485, | |
| "grad_norm": 0.34320956468582153, | |
| "learning_rate": 0.0004672725153016613, | |
| "loss": 3.304, | |
| "step": 38050 | |
| }, | |
| { | |
| "epoch": 11.098164870375765, | |
| "grad_norm": 0.34359210729599, | |
| "learning_rate": 0.00046709763917225297, | |
| "loss": 3.2989, | |
| "step": 38100 | |
| }, | |
| { | |
| "epoch": 11.112729391203029, | |
| "grad_norm": 0.34714171290397644, | |
| "learning_rate": 0.0004669227630428446, | |
| "loss": 3.3076, | |
| "step": 38150 | |
| }, | |
| { | |
| "epoch": 11.127293912030295, | |
| "grad_norm": 0.36771103739738464, | |
| "learning_rate": 0.00046674788691343624, | |
| "loss": 3.3131, | |
| "step": 38200 | |
| }, | |
| { | |
| "epoch": 11.141858432857559, | |
| "grad_norm": 0.3316338360309601, | |
| "learning_rate": 0.00046657301078402793, | |
| "loss": 3.305, | |
| "step": 38250 | |
| }, | |
| { | |
| "epoch": 11.156422953684825, | |
| "grad_norm": 0.324266254901886, | |
| "learning_rate": 0.00046639813465461957, | |
| "loss": 3.3045, | |
| "step": 38300 | |
| }, | |
| { | |
| "epoch": 11.170987474512089, | |
| "grad_norm": 0.3661448359489441, | |
| "learning_rate": 0.0004662232585252113, | |
| "loss": 3.3088, | |
| "step": 38350 | |
| }, | |
| { | |
| "epoch": 11.185551995339353, | |
| "grad_norm": 0.33063098788261414, | |
| "learning_rate": 0.00046604838239580295, | |
| "loss": 3.319, | |
| "step": 38400 | |
| }, | |
| { | |
| "epoch": 11.200116516166618, | |
| "grad_norm": 0.347318172454834, | |
| "learning_rate": 0.0004658735062663946, | |
| "loss": 3.3146, | |
| "step": 38450 | |
| }, | |
| { | |
| "epoch": 11.214681036993882, | |
| "grad_norm": 0.34231555461883545, | |
| "learning_rate": 0.0004656986301369863, | |
| "loss": 3.3067, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 11.229245557821148, | |
| "grad_norm": 0.3556668162345886, | |
| "learning_rate": 0.0004655237540075779, | |
| "loss": 3.318, | |
| "step": 38550 | |
| }, | |
| { | |
| "epoch": 11.243810078648412, | |
| "grad_norm": 0.34402233362197876, | |
| "learning_rate": 0.0004653488778781696, | |
| "loss": 3.3317, | |
| "step": 38600 | |
| }, | |
| { | |
| "epoch": 11.258374599475678, | |
| "grad_norm": 0.36647072434425354, | |
| "learning_rate": 0.00046517400174876124, | |
| "loss": 3.3338, | |
| "step": 38650 | |
| }, | |
| { | |
| "epoch": 11.272939120302942, | |
| "grad_norm": 0.34458011388778687, | |
| "learning_rate": 0.0004649991256193529, | |
| "loss": 3.34, | |
| "step": 38700 | |
| }, | |
| { | |
| "epoch": 11.287503641130208, | |
| "grad_norm": 0.35463765263557434, | |
| "learning_rate": 0.00046482424948994457, | |
| "loss": 3.3393, | |
| "step": 38750 | |
| }, | |
| { | |
| "epoch": 11.302068161957472, | |
| "grad_norm": 0.36296704411506653, | |
| "learning_rate": 0.0004646493733605362, | |
| "loss": 3.3322, | |
| "step": 38800 | |
| }, | |
| { | |
| "epoch": 11.316632682784736, | |
| "grad_norm": 0.334695041179657, | |
| "learning_rate": 0.00046447449723112795, | |
| "loss": 3.3302, | |
| "step": 38850 | |
| }, | |
| { | |
| "epoch": 11.331197203612001, | |
| "grad_norm": 0.3402169346809387, | |
| "learning_rate": 0.0004642996211017196, | |
| "loss": 3.3366, | |
| "step": 38900 | |
| }, | |
| { | |
| "epoch": 11.345761724439265, | |
| "grad_norm": 0.35098734498023987, | |
| "learning_rate": 0.0004641247449723113, | |
| "loss": 3.3427, | |
| "step": 38950 | |
| }, | |
| { | |
| "epoch": 11.360326245266531, | |
| "grad_norm": 0.37707361578941345, | |
| "learning_rate": 0.0004639498688429029, | |
| "loss": 3.3281, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 11.360326245266531, | |
| "eval_accuracy": 0.3694583592195778, | |
| "eval_loss": 3.564202308654785, | |
| "eval_runtime": 177.5, | |
| "eval_samples_per_second": 93.752, | |
| "eval_steps_per_second": 5.865, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 11.374890766093795, | |
| "grad_norm": 0.33432015776634216, | |
| "learning_rate": 0.00046377499271349455, | |
| "loss": 3.3417, | |
| "step": 39050 | |
| }, | |
| { | |
| "epoch": 11.389455286921061, | |
| "grad_norm": 0.34372082352638245, | |
| "learning_rate": 0.00046360011658408624, | |
| "loss": 3.3352, | |
| "step": 39100 | |
| }, | |
| { | |
| "epoch": 11.404019807748325, | |
| "grad_norm": 0.3262201249599457, | |
| "learning_rate": 0.00046342524045467787, | |
| "loss": 3.3347, | |
| "step": 39150 | |
| }, | |
| { | |
| "epoch": 11.418584328575589, | |
| "grad_norm": 0.3505764901638031, | |
| "learning_rate": 0.00046325036432526956, | |
| "loss": 3.3344, | |
| "step": 39200 | |
| }, | |
| { | |
| "epoch": 11.433148849402855, | |
| "grad_norm": 0.34340110421180725, | |
| "learning_rate": 0.0004630754881958612, | |
| "loss": 3.3473, | |
| "step": 39250 | |
| }, | |
| { | |
| "epoch": 11.447713370230119, | |
| "grad_norm": 0.33771929144859314, | |
| "learning_rate": 0.00046290061206645284, | |
| "loss": 3.3516, | |
| "step": 39300 | |
| }, | |
| { | |
| "epoch": 11.462277891057385, | |
| "grad_norm": 0.3311103880405426, | |
| "learning_rate": 0.0004627257359370446, | |
| "loss": 3.3527, | |
| "step": 39350 | |
| }, | |
| { | |
| "epoch": 11.476842411884649, | |
| "grad_norm": 0.36507630348205566, | |
| "learning_rate": 0.0004625508598076362, | |
| "loss": 3.3381, | |
| "step": 39400 | |
| }, | |
| { | |
| "epoch": 11.491406932711914, | |
| "grad_norm": 0.36566632986068726, | |
| "learning_rate": 0.0004623759836782279, | |
| "loss": 3.3481, | |
| "step": 39450 | |
| }, | |
| { | |
| "epoch": 11.505971453539178, | |
| "grad_norm": 0.3532668352127075, | |
| "learning_rate": 0.00046220110754881954, | |
| "loss": 3.3388, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 11.520535974366444, | |
| "grad_norm": 0.3642367720603943, | |
| "learning_rate": 0.00046202623141941123, | |
| "loss": 3.3464, | |
| "step": 39550 | |
| }, | |
| { | |
| "epoch": 11.535100495193708, | |
| "grad_norm": 0.3446687161922455, | |
| "learning_rate": 0.00046185135529000287, | |
| "loss": 3.3419, | |
| "step": 39600 | |
| }, | |
| { | |
| "epoch": 11.549665016020972, | |
| "grad_norm": 0.32603853940963745, | |
| "learning_rate": 0.0004616764791605945, | |
| "loss": 3.345, | |
| "step": 39650 | |
| }, | |
| { | |
| "epoch": 11.564229536848238, | |
| "grad_norm": 0.3597732186317444, | |
| "learning_rate": 0.0004615016030311862, | |
| "loss": 3.3517, | |
| "step": 39700 | |
| }, | |
| { | |
| "epoch": 11.578794057675502, | |
| "grad_norm": 0.3671717643737793, | |
| "learning_rate": 0.00046132672690177783, | |
| "loss": 3.3627, | |
| "step": 39750 | |
| }, | |
| { | |
| "epoch": 11.593358578502768, | |
| "grad_norm": 0.326857328414917, | |
| "learning_rate": 0.0004611518507723696, | |
| "loss": 3.356, | |
| "step": 39800 | |
| }, | |
| { | |
| "epoch": 11.607923099330032, | |
| "grad_norm": 0.355274498462677, | |
| "learning_rate": 0.0004609769746429612, | |
| "loss": 3.352, | |
| "step": 39850 | |
| }, | |
| { | |
| "epoch": 11.622487620157298, | |
| "grad_norm": 0.35776060819625854, | |
| "learning_rate": 0.00046080209851355285, | |
| "loss": 3.3488, | |
| "step": 39900 | |
| }, | |
| { | |
| "epoch": 11.637052140984562, | |
| "grad_norm": 0.3599199950695038, | |
| "learning_rate": 0.00046062722238414454, | |
| "loss": 3.3537, | |
| "step": 39950 | |
| }, | |
| { | |
| "epoch": 11.651616661811826, | |
| "grad_norm": 0.38023436069488525, | |
| "learning_rate": 0.0004604523462547362, | |
| "loss": 3.3652, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 11.651616661811826, | |
| "eval_accuracy": 0.36993510123006257, | |
| "eval_loss": 3.5553505420684814, | |
| "eval_runtime": 177.5604, | |
| "eval_samples_per_second": 93.72, | |
| "eval_steps_per_second": 5.863, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 11.666181182639091, | |
| "grad_norm": 0.33803287148475647, | |
| "learning_rate": 0.00046027747012532787, | |
| "loss": 3.3708, | |
| "step": 40050 | |
| }, | |
| { | |
| "epoch": 11.680745703466355, | |
| "grad_norm": 0.3471790850162506, | |
| "learning_rate": 0.0004601025939959195, | |
| "loss": 3.3482, | |
| "step": 40100 | |
| }, | |
| { | |
| "epoch": 11.695310224293621, | |
| "grad_norm": 0.3702428340911865, | |
| "learning_rate": 0.0004599277178665112, | |
| "loss": 3.3483, | |
| "step": 40150 | |
| }, | |
| { | |
| "epoch": 11.709874745120885, | |
| "grad_norm": 0.341546893119812, | |
| "learning_rate": 0.00045975284173710283, | |
| "loss": 3.3534, | |
| "step": 40200 | |
| }, | |
| { | |
| "epoch": 11.724439265948151, | |
| "grad_norm": 0.34679174423217773, | |
| "learning_rate": 0.00045957796560769446, | |
| "loss": 3.3671, | |
| "step": 40250 | |
| }, | |
| { | |
| "epoch": 11.739003786775415, | |
| "grad_norm": 0.3513627350330353, | |
| "learning_rate": 0.0004594030894782862, | |
| "loss": 3.361, | |
| "step": 40300 | |
| }, | |
| { | |
| "epoch": 11.75356830760268, | |
| "grad_norm": 0.34741392731666565, | |
| "learning_rate": 0.00045922821334887785, | |
| "loss": 3.3687, | |
| "step": 40350 | |
| }, | |
| { | |
| "epoch": 11.768132828429945, | |
| "grad_norm": 0.36889341473579407, | |
| "learning_rate": 0.00045905333721946954, | |
| "loss": 3.3538, | |
| "step": 40400 | |
| }, | |
| { | |
| "epoch": 11.782697349257209, | |
| "grad_norm": 0.3231208622455597, | |
| "learning_rate": 0.00045887846109006117, | |
| "loss": 3.3654, | |
| "step": 40450 | |
| }, | |
| { | |
| "epoch": 11.797261870084474, | |
| "grad_norm": 0.350035160779953, | |
| "learning_rate": 0.0004587035849606528, | |
| "loss": 3.3555, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 11.811826390911738, | |
| "grad_norm": 0.33473822474479675, | |
| "learning_rate": 0.0004585287088312445, | |
| "loss": 3.3757, | |
| "step": 40550 | |
| }, | |
| { | |
| "epoch": 11.826390911739004, | |
| "grad_norm": 0.3523299992084503, | |
| "learning_rate": 0.00045835383270183613, | |
| "loss": 3.3593, | |
| "step": 40600 | |
| }, | |
| { | |
| "epoch": 11.840955432566268, | |
| "grad_norm": 0.336702823638916, | |
| "learning_rate": 0.0004581789565724278, | |
| "loss": 3.3607, | |
| "step": 40650 | |
| }, | |
| { | |
| "epoch": 11.855519953393534, | |
| "grad_norm": 0.353645384311676, | |
| "learning_rate": 0.00045800408044301946, | |
| "loss": 3.3705, | |
| "step": 40700 | |
| }, | |
| { | |
| "epoch": 11.870084474220798, | |
| "grad_norm": 0.35596734285354614, | |
| "learning_rate": 0.0004578292043136111, | |
| "loss": 3.3642, | |
| "step": 40750 | |
| }, | |
| { | |
| "epoch": 11.884648995048064, | |
| "grad_norm": 0.35289227962493896, | |
| "learning_rate": 0.00045765432818420284, | |
| "loss": 3.3708, | |
| "step": 40800 | |
| }, | |
| { | |
| "epoch": 11.899213515875328, | |
| "grad_norm": 0.3323861062526703, | |
| "learning_rate": 0.0004574794520547945, | |
| "loss": 3.3633, | |
| "step": 40850 | |
| }, | |
| { | |
| "epoch": 11.913778036702592, | |
| "grad_norm": 0.3809449374675751, | |
| "learning_rate": 0.00045730457592538617, | |
| "loss": 3.3729, | |
| "step": 40900 | |
| }, | |
| { | |
| "epoch": 11.928342557529858, | |
| "grad_norm": 0.361569344997406, | |
| "learning_rate": 0.0004571296997959778, | |
| "loss": 3.3697, | |
| "step": 40950 | |
| }, | |
| { | |
| "epoch": 11.942907078357122, | |
| "grad_norm": 0.3860667645931244, | |
| "learning_rate": 0.0004569548236665695, | |
| "loss": 3.3736, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 11.942907078357122, | |
| "eval_accuracy": 0.3703710367586435, | |
| "eval_loss": 3.5485339164733887, | |
| "eval_runtime": 177.6718, | |
| "eval_samples_per_second": 93.661, | |
| "eval_steps_per_second": 5.859, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 11.957471599184387, | |
| "grad_norm": 0.3505721390247345, | |
| "learning_rate": 0.00045677994753716113, | |
| "loss": 3.3755, | |
| "step": 41050 | |
| }, | |
| { | |
| "epoch": 11.972036120011651, | |
| "grad_norm": 0.35810521245002747, | |
| "learning_rate": 0.00045660507140775277, | |
| "loss": 3.3569, | |
| "step": 41100 | |
| }, | |
| { | |
| "epoch": 11.986600640838917, | |
| "grad_norm": 0.3344600796699524, | |
| "learning_rate": 0.00045643019527834446, | |
| "loss": 3.3613, | |
| "step": 41150 | |
| }, | |
| { | |
| "epoch": 12.001165161666181, | |
| "grad_norm": 0.34730231761932373, | |
| "learning_rate": 0.0004562553191489361, | |
| "loss": 3.36, | |
| "step": 41200 | |
| }, | |
| { | |
| "epoch": 12.015729682493445, | |
| "grad_norm": 0.3313257694244385, | |
| "learning_rate": 0.00045608044301952784, | |
| "loss": 3.269, | |
| "step": 41250 | |
| }, | |
| { | |
| "epoch": 12.030294203320711, | |
| "grad_norm": 0.3686826825141907, | |
| "learning_rate": 0.0004559055668901195, | |
| "loss": 3.2643, | |
| "step": 41300 | |
| }, | |
| { | |
| "epoch": 12.044858724147975, | |
| "grad_norm": 0.36742156744003296, | |
| "learning_rate": 0.0004557306907607111, | |
| "loss": 3.2595, | |
| "step": 41350 | |
| }, | |
| { | |
| "epoch": 12.05942324497524, | |
| "grad_norm": 0.34834322333335876, | |
| "learning_rate": 0.0004555558146313028, | |
| "loss": 3.2717, | |
| "step": 41400 | |
| }, | |
| { | |
| "epoch": 12.073987765802505, | |
| "grad_norm": 0.3443371057510376, | |
| "learning_rate": 0.00045538093850189444, | |
| "loss": 3.2769, | |
| "step": 41450 | |
| }, | |
| { | |
| "epoch": 12.08855228662977, | |
| "grad_norm": 0.352764755487442, | |
| "learning_rate": 0.00045520606237248613, | |
| "loss": 3.2705, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 12.103116807457035, | |
| "grad_norm": 0.3674822151660919, | |
| "learning_rate": 0.00045503118624307776, | |
| "loss": 3.2857, | |
| "step": 41550 | |
| }, | |
| { | |
| "epoch": 12.117681328284299, | |
| "grad_norm": 0.3471083343029022, | |
| "learning_rate": 0.00045485631011366945, | |
| "loss": 3.2775, | |
| "step": 41600 | |
| }, | |
| { | |
| "epoch": 12.132245849111564, | |
| "grad_norm": 0.36739566922187805, | |
| "learning_rate": 0.0004546814339842611, | |
| "loss": 3.2886, | |
| "step": 41650 | |
| }, | |
| { | |
| "epoch": 12.146810369938828, | |
| "grad_norm": 0.3347199261188507, | |
| "learning_rate": 0.0004545065578548527, | |
| "loss": 3.2946, | |
| "step": 41700 | |
| }, | |
| { | |
| "epoch": 12.161374890766094, | |
| "grad_norm": 0.42980143427848816, | |
| "learning_rate": 0.00045433168172544447, | |
| "loss": 3.2904, | |
| "step": 41750 | |
| }, | |
| { | |
| "epoch": 12.175939411593358, | |
| "grad_norm": 0.3469353914260864, | |
| "learning_rate": 0.0004541568055960361, | |
| "loss": 3.2946, | |
| "step": 41800 | |
| }, | |
| { | |
| "epoch": 12.190503932420624, | |
| "grad_norm": 0.3609008491039276, | |
| "learning_rate": 0.0004539819294666278, | |
| "loss": 3.3039, | |
| "step": 41850 | |
| }, | |
| { | |
| "epoch": 12.205068453247888, | |
| "grad_norm": 0.3403509855270386, | |
| "learning_rate": 0.00045380705333721943, | |
| "loss": 3.2941, | |
| "step": 41900 | |
| }, | |
| { | |
| "epoch": 12.219632974075154, | |
| "grad_norm": 0.35095566511154175, | |
| "learning_rate": 0.00045363217720781107, | |
| "loss": 3.2979, | |
| "step": 41950 | |
| }, | |
| { | |
| "epoch": 12.234197494902418, | |
| "grad_norm": 0.3400515019893646, | |
| "learning_rate": 0.00045345730107840276, | |
| "loss": 3.2965, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 12.234197494902418, | |
| "eval_accuracy": 0.37021039798550043, | |
| "eval_loss": 3.562443256378174, | |
| "eval_runtime": 177.9087, | |
| "eval_samples_per_second": 93.537, | |
| "eval_steps_per_second": 5.851, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 12.248762015729682, | |
| "grad_norm": 0.32279014587402344, | |
| "learning_rate": 0.0004532824249489944, | |
| "loss": 3.3031, | |
| "step": 42050 | |
| }, | |
| { | |
| "epoch": 12.263326536556947, | |
| "grad_norm": 0.32717186212539673, | |
| "learning_rate": 0.0004531075488195861, | |
| "loss": 3.3086, | |
| "step": 42100 | |
| }, | |
| { | |
| "epoch": 12.277891057384211, | |
| "grad_norm": 0.3563939034938812, | |
| "learning_rate": 0.0004529326726901777, | |
| "loss": 3.3055, | |
| "step": 42150 | |
| }, | |
| { | |
| "epoch": 12.292455578211477, | |
| "grad_norm": 0.3626421391963959, | |
| "learning_rate": 0.00045275779656076947, | |
| "loss": 3.3105, | |
| "step": 42200 | |
| }, | |
| { | |
| "epoch": 12.307020099038741, | |
| "grad_norm": 0.36385101079940796, | |
| "learning_rate": 0.0004525829204313611, | |
| "loss": 3.3156, | |
| "step": 42250 | |
| }, | |
| { | |
| "epoch": 12.321584619866007, | |
| "grad_norm": 0.3256656229496002, | |
| "learning_rate": 0.00045240804430195274, | |
| "loss": 3.3008, | |
| "step": 42300 | |
| }, | |
| { | |
| "epoch": 12.336149140693271, | |
| "grad_norm": 0.34348002076148987, | |
| "learning_rate": 0.00045223316817254443, | |
| "loss": 3.3234, | |
| "step": 42350 | |
| }, | |
| { | |
| "epoch": 12.350713661520537, | |
| "grad_norm": 0.37969276309013367, | |
| "learning_rate": 0.00045205829204313607, | |
| "loss": 3.3084, | |
| "step": 42400 | |
| }, | |
| { | |
| "epoch": 12.3652781823478, | |
| "grad_norm": 0.3616692125797272, | |
| "learning_rate": 0.00045188341591372776, | |
| "loss": 3.3194, | |
| "step": 42450 | |
| }, | |
| { | |
| "epoch": 12.379842703175065, | |
| "grad_norm": 0.3250804543495178, | |
| "learning_rate": 0.0004517085397843194, | |
| "loss": 3.3203, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 12.39440722400233, | |
| "grad_norm": 0.3668137490749359, | |
| "learning_rate": 0.00045153366365491103, | |
| "loss": 3.3219, | |
| "step": 42550 | |
| }, | |
| { | |
| "epoch": 12.408971744829595, | |
| "grad_norm": 0.3868430256843567, | |
| "learning_rate": 0.0004513587875255027, | |
| "loss": 3.3128, | |
| "step": 42600 | |
| }, | |
| { | |
| "epoch": 12.42353626565686, | |
| "grad_norm": 0.3570868968963623, | |
| "learning_rate": 0.00045118391139609436, | |
| "loss": 3.3157, | |
| "step": 42650 | |
| }, | |
| { | |
| "epoch": 12.438100786484124, | |
| "grad_norm": 0.35844898223876953, | |
| "learning_rate": 0.0004510090352666861, | |
| "loss": 3.3106, | |
| "step": 42700 | |
| }, | |
| { | |
| "epoch": 12.45266530731139, | |
| "grad_norm": 0.3538632094860077, | |
| "learning_rate": 0.00045083415913727774, | |
| "loss": 3.324, | |
| "step": 42750 | |
| }, | |
| { | |
| "epoch": 12.467229828138654, | |
| "grad_norm": 0.3571759760379791, | |
| "learning_rate": 0.0004506592830078694, | |
| "loss": 3.3308, | |
| "step": 42800 | |
| }, | |
| { | |
| "epoch": 12.481794348965918, | |
| "grad_norm": 0.3505135178565979, | |
| "learning_rate": 0.00045048440687846106, | |
| "loss": 3.3307, | |
| "step": 42850 | |
| }, | |
| { | |
| "epoch": 12.496358869793184, | |
| "grad_norm": 0.36675187945365906, | |
| "learning_rate": 0.0004503095307490527, | |
| "loss": 3.3453, | |
| "step": 42900 | |
| }, | |
| { | |
| "epoch": 12.510923390620448, | |
| "grad_norm": 0.348091185092926, | |
| "learning_rate": 0.0004501346546196444, | |
| "loss": 3.3215, | |
| "step": 42950 | |
| }, | |
| { | |
| "epoch": 12.525487911447714, | |
| "grad_norm": 0.3269301950931549, | |
| "learning_rate": 0.000449959778490236, | |
| "loss": 3.3183, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 12.525487911447714, | |
| "eval_accuracy": 0.3701031486728309, | |
| "eval_loss": 3.5565950870513916, | |
| "eval_runtime": 177.6773, | |
| "eval_samples_per_second": 93.659, | |
| "eval_steps_per_second": 5.859, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 12.540052432274978, | |
| "grad_norm": 0.35199809074401855, | |
| "learning_rate": 0.0004497849023608277, | |
| "loss": 3.3305, | |
| "step": 43050 | |
| }, | |
| { | |
| "epoch": 12.554616953102244, | |
| "grad_norm": null, | |
| "learning_rate": 0.00044961002623141935, | |
| "loss": 3.3298, | |
| "step": 43100 | |
| }, | |
| { | |
| "epoch": 12.569181473929508, | |
| "grad_norm": 0.3471096158027649, | |
| "learning_rate": 0.000449435150102011, | |
| "loss": 3.3326, | |
| "step": 43150 | |
| }, | |
| { | |
| "epoch": 12.583745994756772, | |
| "grad_norm": 0.3414683938026428, | |
| "learning_rate": 0.00044926027397260273, | |
| "loss": 3.3381, | |
| "step": 43200 | |
| }, | |
| { | |
| "epoch": 12.598310515584037, | |
| "grad_norm": 0.3478773832321167, | |
| "learning_rate": 0.00044908539784319437, | |
| "loss": 3.3287, | |
| "step": 43250 | |
| }, | |
| { | |
| "epoch": 12.612875036411301, | |
| "grad_norm": 0.33130770921707153, | |
| "learning_rate": 0.00044891052171378606, | |
| "loss": 3.3472, | |
| "step": 43300 | |
| }, | |
| { | |
| "epoch": 12.627439557238567, | |
| "grad_norm": 0.37261322140693665, | |
| "learning_rate": 0.0004487356455843777, | |
| "loss": 3.338, | |
| "step": 43350 | |
| }, | |
| { | |
| "epoch": 12.642004078065831, | |
| "grad_norm": 0.3253554701805115, | |
| "learning_rate": 0.00044856076945496933, | |
| "loss": 3.3443, | |
| "step": 43400 | |
| }, | |
| { | |
| "epoch": 12.656568598893097, | |
| "grad_norm": 0.3404645323753357, | |
| "learning_rate": 0.000448385893325561, | |
| "loss": 3.3503, | |
| "step": 43450 | |
| }, | |
| { | |
| "epoch": 12.671133119720361, | |
| "grad_norm": 0.3743685483932495, | |
| "learning_rate": 0.00044821101719615266, | |
| "loss": 3.3357, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 12.685697640547627, | |
| "grad_norm": 0.3546348214149475, | |
| "learning_rate": 0.00044803614106674435, | |
| "loss": 3.3363, | |
| "step": 43550 | |
| }, | |
| { | |
| "epoch": 12.70026216137489, | |
| "grad_norm": 0.3247695565223694, | |
| "learning_rate": 0.000447861264937336, | |
| "loss": 3.3544, | |
| "step": 43600 | |
| }, | |
| { | |
| "epoch": 12.714826682202155, | |
| "grad_norm": 0.366551011800766, | |
| "learning_rate": 0.00044768638880792773, | |
| "loss": 3.3392, | |
| "step": 43650 | |
| }, | |
| { | |
| "epoch": 12.72939120302942, | |
| "grad_norm": 0.3813282549381256, | |
| "learning_rate": 0.00044751151267851937, | |
| "loss": 3.3486, | |
| "step": 43700 | |
| }, | |
| { | |
| "epoch": 12.743955723856685, | |
| "grad_norm": 0.3566057085990906, | |
| "learning_rate": 0.000447336636549111, | |
| "loss": 3.3455, | |
| "step": 43750 | |
| }, | |
| { | |
| "epoch": 12.75852024468395, | |
| "grad_norm": 0.36512717604637146, | |
| "learning_rate": 0.0004471617604197027, | |
| "loss": 3.3554, | |
| "step": 43800 | |
| }, | |
| { | |
| "epoch": 12.773084765511214, | |
| "grad_norm": 0.3692280352115631, | |
| "learning_rate": 0.00044698688429029433, | |
| "loss": 3.3474, | |
| "step": 43850 | |
| }, | |
| { | |
| "epoch": 12.78764928633848, | |
| "grad_norm": 0.35620564222335815, | |
| "learning_rate": 0.000446812008160886, | |
| "loss": 3.3539, | |
| "step": 43900 | |
| }, | |
| { | |
| "epoch": 12.802213807165744, | |
| "grad_norm": 0.3521715998649597, | |
| "learning_rate": 0.00044663713203147766, | |
| "loss": 3.3583, | |
| "step": 43950 | |
| }, | |
| { | |
| "epoch": 12.81677832799301, | |
| "grad_norm": 0.36695396900177, | |
| "learning_rate": 0.0004464622559020693, | |
| "loss": 3.3537, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 12.81677832799301, | |
| "eval_accuracy": 0.3705989415480662, | |
| "eval_loss": 3.5467417240142822, | |
| "eval_runtime": 185.5243, | |
| "eval_samples_per_second": 89.697, | |
| "eval_steps_per_second": 5.611, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 12.831342848820274, | |
| "grad_norm": 0.3474844694137573, | |
| "learning_rate": 0.000446287379772661, | |
| "loss": 3.3493, | |
| "step": 44050 | |
| }, | |
| { | |
| "epoch": 12.845907369647538, | |
| "grad_norm": 0.3925248980522156, | |
| "learning_rate": 0.0004461125036432526, | |
| "loss": 3.3387, | |
| "step": 44100 | |
| }, | |
| { | |
| "epoch": 12.860471890474804, | |
| "grad_norm": 0.35316935181617737, | |
| "learning_rate": 0.00044593762751384436, | |
| "loss": 3.3445, | |
| "step": 44150 | |
| }, | |
| { | |
| "epoch": 12.875036411302068, | |
| "grad_norm": 0.3575705885887146, | |
| "learning_rate": 0.000445762751384436, | |
| "loss": 3.3477, | |
| "step": 44200 | |
| }, | |
| { | |
| "epoch": 12.889600932129333, | |
| "grad_norm": 0.3460574150085449, | |
| "learning_rate": 0.0004455878752550277, | |
| "loss": 3.3678, | |
| "step": 44250 | |
| }, | |
| { | |
| "epoch": 12.904165452956597, | |
| "grad_norm": 0.35342928767204285, | |
| "learning_rate": 0.0004454129991256193, | |
| "loss": 3.34, | |
| "step": 44300 | |
| }, | |
| { | |
| "epoch": 12.918729973783863, | |
| "grad_norm": 0.3239762485027313, | |
| "learning_rate": 0.00044523812299621096, | |
| "loss": 3.3549, | |
| "step": 44350 | |
| }, | |
| { | |
| "epoch": 12.933294494611127, | |
| "grad_norm": 0.3479582369327545, | |
| "learning_rate": 0.00044506324686680265, | |
| "loss": 3.3561, | |
| "step": 44400 | |
| }, | |
| { | |
| "epoch": 12.947859015438393, | |
| "grad_norm": 0.348714143037796, | |
| "learning_rate": 0.0004448883707373943, | |
| "loss": 3.3436, | |
| "step": 44450 | |
| }, | |
| { | |
| "epoch": 12.962423536265657, | |
| "grad_norm": 0.34078627824783325, | |
| "learning_rate": 0.000444713494607986, | |
| "loss": 3.3457, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 12.976988057092921, | |
| "grad_norm": 0.3270307183265686, | |
| "learning_rate": 0.0004445386184785776, | |
| "loss": 3.3583, | |
| "step": 44550 | |
| }, | |
| { | |
| "epoch": 12.991552577920187, | |
| "grad_norm": 0.3412953019142151, | |
| "learning_rate": 0.00044436374234916925, | |
| "loss": 3.3598, | |
| "step": 44600 | |
| }, | |
| { | |
| "epoch": 13.00611709874745, | |
| "grad_norm": 0.3603624701499939, | |
| "learning_rate": 0.000444188866219761, | |
| "loss": 3.3055, | |
| "step": 44650 | |
| }, | |
| { | |
| "epoch": 13.020681619574717, | |
| "grad_norm": 0.34541741013526917, | |
| "learning_rate": 0.00044401399009035263, | |
| "loss": 3.2502, | |
| "step": 44700 | |
| }, | |
| { | |
| "epoch": 13.03524614040198, | |
| "grad_norm": 0.34776028990745544, | |
| "learning_rate": 0.0004438391139609443, | |
| "loss": 3.2458, | |
| "step": 44750 | |
| }, | |
| { | |
| "epoch": 13.049810661229246, | |
| "grad_norm": 0.3470947742462158, | |
| "learning_rate": 0.00044366423783153596, | |
| "loss": 3.2418, | |
| "step": 44800 | |
| }, | |
| { | |
| "epoch": 13.06437518205651, | |
| "grad_norm": 0.37817707657814026, | |
| "learning_rate": 0.0004434893617021276, | |
| "loss": 3.2555, | |
| "step": 44850 | |
| }, | |
| { | |
| "epoch": 13.078939702883774, | |
| "grad_norm": 0.37848788499832153, | |
| "learning_rate": 0.0004433144855727193, | |
| "loss": 3.2488, | |
| "step": 44900 | |
| }, | |
| { | |
| "epoch": 13.09350422371104, | |
| "grad_norm": 0.3598591089248657, | |
| "learning_rate": 0.0004431396094433109, | |
| "loss": 3.274, | |
| "step": 44950 | |
| }, | |
| { | |
| "epoch": 13.108068744538304, | |
| "grad_norm": 0.3582230508327484, | |
| "learning_rate": 0.0004429647333139026, | |
| "loss": 3.2595, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 13.108068744538304, | |
| "eval_accuracy": 0.3703395205132538, | |
| "eval_loss": 3.558483839035034, | |
| "eval_runtime": 177.819, | |
| "eval_samples_per_second": 93.584, | |
| "eval_steps_per_second": 5.854, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 13.12263326536557, | |
| "grad_norm": 0.3447740972042084, | |
| "learning_rate": 0.00044278985718449425, | |
| "loss": 3.2738, | |
| "step": 45050 | |
| }, | |
| { | |
| "epoch": 13.137197786192834, | |
| "grad_norm": 0.36638307571411133, | |
| "learning_rate": 0.000442614981055086, | |
| "loss": 3.2711, | |
| "step": 45100 | |
| }, | |
| { | |
| "epoch": 13.1517623070201, | |
| "grad_norm": 0.33102279901504517, | |
| "learning_rate": 0.00044244010492567763, | |
| "loss": 3.2644, | |
| "step": 45150 | |
| }, | |
| { | |
| "epoch": 13.166326827847364, | |
| "grad_norm": 0.3540899455547333, | |
| "learning_rate": 0.00044226522879626927, | |
| "loss": 3.2714, | |
| "step": 45200 | |
| }, | |
| { | |
| "epoch": 13.180891348674628, | |
| "grad_norm": 0.3566311299800873, | |
| "learning_rate": 0.00044209035266686096, | |
| "loss": 3.2841, | |
| "step": 45250 | |
| }, | |
| { | |
| "epoch": 13.195455869501894, | |
| "grad_norm": 0.373551607131958, | |
| "learning_rate": 0.0004419154765374526, | |
| "loss": 3.2718, | |
| "step": 45300 | |
| }, | |
| { | |
| "epoch": 13.210020390329158, | |
| "grad_norm": 0.41288265585899353, | |
| "learning_rate": 0.0004417406004080443, | |
| "loss": 3.2849, | |
| "step": 45350 | |
| }, | |
| { | |
| "epoch": 13.224584911156423, | |
| "grad_norm": 0.35161110758781433, | |
| "learning_rate": 0.0004415657242786359, | |
| "loss": 3.2853, | |
| "step": 45400 | |
| }, | |
| { | |
| "epoch": 13.239149431983687, | |
| "grad_norm": 0.3606819808483124, | |
| "learning_rate": 0.00044139084814922755, | |
| "loss": 3.286, | |
| "step": 45450 | |
| }, | |
| { | |
| "epoch": 13.253713952810953, | |
| "grad_norm": 0.3907514810562134, | |
| "learning_rate": 0.00044121597201981924, | |
| "loss": 3.2811, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 13.268278473638217, | |
| "grad_norm": 0.355471134185791, | |
| "learning_rate": 0.0004410410958904109, | |
| "loss": 3.2962, | |
| "step": 45550 | |
| }, | |
| { | |
| "epoch": 13.282842994465483, | |
| "grad_norm": 0.36252570152282715, | |
| "learning_rate": 0.0004408662197610026, | |
| "loss": 3.2966, | |
| "step": 45600 | |
| }, | |
| { | |
| "epoch": 13.297407515292747, | |
| "grad_norm": 0.336823433637619, | |
| "learning_rate": 0.00044069134363159426, | |
| "loss": 3.3074, | |
| "step": 45650 | |
| }, | |
| { | |
| "epoch": 13.311972036120011, | |
| "grad_norm": 0.36092522740364075, | |
| "learning_rate": 0.00044051646750218595, | |
| "loss": 3.2994, | |
| "step": 45700 | |
| }, | |
| { | |
| "epoch": 13.326536556947277, | |
| "grad_norm": 0.36176231503486633, | |
| "learning_rate": 0.0004403415913727776, | |
| "loss": 3.3003, | |
| "step": 45750 | |
| }, | |
| { | |
| "epoch": 13.34110107777454, | |
| "grad_norm": 0.36278125643730164, | |
| "learning_rate": 0.0004401667152433692, | |
| "loss": 3.2931, | |
| "step": 45800 | |
| }, | |
| { | |
| "epoch": 13.355665598601806, | |
| "grad_norm": 0.34519457817077637, | |
| "learning_rate": 0.0004399918391139609, | |
| "loss": 3.3055, | |
| "step": 45850 | |
| }, | |
| { | |
| "epoch": 13.37023011942907, | |
| "grad_norm": 0.36939191818237305, | |
| "learning_rate": 0.00043981696298455255, | |
| "loss": 3.3019, | |
| "step": 45900 | |
| }, | |
| { | |
| "epoch": 13.384794640256336, | |
| "grad_norm": 0.3272865116596222, | |
| "learning_rate": 0.00043964208685514424, | |
| "loss": 3.2993, | |
| "step": 45950 | |
| }, | |
| { | |
| "epoch": 13.3993591610836, | |
| "grad_norm": 0.3592440187931061, | |
| "learning_rate": 0.0004394672107257359, | |
| "loss": 3.3001, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 13.3993591610836, | |
| "eval_accuracy": 0.3707306512302919, | |
| "eval_loss": 3.5545082092285156, | |
| "eval_runtime": 178.5043, | |
| "eval_samples_per_second": 93.225, | |
| "eval_steps_per_second": 5.832, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 13.413923681910866, | |
| "grad_norm": 0.33466318249702454, | |
| "learning_rate": 0.0004392923345963275, | |
| "loss": 3.301, | |
| "step": 46050 | |
| }, | |
| { | |
| "epoch": 13.42848820273813, | |
| "grad_norm": 0.3258253335952759, | |
| "learning_rate": 0.00043911745846691926, | |
| "loss": 3.3132, | |
| "step": 46100 | |
| }, | |
| { | |
| "epoch": 13.443052723565394, | |
| "grad_norm": 0.35109880566596985, | |
| "learning_rate": 0.0004389425823375109, | |
| "loss": 3.312, | |
| "step": 46150 | |
| }, | |
| { | |
| "epoch": 13.45761724439266, | |
| "grad_norm": 0.3569508492946625, | |
| "learning_rate": 0.0004387677062081026, | |
| "loss": 3.3208, | |
| "step": 46200 | |
| }, | |
| { | |
| "epoch": 13.472181765219924, | |
| "grad_norm": 0.3373895585536957, | |
| "learning_rate": 0.0004385928300786942, | |
| "loss": 3.306, | |
| "step": 46250 | |
| }, | |
| { | |
| "epoch": 13.48674628604719, | |
| "grad_norm": 0.3376927375793457, | |
| "learning_rate": 0.0004384179539492859, | |
| "loss": 3.3033, | |
| "step": 46300 | |
| }, | |
| { | |
| "epoch": 13.501310806874454, | |
| "grad_norm": 0.4034666419029236, | |
| "learning_rate": 0.00043824307781987755, | |
| "loss": 3.3082, | |
| "step": 46350 | |
| }, | |
| { | |
| "epoch": 13.51587532770172, | |
| "grad_norm": 0.3468514680862427, | |
| "learning_rate": 0.0004380682016904692, | |
| "loss": 3.3109, | |
| "step": 46400 | |
| }, | |
| { | |
| "epoch": 13.530439848528983, | |
| "grad_norm": 0.35658130049705505, | |
| "learning_rate": 0.0004378933255610609, | |
| "loss": 3.323, | |
| "step": 46450 | |
| }, | |
| { | |
| "epoch": 13.545004369356247, | |
| "grad_norm": 0.3441319763660431, | |
| "learning_rate": 0.0004377184494316525, | |
| "loss": 3.3248, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 13.559568890183513, | |
| "grad_norm": 0.3547191619873047, | |
| "learning_rate": 0.00043754357330224426, | |
| "loss": 3.3243, | |
| "step": 46550 | |
| }, | |
| { | |
| "epoch": 13.574133411010777, | |
| "grad_norm": 0.3560062050819397, | |
| "learning_rate": 0.0004373686971728359, | |
| "loss": 3.3055, | |
| "step": 46600 | |
| }, | |
| { | |
| "epoch": 13.588697931838043, | |
| "grad_norm": 0.35935595631599426, | |
| "learning_rate": 0.00043719382104342753, | |
| "loss": 3.318, | |
| "step": 46650 | |
| }, | |
| { | |
| "epoch": 13.603262452665307, | |
| "grad_norm": 0.3890141546726227, | |
| "learning_rate": 0.0004370189449140192, | |
| "loss": 3.3161, | |
| "step": 46700 | |
| }, | |
| { | |
| "epoch": 13.617826973492573, | |
| "grad_norm": 0.35833755135536194, | |
| "learning_rate": 0.00043684406878461085, | |
| "loss": 3.3174, | |
| "step": 46750 | |
| }, | |
| { | |
| "epoch": 13.632391494319837, | |
| "grad_norm": 0.35992640256881714, | |
| "learning_rate": 0.00043666919265520254, | |
| "loss": 3.3209, | |
| "step": 46800 | |
| }, | |
| { | |
| "epoch": 13.6469560151471, | |
| "grad_norm": 0.36018410325050354, | |
| "learning_rate": 0.0004364943165257942, | |
| "loss": 3.3206, | |
| "step": 46850 | |
| }, | |
| { | |
| "epoch": 13.661520535974367, | |
| "grad_norm": 0.37604689598083496, | |
| "learning_rate": 0.0004363194403963858, | |
| "loss": 3.327, | |
| "step": 46900 | |
| }, | |
| { | |
| "epoch": 13.67608505680163, | |
| "grad_norm": 0.3590412735939026, | |
| "learning_rate": 0.0004361445642669775, | |
| "loss": 3.3319, | |
| "step": 46950 | |
| }, | |
| { | |
| "epoch": 13.690649577628896, | |
| "grad_norm": 0.3439190089702606, | |
| "learning_rate": 0.00043596968813756914, | |
| "loss": 3.3331, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 13.690649577628896, | |
| "eval_accuracy": 0.37110143750534336, | |
| "eval_loss": 3.5459725856781006, | |
| "eval_runtime": 178.5404, | |
| "eval_samples_per_second": 93.206, | |
| "eval_steps_per_second": 5.831, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 13.70521409845616, | |
| "grad_norm": 0.3402903378009796, | |
| "learning_rate": 0.0004357948120081609, | |
| "loss": 3.3174, | |
| "step": 47050 | |
| }, | |
| { | |
| "epoch": 13.719778619283426, | |
| "grad_norm": 0.3717620372772217, | |
| "learning_rate": 0.0004356199358787525, | |
| "loss": 3.3351, | |
| "step": 47100 | |
| }, | |
| { | |
| "epoch": 13.73434314011069, | |
| "grad_norm": 0.3349936604499817, | |
| "learning_rate": 0.0004354450597493442, | |
| "loss": 3.3225, | |
| "step": 47150 | |
| }, | |
| { | |
| "epoch": 13.748907660937956, | |
| "grad_norm": 0.35813501477241516, | |
| "learning_rate": 0.00043527018361993585, | |
| "loss": 3.3373, | |
| "step": 47200 | |
| }, | |
| { | |
| "epoch": 13.76347218176522, | |
| "grad_norm": 0.32840263843536377, | |
| "learning_rate": 0.0004350953074905275, | |
| "loss": 3.3305, | |
| "step": 47250 | |
| }, | |
| { | |
| "epoch": 13.778036702592484, | |
| "grad_norm": 0.35672280192375183, | |
| "learning_rate": 0.0004349204313611192, | |
| "loss": 3.3327, | |
| "step": 47300 | |
| }, | |
| { | |
| "epoch": 13.79260122341975, | |
| "grad_norm": 0.3735807240009308, | |
| "learning_rate": 0.0004347455552317108, | |
| "loss": 3.3331, | |
| "step": 47350 | |
| }, | |
| { | |
| "epoch": 13.807165744247014, | |
| "grad_norm": 0.34489327669143677, | |
| "learning_rate": 0.0004345706791023025, | |
| "loss": 3.3362, | |
| "step": 47400 | |
| }, | |
| { | |
| "epoch": 13.82173026507428, | |
| "grad_norm": 0.36615490913391113, | |
| "learning_rate": 0.00043439580297289414, | |
| "loss": 3.3339, | |
| "step": 47450 | |
| }, | |
| { | |
| "epoch": 13.836294785901543, | |
| "grad_norm": 0.3411681056022644, | |
| "learning_rate": 0.0004342209268434858, | |
| "loss": 3.3363, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 13.85085930672881, | |
| "grad_norm": 0.3618829846382141, | |
| "learning_rate": 0.0004340460507140775, | |
| "loss": 3.3285, | |
| "step": 47550 | |
| }, | |
| { | |
| "epoch": 13.865423827556073, | |
| "grad_norm": 0.4026733338832855, | |
| "learning_rate": 0.00043387117458466916, | |
| "loss": 3.3342, | |
| "step": 47600 | |
| }, | |
| { | |
| "epoch": 13.879988348383339, | |
| "grad_norm": 0.35623764991760254, | |
| "learning_rate": 0.00043369629845526085, | |
| "loss": 3.3368, | |
| "step": 47650 | |
| }, | |
| { | |
| "epoch": 13.894552869210603, | |
| "grad_norm": 0.33052340149879456, | |
| "learning_rate": 0.0004335214223258525, | |
| "loss": 3.3291, | |
| "step": 47700 | |
| }, | |
| { | |
| "epoch": 13.909117390037867, | |
| "grad_norm": 0.3380168378353119, | |
| "learning_rate": 0.0004333465461964442, | |
| "loss": 3.3356, | |
| "step": 47750 | |
| }, | |
| { | |
| "epoch": 13.923681910865133, | |
| "grad_norm": 0.401959091424942, | |
| "learning_rate": 0.0004331716700670358, | |
| "loss": 3.3324, | |
| "step": 47800 | |
| }, | |
| { | |
| "epoch": 13.938246431692397, | |
| "grad_norm": 0.3894999623298645, | |
| "learning_rate": 0.00043299679393762745, | |
| "loss": 3.3359, | |
| "step": 47850 | |
| }, | |
| { | |
| "epoch": 13.952810952519663, | |
| "grad_norm": 0.35449472069740295, | |
| "learning_rate": 0.00043282191780821914, | |
| "loss": 3.3198, | |
| "step": 47900 | |
| }, | |
| { | |
| "epoch": 13.967375473346927, | |
| "grad_norm": 0.3355594575405121, | |
| "learning_rate": 0.00043264704167881077, | |
| "loss": 3.3327, | |
| "step": 47950 | |
| }, | |
| { | |
| "epoch": 13.981939994174192, | |
| "grad_norm": 0.35608407855033875, | |
| "learning_rate": 0.0004324721655494025, | |
| "loss": 3.3397, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 13.981939994174192, | |
| "eval_accuracy": 0.37175398842201335, | |
| "eval_loss": 3.540625810623169, | |
| "eval_runtime": 179.0517, | |
| "eval_samples_per_second": 92.94, | |
| "eval_steps_per_second": 5.814, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 13.996504515001456, | |
| "grad_norm": 0.36551040410995483, | |
| "learning_rate": 0.00043229728941999415, | |
| "loss": 3.3388, | |
| "step": 48050 | |
| }, | |
| { | |
| "epoch": 14.01106903582872, | |
| "grad_norm": 0.34817105531692505, | |
| "learning_rate": 0.0004321224132905858, | |
| "loss": 3.2595, | |
| "step": 48100 | |
| }, | |
| { | |
| "epoch": 14.025633556655986, | |
| "grad_norm": 0.3400195837020874, | |
| "learning_rate": 0.0004319475371611775, | |
| "loss": 3.2207, | |
| "step": 48150 | |
| }, | |
| { | |
| "epoch": 14.04019807748325, | |
| "grad_norm": 0.3744097352027893, | |
| "learning_rate": 0.0004317726610317691, | |
| "loss": 3.2301, | |
| "step": 48200 | |
| }, | |
| { | |
| "epoch": 14.054762598310516, | |
| "grad_norm": 0.3383084535598755, | |
| "learning_rate": 0.0004315977849023608, | |
| "loss": 3.2422, | |
| "step": 48250 | |
| }, | |
| { | |
| "epoch": 14.06932711913778, | |
| "grad_norm": 0.35336896777153015, | |
| "learning_rate": 0.00043142290877295244, | |
| "loss": 3.2273, | |
| "step": 48300 | |
| }, | |
| { | |
| "epoch": 14.083891639965046, | |
| "grad_norm": 0.3526099920272827, | |
| "learning_rate": 0.00043124803264354413, | |
| "loss": 3.2395, | |
| "step": 48350 | |
| }, | |
| { | |
| "epoch": 14.09845616079231, | |
| "grad_norm": 0.3588810861110687, | |
| "learning_rate": 0.00043107315651413577, | |
| "loss": 3.2406, | |
| "step": 48400 | |
| }, | |
| { | |
| "epoch": 14.113020681619576, | |
| "grad_norm": 0.36780351400375366, | |
| "learning_rate": 0.0004308982803847274, | |
| "loss": 3.2556, | |
| "step": 48450 | |
| }, | |
| { | |
| "epoch": 14.12758520244684, | |
| "grad_norm": 0.3457690477371216, | |
| "learning_rate": 0.00043072340425531915, | |
| "loss": 3.2578, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 14.142149723274104, | |
| "grad_norm": 0.3812199831008911, | |
| "learning_rate": 0.0004305485281259108, | |
| "loss": 3.2557, | |
| "step": 48550 | |
| }, | |
| { | |
| "epoch": 14.15671424410137, | |
| "grad_norm": 0.37096258997917175, | |
| "learning_rate": 0.0004303736519965025, | |
| "loss": 3.261, | |
| "step": 48600 | |
| }, | |
| { | |
| "epoch": 14.171278764928633, | |
| "grad_norm": 0.3497552275657654, | |
| "learning_rate": 0.0004301987758670941, | |
| "loss": 3.269, | |
| "step": 48650 | |
| }, | |
| { | |
| "epoch": 14.1858432857559, | |
| "grad_norm": 0.3859023153781891, | |
| "learning_rate": 0.00043002389973768575, | |
| "loss": 3.2629, | |
| "step": 48700 | |
| }, | |
| { | |
| "epoch": 14.200407806583163, | |
| "grad_norm": 0.35794612765312195, | |
| "learning_rate": 0.00042984902360827744, | |
| "loss": 3.2692, | |
| "step": 48750 | |
| }, | |
| { | |
| "epoch": 14.214972327410429, | |
| "grad_norm": 0.37541714310646057, | |
| "learning_rate": 0.0004296741474788691, | |
| "loss": 3.2796, | |
| "step": 48800 | |
| }, | |
| { | |
| "epoch": 14.229536848237693, | |
| "grad_norm": 0.3757866322994232, | |
| "learning_rate": 0.00042949927134946077, | |
| "loss": 3.2682, | |
| "step": 48850 | |
| }, | |
| { | |
| "epoch": 14.244101369064957, | |
| "grad_norm": 0.3648219406604767, | |
| "learning_rate": 0.0004293243952200524, | |
| "loss": 3.2742, | |
| "step": 48900 | |
| }, | |
| { | |
| "epoch": 14.258665889892223, | |
| "grad_norm": 0.36085084080696106, | |
| "learning_rate": 0.00042914951909064415, | |
| "loss": 3.2738, | |
| "step": 48950 | |
| }, | |
| { | |
| "epoch": 14.273230410719487, | |
| "grad_norm": 0.3444569706916809, | |
| "learning_rate": 0.0004289746429612358, | |
| "loss": 3.277, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 14.273230410719487, | |
| "eval_accuracy": 0.37122232817795764, | |
| "eval_loss": 3.556274652481079, | |
| "eval_runtime": 179.9174, | |
| "eval_samples_per_second": 92.492, | |
| "eval_steps_per_second": 5.786, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 14.287794931546753, | |
| "grad_norm": 0.34347397089004517, | |
| "learning_rate": 0.0004287997668318274, | |
| "loss": 3.2827, | |
| "step": 49050 | |
| }, | |
| { | |
| "epoch": 14.302359452374017, | |
| "grad_norm": 0.361592173576355, | |
| "learning_rate": 0.0004286248907024191, | |
| "loss": 3.2691, | |
| "step": 49100 | |
| }, | |
| { | |
| "epoch": 14.316923973201282, | |
| "grad_norm": 0.36173015832901, | |
| "learning_rate": 0.00042845001457301075, | |
| "loss": 3.2783, | |
| "step": 49150 | |
| }, | |
| { | |
| "epoch": 14.331488494028546, | |
| "grad_norm": 0.3683636784553528, | |
| "learning_rate": 0.00042827513844360244, | |
| "loss": 3.2796, | |
| "step": 49200 | |
| }, | |
| { | |
| "epoch": 14.346053014855812, | |
| "grad_norm": 0.36255374550819397, | |
| "learning_rate": 0.00042810026231419407, | |
| "loss": 3.2864, | |
| "step": 49250 | |
| }, | |
| { | |
| "epoch": 14.360617535683076, | |
| "grad_norm": 0.3432329595088959, | |
| "learning_rate": 0.0004279253861847857, | |
| "loss": 3.282, | |
| "step": 49300 | |
| }, | |
| { | |
| "epoch": 14.37518205651034, | |
| "grad_norm": 0.35291415452957153, | |
| "learning_rate": 0.0004277505100553774, | |
| "loss": 3.2799, | |
| "step": 49350 | |
| }, | |
| { | |
| "epoch": 14.389746577337606, | |
| "grad_norm": 0.3346937298774719, | |
| "learning_rate": 0.00042757563392596904, | |
| "loss": 3.2942, | |
| "step": 49400 | |
| }, | |
| { | |
| "epoch": 14.40431109816487, | |
| "grad_norm": 0.3730728328227997, | |
| "learning_rate": 0.0004274007577965608, | |
| "loss": 3.3006, | |
| "step": 49450 | |
| }, | |
| { | |
| "epoch": 14.418875618992136, | |
| "grad_norm": 0.36969706416130066, | |
| "learning_rate": 0.0004272258816671524, | |
| "loss": 3.2807, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 14.4334401398194, | |
| "grad_norm": 0.3247165083885193, | |
| "learning_rate": 0.00042705100553774405, | |
| "loss": 3.2888, | |
| "step": 49550 | |
| }, | |
| { | |
| "epoch": 14.448004660646665, | |
| "grad_norm": 0.3538820147514343, | |
| "learning_rate": 0.00042687612940833574, | |
| "loss": 3.3002, | |
| "step": 49600 | |
| }, | |
| { | |
| "epoch": 14.46256918147393, | |
| "grad_norm": 0.38062334060668945, | |
| "learning_rate": 0.0004267012532789274, | |
| "loss": 3.2894, | |
| "step": 49650 | |
| }, | |
| { | |
| "epoch": 14.477133702301193, | |
| "grad_norm": 0.35229548811912537, | |
| "learning_rate": 0.00042652637714951907, | |
| "loss": 3.2941, | |
| "step": 49700 | |
| }, | |
| { | |
| "epoch": 14.49169822312846, | |
| "grad_norm": 0.37731724977493286, | |
| "learning_rate": 0.0004263515010201107, | |
| "loss": 3.3014, | |
| "step": 49750 | |
| }, | |
| { | |
| "epoch": 14.506262743955723, | |
| "grad_norm": 0.3561221659183502, | |
| "learning_rate": 0.0004261766248907024, | |
| "loss": 3.2885, | |
| "step": 49800 | |
| }, | |
| { | |
| "epoch": 14.520827264782989, | |
| "grad_norm": 0.3692299723625183, | |
| "learning_rate": 0.00042600174876129403, | |
| "loss": 3.301, | |
| "step": 49850 | |
| }, | |
| { | |
| "epoch": 14.535391785610253, | |
| "grad_norm": 0.3614572584629059, | |
| "learning_rate": 0.00042582687263188567, | |
| "loss": 3.3106, | |
| "step": 49900 | |
| }, | |
| { | |
| "epoch": 14.549956306437519, | |
| "grad_norm": 0.3334461748600006, | |
| "learning_rate": 0.0004256519965024774, | |
| "loss": 3.3026, | |
| "step": 49950 | |
| }, | |
| { | |
| "epoch": 14.564520827264783, | |
| "grad_norm": 0.34117934107780457, | |
| "learning_rate": 0.00042547712037306905, | |
| "loss": 3.3188, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 14.564520827264783, | |
| "eval_accuracy": 0.3717561051847634, | |
| "eval_loss": 3.54461407661438, | |
| "eval_runtime": 178.8677, | |
| "eval_samples_per_second": 93.035, | |
| "eval_steps_per_second": 5.82, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 14.579085348092049, | |
| "grad_norm": 0.3184276521205902, | |
| "learning_rate": 0.00042530224424366074, | |
| "loss": 3.2982, | |
| "step": 50050 | |
| }, | |
| { | |
| "epoch": 14.593649868919313, | |
| "grad_norm": 0.33841148018836975, | |
| "learning_rate": 0.0004251273681142524, | |
| "loss": 3.3045, | |
| "step": 50100 | |
| }, | |
| { | |
| "epoch": 14.608214389746577, | |
| "grad_norm": 0.3844097852706909, | |
| "learning_rate": 0.000424952491984844, | |
| "loss": 3.3094, | |
| "step": 50150 | |
| }, | |
| { | |
| "epoch": 14.622778910573842, | |
| "grad_norm": 0.39223310351371765, | |
| "learning_rate": 0.0004247776158554357, | |
| "loss": 3.3074, | |
| "step": 50200 | |
| }, | |
| { | |
| "epoch": 14.637343431401106, | |
| "grad_norm": 0.3735564947128296, | |
| "learning_rate": 0.00042460273972602734, | |
| "loss": 3.3114, | |
| "step": 50250 | |
| }, | |
| { | |
| "epoch": 14.651907952228372, | |
| "grad_norm": 0.3553323447704315, | |
| "learning_rate": 0.00042442786359661903, | |
| "loss": 3.3107, | |
| "step": 50300 | |
| }, | |
| { | |
| "epoch": 14.666472473055636, | |
| "grad_norm": 0.33939993381500244, | |
| "learning_rate": 0.00042425298746721066, | |
| "loss": 3.3038, | |
| "step": 50350 | |
| }, | |
| { | |
| "epoch": 14.681036993882902, | |
| "grad_norm": 0.34699302911758423, | |
| "learning_rate": 0.0004240781113378024, | |
| "loss": 3.313, | |
| "step": 50400 | |
| }, | |
| { | |
| "epoch": 14.695601514710166, | |
| "grad_norm": 0.34914663434028625, | |
| "learning_rate": 0.00042390323520839405, | |
| "loss": 3.3057, | |
| "step": 50450 | |
| }, | |
| { | |
| "epoch": 14.71016603553743, | |
| "grad_norm": 0.35503461956977844, | |
| "learning_rate": 0.0004237283590789857, | |
| "loss": 3.3047, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 14.724730556364696, | |
| "grad_norm": 0.3494664132595062, | |
| "learning_rate": 0.00042355348294957737, | |
| "loss": 3.322, | |
| "step": 50550 | |
| }, | |
| { | |
| "epoch": 14.73929507719196, | |
| "grad_norm": 0.3523366451263428, | |
| "learning_rate": 0.000423378606820169, | |
| "loss": 3.3063, | |
| "step": 50600 | |
| }, | |
| { | |
| "epoch": 14.753859598019226, | |
| "grad_norm": 0.344511479139328, | |
| "learning_rate": 0.0004232037306907607, | |
| "loss": 3.3114, | |
| "step": 50650 | |
| }, | |
| { | |
| "epoch": 14.76842411884649, | |
| "grad_norm": 0.372232049703598, | |
| "learning_rate": 0.00042302885456135233, | |
| "loss": 3.3257, | |
| "step": 50700 | |
| }, | |
| { | |
| "epoch": 14.782988639673755, | |
| "grad_norm": 0.3332023024559021, | |
| "learning_rate": 0.00042285397843194397, | |
| "loss": 3.3077, | |
| "step": 50750 | |
| }, | |
| { | |
| "epoch": 14.79755316050102, | |
| "grad_norm": 0.3506964445114136, | |
| "learning_rate": 0.00042267910230253566, | |
| "loss": 3.3047, | |
| "step": 50800 | |
| }, | |
| { | |
| "epoch": 14.812117681328285, | |
| "grad_norm": 0.36583006381988525, | |
| "learning_rate": 0.0004225042261731273, | |
| "loss": 3.3078, | |
| "step": 50850 | |
| }, | |
| { | |
| "epoch": 14.826682202155549, | |
| "grad_norm": 0.36147186160087585, | |
| "learning_rate": 0.00042232935004371904, | |
| "loss": 3.3099, | |
| "step": 50900 | |
| }, | |
| { | |
| "epoch": 14.841246722982813, | |
| "grad_norm": 0.3693869113922119, | |
| "learning_rate": 0.0004221544739143107, | |
| "loss": 3.3175, | |
| "step": 50950 | |
| }, | |
| { | |
| "epoch": 14.855811243810079, | |
| "grad_norm": 0.33687424659729004, | |
| "learning_rate": 0.00042197959778490237, | |
| "loss": 3.3368, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 14.855811243810079, | |
| "eval_accuracy": 0.3719887138914084, | |
| "eval_loss": 3.539015054702759, | |
| "eval_runtime": 220.2245, | |
| "eval_samples_per_second": 75.564, | |
| "eval_steps_per_second": 4.727, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 14.870375764637343, | |
| "grad_norm": 0.36331212520599365, | |
| "learning_rate": 0.000421804721655494, | |
| "loss": 3.3188, | |
| "step": 51050 | |
| }, | |
| { | |
| "epoch": 14.884940285464609, | |
| "grad_norm": 0.3679030239582062, | |
| "learning_rate": 0.00042162984552608564, | |
| "loss": 3.3207, | |
| "step": 51100 | |
| }, | |
| { | |
| "epoch": 14.899504806291873, | |
| "grad_norm": 0.35701867938041687, | |
| "learning_rate": 0.00042145496939667733, | |
| "loss": 3.3098, | |
| "step": 51150 | |
| }, | |
| { | |
| "epoch": 14.914069327119138, | |
| "grad_norm": 0.3420349359512329, | |
| "learning_rate": 0.00042128009326726897, | |
| "loss": 3.3243, | |
| "step": 51200 | |
| }, | |
| { | |
| "epoch": 14.928633847946402, | |
| "grad_norm": 0.34434568881988525, | |
| "learning_rate": 0.00042110521713786066, | |
| "loss": 3.3099, | |
| "step": 51250 | |
| }, | |
| { | |
| "epoch": 14.943198368773668, | |
| "grad_norm": 0.34737786650657654, | |
| "learning_rate": 0.0004209303410084523, | |
| "loss": 3.3297, | |
| "step": 51300 | |
| }, | |
| { | |
| "epoch": 14.957762889600932, | |
| "grad_norm": 0.34293320775032043, | |
| "learning_rate": 0.00042075546487904393, | |
| "loss": 3.3189, | |
| "step": 51350 | |
| }, | |
| { | |
| "epoch": 14.972327410428196, | |
| "grad_norm": 0.35380107164382935, | |
| "learning_rate": 0.0004205805887496357, | |
| "loss": 3.3312, | |
| "step": 51400 | |
| }, | |
| { | |
| "epoch": 14.986891931255462, | |
| "grad_norm": 0.3664044141769409, | |
| "learning_rate": 0.0004204057126202273, | |
| "loss": 3.3259, | |
| "step": 51450 | |
| }, | |
| { | |
| "epoch": 15.001456452082726, | |
| "grad_norm": 0.3789098262786865, | |
| "learning_rate": 0.000420230836490819, | |
| "loss": 3.3188, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 15.016020972909992, | |
| "grad_norm": 0.3919133245944977, | |
| "learning_rate": 0.00042005596036141064, | |
| "loss": 3.211, | |
| "step": 51550 | |
| }, | |
| { | |
| "epoch": 15.030585493737256, | |
| "grad_norm": 0.3933902978897095, | |
| "learning_rate": 0.0004198810842320023, | |
| "loss": 3.2154, | |
| "step": 51600 | |
| }, | |
| { | |
| "epoch": 15.045150014564522, | |
| "grad_norm": 0.34845665097236633, | |
| "learning_rate": 0.00041970620810259396, | |
| "loss": 3.2258, | |
| "step": 51650 | |
| }, | |
| { | |
| "epoch": 15.059714535391786, | |
| "grad_norm": 0.3623676002025604, | |
| "learning_rate": 0.0004195313319731856, | |
| "loss": 3.2207, | |
| "step": 51700 | |
| }, | |
| { | |
| "epoch": 15.07427905621905, | |
| "grad_norm": 0.3734416365623474, | |
| "learning_rate": 0.0004193564558437773, | |
| "loss": 3.2309, | |
| "step": 51750 | |
| }, | |
| { | |
| "epoch": 15.088843577046315, | |
| "grad_norm": 0.365877240896225, | |
| "learning_rate": 0.0004191815797143689, | |
| "loss": 3.2182, | |
| "step": 51800 | |
| }, | |
| { | |
| "epoch": 15.10340809787358, | |
| "grad_norm": 0.3879176676273346, | |
| "learning_rate": 0.00041900670358496067, | |
| "loss": 3.2278, | |
| "step": 51850 | |
| }, | |
| { | |
| "epoch": 15.117972618700845, | |
| "grad_norm": 0.3594760000705719, | |
| "learning_rate": 0.0004188318274555523, | |
| "loss": 3.2356, | |
| "step": 51900 | |
| }, | |
| { | |
| "epoch": 15.13253713952811, | |
| "grad_norm": 0.3522253632545471, | |
| "learning_rate": 0.00041865695132614394, | |
| "loss": 3.2536, | |
| "step": 51950 | |
| }, | |
| { | |
| "epoch": 15.147101660355375, | |
| "grad_norm": 0.40796470642089844, | |
| "learning_rate": 0.00041848207519673563, | |
| "loss": 3.2346, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 15.147101660355375, | |
| "eval_accuracy": 0.3715526607648969, | |
| "eval_loss": 3.556135892868042, | |
| "eval_runtime": 178.7061, | |
| "eval_samples_per_second": 93.119, | |
| "eval_steps_per_second": 5.825, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 15.161666181182639, | |
| "grad_norm": 0.3638458251953125, | |
| "learning_rate": 0.00041830719906732727, | |
| "loss": 3.2442, | |
| "step": 52050 | |
| }, | |
| { | |
| "epoch": 15.176230702009903, | |
| "grad_norm": 0.36014366149902344, | |
| "learning_rate": 0.00041813232293791896, | |
| "loss": 3.2479, | |
| "step": 52100 | |
| }, | |
| { | |
| "epoch": 15.190795222837169, | |
| "grad_norm": 0.35548239946365356, | |
| "learning_rate": 0.0004179574468085106, | |
| "loss": 3.2596, | |
| "step": 52150 | |
| }, | |
| { | |
| "epoch": 15.205359743664433, | |
| "grad_norm": 0.3747190833091736, | |
| "learning_rate": 0.00041778257067910223, | |
| "loss": 3.2431, | |
| "step": 52200 | |
| }, | |
| { | |
| "epoch": 15.219924264491699, | |
| "grad_norm": 0.3518429100513458, | |
| "learning_rate": 0.0004176076945496939, | |
| "loss": 3.254, | |
| "step": 52250 | |
| }, | |
| { | |
| "epoch": 15.234488785318963, | |
| "grad_norm": 0.39838236570358276, | |
| "learning_rate": 0.00041743281842028556, | |
| "loss": 3.2552, | |
| "step": 52300 | |
| }, | |
| { | |
| "epoch": 15.249053306146228, | |
| "grad_norm": 0.3936876654624939, | |
| "learning_rate": 0.0004172579422908773, | |
| "loss": 3.2615, | |
| "step": 52350 | |
| }, | |
| { | |
| "epoch": 15.263617826973492, | |
| "grad_norm": 0.3447533845901489, | |
| "learning_rate": 0.00041708306616146894, | |
| "loss": 3.2685, | |
| "step": 52400 | |
| }, | |
| { | |
| "epoch": 15.278182347800758, | |
| "grad_norm": 0.3832126259803772, | |
| "learning_rate": 0.00041690819003206063, | |
| "loss": 3.2743, | |
| "step": 52450 | |
| }, | |
| { | |
| "epoch": 15.292746868628022, | |
| "grad_norm": 0.3800361156463623, | |
| "learning_rate": 0.00041673331390265227, | |
| "loss": 3.2568, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 15.307311389455286, | |
| "grad_norm": 0.3728283643722534, | |
| "learning_rate": 0.0004165584377732439, | |
| "loss": 3.2628, | |
| "step": 52550 | |
| }, | |
| { | |
| "epoch": 15.321875910282552, | |
| "grad_norm": 0.3551943898200989, | |
| "learning_rate": 0.0004163835616438356, | |
| "loss": 3.2639, | |
| "step": 52600 | |
| }, | |
| { | |
| "epoch": 15.336440431109816, | |
| "grad_norm": 0.37213918566703796, | |
| "learning_rate": 0.00041620868551442723, | |
| "loss": 3.2621, | |
| "step": 52650 | |
| }, | |
| { | |
| "epoch": 15.351004951937082, | |
| "grad_norm": 0.38535019755363464, | |
| "learning_rate": 0.0004160338093850189, | |
| "loss": 3.2665, | |
| "step": 52700 | |
| }, | |
| { | |
| "epoch": 15.365569472764346, | |
| "grad_norm": 0.38206279277801514, | |
| "learning_rate": 0.00041585893325561056, | |
| "loss": 3.2783, | |
| "step": 52750 | |
| }, | |
| { | |
| "epoch": 15.380133993591611, | |
| "grad_norm": 0.34792542457580566, | |
| "learning_rate": 0.0004156840571262022, | |
| "loss": 3.282, | |
| "step": 52800 | |
| }, | |
| { | |
| "epoch": 15.394698514418875, | |
| "grad_norm": 0.3482363820075989, | |
| "learning_rate": 0.00041550918099679394, | |
| "loss": 3.2774, | |
| "step": 52850 | |
| }, | |
| { | |
| "epoch": 15.409263035246141, | |
| "grad_norm": 0.40007925033569336, | |
| "learning_rate": 0.0004153343048673856, | |
| "loss": 3.2814, | |
| "step": 52900 | |
| }, | |
| { | |
| "epoch": 15.423827556073405, | |
| "grad_norm": 0.37153443694114685, | |
| "learning_rate": 0.00041515942873797726, | |
| "loss": 3.2838, | |
| "step": 52950 | |
| }, | |
| { | |
| "epoch": 15.43839207690067, | |
| "grad_norm": 0.35616305470466614, | |
| "learning_rate": 0.0004149845526085689, | |
| "loss": 3.2882, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 15.43839207690067, | |
| "eval_accuracy": 0.3718679408167247, | |
| "eval_loss": 3.5503857135772705, | |
| "eval_runtime": 178.8769, | |
| "eval_samples_per_second": 93.03, | |
| "eval_steps_per_second": 5.82, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 15.452956597727935, | |
| "grad_norm": 0.38153186440467834, | |
| "learning_rate": 0.0004148096764791606, | |
| "loss": 3.2855, | |
| "step": 53050 | |
| }, | |
| { | |
| "epoch": 15.467521118555199, | |
| "grad_norm": 0.364692747592926, | |
| "learning_rate": 0.0004146348003497522, | |
| "loss": 3.2756, | |
| "step": 53100 | |
| }, | |
| { | |
| "epoch": 15.482085639382465, | |
| "grad_norm": 0.3847792148590088, | |
| "learning_rate": 0.00041445992422034386, | |
| "loss": 3.2871, | |
| "step": 53150 | |
| }, | |
| { | |
| "epoch": 15.496650160209729, | |
| "grad_norm": 0.37956368923187256, | |
| "learning_rate": 0.00041428504809093555, | |
| "loss": 3.2836, | |
| "step": 53200 | |
| }, | |
| { | |
| "epoch": 15.511214681036995, | |
| "grad_norm": 0.36245131492614746, | |
| "learning_rate": 0.0004141101719615272, | |
| "loss": 3.2935, | |
| "step": 53250 | |
| }, | |
| { | |
| "epoch": 15.525779201864259, | |
| "grad_norm": 0.37407535314559937, | |
| "learning_rate": 0.00041393529583211893, | |
| "loss": 3.2729, | |
| "step": 53300 | |
| }, | |
| { | |
| "epoch": 15.540343722691523, | |
| "grad_norm": 0.34457436203956604, | |
| "learning_rate": 0.00041376041970271057, | |
| "loss": 3.3004, | |
| "step": 53350 | |
| }, | |
| { | |
| "epoch": 15.554908243518788, | |
| "grad_norm": 0.35518190264701843, | |
| "learning_rate": 0.0004135855435733022, | |
| "loss": 3.2959, | |
| "step": 53400 | |
| }, | |
| { | |
| "epoch": 15.569472764346052, | |
| "grad_norm": 0.36230894923210144, | |
| "learning_rate": 0.0004134106674438939, | |
| "loss": 3.291, | |
| "step": 53450 | |
| }, | |
| { | |
| "epoch": 15.584037285173318, | |
| "grad_norm": 0.37828779220581055, | |
| "learning_rate": 0.00041323579131448553, | |
| "loss": 3.2795, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 15.598601806000582, | |
| "grad_norm": 0.3493488132953644, | |
| "learning_rate": 0.0004130609151850772, | |
| "loss": 3.2829, | |
| "step": 53550 | |
| }, | |
| { | |
| "epoch": 15.613166326827848, | |
| "grad_norm": 0.40960273146629333, | |
| "learning_rate": 0.00041288603905566886, | |
| "loss": 3.2866, | |
| "step": 53600 | |
| }, | |
| { | |
| "epoch": 15.627730847655112, | |
| "grad_norm": 0.36528661847114563, | |
| "learning_rate": 0.0004127111629262605, | |
| "loss": 3.2866, | |
| "step": 53650 | |
| }, | |
| { | |
| "epoch": 15.642295368482376, | |
| "grad_norm": 0.3558300733566284, | |
| "learning_rate": 0.0004125362867968522, | |
| "loss": 3.3008, | |
| "step": 53700 | |
| }, | |
| { | |
| "epoch": 15.656859889309642, | |
| "grad_norm": 0.3836390972137451, | |
| "learning_rate": 0.0004123614106674438, | |
| "loss": 3.2955, | |
| "step": 53750 | |
| }, | |
| { | |
| "epoch": 15.671424410136906, | |
| "grad_norm": 0.3821122646331787, | |
| "learning_rate": 0.00041218653453803557, | |
| "loss": 3.2986, | |
| "step": 53800 | |
| }, | |
| { | |
| "epoch": 15.685988930964172, | |
| "grad_norm": 0.3575958013534546, | |
| "learning_rate": 0.0004120116584086272, | |
| "loss": 3.2853, | |
| "step": 53850 | |
| }, | |
| { | |
| "epoch": 15.700553451791436, | |
| "grad_norm": 0.4025585353374481, | |
| "learning_rate": 0.0004118367822792189, | |
| "loss": 3.2862, | |
| "step": 53900 | |
| }, | |
| { | |
| "epoch": 15.715117972618701, | |
| "grad_norm": 0.35863131284713745, | |
| "learning_rate": 0.00041166190614981053, | |
| "loss": 3.306, | |
| "step": 53950 | |
| }, | |
| { | |
| "epoch": 15.729682493445965, | |
| "grad_norm": 0.3342381715774536, | |
| "learning_rate": 0.00041148703002040217, | |
| "loss": 3.2962, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 15.729682493445965, | |
| "eval_accuracy": 0.37225413242067934, | |
| "eval_loss": 3.5404388904571533, | |
| "eval_runtime": 181.2437, | |
| "eval_samples_per_second": 91.816, | |
| "eval_steps_per_second": 5.744, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 15.744247014273231, | |
| "grad_norm": 0.3472493290901184, | |
| "learning_rate": 0.00041131215389099386, | |
| "loss": 3.2979, | |
| "step": 54050 | |
| }, | |
| { | |
| "epoch": 15.758811535100495, | |
| "grad_norm": 0.3713492751121521, | |
| "learning_rate": 0.0004111372777615855, | |
| "loss": 3.2897, | |
| "step": 54100 | |
| }, | |
| { | |
| "epoch": 15.77337605592776, | |
| "grad_norm": 0.41221651434898376, | |
| "learning_rate": 0.0004109624016321772, | |
| "loss": 3.2933, | |
| "step": 54150 | |
| }, | |
| { | |
| "epoch": 15.787940576755025, | |
| "grad_norm": 0.3364807665348053, | |
| "learning_rate": 0.0004107875255027688, | |
| "loss": 3.2879, | |
| "step": 54200 | |
| }, | |
| { | |
| "epoch": 15.802505097582289, | |
| "grad_norm": 0.34417688846588135, | |
| "learning_rate": 0.00041061264937336045, | |
| "loss": 3.2878, | |
| "step": 54250 | |
| }, | |
| { | |
| "epoch": 15.817069618409555, | |
| "grad_norm": 0.3484468460083008, | |
| "learning_rate": 0.0004104377732439522, | |
| "loss": 3.3104, | |
| "step": 54300 | |
| }, | |
| { | |
| "epoch": 15.831634139236819, | |
| "grad_norm": 0.37589672207832336, | |
| "learning_rate": 0.00041026289711454384, | |
| "loss": 3.2954, | |
| "step": 54350 | |
| }, | |
| { | |
| "epoch": 15.846198660064085, | |
| "grad_norm": 0.3464411795139313, | |
| "learning_rate": 0.0004100880209851355, | |
| "loss": 3.3115, | |
| "step": 54400 | |
| }, | |
| { | |
| "epoch": 15.860763180891349, | |
| "grad_norm": 0.3678479790687561, | |
| "learning_rate": 0.00040991314485572716, | |
| "loss": 3.3094, | |
| "step": 54450 | |
| }, | |
| { | |
| "epoch": 15.875327701718614, | |
| "grad_norm": 0.36492377519607544, | |
| "learning_rate": 0.00040973826872631885, | |
| "loss": 3.3113, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 15.889892222545878, | |
| "grad_norm": 0.35983067750930786, | |
| "learning_rate": 0.0004095633925969105, | |
| "loss": 3.3149, | |
| "step": 54550 | |
| }, | |
| { | |
| "epoch": 15.904456743373142, | |
| "grad_norm": 0.3600602149963379, | |
| "learning_rate": 0.0004093885164675021, | |
| "loss": 3.3083, | |
| "step": 54600 | |
| }, | |
| { | |
| "epoch": 15.919021264200408, | |
| "grad_norm": 0.36820098757743835, | |
| "learning_rate": 0.0004092136403380938, | |
| "loss": 3.3118, | |
| "step": 54650 | |
| }, | |
| { | |
| "epoch": 15.933585785027672, | |
| "grad_norm": 0.38710981607437134, | |
| "learning_rate": 0.00040903876420868545, | |
| "loss": 3.3086, | |
| "step": 54700 | |
| }, | |
| { | |
| "epoch": 15.948150305854938, | |
| "grad_norm": 0.360516756772995, | |
| "learning_rate": 0.00040886388807927714, | |
| "loss": 3.302, | |
| "step": 54750 | |
| }, | |
| { | |
| "epoch": 15.962714826682202, | |
| "grad_norm": 0.343983918428421, | |
| "learning_rate": 0.00040868901194986883, | |
| "loss": 3.3074, | |
| "step": 54800 | |
| }, | |
| { | |
| "epoch": 15.977279347509468, | |
| "grad_norm": 0.36255642771720886, | |
| "learning_rate": 0.00040851413582046047, | |
| "loss": 3.3113, | |
| "step": 54850 | |
| }, | |
| { | |
| "epoch": 15.991843868336732, | |
| "grad_norm": 0.366834819316864, | |
| "learning_rate": 0.00040833925969105216, | |
| "loss": 3.3066, | |
| "step": 54900 | |
| }, | |
| { | |
| "epoch": 16.006408389163997, | |
| "grad_norm": 0.38302844762802124, | |
| "learning_rate": 0.0004081643835616438, | |
| "loss": 3.2623, | |
| "step": 54950 | |
| }, | |
| { | |
| "epoch": 16.02097290999126, | |
| "grad_norm": 0.3558831214904785, | |
| "learning_rate": 0.0004079895074322355, | |
| "loss": 3.1914, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 16.02097290999126, | |
| "eval_accuracy": 0.3723332758279453, | |
| "eval_loss": 3.5491139888763428, | |
| "eval_runtime": 221.7065, | |
| "eval_samples_per_second": 75.059, | |
| "eval_steps_per_second": 4.695, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 16.035537430818525, | |
| "grad_norm": 0.3447836935520172, | |
| "learning_rate": 0.0004078146313028271, | |
| "loss": 3.2054, | |
| "step": 55050 | |
| }, | |
| { | |
| "epoch": 16.05010195164579, | |
| "grad_norm": 0.3696219325065613, | |
| "learning_rate": 0.0004076397551734188, | |
| "loss": 3.2151, | |
| "step": 55100 | |
| }, | |
| { | |
| "epoch": 16.064666472473057, | |
| "grad_norm": 0.35809266567230225, | |
| "learning_rate": 0.00040746487904401045, | |
| "loss": 3.2091, | |
| "step": 55150 | |
| }, | |
| { | |
| "epoch": 16.07923099330032, | |
| "grad_norm": 0.37019026279449463, | |
| "learning_rate": 0.0004072900029146021, | |
| "loss": 3.2244, | |
| "step": 55200 | |
| }, | |
| { | |
| "epoch": 16.093795514127585, | |
| "grad_norm": 0.35395923256874084, | |
| "learning_rate": 0.0004071151267851938, | |
| "loss": 3.2123, | |
| "step": 55250 | |
| }, | |
| { | |
| "epoch": 16.10836003495485, | |
| "grad_norm": 0.3639342784881592, | |
| "learning_rate": 0.00040694025065578546, | |
| "loss": 3.2293, | |
| "step": 55300 | |
| }, | |
| { | |
| "epoch": 16.122924555782113, | |
| "grad_norm": 0.3842836320400238, | |
| "learning_rate": 0.00040676537452637716, | |
| "loss": 3.2171, | |
| "step": 55350 | |
| }, | |
| { | |
| "epoch": 16.13748907660938, | |
| "grad_norm": 0.40237775444984436, | |
| "learning_rate": 0.0004065904983969688, | |
| "loss": 3.2236, | |
| "step": 55400 | |
| }, | |
| { | |
| "epoch": 16.152053597436645, | |
| "grad_norm": 0.35297468304634094, | |
| "learning_rate": 0.00040641562226756043, | |
| "loss": 3.2304, | |
| "step": 55450 | |
| }, | |
| { | |
| "epoch": 16.16661811826391, | |
| "grad_norm": 0.3450460731983185, | |
| "learning_rate": 0.0004062407461381521, | |
| "loss": 3.2163, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 16.181182639091173, | |
| "grad_norm": 0.3639377951622009, | |
| "learning_rate": 0.00040606587000874375, | |
| "loss": 3.2317, | |
| "step": 55550 | |
| }, | |
| { | |
| "epoch": 16.19574715991844, | |
| "grad_norm": 0.3449926972389221, | |
| "learning_rate": 0.00040589099387933544, | |
| "loss": 3.242, | |
| "step": 55600 | |
| }, | |
| { | |
| "epoch": 16.210311680745704, | |
| "grad_norm": 0.39611726999282837, | |
| "learning_rate": 0.0004057161177499271, | |
| "loss": 3.2357, | |
| "step": 55650 | |
| }, | |
| { | |
| "epoch": 16.224876201572968, | |
| "grad_norm": 0.3632814884185791, | |
| "learning_rate": 0.0004055412416205187, | |
| "loss": 3.2357, | |
| "step": 55700 | |
| }, | |
| { | |
| "epoch": 16.239440722400232, | |
| "grad_norm": 0.38807687163352966, | |
| "learning_rate": 0.0004053663654911104, | |
| "loss": 3.2555, | |
| "step": 55750 | |
| }, | |
| { | |
| "epoch": 16.254005243227496, | |
| "grad_norm": 0.41072791814804077, | |
| "learning_rate": 0.0004051914893617021, | |
| "loss": 3.2496, | |
| "step": 55800 | |
| }, | |
| { | |
| "epoch": 16.268569764054764, | |
| "grad_norm": 0.36269184947013855, | |
| "learning_rate": 0.0004050166132322938, | |
| "loss": 3.2409, | |
| "step": 55850 | |
| }, | |
| { | |
| "epoch": 16.283134284882028, | |
| "grad_norm": 0.37166592478752136, | |
| "learning_rate": 0.0004048417371028854, | |
| "loss": 3.2567, | |
| "step": 55900 | |
| }, | |
| { | |
| "epoch": 16.29769880570929, | |
| "grad_norm": 0.38969337940216064, | |
| "learning_rate": 0.0004046668609734771, | |
| "loss": 3.2598, | |
| "step": 55950 | |
| }, | |
| { | |
| "epoch": 16.312263326536556, | |
| "grad_norm": 0.3745807111263275, | |
| "learning_rate": 0.00040449198484406875, | |
| "loss": 3.2381, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 16.312263326536556, | |
| "eval_accuracy": 0.37207597155588296, | |
| "eval_loss": 3.551668882369995, | |
| "eval_runtime": 178.6199, | |
| "eval_samples_per_second": 93.164, | |
| "eval_steps_per_second": 5.828, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 16.326827847363823, | |
| "grad_norm": 0.3594777584075928, | |
| "learning_rate": 0.0004043171087146604, | |
| "loss": 3.2672, | |
| "step": 56050 | |
| }, | |
| { | |
| "epoch": 16.341392368191087, | |
| "grad_norm": 0.37412890791893005, | |
| "learning_rate": 0.0004041422325852521, | |
| "loss": 3.2596, | |
| "step": 56100 | |
| }, | |
| { | |
| "epoch": 16.35595688901835, | |
| "grad_norm": 0.3892935514450073, | |
| "learning_rate": 0.0004039673564558437, | |
| "loss": 3.2504, | |
| "step": 56150 | |
| }, | |
| { | |
| "epoch": 16.370521409845615, | |
| "grad_norm": 0.3575972020626068, | |
| "learning_rate": 0.0004037924803264354, | |
| "loss": 3.2628, | |
| "step": 56200 | |
| }, | |
| { | |
| "epoch": 16.38508593067288, | |
| "grad_norm": 0.33797597885131836, | |
| "learning_rate": 0.00040361760419702704, | |
| "loss": 3.2633, | |
| "step": 56250 | |
| }, | |
| { | |
| "epoch": 16.399650451500147, | |
| "grad_norm": 0.37844982743263245, | |
| "learning_rate": 0.00040344272806761873, | |
| "loss": 3.2632, | |
| "step": 56300 | |
| }, | |
| { | |
| "epoch": 16.41421497232741, | |
| "grad_norm": 0.379015177488327, | |
| "learning_rate": 0.0004032678519382104, | |
| "loss": 3.2548, | |
| "step": 56350 | |
| }, | |
| { | |
| "epoch": 16.428779493154675, | |
| "grad_norm": 0.42286205291748047, | |
| "learning_rate": 0.00040309297580880206, | |
| "loss": 3.2691, | |
| "step": 56400 | |
| }, | |
| { | |
| "epoch": 16.44334401398194, | |
| "grad_norm": 0.3584016263484955, | |
| "learning_rate": 0.00040291809967939375, | |
| "loss": 3.2678, | |
| "step": 56450 | |
| }, | |
| { | |
| "epoch": 16.457908534809206, | |
| "grad_norm": 0.4013825058937073, | |
| "learning_rate": 0.0004027432235499854, | |
| "loss": 3.2697, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 16.47247305563647, | |
| "grad_norm": 0.3746756911277771, | |
| "learning_rate": 0.0004025683474205771, | |
| "loss": 3.2663, | |
| "step": 56550 | |
| }, | |
| { | |
| "epoch": 16.487037576463734, | |
| "grad_norm": 0.4031076431274414, | |
| "learning_rate": 0.0004023934712911687, | |
| "loss": 3.2805, | |
| "step": 56600 | |
| }, | |
| { | |
| "epoch": 16.501602097291, | |
| "grad_norm": 0.384000688791275, | |
| "learning_rate": 0.00040221859516176035, | |
| "loss": 3.2609, | |
| "step": 56650 | |
| }, | |
| { | |
| "epoch": 16.516166618118262, | |
| "grad_norm": 0.3829132914543152, | |
| "learning_rate": 0.00040204371903235204, | |
| "loss": 3.2768, | |
| "step": 56700 | |
| }, | |
| { | |
| "epoch": 16.53073113894553, | |
| "grad_norm": 0.39709749817848206, | |
| "learning_rate": 0.0004018688429029437, | |
| "loss": 3.2693, | |
| "step": 56750 | |
| }, | |
| { | |
| "epoch": 16.545295659772794, | |
| "grad_norm": 0.3538981080055237, | |
| "learning_rate": 0.0004016939667735354, | |
| "loss": 3.2675, | |
| "step": 56800 | |
| }, | |
| { | |
| "epoch": 16.559860180600058, | |
| "grad_norm": 0.3409939706325531, | |
| "learning_rate": 0.00040151909064412705, | |
| "loss": 3.281, | |
| "step": 56850 | |
| }, | |
| { | |
| "epoch": 16.574424701427322, | |
| "grad_norm": 0.38922184705734253, | |
| "learning_rate": 0.0004013442145147187, | |
| "loss": 3.2702, | |
| "step": 56900 | |
| }, | |
| { | |
| "epoch": 16.58898922225459, | |
| "grad_norm": 0.35033589601516724, | |
| "learning_rate": 0.0004011693383853104, | |
| "loss": 3.2753, | |
| "step": 56950 | |
| }, | |
| { | |
| "epoch": 16.603553743081854, | |
| "grad_norm": 0.3954455256462097, | |
| "learning_rate": 0.000400994462255902, | |
| "loss": 3.2719, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 16.603553743081854, | |
| "eval_accuracy": 0.3724214742758643, | |
| "eval_loss": 3.545311689376831, | |
| "eval_runtime": 178.6369, | |
| "eval_samples_per_second": 93.155, | |
| "eval_steps_per_second": 5.827, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 16.618118263909118, | |
| "grad_norm": 0.36573052406311035, | |
| "learning_rate": 0.0004008195861264937, | |
| "loss": 3.2737, | |
| "step": 57050 | |
| }, | |
| { | |
| "epoch": 16.63268278473638, | |
| "grad_norm": 0.35185372829437256, | |
| "learning_rate": 0.00040064470999708534, | |
| "loss": 3.2861, | |
| "step": 57100 | |
| }, | |
| { | |
| "epoch": 16.647247305563646, | |
| "grad_norm": 0.37779009342193604, | |
| "learning_rate": 0.00040046983386767703, | |
| "loss": 3.2798, | |
| "step": 57150 | |
| }, | |
| { | |
| "epoch": 16.661811826390913, | |
| "grad_norm": 0.36057236790657043, | |
| "learning_rate": 0.00040029495773826867, | |
| "loss": 3.2909, | |
| "step": 57200 | |
| }, | |
| { | |
| "epoch": 16.676376347218177, | |
| "grad_norm": 0.39807477593421936, | |
| "learning_rate": 0.0004001200816088603, | |
| "loss": 3.2773, | |
| "step": 57250 | |
| }, | |
| { | |
| "epoch": 16.69094086804544, | |
| "grad_norm": 0.3643984794616699, | |
| "learning_rate": 0.00039994520547945205, | |
| "loss": 3.2895, | |
| "step": 57300 | |
| }, | |
| { | |
| "epoch": 16.705505388872705, | |
| "grad_norm": 0.3720785975456238, | |
| "learning_rate": 0.0003997703293500437, | |
| "loss": 3.2855, | |
| "step": 57350 | |
| }, | |
| { | |
| "epoch": 16.72006990969997, | |
| "grad_norm": 0.37378811836242676, | |
| "learning_rate": 0.0003995954532206354, | |
| "loss": 3.2924, | |
| "step": 57400 | |
| }, | |
| { | |
| "epoch": 16.734634430527237, | |
| "grad_norm": 0.3754887878894806, | |
| "learning_rate": 0.000399420577091227, | |
| "loss": 3.2776, | |
| "step": 57450 | |
| }, | |
| { | |
| "epoch": 16.7491989513545, | |
| "grad_norm": 0.3625001311302185, | |
| "learning_rate": 0.00039924570096181865, | |
| "loss": 3.2747, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 16.763763472181765, | |
| "grad_norm": 0.359059602022171, | |
| "learning_rate": 0.00039907082483241034, | |
| "loss": 3.2948, | |
| "step": 57550 | |
| }, | |
| { | |
| "epoch": 16.77832799300903, | |
| "grad_norm": 0.3593480885028839, | |
| "learning_rate": 0.000398895948703002, | |
| "loss": 3.2865, | |
| "step": 57600 | |
| }, | |
| { | |
| "epoch": 16.792892513836296, | |
| "grad_norm": 0.35039687156677246, | |
| "learning_rate": 0.00039872107257359367, | |
| "loss": 3.2893, | |
| "step": 57650 | |
| }, | |
| { | |
| "epoch": 16.80745703466356, | |
| "grad_norm": 0.38142552971839905, | |
| "learning_rate": 0.0003985461964441853, | |
| "loss": 3.2811, | |
| "step": 57700 | |
| }, | |
| { | |
| "epoch": 16.822021555490824, | |
| "grad_norm": 0.36417636275291443, | |
| "learning_rate": 0.00039837132031477694, | |
| "loss": 3.2869, | |
| "step": 57750 | |
| }, | |
| { | |
| "epoch": 16.83658607631809, | |
| "grad_norm": 0.3743918836116791, | |
| "learning_rate": 0.0003981964441853687, | |
| "loss": 3.2941, | |
| "step": 57800 | |
| }, | |
| { | |
| "epoch": 16.851150597145352, | |
| "grad_norm": 0.3657926619052887, | |
| "learning_rate": 0.0003980215680559603, | |
| "loss": 3.2944, | |
| "step": 57850 | |
| }, | |
| { | |
| "epoch": 16.86571511797262, | |
| "grad_norm": 0.34246590733528137, | |
| "learning_rate": 0.000397846691926552, | |
| "loss": 3.2907, | |
| "step": 57900 | |
| }, | |
| { | |
| "epoch": 16.880279638799884, | |
| "grad_norm": 0.35775476694107056, | |
| "learning_rate": 0.00039767181579714365, | |
| "loss": 3.2888, | |
| "step": 57950 | |
| }, | |
| { | |
| "epoch": 16.894844159627148, | |
| "grad_norm": 0.3570455312728882, | |
| "learning_rate": 0.00039749693966773534, | |
| "loss": 3.3051, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 16.894844159627148, | |
| "eval_accuracy": 0.3731435255694944, | |
| "eval_loss": 3.533418893814087, | |
| "eval_runtime": 178.6001, | |
| "eval_samples_per_second": 93.175, | |
| "eval_steps_per_second": 5.829, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 16.909408680454412, | |
| "grad_norm": 0.38562971353530884, | |
| "learning_rate": 0.00039732206353832697, | |
| "loss": 3.292, | |
| "step": 58050 | |
| }, | |
| { | |
| "epoch": 16.92397320128168, | |
| "grad_norm": 0.39436957240104675, | |
| "learning_rate": 0.0003971471874089186, | |
| "loss": 3.2875, | |
| "step": 58100 | |
| }, | |
| { | |
| "epoch": 16.938537722108943, | |
| "grad_norm": 0.4385693371295929, | |
| "learning_rate": 0.0003969723112795103, | |
| "loss": 3.2928, | |
| "step": 58150 | |
| }, | |
| { | |
| "epoch": 16.953102242936207, | |
| "grad_norm": 0.37202712893486023, | |
| "learning_rate": 0.00039679743515010194, | |
| "loss": 3.3134, | |
| "step": 58200 | |
| }, | |
| { | |
| "epoch": 16.96766676376347, | |
| "grad_norm": 0.3811296224594116, | |
| "learning_rate": 0.0003966225590206937, | |
| "loss": 3.2922, | |
| "step": 58250 | |
| }, | |
| { | |
| "epoch": 16.982231284590735, | |
| "grad_norm": 0.3608035743236542, | |
| "learning_rate": 0.0003964476828912853, | |
| "loss": 3.3055, | |
| "step": 58300 | |
| }, | |
| { | |
| "epoch": 16.996795805418003, | |
| "grad_norm": 0.3930389881134033, | |
| "learning_rate": 0.00039627280676187695, | |
| "loss": 3.3021, | |
| "step": 58350 | |
| }, | |
| { | |
| "epoch": 17.011360326245267, | |
| "grad_norm": 0.38095763325691223, | |
| "learning_rate": 0.00039609793063246864, | |
| "loss": 3.2166, | |
| "step": 58400 | |
| }, | |
| { | |
| "epoch": 17.02592484707253, | |
| "grad_norm": 0.40292036533355713, | |
| "learning_rate": 0.0003959230545030603, | |
| "loss": 3.1866, | |
| "step": 58450 | |
| }, | |
| { | |
| "epoch": 17.040489367899795, | |
| "grad_norm": 0.34886404871940613, | |
| "learning_rate": 0.00039574817837365197, | |
| "loss": 3.1835, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 17.055053888727063, | |
| "grad_norm": 0.35758545994758606, | |
| "learning_rate": 0.0003955733022442436, | |
| "loss": 3.2015, | |
| "step": 58550 | |
| }, | |
| { | |
| "epoch": 17.069618409554327, | |
| "grad_norm": 0.39563000202178955, | |
| "learning_rate": 0.0003953984261148353, | |
| "loss": 3.2003, | |
| "step": 58600 | |
| }, | |
| { | |
| "epoch": 17.08418293038159, | |
| "grad_norm": 0.36030465364456177, | |
| "learning_rate": 0.00039522354998542693, | |
| "loss": 3.2011, | |
| "step": 58650 | |
| }, | |
| { | |
| "epoch": 17.098747451208855, | |
| "grad_norm": 0.3758701980113983, | |
| "learning_rate": 0.00039504867385601857, | |
| "loss": 3.2118, | |
| "step": 58700 | |
| }, | |
| { | |
| "epoch": 17.11331197203612, | |
| "grad_norm": 0.3549492657184601, | |
| "learning_rate": 0.0003948737977266103, | |
| "loss": 3.2018, | |
| "step": 58750 | |
| }, | |
| { | |
| "epoch": 17.127876492863386, | |
| "grad_norm": 0.3890039920806885, | |
| "learning_rate": 0.00039469892159720195, | |
| "loss": 3.2095, | |
| "step": 58800 | |
| }, | |
| { | |
| "epoch": 17.14244101369065, | |
| "grad_norm": 0.3619527220726013, | |
| "learning_rate": 0.00039452404546779364, | |
| "loss": 3.2128, | |
| "step": 58850 | |
| }, | |
| { | |
| "epoch": 17.157005534517914, | |
| "grad_norm": 0.3767114579677582, | |
| "learning_rate": 0.0003943491693383853, | |
| "loss": 3.2153, | |
| "step": 58900 | |
| }, | |
| { | |
| "epoch": 17.171570055345178, | |
| "grad_norm": 0.3961483836174011, | |
| "learning_rate": 0.0003941742932089769, | |
| "loss": 3.2248, | |
| "step": 58950 | |
| }, | |
| { | |
| "epoch": 17.186134576172442, | |
| "grad_norm": 0.3526688516139984, | |
| "learning_rate": 0.0003939994170795686, | |
| "loss": 3.2186, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 17.186134576172442, | |
| "eval_accuracy": 0.3721778113637467, | |
| "eval_loss": 3.5520782470703125, | |
| "eval_runtime": 178.9303, | |
| "eval_samples_per_second": 93.003, | |
| "eval_steps_per_second": 5.818, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 17.20069909699971, | |
| "grad_norm": 0.36052054166793823, | |
| "learning_rate": 0.00039382454095016024, | |
| "loss": 3.2326, | |
| "step": 59050 | |
| }, | |
| { | |
| "epoch": 17.215263617826974, | |
| "grad_norm": 0.3980617821216583, | |
| "learning_rate": 0.00039364966482075193, | |
| "loss": 3.2385, | |
| "step": 59100 | |
| }, | |
| { | |
| "epoch": 17.229828138654238, | |
| "grad_norm": 0.37004363536834717, | |
| "learning_rate": 0.00039347478869134356, | |
| "loss": 3.2343, | |
| "step": 59150 | |
| }, | |
| { | |
| "epoch": 17.244392659481502, | |
| "grad_norm": 0.37232598662376404, | |
| "learning_rate": 0.0003932999125619353, | |
| "loss": 3.2274, | |
| "step": 59200 | |
| }, | |
| { | |
| "epoch": 17.25895718030877, | |
| "grad_norm": 0.3730037212371826, | |
| "learning_rate": 0.00039312503643252695, | |
| "loss": 3.2355, | |
| "step": 59250 | |
| }, | |
| { | |
| "epoch": 17.273521701136033, | |
| "grad_norm": 0.36988314986228943, | |
| "learning_rate": 0.0003929501603031186, | |
| "loss": 3.2329, | |
| "step": 59300 | |
| }, | |
| { | |
| "epoch": 17.288086221963297, | |
| "grad_norm": 0.4261464774608612, | |
| "learning_rate": 0.00039277528417371027, | |
| "loss": 3.225, | |
| "step": 59350 | |
| }, | |
| { | |
| "epoch": 17.30265074279056, | |
| "grad_norm": 0.3772455155849457, | |
| "learning_rate": 0.0003926004080443019, | |
| "loss": 3.2486, | |
| "step": 59400 | |
| }, | |
| { | |
| "epoch": 17.317215263617825, | |
| "grad_norm": 0.37627851963043213, | |
| "learning_rate": 0.0003924255319148936, | |
| "loss": 3.2467, | |
| "step": 59450 | |
| }, | |
| { | |
| "epoch": 17.331779784445093, | |
| "grad_norm": 0.3609490990638733, | |
| "learning_rate": 0.00039225065578548523, | |
| "loss": 3.2466, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 17.346344305272357, | |
| "grad_norm": 0.4445193409919739, | |
| "learning_rate": 0.00039207577965607687, | |
| "loss": 3.2415, | |
| "step": 59550 | |
| }, | |
| { | |
| "epoch": 17.36090882609962, | |
| "grad_norm": 0.36540132761001587, | |
| "learning_rate": 0.00039190090352666856, | |
| "loss": 3.2564, | |
| "step": 59600 | |
| }, | |
| { | |
| "epoch": 17.375473346926885, | |
| "grad_norm": 0.3811579644680023, | |
| "learning_rate": 0.0003917260273972602, | |
| "loss": 3.2256, | |
| "step": 59650 | |
| }, | |
| { | |
| "epoch": 17.390037867754153, | |
| "grad_norm": 0.3652814030647278, | |
| "learning_rate": 0.00039155115126785194, | |
| "loss": 3.2522, | |
| "step": 59700 | |
| }, | |
| { | |
| "epoch": 17.404602388581417, | |
| "grad_norm": 0.3721769154071808, | |
| "learning_rate": 0.0003913762751384436, | |
| "loss": 3.2463, | |
| "step": 59750 | |
| }, | |
| { | |
| "epoch": 17.41916690940868, | |
| "grad_norm": 0.36768290400505066, | |
| "learning_rate": 0.00039120139900903527, | |
| "loss": 3.2339, | |
| "step": 59800 | |
| }, | |
| { | |
| "epoch": 17.433731430235945, | |
| "grad_norm": 0.3575945496559143, | |
| "learning_rate": 0.0003910265228796269, | |
| "loss": 3.2611, | |
| "step": 59850 | |
| }, | |
| { | |
| "epoch": 17.44829595106321, | |
| "grad_norm": 0.36524489521980286, | |
| "learning_rate": 0.00039085164675021854, | |
| "loss": 3.2509, | |
| "step": 59900 | |
| }, | |
| { | |
| "epoch": 17.462860471890476, | |
| "grad_norm": 0.35425522923469543, | |
| "learning_rate": 0.00039067677062081023, | |
| "loss": 3.2521, | |
| "step": 59950 | |
| }, | |
| { | |
| "epoch": 17.47742499271774, | |
| "grad_norm": 0.3456664979457855, | |
| "learning_rate": 0.00039050189449140187, | |
| "loss": 3.2536, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 17.47742499271774, | |
| "eval_accuracy": 0.37240218821525267, | |
| "eval_loss": 3.5458858013153076, | |
| "eval_runtime": 178.3378, | |
| "eval_samples_per_second": 93.312, | |
| "eval_steps_per_second": 5.837, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 17.491989513545004, | |
| "grad_norm": 0.3824387788772583, | |
| "learning_rate": 0.00039032701836199356, | |
| "loss": 3.2533, | |
| "step": 60050 | |
| }, | |
| { | |
| "epoch": 17.506554034372268, | |
| "grad_norm": 0.3639180362224579, | |
| "learning_rate": 0.0003901521422325852, | |
| "loss": 3.2558, | |
| "step": 60100 | |
| }, | |
| { | |
| "epoch": 17.521118555199536, | |
| "grad_norm": 0.387299120426178, | |
| "learning_rate": 0.00038997726610317683, | |
| "loss": 3.2536, | |
| "step": 60150 | |
| }, | |
| { | |
| "epoch": 17.5356830760268, | |
| "grad_norm": 0.3662397563457489, | |
| "learning_rate": 0.0003898023899737686, | |
| "loss": 3.259, | |
| "step": 60200 | |
| }, | |
| { | |
| "epoch": 17.550247596854064, | |
| "grad_norm": 0.37291350960731506, | |
| "learning_rate": 0.0003896275138443602, | |
| "loss": 3.2633, | |
| "step": 60250 | |
| }, | |
| { | |
| "epoch": 17.564812117681328, | |
| "grad_norm": 0.40766435861587524, | |
| "learning_rate": 0.0003894526377149519, | |
| "loss": 3.2652, | |
| "step": 60300 | |
| }, | |
| { | |
| "epoch": 17.57937663850859, | |
| "grad_norm": 0.3494330048561096, | |
| "learning_rate": 0.00038927776158554354, | |
| "loss": 3.2648, | |
| "step": 60350 | |
| }, | |
| { | |
| "epoch": 17.59394115933586, | |
| "grad_norm": 0.3630947768688202, | |
| "learning_rate": 0.0003891028854561352, | |
| "loss": 3.254, | |
| "step": 60400 | |
| }, | |
| { | |
| "epoch": 17.608505680163123, | |
| "grad_norm": 0.358445942401886, | |
| "learning_rate": 0.00038892800932672686, | |
| "loss": 3.2676, | |
| "step": 60450 | |
| }, | |
| { | |
| "epoch": 17.623070200990387, | |
| "grad_norm": 0.35599270462989807, | |
| "learning_rate": 0.0003887531331973185, | |
| "loss": 3.2678, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 17.63763472181765, | |
| "grad_norm": 0.3681202828884125, | |
| "learning_rate": 0.0003885782570679102, | |
| "loss": 3.2697, | |
| "step": 60550 | |
| }, | |
| { | |
| "epoch": 17.65219924264492, | |
| "grad_norm": 0.34716203808784485, | |
| "learning_rate": 0.0003884033809385018, | |
| "loss": 3.2677, | |
| "step": 60600 | |
| }, | |
| { | |
| "epoch": 17.666763763472183, | |
| "grad_norm": 0.3918900787830353, | |
| "learning_rate": 0.00038822850480909357, | |
| "loss": 3.263, | |
| "step": 60650 | |
| }, | |
| { | |
| "epoch": 17.681328284299447, | |
| "grad_norm": 0.3595735430717468, | |
| "learning_rate": 0.0003880536286796852, | |
| "loss": 3.278, | |
| "step": 60700 | |
| }, | |
| { | |
| "epoch": 17.69589280512671, | |
| "grad_norm": 0.3838953673839569, | |
| "learning_rate": 0.00038787875255027684, | |
| "loss": 3.268, | |
| "step": 60750 | |
| }, | |
| { | |
| "epoch": 17.710457325953975, | |
| "grad_norm": 0.3666781187057495, | |
| "learning_rate": 0.00038770387642086853, | |
| "loss": 3.2855, | |
| "step": 60800 | |
| }, | |
| { | |
| "epoch": 17.725021846781242, | |
| "grad_norm": 0.3640023171901703, | |
| "learning_rate": 0.00038752900029146017, | |
| "loss": 3.2769, | |
| "step": 60850 | |
| }, | |
| { | |
| "epoch": 17.739586367608506, | |
| "grad_norm": 0.3869996964931488, | |
| "learning_rate": 0.00038735412416205186, | |
| "loss": 3.2637, | |
| "step": 60900 | |
| }, | |
| { | |
| "epoch": 17.75415088843577, | |
| "grad_norm": 0.3861514925956726, | |
| "learning_rate": 0.0003871792480326435, | |
| "loss": 3.2769, | |
| "step": 60950 | |
| }, | |
| { | |
| "epoch": 17.768715409263034, | |
| "grad_norm": 0.39806482195854187, | |
| "learning_rate": 0.00038700437190323513, | |
| "loss": 3.2864, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 17.768715409263034, | |
| "eval_accuracy": 0.37317821695900927, | |
| "eval_loss": 3.53695011138916, | |
| "eval_runtime": 178.9207, | |
| "eval_samples_per_second": 93.008, | |
| "eval_steps_per_second": 5.818, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 17.7832799300903, | |
| "grad_norm": 0.3672393262386322, | |
| "learning_rate": 0.0003868294957738268, | |
| "loss": 3.2849, | |
| "step": 61050 | |
| }, | |
| { | |
| "epoch": 17.797844450917566, | |
| "grad_norm": 0.3629036247730255, | |
| "learning_rate": 0.00038665461964441846, | |
| "loss": 3.2865, | |
| "step": 61100 | |
| }, | |
| { | |
| "epoch": 17.81240897174483, | |
| "grad_norm": 0.37374356389045715, | |
| "learning_rate": 0.0003864797435150102, | |
| "loss": 3.2839, | |
| "step": 61150 | |
| }, | |
| { | |
| "epoch": 17.826973492572094, | |
| "grad_norm": 0.3959326148033142, | |
| "learning_rate": 0.00038630486738560184, | |
| "loss": 3.2759, | |
| "step": 61200 | |
| }, | |
| { | |
| "epoch": 17.841538013399358, | |
| "grad_norm": 0.407258540391922, | |
| "learning_rate": 0.00038612999125619353, | |
| "loss": 3.294, | |
| "step": 61250 | |
| }, | |
| { | |
| "epoch": 17.856102534226626, | |
| "grad_norm": 0.37516331672668457, | |
| "learning_rate": 0.00038595511512678517, | |
| "loss": 3.2826, | |
| "step": 61300 | |
| }, | |
| { | |
| "epoch": 17.87066705505389, | |
| "grad_norm": 0.3762381970882416, | |
| "learning_rate": 0.0003857802389973768, | |
| "loss": 3.2829, | |
| "step": 61350 | |
| }, | |
| { | |
| "epoch": 17.885231575881154, | |
| "grad_norm": 0.3950086534023285, | |
| "learning_rate": 0.0003856053628679685, | |
| "loss": 3.2828, | |
| "step": 61400 | |
| }, | |
| { | |
| "epoch": 17.899796096708418, | |
| "grad_norm": 0.35681092739105225, | |
| "learning_rate": 0.00038543048673856013, | |
| "loss": 3.2767, | |
| "step": 61450 | |
| }, | |
| { | |
| "epoch": 17.91436061753568, | |
| "grad_norm": 0.3618324398994446, | |
| "learning_rate": 0.0003852556106091518, | |
| "loss": 3.2892, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 17.92892513836295, | |
| "grad_norm": 0.3859667479991913, | |
| "learning_rate": 0.00038508073447974346, | |
| "loss": 3.2809, | |
| "step": 61550 | |
| }, | |
| { | |
| "epoch": 17.943489659190213, | |
| "grad_norm": 0.37979528307914734, | |
| "learning_rate": 0.0003849058583503351, | |
| "loss": 3.2884, | |
| "step": 61600 | |
| }, | |
| { | |
| "epoch": 17.958054180017477, | |
| "grad_norm": 0.3932032287120819, | |
| "learning_rate": 0.00038473098222092684, | |
| "loss": 3.2719, | |
| "step": 61650 | |
| }, | |
| { | |
| "epoch": 17.97261870084474, | |
| "grad_norm": 0.383821040391922, | |
| "learning_rate": 0.0003845561060915185, | |
| "loss": 3.2818, | |
| "step": 61700 | |
| }, | |
| { | |
| "epoch": 17.98718322167201, | |
| "grad_norm": 0.3670404553413391, | |
| "learning_rate": 0.00038438122996211016, | |
| "loss": 3.2897, | |
| "step": 61750 | |
| }, | |
| { | |
| "epoch": 18.001747742499273, | |
| "grad_norm": 0.38702115416526794, | |
| "learning_rate": 0.0003842063538327018, | |
| "loss": 3.2594, | |
| "step": 61800 | |
| }, | |
| { | |
| "epoch": 18.016312263326537, | |
| "grad_norm": 0.3678998351097107, | |
| "learning_rate": 0.0003840314777032935, | |
| "loss": 3.182, | |
| "step": 61850 | |
| }, | |
| { | |
| "epoch": 18.0308767841538, | |
| "grad_norm": 0.4147825241088867, | |
| "learning_rate": 0.0003838566015738851, | |
| "loss": 3.1858, | |
| "step": 61900 | |
| }, | |
| { | |
| "epoch": 18.045441304981065, | |
| "grad_norm": 0.3900906443595886, | |
| "learning_rate": 0.00038368172544447676, | |
| "loss": 3.1817, | |
| "step": 61950 | |
| }, | |
| { | |
| "epoch": 18.060005825808332, | |
| "grad_norm": 0.3811590373516083, | |
| "learning_rate": 0.00038350684931506845, | |
| "loss": 3.1842, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 18.060005825808332, | |
| "eval_accuracy": 0.3725568294939373, | |
| "eval_loss": 3.5498850345611572, | |
| "eval_runtime": 178.3022, | |
| "eval_samples_per_second": 93.33, | |
| "eval_steps_per_second": 5.838, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 18.074570346635596, | |
| "grad_norm": 0.3556840121746063, | |
| "learning_rate": 0.0003833319731856601, | |
| "loss": 3.1909, | |
| "step": 62050 | |
| }, | |
| { | |
| "epoch": 18.08913486746286, | |
| "grad_norm": 0.37406861782073975, | |
| "learning_rate": 0.00038315709705625183, | |
| "loss": 3.2023, | |
| "step": 62100 | |
| }, | |
| { | |
| "epoch": 18.103699388290124, | |
| "grad_norm": 0.38093480467796326, | |
| "learning_rate": 0.00038298222092684347, | |
| "loss": 3.2011, | |
| "step": 62150 | |
| }, | |
| { | |
| "epoch": 18.11826390911739, | |
| "grad_norm": 0.36246082186698914, | |
| "learning_rate": 0.0003828073447974351, | |
| "loss": 3.1999, | |
| "step": 62200 | |
| }, | |
| { | |
| "epoch": 18.132828429944656, | |
| "grad_norm": 0.38521018624305725, | |
| "learning_rate": 0.0003826324686680268, | |
| "loss": 3.2039, | |
| "step": 62250 | |
| }, | |
| { | |
| "epoch": 18.14739295077192, | |
| "grad_norm": 0.387359619140625, | |
| "learning_rate": 0.00038245759253861843, | |
| "loss": 3.2073, | |
| "step": 62300 | |
| }, | |
| { | |
| "epoch": 18.161957471599184, | |
| "grad_norm": 0.3872774839401245, | |
| "learning_rate": 0.0003822827164092101, | |
| "loss": 3.2114, | |
| "step": 62350 | |
| }, | |
| { | |
| "epoch": 18.176521992426448, | |
| "grad_norm": 0.3680144250392914, | |
| "learning_rate": 0.00038210784027980176, | |
| "loss": 3.2058, | |
| "step": 62400 | |
| }, | |
| { | |
| "epoch": 18.191086513253715, | |
| "grad_norm": 0.37997880578041077, | |
| "learning_rate": 0.0003819329641503934, | |
| "loss": 3.2089, | |
| "step": 62450 | |
| }, | |
| { | |
| "epoch": 18.20565103408098, | |
| "grad_norm": 0.4042533338069916, | |
| "learning_rate": 0.0003817580880209851, | |
| "loss": 3.2144, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 18.220215554908243, | |
| "grad_norm": 0.3523563742637634, | |
| "learning_rate": 0.0003815832118915767, | |
| "loss": 3.2195, | |
| "step": 62550 | |
| }, | |
| { | |
| "epoch": 18.234780075735507, | |
| "grad_norm": 0.36140862107276917, | |
| "learning_rate": 0.00038140833576216847, | |
| "loss": 3.2074, | |
| "step": 62600 | |
| }, | |
| { | |
| "epoch": 18.24934459656277, | |
| "grad_norm": 0.36438101530075073, | |
| "learning_rate": 0.0003812334596327601, | |
| "loss": 3.215, | |
| "step": 62650 | |
| }, | |
| { | |
| "epoch": 18.26390911739004, | |
| "grad_norm": 0.3548491299152374, | |
| "learning_rate": 0.0003810585835033518, | |
| "loss": 3.2305, | |
| "step": 62700 | |
| }, | |
| { | |
| "epoch": 18.278473638217303, | |
| "grad_norm": 0.36885324120521545, | |
| "learning_rate": 0.00038088370737394343, | |
| "loss": 3.2123, | |
| "step": 62750 | |
| }, | |
| { | |
| "epoch": 18.293038159044567, | |
| "grad_norm": 0.3839961588382721, | |
| "learning_rate": 0.00038070883124453507, | |
| "loss": 3.2353, | |
| "step": 62800 | |
| }, | |
| { | |
| "epoch": 18.30760267987183, | |
| "grad_norm": 0.380561888217926, | |
| "learning_rate": 0.00038053395511512676, | |
| "loss": 3.2258, | |
| "step": 62850 | |
| }, | |
| { | |
| "epoch": 18.3221672006991, | |
| "grad_norm": 0.3721316456794739, | |
| "learning_rate": 0.0003803590789857184, | |
| "loss": 3.2348, | |
| "step": 62900 | |
| }, | |
| { | |
| "epoch": 18.336731721526363, | |
| "grad_norm": 0.38659441471099854, | |
| "learning_rate": 0.0003801842028563101, | |
| "loss": 3.2352, | |
| "step": 62950 | |
| }, | |
| { | |
| "epoch": 18.351296242353627, | |
| "grad_norm": 0.38408663868904114, | |
| "learning_rate": 0.0003800093267269017, | |
| "loss": 3.2292, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 18.351296242353627, | |
| "eval_accuracy": 0.3726717226720931, | |
| "eval_loss": 3.544954299926758, | |
| "eval_runtime": 178.702, | |
| "eval_samples_per_second": 93.122, | |
| "eval_steps_per_second": 5.825, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 18.36586076318089, | |
| "grad_norm": 0.38154080510139465, | |
| "learning_rate": 0.00037983445059749335, | |
| "loss": 3.2345, | |
| "step": 63050 | |
| }, | |
| { | |
| "epoch": 18.380425284008155, | |
| "grad_norm": 0.38055160641670227, | |
| "learning_rate": 0.0003796595744680851, | |
| "loss": 3.2304, | |
| "step": 63100 | |
| }, | |
| { | |
| "epoch": 18.394989804835422, | |
| "grad_norm": 0.3660554885864258, | |
| "learning_rate": 0.00037948469833867674, | |
| "loss": 3.2421, | |
| "step": 63150 | |
| }, | |
| { | |
| "epoch": 18.409554325662686, | |
| "grad_norm": 0.39061740040779114, | |
| "learning_rate": 0.0003793098222092684, | |
| "loss": 3.2387, | |
| "step": 63200 | |
| }, | |
| { | |
| "epoch": 18.42411884648995, | |
| "grad_norm": 0.3903264105319977, | |
| "learning_rate": 0.00037913494607986006, | |
| "loss": 3.2289, | |
| "step": 63250 | |
| }, | |
| { | |
| "epoch": 18.438683367317214, | |
| "grad_norm": 0.3617711365222931, | |
| "learning_rate": 0.00037896006995045175, | |
| "loss": 3.2362, | |
| "step": 63300 | |
| }, | |
| { | |
| "epoch": 18.45324788814448, | |
| "grad_norm": 0.3767446279525757, | |
| "learning_rate": 0.0003787851938210434, | |
| "loss": 3.2389, | |
| "step": 63350 | |
| }, | |
| { | |
| "epoch": 18.467812408971746, | |
| "grad_norm": 0.3788781762123108, | |
| "learning_rate": 0.000378610317691635, | |
| "loss": 3.2386, | |
| "step": 63400 | |
| }, | |
| { | |
| "epoch": 18.48237692979901, | |
| "grad_norm": 0.3798232674598694, | |
| "learning_rate": 0.0003784354415622267, | |
| "loss": 3.2459, | |
| "step": 63450 | |
| }, | |
| { | |
| "epoch": 18.496941450626274, | |
| "grad_norm": 0.3854493200778961, | |
| "learning_rate": 0.00037826056543281835, | |
| "loss": 3.2479, | |
| "step": 63500 | |
| }, | |
| { | |
| "epoch": 18.511505971453538, | |
| "grad_norm": 0.37762451171875, | |
| "learning_rate": 0.0003780856893034101, | |
| "loss": 3.2443, | |
| "step": 63550 | |
| }, | |
| { | |
| "epoch": 18.526070492280805, | |
| "grad_norm": 0.42830953001976013, | |
| "learning_rate": 0.00037791081317400173, | |
| "loss": 3.2505, | |
| "step": 63600 | |
| }, | |
| { | |
| "epoch": 18.54063501310807, | |
| "grad_norm": 0.3428388833999634, | |
| "learning_rate": 0.00037773593704459337, | |
| "loss": 3.2526, | |
| "step": 63650 | |
| }, | |
| { | |
| "epoch": 18.555199533935333, | |
| "grad_norm": 0.3522073030471802, | |
| "learning_rate": 0.00037756106091518506, | |
| "loss": 3.2507, | |
| "step": 63700 | |
| }, | |
| { | |
| "epoch": 18.569764054762597, | |
| "grad_norm": 0.39283233880996704, | |
| "learning_rate": 0.0003773861847857767, | |
| "loss": 3.2488, | |
| "step": 63750 | |
| }, | |
| { | |
| "epoch": 18.584328575589865, | |
| "grad_norm": 0.3995060622692108, | |
| "learning_rate": 0.0003772113086563684, | |
| "loss": 3.2485, | |
| "step": 63800 | |
| }, | |
| { | |
| "epoch": 18.59889309641713, | |
| "grad_norm": 0.3654598295688629, | |
| "learning_rate": 0.00037703643252696, | |
| "loss": 3.2569, | |
| "step": 63850 | |
| }, | |
| { | |
| "epoch": 18.613457617244393, | |
| "grad_norm": 0.4324260354042053, | |
| "learning_rate": 0.0003768615563975517, | |
| "loss": 3.2575, | |
| "step": 63900 | |
| }, | |
| { | |
| "epoch": 18.628022138071657, | |
| "grad_norm": 0.35259896516799927, | |
| "learning_rate": 0.00037668668026814335, | |
| "loss": 3.2635, | |
| "step": 63950 | |
| }, | |
| { | |
| "epoch": 18.64258665889892, | |
| "grad_norm": 0.35881707072257996, | |
| "learning_rate": 0.000376511804138735, | |
| "loss": 3.2734, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 18.64258665889892, | |
| "eval_accuracy": 0.37319409267963466, | |
| "eval_loss": 3.538569927215576, | |
| "eval_runtime": 178.7072, | |
| "eval_samples_per_second": 93.119, | |
| "eval_steps_per_second": 5.825, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 18.65715117972619, | |
| "grad_norm": 0.39576172828674316, | |
| "learning_rate": 0.00037633692800932673, | |
| "loss": 3.2583, | |
| "step": 64050 | |
| }, | |
| { | |
| "epoch": 18.671715700553452, | |
| "grad_norm": 0.36197108030319214, | |
| "learning_rate": 0.00037616205187991837, | |
| "loss": 3.2633, | |
| "step": 64100 | |
| }, | |
| { | |
| "epoch": 18.686280221380716, | |
| "grad_norm": 0.35696882009506226, | |
| "learning_rate": 0.00037598717575051006, | |
| "loss": 3.2624, | |
| "step": 64150 | |
| }, | |
| { | |
| "epoch": 18.70084474220798, | |
| "grad_norm": 0.36636996269226074, | |
| "learning_rate": 0.0003758122996211017, | |
| "loss": 3.2643, | |
| "step": 64200 | |
| }, | |
| { | |
| "epoch": 18.715409263035244, | |
| "grad_norm": 0.39471927285194397, | |
| "learning_rate": 0.00037563742349169333, | |
| "loss": 3.2639, | |
| "step": 64250 | |
| }, | |
| { | |
| "epoch": 18.729973783862512, | |
| "grad_norm": 0.36265599727630615, | |
| "learning_rate": 0.000375462547362285, | |
| "loss": 3.2608, | |
| "step": 64300 | |
| }, | |
| { | |
| "epoch": 18.744538304689776, | |
| "grad_norm": 0.38616278767585754, | |
| "learning_rate": 0.00037528767123287665, | |
| "loss": 3.2625, | |
| "step": 64350 | |
| }, | |
| { | |
| "epoch": 18.75910282551704, | |
| "grad_norm": 0.3862897753715515, | |
| "learning_rate": 0.00037511279510346834, | |
| "loss": 3.2614, | |
| "step": 64400 | |
| }, | |
| { | |
| "epoch": 18.773667346344304, | |
| "grad_norm": 0.377992182970047, | |
| "learning_rate": 0.00037493791897406, | |
| "loss": 3.2648, | |
| "step": 64450 | |
| }, | |
| { | |
| "epoch": 18.78823186717157, | |
| "grad_norm": 0.4095982015132904, | |
| "learning_rate": 0.0003747630428446516, | |
| "loss": 3.2688, | |
| "step": 64500 | |
| }, | |
| { | |
| "epoch": 18.802796387998836, | |
| "grad_norm": 0.4064045548439026, | |
| "learning_rate": 0.00037458816671524336, | |
| "loss": 3.2684, | |
| "step": 64550 | |
| }, | |
| { | |
| "epoch": 18.8173609088261, | |
| "grad_norm": 0.377204954624176, | |
| "learning_rate": 0.000374413290585835, | |
| "loss": 3.2699, | |
| "step": 64600 | |
| }, | |
| { | |
| "epoch": 18.831925429653364, | |
| "grad_norm": 0.38369378447532654, | |
| "learning_rate": 0.0003742384144564267, | |
| "loss": 3.2662, | |
| "step": 64650 | |
| }, | |
| { | |
| "epoch": 18.846489950480628, | |
| "grad_norm": 0.3455315828323364, | |
| "learning_rate": 0.0003740635383270183, | |
| "loss": 3.2626, | |
| "step": 64700 | |
| }, | |
| { | |
| "epoch": 18.861054471307895, | |
| "grad_norm": 0.3835621774196625, | |
| "learning_rate": 0.00037388866219761, | |
| "loss": 3.2531, | |
| "step": 64750 | |
| }, | |
| { | |
| "epoch": 18.87561899213516, | |
| "grad_norm": 0.37047526240348816, | |
| "learning_rate": 0.00037371378606820165, | |
| "loss": 3.2693, | |
| "step": 64800 | |
| }, | |
| { | |
| "epoch": 18.890183512962423, | |
| "grad_norm": 0.3814605474472046, | |
| "learning_rate": 0.0003735389099387933, | |
| "loss": 3.2607, | |
| "step": 64850 | |
| }, | |
| { | |
| "epoch": 18.904748033789687, | |
| "grad_norm": 0.36107689142227173, | |
| "learning_rate": 0.000373364033809385, | |
| "loss": 3.2606, | |
| "step": 64900 | |
| }, | |
| { | |
| "epoch": 18.919312554616955, | |
| "grad_norm": 0.37017542123794556, | |
| "learning_rate": 0.0003731891576799766, | |
| "loss": 3.2702, | |
| "step": 64950 | |
| }, | |
| { | |
| "epoch": 18.93387707544422, | |
| "grad_norm": 0.35455116629600525, | |
| "learning_rate": 0.00037301428155056836, | |
| "loss": 3.2742, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 18.93387707544422, | |
| "eval_accuracy": 0.37389015483061133, | |
| "eval_loss": 3.5325088500976562, | |
| "eval_runtime": 178.611, | |
| "eval_samples_per_second": 93.169, | |
| "eval_steps_per_second": 5.828, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 18.948441596271483, | |
| "grad_norm": 0.3578329384326935, | |
| "learning_rate": 0.00037283940542116, | |
| "loss": 3.2631, | |
| "step": 65050 | |
| }, | |
| { | |
| "epoch": 18.963006117098747, | |
| "grad_norm": 0.3659336268901825, | |
| "learning_rate": 0.00037266452929175163, | |
| "loss": 3.2701, | |
| "step": 65100 | |
| }, | |
| { | |
| "epoch": 18.97757063792601, | |
| "grad_norm": 0.3941463232040405, | |
| "learning_rate": 0.0003724896531623433, | |
| "loss": 3.2729, | |
| "step": 65150 | |
| }, | |
| { | |
| "epoch": 18.99213515875328, | |
| "grad_norm": 0.35798850655555725, | |
| "learning_rate": 0.00037231477703293496, | |
| "loss": 3.2721, | |
| "step": 65200 | |
| }, | |
| { | |
| "epoch": 19.006699679580542, | |
| "grad_norm": 0.3821282684803009, | |
| "learning_rate": 0.00037213990090352665, | |
| "loss": 3.2245, | |
| "step": 65250 | |
| }, | |
| { | |
| "epoch": 19.021264200407806, | |
| "grad_norm": 0.3936713933944702, | |
| "learning_rate": 0.0003719650247741183, | |
| "loss": 3.1673, | |
| "step": 65300 | |
| }, | |
| { | |
| "epoch": 19.03582872123507, | |
| "grad_norm": 0.3834037780761719, | |
| "learning_rate": 0.00037179014864471, | |
| "loss": 3.1683, | |
| "step": 65350 | |
| }, | |
| { | |
| "epoch": 19.050393242062338, | |
| "grad_norm": 0.41335737705230713, | |
| "learning_rate": 0.0003716152725153016, | |
| "loss": 3.1729, | |
| "step": 65400 | |
| }, | |
| { | |
| "epoch": 19.064957762889602, | |
| "grad_norm": 0.3922509551048279, | |
| "learning_rate": 0.00037144039638589325, | |
| "loss": 3.1782, | |
| "step": 65450 | |
| }, | |
| { | |
| "epoch": 19.079522283716866, | |
| "grad_norm": 0.4212184548377991, | |
| "learning_rate": 0.000371265520256485, | |
| "loss": 3.1944, | |
| "step": 65500 | |
| }, | |
| { | |
| "epoch": 19.09408680454413, | |
| "grad_norm": 0.38961464166641235, | |
| "learning_rate": 0.00037109064412707663, | |
| "loss": 3.1903, | |
| "step": 65550 | |
| }, | |
| { | |
| "epoch": 19.108651325371394, | |
| "grad_norm": 0.38067349791526794, | |
| "learning_rate": 0.0003709157679976683, | |
| "loss": 3.1827, | |
| "step": 65600 | |
| }, | |
| { | |
| "epoch": 19.12321584619866, | |
| "grad_norm": 0.3817111849784851, | |
| "learning_rate": 0.00037074089186825995, | |
| "loss": 3.1853, | |
| "step": 65650 | |
| }, | |
| { | |
| "epoch": 19.137780367025925, | |
| "grad_norm": 0.3943799138069153, | |
| "learning_rate": 0.0003705660157388516, | |
| "loss": 3.1897, | |
| "step": 65700 | |
| }, | |
| { | |
| "epoch": 19.15234488785319, | |
| "grad_norm": 0.34692874550819397, | |
| "learning_rate": 0.0003703911396094433, | |
| "loss": 3.1987, | |
| "step": 65750 | |
| }, | |
| { | |
| "epoch": 19.166909408680453, | |
| "grad_norm": 0.3758201003074646, | |
| "learning_rate": 0.0003702162634800349, | |
| "loss": 3.1943, | |
| "step": 65800 | |
| }, | |
| { | |
| "epoch": 19.181473929507717, | |
| "grad_norm": 0.3872537314891815, | |
| "learning_rate": 0.0003700413873506266, | |
| "loss": 3.2068, | |
| "step": 65850 | |
| }, | |
| { | |
| "epoch": 19.196038450334985, | |
| "grad_norm": 0.3869698941707611, | |
| "learning_rate": 0.00036986651122121824, | |
| "loss": 3.2054, | |
| "step": 65900 | |
| }, | |
| { | |
| "epoch": 19.21060297116225, | |
| "grad_norm": 0.374907910823822, | |
| "learning_rate": 0.00036969163509181, | |
| "loss": 3.2016, | |
| "step": 65950 | |
| }, | |
| { | |
| "epoch": 19.225167491989513, | |
| "grad_norm": 0.41257426142692566, | |
| "learning_rate": 0.0003695167589624016, | |
| "loss": 3.2181, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 19.225167491989513, | |
| "eval_accuracy": 0.37258681696622975, | |
| "eval_loss": 3.549304246902466, | |
| "eval_runtime": 178.7203, | |
| "eval_samples_per_second": 93.112, | |
| "eval_steps_per_second": 5.825, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 19.239732012816777, | |
| "grad_norm": 0.3578282594680786, | |
| "learning_rate": 0.00036934188283299326, | |
| "loss": 3.2035, | |
| "step": 66050 | |
| }, | |
| { | |
| "epoch": 19.254296533644045, | |
| "grad_norm": 0.3656151294708252, | |
| "learning_rate": 0.00036916700670358495, | |
| "loss": 3.2043, | |
| "step": 66100 | |
| }, | |
| { | |
| "epoch": 19.26886105447131, | |
| "grad_norm": 0.38482174277305603, | |
| "learning_rate": 0.0003689921305741766, | |
| "loss": 3.2088, | |
| "step": 66150 | |
| }, | |
| { | |
| "epoch": 19.283425575298573, | |
| "grad_norm": 0.3795786201953888, | |
| "learning_rate": 0.0003688172544447683, | |
| "loss": 3.2117, | |
| "step": 66200 | |
| }, | |
| { | |
| "epoch": 19.297990096125837, | |
| "grad_norm": 0.397713840007782, | |
| "learning_rate": 0.0003686423783153599, | |
| "loss": 3.2176, | |
| "step": 66250 | |
| }, | |
| { | |
| "epoch": 19.3125546169531, | |
| "grad_norm": 0.3665197491645813, | |
| "learning_rate": 0.00036846750218595155, | |
| "loss": 3.2059, | |
| "step": 66300 | |
| }, | |
| { | |
| "epoch": 19.327119137780368, | |
| "grad_norm": 0.39614352583885193, | |
| "learning_rate": 0.00036829262605654324, | |
| "loss": 3.227, | |
| "step": 66350 | |
| }, | |
| { | |
| "epoch": 19.341683658607632, | |
| "grad_norm": 0.3724098801612854, | |
| "learning_rate": 0.0003681177499271349, | |
| "loss": 3.2127, | |
| "step": 66400 | |
| }, | |
| { | |
| "epoch": 19.356248179434896, | |
| "grad_norm": 0.38602399826049805, | |
| "learning_rate": 0.0003679428737977266, | |
| "loss": 3.2227, | |
| "step": 66450 | |
| }, | |
| { | |
| "epoch": 19.37081270026216, | |
| "grad_norm": 0.36235538125038147, | |
| "learning_rate": 0.00036776799766831826, | |
| "loss": 3.2195, | |
| "step": 66500 | |
| }, | |
| { | |
| "epoch": 19.385377221089428, | |
| "grad_norm": 0.36831751465797424, | |
| "learning_rate": 0.0003675931215389099, | |
| "loss": 3.2191, | |
| "step": 66550 | |
| }, | |
| { | |
| "epoch": 19.39994174191669, | |
| "grad_norm": 0.3906136453151703, | |
| "learning_rate": 0.0003674182454095016, | |
| "loss": 3.217, | |
| "step": 66600 | |
| }, | |
| { | |
| "epoch": 19.414506262743956, | |
| "grad_norm": 0.40357115864753723, | |
| "learning_rate": 0.0003672433692800932, | |
| "loss": 3.2266, | |
| "step": 66650 | |
| }, | |
| { | |
| "epoch": 19.42907078357122, | |
| "grad_norm": 0.37824922800064087, | |
| "learning_rate": 0.0003670684931506849, | |
| "loss": 3.2299, | |
| "step": 66700 | |
| }, | |
| { | |
| "epoch": 19.443635304398484, | |
| "grad_norm": 0.3971153795719147, | |
| "learning_rate": 0.00036689361702127655, | |
| "loss": 3.2347, | |
| "step": 66750 | |
| }, | |
| { | |
| "epoch": 19.45819982522575, | |
| "grad_norm": 0.391778826713562, | |
| "learning_rate": 0.00036671874089186824, | |
| "loss": 3.2212, | |
| "step": 66800 | |
| }, | |
| { | |
| "epoch": 19.472764346053015, | |
| "grad_norm": 0.41135820746421814, | |
| "learning_rate": 0.00036654386476245987, | |
| "loss": 3.2307, | |
| "step": 66850 | |
| }, | |
| { | |
| "epoch": 19.48732886688028, | |
| "grad_norm": 0.38100093603134155, | |
| "learning_rate": 0.0003663689886330515, | |
| "loss": 3.2395, | |
| "step": 66900 | |
| }, | |
| { | |
| "epoch": 19.501893387707543, | |
| "grad_norm": 0.4028393626213074, | |
| "learning_rate": 0.00036619411250364325, | |
| "loss": 3.2361, | |
| "step": 66950 | |
| }, | |
| { | |
| "epoch": 19.51645790853481, | |
| "grad_norm": 0.40780872106552124, | |
| "learning_rate": 0.0003660192363742349, | |
| "loss": 3.2331, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 19.51645790853481, | |
| "eval_accuracy": 0.37296371833367026, | |
| "eval_loss": 3.543234348297119, | |
| "eval_runtime": 178.2342, | |
| "eval_samples_per_second": 93.366, | |
| "eval_steps_per_second": 5.841, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 19.531022429362075, | |
| "grad_norm": 0.36076855659484863, | |
| "learning_rate": 0.0003658443602448266, | |
| "loss": 3.2384, | |
| "step": 67050 | |
| }, | |
| { | |
| "epoch": 19.54558695018934, | |
| "grad_norm": 0.4246351420879364, | |
| "learning_rate": 0.0003656694841154182, | |
| "loss": 3.2335, | |
| "step": 67100 | |
| }, | |
| { | |
| "epoch": 19.560151471016603, | |
| "grad_norm": 0.38414379954338074, | |
| "learning_rate": 0.00036549460798600985, | |
| "loss": 3.2434, | |
| "step": 67150 | |
| }, | |
| { | |
| "epoch": 19.574715991843867, | |
| "grad_norm": 0.34863388538360596, | |
| "learning_rate": 0.00036531973185660154, | |
| "loss": 3.2359, | |
| "step": 67200 | |
| }, | |
| { | |
| "epoch": 19.589280512671134, | |
| "grad_norm": 0.39756739139556885, | |
| "learning_rate": 0.0003651448557271932, | |
| "loss": 3.2489, | |
| "step": 67250 | |
| }, | |
| { | |
| "epoch": 19.6038450334984, | |
| "grad_norm": 0.3935822546482086, | |
| "learning_rate": 0.00036496997959778487, | |
| "loss": 3.2575, | |
| "step": 67300 | |
| }, | |
| { | |
| "epoch": 19.618409554325662, | |
| "grad_norm": 0.3777560591697693, | |
| "learning_rate": 0.0003647951034683765, | |
| "loss": 3.2377, | |
| "step": 67350 | |
| }, | |
| { | |
| "epoch": 19.632974075152926, | |
| "grad_norm": 0.38349226117134094, | |
| "learning_rate": 0.00036462022733896825, | |
| "loss": 3.2485, | |
| "step": 67400 | |
| }, | |
| { | |
| "epoch": 19.647538595980194, | |
| "grad_norm": 0.38052335381507874, | |
| "learning_rate": 0.0003644453512095599, | |
| "loss": 3.2459, | |
| "step": 67450 | |
| }, | |
| { | |
| "epoch": 19.662103116807458, | |
| "grad_norm": 0.38704562187194824, | |
| "learning_rate": 0.0003642704750801515, | |
| "loss": 3.2401, | |
| "step": 67500 | |
| }, | |
| { | |
| "epoch": 19.676667637634722, | |
| "grad_norm": 0.43219882249832153, | |
| "learning_rate": 0.0003640955989507432, | |
| "loss": 3.2525, | |
| "step": 67550 | |
| }, | |
| { | |
| "epoch": 19.691232158461986, | |
| "grad_norm": 0.3853197991847992, | |
| "learning_rate": 0.00036392072282133485, | |
| "loss": 3.2386, | |
| "step": 67600 | |
| }, | |
| { | |
| "epoch": 19.70579667928925, | |
| "grad_norm": 0.37223896384239197, | |
| "learning_rate": 0.00036374584669192654, | |
| "loss": 3.2412, | |
| "step": 67650 | |
| }, | |
| { | |
| "epoch": 19.720361200116518, | |
| "grad_norm": 0.36086782813072205, | |
| "learning_rate": 0.0003635709705625182, | |
| "loss": 3.2501, | |
| "step": 67700 | |
| }, | |
| { | |
| "epoch": 19.73492572094378, | |
| "grad_norm": 0.38250860571861267, | |
| "learning_rate": 0.0003633960944331098, | |
| "loss": 3.2624, | |
| "step": 67750 | |
| }, | |
| { | |
| "epoch": 19.749490241771046, | |
| "grad_norm": 0.373307466506958, | |
| "learning_rate": 0.0003632212183037015, | |
| "loss": 3.2549, | |
| "step": 67800 | |
| }, | |
| { | |
| "epoch": 19.76405476259831, | |
| "grad_norm": 0.3858838677406311, | |
| "learning_rate": 0.00036304634217429314, | |
| "loss": 3.2465, | |
| "step": 67850 | |
| }, | |
| { | |
| "epoch": 19.778619283425574, | |
| "grad_norm": 0.36848515272140503, | |
| "learning_rate": 0.0003628714660448849, | |
| "loss": 3.2571, | |
| "step": 67900 | |
| }, | |
| { | |
| "epoch": 19.79318380425284, | |
| "grad_norm": 0.3879449963569641, | |
| "learning_rate": 0.0003626965899154765, | |
| "loss": 3.2531, | |
| "step": 67950 | |
| }, | |
| { | |
| "epoch": 19.807748325080105, | |
| "grad_norm": 0.3670133352279663, | |
| "learning_rate": 0.0003625217137860682, | |
| "loss": 3.2519, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 19.807748325080105, | |
| "eval_accuracy": 0.37371728587269015, | |
| "eval_loss": 3.535742998123169, | |
| "eval_runtime": 178.7112, | |
| "eval_samples_per_second": 93.117, | |
| "eval_steps_per_second": 5.825, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 19.82231284590737, | |
| "grad_norm": 0.4126584827899933, | |
| "learning_rate": 0.00036234683765665985, | |
| "loss": 3.2575, | |
| "step": 68050 | |
| }, | |
| { | |
| "epoch": 19.836877366734633, | |
| "grad_norm": 0.4170861542224884, | |
| "learning_rate": 0.0003621719615272515, | |
| "loss": 3.251, | |
| "step": 68100 | |
| }, | |
| { | |
| "epoch": 19.8514418875619, | |
| "grad_norm": 0.3947071135044098, | |
| "learning_rate": 0.00036199708539784317, | |
| "loss": 3.2655, | |
| "step": 68150 | |
| }, | |
| { | |
| "epoch": 19.866006408389165, | |
| "grad_norm": 0.37961897253990173, | |
| "learning_rate": 0.0003618222092684348, | |
| "loss": 3.2535, | |
| "step": 68200 | |
| }, | |
| { | |
| "epoch": 19.88057092921643, | |
| "grad_norm": 0.3961268961429596, | |
| "learning_rate": 0.0003616473331390265, | |
| "loss": 3.2499, | |
| "step": 68250 | |
| }, | |
| { | |
| "epoch": 19.895135450043693, | |
| "grad_norm": 0.37590292096138, | |
| "learning_rate": 0.00036147245700961813, | |
| "loss": 3.252, | |
| "step": 68300 | |
| }, | |
| { | |
| "epoch": 19.909699970870957, | |
| "grad_norm": 0.35986328125, | |
| "learning_rate": 0.00036129758088020977, | |
| "loss": 3.2624, | |
| "step": 68350 | |
| }, | |
| { | |
| "epoch": 19.924264491698224, | |
| "grad_norm": 0.3746156692504883, | |
| "learning_rate": 0.0003611227047508015, | |
| "loss": 3.2635, | |
| "step": 68400 | |
| }, | |
| { | |
| "epoch": 19.93882901252549, | |
| "grad_norm": 0.3684529662132263, | |
| "learning_rate": 0.00036094782862139315, | |
| "loss": 3.2655, | |
| "step": 68450 | |
| }, | |
| { | |
| "epoch": 19.953393533352752, | |
| "grad_norm": 0.38268017768859863, | |
| "learning_rate": 0.00036077295249198484, | |
| "loss": 3.2666, | |
| "step": 68500 | |
| }, | |
| { | |
| "epoch": 19.967958054180016, | |
| "grad_norm": 0.3657938838005066, | |
| "learning_rate": 0.0003605980763625765, | |
| "loss": 3.2621, | |
| "step": 68550 | |
| }, | |
| { | |
| "epoch": 19.982522575007284, | |
| "grad_norm": 0.3825475871562958, | |
| "learning_rate": 0.0003604232002331681, | |
| "loss": 3.2576, | |
| "step": 68600 | |
| }, | |
| { | |
| "epoch": 19.997087095834548, | |
| "grad_norm": 0.3889371156692505, | |
| "learning_rate": 0.0003602483241037598, | |
| "loss": 3.2623, | |
| "step": 68650 | |
| }, | |
| { | |
| "epoch": 20.011651616661812, | |
| "grad_norm": 0.39976590871810913, | |
| "learning_rate": 0.00036007344797435144, | |
| "loss": 3.1758, | |
| "step": 68700 | |
| }, | |
| { | |
| "epoch": 20.026216137489076, | |
| "grad_norm": 0.39569732546806335, | |
| "learning_rate": 0.00035989857184494313, | |
| "loss": 3.1563, | |
| "step": 68750 | |
| }, | |
| { | |
| "epoch": 20.04078065831634, | |
| "grad_norm": 0.38413846492767334, | |
| "learning_rate": 0.00035972369571553477, | |
| "loss": 3.1617, | |
| "step": 68800 | |
| }, | |
| { | |
| "epoch": 20.055345179143607, | |
| "grad_norm": 0.3950614333152771, | |
| "learning_rate": 0.0003595488195861265, | |
| "loss": 3.1581, | |
| "step": 68850 | |
| }, | |
| { | |
| "epoch": 20.06990969997087, | |
| "grad_norm": 0.3673596978187561, | |
| "learning_rate": 0.00035937394345671815, | |
| "loss": 3.1545, | |
| "step": 68900 | |
| }, | |
| { | |
| "epoch": 20.084474220798135, | |
| "grad_norm": 0.3909459412097931, | |
| "learning_rate": 0.0003591990673273098, | |
| "loss": 3.1761, | |
| "step": 68950 | |
| }, | |
| { | |
| "epoch": 20.0990387416254, | |
| "grad_norm": 0.3746775984764099, | |
| "learning_rate": 0.0003590241911979015, | |
| "loss": 3.1681, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 20.0990387416254, | |
| "eval_accuracy": 0.37325547879938625, | |
| "eval_loss": 3.54992938041687, | |
| "eval_runtime": 178.4066, | |
| "eval_samples_per_second": 93.276, | |
| "eval_steps_per_second": 5.835, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 20.113603262452667, | |
| "grad_norm": 0.38828980922698975, | |
| "learning_rate": 0.0003588493150684931, | |
| "loss": 3.1727, | |
| "step": 69050 | |
| }, | |
| { | |
| "epoch": 20.12816778327993, | |
| "grad_norm": 0.3792520761489868, | |
| "learning_rate": 0.0003586744389390848, | |
| "loss": 3.1815, | |
| "step": 69100 | |
| }, | |
| { | |
| "epoch": 20.142732304107195, | |
| "grad_norm": 0.41158244013786316, | |
| "learning_rate": 0.00035849956280967644, | |
| "loss": 3.1746, | |
| "step": 69150 | |
| }, | |
| { | |
| "epoch": 20.15729682493446, | |
| "grad_norm": 0.38685640692710876, | |
| "learning_rate": 0.0003583246866802681, | |
| "loss": 3.1866, | |
| "step": 69200 | |
| }, | |
| { | |
| "epoch": 20.171861345761723, | |
| "grad_norm": 0.392080157995224, | |
| "learning_rate": 0.00035814981055085976, | |
| "loss": 3.1875, | |
| "step": 69250 | |
| }, | |
| { | |
| "epoch": 20.18642586658899, | |
| "grad_norm": 0.3973603844642639, | |
| "learning_rate": 0.0003579749344214514, | |
| "loss": 3.1907, | |
| "step": 69300 | |
| }, | |
| { | |
| "epoch": 20.200990387416255, | |
| "grad_norm": 0.39429065585136414, | |
| "learning_rate": 0.00035780005829204315, | |
| "loss": 3.2021, | |
| "step": 69350 | |
| }, | |
| { | |
| "epoch": 20.21555490824352, | |
| "grad_norm": 0.3873383104801178, | |
| "learning_rate": 0.0003576251821626348, | |
| "loss": 3.2113, | |
| "step": 69400 | |
| }, | |
| { | |
| "epoch": 20.230119429070783, | |
| "grad_norm": 0.39196303486824036, | |
| "learning_rate": 0.00035745030603322647, | |
| "loss": 3.1806, | |
| "step": 69450 | |
| }, | |
| { | |
| "epoch": 20.244683949898047, | |
| "grad_norm": 0.38600245118141174, | |
| "learning_rate": 0.0003572754299038181, | |
| "loss": 3.1949, | |
| "step": 69500 | |
| }, | |
| { | |
| "epoch": 20.259248470725314, | |
| "grad_norm": 0.40740031003952026, | |
| "learning_rate": 0.00035710055377440974, | |
| "loss": 3.2046, | |
| "step": 69550 | |
| }, | |
| { | |
| "epoch": 20.273812991552578, | |
| "grad_norm": 0.3854810297489166, | |
| "learning_rate": 0.00035692567764500143, | |
| "loss": 3.2124, | |
| "step": 69600 | |
| }, | |
| { | |
| "epoch": 20.288377512379842, | |
| "grad_norm": 0.4143630266189575, | |
| "learning_rate": 0.00035675080151559307, | |
| "loss": 3.1975, | |
| "step": 69650 | |
| }, | |
| { | |
| "epoch": 20.302942033207106, | |
| "grad_norm": 0.39811912178993225, | |
| "learning_rate": 0.00035657592538618476, | |
| "loss": 3.2157, | |
| "step": 69700 | |
| }, | |
| { | |
| "epoch": 20.317506554034374, | |
| "grad_norm": 0.37693876028060913, | |
| "learning_rate": 0.0003564010492567764, | |
| "loss": 3.2072, | |
| "step": 69750 | |
| }, | |
| { | |
| "epoch": 20.332071074861638, | |
| "grad_norm": 0.36333632469177246, | |
| "learning_rate": 0.00035622617312736803, | |
| "loss": 3.2053, | |
| "step": 69800 | |
| }, | |
| { | |
| "epoch": 20.346635595688902, | |
| "grad_norm": 0.402942419052124, | |
| "learning_rate": 0.0003560512969979598, | |
| "loss": 3.2061, | |
| "step": 69850 | |
| }, | |
| { | |
| "epoch": 20.361200116516166, | |
| "grad_norm": 0.38568899035453796, | |
| "learning_rate": 0.0003558764208685514, | |
| "loss": 3.2177, | |
| "step": 69900 | |
| }, | |
| { | |
| "epoch": 20.37576463734343, | |
| "grad_norm": 0.36854901909828186, | |
| "learning_rate": 0.0003557015447391431, | |
| "loss": 3.2177, | |
| "step": 69950 | |
| }, | |
| { | |
| "epoch": 20.390329158170697, | |
| "grad_norm": 0.35696056485176086, | |
| "learning_rate": 0.00035552666860973474, | |
| "loss": 3.2174, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 20.390329158170697, | |
| "eval_accuracy": 0.3733148657543184, | |
| "eval_loss": 3.548022508621216, | |
| "eval_runtime": 178.9547, | |
| "eval_samples_per_second": 92.99, | |
| "eval_steps_per_second": 5.817, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 20.40489367899796, | |
| "grad_norm": 0.40440744161605835, | |
| "learning_rate": 0.00035535179248032643, | |
| "loss": 3.2185, | |
| "step": 70050 | |
| }, | |
| { | |
| "epoch": 20.419458199825225, | |
| "grad_norm": 0.42058974504470825, | |
| "learning_rate": 0.00035517691635091807, | |
| "loss": 3.217, | |
| "step": 70100 | |
| }, | |
| { | |
| "epoch": 20.43402272065249, | |
| "grad_norm": 0.3718623220920563, | |
| "learning_rate": 0.0003550020402215097, | |
| "loss": 3.2316, | |
| "step": 70150 | |
| }, | |
| { | |
| "epoch": 20.448587241479757, | |
| "grad_norm": 0.3532809019088745, | |
| "learning_rate": 0.0003548271640921014, | |
| "loss": 3.2117, | |
| "step": 70200 | |
| }, | |
| { | |
| "epoch": 20.46315176230702, | |
| "grad_norm": 0.38799864053726196, | |
| "learning_rate": 0.00035465228796269303, | |
| "loss": 3.2139, | |
| "step": 70250 | |
| }, | |
| { | |
| "epoch": 20.477716283134285, | |
| "grad_norm": 0.42314207553863525, | |
| "learning_rate": 0.0003544774118332848, | |
| "loss": 3.2205, | |
| "step": 70300 | |
| }, | |
| { | |
| "epoch": 20.49228080396155, | |
| "grad_norm": 0.40982669591903687, | |
| "learning_rate": 0.0003543025357038764, | |
| "loss": 3.2287, | |
| "step": 70350 | |
| }, | |
| { | |
| "epoch": 20.506845324788813, | |
| "grad_norm": 0.37632712721824646, | |
| "learning_rate": 0.00035412765957446805, | |
| "loss": 3.2222, | |
| "step": 70400 | |
| }, | |
| { | |
| "epoch": 20.52140984561608, | |
| "grad_norm": 0.38797253370285034, | |
| "learning_rate": 0.00035395278344505974, | |
| "loss": 3.228, | |
| "step": 70450 | |
| }, | |
| { | |
| "epoch": 20.535974366443345, | |
| "grad_norm": 0.39570584893226624, | |
| "learning_rate": 0.0003537779073156514, | |
| "loss": 3.2251, | |
| "step": 70500 | |
| }, | |
| { | |
| "epoch": 20.55053888727061, | |
| "grad_norm": 0.39038118720054626, | |
| "learning_rate": 0.00035360303118624306, | |
| "loss": 3.2203, | |
| "step": 70550 | |
| }, | |
| { | |
| "epoch": 20.565103408097873, | |
| "grad_norm": 0.3837476968765259, | |
| "learning_rate": 0.0003534281550568347, | |
| "loss": 3.2286, | |
| "step": 70600 | |
| }, | |
| { | |
| "epoch": 20.57966792892514, | |
| "grad_norm": 0.3945882022380829, | |
| "learning_rate": 0.0003532532789274264, | |
| "loss": 3.2226, | |
| "step": 70650 | |
| }, | |
| { | |
| "epoch": 20.594232449752404, | |
| "grad_norm": 0.39531904458999634, | |
| "learning_rate": 0.000353078402798018, | |
| "loss": 3.2334, | |
| "step": 70700 | |
| }, | |
| { | |
| "epoch": 20.608796970579668, | |
| "grad_norm": 0.38708940148353577, | |
| "learning_rate": 0.00035290352666860966, | |
| "loss": 3.2401, | |
| "step": 70750 | |
| }, | |
| { | |
| "epoch": 20.623361491406932, | |
| "grad_norm": 0.3743561804294586, | |
| "learning_rate": 0.0003527286505392014, | |
| "loss": 3.2474, | |
| "step": 70800 | |
| }, | |
| { | |
| "epoch": 20.637926012234196, | |
| "grad_norm": 0.3948131799697876, | |
| "learning_rate": 0.00035255377440979304, | |
| "loss": 3.2348, | |
| "step": 70850 | |
| }, | |
| { | |
| "epoch": 20.652490533061464, | |
| "grad_norm": 0.36936649680137634, | |
| "learning_rate": 0.00035237889828038473, | |
| "loss": 3.2371, | |
| "step": 70900 | |
| }, | |
| { | |
| "epoch": 20.667055053888728, | |
| "grad_norm": 0.4126057028770447, | |
| "learning_rate": 0.00035220402215097637, | |
| "loss": 3.233, | |
| "step": 70950 | |
| }, | |
| { | |
| "epoch": 20.68161957471599, | |
| "grad_norm": 0.37366172671318054, | |
| "learning_rate": 0.000352029146021568, | |
| "loss": 3.2276, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 20.68161957471599, | |
| "eval_accuracy": 0.37395671525930757, | |
| "eval_loss": 3.5362870693206787, | |
| "eval_runtime": 179.3905, | |
| "eval_samples_per_second": 92.764, | |
| "eval_steps_per_second": 5.803, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 20.696184095543256, | |
| "grad_norm": 0.3675983250141144, | |
| "learning_rate": 0.0003518542698921597, | |
| "loss": 3.2517, | |
| "step": 71050 | |
| }, | |
| { | |
| "epoch": 20.710748616370523, | |
| "grad_norm": 0.3695129156112671, | |
| "learning_rate": 0.00035167939376275133, | |
| "loss": 3.2383, | |
| "step": 71100 | |
| }, | |
| { | |
| "epoch": 20.725313137197787, | |
| "grad_norm": 0.38997822999954224, | |
| "learning_rate": 0.000351504517633343, | |
| "loss": 3.2221, | |
| "step": 71150 | |
| }, | |
| { | |
| "epoch": 20.73987765802505, | |
| "grad_norm": 0.3975091576576233, | |
| "learning_rate": 0.00035132964150393466, | |
| "loss": 3.2457, | |
| "step": 71200 | |
| }, | |
| { | |
| "epoch": 20.754442178852315, | |
| "grad_norm": 0.41574394702911377, | |
| "learning_rate": 0.0003511547653745263, | |
| "loss": 3.2452, | |
| "step": 71250 | |
| }, | |
| { | |
| "epoch": 20.76900669967958, | |
| "grad_norm": 0.39745384454727173, | |
| "learning_rate": 0.00035097988924511804, | |
| "loss": 3.2531, | |
| "step": 71300 | |
| }, | |
| { | |
| "epoch": 20.783571220506847, | |
| "grad_norm": 0.3609507083892822, | |
| "learning_rate": 0.0003508050131157097, | |
| "loss": 3.2371, | |
| "step": 71350 | |
| }, | |
| { | |
| "epoch": 20.79813574133411, | |
| "grad_norm": 0.3867054879665375, | |
| "learning_rate": 0.00035063013698630137, | |
| "loss": 3.2428, | |
| "step": 71400 | |
| }, | |
| { | |
| "epoch": 20.812700262161375, | |
| "grad_norm": 0.40387028455734253, | |
| "learning_rate": 0.000350455260856893, | |
| "loss": 3.237, | |
| "step": 71450 | |
| }, | |
| { | |
| "epoch": 20.82726478298864, | |
| "grad_norm": 0.3769632875919342, | |
| "learning_rate": 0.0003502803847274847, | |
| "loss": 3.2406, | |
| "step": 71500 | |
| }, | |
| { | |
| "epoch": 20.841829303815903, | |
| "grad_norm": 0.36971697211265564, | |
| "learning_rate": 0.00035010550859807633, | |
| "loss": 3.2429, | |
| "step": 71550 | |
| }, | |
| { | |
| "epoch": 20.85639382464317, | |
| "grad_norm": 0.38049620389938354, | |
| "learning_rate": 0.00034993063246866797, | |
| "loss": 3.2409, | |
| "step": 71600 | |
| }, | |
| { | |
| "epoch": 20.870958345470434, | |
| "grad_norm": 0.37809380888938904, | |
| "learning_rate": 0.00034975575633925966, | |
| "loss": 3.2387, | |
| "step": 71650 | |
| }, | |
| { | |
| "epoch": 20.8855228662977, | |
| "grad_norm": 0.4029906988143921, | |
| "learning_rate": 0.0003495808802098513, | |
| "loss": 3.2457, | |
| "step": 71700 | |
| }, | |
| { | |
| "epoch": 20.900087387124962, | |
| "grad_norm": 0.44036680459976196, | |
| "learning_rate": 0.00034940600408044304, | |
| "loss": 3.2435, | |
| "step": 71750 | |
| }, | |
| { | |
| "epoch": 20.91465190795223, | |
| "grad_norm": 0.40069833397865295, | |
| "learning_rate": 0.0003492311279510347, | |
| "loss": 3.2447, | |
| "step": 71800 | |
| }, | |
| { | |
| "epoch": 20.929216428779494, | |
| "grad_norm": 0.4169695973396301, | |
| "learning_rate": 0.0003490562518216263, | |
| "loss": 3.2302, | |
| "step": 71850 | |
| }, | |
| { | |
| "epoch": 20.943780949606758, | |
| "grad_norm": 0.3733760118484497, | |
| "learning_rate": 0.000348881375692218, | |
| "loss": 3.254, | |
| "step": 71900 | |
| }, | |
| { | |
| "epoch": 20.958345470434022, | |
| "grad_norm": 0.3739670515060425, | |
| "learning_rate": 0.00034870649956280964, | |
| "loss": 3.2525, | |
| "step": 71950 | |
| }, | |
| { | |
| "epoch": 20.972909991261286, | |
| "grad_norm": 0.36286690831184387, | |
| "learning_rate": 0.0003485316234334013, | |
| "loss": 3.2571, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 20.972909991261286, | |
| "eval_accuracy": 0.37421237316034206, | |
| "eval_loss": 3.5296003818511963, | |
| "eval_runtime": 179.1931, | |
| "eval_samples_per_second": 92.866, | |
| "eval_steps_per_second": 5.809, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 20.987474512088554, | |
| "grad_norm": 0.36834150552749634, | |
| "learning_rate": 0.00034835674730399296, | |
| "loss": 3.2607, | |
| "step": 72050 | |
| }, | |
| { | |
| "epoch": 21.002039032915818, | |
| "grad_norm": 0.40818142890930176, | |
| "learning_rate": 0.00034818187117458465, | |
| "loss": 3.2429, | |
| "step": 72100 | |
| }, | |
| { | |
| "epoch": 21.01660355374308, | |
| "grad_norm": 0.3603350818157196, | |
| "learning_rate": 0.0003480069950451763, | |
| "loss": 3.152, | |
| "step": 72150 | |
| }, | |
| { | |
| "epoch": 21.031168074570346, | |
| "grad_norm": 0.36682602763175964, | |
| "learning_rate": 0.0003478321189157679, | |
| "loss": 3.1391, | |
| "step": 72200 | |
| }, | |
| { | |
| "epoch": 21.045732595397613, | |
| "grad_norm": 0.3988490402698517, | |
| "learning_rate": 0.00034765724278635967, | |
| "loss": 3.1514, | |
| "step": 72250 | |
| }, | |
| { | |
| "epoch": 21.060297116224877, | |
| "grad_norm": 0.38823550939559937, | |
| "learning_rate": 0.0003474823666569513, | |
| "loss": 3.1507, | |
| "step": 72300 | |
| }, | |
| { | |
| "epoch": 21.07486163705214, | |
| "grad_norm": 0.4026493430137634, | |
| "learning_rate": 0.000347307490527543, | |
| "loss": 3.1611, | |
| "step": 72350 | |
| }, | |
| { | |
| "epoch": 21.089426157879405, | |
| "grad_norm": 0.3982233703136444, | |
| "learning_rate": 0.00034713261439813463, | |
| "loss": 3.1523, | |
| "step": 72400 | |
| }, | |
| { | |
| "epoch": 21.10399067870667, | |
| "grad_norm": 0.39218148589134216, | |
| "learning_rate": 0.00034695773826872627, | |
| "loss": 3.1553, | |
| "step": 72450 | |
| }, | |
| { | |
| "epoch": 21.118555199533937, | |
| "grad_norm": 0.4108741879463196, | |
| "learning_rate": 0.00034678286213931796, | |
| "loss": 3.1623, | |
| "step": 72500 | |
| }, | |
| { | |
| "epoch": 21.1331197203612, | |
| "grad_norm": 0.38565295934677124, | |
| "learning_rate": 0.0003466079860099096, | |
| "loss": 3.1706, | |
| "step": 72550 | |
| }, | |
| { | |
| "epoch": 21.147684241188465, | |
| "grad_norm": 0.4134363532066345, | |
| "learning_rate": 0.0003464331098805013, | |
| "loss": 3.1678, | |
| "step": 72600 | |
| }, | |
| { | |
| "epoch": 21.16224876201573, | |
| "grad_norm": 0.3783572018146515, | |
| "learning_rate": 0.0003462582337510929, | |
| "loss": 3.1789, | |
| "step": 72650 | |
| }, | |
| { | |
| "epoch": 21.176813282842993, | |
| "grad_norm": 0.35808396339416504, | |
| "learning_rate": 0.00034608335762168467, | |
| "loss": 3.1715, | |
| "step": 72700 | |
| }, | |
| { | |
| "epoch": 21.19137780367026, | |
| "grad_norm": 0.3888563811779022, | |
| "learning_rate": 0.0003459084814922763, | |
| "loss": 3.1744, | |
| "step": 72750 | |
| }, | |
| { | |
| "epoch": 21.205942324497524, | |
| "grad_norm": 0.3799726068973541, | |
| "learning_rate": 0.00034573360536286794, | |
| "loss": 3.1799, | |
| "step": 72800 | |
| }, | |
| { | |
| "epoch": 21.22050684532479, | |
| "grad_norm": 0.3735267221927643, | |
| "learning_rate": 0.00034555872923345963, | |
| "loss": 3.1833, | |
| "step": 72850 | |
| }, | |
| { | |
| "epoch": 21.235071366152052, | |
| "grad_norm": 0.41824302077293396, | |
| "learning_rate": 0.00034538385310405127, | |
| "loss": 3.193, | |
| "step": 72900 | |
| }, | |
| { | |
| "epoch": 21.24963588697932, | |
| "grad_norm": 0.3764243423938751, | |
| "learning_rate": 0.00034520897697464296, | |
| "loss": 3.1962, | |
| "step": 72950 | |
| }, | |
| { | |
| "epoch": 21.264200407806584, | |
| "grad_norm": 0.3964778482913971, | |
| "learning_rate": 0.0003450341008452346, | |
| "loss": 3.1985, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 21.264200407806584, | |
| "eval_accuracy": 0.37337625187407003, | |
| "eval_loss": 3.5458767414093018, | |
| "eval_runtime": 179.8315, | |
| "eval_samples_per_second": 92.537, | |
| "eval_steps_per_second": 5.789, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 21.278764928633848, | |
| "grad_norm": 0.38459548354148865, | |
| "learning_rate": 0.00034485922471582623, | |
| "loss": 3.1902, | |
| "step": 73050 | |
| }, | |
| { | |
| "epoch": 21.293329449461112, | |
| "grad_norm": 0.3928430378437042, | |
| "learning_rate": 0.0003446843485864179, | |
| "loss": 3.1909, | |
| "step": 73100 | |
| }, | |
| { | |
| "epoch": 21.307893970288376, | |
| "grad_norm": 0.39092567563056946, | |
| "learning_rate": 0.00034450947245700955, | |
| "loss": 3.186, | |
| "step": 73150 | |
| }, | |
| { | |
| "epoch": 21.322458491115643, | |
| "grad_norm": 0.37497106194496155, | |
| "learning_rate": 0.0003443345963276013, | |
| "loss": 3.1996, | |
| "step": 73200 | |
| }, | |
| { | |
| "epoch": 21.337023011942907, | |
| "grad_norm": 0.39056339859962463, | |
| "learning_rate": 0.00034415972019819294, | |
| "loss": 3.1937, | |
| "step": 73250 | |
| }, | |
| { | |
| "epoch": 21.35158753277017, | |
| "grad_norm": 0.3654477894306183, | |
| "learning_rate": 0.00034398484406878457, | |
| "loss": 3.1956, | |
| "step": 73300 | |
| }, | |
| { | |
| "epoch": 21.366152053597435, | |
| "grad_norm": 0.39564356207847595, | |
| "learning_rate": 0.00034380996793937626, | |
| "loss": 3.2004, | |
| "step": 73350 | |
| }, | |
| { | |
| "epoch": 21.380716574424703, | |
| "grad_norm": 0.39092445373535156, | |
| "learning_rate": 0.0003436350918099679, | |
| "loss": 3.2117, | |
| "step": 73400 | |
| }, | |
| { | |
| "epoch": 21.395281095251967, | |
| "grad_norm": 0.38866856694221497, | |
| "learning_rate": 0.0003434602156805596, | |
| "loss": 3.1905, | |
| "step": 73450 | |
| }, | |
| { | |
| "epoch": 21.40984561607923, | |
| "grad_norm": 0.3942776918411255, | |
| "learning_rate": 0.0003432853395511512, | |
| "loss": 3.2098, | |
| "step": 73500 | |
| }, | |
| { | |
| "epoch": 21.424410136906495, | |
| "grad_norm": 0.3794627785682678, | |
| "learning_rate": 0.0003431104634217429, | |
| "loss": 3.2067, | |
| "step": 73550 | |
| }, | |
| { | |
| "epoch": 21.43897465773376, | |
| "grad_norm": 0.38869741559028625, | |
| "learning_rate": 0.00034293558729233455, | |
| "loss": 3.197, | |
| "step": 73600 | |
| }, | |
| { | |
| "epoch": 21.453539178561027, | |
| "grad_norm": 0.3953700661659241, | |
| "learning_rate": 0.0003427607111629262, | |
| "loss": 3.2201, | |
| "step": 73650 | |
| }, | |
| { | |
| "epoch": 21.46810369938829, | |
| "grad_norm": 0.4092461168766022, | |
| "learning_rate": 0.00034258583503351793, | |
| "loss": 3.2107, | |
| "step": 73700 | |
| }, | |
| { | |
| "epoch": 21.482668220215555, | |
| "grad_norm": 0.37649816274642944, | |
| "learning_rate": 0.00034241095890410957, | |
| "loss": 3.2197, | |
| "step": 73750 | |
| }, | |
| { | |
| "epoch": 21.49723274104282, | |
| "grad_norm": 0.38819989562034607, | |
| "learning_rate": 0.00034223608277470126, | |
| "loss": 3.2178, | |
| "step": 73800 | |
| }, | |
| { | |
| "epoch": 21.511797261870086, | |
| "grad_norm": 0.3887033760547638, | |
| "learning_rate": 0.0003420612066452929, | |
| "loss": 3.2111, | |
| "step": 73850 | |
| }, | |
| { | |
| "epoch": 21.52636178269735, | |
| "grad_norm": 0.36329957842826843, | |
| "learning_rate": 0.00034188633051588453, | |
| "loss": 3.207, | |
| "step": 73900 | |
| }, | |
| { | |
| "epoch": 21.540926303524614, | |
| "grad_norm": 0.39315828680992126, | |
| "learning_rate": 0.0003417114543864762, | |
| "loss": 3.22, | |
| "step": 73950 | |
| }, | |
| { | |
| "epoch": 21.555490824351878, | |
| "grad_norm": 0.3785039782524109, | |
| "learning_rate": 0.00034153657825706786, | |
| "loss": 3.2095, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 21.555490824351878, | |
| "eval_accuracy": 0.3738740439141248, | |
| "eval_loss": 3.539914608001709, | |
| "eval_runtime": 179.8789, | |
| "eval_samples_per_second": 92.512, | |
| "eval_steps_per_second": 5.787, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 21.570055345179142, | |
| "grad_norm": 0.3970089256763458, | |
| "learning_rate": 0.00034136170212765955, | |
| "loss": 3.22, | |
| "step": 74050 | |
| }, | |
| { | |
| "epoch": 21.58461986600641, | |
| "grad_norm": 0.3790573477745056, | |
| "learning_rate": 0.0003411868259982512, | |
| "loss": 3.2221, | |
| "step": 74100 | |
| }, | |
| { | |
| "epoch": 21.599184386833674, | |
| "grad_norm": 0.3819750249385834, | |
| "learning_rate": 0.00034101194986884293, | |
| "loss": 3.2227, | |
| "step": 74150 | |
| }, | |
| { | |
| "epoch": 21.613748907660938, | |
| "grad_norm": 0.39000168442726135, | |
| "learning_rate": 0.00034083707373943456, | |
| "loss": 3.2129, | |
| "step": 74200 | |
| }, | |
| { | |
| "epoch": 21.6283134284882, | |
| "grad_norm": 0.3961773216724396, | |
| "learning_rate": 0.0003406621976100262, | |
| "loss": 3.2271, | |
| "step": 74250 | |
| }, | |
| { | |
| "epoch": 21.64287794931547, | |
| "grad_norm": 0.4604828953742981, | |
| "learning_rate": 0.0003404873214806179, | |
| "loss": 3.2178, | |
| "step": 74300 | |
| }, | |
| { | |
| "epoch": 21.657442470142733, | |
| "grad_norm": 0.4082699120044708, | |
| "learning_rate": 0.00034031244535120953, | |
| "loss": 3.2296, | |
| "step": 74350 | |
| }, | |
| { | |
| "epoch": 21.672006990969997, | |
| "grad_norm": 0.36979642510414124, | |
| "learning_rate": 0.0003401375692218012, | |
| "loss": 3.2126, | |
| "step": 74400 | |
| }, | |
| { | |
| "epoch": 21.68657151179726, | |
| "grad_norm": 0.41598883271217346, | |
| "learning_rate": 0.00033996269309239285, | |
| "loss": 3.2172, | |
| "step": 74450 | |
| }, | |
| { | |
| "epoch": 21.701136032624525, | |
| "grad_norm": 0.4014417231082916, | |
| "learning_rate": 0.0003397878169629845, | |
| "loss": 3.2295, | |
| "step": 74500 | |
| }, | |
| { | |
| "epoch": 21.715700553451793, | |
| "grad_norm": 0.39168688654899597, | |
| "learning_rate": 0.0003396129408335762, | |
| "loss": 3.2404, | |
| "step": 74550 | |
| }, | |
| { | |
| "epoch": 21.730265074279057, | |
| "grad_norm": 0.4000439941883087, | |
| "learning_rate": 0.0003394380647041678, | |
| "loss": 3.2364, | |
| "step": 74600 | |
| }, | |
| { | |
| "epoch": 21.74482959510632, | |
| "grad_norm": 0.3941012918949127, | |
| "learning_rate": 0.00033926318857475956, | |
| "loss": 3.2289, | |
| "step": 74650 | |
| }, | |
| { | |
| "epoch": 21.759394115933585, | |
| "grad_norm": 0.37271010875701904, | |
| "learning_rate": 0.0003390883124453512, | |
| "loss": 3.2265, | |
| "step": 74700 | |
| }, | |
| { | |
| "epoch": 21.77395863676085, | |
| "grad_norm": 0.4070070683956146, | |
| "learning_rate": 0.0003389134363159429, | |
| "loss": 3.2338, | |
| "step": 74750 | |
| }, | |
| { | |
| "epoch": 21.788523157588116, | |
| "grad_norm": 0.4027371108531952, | |
| "learning_rate": 0.0003387385601865345, | |
| "loss": 3.227, | |
| "step": 74800 | |
| }, | |
| { | |
| "epoch": 21.80308767841538, | |
| "grad_norm": 0.38957029581069946, | |
| "learning_rate": 0.00033856368405712616, | |
| "loss": 3.2263, | |
| "step": 74850 | |
| }, | |
| { | |
| "epoch": 21.817652199242644, | |
| "grad_norm": 0.4069308936595917, | |
| "learning_rate": 0.00033838880792771785, | |
| "loss": 3.2442, | |
| "step": 74900 | |
| }, | |
| { | |
| "epoch": 21.83221672006991, | |
| "grad_norm": 0.4276354908943176, | |
| "learning_rate": 0.0003382139317983095, | |
| "loss": 3.2518, | |
| "step": 74950 | |
| }, | |
| { | |
| "epoch": 21.846781240897176, | |
| "grad_norm": 0.3802242577075958, | |
| "learning_rate": 0.0003380390556689012, | |
| "loss": 3.2244, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 21.846781240897176, | |
| "eval_accuracy": 0.3742635282601351, | |
| "eval_loss": 3.531430721282959, | |
| "eval_runtime": 179.9299, | |
| "eval_samples_per_second": 92.486, | |
| "eval_steps_per_second": 5.786, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 21.86134576172444, | |
| "grad_norm": 0.38759249448776245, | |
| "learning_rate": 0.0003378641795394928, | |
| "loss": 3.2361, | |
| "step": 75050 | |
| }, | |
| { | |
| "epoch": 21.875910282551704, | |
| "grad_norm": 0.387056827545166, | |
| "learning_rate": 0.00033768930341008445, | |
| "loss": 3.2324, | |
| "step": 75100 | |
| }, | |
| { | |
| "epoch": 21.890474803378968, | |
| "grad_norm": 0.37463316321372986, | |
| "learning_rate": 0.0003375144272806762, | |
| "loss": 3.2475, | |
| "step": 75150 | |
| }, | |
| { | |
| "epoch": 21.905039324206232, | |
| "grad_norm": 0.39495569467544556, | |
| "learning_rate": 0.00033733955115126783, | |
| "loss": 3.2459, | |
| "step": 75200 | |
| }, | |
| { | |
| "epoch": 21.9196038450335, | |
| "grad_norm": 0.3841392993927002, | |
| "learning_rate": 0.0003371646750218595, | |
| "loss": 3.2408, | |
| "step": 75250 | |
| }, | |
| { | |
| "epoch": 21.934168365860764, | |
| "grad_norm": 0.40644872188568115, | |
| "learning_rate": 0.00033698979889245116, | |
| "loss": 3.237, | |
| "step": 75300 | |
| }, | |
| { | |
| "epoch": 21.948732886688028, | |
| "grad_norm": 0.3798723816871643, | |
| "learning_rate": 0.0003368149227630428, | |
| "loss": 3.237, | |
| "step": 75350 | |
| }, | |
| { | |
| "epoch": 21.96329740751529, | |
| "grad_norm": 0.38789594173431396, | |
| "learning_rate": 0.0003366400466336345, | |
| "loss": 3.26, | |
| "step": 75400 | |
| }, | |
| { | |
| "epoch": 21.97786192834256, | |
| "grad_norm": 0.3960188329219818, | |
| "learning_rate": 0.0003364651705042261, | |
| "loss": 3.2589, | |
| "step": 75450 | |
| }, | |
| { | |
| "epoch": 21.992426449169823, | |
| "grad_norm": 0.39203280210494995, | |
| "learning_rate": 0.0003362902943748178, | |
| "loss": 3.2426, | |
| "step": 75500 | |
| }, | |
| { | |
| "epoch": 22.006990969997087, | |
| "grad_norm": 0.38960763812065125, | |
| "learning_rate": 0.00033611541824540945, | |
| "loss": 3.1892, | |
| "step": 75550 | |
| }, | |
| { | |
| "epoch": 22.02155549082435, | |
| "grad_norm": 0.395844429731369, | |
| "learning_rate": 0.0003359405421160012, | |
| "loss": 3.1304, | |
| "step": 75600 | |
| }, | |
| { | |
| "epoch": 22.036120011651615, | |
| "grad_norm": 0.4585193991661072, | |
| "learning_rate": 0.0003357656659865928, | |
| "loss": 3.1417, | |
| "step": 75650 | |
| }, | |
| { | |
| "epoch": 22.050684532478883, | |
| "grad_norm": 0.39369523525238037, | |
| "learning_rate": 0.00033559078985718446, | |
| "loss": 3.1339, | |
| "step": 75700 | |
| }, | |
| { | |
| "epoch": 22.065249053306147, | |
| "grad_norm": 0.3878481090068817, | |
| "learning_rate": 0.00033541591372777615, | |
| "loss": 3.1498, | |
| "step": 75750 | |
| }, | |
| { | |
| "epoch": 22.07981357413341, | |
| "grad_norm": 0.40822944045066833, | |
| "learning_rate": 0.0003352410375983678, | |
| "loss": 3.1596, | |
| "step": 75800 | |
| }, | |
| { | |
| "epoch": 22.094378094960675, | |
| "grad_norm": 0.39915433526039124, | |
| "learning_rate": 0.0003350661614689595, | |
| "loss": 3.149, | |
| "step": 75850 | |
| }, | |
| { | |
| "epoch": 22.108942615787942, | |
| "grad_norm": 0.4160260558128357, | |
| "learning_rate": 0.0003348912853395511, | |
| "loss": 3.1563, | |
| "step": 75900 | |
| }, | |
| { | |
| "epoch": 22.123507136615206, | |
| "grad_norm": 0.44865837693214417, | |
| "learning_rate": 0.00033471640921014275, | |
| "loss": 3.1637, | |
| "step": 75950 | |
| }, | |
| { | |
| "epoch": 22.13807165744247, | |
| "grad_norm": 0.38839584589004517, | |
| "learning_rate": 0.00033454153308073444, | |
| "loss": 3.1653, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 22.13807165744247, | |
| "eval_accuracy": 0.37341882232493223, | |
| "eval_loss": 3.5493438243865967, | |
| "eval_runtime": 180.0454, | |
| "eval_samples_per_second": 92.427, | |
| "eval_steps_per_second": 5.782, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 22.152636178269734, | |
| "grad_norm": 0.40942075848579407, | |
| "learning_rate": 0.0003343666569513261, | |
| "loss": 3.1537, | |
| "step": 76050 | |
| }, | |
| { | |
| "epoch": 22.167200699097, | |
| "grad_norm": 0.37509503960609436, | |
| "learning_rate": 0.0003341917808219178, | |
| "loss": 3.1677, | |
| "step": 76100 | |
| }, | |
| { | |
| "epoch": 22.181765219924266, | |
| "grad_norm": 0.4073181450366974, | |
| "learning_rate": 0.00033401690469250946, | |
| "loss": 3.1711, | |
| "step": 76150 | |
| }, | |
| { | |
| "epoch": 22.19632974075153, | |
| "grad_norm": 0.3973526954650879, | |
| "learning_rate": 0.00033384202856310115, | |
| "loss": 3.163, | |
| "step": 76200 | |
| }, | |
| { | |
| "epoch": 22.210894261578794, | |
| "grad_norm": 0.38937580585479736, | |
| "learning_rate": 0.0003336671524336928, | |
| "loss": 3.1599, | |
| "step": 76250 | |
| }, | |
| { | |
| "epoch": 22.225458782406058, | |
| "grad_norm": 0.4311583936214447, | |
| "learning_rate": 0.0003334922763042844, | |
| "loss": 3.1733, | |
| "step": 76300 | |
| }, | |
| { | |
| "epoch": 22.240023303233322, | |
| "grad_norm": 0.3999621570110321, | |
| "learning_rate": 0.0003333174001748761, | |
| "loss": 3.1737, | |
| "step": 76350 | |
| }, | |
| { | |
| "epoch": 22.25458782406059, | |
| "grad_norm": 0.4125518500804901, | |
| "learning_rate": 0.00033314252404546775, | |
| "loss": 3.1615, | |
| "step": 76400 | |
| }, | |
| { | |
| "epoch": 22.269152344887853, | |
| "grad_norm": 0.3764672875404358, | |
| "learning_rate": 0.00033296764791605944, | |
| "loss": 3.1752, | |
| "step": 76450 | |
| }, | |
| { | |
| "epoch": 22.283716865715117, | |
| "grad_norm": 0.3720638155937195, | |
| "learning_rate": 0.0003327927717866511, | |
| "loss": 3.1851, | |
| "step": 76500 | |
| }, | |
| { | |
| "epoch": 22.29828138654238, | |
| "grad_norm": 0.42662569880485535, | |
| "learning_rate": 0.0003326178956572427, | |
| "loss": 3.1922, | |
| "step": 76550 | |
| }, | |
| { | |
| "epoch": 22.31284590736965, | |
| "grad_norm": 0.41372689604759216, | |
| "learning_rate": 0.00033244301952783446, | |
| "loss": 3.183, | |
| "step": 76600 | |
| }, | |
| { | |
| "epoch": 22.327410428196913, | |
| "grad_norm": 0.3920452892780304, | |
| "learning_rate": 0.0003322681433984261, | |
| "loss": 3.1926, | |
| "step": 76650 | |
| }, | |
| { | |
| "epoch": 22.341974949024177, | |
| "grad_norm": 0.4548211693763733, | |
| "learning_rate": 0.0003320932672690178, | |
| "loss": 3.1928, | |
| "step": 76700 | |
| }, | |
| { | |
| "epoch": 22.35653946985144, | |
| "grad_norm": 0.377517968416214, | |
| "learning_rate": 0.0003319183911396094, | |
| "loss": 3.1863, | |
| "step": 76750 | |
| }, | |
| { | |
| "epoch": 22.371103990678705, | |
| "grad_norm": 0.44053518772125244, | |
| "learning_rate": 0.0003317435150102011, | |
| "loss": 3.1934, | |
| "step": 76800 | |
| }, | |
| { | |
| "epoch": 22.385668511505973, | |
| "grad_norm": 0.3874462842941284, | |
| "learning_rate": 0.00033156863888079275, | |
| "loss": 3.1817, | |
| "step": 76850 | |
| }, | |
| { | |
| "epoch": 22.400233032333237, | |
| "grad_norm": 0.3760935962200165, | |
| "learning_rate": 0.0003313937627513844, | |
| "loss": 3.1908, | |
| "step": 76900 | |
| }, | |
| { | |
| "epoch": 22.4147975531605, | |
| "grad_norm": 0.40698912739753723, | |
| "learning_rate": 0.00033121888662197607, | |
| "loss": 3.1974, | |
| "step": 76950 | |
| }, | |
| { | |
| "epoch": 22.429362073987765, | |
| "grad_norm": 0.38324347138404846, | |
| "learning_rate": 0.0003310440104925677, | |
| "loss": 3.1924, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 22.429362073987765, | |
| "eval_accuracy": 0.37394048674489044, | |
| "eval_loss": 3.5421411991119385, | |
| "eval_runtime": 178.6723, | |
| "eval_samples_per_second": 93.137, | |
| "eval_steps_per_second": 5.826, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 22.443926594815032, | |
| "grad_norm": 0.4046727120876312, | |
| "learning_rate": 0.00033086913436315945, | |
| "loss": 3.1997, | |
| "step": 77050 | |
| }, | |
| { | |
| "epoch": 22.458491115642296, | |
| "grad_norm": 0.36527130007743835, | |
| "learning_rate": 0.0003306942582337511, | |
| "loss": 3.2036, | |
| "step": 77100 | |
| }, | |
| { | |
| "epoch": 22.47305563646956, | |
| "grad_norm": 0.39856159687042236, | |
| "learning_rate": 0.0003305193821043427, | |
| "loss": 3.2087, | |
| "step": 77150 | |
| }, | |
| { | |
| "epoch": 22.487620157296824, | |
| "grad_norm": 0.40473848581314087, | |
| "learning_rate": 0.0003303445059749344, | |
| "loss": 3.214, | |
| "step": 77200 | |
| }, | |
| { | |
| "epoch": 22.502184678124088, | |
| "grad_norm": 0.3842359185218811, | |
| "learning_rate": 0.00033016962984552605, | |
| "loss": 3.2017, | |
| "step": 77250 | |
| }, | |
| { | |
| "epoch": 22.516749198951356, | |
| "grad_norm": 0.37999603152275085, | |
| "learning_rate": 0.00032999475371611774, | |
| "loss": 3.1993, | |
| "step": 77300 | |
| }, | |
| { | |
| "epoch": 22.53131371977862, | |
| "grad_norm": 0.4159919023513794, | |
| "learning_rate": 0.0003298198775867094, | |
| "loss": 3.2092, | |
| "step": 77350 | |
| }, | |
| { | |
| "epoch": 22.545878240605884, | |
| "grad_norm": 0.38509103655815125, | |
| "learning_rate": 0.000329645001457301, | |
| "loss": 3.2052, | |
| "step": 77400 | |
| }, | |
| { | |
| "epoch": 22.560442761433148, | |
| "grad_norm": 0.3961235284805298, | |
| "learning_rate": 0.0003294701253278927, | |
| "loss": 3.2127, | |
| "step": 77450 | |
| }, | |
| { | |
| "epoch": 22.575007282260415, | |
| "grad_norm": 0.42305219173431396, | |
| "learning_rate": 0.00032929524919848434, | |
| "loss": 3.2055, | |
| "step": 77500 | |
| }, | |
| { | |
| "epoch": 22.58957180308768, | |
| "grad_norm": 0.3830503523349762, | |
| "learning_rate": 0.0003291203730690761, | |
| "loss": 3.2134, | |
| "step": 77550 | |
| }, | |
| { | |
| "epoch": 22.604136323914943, | |
| "grad_norm": 0.3828100562095642, | |
| "learning_rate": 0.0003289454969396677, | |
| "loss": 3.2212, | |
| "step": 77600 | |
| }, | |
| { | |
| "epoch": 22.618700844742207, | |
| "grad_norm": 0.4239085912704468, | |
| "learning_rate": 0.0003287706208102594, | |
| "loss": 3.2097, | |
| "step": 77650 | |
| }, | |
| { | |
| "epoch": 22.63326536556947, | |
| "grad_norm": 0.3623301386833191, | |
| "learning_rate": 0.00032859574468085105, | |
| "loss": 3.2136, | |
| "step": 77700 | |
| }, | |
| { | |
| "epoch": 22.64782988639674, | |
| "grad_norm": 0.39616623520851135, | |
| "learning_rate": 0.0003284208685514427, | |
| "loss": 3.2143, | |
| "step": 77750 | |
| }, | |
| { | |
| "epoch": 22.662394407224003, | |
| "grad_norm": 0.3920958936214447, | |
| "learning_rate": 0.0003282459924220344, | |
| "loss": 3.2097, | |
| "step": 77800 | |
| }, | |
| { | |
| "epoch": 22.676958928051267, | |
| "grad_norm": 0.3976283669471741, | |
| "learning_rate": 0.000328071116292626, | |
| "loss": 3.2166, | |
| "step": 77850 | |
| }, | |
| { | |
| "epoch": 22.69152344887853, | |
| "grad_norm": 0.39355382323265076, | |
| "learning_rate": 0.0003278962401632177, | |
| "loss": 3.2129, | |
| "step": 77900 | |
| }, | |
| { | |
| "epoch": 22.7060879697058, | |
| "grad_norm": 0.38081255555152893, | |
| "learning_rate": 0.00032772136403380934, | |
| "loss": 3.2249, | |
| "step": 77950 | |
| }, | |
| { | |
| "epoch": 22.720652490533062, | |
| "grad_norm": 0.397743284702301, | |
| "learning_rate": 0.000327546487904401, | |
| "loss": 3.2211, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 22.720652490533062, | |
| "eval_accuracy": 0.37458386502297686, | |
| "eval_loss": 3.534937620162964, | |
| "eval_runtime": 178.7352, | |
| "eval_samples_per_second": 93.104, | |
| "eval_steps_per_second": 5.824, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 22.735217011360326, | |
| "grad_norm": 0.36255332827568054, | |
| "learning_rate": 0.0003273716117749927, | |
| "loss": 3.2126, | |
| "step": 78050 | |
| }, | |
| { | |
| "epoch": 22.74978153218759, | |
| "grad_norm": 0.41010552644729614, | |
| "learning_rate": 0.00032719673564558435, | |
| "loss": 3.2159, | |
| "step": 78100 | |
| }, | |
| { | |
| "epoch": 22.764346053014854, | |
| "grad_norm": 0.371345192193985, | |
| "learning_rate": 0.00032702185951617605, | |
| "loss": 3.2146, | |
| "step": 78150 | |
| }, | |
| { | |
| "epoch": 22.778910573842122, | |
| "grad_norm": 0.38916873931884766, | |
| "learning_rate": 0.0003268469833867677, | |
| "loss": 3.2231, | |
| "step": 78200 | |
| }, | |
| { | |
| "epoch": 22.793475094669386, | |
| "grad_norm": 0.37377578020095825, | |
| "learning_rate": 0.00032667210725735937, | |
| "loss": 3.223, | |
| "step": 78250 | |
| }, | |
| { | |
| "epoch": 22.80803961549665, | |
| "grad_norm": 0.41026216745376587, | |
| "learning_rate": 0.000326497231127951, | |
| "loss": 3.2325, | |
| "step": 78300 | |
| }, | |
| { | |
| "epoch": 22.822604136323914, | |
| "grad_norm": 0.3754938542842865, | |
| "learning_rate": 0.00032632235499854264, | |
| "loss": 3.2346, | |
| "step": 78350 | |
| }, | |
| { | |
| "epoch": 22.837168657151178, | |
| "grad_norm": 0.38638442754745483, | |
| "learning_rate": 0.00032614747886913433, | |
| "loss": 3.2235, | |
| "step": 78400 | |
| }, | |
| { | |
| "epoch": 22.851733177978446, | |
| "grad_norm": 0.3845442533493042, | |
| "learning_rate": 0.00032597260273972597, | |
| "loss": 3.2299, | |
| "step": 78450 | |
| }, | |
| { | |
| "epoch": 22.86629769880571, | |
| "grad_norm": 0.3820870518684387, | |
| "learning_rate": 0.0003257977266103177, | |
| "loss": 3.2265, | |
| "step": 78500 | |
| }, | |
| { | |
| "epoch": 22.880862219632974, | |
| "grad_norm": 0.39607709646224976, | |
| "learning_rate": 0.00032562285048090935, | |
| "loss": 3.2228, | |
| "step": 78550 | |
| }, | |
| { | |
| "epoch": 22.895426740460238, | |
| "grad_norm": 0.3708309531211853, | |
| "learning_rate": 0.000325447974351501, | |
| "loss": 3.2371, | |
| "step": 78600 | |
| }, | |
| { | |
| "epoch": 22.909991261287505, | |
| "grad_norm": 0.37607479095458984, | |
| "learning_rate": 0.0003252730982220927, | |
| "loss": 3.2399, | |
| "step": 78650 | |
| }, | |
| { | |
| "epoch": 22.92455578211477, | |
| "grad_norm": 0.40992024540901184, | |
| "learning_rate": 0.0003250982220926843, | |
| "loss": 3.2251, | |
| "step": 78700 | |
| }, | |
| { | |
| "epoch": 22.939120302942033, | |
| "grad_norm": 0.41538867354393005, | |
| "learning_rate": 0.000324923345963276, | |
| "loss": 3.2247, | |
| "step": 78750 | |
| }, | |
| { | |
| "epoch": 22.953684823769297, | |
| "grad_norm": 0.3930741846561432, | |
| "learning_rate": 0.00032474846983386764, | |
| "loss": 3.2153, | |
| "step": 78800 | |
| }, | |
| { | |
| "epoch": 22.96824934459656, | |
| "grad_norm": 0.3772921562194824, | |
| "learning_rate": 0.00032457359370445933, | |
| "loss": 3.2273, | |
| "step": 78850 | |
| }, | |
| { | |
| "epoch": 22.98281386542383, | |
| "grad_norm": 0.41579151153564453, | |
| "learning_rate": 0.00032439871757505097, | |
| "loss": 3.2488, | |
| "step": 78900 | |
| }, | |
| { | |
| "epoch": 22.997378386251093, | |
| "grad_norm": 0.37834632396698, | |
| "learning_rate": 0.0003242238414456426, | |
| "loss": 3.2379, | |
| "step": 78950 | |
| }, | |
| { | |
| "epoch": 23.011942907078357, | |
| "grad_norm": 0.37508898973464966, | |
| "learning_rate": 0.00032404896531623435, | |
| "loss": 3.148, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 23.011942907078357, | |
| "eval_accuracy": 0.3738874500782085, | |
| "eval_loss": 3.542969226837158, | |
| "eval_runtime": 178.5346, | |
| "eval_samples_per_second": 93.209, | |
| "eval_steps_per_second": 5.831, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 23.02650742790562, | |
| "grad_norm": 0.39909064769744873, | |
| "learning_rate": 0.000323874089186826, | |
| "loss": 3.1349, | |
| "step": 79050 | |
| }, | |
| { | |
| "epoch": 23.04107194873289, | |
| "grad_norm": 0.39390829205513, | |
| "learning_rate": 0.0003236992130574177, | |
| "loss": 3.1389, | |
| "step": 79100 | |
| }, | |
| { | |
| "epoch": 23.055636469560152, | |
| "grad_norm": 0.4094650149345398, | |
| "learning_rate": 0.0003235243369280093, | |
| "loss": 3.133, | |
| "step": 79150 | |
| }, | |
| { | |
| "epoch": 23.070200990387416, | |
| "grad_norm": 0.41038405895233154, | |
| "learning_rate": 0.00032334946079860095, | |
| "loss": 3.1384, | |
| "step": 79200 | |
| }, | |
| { | |
| "epoch": 23.08476551121468, | |
| "grad_norm": 0.4140039384365082, | |
| "learning_rate": 0.00032317458466919264, | |
| "loss": 3.1473, | |
| "step": 79250 | |
| }, | |
| { | |
| "epoch": 23.099330032041944, | |
| "grad_norm": 0.4052468240261078, | |
| "learning_rate": 0.0003229997085397843, | |
| "loss": 3.1486, | |
| "step": 79300 | |
| }, | |
| { | |
| "epoch": 23.113894552869212, | |
| "grad_norm": 0.4668196141719818, | |
| "learning_rate": 0.00032282483241037596, | |
| "loss": 3.15, | |
| "step": 79350 | |
| }, | |
| { | |
| "epoch": 23.128459073696476, | |
| "grad_norm": 0.4077889919281006, | |
| "learning_rate": 0.0003226499562809676, | |
| "loss": 3.1468, | |
| "step": 79400 | |
| }, | |
| { | |
| "epoch": 23.14302359452374, | |
| "grad_norm": 0.4005262553691864, | |
| "learning_rate": 0.00032247508015155924, | |
| "loss": 3.1539, | |
| "step": 79450 | |
| }, | |
| { | |
| "epoch": 23.157588115351004, | |
| "grad_norm": 0.4289659261703491, | |
| "learning_rate": 0.000322300204022151, | |
| "loss": 3.158, | |
| "step": 79500 | |
| }, | |
| { | |
| "epoch": 23.17215263617827, | |
| "grad_norm": 0.4152050316333771, | |
| "learning_rate": 0.0003221253278927426, | |
| "loss": 3.1563, | |
| "step": 79550 | |
| }, | |
| { | |
| "epoch": 23.186717157005535, | |
| "grad_norm": 0.3984936773777008, | |
| "learning_rate": 0.0003219504517633343, | |
| "loss": 3.1531, | |
| "step": 79600 | |
| }, | |
| { | |
| "epoch": 23.2012816778328, | |
| "grad_norm": 0.3805445730686188, | |
| "learning_rate": 0.00032177557563392594, | |
| "loss": 3.1668, | |
| "step": 79650 | |
| }, | |
| { | |
| "epoch": 23.215846198660063, | |
| "grad_norm": 0.4050346314907074, | |
| "learning_rate": 0.00032160069950451763, | |
| "loss": 3.1607, | |
| "step": 79700 | |
| }, | |
| { | |
| "epoch": 23.230410719487327, | |
| "grad_norm": 0.3911287188529968, | |
| "learning_rate": 0.00032142582337510927, | |
| "loss": 3.1687, | |
| "step": 79750 | |
| }, | |
| { | |
| "epoch": 23.244975240314595, | |
| "grad_norm": 0.4178531765937805, | |
| "learning_rate": 0.0003212509472457009, | |
| "loss": 3.1675, | |
| "step": 79800 | |
| }, | |
| { | |
| "epoch": 23.25953976114186, | |
| "grad_norm": 0.38798150420188904, | |
| "learning_rate": 0.0003210760711162926, | |
| "loss": 3.1744, | |
| "step": 79850 | |
| }, | |
| { | |
| "epoch": 23.274104281969123, | |
| "grad_norm": 0.43264156579971313, | |
| "learning_rate": 0.00032090119498688423, | |
| "loss": 3.1695, | |
| "step": 79900 | |
| }, | |
| { | |
| "epoch": 23.288668802796387, | |
| "grad_norm": 0.41300347447395325, | |
| "learning_rate": 0.0003207263188574759, | |
| "loss": 3.1785, | |
| "step": 79950 | |
| }, | |
| { | |
| "epoch": 23.30323332362365, | |
| "grad_norm": 0.4206102192401886, | |
| "learning_rate": 0.0003205514427280676, | |
| "loss": 3.1766, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 23.30323332362365, | |
| "eval_accuracy": 0.3740295083783234, | |
| "eval_loss": 3.546813488006592, | |
| "eval_runtime": 178.6519, | |
| "eval_samples_per_second": 93.148, | |
| "eval_steps_per_second": 5.827, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 23.31779784445092, | |
| "grad_norm": 0.40217486023902893, | |
| "learning_rate": 0.00032037656659865925, | |
| "loss": 3.1198, | |
| "step": 80050 | |
| }, | |
| { | |
| "epoch": 23.332362365278183, | |
| "grad_norm": 0.3986579477787018, | |
| "learning_rate": 0.00032020169046925094, | |
| "loss": 3.131, | |
| "step": 80100 | |
| }, | |
| { | |
| "epoch": 23.346926886105447, | |
| "grad_norm": 0.3992542326450348, | |
| "learning_rate": 0.0003200268143398426, | |
| "loss": 3.1436, | |
| "step": 80150 | |
| }, | |
| { | |
| "epoch": 23.36149140693271, | |
| "grad_norm": 0.41430288553237915, | |
| "learning_rate": 0.00031985193821043427, | |
| "loss": 3.1413, | |
| "step": 80200 | |
| }, | |
| { | |
| "epoch": 23.376055927759978, | |
| "grad_norm": 0.39568737149238586, | |
| "learning_rate": 0.0003196770620810259, | |
| "loss": 3.1504, | |
| "step": 80250 | |
| }, | |
| { | |
| "epoch": 23.390620448587242, | |
| "grad_norm": 0.38190150260925293, | |
| "learning_rate": 0.0003195021859516176, | |
| "loss": 3.1547, | |
| "step": 80300 | |
| }, | |
| { | |
| "epoch": 23.405184969414506, | |
| "grad_norm": 0.4019649922847748, | |
| "learning_rate": 0.00031932730982220923, | |
| "loss": 3.158, | |
| "step": 80350 | |
| }, | |
| { | |
| "epoch": 23.41974949024177, | |
| "grad_norm": 0.3808055818080902, | |
| "learning_rate": 0.00031915243369280087, | |
| "loss": 3.1502, | |
| "step": 80400 | |
| }, | |
| { | |
| "epoch": 23.434314011069034, | |
| "grad_norm": 0.42824339866638184, | |
| "learning_rate": 0.00031897755756339256, | |
| "loss": 3.1643, | |
| "step": 80450 | |
| }, | |
| { | |
| "epoch": 23.448878531896302, | |
| "grad_norm": 0.3883609473705292, | |
| "learning_rate": 0.0003188026814339842, | |
| "loss": 3.1631, | |
| "step": 80500 | |
| }, | |
| { | |
| "epoch": 23.463443052723566, | |
| "grad_norm": 0.3974464237689972, | |
| "learning_rate": 0.00031862780530457594, | |
| "loss": 3.1658, | |
| "step": 80550 | |
| }, | |
| { | |
| "epoch": 23.47800757355083, | |
| "grad_norm": 0.39729073643684387, | |
| "learning_rate": 0.0003184529291751676, | |
| "loss": 3.1728, | |
| "step": 80600 | |
| }, | |
| { | |
| "epoch": 23.492572094378094, | |
| "grad_norm": 0.4245384633541107, | |
| "learning_rate": 0.0003182780530457592, | |
| "loss": 3.1712, | |
| "step": 80650 | |
| }, | |
| { | |
| "epoch": 23.50713661520536, | |
| "grad_norm": 0.3828080892562866, | |
| "learning_rate": 0.0003181031769163509, | |
| "loss": 3.1603, | |
| "step": 80700 | |
| }, | |
| { | |
| "epoch": 23.521701136032625, | |
| "grad_norm": 0.4366213083267212, | |
| "learning_rate": 0.00031792830078694254, | |
| "loss": 3.1628, | |
| "step": 80750 | |
| }, | |
| { | |
| "epoch": 23.53626565685989, | |
| "grad_norm": 0.39621713757514954, | |
| "learning_rate": 0.0003177534246575342, | |
| "loss": 3.1789, | |
| "step": 80800 | |
| }, | |
| { | |
| "epoch": 23.550830177687153, | |
| "grad_norm": 0.3925442397594452, | |
| "learning_rate": 0.00031757854852812586, | |
| "loss": 3.17, | |
| "step": 80850 | |
| }, | |
| { | |
| "epoch": 23.565394698514417, | |
| "grad_norm": 0.4557037055492401, | |
| "learning_rate": 0.00031740367239871755, | |
| "loss": 3.1698, | |
| "step": 80900 | |
| }, | |
| { | |
| "epoch": 23.579959219341685, | |
| "grad_norm": 0.41575995087623596, | |
| "learning_rate": 0.0003172287962693092, | |
| "loss": 3.1684, | |
| "step": 80950 | |
| }, | |
| { | |
| "epoch": 23.59452374016895, | |
| "grad_norm": 0.39075565338134766, | |
| "learning_rate": 0.0003170539201399008, | |
| "loss": 3.1807, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 23.59452374016895, | |
| "eval_accuracy": 0.37377808400278895, | |
| "eval_loss": 3.5497612953186035, | |
| "eval_runtime": 179.6798, | |
| "eval_samples_per_second": 92.615, | |
| "eval_steps_per_second": 5.794, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 23.609088260996213, | |
| "grad_norm": 0.4015454053878784, | |
| "learning_rate": 0.00031687904401049257, | |
| "loss": 3.182, | |
| "step": 81050 | |
| }, | |
| { | |
| "epoch": 23.623652781823477, | |
| "grad_norm": 0.4120422899723053, | |
| "learning_rate": 0.0003167041678810842, | |
| "loss": 3.1766, | |
| "step": 81100 | |
| }, | |
| { | |
| "epoch": 23.638217302650745, | |
| "grad_norm": 0.39650651812553406, | |
| "learning_rate": 0.0003165292917516759, | |
| "loss": 3.1766, | |
| "step": 81150 | |
| }, | |
| { | |
| "epoch": 23.65278182347801, | |
| "grad_norm": 0.42096227407455444, | |
| "learning_rate": 0.00031635441562226753, | |
| "loss": 3.1859, | |
| "step": 81200 | |
| }, | |
| { | |
| "epoch": 23.667346344305273, | |
| "grad_norm": 0.49677303433418274, | |
| "learning_rate": 0.00031617953949285917, | |
| "loss": 3.1784, | |
| "step": 81250 | |
| }, | |
| { | |
| "epoch": 23.681910865132537, | |
| "grad_norm": 0.4429994225502014, | |
| "learning_rate": 0.00031600466336345086, | |
| "loss": 3.2021, | |
| "step": 81300 | |
| }, | |
| { | |
| "epoch": 23.6964753859598, | |
| "grad_norm": 0.398057222366333, | |
| "learning_rate": 0.0003158297872340425, | |
| "loss": 3.1849, | |
| "step": 81350 | |
| }, | |
| { | |
| "epoch": 23.711039906787068, | |
| "grad_norm": 0.43820124864578247, | |
| "learning_rate": 0.0003156549111046342, | |
| "loss": 3.1758, | |
| "step": 81400 | |
| }, | |
| { | |
| "epoch": 23.725604427614332, | |
| "grad_norm": 0.4369910955429077, | |
| "learning_rate": 0.0003154800349752258, | |
| "loss": 3.1826, | |
| "step": 81450 | |
| }, | |
| { | |
| "epoch": 23.740168948441596, | |
| "grad_norm": 0.39258837699890137, | |
| "learning_rate": 0.00031530515884581757, | |
| "loss": 3.186, | |
| "step": 81500 | |
| }, | |
| { | |
| "epoch": 23.75473346926886, | |
| "grad_norm": 0.4479694962501526, | |
| "learning_rate": 0.0003151302827164092, | |
| "loss": 3.1904, | |
| "step": 81550 | |
| }, | |
| { | |
| "epoch": 23.769297990096128, | |
| "grad_norm": 0.39652347564697266, | |
| "learning_rate": 0.00031495540658700084, | |
| "loss": 3.1899, | |
| "step": 81600 | |
| }, | |
| { | |
| "epoch": 23.78386251092339, | |
| "grad_norm": 0.4215203523635864, | |
| "learning_rate": 0.00031478053045759253, | |
| "loss": 3.1985, | |
| "step": 81650 | |
| }, | |
| { | |
| "epoch": 23.798427031750656, | |
| "grad_norm": 0.39612090587615967, | |
| "learning_rate": 0.00031460565432818417, | |
| "loss": 3.1824, | |
| "step": 81700 | |
| }, | |
| { | |
| "epoch": 23.81299155257792, | |
| "grad_norm": 0.4307905435562134, | |
| "learning_rate": 0.00031443077819877586, | |
| "loss": 3.1988, | |
| "step": 81750 | |
| }, | |
| { | |
| "epoch": 23.827556073405184, | |
| "grad_norm": 0.4627580940723419, | |
| "learning_rate": 0.0003142559020693675, | |
| "loss": 3.2016, | |
| "step": 81800 | |
| }, | |
| { | |
| "epoch": 23.84212059423245, | |
| "grad_norm": 0.39896416664123535, | |
| "learning_rate": 0.00031408102593995913, | |
| "loss": 3.1818, | |
| "step": 81850 | |
| }, | |
| { | |
| "epoch": 23.856685115059715, | |
| "grad_norm": 0.416999489068985, | |
| "learning_rate": 0.0003139061498105508, | |
| "loss": 3.1896, | |
| "step": 81900 | |
| }, | |
| { | |
| "epoch": 23.87124963588698, | |
| "grad_norm": 0.390245646238327, | |
| "learning_rate": 0.00031373127368114245, | |
| "loss": 3.2024, | |
| "step": 81950 | |
| }, | |
| { | |
| "epoch": 23.885814156714243, | |
| "grad_norm": 0.43539178371429443, | |
| "learning_rate": 0.0003135563975517342, | |
| "loss": 3.2014, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 23.885814156714243, | |
| "eval_accuracy": 0.3740790171070886, | |
| "eval_loss": 3.5404930114746094, | |
| "eval_runtime": 180.5158, | |
| "eval_samples_per_second": 92.186, | |
| "eval_steps_per_second": 5.767, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 23.900378677541507, | |
| "grad_norm": 0.4117051661014557, | |
| "learning_rate": 0.00031338152142232584, | |
| "loss": 3.1863, | |
| "step": 82050 | |
| }, | |
| { | |
| "epoch": 23.914943198368775, | |
| "grad_norm": 0.3843604624271393, | |
| "learning_rate": 0.00031320664529291747, | |
| "loss": 3.2044, | |
| "step": 82100 | |
| }, | |
| { | |
| "epoch": 23.92950771919604, | |
| "grad_norm": 0.39629417657852173, | |
| "learning_rate": 0.00031303176916350916, | |
| "loss": 3.2066, | |
| "step": 82150 | |
| }, | |
| { | |
| "epoch": 23.944072240023303, | |
| "grad_norm": 0.40928950905799866, | |
| "learning_rate": 0.0003128568930341008, | |
| "loss": 3.2048, | |
| "step": 82200 | |
| }, | |
| { | |
| "epoch": 23.958636760850567, | |
| "grad_norm": 0.40854644775390625, | |
| "learning_rate": 0.0003126820169046925, | |
| "loss": 3.2035, | |
| "step": 82250 | |
| }, | |
| { | |
| "epoch": 23.973201281677834, | |
| "grad_norm": 0.41340750455856323, | |
| "learning_rate": 0.0003125071407752841, | |
| "loss": 3.1906, | |
| "step": 82300 | |
| }, | |
| { | |
| "epoch": 23.9877658025051, | |
| "grad_norm": 0.4238535761833191, | |
| "learning_rate": 0.0003123322646458758, | |
| "loss": 3.2094, | |
| "step": 82350 | |
| }, | |
| { | |
| "epoch": 24.002330323332362, | |
| "grad_norm": 0.4978727698326111, | |
| "learning_rate": 0.00031215738851646745, | |
| "loss": 3.1901, | |
| "step": 82400 | |
| }, | |
| { | |
| "epoch": 24.016894844159626, | |
| "grad_norm": 0.43836721777915955, | |
| "learning_rate": 0.0003119825123870591, | |
| "loss": 3.1183, | |
| "step": 82450 | |
| }, | |
| { | |
| "epoch": 24.03145936498689, | |
| "grad_norm": 0.4122955799102783, | |
| "learning_rate": 0.00031180763625765083, | |
| "loss": 3.1271, | |
| "step": 82500 | |
| }, | |
| { | |
| "epoch": 24.046023885814158, | |
| "grad_norm": 0.43053796887397766, | |
| "learning_rate": 0.00031163276012824247, | |
| "loss": 3.1254, | |
| "step": 82550 | |
| }, | |
| { | |
| "epoch": 24.060588406641422, | |
| "grad_norm": 0.4107215702533722, | |
| "learning_rate": 0.00031145788399883416, | |
| "loss": 3.146, | |
| "step": 82600 | |
| }, | |
| { | |
| "epoch": 24.075152927468686, | |
| "grad_norm": 0.3937588334083557, | |
| "learning_rate": 0.0003112830078694258, | |
| "loss": 3.1431, | |
| "step": 82650 | |
| }, | |
| { | |
| "epoch": 24.08971744829595, | |
| "grad_norm": 0.40734755992889404, | |
| "learning_rate": 0.00031110813174001743, | |
| "loss": 3.1434, | |
| "step": 82700 | |
| }, | |
| { | |
| "epoch": 24.104281969123218, | |
| "grad_norm": 0.3780282139778137, | |
| "learning_rate": 0.0003109332556106091, | |
| "loss": 3.1484, | |
| "step": 82750 | |
| }, | |
| { | |
| "epoch": 24.11884648995048, | |
| "grad_norm": 0.41484034061431885, | |
| "learning_rate": 0.00031075837948120076, | |
| "loss": 3.1533, | |
| "step": 82800 | |
| }, | |
| { | |
| "epoch": 24.133411010777746, | |
| "grad_norm": 0.3923600912094116, | |
| "learning_rate": 0.00031058350335179245, | |
| "loss": 3.1487, | |
| "step": 82850 | |
| }, | |
| { | |
| "epoch": 24.14797553160501, | |
| "grad_norm": 0.43652015924453735, | |
| "learning_rate": 0.0003104086272223841, | |
| "loss": 3.1473, | |
| "step": 82900 | |
| }, | |
| { | |
| "epoch": 24.162540052432274, | |
| "grad_norm": 0.4114309549331665, | |
| "learning_rate": 0.00031023375109297583, | |
| "loss": 3.1559, | |
| "step": 82950 | |
| }, | |
| { | |
| "epoch": 24.17710457325954, | |
| "grad_norm": 0.4024522006511688, | |
| "learning_rate": 0.00031005887496356746, | |
| "loss": 3.159, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 24.17710457325954, | |
| "eval_accuracy": 0.37380078040338677, | |
| "eval_loss": 3.5526421070098877, | |
| "eval_runtime": 180.4121, | |
| "eval_samples_per_second": 92.239, | |
| "eval_steps_per_second": 5.77, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 24.191669094086805, | |
| "grad_norm": 0.42163506150245667, | |
| "learning_rate": 0.0003098839988341591, | |
| "loss": 3.1565, | |
| "step": 83050 | |
| }, | |
| { | |
| "epoch": 24.20623361491407, | |
| "grad_norm": 0.4099547266960144, | |
| "learning_rate": 0.0003097091227047508, | |
| "loss": 3.1583, | |
| "step": 83100 | |
| }, | |
| { | |
| "epoch": 24.220798135741333, | |
| "grad_norm": 0.40957939624786377, | |
| "learning_rate": 0.00030953424657534243, | |
| "loss": 3.1522, | |
| "step": 83150 | |
| }, | |
| { | |
| "epoch": 24.235362656568597, | |
| "grad_norm": 0.4154437482357025, | |
| "learning_rate": 0.0003093593704459341, | |
| "loss": 3.1502, | |
| "step": 83200 | |
| }, | |
| { | |
| "epoch": 24.249927177395865, | |
| "grad_norm": 0.3992171585559845, | |
| "learning_rate": 0.00030918449431652575, | |
| "loss": 3.1741, | |
| "step": 83250 | |
| }, | |
| { | |
| "epoch": 24.26449169822313, | |
| "grad_norm": 0.3820195198059082, | |
| "learning_rate": 0.0003090096181871174, | |
| "loss": 3.1665, | |
| "step": 83300 | |
| }, | |
| { | |
| "epoch": 24.279056219050393, | |
| "grad_norm": 0.4085935652256012, | |
| "learning_rate": 0.0003088347420577091, | |
| "loss": 3.1679, | |
| "step": 83350 | |
| }, | |
| { | |
| "epoch": 24.293620739877657, | |
| "grad_norm": 0.4251910448074341, | |
| "learning_rate": 0.0003086598659283007, | |
| "loss": 3.1759, | |
| "step": 83400 | |
| }, | |
| { | |
| "epoch": 24.308185260704924, | |
| "grad_norm": 0.4546334445476532, | |
| "learning_rate": 0.00030848498979889246, | |
| "loss": 3.1591, | |
| "step": 83450 | |
| }, | |
| { | |
| "epoch": 24.32274978153219, | |
| "grad_norm": 0.4355502426624298, | |
| "learning_rate": 0.0003083101136694841, | |
| "loss": 3.171, | |
| "step": 83500 | |
| }, | |
| { | |
| "epoch": 24.337314302359452, | |
| "grad_norm": 0.38806089758872986, | |
| "learning_rate": 0.0003081352375400758, | |
| "loss": 3.1722, | |
| "step": 83550 | |
| }, | |
| { | |
| "epoch": 24.351878823186716, | |
| "grad_norm": 0.39891743659973145, | |
| "learning_rate": 0.0003079603614106674, | |
| "loss": 3.1802, | |
| "step": 83600 | |
| }, | |
| { | |
| "epoch": 24.36644334401398, | |
| "grad_norm": 0.3836468458175659, | |
| "learning_rate": 0.00030778548528125906, | |
| "loss": 3.1727, | |
| "step": 83650 | |
| }, | |
| { | |
| "epoch": 24.381007864841248, | |
| "grad_norm": 0.3941711187362671, | |
| "learning_rate": 0.00030761060915185075, | |
| "loss": 3.1839, | |
| "step": 83700 | |
| }, | |
| { | |
| "epoch": 24.395572385668512, | |
| "grad_norm": 0.4084647297859192, | |
| "learning_rate": 0.0003074357330224424, | |
| "loss": 3.1842, | |
| "step": 83750 | |
| }, | |
| { | |
| "epoch": 24.410136906495776, | |
| "grad_norm": 0.44058483839035034, | |
| "learning_rate": 0.0003072608568930341, | |
| "loss": 3.1778, | |
| "step": 83800 | |
| }, | |
| { | |
| "epoch": 24.42470142732304, | |
| "grad_norm": 0.371565580368042, | |
| "learning_rate": 0.0003070859807636257, | |
| "loss": 3.1763, | |
| "step": 83850 | |
| }, | |
| { | |
| "epoch": 24.439265948150307, | |
| "grad_norm": 0.410878449678421, | |
| "learning_rate": 0.00030691110463421735, | |
| "loss": 3.1825, | |
| "step": 83900 | |
| }, | |
| { | |
| "epoch": 24.45383046897757, | |
| "grad_norm": 0.4056607484817505, | |
| "learning_rate": 0.0003067362285048091, | |
| "loss": 3.1818, | |
| "step": 83950 | |
| }, | |
| { | |
| "epoch": 24.468394989804835, | |
| "grad_norm": 0.3960700035095215, | |
| "learning_rate": 0.00030656135237540073, | |
| "loss": 3.1787, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 24.468394989804835, | |
| "eval_accuracy": 0.3739261397973623, | |
| "eval_loss": 3.5454070568084717, | |
| "eval_runtime": 180.4732, | |
| "eval_samples_per_second": 92.208, | |
| "eval_steps_per_second": 5.768, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 24.4829595106321, | |
| "grad_norm": 0.40793466567993164, | |
| "learning_rate": 0.0003063864762459924, | |
| "loss": 3.1835, | |
| "step": 84050 | |
| }, | |
| { | |
| "epoch": 24.497524031459363, | |
| "grad_norm": 0.4240623116493225, | |
| "learning_rate": 0.00030621160011658406, | |
| "loss": 3.1869, | |
| "step": 84100 | |
| }, | |
| { | |
| "epoch": 24.51208855228663, | |
| "grad_norm": 0.38526567816734314, | |
| "learning_rate": 0.0003060367239871757, | |
| "loss": 3.1853, | |
| "step": 84150 | |
| }, | |
| { | |
| "epoch": 24.526653073113895, | |
| "grad_norm": 0.4033205807209015, | |
| "learning_rate": 0.0003058618478577674, | |
| "loss": 3.2023, | |
| "step": 84200 | |
| }, | |
| { | |
| "epoch": 24.54121759394116, | |
| "grad_norm": 0.39986664056777954, | |
| "learning_rate": 0.000305686971728359, | |
| "loss": 3.1882, | |
| "step": 84250 | |
| }, | |
| { | |
| "epoch": 24.555782114768423, | |
| "grad_norm": 0.42409780621528625, | |
| "learning_rate": 0.0003055120955989507, | |
| "loss": 3.1849, | |
| "step": 84300 | |
| }, | |
| { | |
| "epoch": 24.57034663559569, | |
| "grad_norm": 0.40158599615097046, | |
| "learning_rate": 0.00030533721946954235, | |
| "loss": 3.1887, | |
| "step": 84350 | |
| }, | |
| { | |
| "epoch": 24.584911156422955, | |
| "grad_norm": 0.3864559531211853, | |
| "learning_rate": 0.0003051623433401341, | |
| "loss": 3.195, | |
| "step": 84400 | |
| }, | |
| { | |
| "epoch": 24.59947567725022, | |
| "grad_norm": 0.39239874482154846, | |
| "learning_rate": 0.00030498746721072573, | |
| "loss": 3.1957, | |
| "step": 84450 | |
| }, | |
| { | |
| "epoch": 24.614040198077483, | |
| "grad_norm": 0.41269341111183167, | |
| "learning_rate": 0.00030481259108131736, | |
| "loss": 3.2007, | |
| "step": 84500 | |
| }, | |
| { | |
| "epoch": 24.628604718904747, | |
| "grad_norm": 0.430519163608551, | |
| "learning_rate": 0.00030463771495190905, | |
| "loss": 3.1983, | |
| "step": 84550 | |
| }, | |
| { | |
| "epoch": 24.643169239732014, | |
| "grad_norm": 0.4243789613246918, | |
| "learning_rate": 0.0003044628388225007, | |
| "loss": 3.2019, | |
| "step": 84600 | |
| }, | |
| { | |
| "epoch": 24.657733760559278, | |
| "grad_norm": 0.4113166630268097, | |
| "learning_rate": 0.0003042879626930924, | |
| "loss": 3.2001, | |
| "step": 84650 | |
| }, | |
| { | |
| "epoch": 24.672298281386542, | |
| "grad_norm": 0.3855622112751007, | |
| "learning_rate": 0.000304113086563684, | |
| "loss": 3.196, | |
| "step": 84700 | |
| }, | |
| { | |
| "epoch": 24.686862802213806, | |
| "grad_norm": 0.4105675220489502, | |
| "learning_rate": 0.00030393821043427565, | |
| "loss": 3.2068, | |
| "step": 84750 | |
| }, | |
| { | |
| "epoch": 24.701427323041074, | |
| "grad_norm": 0.42943140864372253, | |
| "learning_rate": 0.00030376333430486734, | |
| "loss": 3.1916, | |
| "step": 84800 | |
| }, | |
| { | |
| "epoch": 24.715991843868338, | |
| "grad_norm": 0.39312615990638733, | |
| "learning_rate": 0.000303588458175459, | |
| "loss": 3.1973, | |
| "step": 84850 | |
| }, | |
| { | |
| "epoch": 24.7305563646956, | |
| "grad_norm": 0.4099068343639374, | |
| "learning_rate": 0.0003034135820460507, | |
| "loss": 3.201, | |
| "step": 84900 | |
| }, | |
| { | |
| "epoch": 24.745120885522866, | |
| "grad_norm": 0.4250319302082062, | |
| "learning_rate": 0.00030323870591664236, | |
| "loss": 3.2122, | |
| "step": 84950 | |
| }, | |
| { | |
| "epoch": 24.75968540635013, | |
| "grad_norm": 0.4025106132030487, | |
| "learning_rate": 0.00030306382978723405, | |
| "loss": 3.2001, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 24.75968540635013, | |
| "eval_accuracy": 0.37493806998981954, | |
| "eval_loss": 3.5354461669921875, | |
| "eval_runtime": 180.1336, | |
| "eval_samples_per_second": 92.381, | |
| "eval_steps_per_second": 5.779, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 24.774249927177397, | |
| "grad_norm": 0.4010268449783325, | |
| "learning_rate": 0.0003028889536578257, | |
| "loss": 3.2078, | |
| "step": 85050 | |
| }, | |
| { | |
| "epoch": 24.78881444800466, | |
| "grad_norm": 0.3985311686992645, | |
| "learning_rate": 0.0003027140775284173, | |
| "loss": 3.1985, | |
| "step": 85100 | |
| }, | |
| { | |
| "epoch": 24.803378968831925, | |
| "grad_norm": 0.392164945602417, | |
| "learning_rate": 0.000302539201399009, | |
| "loss": 3.1969, | |
| "step": 85150 | |
| }, | |
| { | |
| "epoch": 24.81794348965919, | |
| "grad_norm": 0.39922991394996643, | |
| "learning_rate": 0.00030236432526960065, | |
| "loss": 3.2053, | |
| "step": 85200 | |
| }, | |
| { | |
| "epoch": 24.832508010486453, | |
| "grad_norm": 0.41001641750335693, | |
| "learning_rate": 0.00030218944914019234, | |
| "loss": 3.214, | |
| "step": 85250 | |
| }, | |
| { | |
| "epoch": 24.84707253131372, | |
| "grad_norm": 0.42852386832237244, | |
| "learning_rate": 0.000302014573010784, | |
| "loss": 3.2108, | |
| "step": 85300 | |
| }, | |
| { | |
| "epoch": 24.861637052140985, | |
| "grad_norm": 0.40917831659317017, | |
| "learning_rate": 0.0003018396968813756, | |
| "loss": 3.2083, | |
| "step": 85350 | |
| }, | |
| { | |
| "epoch": 24.87620157296825, | |
| "grad_norm": 0.3986416161060333, | |
| "learning_rate": 0.00030166482075196736, | |
| "loss": 3.2032, | |
| "step": 85400 | |
| }, | |
| { | |
| "epoch": 24.890766093795513, | |
| "grad_norm": 0.3986012041568756, | |
| "learning_rate": 0.000301489944622559, | |
| "loss": 3.2009, | |
| "step": 85450 | |
| }, | |
| { | |
| "epoch": 24.90533061462278, | |
| "grad_norm": 0.40824413299560547, | |
| "learning_rate": 0.0003013150684931507, | |
| "loss": 3.2016, | |
| "step": 85500 | |
| }, | |
| { | |
| "epoch": 24.919895135450044, | |
| "grad_norm": 0.41101622581481934, | |
| "learning_rate": 0.0003011401923637423, | |
| "loss": 3.2052, | |
| "step": 85550 | |
| }, | |
| { | |
| "epoch": 24.93445965627731, | |
| "grad_norm": 0.3962431848049164, | |
| "learning_rate": 0.000300965316234334, | |
| "loss": 3.2238, | |
| "step": 85600 | |
| }, | |
| { | |
| "epoch": 24.949024177104572, | |
| "grad_norm": 0.4147668480873108, | |
| "learning_rate": 0.00030079044010492565, | |
| "loss": 3.2103, | |
| "step": 85650 | |
| }, | |
| { | |
| "epoch": 24.963588697931836, | |
| "grad_norm": 0.3646432161331177, | |
| "learning_rate": 0.0003006155639755173, | |
| "loss": 3.2083, | |
| "step": 85700 | |
| }, | |
| { | |
| "epoch": 24.978153218759104, | |
| "grad_norm": 0.40107569098472595, | |
| "learning_rate": 0.00030044068784610897, | |
| "loss": 3.2216, | |
| "step": 85750 | |
| }, | |
| { | |
| "epoch": 24.992717739586368, | |
| "grad_norm": 0.428681343793869, | |
| "learning_rate": 0.0003002658117167006, | |
| "loss": 3.2139, | |
| "step": 85800 | |
| }, | |
| { | |
| "epoch": 25.007282260413632, | |
| "grad_norm": 0.40074416995048523, | |
| "learning_rate": 0.00030009093558729235, | |
| "loss": 3.1639, | |
| "step": 85850 | |
| }, | |
| { | |
| "epoch": 25.021846781240896, | |
| "grad_norm": 0.3931594789028168, | |
| "learning_rate": 0.000299916059457884, | |
| "loss": 3.1114, | |
| "step": 85900 | |
| }, | |
| { | |
| "epoch": 25.036411302068164, | |
| "grad_norm": 0.4038519263267517, | |
| "learning_rate": 0.0002997411833284756, | |
| "loss": 3.1225, | |
| "step": 85950 | |
| }, | |
| { | |
| "epoch": 25.050975822895428, | |
| "grad_norm": 0.39107102155685425, | |
| "learning_rate": 0.0002995663071990673, | |
| "loss": 3.1291, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 25.050975822895428, | |
| "eval_accuracy": 0.3742310712313009, | |
| "eval_loss": 3.5472183227539062, | |
| "eval_runtime": 180.135, | |
| "eval_samples_per_second": 92.381, | |
| "eval_steps_per_second": 5.779, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 25.06554034372269, | |
| "grad_norm": 0.4421219527721405, | |
| "learning_rate": 0.00029939143106965895, | |
| "loss": 3.1221, | |
| "step": 86050 | |
| }, | |
| { | |
| "epoch": 25.080104864549956, | |
| "grad_norm": 0.41089800000190735, | |
| "learning_rate": 0.00029921655494025064, | |
| "loss": 3.1253, | |
| "step": 86100 | |
| }, | |
| { | |
| "epoch": 25.09466938537722, | |
| "grad_norm": 0.40636569261550903, | |
| "learning_rate": 0.0002990416788108423, | |
| "loss": 3.1289, | |
| "step": 86150 | |
| }, | |
| { | |
| "epoch": 25.109233906204487, | |
| "grad_norm": 0.4226664900779724, | |
| "learning_rate": 0.00029886680268143397, | |
| "loss": 3.1304, | |
| "step": 86200 | |
| }, | |
| { | |
| "epoch": 25.12379842703175, | |
| "grad_norm": 0.4057096540927887, | |
| "learning_rate": 0.0002986919265520256, | |
| "loss": 3.1321, | |
| "step": 86250 | |
| }, | |
| { | |
| "epoch": 25.138362947859015, | |
| "grad_norm": 0.40901872515678406, | |
| "learning_rate": 0.0002985170504226173, | |
| "loss": 3.1259, | |
| "step": 86300 | |
| }, | |
| { | |
| "epoch": 25.15292746868628, | |
| "grad_norm": 0.4164910912513733, | |
| "learning_rate": 0.00029834217429320893, | |
| "loss": 3.1403, | |
| "step": 86350 | |
| }, | |
| { | |
| "epoch": 25.167491989513547, | |
| "grad_norm": 0.40740352869033813, | |
| "learning_rate": 0.0002981672981638006, | |
| "loss": 3.1476, | |
| "step": 86400 | |
| }, | |
| { | |
| "epoch": 25.18205651034081, | |
| "grad_norm": 0.4132038652896881, | |
| "learning_rate": 0.00029799242203439226, | |
| "loss": 3.1554, | |
| "step": 86450 | |
| }, | |
| { | |
| "epoch": 25.196621031168075, | |
| "grad_norm": 0.4355293810367584, | |
| "learning_rate": 0.00029781754590498395, | |
| "loss": 3.1588, | |
| "step": 86500 | |
| }, | |
| { | |
| "epoch": 25.21118555199534, | |
| "grad_norm": 0.4148186147212982, | |
| "learning_rate": 0.00029764266977557564, | |
| "loss": 3.1372, | |
| "step": 86550 | |
| }, | |
| { | |
| "epoch": 25.225750072822603, | |
| "grad_norm": 0.43595439195632935, | |
| "learning_rate": 0.0002974677936461673, | |
| "loss": 3.1552, | |
| "step": 86600 | |
| }, | |
| { | |
| "epoch": 25.24031459364987, | |
| "grad_norm": 0.45525822043418884, | |
| "learning_rate": 0.0002972929175167589, | |
| "loss": 3.1512, | |
| "step": 86650 | |
| }, | |
| { | |
| "epoch": 25.254879114477134, | |
| "grad_norm": 0.4085412323474884, | |
| "learning_rate": 0.0002971180413873506, | |
| "loss": 3.1533, | |
| "step": 86700 | |
| }, | |
| { | |
| "epoch": 25.2694436353044, | |
| "grad_norm": 0.41310837864875793, | |
| "learning_rate": 0.00029694316525794224, | |
| "loss": 3.1533, | |
| "step": 86750 | |
| }, | |
| { | |
| "epoch": 25.284008156131662, | |
| "grad_norm": 0.3980827331542969, | |
| "learning_rate": 0.00029676828912853393, | |
| "loss": 3.1537, | |
| "step": 86800 | |
| }, | |
| { | |
| "epoch": 25.298572676958926, | |
| "grad_norm": 0.42113929986953735, | |
| "learning_rate": 0.0002965934129991256, | |
| "loss": 3.1535, | |
| "step": 86850 | |
| }, | |
| { | |
| "epoch": 25.313137197786194, | |
| "grad_norm": 0.4222269058227539, | |
| "learning_rate": 0.00029641853686971726, | |
| "loss": 3.1623, | |
| "step": 86900 | |
| }, | |
| { | |
| "epoch": 25.327701718613458, | |
| "grad_norm": 0.4171670079231262, | |
| "learning_rate": 0.0002962436607403089, | |
| "loss": 3.153, | |
| "step": 86950 | |
| }, | |
| { | |
| "epoch": 25.342266239440722, | |
| "grad_norm": 0.4400247633457184, | |
| "learning_rate": 0.0002960687846109006, | |
| "loss": 3.1614, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 25.342266239440722, | |
| "eval_accuracy": 0.37419508626454995, | |
| "eval_loss": 3.54506778717041, | |
| "eval_runtime": 179.9944, | |
| "eval_samples_per_second": 92.453, | |
| "eval_steps_per_second": 5.784, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 25.356830760267986, | |
| "grad_norm": 0.4598556160926819, | |
| "learning_rate": 0.00029589390848149227, | |
| "loss": 3.1667, | |
| "step": 87050 | |
| }, | |
| { | |
| "epoch": 25.371395281095253, | |
| "grad_norm": 0.41732457280158997, | |
| "learning_rate": 0.0002957190323520839, | |
| "loss": 3.1687, | |
| "step": 87100 | |
| }, | |
| { | |
| "epoch": 25.385959801922517, | |
| "grad_norm": 0.3933047950267792, | |
| "learning_rate": 0.0002955441562226756, | |
| "loss": 3.1493, | |
| "step": 87150 | |
| }, | |
| { | |
| "epoch": 25.40052432274978, | |
| "grad_norm": 0.3980228900909424, | |
| "learning_rate": 0.00029536928009326723, | |
| "loss": 3.1678, | |
| "step": 87200 | |
| }, | |
| { | |
| "epoch": 25.415088843577045, | |
| "grad_norm": 0.42772722244262695, | |
| "learning_rate": 0.00029519440396385887, | |
| "loss": 3.1628, | |
| "step": 87250 | |
| }, | |
| { | |
| "epoch": 25.42965336440431, | |
| "grad_norm": 0.41940030455589294, | |
| "learning_rate": 0.00029501952783445056, | |
| "loss": 3.1814, | |
| "step": 87300 | |
| }, | |
| { | |
| "epoch": 25.444217885231577, | |
| "grad_norm": 0.42683565616607666, | |
| "learning_rate": 0.00029484465170504225, | |
| "loss": 3.1786, | |
| "step": 87350 | |
| }, | |
| { | |
| "epoch": 25.45878240605884, | |
| "grad_norm": 0.40677493810653687, | |
| "learning_rate": 0.0002946697755756339, | |
| "loss": 3.1701, | |
| "step": 87400 | |
| }, | |
| { | |
| "epoch": 25.473346926886105, | |
| "grad_norm": 0.4015192687511444, | |
| "learning_rate": 0.0002944948994462256, | |
| "loss": 3.1792, | |
| "step": 87450 | |
| }, | |
| { | |
| "epoch": 25.48791144771337, | |
| "grad_norm": 0.3968818783760071, | |
| "learning_rate": 0.00029432002331681727, | |
| "loss": 3.1833, | |
| "step": 87500 | |
| }, | |
| { | |
| "epoch": 25.502475968540637, | |
| "grad_norm": 0.41406768560409546, | |
| "learning_rate": 0.0002941451471874089, | |
| "loss": 3.1799, | |
| "step": 87550 | |
| }, | |
| { | |
| "epoch": 25.5170404893679, | |
| "grad_norm": 0.4274561405181885, | |
| "learning_rate": 0.00029397027105800054, | |
| "loss": 3.1923, | |
| "step": 87600 | |
| }, | |
| { | |
| "epoch": 25.531605010195165, | |
| "grad_norm": 0.38853442668914795, | |
| "learning_rate": 0.00029379539492859223, | |
| "loss": 3.1808, | |
| "step": 87650 | |
| }, | |
| { | |
| "epoch": 25.54616953102243, | |
| "grad_norm": 0.3968064486980438, | |
| "learning_rate": 0.00029362051879918387, | |
| "loss": 3.1786, | |
| "step": 87700 | |
| }, | |
| { | |
| "epoch": 25.560734051849693, | |
| "grad_norm": 0.44367527961730957, | |
| "learning_rate": 0.00029344564266977556, | |
| "loss": 3.185, | |
| "step": 87750 | |
| }, | |
| { | |
| "epoch": 25.57529857267696, | |
| "grad_norm": 0.4026988744735718, | |
| "learning_rate": 0.00029327076654036725, | |
| "loss": 3.1778, | |
| "step": 87800 | |
| }, | |
| { | |
| "epoch": 25.589863093504224, | |
| "grad_norm": 0.39344534277915955, | |
| "learning_rate": 0.0002930958904109589, | |
| "loss": 3.1752, | |
| "step": 87850 | |
| }, | |
| { | |
| "epoch": 25.604427614331488, | |
| "grad_norm": 0.39949455857276917, | |
| "learning_rate": 0.0002929210142815505, | |
| "loss": 3.1927, | |
| "step": 87900 | |
| }, | |
| { | |
| "epoch": 25.618992135158752, | |
| "grad_norm": 0.4150342643260956, | |
| "learning_rate": 0.0002927461381521422, | |
| "loss": 3.1774, | |
| "step": 87950 | |
| }, | |
| { | |
| "epoch": 25.63355665598602, | |
| "grad_norm": 0.3983399271965027, | |
| "learning_rate": 0.0002925712620227339, | |
| "loss": 3.1755, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 25.63355665598602, | |
| "eval_accuracy": 0.3747817823401071, | |
| "eval_loss": 3.536299228668213, | |
| "eval_runtime": 179.9386, | |
| "eval_samples_per_second": 92.482, | |
| "eval_steps_per_second": 5.785, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 25.648121176813284, | |
| "grad_norm": 0.4258618950843811, | |
| "learning_rate": 0.00029239638589332554, | |
| "loss": 3.1961, | |
| "step": 88050 | |
| }, | |
| { | |
| "epoch": 25.662685697640548, | |
| "grad_norm": 0.41384416818618774, | |
| "learning_rate": 0.0002922215097639172, | |
| "loss": 3.1849, | |
| "step": 88100 | |
| }, | |
| { | |
| "epoch": 25.67725021846781, | |
| "grad_norm": 0.4001385569572449, | |
| "learning_rate": 0.00029204663363450886, | |
| "loss": 3.1824, | |
| "step": 88150 | |
| }, | |
| { | |
| "epoch": 25.691814739295076, | |
| "grad_norm": 0.39791566133499146, | |
| "learning_rate": 0.0002918717575051005, | |
| "loss": 3.199, | |
| "step": 88200 | |
| }, | |
| { | |
| "epoch": 25.706379260122343, | |
| "grad_norm": 0.4087889492511749, | |
| "learning_rate": 0.0002916968813756922, | |
| "loss": 3.1977, | |
| "step": 88250 | |
| }, | |
| { | |
| "epoch": 25.720943780949607, | |
| "grad_norm": 0.40469250082969666, | |
| "learning_rate": 0.0002915220052462839, | |
| "loss": 3.1841, | |
| "step": 88300 | |
| }, | |
| { | |
| "epoch": 25.73550830177687, | |
| "grad_norm": 0.40084928274154663, | |
| "learning_rate": 0.0002913471291168755, | |
| "loss": 3.1995, | |
| "step": 88350 | |
| }, | |
| { | |
| "epoch": 25.750072822604135, | |
| "grad_norm": 0.4317362904548645, | |
| "learning_rate": 0.00029117225298746715, | |
| "loss": 3.1922, | |
| "step": 88400 | |
| }, | |
| { | |
| "epoch": 25.764637343431403, | |
| "grad_norm": 0.3823014497756958, | |
| "learning_rate": 0.00029099737685805884, | |
| "loss": 3.1865, | |
| "step": 88450 | |
| }, | |
| { | |
| "epoch": 25.779201864258667, | |
| "grad_norm": 0.37905919551849365, | |
| "learning_rate": 0.00029082250072865053, | |
| "loss": 3.1759, | |
| "step": 88500 | |
| }, | |
| { | |
| "epoch": 25.79376638508593, | |
| "grad_norm": 0.41488322615623474, | |
| "learning_rate": 0.00029064762459924217, | |
| "loss": 3.1959, | |
| "step": 88550 | |
| }, | |
| { | |
| "epoch": 25.808330905913195, | |
| "grad_norm": 0.4414539039134979, | |
| "learning_rate": 0.00029047274846983386, | |
| "loss": 3.1907, | |
| "step": 88600 | |
| }, | |
| { | |
| "epoch": 25.82289542674046, | |
| "grad_norm": 0.4049603044986725, | |
| "learning_rate": 0.0002902978723404255, | |
| "loss": 3.185, | |
| "step": 88650 | |
| }, | |
| { | |
| "epoch": 25.837459947567726, | |
| "grad_norm": 0.41079235076904297, | |
| "learning_rate": 0.00029012299621101713, | |
| "loss": 3.1952, | |
| "step": 88700 | |
| }, | |
| { | |
| "epoch": 25.85202446839499, | |
| "grad_norm": 0.4270045757293701, | |
| "learning_rate": 0.0002899481200816088, | |
| "loss": 3.2003, | |
| "step": 88750 | |
| }, | |
| { | |
| "epoch": 25.866588989222254, | |
| "grad_norm": 0.3901161551475525, | |
| "learning_rate": 0.0002897732439522005, | |
| "loss": 3.1942, | |
| "step": 88800 | |
| }, | |
| { | |
| "epoch": 25.88115351004952, | |
| "grad_norm": 0.4142347276210785, | |
| "learning_rate": 0.00028959836782279215, | |
| "loss": 3.2045, | |
| "step": 88850 | |
| }, | |
| { | |
| "epoch": 25.895718030876782, | |
| "grad_norm": 0.40068092942237854, | |
| "learning_rate": 0.00028942349169338384, | |
| "loss": 3.1998, | |
| "step": 88900 | |
| }, | |
| { | |
| "epoch": 25.91028255170405, | |
| "grad_norm": 0.3928789794445038, | |
| "learning_rate": 0.00028924861556397553, | |
| "loss": 3.2075, | |
| "step": 88950 | |
| }, | |
| { | |
| "epoch": 25.924847072531314, | |
| "grad_norm": 0.41609933972358704, | |
| "learning_rate": 0.00028907373943456717, | |
| "loss": 3.2004, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 25.924847072531314, | |
| "eval_accuracy": 0.3750450841066279, | |
| "eval_loss": 3.532590866088867, | |
| "eval_runtime": 180.0164, | |
| "eval_samples_per_second": 92.442, | |
| "eval_steps_per_second": 5.783, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 25.939411593358578, | |
| "grad_norm": 0.4000127613544464, | |
| "learning_rate": 0.0002888988633051588, | |
| "loss": 3.2011, | |
| "step": 89050 | |
| }, | |
| { | |
| "epoch": 25.953976114185842, | |
| "grad_norm": 0.41976398229599, | |
| "learning_rate": 0.0002887239871757505, | |
| "loss": 3.2051, | |
| "step": 89100 | |
| }, | |
| { | |
| "epoch": 25.96854063501311, | |
| "grad_norm": 0.4085160791873932, | |
| "learning_rate": 0.00028854911104634213, | |
| "loss": 3.2028, | |
| "step": 89150 | |
| }, | |
| { | |
| "epoch": 25.983105155840374, | |
| "grad_norm": 0.4441681206226349, | |
| "learning_rate": 0.0002883742349169338, | |
| "loss": 3.2012, | |
| "step": 89200 | |
| }, | |
| { | |
| "epoch": 25.997669676667638, | |
| "grad_norm": 0.40044912695884705, | |
| "learning_rate": 0.0002881993587875255, | |
| "loss": 3.2076, | |
| "step": 89250 | |
| }, | |
| { | |
| "epoch": 26.0122341974949, | |
| "grad_norm": 0.39951109886169434, | |
| "learning_rate": 0.00028802448265811715, | |
| "loss": 3.1123, | |
| "step": 89300 | |
| }, | |
| { | |
| "epoch": 26.026798718322166, | |
| "grad_norm": 0.43108323216438293, | |
| "learning_rate": 0.0002878496065287088, | |
| "loss": 3.0933, | |
| "step": 89350 | |
| }, | |
| { | |
| "epoch": 26.041363239149433, | |
| "grad_norm": 0.4239446818828583, | |
| "learning_rate": 0.0002876747303993005, | |
| "loss": 3.1119, | |
| "step": 89400 | |
| }, | |
| { | |
| "epoch": 26.055927759976697, | |
| "grad_norm": 0.4508779048919678, | |
| "learning_rate": 0.00028749985426989216, | |
| "loss": 3.1111, | |
| "step": 89450 | |
| }, | |
| { | |
| "epoch": 26.07049228080396, | |
| "grad_norm": 0.4007803201675415, | |
| "learning_rate": 0.0002873249781404838, | |
| "loss": 3.119, | |
| "step": 89500 | |
| }, | |
| { | |
| "epoch": 26.085056801631225, | |
| "grad_norm": 0.4217778444290161, | |
| "learning_rate": 0.0002871501020110755, | |
| "loss": 3.1271, | |
| "step": 89550 | |
| }, | |
| { | |
| "epoch": 26.099621322458493, | |
| "grad_norm": 0.426954448223114, | |
| "learning_rate": 0.0002869752258816671, | |
| "loss": 3.1251, | |
| "step": 89600 | |
| }, | |
| { | |
| "epoch": 26.114185843285757, | |
| "grad_norm": 0.40633267164230347, | |
| "learning_rate": 0.00028680034975225876, | |
| "loss": 3.1286, | |
| "step": 89650 | |
| }, | |
| { | |
| "epoch": 26.12875036411302, | |
| "grad_norm": 0.4173829257488251, | |
| "learning_rate": 0.00028662547362285045, | |
| "loss": 3.1363, | |
| "step": 89700 | |
| }, | |
| { | |
| "epoch": 26.143314884940285, | |
| "grad_norm": 0.39154112339019775, | |
| "learning_rate": 0.00028645059749344214, | |
| "loss": 3.1298, | |
| "step": 89750 | |
| }, | |
| { | |
| "epoch": 26.15787940576755, | |
| "grad_norm": 0.4263724088668823, | |
| "learning_rate": 0.0002862757213640338, | |
| "loss": 3.1379, | |
| "step": 89800 | |
| }, | |
| { | |
| "epoch": 26.172443926594816, | |
| "grad_norm": 0.4325745105743408, | |
| "learning_rate": 0.00028610084523462547, | |
| "loss": 3.1325, | |
| "step": 89850 | |
| }, | |
| { | |
| "epoch": 26.18700844742208, | |
| "grad_norm": 0.4075644612312317, | |
| "learning_rate": 0.0002859259691052171, | |
| "loss": 3.1412, | |
| "step": 89900 | |
| }, | |
| { | |
| "epoch": 26.201572968249344, | |
| "grad_norm": 0.4166417717933655, | |
| "learning_rate": 0.0002857510929758088, | |
| "loss": 3.1338, | |
| "step": 89950 | |
| }, | |
| { | |
| "epoch": 26.21613748907661, | |
| "grad_norm": 0.42380568385124207, | |
| "learning_rate": 0.00028557621684640043, | |
| "loss": 3.1347, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 26.21613748907661, | |
| "eval_accuracy": 0.37397917646404427, | |
| "eval_loss": 3.5486440658569336, | |
| "eval_runtime": 180.117, | |
| "eval_samples_per_second": 92.39, | |
| "eval_steps_per_second": 5.78, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 26.230702009903876, | |
| "grad_norm": 0.38530007004737854, | |
| "learning_rate": 0.0002854013407169921, | |
| "loss": 3.1367, | |
| "step": 90050 | |
| }, | |
| { | |
| "epoch": 26.24526653073114, | |
| "grad_norm": 0.41927337646484375, | |
| "learning_rate": 0.00028522646458758376, | |
| "loss": 3.1331, | |
| "step": 90100 | |
| }, | |
| { | |
| "epoch": 26.259831051558404, | |
| "grad_norm": 0.41560065746307373, | |
| "learning_rate": 0.00028505158845817545, | |
| "loss": 3.1429, | |
| "step": 90150 | |
| }, | |
| { | |
| "epoch": 26.274395572385668, | |
| "grad_norm": 0.41110533475875854, | |
| "learning_rate": 0.0002848767123287671, | |
| "loss": 3.1418, | |
| "step": 90200 | |
| }, | |
| { | |
| "epoch": 26.288960093212932, | |
| "grad_norm": 0.42608729004859924, | |
| "learning_rate": 0.0002847018361993588, | |
| "loss": 3.1524, | |
| "step": 90250 | |
| }, | |
| { | |
| "epoch": 26.3035246140402, | |
| "grad_norm": 0.46872904896736145, | |
| "learning_rate": 0.0002845269600699504, | |
| "loss": 3.1366, | |
| "step": 90300 | |
| }, | |
| { | |
| "epoch": 26.318089134867463, | |
| "grad_norm": 0.42742592096328735, | |
| "learning_rate": 0.0002843520839405421, | |
| "loss": 3.149, | |
| "step": 90350 | |
| }, | |
| { | |
| "epoch": 26.332653655694727, | |
| "grad_norm": 0.42787429690361023, | |
| "learning_rate": 0.0002841772078111338, | |
| "loss": 3.1612, | |
| "step": 90400 | |
| }, | |
| { | |
| "epoch": 26.34721817652199, | |
| "grad_norm": 0.4321795701980591, | |
| "learning_rate": 0.00028400233168172543, | |
| "loss": 3.1546, | |
| "step": 90450 | |
| }, | |
| { | |
| "epoch": 26.361782697349255, | |
| "grad_norm": 0.4483564496040344, | |
| "learning_rate": 0.00028382745555231707, | |
| "loss": 3.1611, | |
| "step": 90500 | |
| }, | |
| { | |
| "epoch": 26.376347218176523, | |
| "grad_norm": 0.41695746779441833, | |
| "learning_rate": 0.00028365257942290876, | |
| "loss": 3.162, | |
| "step": 90550 | |
| }, | |
| { | |
| "epoch": 26.390911739003787, | |
| "grad_norm": 0.4270467162132263, | |
| "learning_rate": 0.0002834777032935004, | |
| "loss": 3.1594, | |
| "step": 90600 | |
| }, | |
| { | |
| "epoch": 26.40547625983105, | |
| "grad_norm": 0.4106120467185974, | |
| "learning_rate": 0.0002833028271640921, | |
| "loss": 3.1573, | |
| "step": 90650 | |
| }, | |
| { | |
| "epoch": 26.420040780658315, | |
| "grad_norm": 0.40699175000190735, | |
| "learning_rate": 0.00028312795103468377, | |
| "loss": 3.1656, | |
| "step": 90700 | |
| }, | |
| { | |
| "epoch": 26.434605301485583, | |
| "grad_norm": 0.4184217154979706, | |
| "learning_rate": 0.0002829530749052754, | |
| "loss": 3.1588, | |
| "step": 90750 | |
| }, | |
| { | |
| "epoch": 26.449169822312847, | |
| "grad_norm": 0.4090009927749634, | |
| "learning_rate": 0.00028277819877586705, | |
| "loss": 3.1679, | |
| "step": 90800 | |
| }, | |
| { | |
| "epoch": 26.46373434314011, | |
| "grad_norm": 0.4072172939777374, | |
| "learning_rate": 0.00028260332264645874, | |
| "loss": 3.1526, | |
| "step": 90850 | |
| }, | |
| { | |
| "epoch": 26.478298863967375, | |
| "grad_norm": 0.4247322380542755, | |
| "learning_rate": 0.0002824284465170504, | |
| "loss": 3.1634, | |
| "step": 90900 | |
| }, | |
| { | |
| "epoch": 26.49286338479464, | |
| "grad_norm": 0.40537166595458984, | |
| "learning_rate": 0.00028225357038764206, | |
| "loss": 3.1682, | |
| "step": 90950 | |
| }, | |
| { | |
| "epoch": 26.507427905621906, | |
| "grad_norm": 0.3876939117908478, | |
| "learning_rate": 0.00028207869425823375, | |
| "loss": 3.1675, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 26.507427905621906, | |
| "eval_accuracy": 0.3747385063016615, | |
| "eval_loss": 3.5393879413604736, | |
| "eval_runtime": 180.0198, | |
| "eval_samples_per_second": 92.44, | |
| "eval_steps_per_second": 5.783, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 26.52199242644917, | |
| "grad_norm": 0.4608912765979767, | |
| "learning_rate": 0.0002819038181288254, | |
| "loss": 3.1717, | |
| "step": 91050 | |
| }, | |
| { | |
| "epoch": 26.536556947276434, | |
| "grad_norm": 0.3875090777873993, | |
| "learning_rate": 0.000281728941999417, | |
| "loss": 3.1683, | |
| "step": 91100 | |
| }, | |
| { | |
| "epoch": 26.551121468103698, | |
| "grad_norm": 0.3892468512058258, | |
| "learning_rate": 0.0002815540658700087, | |
| "loss": 3.1792, | |
| "step": 91150 | |
| }, | |
| { | |
| "epoch": 26.565685988930966, | |
| "grad_norm": 0.43506941199302673, | |
| "learning_rate": 0.0002813791897406004, | |
| "loss": 3.1765, | |
| "step": 91200 | |
| }, | |
| { | |
| "epoch": 26.58025050975823, | |
| "grad_norm": 0.44329991936683655, | |
| "learning_rate": 0.00028120431361119204, | |
| "loss": 3.1647, | |
| "step": 91250 | |
| }, | |
| { | |
| "epoch": 26.594815030585494, | |
| "grad_norm": 0.4021358788013458, | |
| "learning_rate": 0.00028102943748178373, | |
| "loss": 3.1723, | |
| "step": 91300 | |
| }, | |
| { | |
| "epoch": 26.609379551412758, | |
| "grad_norm": 0.4087882936000824, | |
| "learning_rate": 0.00028085456135237537, | |
| "loss": 3.1593, | |
| "step": 91350 | |
| }, | |
| { | |
| "epoch": 26.623944072240022, | |
| "grad_norm": 0.40286004543304443, | |
| "learning_rate": 0.00028067968522296706, | |
| "loss": 3.173, | |
| "step": 91400 | |
| }, | |
| { | |
| "epoch": 26.63850859306729, | |
| "grad_norm": 0.39587125182151794, | |
| "learning_rate": 0.0002805048090935587, | |
| "loss": 3.1768, | |
| "step": 91450 | |
| }, | |
| { | |
| "epoch": 26.653073113894553, | |
| "grad_norm": 0.43658262491226196, | |
| "learning_rate": 0.0002803299329641504, | |
| "loss": 3.1648, | |
| "step": 91500 | |
| }, | |
| { | |
| "epoch": 26.667637634721817, | |
| "grad_norm": 0.4274442493915558, | |
| "learning_rate": 0.000280155056834742, | |
| "loss": 3.1822, | |
| "step": 91550 | |
| }, | |
| { | |
| "epoch": 26.68220215554908, | |
| "grad_norm": 0.42451024055480957, | |
| "learning_rate": 0.0002799801807053337, | |
| "loss": 3.1862, | |
| "step": 91600 | |
| }, | |
| { | |
| "epoch": 26.69676667637635, | |
| "grad_norm": 0.44854724407196045, | |
| "learning_rate": 0.00027980530457592535, | |
| "loss": 3.1736, | |
| "step": 91650 | |
| }, | |
| { | |
| "epoch": 26.711331197203613, | |
| "grad_norm": 0.4080348312854767, | |
| "learning_rate": 0.00027963042844651704, | |
| "loss": 3.1805, | |
| "step": 91700 | |
| }, | |
| { | |
| "epoch": 26.725895718030877, | |
| "grad_norm": 0.43170300126075745, | |
| "learning_rate": 0.0002794555523171087, | |
| "loss": 3.1766, | |
| "step": 91750 | |
| }, | |
| { | |
| "epoch": 26.74046023885814, | |
| "grad_norm": 0.4210398197174072, | |
| "learning_rate": 0.00027928067618770037, | |
| "loss": 3.1802, | |
| "step": 91800 | |
| }, | |
| { | |
| "epoch": 26.755024759685405, | |
| "grad_norm": 0.4239577651023865, | |
| "learning_rate": 0.00027910580005829206, | |
| "loss": 3.1812, | |
| "step": 91850 | |
| }, | |
| { | |
| "epoch": 26.769589280512673, | |
| "grad_norm": 0.4054132103919983, | |
| "learning_rate": 0.0002789309239288837, | |
| "loss": 3.1792, | |
| "step": 91900 | |
| }, | |
| { | |
| "epoch": 26.784153801339937, | |
| "grad_norm": 0.4064629375934601, | |
| "learning_rate": 0.00027875604779947533, | |
| "loss": 3.1861, | |
| "step": 91950 | |
| }, | |
| { | |
| "epoch": 26.7987183221672, | |
| "grad_norm": 0.40748798847198486, | |
| "learning_rate": 0.000278581171670067, | |
| "loss": 3.192, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 26.7987183221672, | |
| "eval_accuracy": 0.3752517036706195, | |
| "eval_loss": 3.530374050140381, | |
| "eval_runtime": 180.0451, | |
| "eval_samples_per_second": 92.427, | |
| "eval_steps_per_second": 5.782, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 26.813282842994465, | |
| "grad_norm": 0.4335620403289795, | |
| "learning_rate": 0.00027840629554065865, | |
| "loss": 3.1725, | |
| "step": 92050 | |
| }, | |
| { | |
| "epoch": 26.827847363821732, | |
| "grad_norm": 0.41802123188972473, | |
| "learning_rate": 0.00027823141941125034, | |
| "loss": 3.1948, | |
| "step": 92100 | |
| }, | |
| { | |
| "epoch": 26.842411884648996, | |
| "grad_norm": 0.41359513998031616, | |
| "learning_rate": 0.00027805654328184204, | |
| "loss": 3.1822, | |
| "step": 92150 | |
| }, | |
| { | |
| "epoch": 26.85697640547626, | |
| "grad_norm": 0.3970206081867218, | |
| "learning_rate": 0.00027788166715243367, | |
| "loss": 3.2007, | |
| "step": 92200 | |
| }, | |
| { | |
| "epoch": 26.871540926303524, | |
| "grad_norm": 0.4091810882091522, | |
| "learning_rate": 0.0002777067910230253, | |
| "loss": 3.192, | |
| "step": 92250 | |
| }, | |
| { | |
| "epoch": 26.886105447130788, | |
| "grad_norm": 0.4705309271812439, | |
| "learning_rate": 0.000277531914893617, | |
| "loss": 3.2025, | |
| "step": 92300 | |
| }, | |
| { | |
| "epoch": 26.900669967958056, | |
| "grad_norm": 0.447348952293396, | |
| "learning_rate": 0.00027735703876420863, | |
| "loss": 3.1939, | |
| "step": 92350 | |
| }, | |
| { | |
| "epoch": 26.91523448878532, | |
| "grad_norm": 0.43095237016677856, | |
| "learning_rate": 0.0002771821626348003, | |
| "loss": 3.1773, | |
| "step": 92400 | |
| }, | |
| { | |
| "epoch": 26.929799009612584, | |
| "grad_norm": 0.4291156232357025, | |
| "learning_rate": 0.000277007286505392, | |
| "loss": 3.1828, | |
| "step": 92450 | |
| }, | |
| { | |
| "epoch": 26.944363530439848, | |
| "grad_norm": 0.4368513524532318, | |
| "learning_rate": 0.00027683241037598365, | |
| "loss": 3.2057, | |
| "step": 92500 | |
| }, | |
| { | |
| "epoch": 26.95892805126711, | |
| "grad_norm": 0.3827671408653259, | |
| "learning_rate": 0.0002766575342465753, | |
| "loss": 3.2017, | |
| "step": 92550 | |
| }, | |
| { | |
| "epoch": 26.97349257209438, | |
| "grad_norm": 0.4416309893131256, | |
| "learning_rate": 0.000276482658117167, | |
| "loss": 3.1994, | |
| "step": 92600 | |
| }, | |
| { | |
| "epoch": 26.988057092921643, | |
| "grad_norm": 0.41274315118789673, | |
| "learning_rate": 0.00027630778198775867, | |
| "loss": 3.1861, | |
| "step": 92650 | |
| }, | |
| { | |
| "epoch": 27.002621613748907, | |
| "grad_norm": 0.4293462336063385, | |
| "learning_rate": 0.0002761329058583503, | |
| "loss": 3.1783, | |
| "step": 92700 | |
| }, | |
| { | |
| "epoch": 27.01718613457617, | |
| "grad_norm": 0.4199707806110382, | |
| "learning_rate": 0.000275958029728942, | |
| "loss": 3.0981, | |
| "step": 92750 | |
| }, | |
| { | |
| "epoch": 27.03175065540344, | |
| "grad_norm": 0.4206013083457947, | |
| "learning_rate": 0.00027578315359953363, | |
| "loss": 3.0893, | |
| "step": 92800 | |
| }, | |
| { | |
| "epoch": 27.046315176230703, | |
| "grad_norm": 0.38021305203437805, | |
| "learning_rate": 0.00027560827747012527, | |
| "loss": 3.1103, | |
| "step": 92850 | |
| }, | |
| { | |
| "epoch": 27.060879697057967, | |
| "grad_norm": 0.38645049929618835, | |
| "learning_rate": 0.00027543340134071696, | |
| "loss": 3.107, | |
| "step": 92900 | |
| }, | |
| { | |
| "epoch": 27.07544421788523, | |
| "grad_norm": 0.40561676025390625, | |
| "learning_rate": 0.00027525852521130865, | |
| "loss": 3.1084, | |
| "step": 92950 | |
| }, | |
| { | |
| "epoch": 27.090008738712495, | |
| "grad_norm": 0.398436576128006, | |
| "learning_rate": 0.0002750836490819003, | |
| "loss": 3.1155, | |
| "step": 93000 | |
| }, | |
| { | |
| "epoch": 27.090008738712495, | |
| "eval_accuracy": 0.3744006474471665, | |
| "eval_loss": 3.5486044883728027, | |
| "eval_runtime": 180.0468, | |
| "eval_samples_per_second": 92.426, | |
| "eval_steps_per_second": 5.782, | |
| "step": 93000 | |
| }, | |
| { | |
| "epoch": 27.104573259539762, | |
| "grad_norm": 0.43171611428260803, | |
| "learning_rate": 0.000274908772952492, | |
| "loss": 3.1036, | |
| "step": 93050 | |
| }, | |
| { | |
| "epoch": 27.119137780367026, | |
| "grad_norm": 0.42062097787857056, | |
| "learning_rate": 0.0002747338968230836, | |
| "loss": 3.1142, | |
| "step": 93100 | |
| }, | |
| { | |
| "epoch": 27.13370230119429, | |
| "grad_norm": 0.4473419189453125, | |
| "learning_rate": 0.0002745590206936753, | |
| "loss": 3.1121, | |
| "step": 93150 | |
| }, | |
| { | |
| "epoch": 27.148266822021554, | |
| "grad_norm": 0.401583194732666, | |
| "learning_rate": 0.00027438414456426694, | |
| "loss": 3.1183, | |
| "step": 93200 | |
| }, | |
| { | |
| "epoch": 27.162831342848822, | |
| "grad_norm": 0.43579012155532837, | |
| "learning_rate": 0.00027420926843485863, | |
| "loss": 3.1282, | |
| "step": 93250 | |
| }, | |
| { | |
| "epoch": 27.177395863676086, | |
| "grad_norm": 0.4297228753566742, | |
| "learning_rate": 0.00027403439230545026, | |
| "loss": 3.1189, | |
| "step": 93300 | |
| }, | |
| { | |
| "epoch": 27.19196038450335, | |
| "grad_norm": 0.4076518416404724, | |
| "learning_rate": 0.00027385951617604195, | |
| "loss": 3.1232, | |
| "step": 93350 | |
| }, | |
| { | |
| "epoch": 27.206524905330614, | |
| "grad_norm": 0.4205459952354431, | |
| "learning_rate": 0.0002736846400466336, | |
| "loss": 3.1296, | |
| "step": 93400 | |
| }, | |
| { | |
| "epoch": 27.221089426157878, | |
| "grad_norm": 0.39670172333717346, | |
| "learning_rate": 0.0002735097639172253, | |
| "loss": 3.1249, | |
| "step": 93450 | |
| }, | |
| { | |
| "epoch": 27.235653946985146, | |
| "grad_norm": 0.4148896336555481, | |
| "learning_rate": 0.0002733348877878169, | |
| "loss": 3.1185, | |
| "step": 93500 | |
| }, | |
| { | |
| "epoch": 27.25021846781241, | |
| "grad_norm": 0.4132000803947449, | |
| "learning_rate": 0.0002731600116584086, | |
| "loss": 3.1365, | |
| "step": 93550 | |
| }, | |
| { | |
| "epoch": 27.264782988639674, | |
| "grad_norm": 0.4182775020599365, | |
| "learning_rate": 0.0002729851355290003, | |
| "loss": 3.1306, | |
| "step": 93600 | |
| }, | |
| { | |
| "epoch": 27.279347509466938, | |
| "grad_norm": 0.43432140350341797, | |
| "learning_rate": 0.00027281025939959193, | |
| "loss": 3.1452, | |
| "step": 93650 | |
| }, | |
| { | |
| "epoch": 27.2939120302942, | |
| "grad_norm": 0.4565012454986572, | |
| "learning_rate": 0.00027263538327018357, | |
| "loss": 3.1386, | |
| "step": 93700 | |
| }, | |
| { | |
| "epoch": 27.30847655112147, | |
| "grad_norm": 0.445963978767395, | |
| "learning_rate": 0.00027246050714077526, | |
| "loss": 3.1269, | |
| "step": 93750 | |
| }, | |
| { | |
| "epoch": 27.323041071948733, | |
| "grad_norm": 0.4095284044742584, | |
| "learning_rate": 0.0002722856310113669, | |
| "loss": 3.1549, | |
| "step": 93800 | |
| }, | |
| { | |
| "epoch": 27.337605592775997, | |
| "grad_norm": 0.4368303716182709, | |
| "learning_rate": 0.0002721107548819586, | |
| "loss": 3.151, | |
| "step": 93850 | |
| }, | |
| { | |
| "epoch": 27.35217011360326, | |
| "grad_norm": 0.46256592869758606, | |
| "learning_rate": 0.0002719358787525503, | |
| "loss": 3.1448, | |
| "step": 93900 | |
| }, | |
| { | |
| "epoch": 27.36673463443053, | |
| "grad_norm": 0.412517249584198, | |
| "learning_rate": 0.0002717610026231419, | |
| "loss": 3.1458, | |
| "step": 93950 | |
| }, | |
| { | |
| "epoch": 27.381299155257793, | |
| "grad_norm": 0.417863667011261, | |
| "learning_rate": 0.00027158612649373355, | |
| "loss": 3.1508, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 27.381299155257793, | |
| "eval_accuracy": 0.37466524279092345, | |
| "eval_loss": 3.546901226043701, | |
| "eval_runtime": 180.1597, | |
| "eval_samples_per_second": 92.368, | |
| "eval_steps_per_second": 5.778, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 27.395863676085057, | |
| "grad_norm": 0.49049368500709534, | |
| "learning_rate": 0.00027141125036432524, | |
| "loss": 3.1541, | |
| "step": 94050 | |
| }, | |
| { | |
| "epoch": 27.41042819691232, | |
| "grad_norm": 0.4306406080722809, | |
| "learning_rate": 0.00027123637423491693, | |
| "loss": 3.1577, | |
| "step": 94100 | |
| }, | |
| { | |
| "epoch": 27.424992717739585, | |
| "grad_norm": 0.40683841705322266, | |
| "learning_rate": 0.00027106149810550857, | |
| "loss": 3.1543, | |
| "step": 94150 | |
| }, | |
| { | |
| "epoch": 27.439557238566852, | |
| "grad_norm": 0.4452398717403412, | |
| "learning_rate": 0.00027088662197610026, | |
| "loss": 3.1457, | |
| "step": 94200 | |
| }, | |
| { | |
| "epoch": 27.454121759394116, | |
| "grad_norm": 0.4268343448638916, | |
| "learning_rate": 0.0002707117458466919, | |
| "loss": 3.1603, | |
| "step": 94250 | |
| }, | |
| { | |
| "epoch": 27.46868628022138, | |
| "grad_norm": 0.45762568712234497, | |
| "learning_rate": 0.00027053686971728353, | |
| "loss": 3.1528, | |
| "step": 94300 | |
| }, | |
| { | |
| "epoch": 27.483250801048644, | |
| "grad_norm": 0.41289466619491577, | |
| "learning_rate": 0.0002703619935878752, | |
| "loss": 3.1503, | |
| "step": 94350 | |
| }, | |
| { | |
| "epoch": 27.497815321875912, | |
| "grad_norm": 0.43442097306251526, | |
| "learning_rate": 0.0002701871174584669, | |
| "loss": 3.155, | |
| "step": 94400 | |
| }, | |
| { | |
| "epoch": 27.512379842703176, | |
| "grad_norm": 0.40458592772483826, | |
| "learning_rate": 0.00027001224132905855, | |
| "loss": 3.1625, | |
| "step": 94450 | |
| }, | |
| { | |
| "epoch": 27.52694436353044, | |
| "grad_norm": 0.3933391869068146, | |
| "learning_rate": 0.00026983736519965024, | |
| "loss": 3.1555, | |
| "step": 94500 | |
| }, | |
| { | |
| "epoch": 27.541508884357704, | |
| "grad_norm": 0.4415944814682007, | |
| "learning_rate": 0.0002696624890702419, | |
| "loss": 3.1709, | |
| "step": 94550 | |
| }, | |
| { | |
| "epoch": 27.556073405184968, | |
| "grad_norm": 0.43649542331695557, | |
| "learning_rate": 0.00026948761294083356, | |
| "loss": 3.1747, | |
| "step": 94600 | |
| }, | |
| { | |
| "epoch": 27.570637926012235, | |
| "grad_norm": 0.4254384934902191, | |
| "learning_rate": 0.0002693127368114252, | |
| "loss": 3.1716, | |
| "step": 94650 | |
| }, | |
| { | |
| "epoch": 27.5852024468395, | |
| "grad_norm": 0.4311431646347046, | |
| "learning_rate": 0.0002691378606820169, | |
| "loss": 3.1693, | |
| "step": 94700 | |
| }, | |
| { | |
| "epoch": 27.599766967666763, | |
| "grad_norm": 0.42994821071624756, | |
| "learning_rate": 0.0002689629845526085, | |
| "loss": 3.1747, | |
| "step": 94750 | |
| }, | |
| { | |
| "epoch": 27.614331488494027, | |
| "grad_norm": 0.44075125455856323, | |
| "learning_rate": 0.0002687881084232002, | |
| "loss": 3.1774, | |
| "step": 94800 | |
| }, | |
| { | |
| "epoch": 27.628896009321295, | |
| "grad_norm": 0.4176417589187622, | |
| "learning_rate": 0.00026861323229379185, | |
| "loss": 3.1704, | |
| "step": 94850 | |
| }, | |
| { | |
| "epoch": 27.64346053014856, | |
| "grad_norm": 0.4279754161834717, | |
| "learning_rate": 0.00026843835616438354, | |
| "loss": 3.1623, | |
| "step": 94900 | |
| }, | |
| { | |
| "epoch": 27.658025050975823, | |
| "grad_norm": 0.42134353518486023, | |
| "learning_rate": 0.0002682634800349752, | |
| "loss": 3.1738, | |
| "step": 94950 | |
| }, | |
| { | |
| "epoch": 27.672589571803087, | |
| "grad_norm": 0.40898674726486206, | |
| "learning_rate": 0.00026808860390556687, | |
| "loss": 3.1786, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 27.672589571803087, | |
| "eval_accuracy": 0.3749798172551679, | |
| "eval_loss": 3.540032386779785, | |
| "eval_runtime": 179.8274, | |
| "eval_samples_per_second": 92.539, | |
| "eval_steps_per_second": 5.789, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 27.68715409263035, | |
| "grad_norm": 0.43924784660339355, | |
| "learning_rate": 0.00026791372777615856, | |
| "loss": 3.1834, | |
| "step": 95050 | |
| }, | |
| { | |
| "epoch": 27.70171861345762, | |
| "grad_norm": 0.44329679012298584, | |
| "learning_rate": 0.0002677388516467502, | |
| "loss": 3.1736, | |
| "step": 95100 | |
| }, | |
| { | |
| "epoch": 27.716283134284883, | |
| "grad_norm": 0.4295293688774109, | |
| "learning_rate": 0.00026756397551734183, | |
| "loss": 3.1759, | |
| "step": 95150 | |
| }, | |
| { | |
| "epoch": 27.730847655112147, | |
| "grad_norm": 0.42074301838874817, | |
| "learning_rate": 0.0002673890993879335, | |
| "loss": 3.1631, | |
| "step": 95200 | |
| }, | |
| { | |
| "epoch": 27.74541217593941, | |
| "grad_norm": 0.4105515480041504, | |
| "learning_rate": 0.00026721422325852516, | |
| "loss": 3.16, | |
| "step": 95250 | |
| }, | |
| { | |
| "epoch": 27.759976696766678, | |
| "grad_norm": 0.42497944831848145, | |
| "learning_rate": 0.00026703934712911685, | |
| "loss": 3.1814, | |
| "step": 95300 | |
| }, | |
| { | |
| "epoch": 27.774541217593942, | |
| "grad_norm": 0.4068467915058136, | |
| "learning_rate": 0.00026686447099970854, | |
| "loss": 3.1802, | |
| "step": 95350 | |
| }, | |
| { | |
| "epoch": 27.789105738421206, | |
| "grad_norm": 0.41870132088661194, | |
| "learning_rate": 0.0002666895948703002, | |
| "loss": 3.1699, | |
| "step": 95400 | |
| }, | |
| { | |
| "epoch": 27.80367025924847, | |
| "grad_norm": 0.39957690238952637, | |
| "learning_rate": 0.0002665147187408918, | |
| "loss": 3.1722, | |
| "step": 95450 | |
| }, | |
| { | |
| "epoch": 27.818234780075734, | |
| "grad_norm": 0.42152419686317444, | |
| "learning_rate": 0.0002663398426114835, | |
| "loss": 3.1755, | |
| "step": 95500 | |
| }, | |
| { | |
| "epoch": 27.832799300903, | |
| "grad_norm": 0.46125051379203796, | |
| "learning_rate": 0.0002661649664820752, | |
| "loss": 3.1714, | |
| "step": 95550 | |
| }, | |
| { | |
| "epoch": 27.847363821730266, | |
| "grad_norm": 0.43029168248176575, | |
| "learning_rate": 0.00026599009035266683, | |
| "loss": 3.175, | |
| "step": 95600 | |
| }, | |
| { | |
| "epoch": 27.86192834255753, | |
| "grad_norm": 0.4232831597328186, | |
| "learning_rate": 0.0002658152142232585, | |
| "loss": 3.164, | |
| "step": 95650 | |
| }, | |
| { | |
| "epoch": 27.876492863384794, | |
| "grad_norm": 0.41490527987480164, | |
| "learning_rate": 0.00026564033809385016, | |
| "loss": 3.1884, | |
| "step": 95700 | |
| }, | |
| { | |
| "epoch": 27.891057384212058, | |
| "grad_norm": 0.4738544523715973, | |
| "learning_rate": 0.0002654654619644418, | |
| "loss": 3.1874, | |
| "step": 95750 | |
| }, | |
| { | |
| "epoch": 27.905621905039325, | |
| "grad_norm": 0.4612700343132019, | |
| "learning_rate": 0.0002652905858350335, | |
| "loss": 3.1621, | |
| "step": 95800 | |
| }, | |
| { | |
| "epoch": 27.92018642586659, | |
| "grad_norm": 0.4170343577861786, | |
| "learning_rate": 0.00026511570970562517, | |
| "loss": 3.1649, | |
| "step": 95850 | |
| }, | |
| { | |
| "epoch": 27.934750946693853, | |
| "grad_norm": 0.4307102560997009, | |
| "learning_rate": 0.0002649408335762168, | |
| "loss": 3.1862, | |
| "step": 95900 | |
| }, | |
| { | |
| "epoch": 27.949315467521117, | |
| "grad_norm": 0.3994083106517792, | |
| "learning_rate": 0.0002647659574468085, | |
| "loss": 3.1892, | |
| "step": 95950 | |
| }, | |
| { | |
| "epoch": 27.963879988348385, | |
| "grad_norm": 0.4175775647163391, | |
| "learning_rate": 0.0002645910813174002, | |
| "loss": 3.1759, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 27.963879988348385, | |
| "eval_accuracy": 0.37527063693743945, | |
| "eval_loss": 3.531499147415161, | |
| "eval_runtime": 180.0444, | |
| "eval_samples_per_second": 92.427, | |
| "eval_steps_per_second": 5.782, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 27.97844450917565, | |
| "grad_norm": 0.4074898660182953, | |
| "learning_rate": 0.0002644162051879918, | |
| "loss": 3.1731, | |
| "step": 96050 | |
| }, | |
| { | |
| "epoch": 27.993009030002913, | |
| "grad_norm": 0.4351227581501007, | |
| "learning_rate": 0.00026424132905858346, | |
| "loss": 3.2046, | |
| "step": 96100 | |
| }, | |
| { | |
| "epoch": 28.007573550830177, | |
| "grad_norm": 0.42597511410713196, | |
| "learning_rate": 0.00026406645292917515, | |
| "loss": 3.1421, | |
| "step": 96150 | |
| }, | |
| { | |
| "epoch": 28.02213807165744, | |
| "grad_norm": 0.4269953966140747, | |
| "learning_rate": 0.0002638915767997668, | |
| "loss": 3.0972, | |
| "step": 96200 | |
| }, | |
| { | |
| "epoch": 28.03670259248471, | |
| "grad_norm": 0.46322062611579895, | |
| "learning_rate": 0.0002637167006703585, | |
| "loss": 3.083, | |
| "step": 96250 | |
| }, | |
| { | |
| "epoch": 28.051267113311972, | |
| "grad_norm": 0.41330623626708984, | |
| "learning_rate": 0.00026354182454095017, | |
| "loss": 3.0867, | |
| "step": 96300 | |
| }, | |
| { | |
| "epoch": 28.065831634139236, | |
| "grad_norm": 0.4628863036632538, | |
| "learning_rate": 0.0002633669484115418, | |
| "loss": 3.1134, | |
| "step": 96350 | |
| }, | |
| { | |
| "epoch": 28.0803961549665, | |
| "grad_norm": 0.4199768304824829, | |
| "learning_rate": 0.00026319207228213344, | |
| "loss": 3.0953, | |
| "step": 96400 | |
| }, | |
| { | |
| "epoch": 28.094960675793768, | |
| "grad_norm": 0.43701887130737305, | |
| "learning_rate": 0.00026301719615272513, | |
| "loss": 3.1007, | |
| "step": 96450 | |
| }, | |
| { | |
| "epoch": 28.109525196621032, | |
| "grad_norm": 0.427827924489975, | |
| "learning_rate": 0.0002628423200233168, | |
| "loss": 3.1013, | |
| "step": 96500 | |
| }, | |
| { | |
| "epoch": 28.124089717448296, | |
| "grad_norm": 0.4370115101337433, | |
| "learning_rate": 0.00026266744389390846, | |
| "loss": 3.1081, | |
| "step": 96550 | |
| }, | |
| { | |
| "epoch": 28.13865423827556, | |
| "grad_norm": 0.41825947165489197, | |
| "learning_rate": 0.00026249256776450015, | |
| "loss": 3.1187, | |
| "step": 96600 | |
| }, | |
| { | |
| "epoch": 28.153218759102824, | |
| "grad_norm": 0.4340101182460785, | |
| "learning_rate": 0.0002623176916350918, | |
| "loss": 3.108, | |
| "step": 96650 | |
| }, | |
| { | |
| "epoch": 28.16778327993009, | |
| "grad_norm": 0.43236619234085083, | |
| "learning_rate": 0.0002621428155056834, | |
| "loss": 3.1045, | |
| "step": 96700 | |
| }, | |
| { | |
| "epoch": 28.182347800757356, | |
| "grad_norm": 0.42964163422584534, | |
| "learning_rate": 0.0002619679393762751, | |
| "loss": 3.1131, | |
| "step": 96750 | |
| }, | |
| { | |
| "epoch": 28.19691232158462, | |
| "grad_norm": 0.42945995926856995, | |
| "learning_rate": 0.0002617930632468668, | |
| "loss": 3.1188, | |
| "step": 96800 | |
| }, | |
| { | |
| "epoch": 28.211476842411884, | |
| "grad_norm": 0.4098386764526367, | |
| "learning_rate": 0.00026161818711745844, | |
| "loss": 3.1192, | |
| "step": 96850 | |
| }, | |
| { | |
| "epoch": 28.22604136323915, | |
| "grad_norm": 0.4059397578239441, | |
| "learning_rate": 0.0002614433109880501, | |
| "loss": 3.1344, | |
| "step": 96900 | |
| }, | |
| { | |
| "epoch": 28.240605884066415, | |
| "grad_norm": 0.4252930283546448, | |
| "learning_rate": 0.00026126843485864176, | |
| "loss": 3.1394, | |
| "step": 96950 | |
| }, | |
| { | |
| "epoch": 28.25517040489368, | |
| "grad_norm": 0.4543604254722595, | |
| "learning_rate": 0.00026109355872923345, | |
| "loss": 3.1214, | |
| "step": 97000 | |
| }, | |
| { | |
| "epoch": 28.25517040489368, | |
| "eval_accuracy": 0.374433810063584, | |
| "eval_loss": 3.551743507385254, | |
| "eval_runtime": 179.8752, | |
| "eval_samples_per_second": 92.514, | |
| "eval_steps_per_second": 5.787, | |
| "step": 97000 | |
| }, | |
| { | |
| "epoch": 28.269734925720943, | |
| "grad_norm": 0.4379526674747467, | |
| "learning_rate": 0.0002609186825998251, | |
| "loss": 3.1253, | |
| "step": 97050 | |
| }, | |
| { | |
| "epoch": 28.284299446548207, | |
| "grad_norm": 0.44791752099990845, | |
| "learning_rate": 0.0002607438064704168, | |
| "loss": 3.1335, | |
| "step": 97100 | |
| }, | |
| { | |
| "epoch": 28.298863967375475, | |
| "grad_norm": 0.4074171781539917, | |
| "learning_rate": 0.0002605689303410084, | |
| "loss": 3.1384, | |
| "step": 97150 | |
| }, | |
| { | |
| "epoch": 28.31342848820274, | |
| "grad_norm": 0.4251929819583893, | |
| "learning_rate": 0.00026039405421160005, | |
| "loss": 3.1338, | |
| "step": 97200 | |
| }, | |
| { | |
| "epoch": 28.327993009030003, | |
| "grad_norm": 0.4182596802711487, | |
| "learning_rate": 0.00026021917808219174, | |
| "loss": 3.1361, | |
| "step": 97250 | |
| }, | |
| { | |
| "epoch": 28.342557529857267, | |
| "grad_norm": 0.418155699968338, | |
| "learning_rate": 0.00026004430195278343, | |
| "loss": 3.1416, | |
| "step": 97300 | |
| }, | |
| { | |
| "epoch": 28.35712205068453, | |
| "grad_norm": 0.3963245153427124, | |
| "learning_rate": 0.00025986942582337507, | |
| "loss": 3.1434, | |
| "step": 97350 | |
| }, | |
| { | |
| "epoch": 28.3716865715118, | |
| "grad_norm": 0.4373180568218231, | |
| "learning_rate": 0.00025969454969396676, | |
| "loss": 3.1385, | |
| "step": 97400 | |
| }, | |
| { | |
| "epoch": 28.386251092339062, | |
| "grad_norm": 0.46445873379707336, | |
| "learning_rate": 0.00025951967356455845, | |
| "loss": 3.1333, | |
| "step": 97450 | |
| }, | |
| { | |
| "epoch": 28.400815613166326, | |
| "grad_norm": 0.43002384901046753, | |
| "learning_rate": 0.0002593447974351501, | |
| "loss": 3.1397, | |
| "step": 97500 | |
| }, | |
| { | |
| "epoch": 28.41538013399359, | |
| "grad_norm": 0.4203263521194458, | |
| "learning_rate": 0.0002591699213057417, | |
| "loss": 3.1423, | |
| "step": 97550 | |
| }, | |
| { | |
| "epoch": 28.429944654820858, | |
| "grad_norm": 0.422507107257843, | |
| "learning_rate": 0.0002589950451763334, | |
| "loss": 3.1467, | |
| "step": 97600 | |
| }, | |
| { | |
| "epoch": 28.444509175648122, | |
| "grad_norm": 0.4163612425327301, | |
| "learning_rate": 0.00025882016904692505, | |
| "loss": 3.1577, | |
| "step": 97650 | |
| }, | |
| { | |
| "epoch": 28.459073696475386, | |
| "grad_norm": 0.42211979627609253, | |
| "learning_rate": 0.00025864529291751674, | |
| "loss": 3.1456, | |
| "step": 97700 | |
| }, | |
| { | |
| "epoch": 28.47363821730265, | |
| "grad_norm": 0.4921233057975769, | |
| "learning_rate": 0.00025847041678810843, | |
| "loss": 3.1399, | |
| "step": 97750 | |
| }, | |
| { | |
| "epoch": 28.488202738129914, | |
| "grad_norm": 0.4806009829044342, | |
| "learning_rate": 0.00025829554065870007, | |
| "loss": 3.1502, | |
| "step": 97800 | |
| }, | |
| { | |
| "epoch": 28.50276725895718, | |
| "grad_norm": 0.44099995493888855, | |
| "learning_rate": 0.0002581206645292917, | |
| "loss": 3.1552, | |
| "step": 97850 | |
| }, | |
| { | |
| "epoch": 28.517331779784445, | |
| "grad_norm": 0.4322744607925415, | |
| "learning_rate": 0.0002579457883998834, | |
| "loss": 3.1645, | |
| "step": 97900 | |
| }, | |
| { | |
| "epoch": 28.53189630061171, | |
| "grad_norm": 0.4310096800327301, | |
| "learning_rate": 0.0002577709122704751, | |
| "loss": 3.1428, | |
| "step": 97950 | |
| }, | |
| { | |
| "epoch": 28.546460821438973, | |
| "grad_norm": 0.43033161759376526, | |
| "learning_rate": 0.0002575960361410667, | |
| "loss": 3.1545, | |
| "step": 98000 | |
| }, | |
| { | |
| "epoch": 28.546460821438973, | |
| "eval_accuracy": 0.37523300559966066, | |
| "eval_loss": 3.5400564670562744, | |
| "eval_runtime": 180.0919, | |
| "eval_samples_per_second": 92.403, | |
| "eval_steps_per_second": 5.78, | |
| "step": 98000 | |
| }, | |
| { | |
| "epoch": 28.56102534226624, | |
| "grad_norm": 0.45368677377700806, | |
| "learning_rate": 0.0002574211600116584, | |
| "loss": 3.1597, | |
| "step": 98050 | |
| }, | |
| { | |
| "epoch": 28.575589863093505, | |
| "grad_norm": 0.40790659189224243, | |
| "learning_rate": 0.00025724628388225005, | |
| "loss": 3.1517, | |
| "step": 98100 | |
| }, | |
| { | |
| "epoch": 28.59015438392077, | |
| "grad_norm": 0.4356526732444763, | |
| "learning_rate": 0.0002570714077528417, | |
| "loss": 3.1551, | |
| "step": 98150 | |
| }, | |
| { | |
| "epoch": 28.604718904748033, | |
| "grad_norm": 0.4357260465621948, | |
| "learning_rate": 0.0002568965316234334, | |
| "loss": 3.1605, | |
| "step": 98200 | |
| }, | |
| { | |
| "epoch": 28.619283425575297, | |
| "grad_norm": 0.40190744400024414, | |
| "learning_rate": 0.00025672165549402506, | |
| "loss": 3.1605, | |
| "step": 98250 | |
| }, | |
| { | |
| "epoch": 28.633847946402565, | |
| "grad_norm": 0.43007245659828186, | |
| "learning_rate": 0.0002565467793646167, | |
| "loss": 3.1633, | |
| "step": 98300 | |
| }, | |
| { | |
| "epoch": 28.64841246722983, | |
| "grad_norm": 0.4541544020175934, | |
| "learning_rate": 0.0002563719032352084, | |
| "loss": 3.1565, | |
| "step": 98350 | |
| }, | |
| { | |
| "epoch": 28.662976988057093, | |
| "grad_norm": 0.4124305248260498, | |
| "learning_rate": 0.0002561970271058, | |
| "loss": 3.1633, | |
| "step": 98400 | |
| }, | |
| { | |
| "epoch": 28.677541508884357, | |
| "grad_norm": 0.4222710132598877, | |
| "learning_rate": 0.0002560221509763917, | |
| "loss": 3.1576, | |
| "step": 98450 | |
| }, | |
| { | |
| "epoch": 28.692106029711624, | |
| "grad_norm": 0.4161086976528168, | |
| "learning_rate": 0.00025584727484698335, | |
| "loss": 3.1599, | |
| "step": 98500 | |
| }, | |
| { | |
| "epoch": 28.706670550538888, | |
| "grad_norm": 0.42461931705474854, | |
| "learning_rate": 0.00025567239871757504, | |
| "loss": 3.1562, | |
| "step": 98550 | |
| }, | |
| { | |
| "epoch": 28.721235071366152, | |
| "grad_norm": 0.43226149678230286, | |
| "learning_rate": 0.0002554975225881667, | |
| "loss": 3.1601, | |
| "step": 98600 | |
| }, | |
| { | |
| "epoch": 28.735799592193416, | |
| "grad_norm": 0.45831042528152466, | |
| "learning_rate": 0.00025532264645875837, | |
| "loss": 3.1543, | |
| "step": 98650 | |
| }, | |
| { | |
| "epoch": 28.75036411302068, | |
| "grad_norm": 0.4084044098854065, | |
| "learning_rate": 0.00025514777032935, | |
| "loss": 3.1819, | |
| "step": 98700 | |
| }, | |
| { | |
| "epoch": 28.764928633847948, | |
| "grad_norm": 0.4427086114883423, | |
| "learning_rate": 0.0002549728941999417, | |
| "loss": 3.1693, | |
| "step": 98750 | |
| }, | |
| { | |
| "epoch": 28.77949315467521, | |
| "grad_norm": 0.4414818584918976, | |
| "learning_rate": 0.00025479801807053333, | |
| "loss": 3.1639, | |
| "step": 98800 | |
| }, | |
| { | |
| "epoch": 28.794057675502476, | |
| "grad_norm": 0.4295366704463959, | |
| "learning_rate": 0.000254623141941125, | |
| "loss": 3.1766, | |
| "step": 98850 | |
| }, | |
| { | |
| "epoch": 28.80862219632974, | |
| "grad_norm": 0.4654344320297241, | |
| "learning_rate": 0.0002544482658117167, | |
| "loss": 3.1661, | |
| "step": 98900 | |
| }, | |
| { | |
| "epoch": 28.823186717157007, | |
| "grad_norm": 0.4341161549091339, | |
| "learning_rate": 0.00025427338968230835, | |
| "loss": 3.1745, | |
| "step": 98950 | |
| }, | |
| { | |
| "epoch": 28.83775123798427, | |
| "grad_norm": 0.42561206221580505, | |
| "learning_rate": 0.0002540985135529, | |
| "loss": 3.1694, | |
| "step": 99000 | |
| }, | |
| { | |
| "epoch": 28.83775123798427, | |
| "eval_accuracy": 0.37556016304247486, | |
| "eval_loss": 3.533388376235962, | |
| "eval_runtime": 179.9588, | |
| "eval_samples_per_second": 92.471, | |
| "eval_steps_per_second": 5.785, | |
| "step": 99000 | |
| }, | |
| { | |
| "epoch": 28.852315758811535, | |
| "grad_norm": 0.4475663900375366, | |
| "learning_rate": 0.0002539236374234917, | |
| "loss": 3.167, | |
| "step": 99050 | |
| }, | |
| { | |
| "epoch": 28.8668802796388, | |
| "grad_norm": 0.42597696185112, | |
| "learning_rate": 0.0002537487612940833, | |
| "loss": 3.177, | |
| "step": 99100 | |
| }, | |
| { | |
| "epoch": 28.881444800466063, | |
| "grad_norm": 0.4047093093395233, | |
| "learning_rate": 0.000253573885164675, | |
| "loss": 3.1726, | |
| "step": 99150 | |
| }, | |
| { | |
| "epoch": 28.89600932129333, | |
| "grad_norm": 0.41124317049980164, | |
| "learning_rate": 0.0002533990090352667, | |
| "loss": 3.1627, | |
| "step": 99200 | |
| }, | |
| { | |
| "epoch": 28.910573842120595, | |
| "grad_norm": 0.448076993227005, | |
| "learning_rate": 0.00025322413290585833, | |
| "loss": 3.1722, | |
| "step": 99250 | |
| }, | |
| { | |
| "epoch": 28.92513836294786, | |
| "grad_norm": 0.472428560256958, | |
| "learning_rate": 0.00025304925677644997, | |
| "loss": 3.1716, | |
| "step": 99300 | |
| }, | |
| { | |
| "epoch": 28.939702883775123, | |
| "grad_norm": 0.4386545419692993, | |
| "learning_rate": 0.00025287438064704166, | |
| "loss": 3.1641, | |
| "step": 99350 | |
| }, | |
| { | |
| "epoch": 28.954267404602387, | |
| "grad_norm": 0.43108075857162476, | |
| "learning_rate": 0.00025269950451763335, | |
| "loss": 3.1787, | |
| "step": 99400 | |
| }, | |
| { | |
| "epoch": 28.968831925429654, | |
| "grad_norm": 0.4541033208370209, | |
| "learning_rate": 0.000252524628388225, | |
| "loss": 3.1614, | |
| "step": 99450 | |
| }, | |
| { | |
| "epoch": 28.98339644625692, | |
| "grad_norm": 0.44250738620758057, | |
| "learning_rate": 0.0002523497522588167, | |
| "loss": 3.1782, | |
| "step": 99500 | |
| }, | |
| { | |
| "epoch": 28.997960967084182, | |
| "grad_norm": 0.43782711029052734, | |
| "learning_rate": 0.0002521748761294083, | |
| "loss": 3.1698, | |
| "step": 99550 | |
| }, | |
| { | |
| "epoch": 29.012525487911446, | |
| "grad_norm": 0.4725230932235718, | |
| "learning_rate": 0.00025199999999999995, | |
| "loss": 3.0891, | |
| "step": 99600 | |
| }, | |
| { | |
| "epoch": 29.027090008738714, | |
| "grad_norm": 0.4644363522529602, | |
| "learning_rate": 0.00025182512387059164, | |
| "loss": 3.0664, | |
| "step": 99650 | |
| }, | |
| { | |
| "epoch": 29.041654529565978, | |
| "grad_norm": 0.43191707134246826, | |
| "learning_rate": 0.0002516502477411833, | |
| "loss": 3.0881, | |
| "step": 99700 | |
| }, | |
| { | |
| "epoch": 29.056219050393242, | |
| "grad_norm": 0.45804300904273987, | |
| "learning_rate": 0.00025147537161177496, | |
| "loss": 3.0822, | |
| "step": 99750 | |
| }, | |
| { | |
| "epoch": 29.070783571220506, | |
| "grad_norm": 0.4388163685798645, | |
| "learning_rate": 0.00025130049548236665, | |
| "loss": 3.0982, | |
| "step": 99800 | |
| }, | |
| { | |
| "epoch": 29.08534809204777, | |
| "grad_norm": 0.4603483974933624, | |
| "learning_rate": 0.0002511256193529583, | |
| "loss": 3.0999, | |
| "step": 99850 | |
| }, | |
| { | |
| "epoch": 29.099912612875038, | |
| "grad_norm": 0.4348946809768677, | |
| "learning_rate": 0.00025095074322355, | |
| "loss": 3.0925, | |
| "step": 99900 | |
| }, | |
| { | |
| "epoch": 29.1144771337023, | |
| "grad_norm": 0.4159044027328491, | |
| "learning_rate": 0.0002507758670941416, | |
| "loss": 3.0993, | |
| "step": 99950 | |
| }, | |
| { | |
| "epoch": 29.129041654529566, | |
| "grad_norm": 0.4123495817184448, | |
| "learning_rate": 0.0002506009909647333, | |
| "loss": 3.095, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 29.129041654529566, | |
| "eval_accuracy": 0.37440417538508325, | |
| "eval_loss": 3.551318645477295, | |
| "eval_runtime": 179.942, | |
| "eval_samples_per_second": 92.48, | |
| "eval_steps_per_second": 5.785, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 29.129041654529566, | |
| "step": 100000, | |
| "total_flos": 2.090252903841792e+18, | |
| "train_loss": 0.6325678546142578, | |
| "train_runtime": 39813.0313, | |
| "train_samples_per_second": 344.898, | |
| "train_steps_per_second": 4.311 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 171650, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 50, | |
| "save_steps": 10000, | |
| "stateful_callbacks": { | |
| "EarlyStoppingCallback": { | |
| "args": { | |
| "early_stopping_patience": 20, | |
| "early_stopping_threshold": 0.0 | |
| }, | |
| "attributes": { | |
| "early_stopping_patience_counter": 20 | |
| } | |
| }, | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.090252903841792e+18, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |