| { | |
| "best_global_step": 72000, | |
| "best_metric": 3.5322375297546387, | |
| "best_model_checkpoint": "/scratch/cl5625/exceptions/models/last_to_hit_frequency_2128/checkpoint-30000", | |
| "epoch": 29.129340480074575, | |
| "eval_steps": 1000, | |
| "global_step": 100000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.01456536937776742, | |
| "grad_norm": 0.8308652639389038, | |
| "learning_rate": 0.000294, | |
| "loss": 8.4387, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.02913073875553484, | |
| "grad_norm": 0.6382876038551331, | |
| "learning_rate": 0.0005939999999999999, | |
| "loss": 6.7184, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.04369610813330226, | |
| "grad_norm": 0.4452700912952423, | |
| "learning_rate": 0.0005998286213931798, | |
| "loss": 6.3602, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.05826147751106968, | |
| "grad_norm": 0.4627819061279297, | |
| "learning_rate": 0.0005996537452637714, | |
| "loss": 6.1529, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.0728268468888371, | |
| "grad_norm": 0.4908621311187744, | |
| "learning_rate": 0.0005994788691343632, | |
| "loss": 5.9753, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.08739221626660452, | |
| "grad_norm": 0.46045973896980286, | |
| "learning_rate": 0.0005993039930049548, | |
| "loss": 5.8546, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.10195758564437195, | |
| "grad_norm": 0.4518718123435974, | |
| "learning_rate": 0.0005991291168755465, | |
| "loss": 5.7234, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.11652295502213936, | |
| "grad_norm": 0.41379204392433167, | |
| "learning_rate": 0.0005989542407461382, | |
| "loss": 5.6087, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.13108832439990678, | |
| "grad_norm": 0.42392534017562866, | |
| "learning_rate": 0.0005987793646167297, | |
| "loss": 5.5138, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.1456536937776742, | |
| "grad_norm": 0.44991201162338257, | |
| "learning_rate": 0.0005986044884873214, | |
| "loss": 5.4074, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.16021906315544163, | |
| "grad_norm": 0.4449133574962616, | |
| "learning_rate": 0.0005984296123579131, | |
| "loss": 5.3554, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.17478443253320905, | |
| "grad_norm": 0.43911343812942505, | |
| "learning_rate": 0.0005982547362285047, | |
| "loss": 5.2571, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.18934980191097647, | |
| "grad_norm": 0.44633105397224426, | |
| "learning_rate": 0.0005980798600990964, | |
| "loss": 5.194, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.2039151712887439, | |
| "grad_norm": 0.5142127275466919, | |
| "learning_rate": 0.0005979049839696881, | |
| "loss": 5.1364, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.2184805406665113, | |
| "grad_norm": 0.42377275228500366, | |
| "learning_rate": 0.0005977301078402798, | |
| "loss": 5.0645, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.23304591004427871, | |
| "grad_norm": 0.49829915165901184, | |
| "learning_rate": 0.0005975552317108715, | |
| "loss": 5.0326, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.24761127942204614, | |
| "grad_norm": 0.4879177212715149, | |
| "learning_rate": 0.0005973803555814631, | |
| "loss": 4.9787, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.26217664879981356, | |
| "grad_norm": 0.40835854411125183, | |
| "learning_rate": 0.0005972054794520547, | |
| "loss": 4.9266, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.276742018177581, | |
| "grad_norm": 0.40832623839378357, | |
| "learning_rate": 0.0005970306033226464, | |
| "loss": 4.8749, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.2913073875553484, | |
| "grad_norm": 0.41372886300086975, | |
| "learning_rate": 0.0005968557271932381, | |
| "loss": 4.8229, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.2913073875553484, | |
| "eval_accuracy": 0.2537010050443267, | |
| "eval_loss": 4.7555251121521, | |
| "eval_runtime": 182.9052, | |
| "eval_samples_per_second": 91.003, | |
| "eval_steps_per_second": 5.691, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.30587275693311583, | |
| "grad_norm": 0.4627732038497925, | |
| "learning_rate": 0.0005966808510638297, | |
| "loss": 4.7732, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.32043812631088325, | |
| "grad_norm": 0.5072389245033264, | |
| "learning_rate": 0.0005965059749344214, | |
| "loss": 4.7445, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.3350034956886507, | |
| "grad_norm": 0.4509411156177521, | |
| "learning_rate": 0.0005963310988050131, | |
| "loss": 4.7006, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.3495688650664181, | |
| "grad_norm": 0.4881671965122223, | |
| "learning_rate": 0.0005961562226756047, | |
| "loss": 4.6717, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.3641342344441855, | |
| "grad_norm": 0.43225303292274475, | |
| "learning_rate": 0.0005959813465461965, | |
| "loss": 4.6353, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.37869960382195295, | |
| "grad_norm": 0.40277963876724243, | |
| "learning_rate": 0.000595806470416788, | |
| "loss": 4.6007, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.39326497319972037, | |
| "grad_norm": 0.5126720070838928, | |
| "learning_rate": 0.0005956315942873797, | |
| "loss": 4.5957, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.4078303425774878, | |
| "grad_norm": 0.4341893196105957, | |
| "learning_rate": 0.0005954567181579714, | |
| "loss": 4.5667, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.42239571195525516, | |
| "grad_norm": 0.466202974319458, | |
| "learning_rate": 0.000595281842028563, | |
| "loss": 4.5251, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.4369610813330226, | |
| "grad_norm": 0.47109341621398926, | |
| "learning_rate": 0.0005951069658991547, | |
| "loss": 4.504, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.45152645071079, | |
| "grad_norm": 0.39960241317749023, | |
| "learning_rate": 0.0005949320897697464, | |
| "loss": 4.4896, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.46609182008855743, | |
| "grad_norm": 0.3836056590080261, | |
| "learning_rate": 0.0005947572136403381, | |
| "loss": 4.465, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.48065718946632485, | |
| "grad_norm": 0.4436993896961212, | |
| "learning_rate": 0.0005945823375109297, | |
| "loss": 4.4483, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.4952225588440923, | |
| "grad_norm": 0.49260610342025757, | |
| "learning_rate": 0.0005944074613815215, | |
| "loss": 4.4329, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.5097879282218597, | |
| "grad_norm": 0.40900883078575134, | |
| "learning_rate": 0.000594232585252113, | |
| "loss": 4.4216, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.5243532975996271, | |
| "grad_norm": 0.38395267724990845, | |
| "learning_rate": 0.0005940577091227047, | |
| "loss": 4.3901, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.5389186669773945, | |
| "grad_norm": 0.4002784490585327, | |
| "learning_rate": 0.0005938828329932964, | |
| "loss": 4.3719, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.553484036355162, | |
| "grad_norm": 0.39246320724487305, | |
| "learning_rate": 0.000593707956863888, | |
| "loss": 4.3556, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.5680494057329294, | |
| "grad_norm": 0.429328054189682, | |
| "learning_rate": 0.0005935330807344797, | |
| "loss": 4.3495, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.5826147751106968, | |
| "grad_norm": 0.37213221192359924, | |
| "learning_rate": 0.0005933582046050714, | |
| "loss": 4.3272, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.5826147751106968, | |
| "eval_accuracy": 0.29963923746663224, | |
| "eval_loss": 4.282708644866943, | |
| "eval_runtime": 182.1675, | |
| "eval_samples_per_second": 91.372, | |
| "eval_steps_per_second": 5.715, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.5971801444884642, | |
| "grad_norm": 0.4040358066558838, | |
| "learning_rate": 0.000593183328475663, | |
| "loss": 4.3313, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.6117455138662317, | |
| "grad_norm": 0.4094908535480499, | |
| "learning_rate": 0.0005930084523462546, | |
| "loss": 4.3198, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.6263108832439991, | |
| "grad_norm": 0.39454561471939087, | |
| "learning_rate": 0.0005928335762168463, | |
| "loss": 4.3, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.6408762526217665, | |
| "grad_norm": 0.3934589624404907, | |
| "learning_rate": 0.000592658700087438, | |
| "loss": 4.3025, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.6554416219995339, | |
| "grad_norm": 0.386088103055954, | |
| "learning_rate": 0.0005924838239580297, | |
| "loss": 4.2816, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.6700069913773014, | |
| "grad_norm": 0.4144304096698761, | |
| "learning_rate": 0.0005923089478286214, | |
| "loss": 4.2738, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.6845723607550688, | |
| "grad_norm": 0.4177938401699066, | |
| "learning_rate": 0.000592134071699213, | |
| "loss": 4.2481, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.6991377301328362, | |
| "grad_norm": 0.36143994331359863, | |
| "learning_rate": 0.0005919591955698047, | |
| "loss": 4.2475, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.7137030995106036, | |
| "grad_norm": 0.3758913278579712, | |
| "learning_rate": 0.0005917843194403964, | |
| "loss": 4.233, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.728268468888371, | |
| "grad_norm": 0.3990520238876343, | |
| "learning_rate": 0.000591609443310988, | |
| "loss": 4.2258, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.7428338382661385, | |
| "grad_norm": 0.341074138879776, | |
| "learning_rate": 0.0005914345671815796, | |
| "loss": 4.2438, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.7573992076439059, | |
| "grad_norm": 0.3899039924144745, | |
| "learning_rate": 0.0005912596910521713, | |
| "loss": 4.2077, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.7719645770216733, | |
| "grad_norm": 0.3767816424369812, | |
| "learning_rate": 0.0005910848149227629, | |
| "loss": 4.1915, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.7865299463994407, | |
| "grad_norm": 0.3586917221546173, | |
| "learning_rate": 0.0005909099387933547, | |
| "loss": 4.192, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.8010953157772082, | |
| "grad_norm": 0.3622867465019226, | |
| "learning_rate": 0.0005907350626639463, | |
| "loss": 4.1811, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.8156606851549756, | |
| "grad_norm": 0.34509238600730896, | |
| "learning_rate": 0.000590560186534538, | |
| "loss": 4.1775, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.8302260545327429, | |
| "grad_norm": 0.3676803708076477, | |
| "learning_rate": 0.0005903853104051297, | |
| "loss": 4.1742, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.8447914239105103, | |
| "grad_norm": 0.34232082962989807, | |
| "learning_rate": 0.0005902104342757214, | |
| "loss": 4.1572, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.8593567932882777, | |
| "grad_norm": 0.3571970760822296, | |
| "learning_rate": 0.000590035558146313, | |
| "loss": 4.1584, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.8739221626660452, | |
| "grad_norm": 0.35053205490112305, | |
| "learning_rate": 0.0005898606820169046, | |
| "loss": 4.142, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.8739221626660452, | |
| "eval_accuracy": 0.31524614092253395, | |
| "eval_loss": 4.096119403839111, | |
| "eval_runtime": 182.324, | |
| "eval_samples_per_second": 91.294, | |
| "eval_steps_per_second": 5.71, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.8884875320438126, | |
| "grad_norm": 0.4020876884460449, | |
| "learning_rate": 0.0005896858058874963, | |
| "loss": 4.1395, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.90305290142158, | |
| "grad_norm": 0.3728208839893341, | |
| "learning_rate": 0.0005895109297580879, | |
| "loss": 4.14, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.9176182707993474, | |
| "grad_norm": 0.3634420335292816, | |
| "learning_rate": 0.0005893360536286797, | |
| "loss": 4.1265, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.9321836401771149, | |
| "grad_norm": 0.3639736473560333, | |
| "learning_rate": 0.0005891611774992713, | |
| "loss": 4.1264, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.9467490095548823, | |
| "grad_norm": 0.3608086407184601, | |
| "learning_rate": 0.000588986301369863, | |
| "loss": 4.1242, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.9613143789326497, | |
| "grad_norm": 0.37832555174827576, | |
| "learning_rate": 0.0005888114252404547, | |
| "loss": 4.1139, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.9758797483104171, | |
| "grad_norm": 0.3500097990036011, | |
| "learning_rate": 0.0005886365491110463, | |
| "loss": 4.0998, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.9904451176881846, | |
| "grad_norm": 0.34508016705513, | |
| "learning_rate": 0.000588461672981638, | |
| "loss": 4.0876, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.0049522255884409, | |
| "grad_norm": 0.346635103225708, | |
| "learning_rate": 0.0005882867968522296, | |
| "loss": 4.0698, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 1.0195175949662083, | |
| "grad_norm": 0.351591020822525, | |
| "learning_rate": 0.0005881119207228212, | |
| "loss": 4.0152, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.0340829643439757, | |
| "grad_norm": 0.342751145362854, | |
| "learning_rate": 0.0005879370445934129, | |
| "loss": 4.0189, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 1.0486483337217432, | |
| "grad_norm": 0.34400904178619385, | |
| "learning_rate": 0.0005877621684640046, | |
| "loss": 4.0082, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.0632137030995106, | |
| "grad_norm": 0.3556966483592987, | |
| "learning_rate": 0.0005875872923345963, | |
| "loss": 4.0164, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 1.077779072477278, | |
| "grad_norm": 0.3546448349952698, | |
| "learning_rate": 0.000587412416205188, | |
| "loss": 4.0009, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.0923444418550454, | |
| "grad_norm": 0.37118959426879883, | |
| "learning_rate": 0.0005872375400757797, | |
| "loss": 4.0167, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 1.1069098112328128, | |
| "grad_norm": 0.3513905704021454, | |
| "learning_rate": 0.0005870626639463713, | |
| "loss": 4.0167, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.1214751806105803, | |
| "grad_norm": 0.3534930348396301, | |
| "learning_rate": 0.0005868877878169629, | |
| "loss": 3.9929, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 1.1360405499883477, | |
| "grad_norm": 0.3811454176902771, | |
| "learning_rate": 0.0005867129116875546, | |
| "loss": 3.9999, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.1506059193661151, | |
| "grad_norm": 0.34738337993621826, | |
| "learning_rate": 0.0005865380355581462, | |
| "loss": 4.0029, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 1.1651712887438825, | |
| "grad_norm": 0.3603108525276184, | |
| "learning_rate": 0.0005863631594287379, | |
| "loss": 3.9925, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.1651712887438825, | |
| "eval_accuracy": 0.32528376909551887, | |
| "eval_loss": 3.9905245304107666, | |
| "eval_runtime": 182.0264, | |
| "eval_samples_per_second": 91.443, | |
| "eval_steps_per_second": 5.719, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.17973665812165, | |
| "grad_norm": 0.3429189622402191, | |
| "learning_rate": 0.0005861882832993296, | |
| "loss": 3.9933, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 1.1943020274994174, | |
| "grad_norm": 0.38075006008148193, | |
| "learning_rate": 0.0005860134071699212, | |
| "loss": 3.9855, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.2088673968771848, | |
| "grad_norm": 0.3338114321231842, | |
| "learning_rate": 0.000585838531040513, | |
| "loss": 3.9794, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 1.2234327662549522, | |
| "grad_norm": 0.36873266100883484, | |
| "learning_rate": 0.0005856636549111046, | |
| "loss": 3.9787, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.2379981356327197, | |
| "grad_norm": 0.32824212312698364, | |
| "learning_rate": 0.0005854887787816963, | |
| "loss": 3.9766, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 1.252563505010487, | |
| "grad_norm": 0.3516254723072052, | |
| "learning_rate": 0.0005853139026522879, | |
| "loss": 3.9651, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.2671288743882545, | |
| "grad_norm": 0.32511937618255615, | |
| "learning_rate": 0.0005851390265228796, | |
| "loss": 3.9725, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 1.281694243766022, | |
| "grad_norm": 0.32222479581832886, | |
| "learning_rate": 0.0005849641503934712, | |
| "loss": 3.9755, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.2962596131437893, | |
| "grad_norm": 0.3308519423007965, | |
| "learning_rate": 0.0005847892742640629, | |
| "loss": 3.9635, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 1.3108249825215568, | |
| "grad_norm": 0.32887038588523865, | |
| "learning_rate": 0.0005846143981346546, | |
| "loss": 3.9673, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.3253903518993242, | |
| "grad_norm": 0.33978450298309326, | |
| "learning_rate": 0.0005844395220052462, | |
| "loss": 3.9637, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 1.3399557212770916, | |
| "grad_norm": 0.3525462746620178, | |
| "learning_rate": 0.000584264645875838, | |
| "loss": 3.9552, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.354521090654859, | |
| "grad_norm": 0.3444075882434845, | |
| "learning_rate": 0.0005840897697464296, | |
| "loss": 3.953, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 1.3690864600326265, | |
| "grad_norm": 0.34169191122055054, | |
| "learning_rate": 0.0005839148936170212, | |
| "loss": 3.9486, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 1.3836518294103939, | |
| "grad_norm": 0.3524395227432251, | |
| "learning_rate": 0.0005837400174876129, | |
| "loss": 3.941, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 1.3982171987881613, | |
| "grad_norm": 0.3321269154548645, | |
| "learning_rate": 0.0005835651413582045, | |
| "loss": 3.9375, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.4127825681659287, | |
| "grad_norm": 0.3419478237628937, | |
| "learning_rate": 0.0005833902652287962, | |
| "loss": 3.9489, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 1.4273479375436962, | |
| "grad_norm": 0.33724281191825867, | |
| "learning_rate": 0.0005832153890993879, | |
| "loss": 3.939, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.4419133069214636, | |
| "grad_norm": 0.32965749502182007, | |
| "learning_rate": 0.0005830405129699796, | |
| "loss": 3.9303, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 1.456478676299231, | |
| "grad_norm": 0.32706567645072937, | |
| "learning_rate": 0.0005828656368405712, | |
| "loss": 3.9195, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.456478676299231, | |
| "eval_accuracy": 0.33166874275109504, | |
| "eval_loss": 3.9142563343048096, | |
| "eval_runtime": 182.2224, | |
| "eval_samples_per_second": 91.344, | |
| "eval_steps_per_second": 5.713, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.4710440456769984, | |
| "grad_norm": 0.33142364025115967, | |
| "learning_rate": 0.0005826907607111629, | |
| "loss": 3.9196, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 1.4856094150547658, | |
| "grad_norm": 0.3274194896221161, | |
| "learning_rate": 0.0005825158845817546, | |
| "loss": 3.9284, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 1.500174784432533, | |
| "grad_norm": 0.35101616382598877, | |
| "learning_rate": 0.0005823410084523462, | |
| "loss": 3.9313, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 1.5147401538103007, | |
| "grad_norm": 0.3553934395313263, | |
| "learning_rate": 0.0005821661323229379, | |
| "loss": 3.9222, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 1.529305523188068, | |
| "grad_norm": 0.32745224237442017, | |
| "learning_rate": 0.0005819912561935295, | |
| "loss": 3.9211, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 1.5438708925658355, | |
| "grad_norm": 0.32173994183540344, | |
| "learning_rate": 0.0005818163800641212, | |
| "loss": 3.9237, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 1.5584362619436027, | |
| "grad_norm": 0.3147367835044861, | |
| "learning_rate": 0.0005816415039347129, | |
| "loss": 3.9025, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 1.5730016313213704, | |
| "grad_norm": 0.3226154148578644, | |
| "learning_rate": 0.0005814666278053045, | |
| "loss": 3.9197, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 1.5875670006991376, | |
| "grad_norm": 0.3392418324947357, | |
| "learning_rate": 0.0005812917516758962, | |
| "loss": 3.9199, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 1.6021323700769052, | |
| "grad_norm": 0.3240615427494049, | |
| "learning_rate": 0.0005811168755464879, | |
| "loss": 3.9066, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.6166977394546724, | |
| "grad_norm": 0.3571517765522003, | |
| "learning_rate": 0.0005809419994170794, | |
| "loss": 3.9046, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 1.63126310883244, | |
| "grad_norm": 0.3363195061683655, | |
| "learning_rate": 0.0005807671232876712, | |
| "loss": 3.895, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 1.6458284782102073, | |
| "grad_norm": 0.35087713599205017, | |
| "learning_rate": 0.0005805922471582628, | |
| "loss": 3.895, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 1.660393847587975, | |
| "grad_norm": 0.3502371907234192, | |
| "learning_rate": 0.0005804173710288545, | |
| "loss": 3.8907, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 1.6749592169657421, | |
| "grad_norm": 0.37237074971199036, | |
| "learning_rate": 0.0005802424948994462, | |
| "loss": 3.8942, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 1.6895245863435098, | |
| "grad_norm": 0.3460238575935364, | |
| "learning_rate": 0.0005800676187700379, | |
| "loss": 3.8973, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 1.704089955721277, | |
| "grad_norm": 0.3247397243976593, | |
| "learning_rate": 0.0005798927426406295, | |
| "loss": 3.8874, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 1.7186553250990446, | |
| "grad_norm": 0.37104010581970215, | |
| "learning_rate": 0.0005797178665112212, | |
| "loss": 3.8913, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 1.7332206944768118, | |
| "grad_norm": 0.32479041814804077, | |
| "learning_rate": 0.0005795429903818129, | |
| "loss": 3.8934, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 1.7477860638545795, | |
| "grad_norm": 0.34869810938835144, | |
| "learning_rate": 0.0005793681142524044, | |
| "loss": 3.8842, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.7477860638545795, | |
| "eval_accuracy": 0.3369487966450319, | |
| "eval_loss": 3.8599915504455566, | |
| "eval_runtime": 182.1572, | |
| "eval_samples_per_second": 91.377, | |
| "eval_steps_per_second": 5.715, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.7623514332323467, | |
| "grad_norm": 0.32061490416526794, | |
| "learning_rate": 0.0005791932381229961, | |
| "loss": 3.8863, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 1.7769168026101143, | |
| "grad_norm": 0.3404031991958618, | |
| "learning_rate": 0.0005790183619935878, | |
| "loss": 3.882, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 1.7914821719878815, | |
| "grad_norm": 0.30672210454940796, | |
| "learning_rate": 0.0005788434858641795, | |
| "loss": 3.8718, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 1.8060475413656492, | |
| "grad_norm": 0.3539854884147644, | |
| "learning_rate": 0.0005786686097347712, | |
| "loss": 3.8862, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 1.8206129107434164, | |
| "grad_norm": 0.33336907625198364, | |
| "learning_rate": 0.0005784937336053628, | |
| "loss": 3.8618, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 1.835178280121184, | |
| "grad_norm": 0.3471635580062866, | |
| "learning_rate": 0.0005783188574759545, | |
| "loss": 3.8604, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 1.8497436494989512, | |
| "grad_norm": 0.32666853070259094, | |
| "learning_rate": 0.0005781439813465462, | |
| "loss": 3.8604, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 1.8643090188767188, | |
| "grad_norm": 0.3473672866821289, | |
| "learning_rate": 0.0005779691052171379, | |
| "loss": 3.8667, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 1.878874388254486, | |
| "grad_norm": 0.3199038803577423, | |
| "learning_rate": 0.0005777942290877294, | |
| "loss": 3.8611, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 1.8934397576322537, | |
| "grad_norm": 0.32697010040283203, | |
| "learning_rate": 0.0005776193529583211, | |
| "loss": 3.858, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.908005127010021, | |
| "grad_norm": 0.37832486629486084, | |
| "learning_rate": 0.0005774444768289128, | |
| "loss": 3.8601, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 1.9225704963877885, | |
| "grad_norm": 0.3379972279071808, | |
| "learning_rate": 0.0005772696006995045, | |
| "loss": 3.856, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 1.9371358657655557, | |
| "grad_norm": 0.3384763300418854, | |
| "learning_rate": 0.0005770947245700962, | |
| "loss": 3.848, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 1.9517012351433234, | |
| "grad_norm": 0.3093826472759247, | |
| "learning_rate": 0.0005769198484406878, | |
| "loss": 3.8563, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 1.9662666045210906, | |
| "grad_norm": 0.32603582739830017, | |
| "learning_rate": 0.0005767449723112795, | |
| "loss": 3.8538, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 1.9808319738988582, | |
| "grad_norm": 0.36787310242652893, | |
| "learning_rate": 0.0005765700961818712, | |
| "loss": 3.8543, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 1.9953973432766254, | |
| "grad_norm": 0.32396772503852844, | |
| "learning_rate": 0.0005763952200524627, | |
| "loss": 3.8532, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 2.0099044511768818, | |
| "grad_norm": 0.3264414072036743, | |
| "learning_rate": 0.0005762203439230544, | |
| "loss": 3.7844, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 2.0244698205546494, | |
| "grad_norm": 0.3082588315010071, | |
| "learning_rate": 0.0005760454677936461, | |
| "loss": 3.722, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 2.0390351899324166, | |
| "grad_norm": 0.3626100718975067, | |
| "learning_rate": 0.0005758705916642378, | |
| "loss": 3.7443, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.0390351899324166, | |
| "eval_accuracy": 0.34116120036282, | |
| "eval_loss": 3.816786766052246, | |
| "eval_runtime": 182.2244, | |
| "eval_samples_per_second": 91.343, | |
| "eval_steps_per_second": 5.713, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.0536005593101843, | |
| "grad_norm": 0.33928582072257996, | |
| "learning_rate": 0.0005756957155348294, | |
| "loss": 3.7456, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 2.0681659286879515, | |
| "grad_norm": 0.33408093452453613, | |
| "learning_rate": 0.0005755208394054211, | |
| "loss": 3.7558, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 2.082731298065719, | |
| "grad_norm": 0.35014262795448303, | |
| "learning_rate": 0.0005753459632760128, | |
| "loss": 3.7546, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 2.0972966674434863, | |
| "grad_norm": 0.33521801233291626, | |
| "learning_rate": 0.0005751710871466045, | |
| "loss": 3.7488, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 2.111862036821254, | |
| "grad_norm": 0.3408453166484833, | |
| "learning_rate": 0.0005749962110171962, | |
| "loss": 3.7638, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 2.126427406199021, | |
| "grad_norm": 0.32431185245513916, | |
| "learning_rate": 0.0005748213348877877, | |
| "loss": 3.7519, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 2.140992775576789, | |
| "grad_norm": 0.32259050011634827, | |
| "learning_rate": 0.0005746464587583794, | |
| "loss": 3.7608, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 2.155558144954556, | |
| "grad_norm": 0.3296469748020172, | |
| "learning_rate": 0.0005744715826289711, | |
| "loss": 3.7617, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 2.1701235143323236, | |
| "grad_norm": 0.3369705379009247, | |
| "learning_rate": 0.0005742967064995627, | |
| "loss": 3.747, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 2.184688883710091, | |
| "grad_norm": 0.335363507270813, | |
| "learning_rate": 0.0005741218303701544, | |
| "loss": 3.7517, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 2.1992542530878585, | |
| "grad_norm": 0.3429674208164215, | |
| "learning_rate": 0.0005739469542407461, | |
| "loss": 3.7613, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 2.2138196224656257, | |
| "grad_norm": 0.3400017023086548, | |
| "learning_rate": 0.0005737720781113378, | |
| "loss": 3.7465, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 2.2283849918433933, | |
| "grad_norm": 0.33040422201156616, | |
| "learning_rate": 0.0005735972019819295, | |
| "loss": 3.765, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 2.2429503612211605, | |
| "grad_norm": 0.325589656829834, | |
| "learning_rate": 0.000573422325852521, | |
| "loss": 3.7555, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 2.257515730598928, | |
| "grad_norm": 0.31000620126724243, | |
| "learning_rate": 0.0005732474497231127, | |
| "loss": 3.7614, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 2.2720810999766954, | |
| "grad_norm": 0.3232748806476593, | |
| "learning_rate": 0.0005730725735937044, | |
| "loss": 3.7535, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 2.286646469354463, | |
| "grad_norm": 0.3055737018585205, | |
| "learning_rate": 0.0005728976974642961, | |
| "loss": 3.7598, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 2.3012118387322302, | |
| "grad_norm": 0.32002055644989014, | |
| "learning_rate": 0.0005727228213348877, | |
| "loss": 3.7501, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 2.3157772081099974, | |
| "grad_norm": 0.3241938054561615, | |
| "learning_rate": 0.0005725479452054794, | |
| "loss": 3.7547, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 2.330342577487765, | |
| "grad_norm": 0.3343994915485382, | |
| "learning_rate": 0.0005723730690760711, | |
| "loss": 3.7503, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.330342577487765, | |
| "eval_accuracy": 0.3443282921418196, | |
| "eval_loss": 3.7861814498901367, | |
| "eval_runtime": 182.4203, | |
| "eval_samples_per_second": 91.245, | |
| "eval_steps_per_second": 5.707, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.3449079468655327, | |
| "grad_norm": 0.32609114050865173, | |
| "learning_rate": 0.0005721981929466627, | |
| "loss": 3.7511, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 2.3594733162433, | |
| "grad_norm": 0.3273298144340515, | |
| "learning_rate": 0.0005720233168172545, | |
| "loss": 3.7491, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 2.374038685621067, | |
| "grad_norm": 0.31795287132263184, | |
| "learning_rate": 0.000571848440687846, | |
| "loss": 3.7475, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 2.3886040549988348, | |
| "grad_norm": 0.3376888334751129, | |
| "learning_rate": 0.0005716735645584377, | |
| "loss": 3.7563, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 2.4031694243766024, | |
| "grad_norm": 0.32242295145988464, | |
| "learning_rate": 0.0005714986884290294, | |
| "loss": 3.7462, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 2.4177347937543696, | |
| "grad_norm": 0.31965371966362, | |
| "learning_rate": 0.000571323812299621, | |
| "loss": 3.7578, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 2.432300163132137, | |
| "grad_norm": 0.3355007469654083, | |
| "learning_rate": 0.0005711489361702127, | |
| "loss": 3.7568, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 2.4468655325099045, | |
| "grad_norm": 0.32753318548202515, | |
| "learning_rate": 0.0005709740600408044, | |
| "loss": 3.7353, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 2.461430901887672, | |
| "grad_norm": 0.3319459855556488, | |
| "learning_rate": 0.0005707991839113961, | |
| "loss": 3.743, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 2.4759962712654393, | |
| "grad_norm": 0.3193652927875519, | |
| "learning_rate": 0.0005706243077819877, | |
| "loss": 3.7468, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 2.4905616406432065, | |
| "grad_norm": 0.32112497091293335, | |
| "learning_rate": 0.0005704494316525793, | |
| "loss": 3.7436, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 2.505127010020974, | |
| "grad_norm": 0.3209002614021301, | |
| "learning_rate": 0.000570274555523171, | |
| "loss": 3.7432, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 2.519692379398742, | |
| "grad_norm": 0.3239600956439972, | |
| "learning_rate": 0.0005700996793937627, | |
| "loss": 3.7495, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 2.534257748776509, | |
| "grad_norm": 0.34214961528778076, | |
| "learning_rate": 0.0005699248032643544, | |
| "loss": 3.7486, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 2.548823118154276, | |
| "grad_norm": 0.3178744316101074, | |
| "learning_rate": 0.000569749927134946, | |
| "loss": 3.7443, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 2.563388487532044, | |
| "grad_norm": 0.3296307921409607, | |
| "learning_rate": 0.0005695750510055377, | |
| "loss": 3.7494, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 2.5779538569098115, | |
| "grad_norm": 0.33302944898605347, | |
| "learning_rate": 0.0005694001748761294, | |
| "loss": 3.7445, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 2.5925192262875787, | |
| "grad_norm": 0.33363667130470276, | |
| "learning_rate": 0.000569225298746721, | |
| "loss": 3.747, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 2.607084595665346, | |
| "grad_norm": 0.33573073148727417, | |
| "learning_rate": 0.0005690504226173127, | |
| "loss": 3.731, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 2.6216499650431135, | |
| "grad_norm": 0.3124948740005493, | |
| "learning_rate": 0.0005688755464879043, | |
| "loss": 3.7376, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 2.6216499650431135, | |
| "eval_accuracy": 0.3468023107143004, | |
| "eval_loss": 3.756934881210327, | |
| "eval_runtime": 182.4097, | |
| "eval_samples_per_second": 91.251, | |
| "eval_steps_per_second": 5.707, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 2.636215334420881, | |
| "grad_norm": 0.3117513954639435, | |
| "learning_rate": 0.000568700670358496, | |
| "loss": 3.7412, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 2.6507807037986484, | |
| "grad_norm": 0.3153388798236847, | |
| "learning_rate": 0.0005685257942290877, | |
| "loss": 3.7424, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 2.6653460731764156, | |
| "grad_norm": 0.31582581996917725, | |
| "learning_rate": 0.0005683509180996793, | |
| "loss": 3.7352, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 2.6799114425541832, | |
| "grad_norm": 0.31198346614837646, | |
| "learning_rate": 0.000568176041970271, | |
| "loss": 3.7397, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 2.6944768119319504, | |
| "grad_norm": 0.33701658248901367, | |
| "learning_rate": 0.0005680011658408627, | |
| "loss": 3.7386, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 2.709042181309718, | |
| "grad_norm": 0.3240450918674469, | |
| "learning_rate": 0.0005678262897114544, | |
| "loss": 3.7343, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 2.7236075506874853, | |
| "grad_norm": 0.31347861886024475, | |
| "learning_rate": 0.000567651413582046, | |
| "loss": 3.7271, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 2.738172920065253, | |
| "grad_norm": 0.33607959747314453, | |
| "learning_rate": 0.0005674765374526377, | |
| "loss": 3.7378, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 2.75273828944302, | |
| "grad_norm": 0.33370694518089294, | |
| "learning_rate": 0.0005673016613232293, | |
| "loss": 3.7344, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 2.7673036588207878, | |
| "grad_norm": 0.29530900716781616, | |
| "learning_rate": 0.0005671267851938209, | |
| "loss": 3.7352, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 2.781869028198555, | |
| "grad_norm": 0.3362729251384735, | |
| "learning_rate": 0.0005669519090644127, | |
| "loss": 3.7499, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 2.7964343975763226, | |
| "grad_norm": 0.3185634911060333, | |
| "learning_rate": 0.0005667770329350043, | |
| "loss": 3.727, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 2.81099976695409, | |
| "grad_norm": 0.3251460790634155, | |
| "learning_rate": 0.000566602156805596, | |
| "loss": 3.7348, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 2.8255651363318575, | |
| "grad_norm": 0.32407787442207336, | |
| "learning_rate": 0.0005664272806761877, | |
| "loss": 3.7312, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 2.8401305057096247, | |
| "grad_norm": 0.31047409772872925, | |
| "learning_rate": 0.0005662524045467793, | |
| "loss": 3.7414, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 2.8546958750873923, | |
| "grad_norm": 0.3150789439678192, | |
| "learning_rate": 0.000566077528417371, | |
| "loss": 3.7292, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 2.8692612444651595, | |
| "grad_norm": 0.32285672426223755, | |
| "learning_rate": 0.0005659026522879626, | |
| "loss": 3.7278, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 2.883826613842927, | |
| "grad_norm": 0.3205214738845825, | |
| "learning_rate": 0.0005657277761585543, | |
| "loss": 3.7117, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 2.8983919832206944, | |
| "grad_norm": 0.3268585503101349, | |
| "learning_rate": 0.0005655529000291459, | |
| "loss": 3.7298, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 2.912957352598462, | |
| "grad_norm": 0.3318754732608795, | |
| "learning_rate": 0.0005653780238997376, | |
| "loss": 3.7244, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 2.912957352598462, | |
| "eval_accuracy": 0.3494827816278579, | |
| "eval_loss": 3.7275166511535645, | |
| "eval_runtime": 182.3792, | |
| "eval_samples_per_second": 91.266, | |
| "eval_steps_per_second": 5.708, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 2.927522721976229, | |
| "grad_norm": 0.32303711771965027, | |
| "learning_rate": 0.0005652031477703293, | |
| "loss": 3.7313, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 2.942088091353997, | |
| "grad_norm": 0.3396250009536743, | |
| "learning_rate": 0.000565028271640921, | |
| "loss": 3.728, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 2.956653460731764, | |
| "grad_norm": 0.31802433729171753, | |
| "learning_rate": 0.0005648533955115127, | |
| "loss": 3.7244, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 2.9712188301095317, | |
| "grad_norm": 0.3270646333694458, | |
| "learning_rate": 0.0005646785193821043, | |
| "loss": 3.7336, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 2.985784199487299, | |
| "grad_norm": 0.3221674859523773, | |
| "learning_rate": 0.000564503643252696, | |
| "loss": 3.7288, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 3.0002913073875552, | |
| "grad_norm": 0.35366907715797424, | |
| "learning_rate": 0.0005643287671232876, | |
| "loss": 3.7151, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 3.014856676765323, | |
| "grad_norm": 0.33569812774658203, | |
| "learning_rate": 0.0005641538909938792, | |
| "loss": 3.6103, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 3.02942204614309, | |
| "grad_norm": 0.3463501036167145, | |
| "learning_rate": 0.0005639790148644709, | |
| "loss": 3.6142, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 3.0439874155208577, | |
| "grad_norm": 0.3205231726169586, | |
| "learning_rate": 0.0005638041387350626, | |
| "loss": 3.6093, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 3.058552784898625, | |
| "grad_norm": 0.31053611636161804, | |
| "learning_rate": 0.0005636292626056543, | |
| "loss": 3.6248, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 3.0731181542763926, | |
| "grad_norm": 0.32655248045921326, | |
| "learning_rate": 0.000563454386476246, | |
| "loss": 3.6235, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 3.0876835236541598, | |
| "grad_norm": 0.3263218104839325, | |
| "learning_rate": 0.0005632795103468376, | |
| "loss": 3.6202, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 3.1022488930319274, | |
| "grad_norm": 0.32272443175315857, | |
| "learning_rate": 0.0005631046342174293, | |
| "loss": 3.6317, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 3.1168142624096946, | |
| "grad_norm": 0.3152412474155426, | |
| "learning_rate": 0.000562929758088021, | |
| "loss": 3.6225, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 3.1313796317874623, | |
| "grad_norm": 0.3140038251876831, | |
| "learning_rate": 0.0005627548819586126, | |
| "loss": 3.6281, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 3.1459450011652295, | |
| "grad_norm": 0.3572128713130951, | |
| "learning_rate": 0.0005625800058292042, | |
| "loss": 3.629, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 3.160510370542997, | |
| "grad_norm": 0.3352822959423065, | |
| "learning_rate": 0.0005624051296997959, | |
| "loss": 3.623, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 3.1750757399207643, | |
| "grad_norm": 0.3205251097679138, | |
| "learning_rate": 0.0005622302535703876, | |
| "loss": 3.6395, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 3.189641109298532, | |
| "grad_norm": 0.3109528720378876, | |
| "learning_rate": 0.0005620553774409792, | |
| "loss": 3.6292, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 3.204206478676299, | |
| "grad_norm": 0.3360290825366974, | |
| "learning_rate": 0.000561880501311571, | |
| "loss": 3.6353, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 3.204206478676299, | |
| "eval_accuracy": 0.35141351075380384, | |
| "eval_loss": 3.7175660133361816, | |
| "eval_runtime": 182.3999, | |
| "eval_samples_per_second": 91.256, | |
| "eval_steps_per_second": 5.707, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 3.218771848054067, | |
| "grad_norm": 0.32028627395629883, | |
| "learning_rate": 0.0005617056251821626, | |
| "loss": 3.6444, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 3.233337217431834, | |
| "grad_norm": 0.31713923811912537, | |
| "learning_rate": 0.0005615307490527543, | |
| "loss": 3.639, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 3.2479025868096016, | |
| "grad_norm": 0.3299584686756134, | |
| "learning_rate": 0.000561355872923346, | |
| "loss": 3.6466, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 3.262467956187369, | |
| "grad_norm": 0.3305450677871704, | |
| "learning_rate": 0.0005611809967939375, | |
| "loss": 3.6363, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 3.2770333255651365, | |
| "grad_norm": 0.3444271385669708, | |
| "learning_rate": 0.0005610061206645292, | |
| "loss": 3.6381, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 3.2915986949429037, | |
| "grad_norm": 0.31552445888519287, | |
| "learning_rate": 0.0005608312445351209, | |
| "loss": 3.6396, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 3.3061640643206713, | |
| "grad_norm": 0.309539258480072, | |
| "learning_rate": 0.0005606563684057126, | |
| "loss": 3.6437, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 3.3207294336984385, | |
| "grad_norm": 0.322343647480011, | |
| "learning_rate": 0.0005604814922763042, | |
| "loss": 3.6493, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 3.335294803076206, | |
| "grad_norm": 0.3462202847003937, | |
| "learning_rate": 0.0005603066161468959, | |
| "loss": 3.6308, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 3.3498601724539734, | |
| "grad_norm": 0.3419882357120514, | |
| "learning_rate": 0.0005601317400174876, | |
| "loss": 3.6382, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 3.364425541831741, | |
| "grad_norm": 0.346147358417511, | |
| "learning_rate": 0.0005599568638880793, | |
| "loss": 3.632, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 3.3789909112095082, | |
| "grad_norm": 0.32104918360710144, | |
| "learning_rate": 0.0005597819877586709, | |
| "loss": 3.6416, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 3.393556280587276, | |
| "grad_norm": 0.3208399713039398, | |
| "learning_rate": 0.0005596071116292625, | |
| "loss": 3.6589, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 3.408121649965043, | |
| "grad_norm": 0.3355486989021301, | |
| "learning_rate": 0.0005594322354998542, | |
| "loss": 3.6351, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 3.4226870193428107, | |
| "grad_norm": 0.329441100358963, | |
| "learning_rate": 0.0005592573593704459, | |
| "loss": 3.6544, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 3.437252388720578, | |
| "grad_norm": 0.331617534160614, | |
| "learning_rate": 0.0005590824832410375, | |
| "loss": 3.6444, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 3.4518177580983456, | |
| "grad_norm": 0.35520729422569275, | |
| "learning_rate": 0.0005589076071116292, | |
| "loss": 3.6517, | |
| "step": 11850 | |
| }, | |
| { | |
| "epoch": 3.4663831274761128, | |
| "grad_norm": 0.32801005244255066, | |
| "learning_rate": 0.0005587327309822209, | |
| "loss": 3.6411, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 3.4809484968538804, | |
| "grad_norm": 0.3370635509490967, | |
| "learning_rate": 0.0005585578548528126, | |
| "loss": 3.6359, | |
| "step": 11950 | |
| }, | |
| { | |
| "epoch": 3.4955138662316476, | |
| "grad_norm": 0.31257134675979614, | |
| "learning_rate": 0.0005583829787234043, | |
| "loss": 3.6428, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 3.4955138662316476, | |
| "eval_accuracy": 0.3529537910046269, | |
| "eval_loss": 3.6976659297943115, | |
| "eval_runtime": 182.5172, | |
| "eval_samples_per_second": 91.197, | |
| "eval_steps_per_second": 5.704, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 3.510079235609415, | |
| "grad_norm": 0.3163146674633026, | |
| "learning_rate": 0.0005582081025939958, | |
| "loss": 3.6404, | |
| "step": 12050 | |
| }, | |
| { | |
| "epoch": 3.5246446049871825, | |
| "grad_norm": 0.31714287400245667, | |
| "learning_rate": 0.0005580332264645875, | |
| "loss": 3.6287, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 3.53920997436495, | |
| "grad_norm": 0.31056082248687744, | |
| "learning_rate": 0.0005578583503351792, | |
| "loss": 3.6548, | |
| "step": 12150 | |
| }, | |
| { | |
| "epoch": 3.5537753437427173, | |
| "grad_norm": 0.31519201397895813, | |
| "learning_rate": 0.0005576834742057709, | |
| "loss": 3.6466, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 3.5683407131204845, | |
| "grad_norm": 0.3620156943798065, | |
| "learning_rate": 0.0005575085980763625, | |
| "loss": 3.6459, | |
| "step": 12250 | |
| }, | |
| { | |
| "epoch": 3.582906082498252, | |
| "grad_norm": 0.3160246014595032, | |
| "learning_rate": 0.0005573337219469542, | |
| "loss": 3.6571, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 3.59747145187602, | |
| "grad_norm": 0.3447693884372711, | |
| "learning_rate": 0.0005571588458175459, | |
| "loss": 3.639, | |
| "step": 12350 | |
| }, | |
| { | |
| "epoch": 3.612036821253787, | |
| "grad_norm": 0.31839776039123535, | |
| "learning_rate": 0.0005569839696881374, | |
| "loss": 3.6516, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 3.626602190631554, | |
| "grad_norm": 0.3184966742992401, | |
| "learning_rate": 0.0005568090935587292, | |
| "loss": 3.6529, | |
| "step": 12450 | |
| }, | |
| { | |
| "epoch": 3.641167560009322, | |
| "grad_norm": 0.3189146816730499, | |
| "learning_rate": 0.0005566342174293208, | |
| "loss": 3.6469, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 3.6557329293870895, | |
| "grad_norm": 0.34218892455101013, | |
| "learning_rate": 0.0005564593412999125, | |
| "loss": 3.6535, | |
| "step": 12550 | |
| }, | |
| { | |
| "epoch": 3.6702982987648567, | |
| "grad_norm": 0.3211210370063782, | |
| "learning_rate": 0.0005562844651705042, | |
| "loss": 3.6398, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 3.684863668142624, | |
| "grad_norm": 0.31546565890312195, | |
| "learning_rate": 0.0005561095890410958, | |
| "loss": 3.6409, | |
| "step": 12650 | |
| }, | |
| { | |
| "epoch": 3.6994290375203915, | |
| "grad_norm": 0.32176557183265686, | |
| "learning_rate": 0.0005559347129116875, | |
| "loss": 3.6457, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 3.713994406898159, | |
| "grad_norm": 0.323650598526001, | |
| "learning_rate": 0.0005557598367822792, | |
| "loss": 3.6463, | |
| "step": 12750 | |
| }, | |
| { | |
| "epoch": 3.7285597762759264, | |
| "grad_norm": 0.31617245078086853, | |
| "learning_rate": 0.0005555849606528709, | |
| "loss": 3.6484, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 3.7431251456536936, | |
| "grad_norm": 0.3181900084018707, | |
| "learning_rate": 0.0005554100845234624, | |
| "loss": 3.659, | |
| "step": 12850 | |
| }, | |
| { | |
| "epoch": 3.7576905150314612, | |
| "grad_norm": 0.3386443257331848, | |
| "learning_rate": 0.0005552352083940541, | |
| "loss": 3.6516, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 3.772255884409229, | |
| "grad_norm": 0.33526474237442017, | |
| "learning_rate": 0.0005550603322646458, | |
| "loss": 3.6433, | |
| "step": 12950 | |
| }, | |
| { | |
| "epoch": 3.786821253786996, | |
| "grad_norm": 0.3211400806903839, | |
| "learning_rate": 0.0005548854561352375, | |
| "loss": 3.646, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 3.786821253786996, | |
| "eval_accuracy": 0.3546315102000507, | |
| "eval_loss": 3.6797258853912354, | |
| "eval_runtime": 182.657, | |
| "eval_samples_per_second": 91.127, | |
| "eval_steps_per_second": 5.699, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 3.8013866231647633, | |
| "grad_norm": 0.31162765622138977, | |
| "learning_rate": 0.0005547105800058292, | |
| "loss": 3.6343, | |
| "step": 13050 | |
| }, | |
| { | |
| "epoch": 3.815951992542531, | |
| "grad_norm": 0.292121559381485, | |
| "learning_rate": 0.0005545357038764208, | |
| "loss": 3.6399, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 3.8305173619202986, | |
| "grad_norm": 0.354305237531662, | |
| "learning_rate": 0.0005543608277470125, | |
| "loss": 3.6457, | |
| "step": 13150 | |
| }, | |
| { | |
| "epoch": 3.8450827312980658, | |
| "grad_norm": 0.3242047131061554, | |
| "learning_rate": 0.0005541859516176042, | |
| "loss": 3.6389, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 3.859648100675833, | |
| "grad_norm": 0.3040805757045746, | |
| "learning_rate": 0.0005540110754881958, | |
| "loss": 3.6281, | |
| "step": 13250 | |
| }, | |
| { | |
| "epoch": 3.8742134700536006, | |
| "grad_norm": 0.33070269227027893, | |
| "learning_rate": 0.0005538361993587874, | |
| "loss": 3.6411, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 3.888778839431368, | |
| "grad_norm": 0.3205200135707855, | |
| "learning_rate": 0.0005536613232293791, | |
| "loss": 3.6402, | |
| "step": 13350 | |
| }, | |
| { | |
| "epoch": 3.9033442088091355, | |
| "grad_norm": 0.3389040231704712, | |
| "learning_rate": 0.0005534864470999708, | |
| "loss": 3.6405, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 3.9179095781869027, | |
| "grad_norm": 0.34000879526138306, | |
| "learning_rate": 0.0005533115709705625, | |
| "loss": 3.6363, | |
| "step": 13450 | |
| }, | |
| { | |
| "epoch": 3.9324749475646703, | |
| "grad_norm": 0.31868523359298706, | |
| "learning_rate": 0.0005531366948411541, | |
| "loss": 3.6466, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 3.9470403169424375, | |
| "grad_norm": 0.31396111845970154, | |
| "learning_rate": 0.0005529618187117458, | |
| "loss": 3.6298, | |
| "step": 13550 | |
| }, | |
| { | |
| "epoch": 3.961605686320205, | |
| "grad_norm": 0.31641459465026855, | |
| "learning_rate": 0.0005527869425823375, | |
| "loss": 3.6324, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 3.9761710556979724, | |
| "grad_norm": 0.3213881254196167, | |
| "learning_rate": 0.0005526120664529292, | |
| "loss": 3.6464, | |
| "step": 13650 | |
| }, | |
| { | |
| "epoch": 3.99073642507574, | |
| "grad_norm": 0.32177630066871643, | |
| "learning_rate": 0.0005524371903235207, | |
| "loss": 3.6374, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 4.005243532975996, | |
| "grad_norm": 0.32364439964294434, | |
| "learning_rate": 0.0005522623141941124, | |
| "loss": 3.6032, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 4.0198089023537635, | |
| "grad_norm": 0.3256928026676178, | |
| "learning_rate": 0.0005520874380647041, | |
| "loss": 3.5313, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 4.034374271731531, | |
| "grad_norm": 0.332454651594162, | |
| "learning_rate": 0.0005519125619352957, | |
| "loss": 3.5456, | |
| "step": 13850 | |
| }, | |
| { | |
| "epoch": 4.048939641109299, | |
| "grad_norm": 0.34020113945007324, | |
| "learning_rate": 0.0005517376858058875, | |
| "loss": 3.5473, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 4.063505010487066, | |
| "grad_norm": 0.3192497491836548, | |
| "learning_rate": 0.0005515628096764791, | |
| "loss": 3.5316, | |
| "step": 13950 | |
| }, | |
| { | |
| "epoch": 4.078070379864833, | |
| "grad_norm": 0.31511300802230835, | |
| "learning_rate": 0.0005513879335470708, | |
| "loss": 3.5479, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 4.078070379864833, | |
| "eval_accuracy": 0.3563426191818444, | |
| "eval_loss": 3.672767162322998, | |
| "eval_runtime": 182.5621, | |
| "eval_samples_per_second": 91.174, | |
| "eval_steps_per_second": 5.702, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 4.092635749242601, | |
| "grad_norm": 0.3340079188346863, | |
| "learning_rate": 0.0005512130574176625, | |
| "loss": 3.5413, | |
| "step": 14050 | |
| }, | |
| { | |
| "epoch": 4.1072011186203685, | |
| "grad_norm": 0.3118899464607239, | |
| "learning_rate": 0.000551038181288254, | |
| "loss": 3.5422, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 4.121766487998135, | |
| "grad_norm": 0.3274664580821991, | |
| "learning_rate": 0.0005508633051588457, | |
| "loss": 3.5457, | |
| "step": 14150 | |
| }, | |
| { | |
| "epoch": 4.136331857375903, | |
| "grad_norm": 0.3153150677680969, | |
| "learning_rate": 0.0005506884290294374, | |
| "loss": 3.56, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 4.150897226753671, | |
| "grad_norm": 0.3385670781135559, | |
| "learning_rate": 0.0005505135529000291, | |
| "loss": 3.5466, | |
| "step": 14250 | |
| }, | |
| { | |
| "epoch": 4.165462596131438, | |
| "grad_norm": 0.3047159016132355, | |
| "learning_rate": 0.0005503386767706207, | |
| "loss": 3.5501, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 4.180027965509205, | |
| "grad_norm": 0.3238605260848999, | |
| "learning_rate": 0.0005501638006412124, | |
| "loss": 3.5547, | |
| "step": 14350 | |
| }, | |
| { | |
| "epoch": 4.194593334886973, | |
| "grad_norm": 0.3106607496738434, | |
| "learning_rate": 0.0005499889245118041, | |
| "loss": 3.5582, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 4.20915870426474, | |
| "grad_norm": 0.31510302424430847, | |
| "learning_rate": 0.0005498140483823958, | |
| "loss": 3.554, | |
| "step": 14450 | |
| }, | |
| { | |
| "epoch": 4.223724073642508, | |
| "grad_norm": 0.3922960162162781, | |
| "learning_rate": 0.0005496391722529875, | |
| "loss": 3.5483, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 4.238289443020275, | |
| "grad_norm": 0.333943635225296, | |
| "learning_rate": 0.000549464296123579, | |
| "loss": 3.56, | |
| "step": 14550 | |
| }, | |
| { | |
| "epoch": 4.252854812398042, | |
| "grad_norm": 0.3157419264316559, | |
| "learning_rate": 0.0005492894199941707, | |
| "loss": 3.5645, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 4.26742018177581, | |
| "grad_norm": 0.32612183690071106, | |
| "learning_rate": 0.0005491145438647624, | |
| "loss": 3.5556, | |
| "step": 14650 | |
| }, | |
| { | |
| "epoch": 4.281985551153578, | |
| "grad_norm": 0.33562448620796204, | |
| "learning_rate": 0.000548939667735354, | |
| "loss": 3.5716, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 4.296550920531344, | |
| "grad_norm": 0.32943233847618103, | |
| "learning_rate": 0.0005487647916059457, | |
| "loss": 3.5683, | |
| "step": 14750 | |
| }, | |
| { | |
| "epoch": 4.311116289909112, | |
| "grad_norm": 0.32873958349227905, | |
| "learning_rate": 0.0005485899154765374, | |
| "loss": 3.5584, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 4.32568165928688, | |
| "grad_norm": 0.3185548782348633, | |
| "learning_rate": 0.0005484150393471291, | |
| "loss": 3.5823, | |
| "step": 14850 | |
| }, | |
| { | |
| "epoch": 4.340247028664647, | |
| "grad_norm": 0.32610881328582764, | |
| "learning_rate": 0.0005482401632177208, | |
| "loss": 3.5761, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 4.354812398042414, | |
| "grad_norm": 0.31527790427207947, | |
| "learning_rate": 0.0005480652870883124, | |
| "loss": 3.5687, | |
| "step": 14950 | |
| }, | |
| { | |
| "epoch": 4.369377767420182, | |
| "grad_norm": 0.3269599378108978, | |
| "learning_rate": 0.000547890410958904, | |
| "loss": 3.5782, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 4.369377767420182, | |
| "eval_accuracy": 0.3571946465826318, | |
| "eval_loss": 3.6619949340820312, | |
| "eval_runtime": 182.5907, | |
| "eval_samples_per_second": 91.16, | |
| "eval_steps_per_second": 5.701, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 4.383943136797949, | |
| "grad_norm": 0.31807681918144226, | |
| "learning_rate": 0.0005477155348294957, | |
| "loss": 3.5722, | |
| "step": 15050 | |
| }, | |
| { | |
| "epoch": 4.398508506175717, | |
| "grad_norm": 0.3249761462211609, | |
| "learning_rate": 0.0005475406587000874, | |
| "loss": 3.58, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 4.413073875553484, | |
| "grad_norm": 0.33852142095565796, | |
| "learning_rate": 0.000547365782570679, | |
| "loss": 3.5796, | |
| "step": 15150 | |
| }, | |
| { | |
| "epoch": 4.427639244931251, | |
| "grad_norm": 0.32763510942459106, | |
| "learning_rate": 0.0005471909064412707, | |
| "loss": 3.5777, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 4.442204614309019, | |
| "grad_norm": 0.31176745891571045, | |
| "learning_rate": 0.0005470160303118624, | |
| "loss": 3.5721, | |
| "step": 15250 | |
| }, | |
| { | |
| "epoch": 4.456769983686787, | |
| "grad_norm": 0.35347744822502136, | |
| "learning_rate": 0.000546841154182454, | |
| "loss": 3.5759, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 4.471335353064553, | |
| "grad_norm": 0.33067938685417175, | |
| "learning_rate": 0.0005466662780530458, | |
| "loss": 3.5752, | |
| "step": 15350 | |
| }, | |
| { | |
| "epoch": 4.485900722442321, | |
| "grad_norm": 0.3238064646720886, | |
| "learning_rate": 0.0005464914019236374, | |
| "loss": 3.5855, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 4.500466091820089, | |
| "grad_norm": 0.33181995153427124, | |
| "learning_rate": 0.000546316525794229, | |
| "loss": 3.5676, | |
| "step": 15450 | |
| }, | |
| { | |
| "epoch": 4.515031461197856, | |
| "grad_norm": 0.32865026593208313, | |
| "learning_rate": 0.0005461416496648207, | |
| "loss": 3.5663, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 4.529596830575623, | |
| "grad_norm": 0.30539754033088684, | |
| "learning_rate": 0.0005459667735354123, | |
| "loss": 3.572, | |
| "step": 15550 | |
| }, | |
| { | |
| "epoch": 4.544162199953391, | |
| "grad_norm": 0.33434492349624634, | |
| "learning_rate": 0.000545791897406004, | |
| "loss": 3.5709, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 4.558727569331158, | |
| "grad_norm": 0.32164353132247925, | |
| "learning_rate": 0.0005456170212765957, | |
| "loss": 3.5783, | |
| "step": 15650 | |
| }, | |
| { | |
| "epoch": 4.573292938708926, | |
| "grad_norm": 0.3319690525531769, | |
| "learning_rate": 0.0005454421451471874, | |
| "loss": 3.58, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 4.587858308086693, | |
| "grad_norm": 0.3449385464191437, | |
| "learning_rate": 0.000545267269017779, | |
| "loss": 3.58, | |
| "step": 15750 | |
| }, | |
| { | |
| "epoch": 4.6024236774644605, | |
| "grad_norm": 0.32032108306884766, | |
| "learning_rate": 0.0005450923928883708, | |
| "loss": 3.5837, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 4.616989046842228, | |
| "grad_norm": 0.3191685974597931, | |
| "learning_rate": 0.0005449175167589623, | |
| "loss": 3.5717, | |
| "step": 15850 | |
| }, | |
| { | |
| "epoch": 4.631554416219995, | |
| "grad_norm": 0.32119137048721313, | |
| "learning_rate": 0.000544742640629554, | |
| "loss": 3.5932, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 4.6461197855977625, | |
| "grad_norm": 0.33280646800994873, | |
| "learning_rate": 0.0005445677645001457, | |
| "loss": 3.5691, | |
| "step": 15950 | |
| }, | |
| { | |
| "epoch": 4.66068515497553, | |
| "grad_norm": 0.32261455059051514, | |
| "learning_rate": 0.0005443928883707373, | |
| "loss": 3.5819, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 4.66068515497553, | |
| "eval_accuracy": 0.3582682928119667, | |
| "eval_loss": 3.6463451385498047, | |
| "eval_runtime": 182.103, | |
| "eval_samples_per_second": 91.404, | |
| "eval_steps_per_second": 5.717, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 4.675250524353298, | |
| "grad_norm": 0.3126870095729828, | |
| "learning_rate": 0.000544218012241329, | |
| "loss": 3.5687, | |
| "step": 16050 | |
| }, | |
| { | |
| "epoch": 4.689815893731065, | |
| "grad_norm": 0.3362468183040619, | |
| "learning_rate": 0.0005440431361119207, | |
| "loss": 3.5788, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 4.704381263108832, | |
| "grad_norm": 0.30104732513427734, | |
| "learning_rate": 0.0005438682599825123, | |
| "loss": 3.5875, | |
| "step": 16150 | |
| }, | |
| { | |
| "epoch": 4.7189466324866, | |
| "grad_norm": 0.3225014805793762, | |
| "learning_rate": 0.000543693383853104, | |
| "loss": 3.5542, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 4.7335120018643675, | |
| "grad_norm": 0.3089386522769928, | |
| "learning_rate": 0.0005435185077236957, | |
| "loss": 3.5733, | |
| "step": 16250 | |
| }, | |
| { | |
| "epoch": 4.748077371242134, | |
| "grad_norm": 0.32772549986839294, | |
| "learning_rate": 0.0005433436315942873, | |
| "loss": 3.5776, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 4.762642740619902, | |
| "grad_norm": 0.3394605815410614, | |
| "learning_rate": 0.000543168755464879, | |
| "loss": 3.5714, | |
| "step": 16350 | |
| }, | |
| { | |
| "epoch": 4.7772081099976695, | |
| "grad_norm": 0.31668463349342346, | |
| "learning_rate": 0.0005429938793354706, | |
| "loss": 3.5802, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 4.791773479375437, | |
| "grad_norm": 0.30830904841423035, | |
| "learning_rate": 0.0005428190032060623, | |
| "loss": 3.5706, | |
| "step": 16450 | |
| }, | |
| { | |
| "epoch": 4.806338848753205, | |
| "grad_norm": 0.3307313621044159, | |
| "learning_rate": 0.000542644127076654, | |
| "loss": 3.5669, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 4.820904218130972, | |
| "grad_norm": 0.3045443892478943, | |
| "learning_rate": 0.0005424692509472457, | |
| "loss": 3.5737, | |
| "step": 16550 | |
| }, | |
| { | |
| "epoch": 4.835469587508739, | |
| "grad_norm": 0.3446267247200012, | |
| "learning_rate": 0.0005422943748178373, | |
| "loss": 3.5815, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 4.850034956886507, | |
| "grad_norm": 0.33735302090644836, | |
| "learning_rate": 0.000542119498688429, | |
| "loss": 3.5774, | |
| "step": 16650 | |
| }, | |
| { | |
| "epoch": 4.864600326264274, | |
| "grad_norm": 0.3310108184814453, | |
| "learning_rate": 0.0005419446225590207, | |
| "loss": 3.5777, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 4.879165695642041, | |
| "grad_norm": 0.3099808692932129, | |
| "learning_rate": 0.0005417697464296122, | |
| "loss": 3.5811, | |
| "step": 16750 | |
| }, | |
| { | |
| "epoch": 4.893731065019809, | |
| "grad_norm": 0.3206506371498108, | |
| "learning_rate": 0.000541594870300204, | |
| "loss": 3.5822, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 4.908296434397577, | |
| "grad_norm": 0.31750303506851196, | |
| "learning_rate": 0.0005414199941707956, | |
| "loss": 3.5802, | |
| "step": 16850 | |
| }, | |
| { | |
| "epoch": 4.922861803775344, | |
| "grad_norm": 0.33419302105903625, | |
| "learning_rate": 0.0005412451180413873, | |
| "loss": 3.572, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 4.937427173153111, | |
| "grad_norm": 0.35304707288742065, | |
| "learning_rate": 0.000541070241911979, | |
| "loss": 3.5736, | |
| "step": 16950 | |
| }, | |
| { | |
| "epoch": 4.951992542530879, | |
| "grad_norm": 0.3392590284347534, | |
| "learning_rate": 0.0005408953657825706, | |
| "loss": 3.5792, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 4.951992542530879, | |
| "eval_accuracy": 0.35947126567865034, | |
| "eval_loss": 3.634124279022217, | |
| "eval_runtime": 181.9213, | |
| "eval_samples_per_second": 91.496, | |
| "eval_steps_per_second": 5.722, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 4.966557911908646, | |
| "grad_norm": 0.3133297264575958, | |
| "learning_rate": 0.0005407204896531623, | |
| "loss": 3.574, | |
| "step": 17050 | |
| }, | |
| { | |
| "epoch": 4.981123281286413, | |
| "grad_norm": 0.31224194169044495, | |
| "learning_rate": 0.000540545613523754, | |
| "loss": 3.5708, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 4.995688650664181, | |
| "grad_norm": 0.3288334310054779, | |
| "learning_rate": 0.0005403707373943456, | |
| "loss": 3.5705, | |
| "step": 17150 | |
| }, | |
| { | |
| "epoch": 5.010195758564437, | |
| "grad_norm": 0.343101441860199, | |
| "learning_rate": 0.0005401958612649372, | |
| "loss": 3.5059, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 5.024761127942204, | |
| "grad_norm": 0.31737393140792847, | |
| "learning_rate": 0.000540020985135529, | |
| "loss": 3.4676, | |
| "step": 17250 | |
| }, | |
| { | |
| "epoch": 5.039326497319972, | |
| "grad_norm": 0.3343667685985565, | |
| "learning_rate": 0.0005398461090061206, | |
| "loss": 3.4729, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 5.0538918666977395, | |
| "grad_norm": 0.3518417477607727, | |
| "learning_rate": 0.0005396712328767123, | |
| "loss": 3.4788, | |
| "step": 17350 | |
| }, | |
| { | |
| "epoch": 5.068457236075507, | |
| "grad_norm": 0.32810088992118835, | |
| "learning_rate": 0.000539496356747304, | |
| "loss": 3.4653, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 5.083022605453274, | |
| "grad_norm": 0.3590436279773712, | |
| "learning_rate": 0.0005393214806178956, | |
| "loss": 3.4829, | |
| "step": 17450 | |
| }, | |
| { | |
| "epoch": 5.0975879748310415, | |
| "grad_norm": 0.3379361629486084, | |
| "learning_rate": 0.0005391466044884873, | |
| "loss": 3.4851, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 5.112153344208809, | |
| "grad_norm": 0.3168104588985443, | |
| "learning_rate": 0.000538971728359079, | |
| "loss": 3.4889, | |
| "step": 17550 | |
| }, | |
| { | |
| "epoch": 5.126718713586577, | |
| "grad_norm": 0.33108997344970703, | |
| "learning_rate": 0.0005387968522296705, | |
| "loss": 3.492, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 5.141284082964344, | |
| "grad_norm": 0.3419332802295685, | |
| "learning_rate": 0.0005386219761002622, | |
| "loss": 3.4919, | |
| "step": 17650 | |
| }, | |
| { | |
| "epoch": 5.155849452342111, | |
| "grad_norm": 0.34570637345314026, | |
| "learning_rate": 0.0005384470999708539, | |
| "loss": 3.4986, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 5.170414821719879, | |
| "grad_norm": 0.3261895477771759, | |
| "learning_rate": 0.0005382722238414456, | |
| "loss": 3.5012, | |
| "step": 17750 | |
| }, | |
| { | |
| "epoch": 5.1849801910976465, | |
| "grad_norm": 0.34492751955986023, | |
| "learning_rate": 0.0005380973477120373, | |
| "loss": 3.4965, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 5.199545560475413, | |
| "grad_norm": 0.3237833082675934, | |
| "learning_rate": 0.000537922471582629, | |
| "loss": 3.4965, | |
| "step": 17850 | |
| }, | |
| { | |
| "epoch": 5.214110929853181, | |
| "grad_norm": 0.31047049164772034, | |
| "learning_rate": 0.0005377475954532206, | |
| "loss": 3.5017, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 5.228676299230949, | |
| "grad_norm": 0.32261335849761963, | |
| "learning_rate": 0.0005375727193238123, | |
| "loss": 3.5174, | |
| "step": 17950 | |
| }, | |
| { | |
| "epoch": 5.243241668608716, | |
| "grad_norm": 0.3363330662250519, | |
| "learning_rate": 0.000537397843194404, | |
| "loss": 3.512, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 5.243241668608716, | |
| "eval_accuracy": 0.3601516413607749, | |
| "eval_loss": 3.6384644508361816, | |
| "eval_runtime": 181.9458, | |
| "eval_samples_per_second": 91.483, | |
| "eval_steps_per_second": 5.721, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 5.257807037986483, | |
| "grad_norm": 0.3440595269203186, | |
| "learning_rate": 0.0005372229670649955, | |
| "loss": 3.505, | |
| "step": 18050 | |
| }, | |
| { | |
| "epoch": 5.272372407364251, | |
| "grad_norm": 0.3164835572242737, | |
| "learning_rate": 0.0005370480909355872, | |
| "loss": 3.5013, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 5.286937776742018, | |
| "grad_norm": 0.3304733335971832, | |
| "learning_rate": 0.0005368732148061789, | |
| "loss": 3.5155, | |
| "step": 18150 | |
| }, | |
| { | |
| "epoch": 5.301503146119786, | |
| "grad_norm": 0.3306984305381775, | |
| "learning_rate": 0.0005366983386767705, | |
| "loss": 3.5053, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 5.316068515497553, | |
| "grad_norm": 0.32221606373786926, | |
| "learning_rate": 0.0005365234625473623, | |
| "loss": 3.5078, | |
| "step": 18250 | |
| }, | |
| { | |
| "epoch": 5.33063388487532, | |
| "grad_norm": 0.30464252829551697, | |
| "learning_rate": 0.0005363485864179539, | |
| "loss": 3.5094, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 5.345199254253088, | |
| "grad_norm": 0.32405513525009155, | |
| "learning_rate": 0.0005361737102885456, | |
| "loss": 3.507, | |
| "step": 18350 | |
| }, | |
| { | |
| "epoch": 5.359764623630856, | |
| "grad_norm": 0.33651819825172424, | |
| "learning_rate": 0.0005359988341591373, | |
| "loss": 3.5145, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 5.374329993008622, | |
| "grad_norm": 0.357702374458313, | |
| "learning_rate": 0.000535823958029729, | |
| "loss": 3.5132, | |
| "step": 18450 | |
| }, | |
| { | |
| "epoch": 5.38889536238639, | |
| "grad_norm": 0.3228895962238312, | |
| "learning_rate": 0.0005356490819003205, | |
| "loss": 3.4973, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 5.403460731764158, | |
| "grad_norm": 0.3350990414619446, | |
| "learning_rate": 0.0005354742057709122, | |
| "loss": 3.5208, | |
| "step": 18550 | |
| }, | |
| { | |
| "epoch": 5.418026101141925, | |
| "grad_norm": 0.34133604168891907, | |
| "learning_rate": 0.0005352993296415039, | |
| "loss": 3.5177, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 5.432591470519692, | |
| "grad_norm": 0.32490041851997375, | |
| "learning_rate": 0.0005351244535120955, | |
| "loss": 3.5159, | |
| "step": 18650 | |
| }, | |
| { | |
| "epoch": 5.44715683989746, | |
| "grad_norm": 0.32596027851104736, | |
| "learning_rate": 0.0005349495773826873, | |
| "loss": 3.5274, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 5.461722209275227, | |
| "grad_norm": 0.3423188626766205, | |
| "learning_rate": 0.0005347747012532789, | |
| "loss": 3.53, | |
| "step": 18750 | |
| }, | |
| { | |
| "epoch": 5.476287578652995, | |
| "grad_norm": 0.31081074476242065, | |
| "learning_rate": 0.0005345998251238706, | |
| "loss": 3.5269, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 5.490852948030762, | |
| "grad_norm": 0.34136995673179626, | |
| "learning_rate": 0.0005344249489944623, | |
| "loss": 3.5268, | |
| "step": 18850 | |
| }, | |
| { | |
| "epoch": 5.505418317408529, | |
| "grad_norm": 0.34362757205963135, | |
| "learning_rate": 0.0005342500728650538, | |
| "loss": 3.5227, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 5.519983686786297, | |
| "grad_norm": 0.30831918120384216, | |
| "learning_rate": 0.0005340751967356455, | |
| "loss": 3.5222, | |
| "step": 18950 | |
| }, | |
| { | |
| "epoch": 5.534549056164065, | |
| "grad_norm": 0.3135395646095276, | |
| "learning_rate": 0.0005339003206062372, | |
| "loss": 3.5208, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 5.534549056164065, | |
| "eval_accuracy": 0.3609769804464003, | |
| "eval_loss": 3.626232385635376, | |
| "eval_runtime": 181.8603, | |
| "eval_samples_per_second": 91.526, | |
| "eval_steps_per_second": 5.724, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 5.549114425541831, | |
| "grad_norm": 0.3581465482711792, | |
| "learning_rate": 0.0005337254444768288, | |
| "loss": 3.531, | |
| "step": 19050 | |
| }, | |
| { | |
| "epoch": 5.563679794919599, | |
| "grad_norm": 0.32384639978408813, | |
| "learning_rate": 0.0005335505683474205, | |
| "loss": 3.5197, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 5.578245164297367, | |
| "grad_norm": 0.3450806736946106, | |
| "learning_rate": 0.0005333756922180122, | |
| "loss": 3.5301, | |
| "step": 19150 | |
| }, | |
| { | |
| "epoch": 5.592810533675134, | |
| "grad_norm": 0.32282331585884094, | |
| "learning_rate": 0.0005332008160886039, | |
| "loss": 3.5306, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 5.607375903052901, | |
| "grad_norm": 0.3486621677875519, | |
| "learning_rate": 0.0005330259399591956, | |
| "loss": 3.5208, | |
| "step": 19250 | |
| }, | |
| { | |
| "epoch": 5.621941272430669, | |
| "grad_norm": 0.3094702363014221, | |
| "learning_rate": 0.0005328510638297873, | |
| "loss": 3.5239, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 5.636506641808436, | |
| "grad_norm": 0.3274450898170471, | |
| "learning_rate": 0.0005326761877003788, | |
| "loss": 3.5336, | |
| "step": 19350 | |
| }, | |
| { | |
| "epoch": 5.651072011186204, | |
| "grad_norm": 0.3350226879119873, | |
| "learning_rate": 0.0005325013115709705, | |
| "loss": 3.5326, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 5.665637380563971, | |
| "grad_norm": 0.3588801622390747, | |
| "learning_rate": 0.0005323264354415622, | |
| "loss": 3.5271, | |
| "step": 19450 | |
| }, | |
| { | |
| "epoch": 5.6802027499417385, | |
| "grad_norm": 0.3390669524669647, | |
| "learning_rate": 0.0005321515593121538, | |
| "loss": 3.5297, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 5.694768119319506, | |
| "grad_norm": 0.322145938873291, | |
| "learning_rate": 0.0005319766831827455, | |
| "loss": 3.5217, | |
| "step": 19550 | |
| }, | |
| { | |
| "epoch": 5.709333488697274, | |
| "grad_norm": 0.35364869236946106, | |
| "learning_rate": 0.0005318018070533372, | |
| "loss": 3.5149, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 5.7238988580750405, | |
| "grad_norm": 0.32203230261802673, | |
| "learning_rate": 0.0005316269309239288, | |
| "loss": 3.5356, | |
| "step": 19650 | |
| }, | |
| { | |
| "epoch": 5.738464227452808, | |
| "grad_norm": 0.352469265460968, | |
| "learning_rate": 0.0005314520547945206, | |
| "loss": 3.532, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 5.753029596830576, | |
| "grad_norm": null, | |
| "learning_rate": 0.0005312771786651121, | |
| "loss": 3.5222, | |
| "step": 19750 | |
| }, | |
| { | |
| "epoch": 5.7675949662083426, | |
| "grad_norm": 0.3287372589111328, | |
| "learning_rate": 0.0005311023025357038, | |
| "loss": 3.5235, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 5.78216033558611, | |
| "grad_norm": 0.3262624442577362, | |
| "learning_rate": 0.0005309274264062955, | |
| "loss": 3.5397, | |
| "step": 19850 | |
| }, | |
| { | |
| "epoch": 5.796725704963878, | |
| "grad_norm": 0.3266109228134155, | |
| "learning_rate": 0.0005307525502768872, | |
| "loss": 3.5228, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 5.8112910743416455, | |
| "grad_norm": 0.34884291887283325, | |
| "learning_rate": 0.0005305776741474788, | |
| "loss": 3.5194, | |
| "step": 19950 | |
| }, | |
| { | |
| "epoch": 5.825856443719413, | |
| "grad_norm": 0.3074500858783722, | |
| "learning_rate": 0.0005304027980180705, | |
| "loss": 3.5273, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 5.825856443719413, | |
| "eval_accuracy": 0.3620989478102355, | |
| "eval_loss": 3.6148364543914795, | |
| "eval_runtime": 181.8652, | |
| "eval_samples_per_second": 91.524, | |
| "eval_steps_per_second": 5.724, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 5.84042181309718, | |
| "grad_norm": 0.3382808566093445, | |
| "learning_rate": 0.0005302279218886622, | |
| "loss": 3.5279, | |
| "step": 20050 | |
| }, | |
| { | |
| "epoch": 5.8549871824749475, | |
| "grad_norm": 0.3046127259731293, | |
| "learning_rate": 0.0005300530457592538, | |
| "loss": 3.5388, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 5.869552551852715, | |
| "grad_norm": 0.3430224657058716, | |
| "learning_rate": 0.0005298781696298456, | |
| "loss": 3.5282, | |
| "step": 20150 | |
| }, | |
| { | |
| "epoch": 5.884117921230482, | |
| "grad_norm": 0.36001190543174744, | |
| "learning_rate": 0.0005297032935004371, | |
| "loss": 3.5291, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 5.89868329060825, | |
| "grad_norm": 0.3140873312950134, | |
| "learning_rate": 0.0005295284173710288, | |
| "loss": 3.5389, | |
| "step": 20250 | |
| }, | |
| { | |
| "epoch": 5.913248659986017, | |
| "grad_norm": 0.34070631861686707, | |
| "learning_rate": 0.0005293535412416205, | |
| "loss": 3.5397, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 5.927814029363785, | |
| "grad_norm": 0.3694857954978943, | |
| "learning_rate": 0.0005291786651122121, | |
| "loss": 3.5365, | |
| "step": 20350 | |
| }, | |
| { | |
| "epoch": 5.9423793987415525, | |
| "grad_norm": 0.32443273067474365, | |
| "learning_rate": 0.0005290037889828038, | |
| "loss": 3.5392, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 5.956944768119319, | |
| "grad_norm": 0.3110935389995575, | |
| "learning_rate": 0.0005288289128533955, | |
| "loss": 3.5326, | |
| "step": 20450 | |
| }, | |
| { | |
| "epoch": 5.971510137497087, | |
| "grad_norm": 0.3232935965061188, | |
| "learning_rate": 0.0005286540367239872, | |
| "loss": 3.5381, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 5.986075506874855, | |
| "grad_norm": 0.3413400948047638, | |
| "learning_rate": 0.0005284791605945788, | |
| "loss": 3.5416, | |
| "step": 20550 | |
| }, | |
| { | |
| "epoch": 6.0005826147751105, | |
| "grad_norm": 0.3248980641365051, | |
| "learning_rate": 0.0005283042844651704, | |
| "loss": 3.5249, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 6.015147984152878, | |
| "grad_norm": 0.3668578565120697, | |
| "learning_rate": 0.0005281294083357621, | |
| "loss": 3.4226, | |
| "step": 20650 | |
| }, | |
| { | |
| "epoch": 6.029713353530646, | |
| "grad_norm": 0.32745733857154846, | |
| "learning_rate": 0.0005279545322063538, | |
| "loss": 3.4291, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 6.044278722908413, | |
| "grad_norm": 0.3670319616794586, | |
| "learning_rate": 0.0005277796560769455, | |
| "loss": 3.4327, | |
| "step": 20750 | |
| }, | |
| { | |
| "epoch": 6.05884409228618, | |
| "grad_norm": 0.3462134897708893, | |
| "learning_rate": 0.0005276047799475371, | |
| "loss": 3.4398, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 6.073409461663948, | |
| "grad_norm": 0.3434312641620636, | |
| "learning_rate": 0.0005274299038181288, | |
| "loss": 3.4347, | |
| "step": 20850 | |
| }, | |
| { | |
| "epoch": 6.087974831041715, | |
| "grad_norm": 0.3447525203227997, | |
| "learning_rate": 0.0005272550276887205, | |
| "loss": 3.448, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 6.102540200419483, | |
| "grad_norm": 0.3339570462703705, | |
| "learning_rate": 0.0005270801515593121, | |
| "loss": 3.4478, | |
| "step": 20950 | |
| }, | |
| { | |
| "epoch": 6.11710556979725, | |
| "grad_norm": 0.32857051491737366, | |
| "learning_rate": 0.0005269052754299037, | |
| "loss": 3.4475, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 6.11710556979725, | |
| "eval_accuracy": 0.3623171571183439, | |
| "eval_loss": 3.619291067123413, | |
| "eval_runtime": 182.0455, | |
| "eval_samples_per_second": 91.433, | |
| "eval_steps_per_second": 5.718, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 6.1316709391750175, | |
| "grad_norm": 0.33486610651016235, | |
| "learning_rate": 0.0005267303993004954, | |
| "loss": 3.435, | |
| "step": 21050 | |
| }, | |
| { | |
| "epoch": 6.146236308552785, | |
| "grad_norm": 0.3245387077331543, | |
| "learning_rate": 0.000526555523171087, | |
| "loss": 3.4504, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 6.160801677930552, | |
| "grad_norm": 0.325870543718338, | |
| "learning_rate": 0.0005263806470416788, | |
| "loss": 3.4532, | |
| "step": 21150 | |
| }, | |
| { | |
| "epoch": 6.1753670473083195, | |
| "grad_norm": 0.35105100274086, | |
| "learning_rate": 0.0005262057709122704, | |
| "loss": 3.4521, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 6.189932416686087, | |
| "grad_norm": 0.3488394320011139, | |
| "learning_rate": 0.0005260308947828621, | |
| "loss": 3.4584, | |
| "step": 21250 | |
| }, | |
| { | |
| "epoch": 6.204497786063855, | |
| "grad_norm": 0.3601958453655243, | |
| "learning_rate": 0.0005258560186534538, | |
| "loss": 3.4632, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 6.219063155441622, | |
| "grad_norm": 0.320527583360672, | |
| "learning_rate": 0.0005256811425240455, | |
| "loss": 3.4616, | |
| "step": 21350 | |
| }, | |
| { | |
| "epoch": 6.233628524819389, | |
| "grad_norm": 0.3193604648113251, | |
| "learning_rate": 0.0005255062663946371, | |
| "loss": 3.4545, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 6.248193894197157, | |
| "grad_norm": 0.32400959730148315, | |
| "learning_rate": 0.0005253313902652287, | |
| "loss": 3.46, | |
| "step": 21450 | |
| }, | |
| { | |
| "epoch": 6.2627592635749245, | |
| "grad_norm": 0.36129894852638245, | |
| "learning_rate": 0.0005251565141358204, | |
| "loss": 3.4582, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 6.277324632952691, | |
| "grad_norm": 0.34856846928596497, | |
| "learning_rate": 0.000524981638006412, | |
| "loss": 3.4597, | |
| "step": 21550 | |
| }, | |
| { | |
| "epoch": 6.291890002330459, | |
| "grad_norm": 0.35143759846687317, | |
| "learning_rate": 0.0005248067618770038, | |
| "loss": 3.4663, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 6.306455371708227, | |
| "grad_norm": 0.3181470036506653, | |
| "learning_rate": 0.0005246318857475954, | |
| "loss": 3.4571, | |
| "step": 21650 | |
| }, | |
| { | |
| "epoch": 6.321020741085994, | |
| "grad_norm": 0.3355952799320221, | |
| "learning_rate": 0.0005244570096181871, | |
| "loss": 3.4644, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 6.335586110463761, | |
| "grad_norm": 0.3471963405609131, | |
| "learning_rate": 0.0005242821334887788, | |
| "loss": 3.4665, | |
| "step": 21750 | |
| }, | |
| { | |
| "epoch": 6.350151479841529, | |
| "grad_norm": 0.322955846786499, | |
| "learning_rate": 0.0005241072573593704, | |
| "loss": 3.4713, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 6.364716849219296, | |
| "grad_norm": 0.35527798533439636, | |
| "learning_rate": 0.000523932381229962, | |
| "loss": 3.4744, | |
| "step": 21850 | |
| }, | |
| { | |
| "epoch": 6.379282218597064, | |
| "grad_norm": 0.3321806490421295, | |
| "learning_rate": 0.0005237575051005537, | |
| "loss": 3.4808, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 6.393847587974831, | |
| "grad_norm": 0.33331242203712463, | |
| "learning_rate": 0.0005235826289711454, | |
| "loss": 3.4621, | |
| "step": 21950 | |
| }, | |
| { | |
| "epoch": 6.408412957352598, | |
| "grad_norm": 0.3406297266483307, | |
| "learning_rate": 0.000523407752841737, | |
| "loss": 3.4806, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 6.408412957352598, | |
| "eval_accuracy": 0.3629372195595958, | |
| "eval_loss": 3.6126840114593506, | |
| "eval_runtime": 179.7323, | |
| "eval_samples_per_second": 92.61, | |
| "eval_steps_per_second": 5.792, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 6.422978326730366, | |
| "grad_norm": 0.3514094352722168, | |
| "learning_rate": 0.0005232328767123287, | |
| "loss": 3.4762, | |
| "step": 22050 | |
| }, | |
| { | |
| "epoch": 6.437543696108134, | |
| "grad_norm": 0.33424749970436096, | |
| "learning_rate": 0.0005230580005829204, | |
| "loss": 3.4723, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 6.4521090654859, | |
| "grad_norm": 0.3223506808280945, | |
| "learning_rate": 0.0005228831244535121, | |
| "loss": 3.4735, | |
| "step": 22150 | |
| }, | |
| { | |
| "epoch": 6.466674434863668, | |
| "grad_norm": 0.3629089891910553, | |
| "learning_rate": 0.0005227082483241038, | |
| "loss": 3.4835, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 6.481239804241436, | |
| "grad_norm": 0.3067444860935211, | |
| "learning_rate": 0.0005225333721946954, | |
| "loss": 3.4759, | |
| "step": 22250 | |
| }, | |
| { | |
| "epoch": 6.495805173619203, | |
| "grad_norm": 0.31871816515922546, | |
| "learning_rate": 0.000522358496065287, | |
| "loss": 3.4852, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 6.51037054299697, | |
| "grad_norm": 0.3342389464378357, | |
| "learning_rate": 0.0005221836199358787, | |
| "loss": 3.4748, | |
| "step": 22350 | |
| }, | |
| { | |
| "epoch": 6.524935912374738, | |
| "grad_norm": 0.334839403629303, | |
| "learning_rate": 0.0005220087438064703, | |
| "loss": 3.48, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 6.539501281752505, | |
| "grad_norm": 0.3808937072753906, | |
| "learning_rate": 0.000521833867677062, | |
| "loss": 3.4808, | |
| "step": 22450 | |
| }, | |
| { | |
| "epoch": 6.554066651130273, | |
| "grad_norm": 0.3652092516422272, | |
| "learning_rate": 0.0005216589915476537, | |
| "loss": 3.5021, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 6.56863202050804, | |
| "grad_norm": 0.32643789052963257, | |
| "learning_rate": 0.0005214841154182454, | |
| "loss": 3.4911, | |
| "step": 22550 | |
| }, | |
| { | |
| "epoch": 6.583197389885807, | |
| "grad_norm": 0.3469211459159851, | |
| "learning_rate": 0.0005213092392888371, | |
| "loss": 3.4757, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 6.597762759263575, | |
| "grad_norm": 0.3310937285423279, | |
| "learning_rate": 0.0005211343631594287, | |
| "loss": 3.4829, | |
| "step": 22650 | |
| }, | |
| { | |
| "epoch": 6.612328128641343, | |
| "grad_norm": 0.3375169634819031, | |
| "learning_rate": 0.0005209594870300204, | |
| "loss": 3.4889, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 6.626893498019109, | |
| "grad_norm": 0.3277340531349182, | |
| "learning_rate": 0.000520784610900612, | |
| "loss": 3.4734, | |
| "step": 22750 | |
| }, | |
| { | |
| "epoch": 6.641458867396877, | |
| "grad_norm": 0.35384461283683777, | |
| "learning_rate": 0.0005206097347712037, | |
| "loss": 3.5004, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 6.656024236774645, | |
| "grad_norm": 0.33254358172416687, | |
| "learning_rate": 0.0005204348586417953, | |
| "loss": 3.4922, | |
| "step": 22850 | |
| }, | |
| { | |
| "epoch": 6.670589606152412, | |
| "grad_norm": 0.3284110426902771, | |
| "learning_rate": 0.000520259982512387, | |
| "loss": 3.4888, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 6.685154975530179, | |
| "grad_norm": 0.32339197397232056, | |
| "learning_rate": 0.0005200851063829787, | |
| "loss": 3.4905, | |
| "step": 22950 | |
| }, | |
| { | |
| "epoch": 6.699720344907947, | |
| "grad_norm": 0.33628493547439575, | |
| "learning_rate": 0.0005199102302535703, | |
| "loss": 3.4748, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 6.699720344907947, | |
| "eval_accuracy": 0.36394831872432204, | |
| "eval_loss": 3.600461006164551, | |
| "eval_runtime": 179.7946, | |
| "eval_samples_per_second": 92.578, | |
| "eval_steps_per_second": 5.79, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 6.714285714285714, | |
| "grad_norm": 0.3240261375904083, | |
| "learning_rate": 0.0005197353541241621, | |
| "loss": 3.4955, | |
| "step": 23050 | |
| }, | |
| { | |
| "epoch": 6.728851083663482, | |
| "grad_norm": 0.3188318610191345, | |
| "learning_rate": 0.0005195604779947537, | |
| "loss": 3.483, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 6.743416453041249, | |
| "grad_norm": 0.3339631259441376, | |
| "learning_rate": 0.0005193856018653454, | |
| "loss": 3.4883, | |
| "step": 23150 | |
| }, | |
| { | |
| "epoch": 6.7579818224190165, | |
| "grad_norm": 0.3179808557033539, | |
| "learning_rate": 0.000519210725735937, | |
| "loss": 3.4943, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 6.772547191796784, | |
| "grad_norm": 0.3453110456466675, | |
| "learning_rate": 0.0005190358496065286, | |
| "loss": 3.496, | |
| "step": 23250 | |
| }, | |
| { | |
| "epoch": 6.787112561174552, | |
| "grad_norm": 0.32360783219337463, | |
| "learning_rate": 0.0005188609734771203, | |
| "loss": 3.4991, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 6.8016779305523185, | |
| "grad_norm": 0.3246710002422333, | |
| "learning_rate": 0.000518686097347712, | |
| "loss": 3.4903, | |
| "step": 23350 | |
| }, | |
| { | |
| "epoch": 6.816243299930086, | |
| "grad_norm": 0.344545841217041, | |
| "learning_rate": 0.0005185112212183037, | |
| "loss": 3.4918, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 6.830808669307854, | |
| "grad_norm": 0.32257169485092163, | |
| "learning_rate": 0.0005183363450888953, | |
| "loss": 3.487, | |
| "step": 23450 | |
| }, | |
| { | |
| "epoch": 6.845374038685621, | |
| "grad_norm": 0.3380378484725952, | |
| "learning_rate": 0.000518161468959487, | |
| "loss": 3.4881, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 6.859939408063388, | |
| "grad_norm": 0.34541237354278564, | |
| "learning_rate": 0.0005179865928300787, | |
| "loss": 3.4939, | |
| "step": 23550 | |
| }, | |
| { | |
| "epoch": 6.874504777441156, | |
| "grad_norm": 0.3542953431606293, | |
| "learning_rate": 0.0005178117167006703, | |
| "loss": 3.4869, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 6.8890701468189235, | |
| "grad_norm": 0.3760510981082916, | |
| "learning_rate": 0.000517636840571262, | |
| "loss": 3.5067, | |
| "step": 23650 | |
| }, | |
| { | |
| "epoch": 6.903635516196691, | |
| "grad_norm": 0.33901602029800415, | |
| "learning_rate": 0.0005174619644418536, | |
| "loss": 3.4951, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 6.918200885574458, | |
| "grad_norm": 0.33704662322998047, | |
| "learning_rate": 0.0005172870883124453, | |
| "loss": 3.4922, | |
| "step": 23750 | |
| }, | |
| { | |
| "epoch": 6.9327662549522255, | |
| "grad_norm": 0.32309016585350037, | |
| "learning_rate": 0.000517112212183037, | |
| "loss": 3.4908, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 6.947331624329993, | |
| "grad_norm": 0.3241852819919586, | |
| "learning_rate": 0.0005169373360536286, | |
| "loss": 3.4964, | |
| "step": 23850 | |
| }, | |
| { | |
| "epoch": 6.961896993707761, | |
| "grad_norm": 0.3242267370223999, | |
| "learning_rate": 0.0005167624599242203, | |
| "loss": 3.4817, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 6.976462363085528, | |
| "grad_norm": 0.3280220925807953, | |
| "learning_rate": 0.000516587583794812, | |
| "loss": 3.4932, | |
| "step": 23950 | |
| }, | |
| { | |
| "epoch": 6.991027732463295, | |
| "grad_norm": 0.3425884544849396, | |
| "learning_rate": 0.0005164127076654037, | |
| "loss": 3.4908, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 6.991027732463295, | |
| "eval_accuracy": 0.3643506421361469, | |
| "eval_loss": 3.5942769050598145, | |
| "eval_runtime": 179.5597, | |
| "eval_samples_per_second": 92.699, | |
| "eval_steps_per_second": 5.798, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 7.005534840363552, | |
| "grad_norm": 0.3431827127933502, | |
| "learning_rate": 0.0005162378315359953, | |
| "loss": 3.45, | |
| "step": 24050 | |
| }, | |
| { | |
| "epoch": 7.020100209741319, | |
| "grad_norm": 0.36127012968063354, | |
| "learning_rate": 0.0005160629554065869, | |
| "loss": 3.3704, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 7.034665579119086, | |
| "grad_norm": 0.36049118638038635, | |
| "learning_rate": 0.0005158880792771786, | |
| "loss": 3.3914, | |
| "step": 24150 | |
| }, | |
| { | |
| "epoch": 7.049230948496854, | |
| "grad_norm": 0.3440174162387848, | |
| "learning_rate": 0.0005157132031477703, | |
| "loss": 3.3868, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 7.063796317874622, | |
| "grad_norm": 0.38178542256355286, | |
| "learning_rate": 0.000515538327018362, | |
| "loss": 3.3956, | |
| "step": 24250 | |
| }, | |
| { | |
| "epoch": 7.0783616872523885, | |
| "grad_norm": 0.32825422286987305, | |
| "learning_rate": 0.0005153634508889536, | |
| "loss": 3.3839, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 7.092927056630156, | |
| "grad_norm": 0.34752145409584045, | |
| "learning_rate": 0.0005151885747595453, | |
| "loss": 3.3979, | |
| "step": 24350 | |
| }, | |
| { | |
| "epoch": 7.107492426007924, | |
| "grad_norm": 0.33364078402519226, | |
| "learning_rate": 0.000515013698630137, | |
| "loss": 3.404, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 7.122057795385691, | |
| "grad_norm": 0.3302494287490845, | |
| "learning_rate": 0.0005148388225007285, | |
| "loss": 3.4225, | |
| "step": 24450 | |
| }, | |
| { | |
| "epoch": 7.136623164763458, | |
| "grad_norm": 0.33415162563323975, | |
| "learning_rate": 0.0005146639463713203, | |
| "loss": 3.401, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 7.151188534141226, | |
| "grad_norm": 0.33528947830200195, | |
| "learning_rate": 0.0005144890702419119, | |
| "loss": 3.4015, | |
| "step": 24550 | |
| }, | |
| { | |
| "epoch": 7.165753903518993, | |
| "grad_norm": 0.3421080410480499, | |
| "learning_rate": 0.0005143141941125036, | |
| "loss": 3.4205, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 7.180319272896761, | |
| "grad_norm": 0.3325115442276001, | |
| "learning_rate": 0.0005141393179830953, | |
| "loss": 3.4281, | |
| "step": 24650 | |
| }, | |
| { | |
| "epoch": 7.194884642274528, | |
| "grad_norm": 0.31258365511894226, | |
| "learning_rate": 0.0005139644418536869, | |
| "loss": 3.426, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 7.2094500116522955, | |
| "grad_norm": 0.31508442759513855, | |
| "learning_rate": 0.0005137895657242786, | |
| "loss": 3.4143, | |
| "step": 24750 | |
| }, | |
| { | |
| "epoch": 7.224015381030063, | |
| "grad_norm": 0.3417088985443115, | |
| "learning_rate": 0.0005136146895948703, | |
| "loss": 3.4357, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 7.238580750407831, | |
| "grad_norm": 0.3098302185535431, | |
| "learning_rate": 0.000513439813465462, | |
| "loss": 3.4303, | |
| "step": 24850 | |
| }, | |
| { | |
| "epoch": 7.2531461197855975, | |
| "grad_norm": 0.31606337428092957, | |
| "learning_rate": 0.0005132649373360535, | |
| "loss": 3.4353, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 7.267711489163365, | |
| "grad_norm": 0.33023601770401, | |
| "learning_rate": 0.0005130900612066452, | |
| "loss": 3.4206, | |
| "step": 24950 | |
| }, | |
| { | |
| "epoch": 7.282276858541133, | |
| "grad_norm": 0.33378899097442627, | |
| "learning_rate": 0.0005129151850772369, | |
| "loss": 3.4153, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 7.282276858541133, | |
| "eval_accuracy": 0.364236481986269, | |
| "eval_loss": 3.6024866104125977, | |
| "eval_runtime": 179.4604, | |
| "eval_samples_per_second": 92.75, | |
| "eval_steps_per_second": 5.801, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 7.2968422279189, | |
| "grad_norm": 0.3245193064212799, | |
| "learning_rate": 0.0005127403089478286, | |
| "loss": 3.427, | |
| "step": 25050 | |
| }, | |
| { | |
| "epoch": 7.311407597296667, | |
| "grad_norm": 0.35483458638191223, | |
| "learning_rate": 0.0005125654328184203, | |
| "loss": 3.4314, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 7.325972966674435, | |
| "grad_norm": 0.3492553234100342, | |
| "learning_rate": 0.0005123905566890119, | |
| "loss": 3.4336, | |
| "step": 25150 | |
| }, | |
| { | |
| "epoch": 7.3405383360522025, | |
| "grad_norm": 0.35173410177230835, | |
| "learning_rate": 0.0005122156805596036, | |
| "loss": 3.4165, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 7.35510370542997, | |
| "grad_norm": 0.3420160412788391, | |
| "learning_rate": 0.0005120408044301953, | |
| "loss": 3.4402, | |
| "step": 25250 | |
| }, | |
| { | |
| "epoch": 7.369669074807737, | |
| "grad_norm": 0.3215605318546295, | |
| "learning_rate": 0.0005118659283007868, | |
| "loss": 3.447, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 7.384234444185505, | |
| "grad_norm": 0.3140503764152527, | |
| "learning_rate": 0.0005116910521713785, | |
| "loss": 3.4381, | |
| "step": 25350 | |
| }, | |
| { | |
| "epoch": 7.398799813563272, | |
| "grad_norm": 0.32911545038223267, | |
| "learning_rate": 0.0005115161760419702, | |
| "loss": 3.4403, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 7.413365182941039, | |
| "grad_norm": 0.3454091548919678, | |
| "learning_rate": 0.0005113412999125619, | |
| "loss": 3.4435, | |
| "step": 25450 | |
| }, | |
| { | |
| "epoch": 7.427930552318807, | |
| "grad_norm": 0.3304098844528198, | |
| "learning_rate": 0.0005111664237831536, | |
| "loss": 3.4443, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 7.442495921696574, | |
| "grad_norm": 0.32890447974205017, | |
| "learning_rate": 0.0005109915476537452, | |
| "loss": 3.4347, | |
| "step": 25550 | |
| }, | |
| { | |
| "epoch": 7.457061291074342, | |
| "grad_norm": 0.3333839476108551, | |
| "learning_rate": 0.0005108166715243369, | |
| "loss": 3.4441, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 7.471626660452109, | |
| "grad_norm": 0.3388593792915344, | |
| "learning_rate": 0.0005106417953949286, | |
| "loss": 3.4551, | |
| "step": 25650 | |
| }, | |
| { | |
| "epoch": 7.486192029829876, | |
| "grad_norm": 0.3506496846675873, | |
| "learning_rate": 0.0005104669192655203, | |
| "loss": 3.4465, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 7.500757399207644, | |
| "grad_norm": 0.35972943902015686, | |
| "learning_rate": 0.0005102920431361118, | |
| "loss": 3.455, | |
| "step": 25750 | |
| }, | |
| { | |
| "epoch": 7.515322768585412, | |
| "grad_norm": 0.3275600075721741, | |
| "learning_rate": 0.0005101171670067035, | |
| "loss": 3.4599, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 7.529888137963178, | |
| "grad_norm": 0.3396972417831421, | |
| "learning_rate": 0.0005099422908772952, | |
| "loss": 3.4512, | |
| "step": 25850 | |
| }, | |
| { | |
| "epoch": 7.544453507340946, | |
| "grad_norm": 0.3468742072582245, | |
| "learning_rate": 0.0005097674147478868, | |
| "loss": 3.4439, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 7.559018876718714, | |
| "grad_norm": 0.3341714143753052, | |
| "learning_rate": 0.0005095925386184786, | |
| "loss": 3.454, | |
| "step": 25950 | |
| }, | |
| { | |
| "epoch": 7.573584246096481, | |
| "grad_norm": 0.33167895674705505, | |
| "learning_rate": 0.0005094176624890702, | |
| "loss": 3.4552, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 7.573584246096481, | |
| "eval_accuracy": 0.3649543623932247, | |
| "eval_loss": 3.5933761596679688, | |
| "eval_runtime": 179.6882, | |
| "eval_samples_per_second": 92.633, | |
| "eval_steps_per_second": 5.793, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 7.588149615474248, | |
| "grad_norm": 0.38421186804771423, | |
| "learning_rate": 0.0005092427863596619, | |
| "loss": 3.4496, | |
| "step": 26050 | |
| }, | |
| { | |
| "epoch": 7.602714984852016, | |
| "grad_norm": 0.3296069800853729, | |
| "learning_rate": 0.0005090679102302536, | |
| "loss": 3.4484, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 7.617280354229783, | |
| "grad_norm": 0.33456072211265564, | |
| "learning_rate": 0.0005088930341008451, | |
| "loss": 3.4478, | |
| "step": 26150 | |
| }, | |
| { | |
| "epoch": 7.631845723607551, | |
| "grad_norm": 0.34444794058799744, | |
| "learning_rate": 0.0005087181579714368, | |
| "loss": 3.4491, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 7.646411092985318, | |
| "grad_norm": 0.3780238628387451, | |
| "learning_rate": 0.0005085432818420285, | |
| "loss": 3.451, | |
| "step": 26250 | |
| }, | |
| { | |
| "epoch": 7.660976462363085, | |
| "grad_norm": 0.33494746685028076, | |
| "learning_rate": 0.0005083684057126202, | |
| "loss": 3.4543, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 7.675541831740853, | |
| "grad_norm": 0.40819284319877625, | |
| "learning_rate": 0.0005081935295832118, | |
| "loss": 3.4655, | |
| "step": 26350 | |
| }, | |
| { | |
| "epoch": 7.690107201118621, | |
| "grad_norm": 0.3251825273036957, | |
| "learning_rate": 0.0005080186534538035, | |
| "loss": 3.4459, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 7.704672570496387, | |
| "grad_norm": 0.3159500062465668, | |
| "learning_rate": 0.0005078437773243952, | |
| "loss": 3.4561, | |
| "step": 26450 | |
| }, | |
| { | |
| "epoch": 7.719237939874155, | |
| "grad_norm": 0.3354164958000183, | |
| "learning_rate": 0.0005076689011949869, | |
| "loss": 3.4639, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 7.733803309251923, | |
| "grad_norm": 0.3452058732509613, | |
| "learning_rate": 0.0005074940250655786, | |
| "loss": 3.4627, | |
| "step": 26550 | |
| }, | |
| { | |
| "epoch": 7.74836867862969, | |
| "grad_norm": 0.3344949781894684, | |
| "learning_rate": 0.0005073191489361701, | |
| "loss": 3.4492, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 7.762934048007457, | |
| "grad_norm": 0.35478341579437256, | |
| "learning_rate": 0.0005071442728067618, | |
| "loss": 3.4452, | |
| "step": 26650 | |
| }, | |
| { | |
| "epoch": 7.777499417385225, | |
| "grad_norm": 0.3661314845085144, | |
| "learning_rate": 0.0005069693966773535, | |
| "loss": 3.4577, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 7.792064786762992, | |
| "grad_norm": 0.34170424938201904, | |
| "learning_rate": 0.0005067945205479451, | |
| "loss": 3.4678, | |
| "step": 26750 | |
| }, | |
| { | |
| "epoch": 7.80663015614076, | |
| "grad_norm": 0.31290966272354126, | |
| "learning_rate": 0.0005066196444185368, | |
| "loss": 3.4606, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 7.821195525518527, | |
| "grad_norm": 0.35089555382728577, | |
| "learning_rate": 0.0005064447682891285, | |
| "loss": 3.4679, | |
| "step": 26850 | |
| }, | |
| { | |
| "epoch": 7.8357608948962945, | |
| "grad_norm": 0.33421048521995544, | |
| "learning_rate": 0.0005062698921597202, | |
| "loss": 3.4708, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 7.850326264274062, | |
| "grad_norm": 0.35330483317375183, | |
| "learning_rate": 0.0005060950160303119, | |
| "loss": 3.4581, | |
| "step": 26950 | |
| }, | |
| { | |
| "epoch": 7.86489163365183, | |
| "grad_norm": 0.3339422941207886, | |
| "learning_rate": 0.0005059201399009035, | |
| "loss": 3.4603, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 7.86489163365183, | |
| "eval_accuracy": 0.3656727130788616, | |
| "eval_loss": 3.585218906402588, | |
| "eval_runtime": 179.5703, | |
| "eval_samples_per_second": 92.694, | |
| "eval_steps_per_second": 5.797, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 7.8794570030295965, | |
| "grad_norm": 0.335092157125473, | |
| "learning_rate": 0.0005057452637714951, | |
| "loss": 3.4618, | |
| "step": 27050 | |
| }, | |
| { | |
| "epoch": 7.894022372407364, | |
| "grad_norm": 0.35167837142944336, | |
| "learning_rate": 0.0005055703876420868, | |
| "loss": 3.4618, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 7.908587741785132, | |
| "grad_norm": 0.3454788327217102, | |
| "learning_rate": 0.0005053955115126785, | |
| "loss": 3.4624, | |
| "step": 27150 | |
| }, | |
| { | |
| "epoch": 7.923153111162899, | |
| "grad_norm": 0.35379868745803833, | |
| "learning_rate": 0.0005052206353832701, | |
| "loss": 3.4515, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 7.937718480540666, | |
| "grad_norm": 0.35463815927505493, | |
| "learning_rate": 0.0005050457592538618, | |
| "loss": 3.4616, | |
| "step": 27250 | |
| }, | |
| { | |
| "epoch": 7.952283849918434, | |
| "grad_norm": 0.36919155716896057, | |
| "learning_rate": 0.0005048708831244535, | |
| "loss": 3.465, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 7.9668492192962015, | |
| "grad_norm": 0.32335364818573, | |
| "learning_rate": 0.0005046960069950451, | |
| "loss": 3.4751, | |
| "step": 27350 | |
| }, | |
| { | |
| "epoch": 7.981414588673969, | |
| "grad_norm": 0.32492557168006897, | |
| "learning_rate": 0.0005045211308656369, | |
| "loss": 3.4596, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 7.995979958051736, | |
| "grad_norm": 0.32239827513694763, | |
| "learning_rate": 0.0005043462547362284, | |
| "loss": 3.4709, | |
| "step": 27450 | |
| }, | |
| { | |
| "epoch": 8.010487065951992, | |
| "grad_norm": 0.3487697243690491, | |
| "learning_rate": 0.0005041713786068201, | |
| "loss": 3.3878, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 8.02505243532976, | |
| "grad_norm": 0.339937299489975, | |
| "learning_rate": 0.0005039965024774118, | |
| "loss": 3.3456, | |
| "step": 27550 | |
| }, | |
| { | |
| "epoch": 8.039617804707527, | |
| "grad_norm": 0.34511151909828186, | |
| "learning_rate": 0.0005038216263480034, | |
| "loss": 3.3591, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 8.054183174085296, | |
| "grad_norm": 0.33415067195892334, | |
| "learning_rate": 0.0005036467502185951, | |
| "loss": 3.351, | |
| "step": 27650 | |
| }, | |
| { | |
| "epoch": 8.068748543463062, | |
| "grad_norm": 0.33265748620033264, | |
| "learning_rate": 0.0005034718740891868, | |
| "loss": 3.3549, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 8.08331391284083, | |
| "grad_norm": 0.3668820261955261, | |
| "learning_rate": 0.0005032969979597785, | |
| "loss": 3.3766, | |
| "step": 27750 | |
| }, | |
| { | |
| "epoch": 8.097879282218598, | |
| "grad_norm": 0.35505983233451843, | |
| "learning_rate": 0.0005031221218303701, | |
| "loss": 3.3705, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 8.112444651596364, | |
| "grad_norm": 0.3510807752609253, | |
| "learning_rate": 0.0005029472457009618, | |
| "loss": 3.3713, | |
| "step": 27850 | |
| }, | |
| { | |
| "epoch": 8.127010020974131, | |
| "grad_norm": 0.3338639736175537, | |
| "learning_rate": 0.0005027723695715534, | |
| "loss": 3.3701, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 8.1415753903519, | |
| "grad_norm": 0.327267587184906, | |
| "learning_rate": 0.0005025974934421451, | |
| "loss": 3.3841, | |
| "step": 27950 | |
| }, | |
| { | |
| "epoch": 8.156140759729666, | |
| "grad_norm": 0.3316822052001953, | |
| "learning_rate": 0.0005024226173127368, | |
| "loss": 3.3745, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 8.156140759729666, | |
| "eval_accuracy": 0.36602271798739533, | |
| "eval_loss": 3.592015266418457, | |
| "eval_runtime": 179.543, | |
| "eval_samples_per_second": 92.708, | |
| "eval_steps_per_second": 5.798, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 8.170706129107435, | |
| "grad_norm": 0.34737464785575867, | |
| "learning_rate": 0.0005022477411833284, | |
| "loss": 3.3776, | |
| "step": 28050 | |
| }, | |
| { | |
| "epoch": 8.185271498485202, | |
| "grad_norm": 0.35966166853904724, | |
| "learning_rate": 0.0005020728650539201, | |
| "loss": 3.3925, | |
| "step": 28100 | |
| }, | |
| { | |
| "epoch": 8.199836867862969, | |
| "grad_norm": 0.3718971312046051, | |
| "learning_rate": 0.0005018979889245118, | |
| "loss": 3.3919, | |
| "step": 28150 | |
| }, | |
| { | |
| "epoch": 8.214402237240737, | |
| "grad_norm": 0.36448919773101807, | |
| "learning_rate": 0.0005017231127951034, | |
| "loss": 3.383, | |
| "step": 28200 | |
| }, | |
| { | |
| "epoch": 8.228967606618504, | |
| "grad_norm": 0.3384815752506256, | |
| "learning_rate": 0.0005015482366656951, | |
| "loss": 3.395, | |
| "step": 28250 | |
| }, | |
| { | |
| "epoch": 8.24353297599627, | |
| "grad_norm": 0.35150644183158875, | |
| "learning_rate": 0.0005013733605362868, | |
| "loss": 3.392, | |
| "step": 28300 | |
| }, | |
| { | |
| "epoch": 8.258098345374039, | |
| "grad_norm": 0.3527531027793884, | |
| "learning_rate": 0.0005011984844068784, | |
| "loss": 3.3917, | |
| "step": 28350 | |
| }, | |
| { | |
| "epoch": 8.272663714751806, | |
| "grad_norm": 0.35610780119895935, | |
| "learning_rate": 0.0005010236082774701, | |
| "loss": 3.4048, | |
| "step": 28400 | |
| }, | |
| { | |
| "epoch": 8.287229084129574, | |
| "grad_norm": 0.3675138056278229, | |
| "learning_rate": 0.0005008487321480617, | |
| "loss": 3.4005, | |
| "step": 28450 | |
| }, | |
| { | |
| "epoch": 8.301794453507341, | |
| "grad_norm": 0.33330920338630676, | |
| "learning_rate": 0.0005006738560186534, | |
| "loss": 3.4086, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 8.316359822885108, | |
| "grad_norm": 0.3794533312320709, | |
| "learning_rate": 0.0005004989798892451, | |
| "loss": 3.4062, | |
| "step": 28550 | |
| }, | |
| { | |
| "epoch": 8.330925192262876, | |
| "grad_norm": 0.33697935938835144, | |
| "learning_rate": 0.0005003241037598368, | |
| "loss": 3.4017, | |
| "step": 28600 | |
| }, | |
| { | |
| "epoch": 8.345490561640643, | |
| "grad_norm": 0.3670228123664856, | |
| "learning_rate": 0.0005001492276304284, | |
| "loss": 3.4173, | |
| "step": 28650 | |
| }, | |
| { | |
| "epoch": 8.36005593101841, | |
| "grad_norm": 0.3506808578968048, | |
| "learning_rate": 0.0004999743515010201, | |
| "loss": 3.405, | |
| "step": 28700 | |
| }, | |
| { | |
| "epoch": 8.374621300396178, | |
| "grad_norm": 0.31867876648902893, | |
| "learning_rate": 0.0004997994753716117, | |
| "loss": 3.423, | |
| "step": 28750 | |
| }, | |
| { | |
| "epoch": 8.389186669773945, | |
| "grad_norm": 0.34136125445365906, | |
| "learning_rate": 0.0004996245992422033, | |
| "loss": 3.4096, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 8.403752039151712, | |
| "grad_norm": 0.36181262135505676, | |
| "learning_rate": 0.0004994497231127951, | |
| "loss": 3.4094, | |
| "step": 28850 | |
| }, | |
| { | |
| "epoch": 8.41831740852948, | |
| "grad_norm": 0.36319324374198914, | |
| "learning_rate": 0.0004992748469833867, | |
| "loss": 3.4074, | |
| "step": 28900 | |
| }, | |
| { | |
| "epoch": 8.432882777907247, | |
| "grad_norm": 0.31755128502845764, | |
| "learning_rate": 0.0004990999708539784, | |
| "loss": 3.4134, | |
| "step": 28950 | |
| }, | |
| { | |
| "epoch": 8.447448147285016, | |
| "grad_norm": 0.34557104110717773, | |
| "learning_rate": 0.0004989250947245701, | |
| "loss": 3.406, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 8.447448147285016, | |
| "eval_accuracy": 0.36604270483134926, | |
| "eval_loss": 3.5855937004089355, | |
| "eval_runtime": 179.6608, | |
| "eval_samples_per_second": 92.647, | |
| "eval_steps_per_second": 5.794, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 8.462013516662783, | |
| "grad_norm": 0.3253464102745056, | |
| "learning_rate": 0.0004987502185951617, | |
| "loss": 3.4097, | |
| "step": 29050 | |
| }, | |
| { | |
| "epoch": 8.47657888604055, | |
| "grad_norm": 0.3373265564441681, | |
| "learning_rate": 0.0004985753424657534, | |
| "loss": 3.4196, | |
| "step": 29100 | |
| }, | |
| { | |
| "epoch": 8.491144255418318, | |
| "grad_norm": 0.3448173701763153, | |
| "learning_rate": 0.000498400466336345, | |
| "loss": 3.423, | |
| "step": 29150 | |
| }, | |
| { | |
| "epoch": 8.505709624796085, | |
| "grad_norm": 0.33007627725601196, | |
| "learning_rate": 0.0004982255902069367, | |
| "loss": 3.4089, | |
| "step": 29200 | |
| }, | |
| { | |
| "epoch": 8.520274994173853, | |
| "grad_norm": 0.3380190432071686, | |
| "learning_rate": 0.0004980507140775283, | |
| "loss": 3.4091, | |
| "step": 29250 | |
| }, | |
| { | |
| "epoch": 8.53484036355162, | |
| "grad_norm": 0.33369770646095276, | |
| "learning_rate": 0.0004978758379481201, | |
| "loss": 3.4299, | |
| "step": 29300 | |
| }, | |
| { | |
| "epoch": 8.549405732929387, | |
| "grad_norm": 0.3573598265647888, | |
| "learning_rate": 0.0004977009618187117, | |
| "loss": 3.4095, | |
| "step": 29350 | |
| }, | |
| { | |
| "epoch": 8.563971102307155, | |
| "grad_norm": 0.3663587272167206, | |
| "learning_rate": 0.0004975260856893034, | |
| "loss": 3.4357, | |
| "step": 29400 | |
| }, | |
| { | |
| "epoch": 8.578536471684922, | |
| "grad_norm": 0.3414006233215332, | |
| "learning_rate": 0.0004973512095598951, | |
| "loss": 3.4199, | |
| "step": 29450 | |
| }, | |
| { | |
| "epoch": 8.593101841062689, | |
| "grad_norm": 0.34500792622566223, | |
| "learning_rate": 0.0004971763334304867, | |
| "loss": 3.4219, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 8.607667210440457, | |
| "grad_norm": 0.32384201884269714, | |
| "learning_rate": 0.0004970014573010784, | |
| "loss": 3.4209, | |
| "step": 29550 | |
| }, | |
| { | |
| "epoch": 8.622232579818224, | |
| "grad_norm": 0.351113498210907, | |
| "learning_rate": 0.00049682658117167, | |
| "loss": 3.425, | |
| "step": 29600 | |
| }, | |
| { | |
| "epoch": 8.63679794919599, | |
| "grad_norm": 0.3571644127368927, | |
| "learning_rate": 0.0004966517050422616, | |
| "loss": 3.4208, | |
| "step": 29650 | |
| }, | |
| { | |
| "epoch": 8.65136331857376, | |
| "grad_norm": 0.34789296984672546, | |
| "learning_rate": 0.0004964768289128533, | |
| "loss": 3.4295, | |
| "step": 29700 | |
| }, | |
| { | |
| "epoch": 8.665928687951526, | |
| "grad_norm": 0.34940293431282043, | |
| "learning_rate": 0.000496301952783445, | |
| "loss": 3.4267, | |
| "step": 29750 | |
| }, | |
| { | |
| "epoch": 8.680494057329295, | |
| "grad_norm": 0.34850820899009705, | |
| "learning_rate": 0.0004961270766540367, | |
| "loss": 3.4153, | |
| "step": 29800 | |
| }, | |
| { | |
| "epoch": 8.695059426707061, | |
| "grad_norm": 0.35262537002563477, | |
| "learning_rate": 0.0004959522005246284, | |
| "loss": 3.4276, | |
| "step": 29850 | |
| }, | |
| { | |
| "epoch": 8.709624796084828, | |
| "grad_norm": 0.33390718698501587, | |
| "learning_rate": 0.00049577732439522, | |
| "loss": 3.4387, | |
| "step": 29900 | |
| }, | |
| { | |
| "epoch": 8.724190165462597, | |
| "grad_norm": 0.33877861499786377, | |
| "learning_rate": 0.0004956024482658117, | |
| "loss": 3.4404, | |
| "step": 29950 | |
| }, | |
| { | |
| "epoch": 8.738755534840363, | |
| "grad_norm": 0.3310054838657379, | |
| "learning_rate": 0.0004954275721364034, | |
| "loss": 3.429, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 8.738755534840363, | |
| "eval_accuracy": 0.36692130297762826, | |
| "eval_loss": 3.57623028755188, | |
| "eval_runtime": 179.5297, | |
| "eval_samples_per_second": 92.714, | |
| "eval_steps_per_second": 5.798, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 8.753320904218132, | |
| "grad_norm": 0.3291216790676117, | |
| "learning_rate": 0.000495252696006995, | |
| "loss": 3.4293, | |
| "step": 30050 | |
| }, | |
| { | |
| "epoch": 8.767886273595899, | |
| "grad_norm": 0.336401104927063, | |
| "learning_rate": 0.0004950778198775866, | |
| "loss": 3.4227, | |
| "step": 30100 | |
| }, | |
| { | |
| "epoch": 8.782451642973665, | |
| "grad_norm": 0.3546486794948578, | |
| "learning_rate": 0.0004949029437481783, | |
| "loss": 3.4305, | |
| "step": 30150 | |
| }, | |
| { | |
| "epoch": 8.797017012351434, | |
| "grad_norm": 0.3768438398838043, | |
| "learning_rate": 0.00049472806761877, | |
| "loss": 3.4396, | |
| "step": 30200 | |
| }, | |
| { | |
| "epoch": 8.8115823817292, | |
| "grad_norm": 0.36031797528266907, | |
| "learning_rate": 0.0004945531914893616, | |
| "loss": 3.4374, | |
| "step": 30250 | |
| }, | |
| { | |
| "epoch": 8.826147751106967, | |
| "grad_norm": 0.338821679353714, | |
| "learning_rate": 0.0004943783153599534, | |
| "loss": 3.434, | |
| "step": 30300 | |
| }, | |
| { | |
| "epoch": 8.840713120484736, | |
| "grad_norm": 0.3513728976249695, | |
| "learning_rate": 0.000494203439230545, | |
| "loss": 3.4468, | |
| "step": 30350 | |
| }, | |
| { | |
| "epoch": 8.855278489862503, | |
| "grad_norm": 0.3318538963794708, | |
| "learning_rate": 0.0004940285631011367, | |
| "loss": 3.4349, | |
| "step": 30400 | |
| }, | |
| { | |
| "epoch": 8.86984385924027, | |
| "grad_norm": 0.35798031091690063, | |
| "learning_rate": 0.0004938536869717284, | |
| "loss": 3.4249, | |
| "step": 30450 | |
| }, | |
| { | |
| "epoch": 8.884409228618038, | |
| "grad_norm": 0.32275164127349854, | |
| "learning_rate": 0.0004936788108423199, | |
| "loss": 3.4337, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 8.898974597995805, | |
| "grad_norm": 0.3375926613807678, | |
| "learning_rate": 0.0004935039347129116, | |
| "loss": 3.4416, | |
| "step": 30550 | |
| }, | |
| { | |
| "epoch": 8.913539967373573, | |
| "grad_norm": 0.3426482677459717, | |
| "learning_rate": 0.0004933290585835033, | |
| "loss": 3.4399, | |
| "step": 30600 | |
| }, | |
| { | |
| "epoch": 8.92810533675134, | |
| "grad_norm": 0.3388661742210388, | |
| "learning_rate": 0.000493154182454095, | |
| "loss": 3.4365, | |
| "step": 30650 | |
| }, | |
| { | |
| "epoch": 8.942670706129107, | |
| "grad_norm": 0.33085867762565613, | |
| "learning_rate": 0.0004929793063246866, | |
| "loss": 3.4426, | |
| "step": 30700 | |
| }, | |
| { | |
| "epoch": 8.957236075506875, | |
| "grad_norm": 0.34728479385375977, | |
| "learning_rate": 0.0004928044301952783, | |
| "loss": 3.4563, | |
| "step": 30750 | |
| }, | |
| { | |
| "epoch": 8.971801444884642, | |
| "grad_norm": 0.37290677428245544, | |
| "learning_rate": 0.00049262955406587, | |
| "loss": 3.4409, | |
| "step": 30800 | |
| }, | |
| { | |
| "epoch": 8.986366814262409, | |
| "grad_norm": 0.3528863787651062, | |
| "learning_rate": 0.0004924546779364617, | |
| "loss": 3.4321, | |
| "step": 30850 | |
| }, | |
| { | |
| "epoch": 9.000873922162667, | |
| "grad_norm": 0.33041954040527344, | |
| "learning_rate": 0.0004922798018070533, | |
| "loss": 3.4225, | |
| "step": 30900 | |
| }, | |
| { | |
| "epoch": 9.015439291540433, | |
| "grad_norm": 0.3403935730457306, | |
| "learning_rate": 0.0004921049256776449, | |
| "loss": 3.3243, | |
| "step": 30950 | |
| }, | |
| { | |
| "epoch": 9.0300046609182, | |
| "grad_norm": 0.350462406873703, | |
| "learning_rate": 0.0004919300495482366, | |
| "loss": 3.317, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 9.0300046609182, | |
| "eval_accuracy": 0.3672527318782519, | |
| "eval_loss": 3.5818264484405518, | |
| "eval_runtime": 179.6159, | |
| "eval_samples_per_second": 92.67, | |
| "eval_steps_per_second": 5.796, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 9.044570030295969, | |
| "grad_norm": 0.3548542857170105, | |
| "learning_rate": 0.0004917551734188283, | |
| "loss": 3.3339, | |
| "step": 31050 | |
| }, | |
| { | |
| "epoch": 9.059135399673735, | |
| "grad_norm": 0.34578046202659607, | |
| "learning_rate": 0.0004915802972894199, | |
| "loss": 3.3305, | |
| "step": 31100 | |
| }, | |
| { | |
| "epoch": 9.073700769051504, | |
| "grad_norm": 0.335111141204834, | |
| "learning_rate": 0.0004914054211600116, | |
| "loss": 3.337, | |
| "step": 31150 | |
| }, | |
| { | |
| "epoch": 9.08826613842927, | |
| "grad_norm": 0.32308679819107056, | |
| "learning_rate": 0.0004912305450306033, | |
| "loss": 3.3521, | |
| "step": 31200 | |
| }, | |
| { | |
| "epoch": 9.102831507807037, | |
| "grad_norm": 0.359343558549881, | |
| "learning_rate": 0.000491055668901195, | |
| "loss": 3.3517, | |
| "step": 31250 | |
| }, | |
| { | |
| "epoch": 9.117396877184806, | |
| "grad_norm": 0.3571661710739136, | |
| "learning_rate": 0.0004908807927717865, | |
| "loss": 3.3586, | |
| "step": 31300 | |
| }, | |
| { | |
| "epoch": 9.131962246562573, | |
| "grad_norm": 0.3565060496330261, | |
| "learning_rate": 0.0004907059166423783, | |
| "loss": 3.3453, | |
| "step": 31350 | |
| }, | |
| { | |
| "epoch": 9.14652761594034, | |
| "grad_norm": 0.39377450942993164, | |
| "learning_rate": 0.0004905310405129699, | |
| "loss": 3.3478, | |
| "step": 31400 | |
| }, | |
| { | |
| "epoch": 9.161092985318108, | |
| "grad_norm": 0.3553165793418884, | |
| "learning_rate": 0.0004903561643835616, | |
| "loss": 3.3566, | |
| "step": 31450 | |
| }, | |
| { | |
| "epoch": 9.175658354695875, | |
| "grad_norm": 0.34331372380256653, | |
| "learning_rate": 0.0004901812882541533, | |
| "loss": 3.3631, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 9.190223724073643, | |
| "grad_norm": 0.3618679344654083, | |
| "learning_rate": 0.0004900064121247449, | |
| "loss": 3.3571, | |
| "step": 31550 | |
| }, | |
| { | |
| "epoch": 9.20478909345141, | |
| "grad_norm": 0.37382376194000244, | |
| "learning_rate": 0.0004898315359953366, | |
| "loss": 3.3748, | |
| "step": 31600 | |
| }, | |
| { | |
| "epoch": 9.219354462829177, | |
| "grad_norm": 0.35998404026031494, | |
| "learning_rate": 0.0004896566598659283, | |
| "loss": 3.3669, | |
| "step": 31650 | |
| }, | |
| { | |
| "epoch": 9.233919832206945, | |
| "grad_norm": 0.33387935161590576, | |
| "learning_rate": 0.0004894817837365199, | |
| "loss": 3.3586, | |
| "step": 31700 | |
| }, | |
| { | |
| "epoch": 9.248485201584712, | |
| "grad_norm": 0.3467971682548523, | |
| "learning_rate": 0.0004893069076071115, | |
| "loss": 3.37, | |
| "step": 31750 | |
| }, | |
| { | |
| "epoch": 9.263050570962479, | |
| "grad_norm": 0.33971235156059265, | |
| "learning_rate": 0.0004891320314777032, | |
| "loss": 3.3652, | |
| "step": 31800 | |
| }, | |
| { | |
| "epoch": 9.277615940340247, | |
| "grad_norm": 0.33422431349754333, | |
| "learning_rate": 0.0004889571553482949, | |
| "loss": 3.3646, | |
| "step": 31850 | |
| }, | |
| { | |
| "epoch": 9.292181309718014, | |
| "grad_norm": 0.3441632390022278, | |
| "learning_rate": 0.0004887822792188866, | |
| "loss": 3.3774, | |
| "step": 31900 | |
| }, | |
| { | |
| "epoch": 9.306746679095783, | |
| "grad_norm": 0.34111490845680237, | |
| "learning_rate": 0.0004886074030894782, | |
| "loss": 3.37, | |
| "step": 31950 | |
| }, | |
| { | |
| "epoch": 9.32131204847355, | |
| "grad_norm": 0.35753685235977173, | |
| "learning_rate": 0.0004884325269600699, | |
| "loss": 3.3833, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 9.32131204847355, | |
| "eval_accuracy": 0.36719994309627957, | |
| "eval_loss": 3.580444812774658, | |
| "eval_runtime": 179.6886, | |
| "eval_samples_per_second": 92.632, | |
| "eval_steps_per_second": 5.793, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 9.335877417851316, | |
| "grad_norm": 0.3657474219799042, | |
| "learning_rate": 0.0004882576508306615, | |
| "loss": 3.3801, | |
| "step": 32050 | |
| }, | |
| { | |
| "epoch": 9.350442787229085, | |
| "grad_norm": 0.360665500164032, | |
| "learning_rate": 0.00048808277470125327, | |
| "loss": 3.3826, | |
| "step": 32100 | |
| }, | |
| { | |
| "epoch": 9.365008156606851, | |
| "grad_norm": 0.3531738817691803, | |
| "learning_rate": 0.0004879078985718449, | |
| "loss": 3.379, | |
| "step": 32150 | |
| }, | |
| { | |
| "epoch": 9.379573525984618, | |
| "grad_norm": 0.3487440347671509, | |
| "learning_rate": 0.0004877330224424366, | |
| "loss": 3.3753, | |
| "step": 32200 | |
| }, | |
| { | |
| "epoch": 9.394138895362387, | |
| "grad_norm": 0.3362681269645691, | |
| "learning_rate": 0.00048755814631302823, | |
| "loss": 3.3867, | |
| "step": 32250 | |
| }, | |
| { | |
| "epoch": 9.408704264740154, | |
| "grad_norm": 0.38282108306884766, | |
| "learning_rate": 0.00048738327018361987, | |
| "loss": 3.3906, | |
| "step": 32300 | |
| }, | |
| { | |
| "epoch": 9.423269634117922, | |
| "grad_norm": 0.3605978786945343, | |
| "learning_rate": 0.00048720839405421156, | |
| "loss": 3.3758, | |
| "step": 32350 | |
| }, | |
| { | |
| "epoch": 9.437835003495689, | |
| "grad_norm": 0.33404749631881714, | |
| "learning_rate": 0.0004870335179248032, | |
| "loss": 3.3921, | |
| "step": 32400 | |
| }, | |
| { | |
| "epoch": 9.452400372873456, | |
| "grad_norm": 0.3609205186367035, | |
| "learning_rate": 0.0004868586417953949, | |
| "loss": 3.3738, | |
| "step": 32450 | |
| }, | |
| { | |
| "epoch": 9.466965742251224, | |
| "grad_norm": 0.3603569567203522, | |
| "learning_rate": 0.0004866837656659865, | |
| "loss": 3.3808, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 9.48153111162899, | |
| "grad_norm": 0.34669244289398193, | |
| "learning_rate": 0.00048650888953657816, | |
| "loss": 3.4071, | |
| "step": 32550 | |
| }, | |
| { | |
| "epoch": 9.496096481006758, | |
| "grad_norm": 0.3603197932243347, | |
| "learning_rate": 0.0004863340134071699, | |
| "loss": 3.3825, | |
| "step": 32600 | |
| }, | |
| { | |
| "epoch": 9.510661850384526, | |
| "grad_norm": 0.33809515833854675, | |
| "learning_rate": 0.00048615913727776154, | |
| "loss": 3.3967, | |
| "step": 32650 | |
| }, | |
| { | |
| "epoch": 9.525227219762293, | |
| "grad_norm": 0.33064547181129456, | |
| "learning_rate": 0.00048598426114835323, | |
| "loss": 3.4032, | |
| "step": 32700 | |
| }, | |
| { | |
| "epoch": 9.53979258914006, | |
| "grad_norm": 0.343467116355896, | |
| "learning_rate": 0.00048580938501894486, | |
| "loss": 3.4028, | |
| "step": 32750 | |
| }, | |
| { | |
| "epoch": 9.554357958517828, | |
| "grad_norm": 0.3574856221675873, | |
| "learning_rate": 0.00048563450888953655, | |
| "loss": 3.4011, | |
| "step": 32800 | |
| }, | |
| { | |
| "epoch": 9.568923327895595, | |
| "grad_norm": 0.373276025056839, | |
| "learning_rate": 0.0004854596327601282, | |
| "loss": 3.3909, | |
| "step": 32850 | |
| }, | |
| { | |
| "epoch": 9.583488697273363, | |
| "grad_norm": 0.372328519821167, | |
| "learning_rate": 0.0004852847566307198, | |
| "loss": 3.399, | |
| "step": 32900 | |
| }, | |
| { | |
| "epoch": 9.59805406665113, | |
| "grad_norm": 0.3499378561973572, | |
| "learning_rate": 0.0004851098805013115, | |
| "loss": 3.4031, | |
| "step": 32950 | |
| }, | |
| { | |
| "epoch": 9.612619436028897, | |
| "grad_norm": 0.36343520879745483, | |
| "learning_rate": 0.00048493500437190315, | |
| "loss": 3.3999, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 9.612619436028897, | |
| "eval_accuracy": 0.3681005267709079, | |
| "eval_loss": 3.5709691047668457, | |
| "eval_runtime": 179.7111, | |
| "eval_samples_per_second": 92.621, | |
| "eval_steps_per_second": 5.793, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 9.627184805406666, | |
| "grad_norm": 0.33371296525001526, | |
| "learning_rate": 0.0004847601282424949, | |
| "loss": 3.4052, | |
| "step": 33050 | |
| }, | |
| { | |
| "epoch": 9.641750174784432, | |
| "grad_norm": 0.3303377032279968, | |
| "learning_rate": 0.00048458525211308653, | |
| "loss": 3.4074, | |
| "step": 33100 | |
| }, | |
| { | |
| "epoch": 9.6563155441622, | |
| "grad_norm": 0.35207274556159973, | |
| "learning_rate": 0.00048441037598367817, | |
| "loss": 3.4158, | |
| "step": 33150 | |
| }, | |
| { | |
| "epoch": 9.670880913539968, | |
| "grad_norm": 0.365450918674469, | |
| "learning_rate": 0.00048423549985426986, | |
| "loss": 3.4032, | |
| "step": 33200 | |
| }, | |
| { | |
| "epoch": 9.685446282917734, | |
| "grad_norm": 0.33346548676490784, | |
| "learning_rate": 0.0004840606237248615, | |
| "loss": 3.4133, | |
| "step": 33250 | |
| }, | |
| { | |
| "epoch": 9.700011652295503, | |
| "grad_norm": 0.3496672511100769, | |
| "learning_rate": 0.0004838857475954532, | |
| "loss": 3.4026, | |
| "step": 33300 | |
| }, | |
| { | |
| "epoch": 9.71457702167327, | |
| "grad_norm": 0.3341895341873169, | |
| "learning_rate": 0.0004837108714660448, | |
| "loss": 3.4104, | |
| "step": 33350 | |
| }, | |
| { | |
| "epoch": 9.729142391051036, | |
| "grad_norm": 0.35880976915359497, | |
| "learning_rate": 0.0004835359953366365, | |
| "loss": 3.419, | |
| "step": 33400 | |
| }, | |
| { | |
| "epoch": 9.743707760428805, | |
| "grad_norm": 0.36180543899536133, | |
| "learning_rate": 0.00048336111920722815, | |
| "loss": 3.3987, | |
| "step": 33450 | |
| }, | |
| { | |
| "epoch": 9.758273129806572, | |
| "grad_norm": 0.33391448855400085, | |
| "learning_rate": 0.0004831862430778198, | |
| "loss": 3.4069, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 9.772838499184338, | |
| "grad_norm": 0.335786908864975, | |
| "learning_rate": 0.00048301136694841153, | |
| "loss": 3.4064, | |
| "step": 33550 | |
| }, | |
| { | |
| "epoch": 9.787403868562107, | |
| "grad_norm": 0.3425864279270172, | |
| "learning_rate": 0.00048283649081900317, | |
| "loss": 3.4015, | |
| "step": 33600 | |
| }, | |
| { | |
| "epoch": 9.801969237939874, | |
| "grad_norm": 0.3565329313278198, | |
| "learning_rate": 0.00048266161468959486, | |
| "loss": 3.4078, | |
| "step": 33650 | |
| }, | |
| { | |
| "epoch": 9.816534607317642, | |
| "grad_norm": 0.3550211489200592, | |
| "learning_rate": 0.0004824867385601865, | |
| "loss": 3.3977, | |
| "step": 33700 | |
| }, | |
| { | |
| "epoch": 9.831099976695409, | |
| "grad_norm": 0.33708900213241577, | |
| "learning_rate": 0.00048231186243077813, | |
| "loss": 3.4104, | |
| "step": 33750 | |
| }, | |
| { | |
| "epoch": 9.845665346073176, | |
| "grad_norm": 0.3294453024864197, | |
| "learning_rate": 0.0004821369863013698, | |
| "loss": 3.4108, | |
| "step": 33800 | |
| }, | |
| { | |
| "epoch": 9.860230715450944, | |
| "grad_norm": 0.373319536447525, | |
| "learning_rate": 0.00048196211017196146, | |
| "loss": 3.4074, | |
| "step": 33850 | |
| }, | |
| { | |
| "epoch": 9.874796084828711, | |
| "grad_norm": 0.34141167998313904, | |
| "learning_rate": 0.00048178723404255315, | |
| "loss": 3.4147, | |
| "step": 33900 | |
| }, | |
| { | |
| "epoch": 9.88936145420648, | |
| "grad_norm": 0.35247910022735596, | |
| "learning_rate": 0.0004816123579131448, | |
| "loss": 3.4067, | |
| "step": 33950 | |
| }, | |
| { | |
| "epoch": 9.903926823584246, | |
| "grad_norm": 0.3380388021469116, | |
| "learning_rate": 0.0004814374817837364, | |
| "loss": 3.4035, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 9.903926823584246, | |
| "eval_accuracy": 0.3680470325709136, | |
| "eval_loss": 3.5645980834960938, | |
| "eval_runtime": 179.6385, | |
| "eval_samples_per_second": 92.658, | |
| "eval_steps_per_second": 5.795, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 9.918492192962013, | |
| "grad_norm": 0.33066990971565247, | |
| "learning_rate": 0.00048126260565432816, | |
| "loss": 3.4149, | |
| "step": 34050 | |
| }, | |
| { | |
| "epoch": 9.933057562339782, | |
| "grad_norm": 0.36766499280929565, | |
| "learning_rate": 0.0004810877295249198, | |
| "loss": 3.4174, | |
| "step": 34100 | |
| }, | |
| { | |
| "epoch": 9.947622931717548, | |
| "grad_norm": 0.32511964440345764, | |
| "learning_rate": 0.0004809128533955115, | |
| "loss": 3.4018, | |
| "step": 34150 | |
| }, | |
| { | |
| "epoch": 9.962188301095315, | |
| "grad_norm": 0.3310585618019104, | |
| "learning_rate": 0.0004807379772661031, | |
| "loss": 3.4235, | |
| "step": 34200 | |
| }, | |
| { | |
| "epoch": 9.976753670473084, | |
| "grad_norm": 0.34995537996292114, | |
| "learning_rate": 0.0004805631011366948, | |
| "loss": 3.405, | |
| "step": 34250 | |
| }, | |
| { | |
| "epoch": 9.99131903985085, | |
| "grad_norm": 0.3458772301673889, | |
| "learning_rate": 0.00048038822500728645, | |
| "loss": 3.4196, | |
| "step": 34300 | |
| }, | |
| { | |
| "epoch": 10.005826147751106, | |
| "grad_norm": 0.3567534387111664, | |
| "learning_rate": 0.0004802133488778781, | |
| "loss": 3.3754, | |
| "step": 34350 | |
| }, | |
| { | |
| "epoch": 10.020391517128875, | |
| "grad_norm": 0.34907981753349304, | |
| "learning_rate": 0.0004800384727484698, | |
| "loss": 3.3042, | |
| "step": 34400 | |
| }, | |
| { | |
| "epoch": 10.034956886506642, | |
| "grad_norm": 0.36751100420951843, | |
| "learning_rate": 0.0004798635966190614, | |
| "loss": 3.3046, | |
| "step": 34450 | |
| }, | |
| { | |
| "epoch": 10.049522255884408, | |
| "grad_norm": 0.3411542773246765, | |
| "learning_rate": 0.00047968872048965316, | |
| "loss": 3.3093, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 10.064087625262177, | |
| "grad_norm": 0.35095176100730896, | |
| "learning_rate": 0.0004795138443602448, | |
| "loss": 3.3179, | |
| "step": 34550 | |
| }, | |
| { | |
| "epoch": 10.078652994639944, | |
| "grad_norm": 0.34343641996383667, | |
| "learning_rate": 0.00047933896823083643, | |
| "loss": 3.3087, | |
| "step": 34600 | |
| }, | |
| { | |
| "epoch": 10.093218364017712, | |
| "grad_norm": 0.39370444416999817, | |
| "learning_rate": 0.0004791640921014281, | |
| "loss": 3.3124, | |
| "step": 34650 | |
| }, | |
| { | |
| "epoch": 10.107783733395479, | |
| "grad_norm": 0.36577534675598145, | |
| "learning_rate": 0.00047898921597201976, | |
| "loss": 3.3235, | |
| "step": 34700 | |
| }, | |
| { | |
| "epoch": 10.122349102773246, | |
| "grad_norm": 0.38264644145965576, | |
| "learning_rate": 0.00047881433984261145, | |
| "loss": 3.3297, | |
| "step": 34750 | |
| }, | |
| { | |
| "epoch": 10.136914472151014, | |
| "grad_norm": 0.3537810742855072, | |
| "learning_rate": 0.0004786394637132031, | |
| "loss": 3.3233, | |
| "step": 34800 | |
| }, | |
| { | |
| "epoch": 10.151479841528781, | |
| "grad_norm": 0.36147990822792053, | |
| "learning_rate": 0.0004784645875837948, | |
| "loss": 3.3265, | |
| "step": 34850 | |
| }, | |
| { | |
| "epoch": 10.166045210906548, | |
| "grad_norm": 0.36201244592666626, | |
| "learning_rate": 0.0004782897114543864, | |
| "loss": 3.328, | |
| "step": 34900 | |
| }, | |
| { | |
| "epoch": 10.180610580284316, | |
| "grad_norm": 0.3463992178440094, | |
| "learning_rate": 0.00047811483532497805, | |
| "loss": 3.3514, | |
| "step": 34950 | |
| }, | |
| { | |
| "epoch": 10.195175949662083, | |
| "grad_norm": 0.3376941382884979, | |
| "learning_rate": 0.0004779399591955698, | |
| "loss": 3.3507, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 10.195175949662083, | |
| "eval_accuracy": 0.3683125048864894, | |
| "eval_loss": 3.574247121810913, | |
| "eval_runtime": 179.5052, | |
| "eval_samples_per_second": 92.727, | |
| "eval_steps_per_second": 5.799, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 10.209741319039852, | |
| "grad_norm": 0.3373648226261139, | |
| "learning_rate": 0.00047776508306616143, | |
| "loss": 3.339, | |
| "step": 35050 | |
| }, | |
| { | |
| "epoch": 10.224306688417618, | |
| "grad_norm": 0.34261074662208557, | |
| "learning_rate": 0.0004775902069367531, | |
| "loss": 3.344, | |
| "step": 35100 | |
| }, | |
| { | |
| "epoch": 10.238872057795385, | |
| "grad_norm": 0.3796720802783966, | |
| "learning_rate": 0.00047741533080734476, | |
| "loss": 3.3424, | |
| "step": 35150 | |
| }, | |
| { | |
| "epoch": 10.253437427173154, | |
| "grad_norm": 0.36016082763671875, | |
| "learning_rate": 0.0004772404546779364, | |
| "loss": 3.3428, | |
| "step": 35200 | |
| }, | |
| { | |
| "epoch": 10.26800279655092, | |
| "grad_norm": 0.350201815366745, | |
| "learning_rate": 0.0004770655785485281, | |
| "loss": 3.3538, | |
| "step": 35250 | |
| }, | |
| { | |
| "epoch": 10.282568165928687, | |
| "grad_norm": 0.3484781086444855, | |
| "learning_rate": 0.0004768907024191197, | |
| "loss": 3.3387, | |
| "step": 35300 | |
| }, | |
| { | |
| "epoch": 10.297133535306456, | |
| "grad_norm": 0.33689433336257935, | |
| "learning_rate": 0.0004767158262897114, | |
| "loss": 3.3516, | |
| "step": 35350 | |
| }, | |
| { | |
| "epoch": 10.311698904684222, | |
| "grad_norm": 0.3487689793109894, | |
| "learning_rate": 0.00047654095016030305, | |
| "loss": 3.346, | |
| "step": 35400 | |
| }, | |
| { | |
| "epoch": 10.326264274061991, | |
| "grad_norm": 0.34821704030036926, | |
| "learning_rate": 0.0004763660740308948, | |
| "loss": 3.3566, | |
| "step": 35450 | |
| }, | |
| { | |
| "epoch": 10.340829643439758, | |
| "grad_norm": 0.3616722524166107, | |
| "learning_rate": 0.0004761911979014864, | |
| "loss": 3.3588, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 10.355395012817525, | |
| "grad_norm": 0.390299528837204, | |
| "learning_rate": 0.00047601632177207806, | |
| "loss": 3.3664, | |
| "step": 35550 | |
| }, | |
| { | |
| "epoch": 10.369960382195293, | |
| "grad_norm": 0.35546815395355225, | |
| "learning_rate": 0.00047584144564266975, | |
| "loss": 3.3594, | |
| "step": 35600 | |
| }, | |
| { | |
| "epoch": 10.38452575157306, | |
| "grad_norm": 0.35586267709732056, | |
| "learning_rate": 0.0004756665695132614, | |
| "loss": 3.3715, | |
| "step": 35650 | |
| }, | |
| { | |
| "epoch": 10.399091120950827, | |
| "grad_norm": 0.3343265950679779, | |
| "learning_rate": 0.0004754916933838531, | |
| "loss": 3.3525, | |
| "step": 35700 | |
| }, | |
| { | |
| "epoch": 10.413656490328595, | |
| "grad_norm": 0.350169837474823, | |
| "learning_rate": 0.0004753168172544447, | |
| "loss": 3.3714, | |
| "step": 35750 | |
| }, | |
| { | |
| "epoch": 10.428221859706362, | |
| "grad_norm": 0.347483366727829, | |
| "learning_rate": 0.00047514194112503635, | |
| "loss": 3.3678, | |
| "step": 35800 | |
| }, | |
| { | |
| "epoch": 10.44278722908413, | |
| "grad_norm": 0.3664875328540802, | |
| "learning_rate": 0.00047496706499562804, | |
| "loss": 3.3738, | |
| "step": 35850 | |
| }, | |
| { | |
| "epoch": 10.457352598461897, | |
| "grad_norm": 0.35261833667755127, | |
| "learning_rate": 0.0004747921888662197, | |
| "loss": 3.374, | |
| "step": 35900 | |
| }, | |
| { | |
| "epoch": 10.471917967839664, | |
| "grad_norm": 0.3594660758972168, | |
| "learning_rate": 0.0004746173127368114, | |
| "loss": 3.3644, | |
| "step": 35950 | |
| }, | |
| { | |
| "epoch": 10.486483337217432, | |
| "grad_norm": 0.3424801528453827, | |
| "learning_rate": 0.00047444243660740306, | |
| "loss": 3.3689, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 10.486483337217432, | |
| "eval_accuracy": 0.36841914057746694, | |
| "eval_loss": 3.5697360038757324, | |
| "eval_runtime": 179.5198, | |
| "eval_samples_per_second": 92.72, | |
| "eval_steps_per_second": 5.799, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 10.5010487065952, | |
| "grad_norm": 0.3455258905887604, | |
| "learning_rate": 0.0004742675604779947, | |
| "loss": 3.3751, | |
| "step": 36050 | |
| }, | |
| { | |
| "epoch": 10.515614075972966, | |
| "grad_norm": 0.3722718358039856, | |
| "learning_rate": 0.0004740926843485864, | |
| "loss": 3.3761, | |
| "step": 36100 | |
| }, | |
| { | |
| "epoch": 10.530179445350734, | |
| "grad_norm": 0.3568621575832367, | |
| "learning_rate": 0.000473917808219178, | |
| "loss": 3.3765, | |
| "step": 36150 | |
| }, | |
| { | |
| "epoch": 10.544744814728501, | |
| "grad_norm": 0.3658199906349182, | |
| "learning_rate": 0.0004737429320897697, | |
| "loss": 3.3724, | |
| "step": 36200 | |
| }, | |
| { | |
| "epoch": 10.55931018410627, | |
| "grad_norm": 0.34567561745643616, | |
| "learning_rate": 0.00047356805596036135, | |
| "loss": 3.3713, | |
| "step": 36250 | |
| }, | |
| { | |
| "epoch": 10.573875553484037, | |
| "grad_norm": 0.3556523621082306, | |
| "learning_rate": 0.00047339317983095304, | |
| "loss": 3.3746, | |
| "step": 36300 | |
| }, | |
| { | |
| "epoch": 10.588440922861803, | |
| "grad_norm": 0.3559434115886688, | |
| "learning_rate": 0.0004732183037015447, | |
| "loss": 3.3639, | |
| "step": 36350 | |
| }, | |
| { | |
| "epoch": 10.603006292239572, | |
| "grad_norm": 0.36187922954559326, | |
| "learning_rate": 0.0004730434275721363, | |
| "loss": 3.3817, | |
| "step": 36400 | |
| }, | |
| { | |
| "epoch": 10.617571661617339, | |
| "grad_norm": 0.3516775369644165, | |
| "learning_rate": 0.00047286855144272806, | |
| "loss": 3.3719, | |
| "step": 36450 | |
| }, | |
| { | |
| "epoch": 10.632137030995105, | |
| "grad_norm": 0.34939226508140564, | |
| "learning_rate": 0.0004726936753133197, | |
| "loss": 3.3685, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 10.646702400372874, | |
| "grad_norm": 0.35714349150657654, | |
| "learning_rate": 0.0004725187991839114, | |
| "loss": 3.3735, | |
| "step": 36550 | |
| }, | |
| { | |
| "epoch": 10.66126776975064, | |
| "grad_norm": 0.38368478417396545, | |
| "learning_rate": 0.000472343923054503, | |
| "loss": 3.3903, | |
| "step": 36600 | |
| }, | |
| { | |
| "epoch": 10.675833139128407, | |
| "grad_norm": 0.3668145537376404, | |
| "learning_rate": 0.00047216904692509465, | |
| "loss": 3.389, | |
| "step": 36650 | |
| }, | |
| { | |
| "epoch": 10.690398508506176, | |
| "grad_norm": 0.3444463908672333, | |
| "learning_rate": 0.00047199417079568634, | |
| "loss": 3.3861, | |
| "step": 36700 | |
| }, | |
| { | |
| "epoch": 10.704963877883943, | |
| "grad_norm": 0.36496496200561523, | |
| "learning_rate": 0.000471819294666278, | |
| "loss": 3.3841, | |
| "step": 36750 | |
| }, | |
| { | |
| "epoch": 10.719529247261711, | |
| "grad_norm": 0.37470269203186035, | |
| "learning_rate": 0.00047164441853686967, | |
| "loss": 3.384, | |
| "step": 36800 | |
| }, | |
| { | |
| "epoch": 10.734094616639478, | |
| "grad_norm": 0.379410982131958, | |
| "learning_rate": 0.0004714695424074613, | |
| "loss": 3.376, | |
| "step": 36850 | |
| }, | |
| { | |
| "epoch": 10.748659986017245, | |
| "grad_norm": 0.34845808148384094, | |
| "learning_rate": 0.00047129466627805305, | |
| "loss": 3.386, | |
| "step": 36900 | |
| }, | |
| { | |
| "epoch": 10.763225355395013, | |
| "grad_norm": 0.33667710423469543, | |
| "learning_rate": 0.0004711197901486447, | |
| "loss": 3.3802, | |
| "step": 36950 | |
| }, | |
| { | |
| "epoch": 10.77779072477278, | |
| "grad_norm": 0.33371374011039734, | |
| "learning_rate": 0.0004709449140192363, | |
| "loss": 3.379, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 10.77779072477278, | |
| "eval_accuracy": 0.36916417957826586, | |
| "eval_loss": 3.5598151683807373, | |
| "eval_runtime": 179.634, | |
| "eval_samples_per_second": 92.661, | |
| "eval_steps_per_second": 5.795, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 10.792356094150549, | |
| "grad_norm": 0.3444075584411621, | |
| "learning_rate": 0.000470770037889828, | |
| "loss": 3.3856, | |
| "step": 37050 | |
| }, | |
| { | |
| "epoch": 10.806921463528315, | |
| "grad_norm": 0.34622007608413696, | |
| "learning_rate": 0.00047059516176041965, | |
| "loss": 3.3919, | |
| "step": 37100 | |
| }, | |
| { | |
| "epoch": 10.821486832906082, | |
| "grad_norm": 0.39845308661460876, | |
| "learning_rate": 0.00047042028563101134, | |
| "loss": 3.3956, | |
| "step": 37150 | |
| }, | |
| { | |
| "epoch": 10.83605220228385, | |
| "grad_norm": 0.36669787764549255, | |
| "learning_rate": 0.000470245409501603, | |
| "loss": 3.3887, | |
| "step": 37200 | |
| }, | |
| { | |
| "epoch": 10.850617571661617, | |
| "grad_norm": 0.34656545519828796, | |
| "learning_rate": 0.0004700705333721946, | |
| "loss": 3.4005, | |
| "step": 37250 | |
| }, | |
| { | |
| "epoch": 10.865182941039384, | |
| "grad_norm": 0.35354891419410706, | |
| "learning_rate": 0.0004698956572427863, | |
| "loss": 3.3945, | |
| "step": 37300 | |
| }, | |
| { | |
| "epoch": 10.879748310417153, | |
| "grad_norm": 0.33106672763824463, | |
| "learning_rate": 0.00046972078111337794, | |
| "loss": 3.3908, | |
| "step": 37350 | |
| }, | |
| { | |
| "epoch": 10.89431367979492, | |
| "grad_norm": 0.3322337567806244, | |
| "learning_rate": 0.0004695459049839697, | |
| "loss": 3.3792, | |
| "step": 37400 | |
| }, | |
| { | |
| "epoch": 10.908879049172686, | |
| "grad_norm": 0.35176587104797363, | |
| "learning_rate": 0.0004693710288545613, | |
| "loss": 3.3877, | |
| "step": 37450 | |
| }, | |
| { | |
| "epoch": 10.923444418550455, | |
| "grad_norm": 0.34353724122047424, | |
| "learning_rate": 0.000469196152725153, | |
| "loss": 3.3928, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 10.938009787928221, | |
| "grad_norm": 0.36018845438957214, | |
| "learning_rate": 0.00046902127659574465, | |
| "loss": 3.3907, | |
| "step": 37550 | |
| }, | |
| { | |
| "epoch": 10.95257515730599, | |
| "grad_norm": 0.3528198003768921, | |
| "learning_rate": 0.0004688464004663363, | |
| "loss": 3.3962, | |
| "step": 37600 | |
| }, | |
| { | |
| "epoch": 10.967140526683757, | |
| "grad_norm": 0.34510186314582825, | |
| "learning_rate": 0.000468671524336928, | |
| "loss": 3.3842, | |
| "step": 37650 | |
| }, | |
| { | |
| "epoch": 10.981705896061523, | |
| "grad_norm": 0.3542734384536743, | |
| "learning_rate": 0.0004684966482075196, | |
| "loss": 3.4001, | |
| "step": 37700 | |
| }, | |
| { | |
| "epoch": 10.996271265439292, | |
| "grad_norm": 0.3544139266014099, | |
| "learning_rate": 0.0004683217720781113, | |
| "loss": 3.3989, | |
| "step": 37750 | |
| }, | |
| { | |
| "epoch": 11.010778373339548, | |
| "grad_norm": 0.35023751854896545, | |
| "learning_rate": 0.00046814689594870294, | |
| "loss": 3.3096, | |
| "step": 37800 | |
| }, | |
| { | |
| "epoch": 11.025343742717315, | |
| "grad_norm": 0.36776822805404663, | |
| "learning_rate": 0.0004679720198192946, | |
| "loss": 3.2977, | |
| "step": 37850 | |
| }, | |
| { | |
| "epoch": 11.039909112095083, | |
| "grad_norm": 0.3768557608127594, | |
| "learning_rate": 0.0004677971436898863, | |
| "loss": 3.2838, | |
| "step": 37900 | |
| }, | |
| { | |
| "epoch": 11.05447448147285, | |
| "grad_norm": 0.34867051243782043, | |
| "learning_rate": 0.00046762226756047795, | |
| "loss": 3.2998, | |
| "step": 37950 | |
| }, | |
| { | |
| "epoch": 11.069039850850617, | |
| "grad_norm": 0.3468160331249237, | |
| "learning_rate": 0.00046744739143106964, | |
| "loss": 3.2905, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 11.069039850850617, | |
| "eval_accuracy": 0.368822286976984, | |
| "eval_loss": 3.57218337059021, | |
| "eval_runtime": 179.7316, | |
| "eval_samples_per_second": 92.61, | |
| "eval_steps_per_second": 5.792, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 11.083605220228385, | |
| "grad_norm": 0.35249587893486023, | |
| "learning_rate": 0.0004672725153016613, | |
| "loss": 3.3018, | |
| "step": 38050 | |
| }, | |
| { | |
| "epoch": 11.098170589606152, | |
| "grad_norm": 0.3808625340461731, | |
| "learning_rate": 0.00046709763917225297, | |
| "loss": 3.297, | |
| "step": 38100 | |
| }, | |
| { | |
| "epoch": 11.11273595898392, | |
| "grad_norm": 0.3879368007183075, | |
| "learning_rate": 0.0004669227630428446, | |
| "loss": 3.3146, | |
| "step": 38150 | |
| }, | |
| { | |
| "epoch": 11.127301328361687, | |
| "grad_norm": 0.34145820140838623, | |
| "learning_rate": 0.00046674788691343624, | |
| "loss": 3.3006, | |
| "step": 38200 | |
| }, | |
| { | |
| "epoch": 11.141866697739454, | |
| "grad_norm": 0.3693101108074188, | |
| "learning_rate": 0.00046657301078402793, | |
| "loss": 3.3096, | |
| "step": 38250 | |
| }, | |
| { | |
| "epoch": 11.156432067117223, | |
| "grad_norm": 0.3697426915168762, | |
| "learning_rate": 0.00046639813465461957, | |
| "loss": 3.3111, | |
| "step": 38300 | |
| }, | |
| { | |
| "epoch": 11.17099743649499, | |
| "grad_norm": 0.37008944153785706, | |
| "learning_rate": 0.0004662232585252113, | |
| "loss": 3.3214, | |
| "step": 38350 | |
| }, | |
| { | |
| "epoch": 11.185562805872756, | |
| "grad_norm": 0.3367568254470825, | |
| "learning_rate": 0.00046604838239580295, | |
| "loss": 3.3141, | |
| "step": 38400 | |
| }, | |
| { | |
| "epoch": 11.200128175250525, | |
| "grad_norm": 0.36272698640823364, | |
| "learning_rate": 0.0004658735062663946, | |
| "loss": 3.3173, | |
| "step": 38450 | |
| }, | |
| { | |
| "epoch": 11.214693544628291, | |
| "grad_norm": 0.35574871301651, | |
| "learning_rate": 0.0004656986301369863, | |
| "loss": 3.3162, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 11.22925891400606, | |
| "grad_norm": 0.3292505443096161, | |
| "learning_rate": 0.0004655237540075779, | |
| "loss": 3.3112, | |
| "step": 38550 | |
| }, | |
| { | |
| "epoch": 11.243824283383827, | |
| "grad_norm": 0.36970898509025574, | |
| "learning_rate": 0.0004653488778781696, | |
| "loss": 3.3333, | |
| "step": 38600 | |
| }, | |
| { | |
| "epoch": 11.258389652761593, | |
| "grad_norm": 0.3610363304615021, | |
| "learning_rate": 0.00046517400174876124, | |
| "loss": 3.3249, | |
| "step": 38650 | |
| }, | |
| { | |
| "epoch": 11.272955022139362, | |
| "grad_norm": 0.38531872630119324, | |
| "learning_rate": 0.0004649991256193529, | |
| "loss": 3.3284, | |
| "step": 38700 | |
| }, | |
| { | |
| "epoch": 11.287520391517129, | |
| "grad_norm": 0.37316545844078064, | |
| "learning_rate": 0.00046482424948994457, | |
| "loss": 3.3431, | |
| "step": 38750 | |
| }, | |
| { | |
| "epoch": 11.302085760894895, | |
| "grad_norm": 0.4000358581542969, | |
| "learning_rate": 0.0004646493733605362, | |
| "loss": 3.3247, | |
| "step": 38800 | |
| }, | |
| { | |
| "epoch": 11.316651130272664, | |
| "grad_norm": 0.3880660831928253, | |
| "learning_rate": 0.00046447449723112795, | |
| "loss": 3.337, | |
| "step": 38850 | |
| }, | |
| { | |
| "epoch": 11.33121649965043, | |
| "grad_norm": 0.38193467259407043, | |
| "learning_rate": 0.0004642996211017196, | |
| "loss": 3.3412, | |
| "step": 38900 | |
| }, | |
| { | |
| "epoch": 11.3457818690282, | |
| "grad_norm": 0.3892691433429718, | |
| "learning_rate": 0.0004641247449723113, | |
| "loss": 3.3442, | |
| "step": 38950 | |
| }, | |
| { | |
| "epoch": 11.360347238405966, | |
| "grad_norm": 0.3644644618034363, | |
| "learning_rate": 0.0004639498688429029, | |
| "loss": 3.3332, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 11.360347238405966, | |
| "eval_accuracy": 0.36906248181344165, | |
| "eval_loss": 3.567659616470337, | |
| "eval_runtime": 179.7805, | |
| "eval_samples_per_second": 92.585, | |
| "eval_steps_per_second": 5.79, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 11.374912607783733, | |
| "grad_norm": 0.38239893317222595, | |
| "learning_rate": 0.00046377499271349455, | |
| "loss": 3.334, | |
| "step": 39050 | |
| }, | |
| { | |
| "epoch": 11.389477977161501, | |
| "grad_norm": 0.34430813789367676, | |
| "learning_rate": 0.00046360011658408624, | |
| "loss": 3.3331, | |
| "step": 39100 | |
| }, | |
| { | |
| "epoch": 11.404043346539268, | |
| "grad_norm": 0.3366773724555969, | |
| "learning_rate": 0.00046342524045467787, | |
| "loss": 3.3524, | |
| "step": 39150 | |
| }, | |
| { | |
| "epoch": 11.418608715917035, | |
| "grad_norm": 0.3577640950679779, | |
| "learning_rate": 0.00046325036432526956, | |
| "loss": 3.3418, | |
| "step": 39200 | |
| }, | |
| { | |
| "epoch": 11.433174085294803, | |
| "grad_norm": 0.3685474395751953, | |
| "learning_rate": 0.0004630754881958612, | |
| "loss": 3.348, | |
| "step": 39250 | |
| }, | |
| { | |
| "epoch": 11.44773945467257, | |
| "grad_norm": 0.36754393577575684, | |
| "learning_rate": 0.00046290061206645284, | |
| "loss": 3.3386, | |
| "step": 39300 | |
| }, | |
| { | |
| "epoch": 11.462304824050339, | |
| "grad_norm": 0.36672037839889526, | |
| "learning_rate": 0.0004627257359370446, | |
| "loss": 3.3531, | |
| "step": 39350 | |
| }, | |
| { | |
| "epoch": 11.476870193428105, | |
| "grad_norm": 0.3429676592350006, | |
| "learning_rate": 0.0004625508598076362, | |
| "loss": 3.3398, | |
| "step": 39400 | |
| }, | |
| { | |
| "epoch": 11.491435562805872, | |
| "grad_norm": 0.40767839550971985, | |
| "learning_rate": 0.0004623759836782279, | |
| "loss": 3.3367, | |
| "step": 39450 | |
| }, | |
| { | |
| "epoch": 11.50600093218364, | |
| "grad_norm": 0.3494288921356201, | |
| "learning_rate": 0.00046220110754881954, | |
| "loss": 3.3617, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 11.520566301561407, | |
| "grad_norm": 0.35878896713256836, | |
| "learning_rate": 0.00046202623141941123, | |
| "loss": 3.354, | |
| "step": 39550 | |
| }, | |
| { | |
| "epoch": 11.535131670939174, | |
| "grad_norm": 0.3389423191547394, | |
| "learning_rate": 0.00046185135529000287, | |
| "loss": 3.3341, | |
| "step": 39600 | |
| }, | |
| { | |
| "epoch": 11.549697040316943, | |
| "grad_norm": 0.35176849365234375, | |
| "learning_rate": 0.0004616764791605945, | |
| "loss": 3.3581, | |
| "step": 39650 | |
| }, | |
| { | |
| "epoch": 11.56426240969471, | |
| "grad_norm": 0.33921658992767334, | |
| "learning_rate": 0.0004615016030311862, | |
| "loss": 3.3701, | |
| "step": 39700 | |
| }, | |
| { | |
| "epoch": 11.578827779072478, | |
| "grad_norm": 0.35132601857185364, | |
| "learning_rate": 0.00046132672690177783, | |
| "loss": 3.3563, | |
| "step": 39750 | |
| }, | |
| { | |
| "epoch": 11.593393148450245, | |
| "grad_norm": 0.34590718150138855, | |
| "learning_rate": 0.0004611518507723696, | |
| "loss": 3.3562, | |
| "step": 39800 | |
| }, | |
| { | |
| "epoch": 11.607958517828012, | |
| "grad_norm": 0.3639914393424988, | |
| "learning_rate": 0.0004609769746429612, | |
| "loss": 3.36, | |
| "step": 39850 | |
| }, | |
| { | |
| "epoch": 11.62252388720578, | |
| "grad_norm": 0.3800894021987915, | |
| "learning_rate": 0.00046080209851355285, | |
| "loss": 3.3671, | |
| "step": 39900 | |
| }, | |
| { | |
| "epoch": 11.637089256583547, | |
| "grad_norm": 0.3536291718482971, | |
| "learning_rate": 0.00046062722238414454, | |
| "loss": 3.3553, | |
| "step": 39950 | |
| }, | |
| { | |
| "epoch": 11.651654625961314, | |
| "grad_norm": 0.35299673676490784, | |
| "learning_rate": 0.0004604523462547362, | |
| "loss": 3.3602, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 11.651654625961314, | |
| "eval_accuracy": 0.3690838794934393, | |
| "eval_loss": 3.560605764389038, | |
| "eval_runtime": 179.7311, | |
| "eval_samples_per_second": 92.611, | |
| "eval_steps_per_second": 5.792, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 11.666219995339082, | |
| "grad_norm": 0.37060144543647766, | |
| "learning_rate": 0.00046027747012532787, | |
| "loss": 3.3573, | |
| "step": 40050 | |
| }, | |
| { | |
| "epoch": 11.680785364716849, | |
| "grad_norm": 0.4053195118904114, | |
| "learning_rate": 0.0004601025939959195, | |
| "loss": 3.3733, | |
| "step": 40100 | |
| }, | |
| { | |
| "epoch": 11.695350734094617, | |
| "grad_norm": 0.3602805733680725, | |
| "learning_rate": 0.0004599277178665112, | |
| "loss": 3.3565, | |
| "step": 40150 | |
| }, | |
| { | |
| "epoch": 11.709916103472384, | |
| "grad_norm": 0.3374777138233185, | |
| "learning_rate": 0.00045975284173710283, | |
| "loss": 3.357, | |
| "step": 40200 | |
| }, | |
| { | |
| "epoch": 11.724481472850151, | |
| "grad_norm": 0.369104266166687, | |
| "learning_rate": 0.00045957796560769446, | |
| "loss": 3.371, | |
| "step": 40250 | |
| }, | |
| { | |
| "epoch": 11.73904684222792, | |
| "grad_norm": 0.3783121407032013, | |
| "learning_rate": 0.0004594030894782862, | |
| "loss": 3.3587, | |
| "step": 40300 | |
| }, | |
| { | |
| "epoch": 11.753612211605686, | |
| "grad_norm": 0.3741343021392822, | |
| "learning_rate": 0.00045922821334887785, | |
| "loss": 3.3759, | |
| "step": 40350 | |
| }, | |
| { | |
| "epoch": 11.768177580983453, | |
| "grad_norm": 0.40091201663017273, | |
| "learning_rate": 0.00045905333721946954, | |
| "loss": 3.363, | |
| "step": 40400 | |
| }, | |
| { | |
| "epoch": 11.782742950361222, | |
| "grad_norm": 0.3418520390987396, | |
| "learning_rate": 0.00045887846109006117, | |
| "loss": 3.3579, | |
| "step": 40450 | |
| }, | |
| { | |
| "epoch": 11.797308319738988, | |
| "grad_norm": 0.328216016292572, | |
| "learning_rate": 0.0004587035849606528, | |
| "loss": 3.368, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 11.811873689116755, | |
| "grad_norm": 0.3601894676685333, | |
| "learning_rate": 0.0004585287088312445, | |
| "loss": 3.3796, | |
| "step": 40550 | |
| }, | |
| { | |
| "epoch": 11.826439058494524, | |
| "grad_norm": 0.3456202447414398, | |
| "learning_rate": 0.00045835383270183613, | |
| "loss": 3.3721, | |
| "step": 40600 | |
| }, | |
| { | |
| "epoch": 11.84100442787229, | |
| "grad_norm": 0.3935994803905487, | |
| "learning_rate": 0.0004581789565724278, | |
| "loss": 3.3641, | |
| "step": 40650 | |
| }, | |
| { | |
| "epoch": 11.855569797250059, | |
| "grad_norm": 0.38910239934921265, | |
| "learning_rate": 0.00045800408044301946, | |
| "loss": 3.3686, | |
| "step": 40700 | |
| }, | |
| { | |
| "epoch": 11.870135166627826, | |
| "grad_norm": 0.37248024344444275, | |
| "learning_rate": 0.0004578292043136111, | |
| "loss": 3.3777, | |
| "step": 40750 | |
| }, | |
| { | |
| "epoch": 11.884700536005592, | |
| "grad_norm": 0.36761152744293213, | |
| "learning_rate": 0.00045765432818420284, | |
| "loss": 3.359, | |
| "step": 40800 | |
| }, | |
| { | |
| "epoch": 11.899265905383361, | |
| "grad_norm": 0.36878830194473267, | |
| "learning_rate": 0.0004574794520547945, | |
| "loss": 3.374, | |
| "step": 40850 | |
| }, | |
| { | |
| "epoch": 11.913831274761128, | |
| "grad_norm": 0.36202317476272583, | |
| "learning_rate": 0.00045730457592538617, | |
| "loss": 3.3762, | |
| "step": 40900 | |
| }, | |
| { | |
| "epoch": 11.928396644138896, | |
| "grad_norm": 0.37495362758636475, | |
| "learning_rate": 0.0004571296997959778, | |
| "loss": 3.3604, | |
| "step": 40950 | |
| }, | |
| { | |
| "epoch": 11.942962013516663, | |
| "grad_norm": 0.3607545495033264, | |
| "learning_rate": 0.0004569548236665695, | |
| "loss": 3.3818, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 11.942962013516663, | |
| "eval_accuracy": 0.37025463827045607, | |
| "eval_loss": 3.5513434410095215, | |
| "eval_runtime": 179.6988, | |
| "eval_samples_per_second": 92.627, | |
| "eval_steps_per_second": 5.793, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 11.95752738289443, | |
| "grad_norm": 0.3468360900878906, | |
| "learning_rate": 0.00045677994753716113, | |
| "loss": 3.3849, | |
| "step": 41050 | |
| }, | |
| { | |
| "epoch": 11.972092752272198, | |
| "grad_norm": 0.3485865294933319, | |
| "learning_rate": 0.00045660507140775277, | |
| "loss": 3.3785, | |
| "step": 41100 | |
| }, | |
| { | |
| "epoch": 11.986658121649965, | |
| "grad_norm": 0.34709101915359497, | |
| "learning_rate": 0.00045643019527834446, | |
| "loss": 3.3663, | |
| "step": 41150 | |
| }, | |
| { | |
| "epoch": 12.001165229550221, | |
| "grad_norm": 0.38278666138648987, | |
| "learning_rate": 0.0004562553191489361, | |
| "loss": 3.3766, | |
| "step": 41200 | |
| }, | |
| { | |
| "epoch": 12.01573059892799, | |
| "grad_norm": 0.34984317421913147, | |
| "learning_rate": 0.00045608044301952784, | |
| "loss": 3.2617, | |
| "step": 41250 | |
| }, | |
| { | |
| "epoch": 12.030295968305756, | |
| "grad_norm": 0.329671174287796, | |
| "learning_rate": 0.0004559055668901195, | |
| "loss": 3.265, | |
| "step": 41300 | |
| }, | |
| { | |
| "epoch": 12.044861337683523, | |
| "grad_norm": 0.377273291349411, | |
| "learning_rate": 0.0004557306907607111, | |
| "loss": 3.274, | |
| "step": 41350 | |
| }, | |
| { | |
| "epoch": 12.059426707061291, | |
| "grad_norm": 0.353500097990036, | |
| "learning_rate": 0.0004555558146313028, | |
| "loss": 3.2773, | |
| "step": 41400 | |
| }, | |
| { | |
| "epoch": 12.073992076439058, | |
| "grad_norm": 0.33617493510246277, | |
| "learning_rate": 0.00045538093850189444, | |
| "loss": 3.2767, | |
| "step": 41450 | |
| }, | |
| { | |
| "epoch": 12.088557445816827, | |
| "grad_norm": 0.37292763590812683, | |
| "learning_rate": 0.00045520606237248613, | |
| "loss": 3.2795, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 12.103122815194594, | |
| "grad_norm": 0.3755471706390381, | |
| "learning_rate": 0.00045503118624307776, | |
| "loss": 3.2793, | |
| "step": 41550 | |
| }, | |
| { | |
| "epoch": 12.11768818457236, | |
| "grad_norm": 0.4093469977378845, | |
| "learning_rate": 0.00045485631011366945, | |
| "loss": 3.2801, | |
| "step": 41600 | |
| }, | |
| { | |
| "epoch": 12.132253553950129, | |
| "grad_norm": 0.3457607924938202, | |
| "learning_rate": 0.0004546814339842611, | |
| "loss": 3.2882, | |
| "step": 41650 | |
| }, | |
| { | |
| "epoch": 12.146818923327896, | |
| "grad_norm": 0.37875697016716003, | |
| "learning_rate": 0.0004545065578548527, | |
| "loss": 3.2868, | |
| "step": 41700 | |
| }, | |
| { | |
| "epoch": 12.161384292705662, | |
| "grad_norm": 0.3649895489215851, | |
| "learning_rate": 0.00045433168172544447, | |
| "loss": 3.2959, | |
| "step": 41750 | |
| }, | |
| { | |
| "epoch": 12.17594966208343, | |
| "grad_norm": 0.391001433134079, | |
| "learning_rate": 0.0004541568055960361, | |
| "loss": 3.3045, | |
| "step": 41800 | |
| }, | |
| { | |
| "epoch": 12.190515031461198, | |
| "grad_norm": 0.3820909857749939, | |
| "learning_rate": 0.0004539819294666278, | |
| "loss": 3.3038, | |
| "step": 41850 | |
| }, | |
| { | |
| "epoch": 12.205080400838966, | |
| "grad_norm": 0.37390992045402527, | |
| "learning_rate": 0.00045380705333721943, | |
| "loss": 3.2909, | |
| "step": 41900 | |
| }, | |
| { | |
| "epoch": 12.219645770216733, | |
| "grad_norm": 0.38024622201919556, | |
| "learning_rate": 0.00045363217720781107, | |
| "loss": 3.3109, | |
| "step": 41950 | |
| }, | |
| { | |
| "epoch": 12.2342111395945, | |
| "grad_norm": 0.3541949391365051, | |
| "learning_rate": 0.00045345730107840276, | |
| "loss": 3.2924, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 12.2342111395945, | |
| "eval_accuracy": 0.3695281752775673, | |
| "eval_loss": 3.5650668144226074, | |
| "eval_runtime": 179.7999, | |
| "eval_samples_per_second": 92.575, | |
| "eval_steps_per_second": 5.79, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 12.248776508972268, | |
| "grad_norm": 0.35464340448379517, | |
| "learning_rate": 0.0004532824249489944, | |
| "loss": 3.3094, | |
| "step": 42050 | |
| }, | |
| { | |
| "epoch": 12.263341878350035, | |
| "grad_norm": 0.3588162362575531, | |
| "learning_rate": 0.0004531075488195861, | |
| "loss": 3.3044, | |
| "step": 42100 | |
| }, | |
| { | |
| "epoch": 12.277907247727802, | |
| "grad_norm": 0.35945770144462585, | |
| "learning_rate": 0.0004529326726901777, | |
| "loss": 3.3149, | |
| "step": 42150 | |
| }, | |
| { | |
| "epoch": 12.29247261710557, | |
| "grad_norm": 0.3775635063648224, | |
| "learning_rate": 0.00045275779656076947, | |
| "loss": 3.3133, | |
| "step": 42200 | |
| }, | |
| { | |
| "epoch": 12.307037986483337, | |
| "grad_norm": 0.36539244651794434, | |
| "learning_rate": 0.0004525829204313611, | |
| "loss": 3.3236, | |
| "step": 42250 | |
| }, | |
| { | |
| "epoch": 12.321603355861104, | |
| "grad_norm": 0.36154550313949585, | |
| "learning_rate": 0.00045240804430195274, | |
| "loss": 3.3229, | |
| "step": 42300 | |
| }, | |
| { | |
| "epoch": 12.336168725238872, | |
| "grad_norm": 0.36169642210006714, | |
| "learning_rate": 0.00045223316817254443, | |
| "loss": 3.3229, | |
| "step": 42350 | |
| }, | |
| { | |
| "epoch": 12.350734094616639, | |
| "grad_norm": 0.3900752067565918, | |
| "learning_rate": 0.00045205829204313607, | |
| "loss": 3.3163, | |
| "step": 42400 | |
| }, | |
| { | |
| "epoch": 12.365299463994408, | |
| "grad_norm": 0.36182570457458496, | |
| "learning_rate": 0.00045188341591372776, | |
| "loss": 3.3282, | |
| "step": 42450 | |
| }, | |
| { | |
| "epoch": 12.379864833372174, | |
| "grad_norm": 0.36705368757247925, | |
| "learning_rate": 0.0004517085397843194, | |
| "loss": 3.3301, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 12.394430202749941, | |
| "grad_norm": 0.35039055347442627, | |
| "learning_rate": 0.00045153366365491103, | |
| "loss": 3.3307, | |
| "step": 42550 | |
| }, | |
| { | |
| "epoch": 12.40899557212771, | |
| "grad_norm": 0.38655978441238403, | |
| "learning_rate": 0.0004513587875255027, | |
| "loss": 3.3337, | |
| "step": 42600 | |
| }, | |
| { | |
| "epoch": 12.423560941505476, | |
| "grad_norm": 0.34374991059303284, | |
| "learning_rate": 0.00045118391139609436, | |
| "loss": 3.3308, | |
| "step": 42650 | |
| }, | |
| { | |
| "epoch": 12.438126310883243, | |
| "grad_norm": 0.366639643907547, | |
| "learning_rate": 0.0004510090352666861, | |
| "loss": 3.3421, | |
| "step": 42700 | |
| }, | |
| { | |
| "epoch": 12.452691680261012, | |
| "grad_norm": 0.3672907054424286, | |
| "learning_rate": 0.00045083415913727774, | |
| "loss": 3.3184, | |
| "step": 42750 | |
| }, | |
| { | |
| "epoch": 12.467257049638778, | |
| "grad_norm": 0.36934059858322144, | |
| "learning_rate": 0.0004506592830078694, | |
| "loss": 3.3262, | |
| "step": 42800 | |
| }, | |
| { | |
| "epoch": 12.481822419016547, | |
| "grad_norm": 0.36116307973861694, | |
| "learning_rate": 0.00045048440687846106, | |
| "loss": 3.3253, | |
| "step": 42850 | |
| }, | |
| { | |
| "epoch": 12.496387788394314, | |
| "grad_norm": 0.3898486793041229, | |
| "learning_rate": 0.0004503095307490527, | |
| "loss": 3.3426, | |
| "step": 42900 | |
| }, | |
| { | |
| "epoch": 12.51095315777208, | |
| "grad_norm": 0.3628914952278137, | |
| "learning_rate": 0.0004501346546196444, | |
| "loss": 3.3299, | |
| "step": 42950 | |
| }, | |
| { | |
| "epoch": 12.525518527149849, | |
| "grad_norm": 0.3678210973739624, | |
| "learning_rate": 0.000449959778490236, | |
| "loss": 3.3339, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 12.525518527149849, | |
| "eval_accuracy": 0.3700859257935512, | |
| "eval_loss": 3.558652877807617, | |
| "eval_runtime": 179.6498, | |
| "eval_samples_per_second": 92.652, | |
| "eval_steps_per_second": 5.795, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 12.540083896527616, | |
| "grad_norm": 0.3682061433792114, | |
| "learning_rate": 0.0004497849023608277, | |
| "loss": 3.3348, | |
| "step": 43050 | |
| }, | |
| { | |
| "epoch": 12.554649265905383, | |
| "grad_norm": 0.3561127781867981, | |
| "learning_rate": 0.00044961002623141935, | |
| "loss": 3.332, | |
| "step": 43100 | |
| }, | |
| { | |
| "epoch": 12.569214635283151, | |
| "grad_norm": 0.36310920119285583, | |
| "learning_rate": 0.000449435150102011, | |
| "loss": 3.3311, | |
| "step": 43150 | |
| }, | |
| { | |
| "epoch": 12.583780004660918, | |
| "grad_norm": 0.378019779920578, | |
| "learning_rate": 0.00044926027397260273, | |
| "loss": 3.3486, | |
| "step": 43200 | |
| }, | |
| { | |
| "epoch": 12.598345374038686, | |
| "grad_norm": 0.35761335492134094, | |
| "learning_rate": 0.00044908539784319437, | |
| "loss": 3.3456, | |
| "step": 43250 | |
| }, | |
| { | |
| "epoch": 12.612910743416453, | |
| "grad_norm": 0.38079050183296204, | |
| "learning_rate": 0.00044891052171378606, | |
| "loss": 3.34, | |
| "step": 43300 | |
| }, | |
| { | |
| "epoch": 12.62747611279422, | |
| "grad_norm": 0.36224061250686646, | |
| "learning_rate": 0.0004487356455843777, | |
| "loss": 3.3454, | |
| "step": 43350 | |
| }, | |
| { | |
| "epoch": 12.642041482171988, | |
| "grad_norm": 0.3658839762210846, | |
| "learning_rate": 0.00044856076945496933, | |
| "loss": 3.3426, | |
| "step": 43400 | |
| }, | |
| { | |
| "epoch": 12.656606851549755, | |
| "grad_norm": 0.3703918755054474, | |
| "learning_rate": 0.000448385893325561, | |
| "loss": 3.3413, | |
| "step": 43450 | |
| }, | |
| { | |
| "epoch": 12.671172220927522, | |
| "grad_norm": 0.39403098821640015, | |
| "learning_rate": 0.00044821101719615266, | |
| "loss": 3.3435, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 12.68573759030529, | |
| "grad_norm": 0.3617226481437683, | |
| "learning_rate": 0.00044803614106674435, | |
| "loss": 3.3455, | |
| "step": 43550 | |
| }, | |
| { | |
| "epoch": 12.700302959683057, | |
| "grad_norm": 0.38985195755958557, | |
| "learning_rate": 0.000447861264937336, | |
| "loss": 3.347, | |
| "step": 43600 | |
| }, | |
| { | |
| "epoch": 12.714868329060826, | |
| "grad_norm": 0.3814113438129425, | |
| "learning_rate": 0.00044768638880792773, | |
| "loss": 3.345, | |
| "step": 43650 | |
| }, | |
| { | |
| "epoch": 12.729433698438593, | |
| "grad_norm": 0.33327075839042664, | |
| "learning_rate": 0.00044751151267851937, | |
| "loss": 3.3439, | |
| "step": 43700 | |
| }, | |
| { | |
| "epoch": 12.74399906781636, | |
| "grad_norm": 0.39450183510780334, | |
| "learning_rate": 0.000447336636549111, | |
| "loss": 3.354, | |
| "step": 43750 | |
| }, | |
| { | |
| "epoch": 12.758564437194128, | |
| "grad_norm": 0.34784385561943054, | |
| "learning_rate": 0.0004471617604197027, | |
| "loss": 3.3438, | |
| "step": 43800 | |
| }, | |
| { | |
| "epoch": 12.773129806571895, | |
| "grad_norm": 0.3895801305770874, | |
| "learning_rate": 0.00044698688429029433, | |
| "loss": 3.3385, | |
| "step": 43850 | |
| }, | |
| { | |
| "epoch": 12.787695175949661, | |
| "grad_norm": 0.37325412034988403, | |
| "learning_rate": 0.000446812008160886, | |
| "loss": 3.3382, | |
| "step": 43900 | |
| }, | |
| { | |
| "epoch": 12.80226054532743, | |
| "grad_norm": 0.3635622262954712, | |
| "learning_rate": 0.00044663713203147766, | |
| "loss": 3.351, | |
| "step": 43950 | |
| }, | |
| { | |
| "epoch": 12.816825914705197, | |
| "grad_norm": 0.34390226006507874, | |
| "learning_rate": 0.0004464622559020693, | |
| "loss": 3.352, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 12.816825914705197, | |
| "eval_accuracy": 0.37086952764621406, | |
| "eval_loss": 3.5511374473571777, | |
| "eval_runtime": 179.8312, | |
| "eval_samples_per_second": 92.559, | |
| "eval_steps_per_second": 5.789, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 12.831391284082965, | |
| "grad_norm": 0.3434717059135437, | |
| "learning_rate": 0.000446287379772661, | |
| "loss": 3.358, | |
| "step": 44050 | |
| }, | |
| { | |
| "epoch": 12.845956653460732, | |
| "grad_norm": 0.3896610140800476, | |
| "learning_rate": 0.0004461125036432526, | |
| "loss": 3.349, | |
| "step": 44100 | |
| }, | |
| { | |
| "epoch": 12.860522022838499, | |
| "grad_norm": 0.38952627778053284, | |
| "learning_rate": 0.00044593762751384436, | |
| "loss": 3.3503, | |
| "step": 44150 | |
| }, | |
| { | |
| "epoch": 12.875087392216267, | |
| "grad_norm": 0.3934302031993866, | |
| "learning_rate": 0.000445762751384436, | |
| "loss": 3.365, | |
| "step": 44200 | |
| }, | |
| { | |
| "epoch": 12.889652761594034, | |
| "grad_norm": 0.4013819694519043, | |
| "learning_rate": 0.0004455878752550277, | |
| "loss": 3.3597, | |
| "step": 44250 | |
| }, | |
| { | |
| "epoch": 12.9042181309718, | |
| "grad_norm": 0.3661031126976013, | |
| "learning_rate": 0.0004454129991256193, | |
| "loss": 3.3674, | |
| "step": 44300 | |
| }, | |
| { | |
| "epoch": 12.91878350034957, | |
| "grad_norm": 0.417216420173645, | |
| "learning_rate": 0.00044523812299621096, | |
| "loss": 3.3518, | |
| "step": 44350 | |
| }, | |
| { | |
| "epoch": 12.933348869727336, | |
| "grad_norm": 0.3435131907463074, | |
| "learning_rate": 0.00044506324686680265, | |
| "loss": 3.3626, | |
| "step": 44400 | |
| }, | |
| { | |
| "epoch": 12.947914239105105, | |
| "grad_norm": 0.3419662415981293, | |
| "learning_rate": 0.0004448883707373943, | |
| "loss": 3.3505, | |
| "step": 44450 | |
| }, | |
| { | |
| "epoch": 12.962479608482871, | |
| "grad_norm": 0.370420902967453, | |
| "learning_rate": 0.000444713494607986, | |
| "loss": 3.3523, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 12.977044977860638, | |
| "grad_norm": 0.37728220224380493, | |
| "learning_rate": 0.0004445386184785776, | |
| "loss": 3.344, | |
| "step": 44550 | |
| }, | |
| { | |
| "epoch": 12.991610347238407, | |
| "grad_norm": 0.34244418144226074, | |
| "learning_rate": 0.00044436374234916925, | |
| "loss": 3.363, | |
| "step": 44600 | |
| }, | |
| { | |
| "epoch": 13.006117455138662, | |
| "grad_norm": 0.37950703501701355, | |
| "learning_rate": 0.000444188866219761, | |
| "loss": 3.3071, | |
| "step": 44650 | |
| }, | |
| { | |
| "epoch": 13.02068282451643, | |
| "grad_norm": 0.3414568305015564, | |
| "learning_rate": 0.00044401399009035263, | |
| "loss": 3.2523, | |
| "step": 44700 | |
| }, | |
| { | |
| "epoch": 13.035248193894198, | |
| "grad_norm": 0.3659200966358185, | |
| "learning_rate": 0.0004438391139609443, | |
| "loss": 3.2501, | |
| "step": 44750 | |
| }, | |
| { | |
| "epoch": 13.049813563271965, | |
| "grad_norm": 0.3586159348487854, | |
| "learning_rate": 0.00044366423783153596, | |
| "loss": 3.2587, | |
| "step": 44800 | |
| }, | |
| { | |
| "epoch": 13.064378932649731, | |
| "grad_norm": 0.3674415051937103, | |
| "learning_rate": 0.0004434893617021276, | |
| "loss": 3.265, | |
| "step": 44850 | |
| }, | |
| { | |
| "epoch": 13.0789443020275, | |
| "grad_norm": 0.36518362164497375, | |
| "learning_rate": 0.0004433144855727193, | |
| "loss": 3.2717, | |
| "step": 44900 | |
| }, | |
| { | |
| "epoch": 13.093509671405267, | |
| "grad_norm": 0.35845455527305603, | |
| "learning_rate": 0.0004431396094433109, | |
| "loss": 3.2737, | |
| "step": 44950 | |
| }, | |
| { | |
| "epoch": 13.108075040783035, | |
| "grad_norm": 0.3665563464164734, | |
| "learning_rate": 0.0004429647333139026, | |
| "loss": 3.2587, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 13.108075040783035, | |
| "eval_accuracy": 0.37041758983351547, | |
| "eval_loss": 3.561434268951416, | |
| "eval_runtime": 179.821, | |
| "eval_samples_per_second": 92.564, | |
| "eval_steps_per_second": 5.789, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 13.122640410160802, | |
| "grad_norm": 0.34525081515312195, | |
| "learning_rate": 0.00044278985718449425, | |
| "loss": 3.2747, | |
| "step": 45050 | |
| }, | |
| { | |
| "epoch": 13.137205779538569, | |
| "grad_norm": 0.3698437809944153, | |
| "learning_rate": 0.000442614981055086, | |
| "loss": 3.2796, | |
| "step": 45100 | |
| }, | |
| { | |
| "epoch": 13.151771148916337, | |
| "grad_norm": 0.34682697057724, | |
| "learning_rate": 0.00044244010492567763, | |
| "loss": 3.2792, | |
| "step": 45150 | |
| }, | |
| { | |
| "epoch": 13.166336518294104, | |
| "grad_norm": 0.371136337518692, | |
| "learning_rate": 0.00044226522879626927, | |
| "loss": 3.2857, | |
| "step": 45200 | |
| }, | |
| { | |
| "epoch": 13.18090188767187, | |
| "grad_norm": 0.3699894845485687, | |
| "learning_rate": 0.00044209035266686096, | |
| "loss": 3.2826, | |
| "step": 45250 | |
| }, | |
| { | |
| "epoch": 13.19546725704964, | |
| "grad_norm": 0.35856735706329346, | |
| "learning_rate": 0.0004419154765374526, | |
| "loss": 3.2864, | |
| "step": 45300 | |
| }, | |
| { | |
| "epoch": 13.210032626427406, | |
| "grad_norm": 0.37214043736457825, | |
| "learning_rate": 0.0004417406004080443, | |
| "loss": 3.2817, | |
| "step": 45350 | |
| }, | |
| { | |
| "epoch": 13.224597995805174, | |
| "grad_norm": 0.35830315947532654, | |
| "learning_rate": 0.0004415657242786359, | |
| "loss": 3.2916, | |
| "step": 45400 | |
| }, | |
| { | |
| "epoch": 13.239163365182941, | |
| "grad_norm": 0.38184452056884766, | |
| "learning_rate": 0.00044139084814922755, | |
| "loss": 3.2905, | |
| "step": 45450 | |
| }, | |
| { | |
| "epoch": 13.253728734560708, | |
| "grad_norm": 0.36729663610458374, | |
| "learning_rate": 0.00044121597201981924, | |
| "loss": 3.2988, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 13.268294103938477, | |
| "grad_norm": 0.360363632440567, | |
| "learning_rate": 0.0004410410958904109, | |
| "loss": 3.2856, | |
| "step": 45550 | |
| }, | |
| { | |
| "epoch": 13.282859473316243, | |
| "grad_norm": 0.3617470860481262, | |
| "learning_rate": 0.0004408662197610026, | |
| "loss": 3.2866, | |
| "step": 45600 | |
| }, | |
| { | |
| "epoch": 13.29742484269401, | |
| "grad_norm": 0.35599926114082336, | |
| "learning_rate": 0.00044069134363159426, | |
| "loss": 3.2948, | |
| "step": 45650 | |
| }, | |
| { | |
| "epoch": 13.311990212071779, | |
| "grad_norm": 0.3696223199367523, | |
| "learning_rate": 0.00044051646750218595, | |
| "loss": 3.3045, | |
| "step": 45700 | |
| }, | |
| { | |
| "epoch": 13.326555581449545, | |
| "grad_norm": 0.3685641288757324, | |
| "learning_rate": 0.0004403415913727776, | |
| "loss": 3.2931, | |
| "step": 45750 | |
| }, | |
| { | |
| "epoch": 13.341120950827314, | |
| "grad_norm": 0.36321672797203064, | |
| "learning_rate": 0.0004401667152433692, | |
| "loss": 3.3018, | |
| "step": 45800 | |
| }, | |
| { | |
| "epoch": 13.35568632020508, | |
| "grad_norm": 0.3663840591907501, | |
| "learning_rate": 0.0004399918391139609, | |
| "loss": 3.3055, | |
| "step": 45850 | |
| }, | |
| { | |
| "epoch": 13.370251689582847, | |
| "grad_norm": 0.37315475940704346, | |
| "learning_rate": 0.00043981696298455255, | |
| "loss": 3.3086, | |
| "step": 45900 | |
| }, | |
| { | |
| "epoch": 13.384817058960616, | |
| "grad_norm": 0.404310941696167, | |
| "learning_rate": 0.00043964208685514424, | |
| "loss": 3.3042, | |
| "step": 45950 | |
| }, | |
| { | |
| "epoch": 13.399382428338383, | |
| "grad_norm": 0.3406934142112732, | |
| "learning_rate": 0.0004394672107257359, | |
| "loss": 3.3059, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 13.399382428338383, | |
| "eval_accuracy": 0.37039325291175984, | |
| "eval_loss": 3.558974266052246, | |
| "eval_runtime": 179.7471, | |
| "eval_samples_per_second": 92.602, | |
| "eval_steps_per_second": 5.791, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 13.41394779771615, | |
| "grad_norm": 0.3774344325065613, | |
| "learning_rate": 0.0004392923345963275, | |
| "loss": 3.3012, | |
| "step": 46050 | |
| }, | |
| { | |
| "epoch": 13.428513167093918, | |
| "grad_norm": 0.36368629336357117, | |
| "learning_rate": 0.00043911745846691926, | |
| "loss": 3.3133, | |
| "step": 46100 | |
| }, | |
| { | |
| "epoch": 13.443078536471685, | |
| "grad_norm": 0.34740936756134033, | |
| "learning_rate": 0.0004389425823375109, | |
| "loss": 3.3042, | |
| "step": 46150 | |
| }, | |
| { | |
| "epoch": 13.457643905849451, | |
| "grad_norm": 0.3726678192615509, | |
| "learning_rate": 0.0004387677062081026, | |
| "loss": 3.319, | |
| "step": 46200 | |
| }, | |
| { | |
| "epoch": 13.47220927522722, | |
| "grad_norm": 0.3507010340690613, | |
| "learning_rate": 0.0004385928300786942, | |
| "loss": 3.3134, | |
| "step": 46250 | |
| }, | |
| { | |
| "epoch": 13.486774644604987, | |
| "grad_norm": 0.36555829644203186, | |
| "learning_rate": 0.0004384179539492859, | |
| "loss": 3.3205, | |
| "step": 46300 | |
| }, | |
| { | |
| "epoch": 13.501340013982755, | |
| "grad_norm": 0.34969812631607056, | |
| "learning_rate": 0.00043824307781987755, | |
| "loss": 3.3253, | |
| "step": 46350 | |
| }, | |
| { | |
| "epoch": 13.515905383360522, | |
| "grad_norm": 0.3921741545200348, | |
| "learning_rate": 0.0004380682016904692, | |
| "loss": 3.3118, | |
| "step": 46400 | |
| }, | |
| { | |
| "epoch": 13.530470752738289, | |
| "grad_norm": 0.4136374592781067, | |
| "learning_rate": 0.0004378933255610609, | |
| "loss": 3.3229, | |
| "step": 46450 | |
| }, | |
| { | |
| "epoch": 13.545036122116057, | |
| "grad_norm": 0.39142510294914246, | |
| "learning_rate": 0.0004377184494316525, | |
| "loss": 3.3253, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 13.559601491493824, | |
| "grad_norm": 0.35085681080818176, | |
| "learning_rate": 0.00043754357330224426, | |
| "loss": 3.3399, | |
| "step": 46550 | |
| }, | |
| { | |
| "epoch": 13.574166860871593, | |
| "grad_norm": 0.38441339135169983, | |
| "learning_rate": 0.0004373686971728359, | |
| "loss": 3.3177, | |
| "step": 46600 | |
| }, | |
| { | |
| "epoch": 13.58873223024936, | |
| "grad_norm": 0.3715854287147522, | |
| "learning_rate": 0.00043719382104342753, | |
| "loss": 3.3232, | |
| "step": 46650 | |
| }, | |
| { | |
| "epoch": 13.603297599627126, | |
| "grad_norm": 0.37551915645599365, | |
| "learning_rate": 0.0004370189449140192, | |
| "loss": 3.319, | |
| "step": 46700 | |
| }, | |
| { | |
| "epoch": 13.617862969004895, | |
| "grad_norm": 0.41187357902526855, | |
| "learning_rate": 0.00043684406878461085, | |
| "loss": 3.3137, | |
| "step": 46750 | |
| }, | |
| { | |
| "epoch": 13.632428338382661, | |
| "grad_norm": 0.37525227665901184, | |
| "learning_rate": 0.00043666919265520254, | |
| "loss": 3.319, | |
| "step": 46800 | |
| }, | |
| { | |
| "epoch": 13.646993707760428, | |
| "grad_norm": 0.3764234483242035, | |
| "learning_rate": 0.0004364943165257942, | |
| "loss": 3.3301, | |
| "step": 46850 | |
| }, | |
| { | |
| "epoch": 13.661559077138197, | |
| "grad_norm": 0.3498331904411316, | |
| "learning_rate": 0.0004363194403963858, | |
| "loss": 3.3309, | |
| "step": 46900 | |
| }, | |
| { | |
| "epoch": 13.676124446515963, | |
| "grad_norm": 0.35479307174682617, | |
| "learning_rate": 0.0004361445642669775, | |
| "loss": 3.3297, | |
| "step": 46950 | |
| }, | |
| { | |
| "epoch": 13.69068981589373, | |
| "grad_norm": 0.36635255813598633, | |
| "learning_rate": 0.00043596968813756914, | |
| "loss": 3.3337, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 13.69068981589373, | |
| "eval_accuracy": 0.370927724633021, | |
| "eval_loss": 3.551501989364624, | |
| "eval_runtime": 179.6084, | |
| "eval_samples_per_second": 92.674, | |
| "eval_steps_per_second": 5.796, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 13.705255185271499, | |
| "grad_norm": 0.3506696820259094, | |
| "learning_rate": 0.0004357948120081609, | |
| "loss": 3.3283, | |
| "step": 47050 | |
| }, | |
| { | |
| "epoch": 13.719820554649266, | |
| "grad_norm": 0.33802589774131775, | |
| "learning_rate": 0.0004356199358787525, | |
| "loss": 3.3193, | |
| "step": 47100 | |
| }, | |
| { | |
| "epoch": 13.734385924027034, | |
| "grad_norm": 0.4081648886203766, | |
| "learning_rate": 0.0004354450597493442, | |
| "loss": 3.3291, | |
| "step": 47150 | |
| }, | |
| { | |
| "epoch": 13.7489512934048, | |
| "grad_norm": 0.35251572728157043, | |
| "learning_rate": 0.00043527018361993585, | |
| "loss": 3.3204, | |
| "step": 47200 | |
| }, | |
| { | |
| "epoch": 13.763516662782568, | |
| "grad_norm": 0.34620070457458496, | |
| "learning_rate": 0.0004350953074905275, | |
| "loss": 3.3274, | |
| "step": 47250 | |
| }, | |
| { | |
| "epoch": 13.778082032160336, | |
| "grad_norm": 0.36664706468582153, | |
| "learning_rate": 0.0004349204313611192, | |
| "loss": 3.3266, | |
| "step": 47300 | |
| }, | |
| { | |
| "epoch": 13.792647401538103, | |
| "grad_norm": 0.35269954800605774, | |
| "learning_rate": 0.0004347455552317108, | |
| "loss": 3.3388, | |
| "step": 47350 | |
| }, | |
| { | |
| "epoch": 13.80721277091587, | |
| "grad_norm": 0.39205700159072876, | |
| "learning_rate": 0.0004345706791023025, | |
| "loss": 3.3248, | |
| "step": 47400 | |
| }, | |
| { | |
| "epoch": 13.821778140293638, | |
| "grad_norm": 0.39721786975860596, | |
| "learning_rate": 0.00043439580297289414, | |
| "loss": 3.3325, | |
| "step": 47450 | |
| }, | |
| { | |
| "epoch": 13.836343509671405, | |
| "grad_norm": 0.39120006561279297, | |
| "learning_rate": 0.0004342209268434858, | |
| "loss": 3.3322, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 13.850908879049173, | |
| "grad_norm": 0.34740549325942993, | |
| "learning_rate": 0.0004340460507140775, | |
| "loss": 3.3376, | |
| "step": 47550 | |
| }, | |
| { | |
| "epoch": 13.86547424842694, | |
| "grad_norm": 0.3894191086292267, | |
| "learning_rate": 0.00043387117458466916, | |
| "loss": 3.3419, | |
| "step": 47600 | |
| }, | |
| { | |
| "epoch": 13.880039617804707, | |
| "grad_norm": 0.36777186393737793, | |
| "learning_rate": 0.00043369629845526085, | |
| "loss": 3.3491, | |
| "step": 47650 | |
| }, | |
| { | |
| "epoch": 13.894604987182475, | |
| "grad_norm": 0.3732227683067322, | |
| "learning_rate": 0.0004335214223258525, | |
| "loss": 3.3383, | |
| "step": 47700 | |
| }, | |
| { | |
| "epoch": 13.909170356560242, | |
| "grad_norm": 0.3650364279747009, | |
| "learning_rate": 0.0004333465461964442, | |
| "loss": 3.3382, | |
| "step": 47750 | |
| }, | |
| { | |
| "epoch": 13.923735725938009, | |
| "grad_norm": 0.36800310015678406, | |
| "learning_rate": 0.0004331716700670358, | |
| "loss": 3.3333, | |
| "step": 47800 | |
| }, | |
| { | |
| "epoch": 13.938301095315778, | |
| "grad_norm": 0.4116499125957489, | |
| "learning_rate": 0.00043299679393762745, | |
| "loss": 3.3534, | |
| "step": 47850 | |
| }, | |
| { | |
| "epoch": 13.952866464693544, | |
| "grad_norm": 0.3539418876171112, | |
| "learning_rate": 0.00043282191780821914, | |
| "loss": 3.3383, | |
| "step": 47900 | |
| }, | |
| { | |
| "epoch": 13.967431834071313, | |
| "grad_norm": 0.35670432448387146, | |
| "learning_rate": 0.00043264704167881077, | |
| "loss": 3.347, | |
| "step": 47950 | |
| }, | |
| { | |
| "epoch": 13.98199720344908, | |
| "grad_norm": 0.3599216639995575, | |
| "learning_rate": 0.0004324721655494025, | |
| "loss": 3.3343, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 13.98199720344908, | |
| "eval_accuracy": 0.3713798975850602, | |
| "eval_loss": 3.5442428588867188, | |
| "eval_runtime": 179.6319, | |
| "eval_samples_per_second": 92.662, | |
| "eval_steps_per_second": 5.795, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 13.996562572826846, | |
| "grad_norm": 0.3528788387775421, | |
| "learning_rate": 0.00043229728941999415, | |
| "loss": 3.345, | |
| "step": 48050 | |
| }, | |
| { | |
| "epoch": 14.011069680727104, | |
| "grad_norm": 0.3962458670139313, | |
| "learning_rate": 0.0004321224132905858, | |
| "loss": 3.2571, | |
| "step": 48100 | |
| }, | |
| { | |
| "epoch": 14.02563505010487, | |
| "grad_norm": 0.3570566475391388, | |
| "learning_rate": 0.0004319475371611775, | |
| "loss": 3.2367, | |
| "step": 48150 | |
| }, | |
| { | |
| "epoch": 14.040200419482638, | |
| "grad_norm": 0.3566766679286957, | |
| "learning_rate": 0.0004317726610317691, | |
| "loss": 3.2429, | |
| "step": 48200 | |
| }, | |
| { | |
| "epoch": 14.054765788860406, | |
| "grad_norm": 0.38148075342178345, | |
| "learning_rate": 0.0004315977849023608, | |
| "loss": 3.2336, | |
| "step": 48250 | |
| }, | |
| { | |
| "epoch": 14.069331158238173, | |
| "grad_norm": 0.36465924978256226, | |
| "learning_rate": 0.00043142290877295244, | |
| "loss": 3.2448, | |
| "step": 48300 | |
| }, | |
| { | |
| "epoch": 14.08389652761594, | |
| "grad_norm": 0.4034234583377838, | |
| "learning_rate": 0.00043124803264354413, | |
| "loss": 3.2529, | |
| "step": 48350 | |
| }, | |
| { | |
| "epoch": 14.098461896993708, | |
| "grad_norm": 0.38046795129776, | |
| "learning_rate": 0.00043107315651413577, | |
| "loss": 3.2572, | |
| "step": 48400 | |
| }, | |
| { | |
| "epoch": 14.113027266371475, | |
| "grad_norm": 0.37367624044418335, | |
| "learning_rate": 0.0004308982803847274, | |
| "loss": 3.2464, | |
| "step": 48450 | |
| }, | |
| { | |
| "epoch": 14.127592635749243, | |
| "grad_norm": 0.3575690984725952, | |
| "learning_rate": 0.00043072340425531915, | |
| "loss": 3.2573, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 14.14215800512701, | |
| "grad_norm": 0.37947431206703186, | |
| "learning_rate": 0.0004305485281259108, | |
| "loss": 3.2611, | |
| "step": 48550 | |
| }, | |
| { | |
| "epoch": 14.156723374504777, | |
| "grad_norm": 0.4311124086380005, | |
| "learning_rate": 0.0004303736519965025, | |
| "loss": 3.2644, | |
| "step": 48600 | |
| }, | |
| { | |
| "epoch": 14.171288743882545, | |
| "grad_norm": 0.40697988867759705, | |
| "learning_rate": 0.0004301987758670941, | |
| "loss": 3.271, | |
| "step": 48650 | |
| }, | |
| { | |
| "epoch": 14.185854113260312, | |
| "grad_norm": 0.3614986538887024, | |
| "learning_rate": 0.00043002389973768575, | |
| "loss": 3.2703, | |
| "step": 48700 | |
| }, | |
| { | |
| "epoch": 14.200419482638079, | |
| "grad_norm": 0.40103664994239807, | |
| "learning_rate": 0.00042984902360827744, | |
| "loss": 3.2759, | |
| "step": 48750 | |
| }, | |
| { | |
| "epoch": 14.214984852015847, | |
| "grad_norm": 0.3614042401313782, | |
| "learning_rate": 0.0004296741474788691, | |
| "loss": 3.2763, | |
| "step": 48800 | |
| }, | |
| { | |
| "epoch": 14.229550221393614, | |
| "grad_norm": 0.3694717586040497, | |
| "learning_rate": 0.00042949927134946077, | |
| "loss": 3.2836, | |
| "step": 48850 | |
| }, | |
| { | |
| "epoch": 14.244115590771383, | |
| "grad_norm": 0.3669103682041168, | |
| "learning_rate": 0.0004293243952200524, | |
| "loss": 3.2802, | |
| "step": 48900 | |
| }, | |
| { | |
| "epoch": 14.25868096014915, | |
| "grad_norm": 0.38352200388908386, | |
| "learning_rate": 0.00042914951909064415, | |
| "loss": 3.2663, | |
| "step": 48950 | |
| }, | |
| { | |
| "epoch": 14.273246329526916, | |
| "grad_norm": 0.44323334097862244, | |
| "learning_rate": 0.0004289746429612358, | |
| "loss": 3.2804, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 14.273246329526916, | |
| "eval_accuracy": 0.3708143874708354, | |
| "eval_loss": 3.558420181274414, | |
| "eval_runtime": 179.5689, | |
| "eval_samples_per_second": 92.694, | |
| "eval_steps_per_second": 5.797, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 14.287811698904685, | |
| "grad_norm": 0.3794345259666443, | |
| "learning_rate": 0.0004287997668318274, | |
| "loss": 3.2769, | |
| "step": 49050 | |
| }, | |
| { | |
| "epoch": 14.302377068282452, | |
| "grad_norm": 0.38060110807418823, | |
| "learning_rate": 0.0004286248907024191, | |
| "loss": 3.2725, | |
| "step": 49100 | |
| }, | |
| { | |
| "epoch": 14.316942437660218, | |
| "grad_norm": 0.43410491943359375, | |
| "learning_rate": 0.00042845001457301075, | |
| "loss": 3.2889, | |
| "step": 49150 | |
| }, | |
| { | |
| "epoch": 14.331507807037987, | |
| "grad_norm": 0.39754602313041687, | |
| "learning_rate": 0.00042827513844360244, | |
| "loss": 3.2825, | |
| "step": 49200 | |
| }, | |
| { | |
| "epoch": 14.346073176415754, | |
| "grad_norm": 0.41671112179756165, | |
| "learning_rate": 0.00042810026231419407, | |
| "loss": 3.2873, | |
| "step": 49250 | |
| }, | |
| { | |
| "epoch": 14.360638545793522, | |
| "grad_norm": 0.37623023986816406, | |
| "learning_rate": 0.0004279253861847857, | |
| "loss": 3.2924, | |
| "step": 49300 | |
| }, | |
| { | |
| "epoch": 14.375203915171289, | |
| "grad_norm": 0.421653687953949, | |
| "learning_rate": 0.0004277505100553774, | |
| "loss": 3.2735, | |
| "step": 49350 | |
| }, | |
| { | |
| "epoch": 14.389769284549056, | |
| "grad_norm": 0.3558456301689148, | |
| "learning_rate": 0.00042757563392596904, | |
| "loss": 3.3055, | |
| "step": 49400 | |
| }, | |
| { | |
| "epoch": 14.404334653926824, | |
| "grad_norm": 0.3729119300842285, | |
| "learning_rate": 0.0004274007577965608, | |
| "loss": 3.3128, | |
| "step": 49450 | |
| }, | |
| { | |
| "epoch": 14.418900023304591, | |
| "grad_norm": 0.3821575939655304, | |
| "learning_rate": 0.0004272258816671524, | |
| "loss": 3.2866, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 14.433465392682358, | |
| "grad_norm": 0.38078463077545166, | |
| "learning_rate": 0.00042705100553774405, | |
| "loss": 3.2985, | |
| "step": 49550 | |
| }, | |
| { | |
| "epoch": 14.448030762060126, | |
| "grad_norm": 0.38333752751350403, | |
| "learning_rate": 0.00042687612940833574, | |
| "loss": 3.2894, | |
| "step": 49600 | |
| }, | |
| { | |
| "epoch": 14.462596131437893, | |
| "grad_norm": 0.343722939491272, | |
| "learning_rate": 0.0004267012532789274, | |
| "loss": 3.307, | |
| "step": 49650 | |
| }, | |
| { | |
| "epoch": 14.477161500815662, | |
| "grad_norm": 0.34256860613822937, | |
| "learning_rate": 0.00042652637714951907, | |
| "loss": 3.3008, | |
| "step": 49700 | |
| }, | |
| { | |
| "epoch": 14.491726870193428, | |
| "grad_norm": 0.37949851155281067, | |
| "learning_rate": 0.0004263515010201107, | |
| "loss": 3.3107, | |
| "step": 49750 | |
| }, | |
| { | |
| "epoch": 14.506292239571195, | |
| "grad_norm": 0.3749626874923706, | |
| "learning_rate": 0.0004261766248907024, | |
| "loss": 3.2993, | |
| "step": 49800 | |
| }, | |
| { | |
| "epoch": 14.520857608948964, | |
| "grad_norm": 0.36548712849617004, | |
| "learning_rate": 0.00042600174876129403, | |
| "loss": 3.3102, | |
| "step": 49850 | |
| }, | |
| { | |
| "epoch": 14.53542297832673, | |
| "grad_norm": 0.404715895652771, | |
| "learning_rate": 0.00042582687263188567, | |
| "loss": 3.3081, | |
| "step": 49900 | |
| }, | |
| { | |
| "epoch": 14.549988347704497, | |
| "grad_norm": 0.37166038155555725, | |
| "learning_rate": 0.0004256519965024774, | |
| "loss": 3.3047, | |
| "step": 49950 | |
| }, | |
| { | |
| "epoch": 14.564553717082266, | |
| "grad_norm": 0.3786543011665344, | |
| "learning_rate": 0.00042547712037306905, | |
| "loss": 3.293, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 14.564553717082266, | |
| "eval_accuracy": 0.37125997652133685, | |
| "eval_loss": 3.553053140640259, | |
| "eval_runtime": 179.5811, | |
| "eval_samples_per_second": 92.688, | |
| "eval_steps_per_second": 5.797, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 14.579119086460032, | |
| "grad_norm": 0.35522982478141785, | |
| "learning_rate": 0.00042530224424366074, | |
| "loss": 3.3022, | |
| "step": 50050 | |
| }, | |
| { | |
| "epoch": 14.5936844558378, | |
| "grad_norm": 0.3993748426437378, | |
| "learning_rate": 0.0004251273681142524, | |
| "loss": 3.312, | |
| "step": 50100 | |
| }, | |
| { | |
| "epoch": 14.608249825215568, | |
| "grad_norm": 0.39016029238700867, | |
| "learning_rate": 0.000424952491984844, | |
| "loss": 3.3076, | |
| "step": 50150 | |
| }, | |
| { | |
| "epoch": 14.622815194593334, | |
| "grad_norm": 0.38183167576789856, | |
| "learning_rate": 0.0004247776158554357, | |
| "loss": 3.2985, | |
| "step": 50200 | |
| }, | |
| { | |
| "epoch": 14.637380563971103, | |
| "grad_norm": 0.3808605968952179, | |
| "learning_rate": 0.00042460273972602734, | |
| "loss": 3.3138, | |
| "step": 50250 | |
| }, | |
| { | |
| "epoch": 14.65194593334887, | |
| "grad_norm": 0.366777241230011, | |
| "learning_rate": 0.00042442786359661903, | |
| "loss": 3.3074, | |
| "step": 50300 | |
| }, | |
| { | |
| "epoch": 14.666511302726637, | |
| "grad_norm": 0.3863094449043274, | |
| "learning_rate": 0.00042425298746721066, | |
| "loss": 3.3151, | |
| "step": 50350 | |
| }, | |
| { | |
| "epoch": 14.681076672104405, | |
| "grad_norm": 0.35356074571609497, | |
| "learning_rate": 0.0004240781113378024, | |
| "loss": 3.3114, | |
| "step": 50400 | |
| }, | |
| { | |
| "epoch": 14.695642041482172, | |
| "grad_norm": 0.38444754481315613, | |
| "learning_rate": 0.00042390323520839405, | |
| "loss": 3.3152, | |
| "step": 50450 | |
| }, | |
| { | |
| "epoch": 14.71020741085994, | |
| "grad_norm": 0.3628937602043152, | |
| "learning_rate": 0.0004237283590789857, | |
| "loss": 3.3073, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 14.724772780237707, | |
| "grad_norm": 0.3597457706928253, | |
| "learning_rate": 0.00042355348294957737, | |
| "loss": 3.3091, | |
| "step": 50550 | |
| }, | |
| { | |
| "epoch": 14.739338149615474, | |
| "grad_norm": 0.40730124711990356, | |
| "learning_rate": 0.000423378606820169, | |
| "loss": 3.303, | |
| "step": 50600 | |
| }, | |
| { | |
| "epoch": 14.753903518993242, | |
| "grad_norm": 0.3871900737285614, | |
| "learning_rate": 0.0004232037306907607, | |
| "loss": 3.323, | |
| "step": 50650 | |
| }, | |
| { | |
| "epoch": 14.76846888837101, | |
| "grad_norm": 0.3685663938522339, | |
| "learning_rate": 0.00042302885456135233, | |
| "loss": 3.3253, | |
| "step": 50700 | |
| }, | |
| { | |
| "epoch": 14.783034257748776, | |
| "grad_norm": 0.358916699886322, | |
| "learning_rate": 0.00042285397843194397, | |
| "loss": 3.3162, | |
| "step": 50750 | |
| }, | |
| { | |
| "epoch": 14.797599627126544, | |
| "grad_norm": 0.37842485308647156, | |
| "learning_rate": 0.00042267910230253566, | |
| "loss": 3.335, | |
| "step": 50800 | |
| }, | |
| { | |
| "epoch": 14.812164996504311, | |
| "grad_norm": 0.36957690119743347, | |
| "learning_rate": 0.0004225042261731273, | |
| "loss": 3.3302, | |
| "step": 50850 | |
| }, | |
| { | |
| "epoch": 14.826730365882078, | |
| "grad_norm": 0.3704380989074707, | |
| "learning_rate": 0.00042232935004371904, | |
| "loss": 3.324, | |
| "step": 50900 | |
| }, | |
| { | |
| "epoch": 14.841295735259846, | |
| "grad_norm": 0.3660496175289154, | |
| "learning_rate": 0.0004221544739143107, | |
| "loss": 3.3219, | |
| "step": 50950 | |
| }, | |
| { | |
| "epoch": 14.855861104637613, | |
| "grad_norm": 0.3719576299190521, | |
| "learning_rate": 0.00042197959778490237, | |
| "loss": 3.3208, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 14.855861104637613, | |
| "eval_accuracy": 0.3716361994663513, | |
| "eval_loss": 3.5443522930145264, | |
| "eval_runtime": 179.6673, | |
| "eval_samples_per_second": 92.643, | |
| "eval_steps_per_second": 5.794, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 14.870426474015382, | |
| "grad_norm": 0.39968937635421753, | |
| "learning_rate": 0.000421804721655494, | |
| "loss": 3.332, | |
| "step": 51050 | |
| }, | |
| { | |
| "epoch": 14.884991843393149, | |
| "grad_norm": 0.3981848359107971, | |
| "learning_rate": 0.00042162984552608564, | |
| "loss": 3.3107, | |
| "step": 51100 | |
| }, | |
| { | |
| "epoch": 14.899557212770915, | |
| "grad_norm": 0.3961758613586426, | |
| "learning_rate": 0.00042145496939667733, | |
| "loss": 3.3288, | |
| "step": 51150 | |
| }, | |
| { | |
| "epoch": 14.914122582148684, | |
| "grad_norm": 0.3636086881160736, | |
| "learning_rate": 0.00042128009326726897, | |
| "loss": 3.3134, | |
| "step": 51200 | |
| }, | |
| { | |
| "epoch": 14.92868795152645, | |
| "grad_norm": 0.36392343044281006, | |
| "learning_rate": 0.00042110521713786066, | |
| "loss": 3.3325, | |
| "step": 51250 | |
| }, | |
| { | |
| "epoch": 14.943253320904217, | |
| "grad_norm": 0.36686888337135315, | |
| "learning_rate": 0.0004209303410084523, | |
| "loss": 3.3204, | |
| "step": 51300 | |
| }, | |
| { | |
| "epoch": 14.957818690281986, | |
| "grad_norm": 0.37597978115081787, | |
| "learning_rate": 0.00042075546487904393, | |
| "loss": 3.3212, | |
| "step": 51350 | |
| }, | |
| { | |
| "epoch": 14.972384059659753, | |
| "grad_norm": 0.3588141202926636, | |
| "learning_rate": 0.0004205805887496357, | |
| "loss": 3.3291, | |
| "step": 51400 | |
| }, | |
| { | |
| "epoch": 14.986949429037521, | |
| "grad_norm": 0.37139445543289185, | |
| "learning_rate": 0.0004204057126202273, | |
| "loss": 3.3212, | |
| "step": 51450 | |
| }, | |
| { | |
| "epoch": 15.001456536937777, | |
| "grad_norm": 0.393344908952713, | |
| "learning_rate": 0.000420230836490819, | |
| "loss": 3.3173, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 15.016021906315544, | |
| "grad_norm": 0.4330257475376129, | |
| "learning_rate": 0.00042005596036141064, | |
| "loss": 3.2146, | |
| "step": 51550 | |
| }, | |
| { | |
| "epoch": 15.030587275693312, | |
| "grad_norm": 0.3898090422153473, | |
| "learning_rate": 0.0004198810842320023, | |
| "loss": 3.2189, | |
| "step": 51600 | |
| }, | |
| { | |
| "epoch": 15.045152645071079, | |
| "grad_norm": 0.4010067880153656, | |
| "learning_rate": 0.00041970620810259396, | |
| "loss": 3.2251, | |
| "step": 51650 | |
| }, | |
| { | |
| "epoch": 15.059718014448846, | |
| "grad_norm": 0.40854838490486145, | |
| "learning_rate": 0.0004195313319731856, | |
| "loss": 3.2156, | |
| "step": 51700 | |
| }, | |
| { | |
| "epoch": 15.074283383826614, | |
| "grad_norm": 0.36628204584121704, | |
| "learning_rate": 0.0004193564558437773, | |
| "loss": 3.234, | |
| "step": 51750 | |
| }, | |
| { | |
| "epoch": 15.088848753204381, | |
| "grad_norm": 0.38783887028694153, | |
| "learning_rate": 0.0004191815797143689, | |
| "loss": 3.2315, | |
| "step": 51800 | |
| }, | |
| { | |
| "epoch": 15.103414122582148, | |
| "grad_norm": 0.3718164265155792, | |
| "learning_rate": 0.00041900670358496067, | |
| "loss": 3.2369, | |
| "step": 51850 | |
| }, | |
| { | |
| "epoch": 15.117979491959916, | |
| "grad_norm": 0.42094212770462036, | |
| "learning_rate": 0.0004188318274555523, | |
| "loss": 3.2398, | |
| "step": 51900 | |
| }, | |
| { | |
| "epoch": 15.132544861337683, | |
| "grad_norm": 0.36034852266311646, | |
| "learning_rate": 0.00041865695132614394, | |
| "loss": 3.2404, | |
| "step": 51950 | |
| }, | |
| { | |
| "epoch": 15.147110230715452, | |
| "grad_norm": 0.3888159692287445, | |
| "learning_rate": 0.00041848207519673563, | |
| "loss": 3.2449, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 15.147110230715452, | |
| "eval_accuracy": 0.3711453460927778, | |
| "eval_loss": 3.5572781562805176, | |
| "eval_runtime": 179.6315, | |
| "eval_samples_per_second": 92.662, | |
| "eval_steps_per_second": 5.795, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 15.161675600093218, | |
| "grad_norm": 0.3690231740474701, | |
| "learning_rate": 0.00041830719906732727, | |
| "loss": 3.2507, | |
| "step": 52050 | |
| }, | |
| { | |
| "epoch": 15.176240969470985, | |
| "grad_norm": 0.37178748846054077, | |
| "learning_rate": 0.00041813232293791896, | |
| "loss": 3.2635, | |
| "step": 52100 | |
| }, | |
| { | |
| "epoch": 15.190806338848754, | |
| "grad_norm": 0.40822193026542664, | |
| "learning_rate": 0.0004179574468085106, | |
| "loss": 3.2505, | |
| "step": 52150 | |
| }, | |
| { | |
| "epoch": 15.20537170822652, | |
| "grad_norm": 0.40897294878959656, | |
| "learning_rate": 0.00041778257067910223, | |
| "loss": 3.2578, | |
| "step": 52200 | |
| }, | |
| { | |
| "epoch": 15.219937077604287, | |
| "grad_norm": 0.416759729385376, | |
| "learning_rate": 0.0004176076945496939, | |
| "loss": 3.2576, | |
| "step": 52250 | |
| }, | |
| { | |
| "epoch": 15.234502446982056, | |
| "grad_norm": 0.3542684018611908, | |
| "learning_rate": 0.00041743281842028556, | |
| "loss": 3.248, | |
| "step": 52300 | |
| }, | |
| { | |
| "epoch": 15.249067816359823, | |
| "grad_norm": 0.3839828670024872, | |
| "learning_rate": 0.0004172579422908773, | |
| "loss": 3.2648, | |
| "step": 52350 | |
| }, | |
| { | |
| "epoch": 15.263633185737591, | |
| "grad_norm": 0.36714503169059753, | |
| "learning_rate": 0.00041708306616146894, | |
| "loss": 3.257, | |
| "step": 52400 | |
| }, | |
| { | |
| "epoch": 15.278198555115358, | |
| "grad_norm": 0.38585343956947327, | |
| "learning_rate": 0.00041690819003206063, | |
| "loss": 3.263, | |
| "step": 52450 | |
| }, | |
| { | |
| "epoch": 15.292763924493125, | |
| "grad_norm": 0.3717619776725769, | |
| "learning_rate": 0.00041673331390265227, | |
| "loss": 3.2767, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 15.307329293870893, | |
| "grad_norm": 0.3753516972064972, | |
| "learning_rate": 0.0004165584377732439, | |
| "loss": 3.2765, | |
| "step": 52550 | |
| }, | |
| { | |
| "epoch": 15.32189466324866, | |
| "grad_norm": 0.3641180098056793, | |
| "learning_rate": 0.0004163835616438356, | |
| "loss": 3.2676, | |
| "step": 52600 | |
| }, | |
| { | |
| "epoch": 15.336460032626427, | |
| "grad_norm": 0.37075987458229065, | |
| "learning_rate": 0.00041620868551442723, | |
| "loss": 3.2739, | |
| "step": 52650 | |
| }, | |
| { | |
| "epoch": 15.351025402004195, | |
| "grad_norm": 0.3723219633102417, | |
| "learning_rate": 0.0004160338093850189, | |
| "loss": 3.2761, | |
| "step": 52700 | |
| }, | |
| { | |
| "epoch": 15.365590771381962, | |
| "grad_norm": 0.3730946183204651, | |
| "learning_rate": 0.00041585893325561056, | |
| "loss": 3.2739, | |
| "step": 52750 | |
| }, | |
| { | |
| "epoch": 15.38015614075973, | |
| "grad_norm": 0.3803166449069977, | |
| "learning_rate": 0.0004156840571262022, | |
| "loss": 3.2923, | |
| "step": 52800 | |
| }, | |
| { | |
| "epoch": 15.394721510137497, | |
| "grad_norm": 0.3986593782901764, | |
| "learning_rate": 0.00041550918099679394, | |
| "loss": 3.2797, | |
| "step": 52850 | |
| }, | |
| { | |
| "epoch": 15.409286879515264, | |
| "grad_norm": 0.3798179626464844, | |
| "learning_rate": 0.0004153343048673856, | |
| "loss": 3.2717, | |
| "step": 52900 | |
| }, | |
| { | |
| "epoch": 15.423852248893033, | |
| "grad_norm": 0.4205482304096222, | |
| "learning_rate": 0.00041515942873797726, | |
| "loss": 3.2857, | |
| "step": 52950 | |
| }, | |
| { | |
| "epoch": 15.4384176182708, | |
| "grad_norm": 0.35909244418144226, | |
| "learning_rate": 0.0004149845526085689, | |
| "loss": 3.2765, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 15.4384176182708, | |
| "eval_accuracy": 0.3717464798171086, | |
| "eval_loss": 3.5521633625030518, | |
| "eval_runtime": 193.9407, | |
| "eval_samples_per_second": 85.825, | |
| "eval_steps_per_second": 5.368, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 15.452982987648566, | |
| "grad_norm": 0.361979216337204, | |
| "learning_rate": 0.0004148096764791606, | |
| "loss": 3.2751, | |
| "step": 53050 | |
| }, | |
| { | |
| "epoch": 15.467548357026335, | |
| "grad_norm": 0.36735597252845764, | |
| "learning_rate": 0.0004146348003497522, | |
| "loss": 3.3015, | |
| "step": 53100 | |
| }, | |
| { | |
| "epoch": 15.482113726404101, | |
| "grad_norm": 0.3767015039920807, | |
| "learning_rate": 0.00041445992422034386, | |
| "loss": 3.2938, | |
| "step": 53150 | |
| }, | |
| { | |
| "epoch": 15.49667909578187, | |
| "grad_norm": 0.38670143485069275, | |
| "learning_rate": 0.00041428504809093555, | |
| "loss": 3.2936, | |
| "step": 53200 | |
| }, | |
| { | |
| "epoch": 15.511244465159637, | |
| "grad_norm": 0.39119359850883484, | |
| "learning_rate": 0.0004141101719615272, | |
| "loss": 3.2893, | |
| "step": 53250 | |
| }, | |
| { | |
| "epoch": 15.525809834537403, | |
| "grad_norm": 0.36352699995040894, | |
| "learning_rate": 0.00041393529583211893, | |
| "loss": 3.2955, | |
| "step": 53300 | |
| }, | |
| { | |
| "epoch": 15.540375203915172, | |
| "grad_norm": 0.38741451501846313, | |
| "learning_rate": 0.00041376041970271057, | |
| "loss": 3.299, | |
| "step": 53350 | |
| }, | |
| { | |
| "epoch": 15.554940573292939, | |
| "grad_norm": 0.3951430916786194, | |
| "learning_rate": 0.0004135855435733022, | |
| "loss": 3.2996, | |
| "step": 53400 | |
| }, | |
| { | |
| "epoch": 15.569505942670705, | |
| "grad_norm": 0.36441171169281006, | |
| "learning_rate": 0.0004134106674438939, | |
| "loss": 3.2938, | |
| "step": 53450 | |
| }, | |
| { | |
| "epoch": 15.584071312048474, | |
| "grad_norm": 0.3774093985557556, | |
| "learning_rate": 0.00041323579131448553, | |
| "loss": 3.2882, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 15.59863668142624, | |
| "grad_norm": 0.3849200904369354, | |
| "learning_rate": 0.0004130609151850772, | |
| "loss": 3.3071, | |
| "step": 53550 | |
| }, | |
| { | |
| "epoch": 15.61320205080401, | |
| "grad_norm": 0.3753909468650818, | |
| "learning_rate": 0.00041288603905566886, | |
| "loss": 3.2861, | |
| "step": 53600 | |
| }, | |
| { | |
| "epoch": 15.627767420181776, | |
| "grad_norm": 0.3853233754634857, | |
| "learning_rate": 0.0004127111629262605, | |
| "loss": 3.3072, | |
| "step": 53650 | |
| }, | |
| { | |
| "epoch": 15.642332789559543, | |
| "grad_norm": 0.3988652229309082, | |
| "learning_rate": 0.0004125362867968522, | |
| "loss": 3.3094, | |
| "step": 53700 | |
| }, | |
| { | |
| "epoch": 15.656898158937311, | |
| "grad_norm": 0.3708445429801941, | |
| "learning_rate": 0.0004123614106674438, | |
| "loss": 3.2967, | |
| "step": 53750 | |
| }, | |
| { | |
| "epoch": 15.671463528315078, | |
| "grad_norm": 0.36685454845428467, | |
| "learning_rate": 0.00041218653453803557, | |
| "loss": 3.2913, | |
| "step": 53800 | |
| }, | |
| { | |
| "epoch": 15.686028897692845, | |
| "grad_norm": 0.38278666138648987, | |
| "learning_rate": 0.0004120116584086272, | |
| "loss": 3.2861, | |
| "step": 53850 | |
| }, | |
| { | |
| "epoch": 15.700594267070613, | |
| "grad_norm": 0.384741872549057, | |
| "learning_rate": 0.0004118367822792189, | |
| "loss": 3.304, | |
| "step": 53900 | |
| }, | |
| { | |
| "epoch": 15.71515963644838, | |
| "grad_norm": 0.3768286108970642, | |
| "learning_rate": 0.00041166190614981053, | |
| "loss": 3.2982, | |
| "step": 53950 | |
| }, | |
| { | |
| "epoch": 15.729725005826147, | |
| "grad_norm": 0.3943612575531006, | |
| "learning_rate": 0.00041148703002040217, | |
| "loss": 3.2996, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 15.729725005826147, | |
| "eval_accuracy": 0.3718921486386314, | |
| "eval_loss": 3.544917345046997, | |
| "eval_runtime": 220.4654, | |
| "eval_samples_per_second": 75.499, | |
| "eval_steps_per_second": 4.722, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 15.744290375203915, | |
| "grad_norm": 0.3631158769130707, | |
| "learning_rate": 0.00041131215389099386, | |
| "loss": 3.2903, | |
| "step": 54050 | |
| }, | |
| { | |
| "epoch": 15.758855744581682, | |
| "grad_norm": 0.40076887607574463, | |
| "learning_rate": 0.0004111372777615855, | |
| "loss": 3.3015, | |
| "step": 54100 | |
| }, | |
| { | |
| "epoch": 15.77342111395945, | |
| "grad_norm": 0.3838764429092407, | |
| "learning_rate": 0.0004109624016321772, | |
| "loss": 3.2982, | |
| "step": 54150 | |
| }, | |
| { | |
| "epoch": 15.787986483337217, | |
| "grad_norm": 0.3836144804954529, | |
| "learning_rate": 0.0004107875255027688, | |
| "loss": 3.3118, | |
| "step": 54200 | |
| }, | |
| { | |
| "epoch": 15.802551852714984, | |
| "grad_norm": 0.39159563183784485, | |
| "learning_rate": 0.00041061264937336045, | |
| "loss": 3.2957, | |
| "step": 54250 | |
| }, | |
| { | |
| "epoch": 15.817117222092753, | |
| "grad_norm": 0.3700462281703949, | |
| "learning_rate": 0.0004104377732439522, | |
| "loss": 3.3081, | |
| "step": 54300 | |
| }, | |
| { | |
| "epoch": 15.83168259147052, | |
| "grad_norm": 0.37243711948394775, | |
| "learning_rate": 0.00041026289711454384, | |
| "loss": 3.3134, | |
| "step": 54350 | |
| }, | |
| { | |
| "epoch": 15.846247960848288, | |
| "grad_norm": 0.38975927233695984, | |
| "learning_rate": 0.0004100880209851355, | |
| "loss": 3.3046, | |
| "step": 54400 | |
| }, | |
| { | |
| "epoch": 15.860813330226055, | |
| "grad_norm": 0.39330175518989563, | |
| "learning_rate": 0.00040991314485572716, | |
| "loss": 3.3103, | |
| "step": 54450 | |
| }, | |
| { | |
| "epoch": 15.875378699603822, | |
| "grad_norm": 0.36677080392837524, | |
| "learning_rate": 0.00040973826872631885, | |
| "loss": 3.3041, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 15.88994406898159, | |
| "grad_norm": 0.38371795415878296, | |
| "learning_rate": 0.0004095633925969105, | |
| "loss": 3.3192, | |
| "step": 54550 | |
| }, | |
| { | |
| "epoch": 15.904509438359357, | |
| "grad_norm": 0.37720179557800293, | |
| "learning_rate": 0.0004093885164675021, | |
| "loss": 3.3182, | |
| "step": 54600 | |
| }, | |
| { | |
| "epoch": 15.919074807737124, | |
| "grad_norm": 0.372707724571228, | |
| "learning_rate": 0.0004092136403380938, | |
| "loss": 3.3182, | |
| "step": 54650 | |
| }, | |
| { | |
| "epoch": 15.933640177114892, | |
| "grad_norm": 0.38466477394104004, | |
| "learning_rate": 0.00040903876420868545, | |
| "loss": 3.3171, | |
| "step": 54700 | |
| }, | |
| { | |
| "epoch": 15.948205546492659, | |
| "grad_norm": 0.4322209656238556, | |
| "learning_rate": 0.00040886388807927714, | |
| "loss": 3.3102, | |
| "step": 54750 | |
| }, | |
| { | |
| "epoch": 15.962770915870426, | |
| "grad_norm": 0.3643110394477844, | |
| "learning_rate": 0.00040868901194986883, | |
| "loss": 3.3154, | |
| "step": 54800 | |
| }, | |
| { | |
| "epoch": 15.977336285248194, | |
| "grad_norm": 0.3549572229385376, | |
| "learning_rate": 0.00040851413582046047, | |
| "loss": 3.3118, | |
| "step": 54850 | |
| }, | |
| { | |
| "epoch": 15.991901654625961, | |
| "grad_norm": 0.35710573196411133, | |
| "learning_rate": 0.00040833925969105216, | |
| "loss": 3.3088, | |
| "step": 54900 | |
| }, | |
| { | |
| "epoch": 16.006408762526217, | |
| "grad_norm": 0.3736666738986969, | |
| "learning_rate": 0.0004081643835616438, | |
| "loss": 3.2679, | |
| "step": 54950 | |
| }, | |
| { | |
| "epoch": 16.020974131903984, | |
| "grad_norm": 0.39405354857444763, | |
| "learning_rate": 0.0004079895074322355, | |
| "loss": 3.195, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 16.020974131903984, | |
| "eval_accuracy": 0.37159434466371843, | |
| "eval_loss": 3.5555102825164795, | |
| "eval_runtime": 179.8603, | |
| "eval_samples_per_second": 92.544, | |
| "eval_steps_per_second": 5.788, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 16.035539501281754, | |
| "grad_norm": 0.38940897583961487, | |
| "learning_rate": 0.0004078146313028271, | |
| "loss": 3.2109, | |
| "step": 55050 | |
| }, | |
| { | |
| "epoch": 16.05010487065952, | |
| "grad_norm": 0.3882853388786316, | |
| "learning_rate": 0.0004076397551734188, | |
| "loss": 3.2045, | |
| "step": 55100 | |
| }, | |
| { | |
| "epoch": 16.064670240037287, | |
| "grad_norm": 0.39605289697647095, | |
| "learning_rate": 0.00040746487904401045, | |
| "loss": 3.2356, | |
| "step": 55150 | |
| }, | |
| { | |
| "epoch": 16.079235609415054, | |
| "grad_norm": 0.3754449486732483, | |
| "learning_rate": 0.0004072900029146021, | |
| "loss": 3.2203, | |
| "step": 55200 | |
| }, | |
| { | |
| "epoch": 16.09380097879282, | |
| "grad_norm": 0.4028746783733368, | |
| "learning_rate": 0.0004071151267851938, | |
| "loss": 3.2177, | |
| "step": 55250 | |
| }, | |
| { | |
| "epoch": 16.10836634817059, | |
| "grad_norm": 0.36337772011756897, | |
| "learning_rate": 0.00040694025065578546, | |
| "loss": 3.2235, | |
| "step": 55300 | |
| }, | |
| { | |
| "epoch": 16.122931717548358, | |
| "grad_norm": 0.3819507360458374, | |
| "learning_rate": 0.00040676537452637716, | |
| "loss": 3.2269, | |
| "step": 55350 | |
| }, | |
| { | |
| "epoch": 16.137497086926125, | |
| "grad_norm": 0.38409850001335144, | |
| "learning_rate": 0.0004065904983969688, | |
| "loss": 3.2293, | |
| "step": 55400 | |
| }, | |
| { | |
| "epoch": 16.15206245630389, | |
| "grad_norm": 0.4041096866130829, | |
| "learning_rate": 0.00040641562226756043, | |
| "loss": 3.2422, | |
| "step": 55450 | |
| }, | |
| { | |
| "epoch": 16.16662782568166, | |
| "grad_norm": 0.3929169774055481, | |
| "learning_rate": 0.0004062407461381521, | |
| "loss": 3.2401, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 16.181193195059425, | |
| "grad_norm": 0.379218190908432, | |
| "learning_rate": 0.00040606587000874375, | |
| "loss": 3.2338, | |
| "step": 55550 | |
| }, | |
| { | |
| "epoch": 16.195758564437195, | |
| "grad_norm": 0.39579394459724426, | |
| "learning_rate": 0.00040589099387933544, | |
| "loss": 3.2271, | |
| "step": 55600 | |
| }, | |
| { | |
| "epoch": 16.210323933814962, | |
| "grad_norm": 0.38522908091545105, | |
| "learning_rate": 0.0004057161177499271, | |
| "loss": 3.2319, | |
| "step": 55650 | |
| }, | |
| { | |
| "epoch": 16.22488930319273, | |
| "grad_norm": 0.3886246085166931, | |
| "learning_rate": 0.0004055412416205187, | |
| "loss": 3.25, | |
| "step": 55700 | |
| }, | |
| { | |
| "epoch": 16.239454672570496, | |
| "grad_norm": 0.387268990278244, | |
| "learning_rate": 0.0004053663654911104, | |
| "loss": 3.2485, | |
| "step": 55750 | |
| }, | |
| { | |
| "epoch": 16.254020041948262, | |
| "grad_norm": 0.3706577718257904, | |
| "learning_rate": 0.0004051914893617021, | |
| "loss": 3.2549, | |
| "step": 55800 | |
| }, | |
| { | |
| "epoch": 16.268585411326033, | |
| "grad_norm": 0.36555173993110657, | |
| "learning_rate": 0.0004050166132322938, | |
| "loss": 3.2546, | |
| "step": 55850 | |
| }, | |
| { | |
| "epoch": 16.2831507807038, | |
| "grad_norm": 0.4174744486808777, | |
| "learning_rate": 0.0004048417371028854, | |
| "loss": 3.2461, | |
| "step": 55900 | |
| }, | |
| { | |
| "epoch": 16.297716150081566, | |
| "grad_norm": 0.3815324604511261, | |
| "learning_rate": 0.0004046668609734771, | |
| "loss": 3.2584, | |
| "step": 55950 | |
| }, | |
| { | |
| "epoch": 16.312281519459333, | |
| "grad_norm": 0.3781425654888153, | |
| "learning_rate": 0.00040449198484406875, | |
| "loss": 3.2685, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 16.312281519459333, | |
| "eval_accuracy": 0.3717649382553484, | |
| "eval_loss": 3.551072835922241, | |
| "eval_runtime": 179.583, | |
| "eval_samples_per_second": 92.687, | |
| "eval_steps_per_second": 5.797, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 16.3268468888371, | |
| "grad_norm": 0.3846278190612793, | |
| "learning_rate": 0.0004043171087146604, | |
| "loss": 3.2671, | |
| "step": 56050 | |
| }, | |
| { | |
| "epoch": 16.34141225821487, | |
| "grad_norm": 0.3843114674091339, | |
| "learning_rate": 0.0004041422325852521, | |
| "loss": 3.2572, | |
| "step": 56100 | |
| }, | |
| { | |
| "epoch": 16.355977627592637, | |
| "grad_norm": 0.3832460343837738, | |
| "learning_rate": 0.0004039673564558437, | |
| "loss": 3.2482, | |
| "step": 56150 | |
| }, | |
| { | |
| "epoch": 16.370542996970403, | |
| "grad_norm": 0.39614608883857727, | |
| "learning_rate": 0.0004037924803264354, | |
| "loss": 3.2568, | |
| "step": 56200 | |
| }, | |
| { | |
| "epoch": 16.38510836634817, | |
| "grad_norm": 0.4128139615058899, | |
| "learning_rate": 0.00040361760419702704, | |
| "loss": 3.2602, | |
| "step": 56250 | |
| }, | |
| { | |
| "epoch": 16.399673735725937, | |
| "grad_norm": 0.41927552223205566, | |
| "learning_rate": 0.00040344272806761873, | |
| "loss": 3.2699, | |
| "step": 56300 | |
| }, | |
| { | |
| "epoch": 16.414239105103704, | |
| "grad_norm": 0.4142034351825714, | |
| "learning_rate": 0.0004032678519382104, | |
| "loss": 3.2651, | |
| "step": 56350 | |
| }, | |
| { | |
| "epoch": 16.428804474481474, | |
| "grad_norm": 0.4234794080257416, | |
| "learning_rate": 0.00040309297580880206, | |
| "loss": 3.2693, | |
| "step": 56400 | |
| }, | |
| { | |
| "epoch": 16.44336984385924, | |
| "grad_norm": 0.379566490650177, | |
| "learning_rate": 0.00040291809967939375, | |
| "loss": 3.259, | |
| "step": 56450 | |
| }, | |
| { | |
| "epoch": 16.457935213237008, | |
| "grad_norm": 0.3937167525291443, | |
| "learning_rate": 0.0004027432235499854, | |
| "loss": 3.2841, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 16.472500582614774, | |
| "grad_norm": 0.386248379945755, | |
| "learning_rate": 0.0004025683474205771, | |
| "loss": 3.277, | |
| "step": 56550 | |
| }, | |
| { | |
| "epoch": 16.48706595199254, | |
| "grad_norm": 0.38750800490379333, | |
| "learning_rate": 0.0004023934712911687, | |
| "loss": 3.2801, | |
| "step": 56600 | |
| }, | |
| { | |
| "epoch": 16.50163132137031, | |
| "grad_norm": 0.39586499333381653, | |
| "learning_rate": 0.00040221859516176035, | |
| "loss": 3.2722, | |
| "step": 56650 | |
| }, | |
| { | |
| "epoch": 16.516196690748078, | |
| "grad_norm": 0.37789252400398254, | |
| "learning_rate": 0.00040204371903235204, | |
| "loss": 3.2748, | |
| "step": 56700 | |
| }, | |
| { | |
| "epoch": 16.530762060125845, | |
| "grad_norm": 0.3938862085342407, | |
| "learning_rate": 0.0004018688429029437, | |
| "loss": 3.277, | |
| "step": 56750 | |
| }, | |
| { | |
| "epoch": 16.54532742950361, | |
| "grad_norm": 0.3977769613265991, | |
| "learning_rate": 0.0004016939667735354, | |
| "loss": 3.291, | |
| "step": 56800 | |
| }, | |
| { | |
| "epoch": 16.55989279888138, | |
| "grad_norm": 0.3525155782699585, | |
| "learning_rate": 0.00040151909064412705, | |
| "loss": 3.2735, | |
| "step": 56850 | |
| }, | |
| { | |
| "epoch": 16.57445816825915, | |
| "grad_norm": 0.362099826335907, | |
| "learning_rate": 0.0004013442145147187, | |
| "loss": 3.2917, | |
| "step": 56900 | |
| }, | |
| { | |
| "epoch": 16.589023537636916, | |
| "grad_norm": 0.37509220838546753, | |
| "learning_rate": 0.0004011693383853104, | |
| "loss": 3.2782, | |
| "step": 56950 | |
| }, | |
| { | |
| "epoch": 16.603588907014682, | |
| "grad_norm": 0.3895016312599182, | |
| "learning_rate": 0.000400994462255902, | |
| "loss": 3.2776, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 16.603588907014682, | |
| "eval_accuracy": 0.3718954405894003, | |
| "eval_loss": 3.5490047931671143, | |
| "eval_runtime": 185.9028, | |
| "eval_samples_per_second": 89.536, | |
| "eval_steps_per_second": 5.6, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 16.61815427639245, | |
| "grad_norm": 0.3511142432689667, | |
| "learning_rate": 0.0004008195861264937, | |
| "loss": 3.2816, | |
| "step": 57050 | |
| }, | |
| { | |
| "epoch": 16.632719645770216, | |
| "grad_norm": 0.42816150188446045, | |
| "learning_rate": 0.00040064470999708534, | |
| "loss": 3.2748, | |
| "step": 57100 | |
| }, | |
| { | |
| "epoch": 16.647285015147983, | |
| "grad_norm": 0.370182067155838, | |
| "learning_rate": 0.00040046983386767703, | |
| "loss": 3.2864, | |
| "step": 57150 | |
| }, | |
| { | |
| "epoch": 16.661850384525753, | |
| "grad_norm": 0.39222970604896545, | |
| "learning_rate": 0.00040029495773826867, | |
| "loss": 3.2856, | |
| "step": 57200 | |
| }, | |
| { | |
| "epoch": 16.67641575390352, | |
| "grad_norm": 0.3937409818172455, | |
| "learning_rate": 0.0004001200816088603, | |
| "loss": 3.2803, | |
| "step": 57250 | |
| }, | |
| { | |
| "epoch": 16.690981123281286, | |
| "grad_norm": 0.38916105031967163, | |
| "learning_rate": 0.00039994520547945205, | |
| "loss": 3.2942, | |
| "step": 57300 | |
| }, | |
| { | |
| "epoch": 16.705546492659053, | |
| "grad_norm": 0.37478119134902954, | |
| "learning_rate": 0.0003997703293500437, | |
| "loss": 3.3025, | |
| "step": 57350 | |
| }, | |
| { | |
| "epoch": 16.72011186203682, | |
| "grad_norm": 0.3683931827545166, | |
| "learning_rate": 0.0003995954532206354, | |
| "loss": 3.2842, | |
| "step": 57400 | |
| }, | |
| { | |
| "epoch": 16.73467723141459, | |
| "grad_norm": 0.4007303714752197, | |
| "learning_rate": 0.000399420577091227, | |
| "loss": 3.2829, | |
| "step": 57450 | |
| }, | |
| { | |
| "epoch": 16.749242600792357, | |
| "grad_norm": 0.3843965232372284, | |
| "learning_rate": 0.00039924570096181865, | |
| "loss": 3.2901, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 16.763807970170124, | |
| "grad_norm": 0.3941800594329834, | |
| "learning_rate": 0.00039907082483241034, | |
| "loss": 3.2858, | |
| "step": 57550 | |
| }, | |
| { | |
| "epoch": 16.77837333954789, | |
| "grad_norm": 0.37438079714775085, | |
| "learning_rate": 0.000398895948703002, | |
| "loss": 3.2916, | |
| "step": 57600 | |
| }, | |
| { | |
| "epoch": 16.792938708925657, | |
| "grad_norm": 0.3703000545501709, | |
| "learning_rate": 0.00039872107257359367, | |
| "loss": 3.3016, | |
| "step": 57650 | |
| }, | |
| { | |
| "epoch": 16.807504078303424, | |
| "grad_norm": 0.3948332369327545, | |
| "learning_rate": 0.0003985461964441853, | |
| "loss": 3.3057, | |
| "step": 57700 | |
| }, | |
| { | |
| "epoch": 16.822069447681194, | |
| "grad_norm": 0.38669082522392273, | |
| "learning_rate": 0.00039837132031477694, | |
| "loss": 3.2897, | |
| "step": 57750 | |
| }, | |
| { | |
| "epoch": 16.83663481705896, | |
| "grad_norm": 0.3628772497177124, | |
| "learning_rate": 0.0003981964441853687, | |
| "loss": 3.2891, | |
| "step": 57800 | |
| }, | |
| { | |
| "epoch": 16.851200186436728, | |
| "grad_norm": 0.39237385988235474, | |
| "learning_rate": 0.0003980215680559603, | |
| "loss": 3.3021, | |
| "step": 57850 | |
| }, | |
| { | |
| "epoch": 16.865765555814495, | |
| "grad_norm": 0.3908953070640564, | |
| "learning_rate": 0.000397846691926552, | |
| "loss": 3.2962, | |
| "step": 57900 | |
| }, | |
| { | |
| "epoch": 16.88033092519226, | |
| "grad_norm": 0.3867229223251343, | |
| "learning_rate": 0.00039767181579714365, | |
| "loss": 3.2861, | |
| "step": 57950 | |
| }, | |
| { | |
| "epoch": 16.89489629457003, | |
| "grad_norm": 0.3902886211872101, | |
| "learning_rate": 0.00039749693966773534, | |
| "loss": 3.3034, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 16.89489629457003, | |
| "eval_accuracy": 0.3726240198363548, | |
| "eval_loss": 3.5381805896759033, | |
| "eval_runtime": 441.5045, | |
| "eval_samples_per_second": 37.701, | |
| "eval_steps_per_second": 2.358, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 16.9094616639478, | |
| "grad_norm": 0.38796380162239075, | |
| "learning_rate": 0.00039732206353832697, | |
| "loss": 3.2994, | |
| "step": 58050 | |
| }, | |
| { | |
| "epoch": 16.924027033325565, | |
| "grad_norm": 0.35192742943763733, | |
| "learning_rate": 0.0003971471874089186, | |
| "loss": 3.2959, | |
| "step": 58100 | |
| }, | |
| { | |
| "epoch": 16.938592402703332, | |
| "grad_norm": 0.372641384601593, | |
| "learning_rate": 0.0003969723112795103, | |
| "loss": 3.3026, | |
| "step": 58150 | |
| }, | |
| { | |
| "epoch": 16.9531577720811, | |
| "grad_norm": 0.37450307607650757, | |
| "learning_rate": 0.00039679743515010194, | |
| "loss": 3.3098, | |
| "step": 58200 | |
| }, | |
| { | |
| "epoch": 16.96772314145887, | |
| "grad_norm": 0.38844752311706543, | |
| "learning_rate": 0.0003966225590206937, | |
| "loss": 3.3031, | |
| "step": 58250 | |
| }, | |
| { | |
| "epoch": 16.982288510836636, | |
| "grad_norm": 0.37731024622917175, | |
| "learning_rate": 0.0003964476828912853, | |
| "loss": 3.3081, | |
| "step": 58300 | |
| }, | |
| { | |
| "epoch": 16.996853880214402, | |
| "grad_norm": 0.37375837564468384, | |
| "learning_rate": 0.00039627280676187695, | |
| "loss": 3.2973, | |
| "step": 58350 | |
| }, | |
| { | |
| "epoch": 17.01136098811466, | |
| "grad_norm": 0.4000365436077118, | |
| "learning_rate": 0.00039609793063246864, | |
| "loss": 3.2205, | |
| "step": 58400 | |
| }, | |
| { | |
| "epoch": 17.025926357492427, | |
| "grad_norm": 0.3670046031475067, | |
| "learning_rate": 0.0003959230545030603, | |
| "loss": 3.1889, | |
| "step": 58450 | |
| }, | |
| { | |
| "epoch": 17.040491726870194, | |
| "grad_norm": 0.39600327610969543, | |
| "learning_rate": 0.00039574817837365197, | |
| "loss": 3.2056, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 17.05505709624796, | |
| "grad_norm": 0.38830217719078064, | |
| "learning_rate": 0.0003955733022442436, | |
| "loss": 3.2047, | |
| "step": 58550 | |
| }, | |
| { | |
| "epoch": 17.069622465625727, | |
| "grad_norm": 0.394195556640625, | |
| "learning_rate": 0.0003953984261148353, | |
| "loss": 3.2156, | |
| "step": 58600 | |
| }, | |
| { | |
| "epoch": 17.084187835003497, | |
| "grad_norm": 0.3784361183643341, | |
| "learning_rate": 0.00039522354998542693, | |
| "loss": 3.2088, | |
| "step": 58650 | |
| }, | |
| { | |
| "epoch": 17.098753204381264, | |
| "grad_norm": 0.4057703912258148, | |
| "learning_rate": 0.00039504867385601857, | |
| "loss": 3.2158, | |
| "step": 58700 | |
| }, | |
| { | |
| "epoch": 17.11331857375903, | |
| "grad_norm": 0.37357842922210693, | |
| "learning_rate": 0.0003948737977266103, | |
| "loss": 3.2155, | |
| "step": 58750 | |
| }, | |
| { | |
| "epoch": 17.127883943136798, | |
| "grad_norm": 0.3923245370388031, | |
| "learning_rate": 0.00039469892159720195, | |
| "loss": 3.2217, | |
| "step": 58800 | |
| }, | |
| { | |
| "epoch": 17.142449312514564, | |
| "grad_norm": 0.4075673520565033, | |
| "learning_rate": 0.00039452404546779364, | |
| "loss": 3.2176, | |
| "step": 58850 | |
| }, | |
| { | |
| "epoch": 17.15701468189233, | |
| "grad_norm": 0.37942689657211304, | |
| "learning_rate": 0.0003943491693383853, | |
| "loss": 3.2212, | |
| "step": 58900 | |
| }, | |
| { | |
| "epoch": 17.1715800512701, | |
| "grad_norm": 0.39208337664604187, | |
| "learning_rate": 0.0003941742932089769, | |
| "loss": 3.2204, | |
| "step": 58950 | |
| }, | |
| { | |
| "epoch": 17.18614542064787, | |
| "grad_norm": 0.3947177529335022, | |
| "learning_rate": 0.0003939994170795686, | |
| "loss": 3.2318, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 17.18614542064787, | |
| "eval_accuracy": 0.37187474832742445, | |
| "eval_loss": 3.5549113750457764, | |
| "eval_runtime": 179.6498, | |
| "eval_samples_per_second": 92.653, | |
| "eval_steps_per_second": 5.795, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 17.200710790025635, | |
| "grad_norm": 0.4263794720172882, | |
| "learning_rate": 0.00039382454095016024, | |
| "loss": 3.2376, | |
| "step": 59050 | |
| }, | |
| { | |
| "epoch": 17.215276159403402, | |
| "grad_norm": 0.37080931663513184, | |
| "learning_rate": 0.00039364966482075193, | |
| "loss": 3.2378, | |
| "step": 59100 | |
| }, | |
| { | |
| "epoch": 17.22984152878117, | |
| "grad_norm": 0.39057183265686035, | |
| "learning_rate": 0.00039347478869134356, | |
| "loss": 3.2199, | |
| "step": 59150 | |
| }, | |
| { | |
| "epoch": 17.24440689815894, | |
| "grad_norm": 0.41177624464035034, | |
| "learning_rate": 0.0003932999125619353, | |
| "loss": 3.2296, | |
| "step": 59200 | |
| }, | |
| { | |
| "epoch": 17.258972267536706, | |
| "grad_norm": 0.4065467417240143, | |
| "learning_rate": 0.00039312503643252695, | |
| "loss": 3.2364, | |
| "step": 59250 | |
| }, | |
| { | |
| "epoch": 17.273537636914472, | |
| "grad_norm": 0.37535977363586426, | |
| "learning_rate": 0.0003929501603031186, | |
| "loss": 3.2309, | |
| "step": 59300 | |
| }, | |
| { | |
| "epoch": 17.28810300629224, | |
| "grad_norm": 0.4139235019683838, | |
| "learning_rate": 0.00039277528417371027, | |
| "loss": 3.2482, | |
| "step": 59350 | |
| }, | |
| { | |
| "epoch": 17.302668375670006, | |
| "grad_norm": 0.3840341866016388, | |
| "learning_rate": 0.0003926004080443019, | |
| "loss": 3.2422, | |
| "step": 59400 | |
| }, | |
| { | |
| "epoch": 17.317233745047773, | |
| "grad_norm": 0.3817002475261688, | |
| "learning_rate": 0.0003924255319148936, | |
| "loss": 3.2387, | |
| "step": 59450 | |
| }, | |
| { | |
| "epoch": 17.331799114425543, | |
| "grad_norm": 0.3794045150279999, | |
| "learning_rate": 0.00039225065578548523, | |
| "loss": 3.2531, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 17.34636448380331, | |
| "grad_norm": 0.3869137465953827, | |
| "learning_rate": 0.00039207577965607687, | |
| "loss": 3.2437, | |
| "step": 59550 | |
| }, | |
| { | |
| "epoch": 17.360929853181077, | |
| "grad_norm": 0.39294636249542236, | |
| "learning_rate": 0.00039190090352666856, | |
| "loss": 3.247, | |
| "step": 59600 | |
| }, | |
| { | |
| "epoch": 17.375495222558843, | |
| "grad_norm": 0.37759485840797424, | |
| "learning_rate": 0.0003917260273972602, | |
| "loss": 3.2565, | |
| "step": 59650 | |
| }, | |
| { | |
| "epoch": 17.39006059193661, | |
| "grad_norm": 0.379200279712677, | |
| "learning_rate": 0.00039155115126785194, | |
| "loss": 3.245, | |
| "step": 59700 | |
| }, | |
| { | |
| "epoch": 17.40462596131438, | |
| "grad_norm": 0.40147554874420166, | |
| "learning_rate": 0.0003913762751384436, | |
| "loss": 3.2482, | |
| "step": 59750 | |
| }, | |
| { | |
| "epoch": 17.419191330692147, | |
| "grad_norm": 0.38646212220191956, | |
| "learning_rate": 0.00039120139900903527, | |
| "loss": 3.254, | |
| "step": 59800 | |
| }, | |
| { | |
| "epoch": 17.433756700069914, | |
| "grad_norm": 0.3718118965625763, | |
| "learning_rate": 0.0003910265228796269, | |
| "loss": 3.2701, | |
| "step": 59850 | |
| }, | |
| { | |
| "epoch": 17.44832206944768, | |
| "grad_norm": 0.4207517206668854, | |
| "learning_rate": 0.00039085164675021854, | |
| "loss": 3.2703, | |
| "step": 59900 | |
| }, | |
| { | |
| "epoch": 17.462887438825447, | |
| "grad_norm": 0.41934168338775635, | |
| "learning_rate": 0.00039067677062081023, | |
| "loss": 3.2493, | |
| "step": 59950 | |
| }, | |
| { | |
| "epoch": 17.477452808203218, | |
| "grad_norm": 0.377540647983551, | |
| "learning_rate": 0.00039050189449140187, | |
| "loss": 3.2693, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 17.477452808203218, | |
| "eval_accuracy": 0.37255406588251616, | |
| "eval_loss": 3.5508663654327393, | |
| "eval_runtime": 179.5955, | |
| "eval_samples_per_second": 92.681, | |
| "eval_steps_per_second": 5.796, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 17.492018177580984, | |
| "grad_norm": 0.39113548398017883, | |
| "learning_rate": 0.00039032701836199356, | |
| "loss": 3.2626, | |
| "step": 60050 | |
| }, | |
| { | |
| "epoch": 17.50658354695875, | |
| "grad_norm": 0.4056381285190582, | |
| "learning_rate": 0.0003901521422325852, | |
| "loss": 3.263, | |
| "step": 60100 | |
| }, | |
| { | |
| "epoch": 17.521148916336518, | |
| "grad_norm": 0.3636176884174347, | |
| "learning_rate": 0.00038997726610317683, | |
| "loss": 3.2644, | |
| "step": 60150 | |
| }, | |
| { | |
| "epoch": 17.535714285714285, | |
| "grad_norm": 0.3688655495643616, | |
| "learning_rate": 0.0003898023899737686, | |
| "loss": 3.2731, | |
| "step": 60200 | |
| }, | |
| { | |
| "epoch": 17.55027965509205, | |
| "grad_norm": 0.39269739389419556, | |
| "learning_rate": 0.0003896275138443602, | |
| "loss": 3.2603, | |
| "step": 60250 | |
| }, | |
| { | |
| "epoch": 17.56484502446982, | |
| "grad_norm": 0.3798394799232483, | |
| "learning_rate": 0.0003894526377149519, | |
| "loss": 3.2662, | |
| "step": 60300 | |
| }, | |
| { | |
| "epoch": 17.57941039384759, | |
| "grad_norm": 0.3824335038661957, | |
| "learning_rate": 0.00038927776158554354, | |
| "loss": 3.2774, | |
| "step": 60350 | |
| }, | |
| { | |
| "epoch": 17.593975763225355, | |
| "grad_norm": 0.38308337330818176, | |
| "learning_rate": 0.0003891028854561352, | |
| "loss": 3.2772, | |
| "step": 60400 | |
| }, | |
| { | |
| "epoch": 17.608541132603122, | |
| "grad_norm": 0.3748604655265808, | |
| "learning_rate": 0.00038892800932672686, | |
| "loss": 3.2586, | |
| "step": 60450 | |
| }, | |
| { | |
| "epoch": 17.62310650198089, | |
| "grad_norm": 0.40975135564804077, | |
| "learning_rate": 0.0003887531331973185, | |
| "loss": 3.2655, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 17.63767187135866, | |
| "grad_norm": 0.38740789890289307, | |
| "learning_rate": 0.0003885782570679102, | |
| "loss": 3.2761, | |
| "step": 60550 | |
| }, | |
| { | |
| "epoch": 17.652237240736426, | |
| "grad_norm": 0.3646203577518463, | |
| "learning_rate": 0.0003884033809385018, | |
| "loss": 3.2751, | |
| "step": 60600 | |
| }, | |
| { | |
| "epoch": 17.666802610114193, | |
| "grad_norm": 0.37517863512039185, | |
| "learning_rate": 0.00038822850480909357, | |
| "loss": 3.2792, | |
| "step": 60650 | |
| }, | |
| { | |
| "epoch": 17.68136797949196, | |
| "grad_norm": 0.3650130033493042, | |
| "learning_rate": 0.0003880536286796852, | |
| "loss": 3.2693, | |
| "step": 60700 | |
| }, | |
| { | |
| "epoch": 17.695933348869726, | |
| "grad_norm": 0.4000101089477539, | |
| "learning_rate": 0.00038787875255027684, | |
| "loss": 3.2749, | |
| "step": 60750 | |
| }, | |
| { | |
| "epoch": 17.710498718247496, | |
| "grad_norm": 0.35344168543815613, | |
| "learning_rate": 0.00038770387642086853, | |
| "loss": 3.2679, | |
| "step": 60800 | |
| }, | |
| { | |
| "epoch": 17.725064087625263, | |
| "grad_norm": 0.40958935022354126, | |
| "learning_rate": 0.00038752900029146017, | |
| "loss": 3.2857, | |
| "step": 60850 | |
| }, | |
| { | |
| "epoch": 17.73962945700303, | |
| "grad_norm": 0.377948135137558, | |
| "learning_rate": 0.00038735412416205186, | |
| "loss": 3.2691, | |
| "step": 60900 | |
| }, | |
| { | |
| "epoch": 17.754194826380797, | |
| "grad_norm": 0.4192025065422058, | |
| "learning_rate": 0.0003871792480326435, | |
| "loss": 3.2764, | |
| "step": 60950 | |
| }, | |
| { | |
| "epoch": 17.768760195758563, | |
| "grad_norm": 0.3829701244831085, | |
| "learning_rate": 0.00038700437190323513, | |
| "loss": 3.2845, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 17.768760195758563, | |
| "eval_accuracy": 0.3727979053787536, | |
| "eval_loss": 3.5411858558654785, | |
| "eval_runtime": 179.8099, | |
| "eval_samples_per_second": 92.57, | |
| "eval_steps_per_second": 5.789, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 17.78332556513633, | |
| "grad_norm": 0.34948283433914185, | |
| "learning_rate": 0.0003868294957738268, | |
| "loss": 3.2742, | |
| "step": 61050 | |
| }, | |
| { | |
| "epoch": 17.7978909345141, | |
| "grad_norm": 0.3973924219608307, | |
| "learning_rate": 0.00038665461964441846, | |
| "loss": 3.2799, | |
| "step": 61100 | |
| }, | |
| { | |
| "epoch": 17.812456303891867, | |
| "grad_norm": 0.39604613184928894, | |
| "learning_rate": 0.0003864797435150102, | |
| "loss": 3.2907, | |
| "step": 61150 | |
| }, | |
| { | |
| "epoch": 17.827021673269634, | |
| "grad_norm": 0.3890770971775055, | |
| "learning_rate": 0.00038630486738560184, | |
| "loss": 3.2818, | |
| "step": 61200 | |
| }, | |
| { | |
| "epoch": 17.8415870426474, | |
| "grad_norm": 0.39360764622688293, | |
| "learning_rate": 0.00038612999125619353, | |
| "loss": 3.2866, | |
| "step": 61250 | |
| }, | |
| { | |
| "epoch": 17.856152412025168, | |
| "grad_norm": 0.3879394233226776, | |
| "learning_rate": 0.00038595511512678517, | |
| "loss": 3.2828, | |
| "step": 61300 | |
| }, | |
| { | |
| "epoch": 17.870717781402938, | |
| "grad_norm": 0.3946910500526428, | |
| "learning_rate": 0.0003857802389973768, | |
| "loss": 3.2841, | |
| "step": 61350 | |
| }, | |
| { | |
| "epoch": 17.885283150780705, | |
| "grad_norm": 0.3722352981567383, | |
| "learning_rate": 0.0003856053628679685, | |
| "loss": 3.2906, | |
| "step": 61400 | |
| }, | |
| { | |
| "epoch": 17.89984852015847, | |
| "grad_norm": 0.37943729758262634, | |
| "learning_rate": 0.00038543048673856013, | |
| "loss": 3.2828, | |
| "step": 61450 | |
| }, | |
| { | |
| "epoch": 17.914413889536238, | |
| "grad_norm": 0.3946760594844818, | |
| "learning_rate": 0.0003852556106091518, | |
| "loss": 3.2874, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 17.928979258914005, | |
| "grad_norm": 0.38303500413894653, | |
| "learning_rate": 0.00038508073447974346, | |
| "loss": 3.2743, | |
| "step": 61550 | |
| }, | |
| { | |
| "epoch": 17.943544628291775, | |
| "grad_norm": 0.35602617263793945, | |
| "learning_rate": 0.0003849058583503351, | |
| "loss": 3.2839, | |
| "step": 61600 | |
| }, | |
| { | |
| "epoch": 17.958109997669542, | |
| "grad_norm": 0.364681601524353, | |
| "learning_rate": 0.00038473098222092684, | |
| "loss": 3.2894, | |
| "step": 61650 | |
| }, | |
| { | |
| "epoch": 17.97267536704731, | |
| "grad_norm": 0.3929082155227661, | |
| "learning_rate": 0.0003845561060915185, | |
| "loss": 3.2816, | |
| "step": 61700 | |
| }, | |
| { | |
| "epoch": 17.987240736425075, | |
| "grad_norm": 0.37357085943222046, | |
| "learning_rate": 0.00038438122996211016, | |
| "loss": 3.295, | |
| "step": 61750 | |
| }, | |
| { | |
| "epoch": 18.001747844325333, | |
| "grad_norm": 0.4023423492908478, | |
| "learning_rate": 0.0003842063538327018, | |
| "loss": 3.2778, | |
| "step": 61800 | |
| }, | |
| { | |
| "epoch": 18.0163132137031, | |
| "grad_norm": 0.3764852285385132, | |
| "learning_rate": 0.0003840314777032935, | |
| "loss": 3.175, | |
| "step": 61850 | |
| }, | |
| { | |
| "epoch": 18.030878583080867, | |
| "grad_norm": 0.4034103453159332, | |
| "learning_rate": 0.0003838566015738851, | |
| "loss": 3.1882, | |
| "step": 61900 | |
| }, | |
| { | |
| "epoch": 18.045443952458633, | |
| "grad_norm": 0.3662955164909363, | |
| "learning_rate": 0.00038368172544447676, | |
| "loss": 3.1941, | |
| "step": 61950 | |
| }, | |
| { | |
| "epoch": 18.0600093218364, | |
| "grad_norm": 0.3883397579193115, | |
| "learning_rate": 0.00038350684931506845, | |
| "loss": 3.2022, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 18.0600093218364, | |
| "eval_accuracy": 0.3724488410275824, | |
| "eval_loss": 3.5511727333068848, | |
| "eval_runtime": 179.5651, | |
| "eval_samples_per_second": 92.696, | |
| "eval_steps_per_second": 5.797, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 18.07457469121417, | |
| "grad_norm": 0.39406126737594604, | |
| "learning_rate": 0.0003833319731856601, | |
| "loss": 3.1855, | |
| "step": 62050 | |
| }, | |
| { | |
| "epoch": 18.089140060591937, | |
| "grad_norm": 0.39789289236068726, | |
| "learning_rate": 0.00038315709705625183, | |
| "loss": 3.1935, | |
| "step": 62100 | |
| }, | |
| { | |
| "epoch": 18.103705429969704, | |
| "grad_norm": 0.38348227739334106, | |
| "learning_rate": 0.00038298222092684347, | |
| "loss": 3.1978, | |
| "step": 62150 | |
| }, | |
| { | |
| "epoch": 18.11827079934747, | |
| "grad_norm": 0.3813340663909912, | |
| "learning_rate": 0.0003828073447974351, | |
| "loss": 3.2017, | |
| "step": 62200 | |
| }, | |
| { | |
| "epoch": 18.132836168725238, | |
| "grad_norm": 0.4284285008907318, | |
| "learning_rate": 0.0003826324686680268, | |
| "loss": 3.2196, | |
| "step": 62250 | |
| }, | |
| { | |
| "epoch": 18.147401538103008, | |
| "grad_norm": 0.39629238843917847, | |
| "learning_rate": 0.00038245759253861843, | |
| "loss": 3.2077, | |
| "step": 62300 | |
| }, | |
| { | |
| "epoch": 18.161966907480775, | |
| "grad_norm": 0.40169140696525574, | |
| "learning_rate": 0.0003822827164092101, | |
| "loss": 3.2137, | |
| "step": 62350 | |
| }, | |
| { | |
| "epoch": 18.17653227685854, | |
| "grad_norm": 0.3855275511741638, | |
| "learning_rate": 0.00038210784027980176, | |
| "loss": 3.2136, | |
| "step": 62400 | |
| }, | |
| { | |
| "epoch": 18.191097646236308, | |
| "grad_norm": 0.3961770236492157, | |
| "learning_rate": 0.0003819329641503934, | |
| "loss": 3.2086, | |
| "step": 62450 | |
| }, | |
| { | |
| "epoch": 18.205663015614075, | |
| "grad_norm": 0.37826651334762573, | |
| "learning_rate": 0.0003817580880209851, | |
| "loss": 3.2079, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 18.22022838499184, | |
| "grad_norm": 0.4296334385871887, | |
| "learning_rate": 0.0003815832118915767, | |
| "loss": 3.2111, | |
| "step": 62550 | |
| }, | |
| { | |
| "epoch": 18.234793754369612, | |
| "grad_norm": 0.40578290820121765, | |
| "learning_rate": 0.00038140833576216847, | |
| "loss": 3.2258, | |
| "step": 62600 | |
| }, | |
| { | |
| "epoch": 18.24935912374738, | |
| "grad_norm": 0.4326179027557373, | |
| "learning_rate": 0.0003812334596327601, | |
| "loss": 3.2318, | |
| "step": 62650 | |
| }, | |
| { | |
| "epoch": 18.263924493125145, | |
| "grad_norm": 0.40364038944244385, | |
| "learning_rate": 0.0003810585835033518, | |
| "loss": 3.2235, | |
| "step": 62700 | |
| }, | |
| { | |
| "epoch": 18.278489862502912, | |
| "grad_norm": 0.41583460569381714, | |
| "learning_rate": 0.00038088370737394343, | |
| "loss": 3.2385, | |
| "step": 62750 | |
| }, | |
| { | |
| "epoch": 18.29305523188068, | |
| "grad_norm": 0.3775072693824768, | |
| "learning_rate": 0.00038070883124453507, | |
| "loss": 3.2353, | |
| "step": 62800 | |
| }, | |
| { | |
| "epoch": 18.30762060125845, | |
| "grad_norm": 0.4191801846027374, | |
| "learning_rate": 0.00038053395511512676, | |
| "loss": 3.2385, | |
| "step": 62850 | |
| }, | |
| { | |
| "epoch": 18.322185970636216, | |
| "grad_norm": 0.3996080458164215, | |
| "learning_rate": 0.0003803590789857184, | |
| "loss": 3.22, | |
| "step": 62900 | |
| }, | |
| { | |
| "epoch": 18.336751340013983, | |
| "grad_norm": 0.3744161128997803, | |
| "learning_rate": 0.0003801842028563101, | |
| "loss": 3.242, | |
| "step": 62950 | |
| }, | |
| { | |
| "epoch": 18.35131670939175, | |
| "grad_norm": 0.3929766118526459, | |
| "learning_rate": 0.0003800093267269017, | |
| "loss": 3.2434, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 18.35131670939175, | |
| "eval_accuracy": 0.3726039154227306, | |
| "eval_loss": 3.551494836807251, | |
| "eval_runtime": 179.6728, | |
| "eval_samples_per_second": 92.641, | |
| "eval_steps_per_second": 5.794, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 18.365882078769516, | |
| "grad_norm": 0.37301939725875854, | |
| "learning_rate": 0.00037983445059749335, | |
| "loss": 3.2367, | |
| "step": 63050 | |
| }, | |
| { | |
| "epoch": 18.380447448147287, | |
| "grad_norm": 0.3646920323371887, | |
| "learning_rate": 0.0003796595744680851, | |
| "loss": 3.2411, | |
| "step": 63100 | |
| }, | |
| { | |
| "epoch": 18.395012817525053, | |
| "grad_norm": 0.41786086559295654, | |
| "learning_rate": 0.00037948469833867674, | |
| "loss": 3.238, | |
| "step": 63150 | |
| }, | |
| { | |
| "epoch": 18.40957818690282, | |
| "grad_norm": 0.39375847578048706, | |
| "learning_rate": 0.0003793098222092684, | |
| "loss": 3.2392, | |
| "step": 63200 | |
| }, | |
| { | |
| "epoch": 18.424143556280587, | |
| "grad_norm": 0.3906821310520172, | |
| "learning_rate": 0.00037913494607986006, | |
| "loss": 3.2303, | |
| "step": 63250 | |
| }, | |
| { | |
| "epoch": 18.438708925658354, | |
| "grad_norm": 0.38584890961647034, | |
| "learning_rate": 0.00037896006995045175, | |
| "loss": 3.2507, | |
| "step": 63300 | |
| }, | |
| { | |
| "epoch": 18.45327429503612, | |
| "grad_norm": 0.37560147047042847, | |
| "learning_rate": 0.0003787851938210434, | |
| "loss": 3.2401, | |
| "step": 63350 | |
| }, | |
| { | |
| "epoch": 18.46783966441389, | |
| "grad_norm": 0.39870715141296387, | |
| "learning_rate": 0.000378610317691635, | |
| "loss": 3.2559, | |
| "step": 63400 | |
| }, | |
| { | |
| "epoch": 18.482405033791657, | |
| "grad_norm": 0.416790634393692, | |
| "learning_rate": 0.0003784354415622267, | |
| "loss": 3.252, | |
| "step": 63450 | |
| }, | |
| { | |
| "epoch": 18.496970403169424, | |
| "grad_norm": 0.3930261731147766, | |
| "learning_rate": 0.00037826056543281835, | |
| "loss": 3.2556, | |
| "step": 63500 | |
| }, | |
| { | |
| "epoch": 18.51153577254719, | |
| "grad_norm": 0.38571596145629883, | |
| "learning_rate": 0.0003780856893034101, | |
| "loss": 3.2523, | |
| "step": 63550 | |
| }, | |
| { | |
| "epoch": 18.526101141924958, | |
| "grad_norm": 0.3790442943572998, | |
| "learning_rate": 0.00037791081317400173, | |
| "loss": 3.2502, | |
| "step": 63600 | |
| }, | |
| { | |
| "epoch": 18.540666511302728, | |
| "grad_norm": 0.40154215693473816, | |
| "learning_rate": 0.00037773593704459337, | |
| "loss": 3.2554, | |
| "step": 63650 | |
| }, | |
| { | |
| "epoch": 18.555231880680495, | |
| "grad_norm": 0.3869607746601105, | |
| "learning_rate": 0.00037756106091518506, | |
| "loss": 3.2674, | |
| "step": 63700 | |
| }, | |
| { | |
| "epoch": 18.56979725005826, | |
| "grad_norm": 0.36808493733406067, | |
| "learning_rate": 0.0003773861847857767, | |
| "loss": 3.2566, | |
| "step": 63750 | |
| }, | |
| { | |
| "epoch": 18.58436261943603, | |
| "grad_norm": 0.4031069278717041, | |
| "learning_rate": 0.0003772113086563684, | |
| "loss": 3.2647, | |
| "step": 63800 | |
| }, | |
| { | |
| "epoch": 18.598927988813795, | |
| "grad_norm": 0.39664480090141296, | |
| "learning_rate": 0.00037703643252696, | |
| "loss": 3.2611, | |
| "step": 63850 | |
| }, | |
| { | |
| "epoch": 18.613493358191565, | |
| "grad_norm": 0.4211257994174957, | |
| "learning_rate": 0.0003768615563975517, | |
| "loss": 3.2466, | |
| "step": 63900 | |
| }, | |
| { | |
| "epoch": 18.628058727569332, | |
| "grad_norm": 0.37485969066619873, | |
| "learning_rate": 0.00037668668026814335, | |
| "loss": 3.2698, | |
| "step": 63950 | |
| }, | |
| { | |
| "epoch": 18.6426240969471, | |
| "grad_norm": 0.3820188343524933, | |
| "learning_rate": 0.000376511804138735, | |
| "loss": 3.2583, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 18.6426240969471, | |
| "eval_accuracy": 0.37283764392732077, | |
| "eval_loss": 3.542705535888672, | |
| "eval_runtime": 179.7091, | |
| "eval_samples_per_second": 92.622, | |
| "eval_steps_per_second": 5.793, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 18.657189466324866, | |
| "grad_norm": 0.3915201425552368, | |
| "learning_rate": 0.00037633692800932673, | |
| "loss": 3.2704, | |
| "step": 64050 | |
| }, | |
| { | |
| "epoch": 18.671754835702632, | |
| "grad_norm": 0.36770007014274597, | |
| "learning_rate": 0.00037616205187991837, | |
| "loss": 3.2531, | |
| "step": 64100 | |
| }, | |
| { | |
| "epoch": 18.6863202050804, | |
| "grad_norm": 0.4022904336452484, | |
| "learning_rate": 0.00037598717575051006, | |
| "loss": 3.2499, | |
| "step": 64150 | |
| }, | |
| { | |
| "epoch": 18.70088557445817, | |
| "grad_norm": 0.36411207914352417, | |
| "learning_rate": 0.0003758122996211017, | |
| "loss": 3.2623, | |
| "step": 64200 | |
| }, | |
| { | |
| "epoch": 18.715450943835936, | |
| "grad_norm": 0.37535756826400757, | |
| "learning_rate": 0.00037563742349169333, | |
| "loss": 3.2548, | |
| "step": 64250 | |
| }, | |
| { | |
| "epoch": 18.730016313213703, | |
| "grad_norm": 0.3946349322795868, | |
| "learning_rate": 0.000375462547362285, | |
| "loss": 3.2668, | |
| "step": 64300 | |
| }, | |
| { | |
| "epoch": 18.74458168259147, | |
| "grad_norm": 0.4044114053249359, | |
| "learning_rate": 0.00037528767123287665, | |
| "loss": 3.2717, | |
| "step": 64350 | |
| }, | |
| { | |
| "epoch": 18.759147051969236, | |
| "grad_norm": 0.3657906949520111, | |
| "learning_rate": 0.00037511279510346834, | |
| "loss": 3.2569, | |
| "step": 64400 | |
| }, | |
| { | |
| "epoch": 18.773712421347007, | |
| "grad_norm": 0.3859136402606964, | |
| "learning_rate": 0.00037493791897406, | |
| "loss": 3.2739, | |
| "step": 64450 | |
| }, | |
| { | |
| "epoch": 18.788277790724774, | |
| "grad_norm": 0.38982921838760376, | |
| "learning_rate": 0.0003747630428446516, | |
| "loss": 3.2765, | |
| "step": 64500 | |
| }, | |
| { | |
| "epoch": 18.80284316010254, | |
| "grad_norm": 0.3761852979660034, | |
| "learning_rate": 0.00037458816671524336, | |
| "loss": 3.2637, | |
| "step": 64550 | |
| }, | |
| { | |
| "epoch": 18.817408529480307, | |
| "grad_norm": 0.3764474093914032, | |
| "learning_rate": 0.000374413290585835, | |
| "loss": 3.2715, | |
| "step": 64600 | |
| }, | |
| { | |
| "epoch": 18.831973898858074, | |
| "grad_norm": 0.37012961506843567, | |
| "learning_rate": 0.0003742384144564267, | |
| "loss": 3.2756, | |
| "step": 64650 | |
| }, | |
| { | |
| "epoch": 18.846539268235844, | |
| "grad_norm": 0.4159339964389801, | |
| "learning_rate": 0.0003740635383270183, | |
| "loss": 3.2728, | |
| "step": 64700 | |
| }, | |
| { | |
| "epoch": 18.86110463761361, | |
| "grad_norm": 0.3688717484474182, | |
| "learning_rate": 0.00037388866219761, | |
| "loss": 3.2715, | |
| "step": 64750 | |
| }, | |
| { | |
| "epoch": 18.875670006991378, | |
| "grad_norm": 0.4111153185367584, | |
| "learning_rate": 0.00037371378606820165, | |
| "loss": 3.283, | |
| "step": 64800 | |
| }, | |
| { | |
| "epoch": 18.890235376369144, | |
| "grad_norm": 0.4147163927555084, | |
| "learning_rate": 0.0003735389099387933, | |
| "loss": 3.2711, | |
| "step": 64850 | |
| }, | |
| { | |
| "epoch": 18.90480074574691, | |
| "grad_norm": 0.36633679270744324, | |
| "learning_rate": 0.000373364033809385, | |
| "loss": 3.2735, | |
| "step": 64900 | |
| }, | |
| { | |
| "epoch": 18.919366115124678, | |
| "grad_norm": 0.3624868094921112, | |
| "learning_rate": 0.0003731891576799766, | |
| "loss": 3.2772, | |
| "step": 64950 | |
| }, | |
| { | |
| "epoch": 18.93393148450245, | |
| "grad_norm": 0.38773536682128906, | |
| "learning_rate": 0.00037301428155056836, | |
| "loss": 3.2754, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 18.93393148450245, | |
| "eval_accuracy": 0.3732179818107963, | |
| "eval_loss": 3.5359609127044678, | |
| "eval_runtime": 179.6247, | |
| "eval_samples_per_second": 92.665, | |
| "eval_steps_per_second": 5.795, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 18.948496853880215, | |
| "grad_norm": 0.37369510531425476, | |
| "learning_rate": 0.00037283940542116, | |
| "loss": 3.2842, | |
| "step": 65050 | |
| }, | |
| { | |
| "epoch": 18.96306222325798, | |
| "grad_norm": 0.4039534032344818, | |
| "learning_rate": 0.00037266452929175163, | |
| "loss": 3.2827, | |
| "step": 65100 | |
| }, | |
| { | |
| "epoch": 18.97762759263575, | |
| "grad_norm": 0.36381030082702637, | |
| "learning_rate": 0.0003724896531623433, | |
| "loss": 3.2838, | |
| "step": 65150 | |
| }, | |
| { | |
| "epoch": 18.992192962013515, | |
| "grad_norm": 0.38101911544799805, | |
| "learning_rate": 0.00037231477703293496, | |
| "loss": 3.277, | |
| "step": 65200 | |
| }, | |
| { | |
| "epoch": 19.006700069913773, | |
| "grad_norm": 0.3940986692905426, | |
| "learning_rate": 0.00037213990090352665, | |
| "loss": 3.2168, | |
| "step": 65250 | |
| }, | |
| { | |
| "epoch": 19.02126543929154, | |
| "grad_norm": 0.44007158279418945, | |
| "learning_rate": 0.0003719650247741183, | |
| "loss": 3.1807, | |
| "step": 65300 | |
| }, | |
| { | |
| "epoch": 19.035830808669306, | |
| "grad_norm": 0.3865497410297394, | |
| "learning_rate": 0.00037179014864471, | |
| "loss": 3.1847, | |
| "step": 65350 | |
| }, | |
| { | |
| "epoch": 19.050396178047077, | |
| "grad_norm": 0.40062960982322693, | |
| "learning_rate": 0.0003716152725153016, | |
| "loss": 3.1729, | |
| "step": 65400 | |
| }, | |
| { | |
| "epoch": 19.064961547424844, | |
| "grad_norm": 0.3954075276851654, | |
| "learning_rate": 0.00037144039638589325, | |
| "loss": 3.1875, | |
| "step": 65450 | |
| }, | |
| { | |
| "epoch": 19.07952691680261, | |
| "grad_norm": 0.37583020329475403, | |
| "learning_rate": 0.000371265520256485, | |
| "loss": 3.1852, | |
| "step": 65500 | |
| }, | |
| { | |
| "epoch": 19.094092286180377, | |
| "grad_norm": 0.38721004128456116, | |
| "learning_rate": 0.00037109064412707663, | |
| "loss": 3.1877, | |
| "step": 65550 | |
| }, | |
| { | |
| "epoch": 19.108657655558144, | |
| "grad_norm": 0.4319014847278595, | |
| "learning_rate": 0.0003709157679976683, | |
| "loss": 3.2013, | |
| "step": 65600 | |
| }, | |
| { | |
| "epoch": 19.123223024935914, | |
| "grad_norm": 0.36834290623664856, | |
| "learning_rate": 0.00037074089186825995, | |
| "loss": 3.1977, | |
| "step": 65650 | |
| }, | |
| { | |
| "epoch": 19.13778839431368, | |
| "grad_norm": 0.4034636318683624, | |
| "learning_rate": 0.0003705660157388516, | |
| "loss": 3.1942, | |
| "step": 65700 | |
| }, | |
| { | |
| "epoch": 19.152353763691448, | |
| "grad_norm": 0.3813159763813019, | |
| "learning_rate": 0.0003703911396094433, | |
| "loss": 3.1803, | |
| "step": 65750 | |
| }, | |
| { | |
| "epoch": 19.166919133069214, | |
| "grad_norm": 0.3532137870788574, | |
| "learning_rate": 0.0003702162634800349, | |
| "loss": 3.2012, | |
| "step": 65800 | |
| }, | |
| { | |
| "epoch": 19.18148450244698, | |
| "grad_norm": 0.38538220524787903, | |
| "learning_rate": 0.0003700413873506266, | |
| "loss": 3.2056, | |
| "step": 65850 | |
| }, | |
| { | |
| "epoch": 19.196049871824748, | |
| "grad_norm": 0.36290931701660156, | |
| "learning_rate": 0.00036986651122121824, | |
| "loss": 3.214, | |
| "step": 65900 | |
| }, | |
| { | |
| "epoch": 19.210615241202518, | |
| "grad_norm": 0.4123310446739197, | |
| "learning_rate": 0.00036969163509181, | |
| "loss": 3.2063, | |
| "step": 65950 | |
| }, | |
| { | |
| "epoch": 19.225180610580285, | |
| "grad_norm": 0.41633448004722595, | |
| "learning_rate": 0.0003695167589624016, | |
| "loss": 3.1981, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 19.225180610580285, | |
| "eval_accuracy": 0.3726028572956977, | |
| "eval_loss": 3.5563158988952637, | |
| "eval_runtime": 179.6893, | |
| "eval_samples_per_second": 92.632, | |
| "eval_steps_per_second": 5.793, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 19.23974597995805, | |
| "grad_norm": 0.4212048053741455, | |
| "learning_rate": 0.00036934188283299326, | |
| "loss": 3.221, | |
| "step": 66050 | |
| }, | |
| { | |
| "epoch": 19.25431134933582, | |
| "grad_norm": 0.4007203280925751, | |
| "learning_rate": 0.00036916700670358495, | |
| "loss": 3.2141, | |
| "step": 66100 | |
| }, | |
| { | |
| "epoch": 19.268876718713585, | |
| "grad_norm": 0.4050043523311615, | |
| "learning_rate": 0.0003689921305741766, | |
| "loss": 3.2165, | |
| "step": 66150 | |
| }, | |
| { | |
| "epoch": 19.283442088091356, | |
| "grad_norm": 0.41472339630126953, | |
| "learning_rate": 0.0003688172544447683, | |
| "loss": 3.2188, | |
| "step": 66200 | |
| }, | |
| { | |
| "epoch": 19.298007457469122, | |
| "grad_norm": 0.3750508725643158, | |
| "learning_rate": 0.0003686423783153599, | |
| "loss": 3.2196, | |
| "step": 66250 | |
| }, | |
| { | |
| "epoch": 19.31257282684689, | |
| "grad_norm": 0.3770619034767151, | |
| "learning_rate": 0.00036846750218595155, | |
| "loss": 3.2164, | |
| "step": 66300 | |
| }, | |
| { | |
| "epoch": 19.327138196224656, | |
| "grad_norm": 0.36972129344940186, | |
| "learning_rate": 0.00036829262605654324, | |
| "loss": 3.2416, | |
| "step": 66350 | |
| }, | |
| { | |
| "epoch": 19.341703565602423, | |
| "grad_norm": 0.3639586567878723, | |
| "learning_rate": 0.0003681177499271349, | |
| "loss": 3.2269, | |
| "step": 66400 | |
| }, | |
| { | |
| "epoch": 19.356268934980193, | |
| "grad_norm": 0.410324364900589, | |
| "learning_rate": 0.0003679428737977266, | |
| "loss": 3.2258, | |
| "step": 66450 | |
| }, | |
| { | |
| "epoch": 19.37083430435796, | |
| "grad_norm": 0.4071573317050934, | |
| "learning_rate": 0.00036776799766831826, | |
| "loss": 3.2323, | |
| "step": 66500 | |
| }, | |
| { | |
| "epoch": 19.385399673735726, | |
| "grad_norm": 0.3902466595172882, | |
| "learning_rate": 0.0003675931215389099, | |
| "loss": 3.2259, | |
| "step": 66550 | |
| }, | |
| { | |
| "epoch": 19.399965043113493, | |
| "grad_norm": 0.37968191504478455, | |
| "learning_rate": 0.0003674182454095016, | |
| "loss": 3.2274, | |
| "step": 66600 | |
| }, | |
| { | |
| "epoch": 19.41453041249126, | |
| "grad_norm": 0.39837968349456787, | |
| "learning_rate": 0.0003672433692800932, | |
| "loss": 3.2345, | |
| "step": 66650 | |
| }, | |
| { | |
| "epoch": 19.429095781869027, | |
| "grad_norm": 0.38949036598205566, | |
| "learning_rate": 0.0003670684931506849, | |
| "loss": 3.2259, | |
| "step": 66700 | |
| }, | |
| { | |
| "epoch": 19.443661151246797, | |
| "grad_norm": 0.4259556531906128, | |
| "learning_rate": 0.00036689361702127655, | |
| "loss": 3.2506, | |
| "step": 66750 | |
| }, | |
| { | |
| "epoch": 19.458226520624564, | |
| "grad_norm": 0.3879312574863434, | |
| "learning_rate": 0.00036671874089186824, | |
| "loss": 3.2416, | |
| "step": 66800 | |
| }, | |
| { | |
| "epoch": 19.47279189000233, | |
| "grad_norm": 0.3887031078338623, | |
| "learning_rate": 0.00036654386476245987, | |
| "loss": 3.2328, | |
| "step": 66850 | |
| }, | |
| { | |
| "epoch": 19.487357259380097, | |
| "grad_norm": 0.41111478209495544, | |
| "learning_rate": 0.0003663689886330515, | |
| "loss": 3.2329, | |
| "step": 66900 | |
| }, | |
| { | |
| "epoch": 19.501922628757864, | |
| "grad_norm": 0.406820684671402, | |
| "learning_rate": 0.00036619411250364325, | |
| "loss": 3.2492, | |
| "step": 66950 | |
| }, | |
| { | |
| "epoch": 19.516487998135634, | |
| "grad_norm": 0.3900870382785797, | |
| "learning_rate": 0.0003660192363742349, | |
| "loss": 3.2406, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 19.516487998135634, | |
| "eval_accuracy": 0.37308265912026145, | |
| "eval_loss": 3.544171094894409, | |
| "eval_runtime": 179.6633, | |
| "eval_samples_per_second": 92.646, | |
| "eval_steps_per_second": 5.794, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 19.5310533675134, | |
| "grad_norm": 0.3849544823169708, | |
| "learning_rate": 0.0003658443602448266, | |
| "loss": 3.2406, | |
| "step": 67050 | |
| }, | |
| { | |
| "epoch": 19.545618736891168, | |
| "grad_norm": 0.4000382423400879, | |
| "learning_rate": 0.0003656694841154182, | |
| "loss": 3.2455, | |
| "step": 67100 | |
| }, | |
| { | |
| "epoch": 19.560184106268935, | |
| "grad_norm": 0.39659613370895386, | |
| "learning_rate": 0.00036549460798600985, | |
| "loss": 3.2367, | |
| "step": 67150 | |
| }, | |
| { | |
| "epoch": 19.5747494756467, | |
| "grad_norm": 0.36925145983695984, | |
| "learning_rate": 0.00036531973185660154, | |
| "loss": 3.2491, | |
| "step": 67200 | |
| }, | |
| { | |
| "epoch": 19.589314845024468, | |
| "grad_norm": 0.3747584819793701, | |
| "learning_rate": 0.0003651448557271932, | |
| "loss": 3.2431, | |
| "step": 67250 | |
| }, | |
| { | |
| "epoch": 19.60388021440224, | |
| "grad_norm": 0.371640682220459, | |
| "learning_rate": 0.00036496997959778487, | |
| "loss": 3.2545, | |
| "step": 67300 | |
| }, | |
| { | |
| "epoch": 19.618445583780005, | |
| "grad_norm": 0.38793015480041504, | |
| "learning_rate": 0.0003647951034683765, | |
| "loss": 3.2471, | |
| "step": 67350 | |
| }, | |
| { | |
| "epoch": 19.633010953157772, | |
| "grad_norm": 0.4079042375087738, | |
| "learning_rate": 0.00036462022733896825, | |
| "loss": 3.2497, | |
| "step": 67400 | |
| }, | |
| { | |
| "epoch": 19.64757632253554, | |
| "grad_norm": 0.39877283573150635, | |
| "learning_rate": 0.0003644453512095599, | |
| "loss": 3.2512, | |
| "step": 67450 | |
| }, | |
| { | |
| "epoch": 19.662141691913305, | |
| "grad_norm": 0.40305206179618835, | |
| "learning_rate": 0.0003642704750801515, | |
| "loss": 3.2577, | |
| "step": 67500 | |
| }, | |
| { | |
| "epoch": 19.676707061291076, | |
| "grad_norm": 0.3949699103832245, | |
| "learning_rate": 0.0003640955989507432, | |
| "loss": 3.2556, | |
| "step": 67550 | |
| }, | |
| { | |
| "epoch": 19.691272430668842, | |
| "grad_norm": 0.3933976888656616, | |
| "learning_rate": 0.00036392072282133485, | |
| "loss": 3.2607, | |
| "step": 67600 | |
| }, | |
| { | |
| "epoch": 19.70583780004661, | |
| "grad_norm": 0.4123631417751312, | |
| "learning_rate": 0.00036374584669192654, | |
| "loss": 3.2436, | |
| "step": 67650 | |
| }, | |
| { | |
| "epoch": 19.720403169424376, | |
| "grad_norm": 0.36142420768737793, | |
| "learning_rate": 0.0003635709705625182, | |
| "loss": 3.2589, | |
| "step": 67700 | |
| }, | |
| { | |
| "epoch": 19.734968538802143, | |
| "grad_norm": 0.4008404612541199, | |
| "learning_rate": 0.0003633960944331098, | |
| "loss": 3.2537, | |
| "step": 67750 | |
| }, | |
| { | |
| "epoch": 19.749533908179913, | |
| "grad_norm": 0.36675870418548584, | |
| "learning_rate": 0.0003632212183037015, | |
| "loss": 3.256, | |
| "step": 67800 | |
| }, | |
| { | |
| "epoch": 19.76409927755768, | |
| "grad_norm": 0.3671972155570984, | |
| "learning_rate": 0.00036304634217429314, | |
| "loss": 3.242, | |
| "step": 67850 | |
| }, | |
| { | |
| "epoch": 19.778664646935447, | |
| "grad_norm": 0.40335404872894287, | |
| "learning_rate": 0.0003628714660448849, | |
| "loss": 3.2533, | |
| "step": 67900 | |
| }, | |
| { | |
| "epoch": 19.793230016313213, | |
| "grad_norm": 0.4161832630634308, | |
| "learning_rate": 0.0003626965899154765, | |
| "loss": 3.2499, | |
| "step": 67950 | |
| }, | |
| { | |
| "epoch": 19.80779538569098, | |
| "grad_norm": 0.4160614311695099, | |
| "learning_rate": 0.0003625217137860682, | |
| "loss": 3.2603, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 19.80779538569098, | |
| "eval_accuracy": 0.3735364780476851, | |
| "eval_loss": 3.539116859436035, | |
| "eval_runtime": 179.5961, | |
| "eval_samples_per_second": 92.68, | |
| "eval_steps_per_second": 5.796, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 19.822360755068747, | |
| "grad_norm": 0.3749236464500427, | |
| "learning_rate": 0.00036234683765665985, | |
| "loss": 3.2614, | |
| "step": 68050 | |
| }, | |
| { | |
| "epoch": 19.836926124446517, | |
| "grad_norm": 0.39177680015563965, | |
| "learning_rate": 0.0003621719615272515, | |
| "loss": 3.257, | |
| "step": 68100 | |
| }, | |
| { | |
| "epoch": 19.851491493824284, | |
| "grad_norm": 0.3916817307472229, | |
| "learning_rate": 0.00036199708539784317, | |
| "loss": 3.264, | |
| "step": 68150 | |
| }, | |
| { | |
| "epoch": 19.86605686320205, | |
| "grad_norm": 0.43153202533721924, | |
| "learning_rate": 0.0003618222092684348, | |
| "loss": 3.2513, | |
| "step": 68200 | |
| }, | |
| { | |
| "epoch": 19.880622232579817, | |
| "grad_norm": 0.40334609150886536, | |
| "learning_rate": 0.0003616473331390265, | |
| "loss": 3.2527, | |
| "step": 68250 | |
| }, | |
| { | |
| "epoch": 19.895187601957584, | |
| "grad_norm": 0.4108611047267914, | |
| "learning_rate": 0.00036147245700961813, | |
| "loss": 3.2539, | |
| "step": 68300 | |
| }, | |
| { | |
| "epoch": 19.909752971335354, | |
| "grad_norm": 0.39317360520362854, | |
| "learning_rate": 0.00036129758088020977, | |
| "loss": 3.2706, | |
| "step": 68350 | |
| }, | |
| { | |
| "epoch": 19.92431834071312, | |
| "grad_norm": 0.3866609036922455, | |
| "learning_rate": 0.0003611227047508015, | |
| "loss": 3.2614, | |
| "step": 68400 | |
| }, | |
| { | |
| "epoch": 19.938883710090888, | |
| "grad_norm": 0.4029618501663208, | |
| "learning_rate": 0.00036094782862139315, | |
| "loss": 3.2586, | |
| "step": 68450 | |
| }, | |
| { | |
| "epoch": 19.953449079468655, | |
| "grad_norm": 0.3921782672405243, | |
| "learning_rate": 0.00036077295249198484, | |
| "loss": 3.2668, | |
| "step": 68500 | |
| }, | |
| { | |
| "epoch": 19.96801444884642, | |
| "grad_norm": 0.37679243087768555, | |
| "learning_rate": 0.0003605980763625765, | |
| "loss": 3.2669, | |
| "step": 68550 | |
| }, | |
| { | |
| "epoch": 19.982579818224192, | |
| "grad_norm": 0.39157822728157043, | |
| "learning_rate": 0.0003604232002331681, | |
| "loss": 3.2711, | |
| "step": 68600 | |
| }, | |
| { | |
| "epoch": 19.99714518760196, | |
| "grad_norm": 0.37484461069107056, | |
| "learning_rate": 0.0003602483241037598, | |
| "loss": 3.2748, | |
| "step": 68650 | |
| }, | |
| { | |
| "epoch": 20.011652295502213, | |
| "grad_norm": 0.3911686837673187, | |
| "learning_rate": 0.00036007344797435144, | |
| "loss": 3.184, | |
| "step": 68700 | |
| }, | |
| { | |
| "epoch": 20.026217664879983, | |
| "grad_norm": 0.37566766142845154, | |
| "learning_rate": 0.00035989857184494313, | |
| "loss": 3.1588, | |
| "step": 68750 | |
| }, | |
| { | |
| "epoch": 20.04078303425775, | |
| "grad_norm": 0.3786637485027313, | |
| "learning_rate": 0.00035972369571553477, | |
| "loss": 3.1597, | |
| "step": 68800 | |
| }, | |
| { | |
| "epoch": 20.055348403635517, | |
| "grad_norm": 0.4000002145767212, | |
| "learning_rate": 0.0003595488195861265, | |
| "loss": 3.1753, | |
| "step": 68850 | |
| }, | |
| { | |
| "epoch": 20.069913773013283, | |
| "grad_norm": 0.41424959897994995, | |
| "learning_rate": 0.00035937394345671815, | |
| "loss": 3.1716, | |
| "step": 68900 | |
| }, | |
| { | |
| "epoch": 20.08447914239105, | |
| "grad_norm": 0.4069176912307739, | |
| "learning_rate": 0.0003591990673273098, | |
| "loss": 3.1743, | |
| "step": 68950 | |
| }, | |
| { | |
| "epoch": 20.099044511768817, | |
| "grad_norm": 0.39178600907325745, | |
| "learning_rate": 0.0003590241911979015, | |
| "loss": 3.1759, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 20.099044511768817, | |
| "eval_accuracy": 0.3731403858283871, | |
| "eval_loss": 3.5491995811462402, | |
| "eval_runtime": 179.7147, | |
| "eval_samples_per_second": 92.619, | |
| "eval_steps_per_second": 5.793, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 20.113609881146587, | |
| "grad_norm": 0.42330384254455566, | |
| "learning_rate": 0.0003588493150684931, | |
| "loss": 3.1797, | |
| "step": 69050 | |
| }, | |
| { | |
| "epoch": 20.128175250524354, | |
| "grad_norm": 0.3965478241443634, | |
| "learning_rate": 0.0003586744389390848, | |
| "loss": 3.1863, | |
| "step": 69100 | |
| }, | |
| { | |
| "epoch": 20.14274061990212, | |
| "grad_norm": 0.41020357608795166, | |
| "learning_rate": 0.00035849956280967644, | |
| "loss": 3.1812, | |
| "step": 69150 | |
| }, | |
| { | |
| "epoch": 20.157305989279887, | |
| "grad_norm": 0.42909374833106995, | |
| "learning_rate": 0.0003583246866802681, | |
| "loss": 3.2021, | |
| "step": 69200 | |
| }, | |
| { | |
| "epoch": 20.171871358657654, | |
| "grad_norm": 0.38205522298812866, | |
| "learning_rate": 0.00035814981055085976, | |
| "loss": 3.2022, | |
| "step": 69250 | |
| }, | |
| { | |
| "epoch": 20.186436728035424, | |
| "grad_norm": 0.3973395526409149, | |
| "learning_rate": 0.0003579749344214514, | |
| "loss": 3.2015, | |
| "step": 69300 | |
| }, | |
| { | |
| "epoch": 20.20100209741319, | |
| "grad_norm": 0.41039976477622986, | |
| "learning_rate": 0.00035780005829204315, | |
| "loss": 3.1961, | |
| "step": 69350 | |
| }, | |
| { | |
| "epoch": 20.215567466790958, | |
| "grad_norm": 0.3577198088169098, | |
| "learning_rate": 0.0003576251821626348, | |
| "loss": 3.2056, | |
| "step": 69400 | |
| }, | |
| { | |
| "epoch": 20.230132836168725, | |
| "grad_norm": 0.43166354298591614, | |
| "learning_rate": 0.00035745030603322647, | |
| "loss": 3.2002, | |
| "step": 69450 | |
| }, | |
| { | |
| "epoch": 20.24469820554649, | |
| "grad_norm": 0.3968643546104431, | |
| "learning_rate": 0.0003572754299038181, | |
| "loss": 3.2117, | |
| "step": 69500 | |
| }, | |
| { | |
| "epoch": 20.25926357492426, | |
| "grad_norm": 0.3748406171798706, | |
| "learning_rate": 0.00035710055377440974, | |
| "loss": 3.199, | |
| "step": 69550 | |
| }, | |
| { | |
| "epoch": 20.27382894430203, | |
| "grad_norm": 0.41351601481437683, | |
| "learning_rate": 0.00035692567764500143, | |
| "loss": 3.1976, | |
| "step": 69600 | |
| }, | |
| { | |
| "epoch": 20.288394313679795, | |
| "grad_norm": 0.40381181240081787, | |
| "learning_rate": 0.00035675080151559307, | |
| "loss": 3.1988, | |
| "step": 69650 | |
| }, | |
| { | |
| "epoch": 20.302959683057562, | |
| "grad_norm": 0.38123536109924316, | |
| "learning_rate": 0.00035657592538618476, | |
| "loss": 3.2122, | |
| "step": 69700 | |
| }, | |
| { | |
| "epoch": 20.31752505243533, | |
| "grad_norm": 0.40208685398101807, | |
| "learning_rate": 0.0003564010492567764, | |
| "loss": 3.2187, | |
| "step": 69750 | |
| }, | |
| { | |
| "epoch": 20.332090421813096, | |
| "grad_norm": 0.40056926012039185, | |
| "learning_rate": 0.00035622617312736803, | |
| "loss": 3.2165, | |
| "step": 69800 | |
| }, | |
| { | |
| "epoch": 20.346655791190866, | |
| "grad_norm": 0.40889421105384827, | |
| "learning_rate": 0.0003560512969979598, | |
| "loss": 3.2228, | |
| "step": 69850 | |
| }, | |
| { | |
| "epoch": 20.361221160568633, | |
| "grad_norm": 0.3698402941226959, | |
| "learning_rate": 0.0003558764208685514, | |
| "loss": 3.2188, | |
| "step": 69900 | |
| }, | |
| { | |
| "epoch": 20.3757865299464, | |
| "grad_norm": 0.4034403860569, | |
| "learning_rate": 0.0003557015447391431, | |
| "loss": 3.2169, | |
| "step": 69950 | |
| }, | |
| { | |
| "epoch": 20.390351899324166, | |
| "grad_norm": 0.37178969383239746, | |
| "learning_rate": 0.00035552666860973474, | |
| "loss": 3.2213, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 20.390351899324166, | |
| "eval_accuracy": 0.3731590794059675, | |
| "eval_loss": 3.5475502014160156, | |
| "eval_runtime": 179.6766, | |
| "eval_samples_per_second": 92.639, | |
| "eval_steps_per_second": 5.794, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 20.404917268701933, | |
| "grad_norm": 0.40938612818717957, | |
| "learning_rate": 0.00035535179248032643, | |
| "loss": 3.2173, | |
| "step": 70050 | |
| }, | |
| { | |
| "epoch": 20.419482638079703, | |
| "grad_norm": 0.4097348749637604, | |
| "learning_rate": 0.00035517691635091807, | |
| "loss": 3.225, | |
| "step": 70100 | |
| }, | |
| { | |
| "epoch": 20.43404800745747, | |
| "grad_norm": 0.37782126665115356, | |
| "learning_rate": 0.0003550020402215097, | |
| "loss": 3.2252, | |
| "step": 70150 | |
| }, | |
| { | |
| "epoch": 20.448613376835237, | |
| "grad_norm": 0.36499133706092834, | |
| "learning_rate": 0.0003548271640921014, | |
| "loss": 3.2237, | |
| "step": 70200 | |
| }, | |
| { | |
| "epoch": 20.463178746213003, | |
| "grad_norm": 0.39261844754219055, | |
| "learning_rate": 0.00035465228796269303, | |
| "loss": 3.2361, | |
| "step": 70250 | |
| }, | |
| { | |
| "epoch": 20.47774411559077, | |
| "grad_norm": 0.42958515882492065, | |
| "learning_rate": 0.0003544774118332848, | |
| "loss": 3.2253, | |
| "step": 70300 | |
| }, | |
| { | |
| "epoch": 20.49230948496854, | |
| "grad_norm": 0.3969309628009796, | |
| "learning_rate": 0.0003543025357038764, | |
| "loss": 3.2322, | |
| "step": 70350 | |
| }, | |
| { | |
| "epoch": 20.506874854346307, | |
| "grad_norm": 0.37618428468704224, | |
| "learning_rate": 0.00035412765957446805, | |
| "loss": 3.2311, | |
| "step": 70400 | |
| }, | |
| { | |
| "epoch": 20.521440223724074, | |
| "grad_norm": 0.413291871547699, | |
| "learning_rate": 0.00035395278344505974, | |
| "loss": 3.2176, | |
| "step": 70450 | |
| }, | |
| { | |
| "epoch": 20.53600559310184, | |
| "grad_norm": 0.38615167140960693, | |
| "learning_rate": 0.0003537779073156514, | |
| "loss": 3.236, | |
| "step": 70500 | |
| }, | |
| { | |
| "epoch": 20.550570962479608, | |
| "grad_norm": 0.39638757705688477, | |
| "learning_rate": 0.00035360303118624306, | |
| "loss": 3.2306, | |
| "step": 70550 | |
| }, | |
| { | |
| "epoch": 20.565136331857374, | |
| "grad_norm": 0.4069354236125946, | |
| "learning_rate": 0.0003534281550568347, | |
| "loss": 3.228, | |
| "step": 70600 | |
| }, | |
| { | |
| "epoch": 20.579701701235145, | |
| "grad_norm": 0.38820722699165344, | |
| "learning_rate": 0.0003532532789274264, | |
| "loss": 3.2352, | |
| "step": 70650 | |
| }, | |
| { | |
| "epoch": 20.59426707061291, | |
| "grad_norm": 0.47593021392822266, | |
| "learning_rate": 0.000353078402798018, | |
| "loss": 3.2209, | |
| "step": 70700 | |
| }, | |
| { | |
| "epoch": 20.608832439990678, | |
| "grad_norm": 0.38806480169296265, | |
| "learning_rate": 0.00035290352666860966, | |
| "loss": 3.2287, | |
| "step": 70750 | |
| }, | |
| { | |
| "epoch": 20.623397809368445, | |
| "grad_norm": 0.399444580078125, | |
| "learning_rate": 0.0003527286505392014, | |
| "loss": 3.2371, | |
| "step": 70800 | |
| }, | |
| { | |
| "epoch": 20.63796317874621, | |
| "grad_norm": 0.3905348479747772, | |
| "learning_rate": 0.00035255377440979304, | |
| "loss": 3.2353, | |
| "step": 70850 | |
| }, | |
| { | |
| "epoch": 20.652528548123982, | |
| "grad_norm": 0.42255914211273193, | |
| "learning_rate": 0.00035237889828038473, | |
| "loss": 3.2346, | |
| "step": 70900 | |
| }, | |
| { | |
| "epoch": 20.66709391750175, | |
| "grad_norm": 0.40701016783714294, | |
| "learning_rate": 0.00035220402215097637, | |
| "loss": 3.2384, | |
| "step": 70950 | |
| }, | |
| { | |
| "epoch": 20.681659286879515, | |
| "grad_norm": 0.39621832966804504, | |
| "learning_rate": 0.000352029146021568, | |
| "loss": 3.2306, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 20.681659286879515, | |
| "eval_accuracy": 0.3734405411967064, | |
| "eval_loss": 3.54192852973938, | |
| "eval_runtime": 179.7033, | |
| "eval_samples_per_second": 92.625, | |
| "eval_steps_per_second": 5.793, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 20.696224656257282, | |
| "grad_norm": 0.43927934765815735, | |
| "learning_rate": 0.0003518542698921597, | |
| "loss": 3.2395, | |
| "step": 71050 | |
| }, | |
| { | |
| "epoch": 20.71079002563505, | |
| "grad_norm": 0.4314327538013458, | |
| "learning_rate": 0.00035167939376275133, | |
| "loss": 3.2466, | |
| "step": 71100 | |
| }, | |
| { | |
| "epoch": 20.72535539501282, | |
| "grad_norm": 0.4066350758075714, | |
| "learning_rate": 0.000351504517633343, | |
| "loss": 3.2542, | |
| "step": 71150 | |
| }, | |
| { | |
| "epoch": 20.739920764390586, | |
| "grad_norm": 0.3890044093132019, | |
| "learning_rate": 0.00035132964150393466, | |
| "loss": 3.2435, | |
| "step": 71200 | |
| }, | |
| { | |
| "epoch": 20.754486133768353, | |
| "grad_norm": 0.38141369819641113, | |
| "learning_rate": 0.0003511547653745263, | |
| "loss": 3.2417, | |
| "step": 71250 | |
| }, | |
| { | |
| "epoch": 20.76905150314612, | |
| "grad_norm": 0.3846435546875, | |
| "learning_rate": 0.00035097988924511804, | |
| "loss": 3.237, | |
| "step": 71300 | |
| }, | |
| { | |
| "epoch": 20.783616872523886, | |
| "grad_norm": 0.3892623782157898, | |
| "learning_rate": 0.0003508050131157097, | |
| "loss": 3.2461, | |
| "step": 71350 | |
| }, | |
| { | |
| "epoch": 20.798182241901653, | |
| "grad_norm": 0.39577746391296387, | |
| "learning_rate": 0.00035063013698630137, | |
| "loss": 3.2561, | |
| "step": 71400 | |
| }, | |
| { | |
| "epoch": 20.812747611279423, | |
| "grad_norm": 0.43198904395103455, | |
| "learning_rate": 0.000350455260856893, | |
| "loss": 3.2542, | |
| "step": 71450 | |
| }, | |
| { | |
| "epoch": 20.82731298065719, | |
| "grad_norm": 0.3951154947280884, | |
| "learning_rate": 0.0003502803847274847, | |
| "loss": 3.2507, | |
| "step": 71500 | |
| }, | |
| { | |
| "epoch": 20.841878350034957, | |
| "grad_norm": 0.3816114068031311, | |
| "learning_rate": 0.00035010550859807633, | |
| "loss": 3.2455, | |
| "step": 71550 | |
| }, | |
| { | |
| "epoch": 20.856443719412724, | |
| "grad_norm": 0.4048319160938263, | |
| "learning_rate": 0.00034993063246866797, | |
| "loss": 3.2574, | |
| "step": 71600 | |
| }, | |
| { | |
| "epoch": 20.87100908879049, | |
| "grad_norm": 0.41361382603645325, | |
| "learning_rate": 0.00034975575633925966, | |
| "loss": 3.2607, | |
| "step": 71650 | |
| }, | |
| { | |
| "epoch": 20.88557445816826, | |
| "grad_norm": 0.3653562068939209, | |
| "learning_rate": 0.0003495808802098513, | |
| "loss": 3.2585, | |
| "step": 71700 | |
| }, | |
| { | |
| "epoch": 20.900139827546028, | |
| "grad_norm": 0.3662196099758148, | |
| "learning_rate": 0.00034940600408044304, | |
| "loss": 3.2576, | |
| "step": 71750 | |
| }, | |
| { | |
| "epoch": 20.914705196923794, | |
| "grad_norm": 0.4024188220500946, | |
| "learning_rate": 0.0003492311279510347, | |
| "loss": 3.2611, | |
| "step": 71800 | |
| }, | |
| { | |
| "epoch": 20.92927056630156, | |
| "grad_norm": 0.4076612889766693, | |
| "learning_rate": 0.0003490562518216263, | |
| "loss": 3.2563, | |
| "step": 71850 | |
| }, | |
| { | |
| "epoch": 20.943835935679328, | |
| "grad_norm": 0.3958513140678406, | |
| "learning_rate": 0.000348881375692218, | |
| "loss": 3.2524, | |
| "step": 71900 | |
| }, | |
| { | |
| "epoch": 20.958401305057095, | |
| "grad_norm": 0.40390610694885254, | |
| "learning_rate": 0.00034870649956280964, | |
| "loss": 3.2448, | |
| "step": 71950 | |
| }, | |
| { | |
| "epoch": 20.972966674434865, | |
| "grad_norm": 0.41278237104415894, | |
| "learning_rate": 0.0003485316234334013, | |
| "loss": 3.259, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 20.972966674434865, | |
| "eval_accuracy": 0.3740693037935618, | |
| "eval_loss": 3.5322375297546387, | |
| "eval_runtime": 179.5665, | |
| "eval_samples_per_second": 92.695, | |
| "eval_steps_per_second": 5.797, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 20.98753204381263, | |
| "grad_norm": 0.3775636553764343, | |
| "learning_rate": 0.00034835674730399296, | |
| "loss": 3.262, | |
| "step": 72050 | |
| }, | |
| { | |
| "epoch": 21.002039151712886, | |
| "grad_norm": 0.4008495509624481, | |
| "learning_rate": 0.00034818187117458465, | |
| "loss": 3.2479, | |
| "step": 72100 | |
| }, | |
| { | |
| "epoch": 21.016604521090656, | |
| "grad_norm": 0.38264980912208557, | |
| "learning_rate": 0.0003480069950451763, | |
| "loss": 3.1549, | |
| "step": 72150 | |
| }, | |
| { | |
| "epoch": 21.031169890468423, | |
| "grad_norm": 0.40952587127685547, | |
| "learning_rate": 0.0003478321189157679, | |
| "loss": 3.1519, | |
| "step": 72200 | |
| }, | |
| { | |
| "epoch": 21.04573525984619, | |
| "grad_norm": 0.40433329343795776, | |
| "learning_rate": 0.00034765724278635967, | |
| "loss": 3.159, | |
| "step": 72250 | |
| }, | |
| { | |
| "epoch": 21.060300629223956, | |
| "grad_norm": 0.40058109164237976, | |
| "learning_rate": 0.0003474823666569513, | |
| "loss": 3.155, | |
| "step": 72300 | |
| }, | |
| { | |
| "epoch": 21.074865998601723, | |
| "grad_norm": 0.39246848225593567, | |
| "learning_rate": 0.000347307490527543, | |
| "loss": 3.1671, | |
| "step": 72350 | |
| }, | |
| { | |
| "epoch": 21.089431367979493, | |
| "grad_norm": 0.40194451808929443, | |
| "learning_rate": 0.00034713261439813463, | |
| "loss": 3.1781, | |
| "step": 72400 | |
| }, | |
| { | |
| "epoch": 21.10399673735726, | |
| "grad_norm": 0.3998311161994934, | |
| "learning_rate": 0.00034695773826872627, | |
| "loss": 3.1801, | |
| "step": 72450 | |
| }, | |
| { | |
| "epoch": 21.118562106735027, | |
| "grad_norm": 0.3767092525959015, | |
| "learning_rate": 0.00034678286213931796, | |
| "loss": 3.1863, | |
| "step": 72500 | |
| }, | |
| { | |
| "epoch": 21.133127476112794, | |
| "grad_norm": 0.3758101761341095, | |
| "learning_rate": 0.0003466079860099096, | |
| "loss": 3.1936, | |
| "step": 72550 | |
| }, | |
| { | |
| "epoch": 21.14769284549056, | |
| "grad_norm": 0.37861061096191406, | |
| "learning_rate": 0.0003464331098805013, | |
| "loss": 3.1772, | |
| "step": 72600 | |
| }, | |
| { | |
| "epoch": 21.16225821486833, | |
| "grad_norm": 0.39717918634414673, | |
| "learning_rate": 0.0003462582337510929, | |
| "loss": 3.1924, | |
| "step": 72650 | |
| }, | |
| { | |
| "epoch": 21.176823584246097, | |
| "grad_norm": 0.37728017568588257, | |
| "learning_rate": 0.00034608335762168467, | |
| "loss": 3.1804, | |
| "step": 72700 | |
| }, | |
| { | |
| "epoch": 21.191388953623864, | |
| "grad_norm": 0.3761899769306183, | |
| "learning_rate": 0.0003459084814922763, | |
| "loss": 3.1765, | |
| "step": 72750 | |
| }, | |
| { | |
| "epoch": 21.20595432300163, | |
| "grad_norm": 0.3872700333595276, | |
| "learning_rate": 0.00034573360536286794, | |
| "loss": 3.1829, | |
| "step": 72800 | |
| }, | |
| { | |
| "epoch": 21.220519692379398, | |
| "grad_norm": 0.4359159469604492, | |
| "learning_rate": 0.00034555872923345963, | |
| "loss": 3.184, | |
| "step": 72850 | |
| }, | |
| { | |
| "epoch": 21.235085061757164, | |
| "grad_norm": 0.42497533559799194, | |
| "learning_rate": 0.00034538385310405127, | |
| "loss": 3.1963, | |
| "step": 72900 | |
| }, | |
| { | |
| "epoch": 21.249650431134935, | |
| "grad_norm": 0.41894423961639404, | |
| "learning_rate": 0.00034520897697464296, | |
| "loss": 3.1796, | |
| "step": 72950 | |
| }, | |
| { | |
| "epoch": 21.2642158005127, | |
| "grad_norm": 0.5145085453987122, | |
| "learning_rate": 0.0003450341008452346, | |
| "loss": 3.2027, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 21.2642158005127, | |
| "eval_accuracy": 0.3728435224108366, | |
| "eval_loss": 3.552633047103882, | |
| "eval_runtime": 179.6553, | |
| "eval_samples_per_second": 92.65, | |
| "eval_steps_per_second": 5.794, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 21.27878116989047, | |
| "grad_norm": 0.4553317427635193, | |
| "learning_rate": 0.00034485922471582623, | |
| "loss": 3.2022, | |
| "step": 73050 | |
| }, | |
| { | |
| "epoch": 21.293346539268235, | |
| "grad_norm": 0.3895247280597687, | |
| "learning_rate": 0.0003446843485864179, | |
| "loss": 3.2031, | |
| "step": 73100 | |
| }, | |
| { | |
| "epoch": 21.307911908646002, | |
| "grad_norm": 0.4008367955684662, | |
| "learning_rate": 0.00034450947245700955, | |
| "loss": 3.1877, | |
| "step": 73150 | |
| }, | |
| { | |
| "epoch": 21.322477278023772, | |
| "grad_norm": 0.39398273825645447, | |
| "learning_rate": 0.0003443345963276013, | |
| "loss": 3.1962, | |
| "step": 73200 | |
| }, | |
| { | |
| "epoch": 21.33704264740154, | |
| "grad_norm": 0.42520371079444885, | |
| "learning_rate": 0.00034415972019819294, | |
| "loss": 3.2137, | |
| "step": 73250 | |
| }, | |
| { | |
| "epoch": 21.351608016779306, | |
| "grad_norm": 0.3961343467235565, | |
| "learning_rate": 0.00034398484406878457, | |
| "loss": 3.2061, | |
| "step": 73300 | |
| }, | |
| { | |
| "epoch": 21.366173386157072, | |
| "grad_norm": 0.40649378299713135, | |
| "learning_rate": 0.00034380996793937626, | |
| "loss": 3.2057, | |
| "step": 73350 | |
| }, | |
| { | |
| "epoch": 21.38073875553484, | |
| "grad_norm": 0.40000006556510925, | |
| "learning_rate": 0.0003436350918099679, | |
| "loss": 3.2122, | |
| "step": 73400 | |
| }, | |
| { | |
| "epoch": 21.39530412491261, | |
| "grad_norm": 0.386966347694397, | |
| "learning_rate": 0.0003434602156805596, | |
| "loss": 3.2013, | |
| "step": 73450 | |
| }, | |
| { | |
| "epoch": 21.409869494290376, | |
| "grad_norm": 0.3852634131908417, | |
| "learning_rate": 0.0003432853395511512, | |
| "loss": 3.2136, | |
| "step": 73500 | |
| }, | |
| { | |
| "epoch": 21.424434863668143, | |
| "grad_norm": 0.4111328721046448, | |
| "learning_rate": 0.0003431104634217429, | |
| "loss": 3.2111, | |
| "step": 73550 | |
| }, | |
| { | |
| "epoch": 21.43900023304591, | |
| "grad_norm": 0.36920034885406494, | |
| "learning_rate": 0.00034293558729233455, | |
| "loss": 3.2099, | |
| "step": 73600 | |
| }, | |
| { | |
| "epoch": 21.453565602423676, | |
| "grad_norm": 0.3971029818058014, | |
| "learning_rate": 0.0003427607111629262, | |
| "loss": 3.2053, | |
| "step": 73650 | |
| }, | |
| { | |
| "epoch": 21.468130971801443, | |
| "grad_norm": 0.38907453417778015, | |
| "learning_rate": 0.00034258583503351793, | |
| "loss": 3.2086, | |
| "step": 73700 | |
| }, | |
| { | |
| "epoch": 21.482696341179214, | |
| "grad_norm": 0.4156521260738373, | |
| "learning_rate": 0.00034241095890410957, | |
| "loss": 3.2186, | |
| "step": 73750 | |
| }, | |
| { | |
| "epoch": 21.49726171055698, | |
| "grad_norm": 0.37278106808662415, | |
| "learning_rate": 0.00034223608277470126, | |
| "loss": 3.2098, | |
| "step": 73800 | |
| }, | |
| { | |
| "epoch": 21.511827079934747, | |
| "grad_norm": 0.39464330673217773, | |
| "learning_rate": 0.0003420612066452929, | |
| "loss": 3.2295, | |
| "step": 73850 | |
| }, | |
| { | |
| "epoch": 21.526392449312514, | |
| "grad_norm": 0.4381686747074127, | |
| "learning_rate": 0.00034188633051588453, | |
| "loss": 3.2146, | |
| "step": 73900 | |
| }, | |
| { | |
| "epoch": 21.54095781869028, | |
| "grad_norm": 0.3839596211910248, | |
| "learning_rate": 0.0003417114543864762, | |
| "loss": 3.2248, | |
| "step": 73950 | |
| }, | |
| { | |
| "epoch": 21.55552318806805, | |
| "grad_norm": 0.4019978940486908, | |
| "learning_rate": 0.00034153657825706786, | |
| "loss": 3.2301, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 21.55552318806805, | |
| "eval_accuracy": 0.3734910961549427, | |
| "eval_loss": 3.544201612472534, | |
| "eval_runtime": 179.6317, | |
| "eval_samples_per_second": 92.662, | |
| "eval_steps_per_second": 5.795, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 21.570088557445818, | |
| "grad_norm": 0.3982439935207367, | |
| "learning_rate": 0.00034136170212765955, | |
| "loss": 3.2257, | |
| "step": 74050 | |
| }, | |
| { | |
| "epoch": 21.584653926823584, | |
| "grad_norm": 0.40451157093048096, | |
| "learning_rate": 0.0003411868259982512, | |
| "loss": 3.223, | |
| "step": 74100 | |
| }, | |
| { | |
| "epoch": 21.59921929620135, | |
| "grad_norm": 0.40273866057395935, | |
| "learning_rate": 0.00034101194986884293, | |
| "loss": 3.2108, | |
| "step": 74150 | |
| }, | |
| { | |
| "epoch": 21.613784665579118, | |
| "grad_norm": 0.4382562041282654, | |
| "learning_rate": 0.00034083707373943456, | |
| "loss": 3.217, | |
| "step": 74200 | |
| }, | |
| { | |
| "epoch": 21.62835003495689, | |
| "grad_norm": 0.4187087118625641, | |
| "learning_rate": 0.0003406621976100262, | |
| "loss": 3.225, | |
| "step": 74250 | |
| }, | |
| { | |
| "epoch": 21.642915404334655, | |
| "grad_norm": 0.42689085006713867, | |
| "learning_rate": 0.0003404873214806179, | |
| "loss": 3.2501, | |
| "step": 74300 | |
| }, | |
| { | |
| "epoch": 21.65748077371242, | |
| "grad_norm": 0.4216347336769104, | |
| "learning_rate": 0.00034031244535120953, | |
| "loss": 3.2472, | |
| "step": 74350 | |
| }, | |
| { | |
| "epoch": 21.67204614309019, | |
| "grad_norm": 0.36499667167663574, | |
| "learning_rate": 0.0003401375692218012, | |
| "loss": 3.228, | |
| "step": 74400 | |
| }, | |
| { | |
| "epoch": 21.686611512467955, | |
| "grad_norm": 0.38649997115135193, | |
| "learning_rate": 0.00033996269309239285, | |
| "loss": 3.2288, | |
| "step": 74450 | |
| }, | |
| { | |
| "epoch": 21.701176881845722, | |
| "grad_norm": 0.41683053970336914, | |
| "learning_rate": 0.0003397878169629845, | |
| "loss": 3.2331, | |
| "step": 74500 | |
| }, | |
| { | |
| "epoch": 21.715742251223492, | |
| "grad_norm": 0.3885643780231476, | |
| "learning_rate": 0.0003396129408335762, | |
| "loss": 3.2369, | |
| "step": 74550 | |
| }, | |
| { | |
| "epoch": 21.73030762060126, | |
| "grad_norm": 0.37198543548583984, | |
| "learning_rate": 0.0003394380647041678, | |
| "loss": 3.2315, | |
| "step": 74600 | |
| }, | |
| { | |
| "epoch": 21.744872989979026, | |
| "grad_norm": 0.3774627149105072, | |
| "learning_rate": 0.00033926318857475956, | |
| "loss": 3.2361, | |
| "step": 74650 | |
| }, | |
| { | |
| "epoch": 21.759438359356793, | |
| "grad_norm": 0.3771449327468872, | |
| "learning_rate": 0.0003390883124453512, | |
| "loss": 3.2282, | |
| "step": 74700 | |
| }, | |
| { | |
| "epoch": 21.77400372873456, | |
| "grad_norm": 0.4178008437156677, | |
| "learning_rate": 0.0003389134363159429, | |
| "loss": 3.2358, | |
| "step": 74750 | |
| }, | |
| { | |
| "epoch": 21.78856909811233, | |
| "grad_norm": 0.3946438133716583, | |
| "learning_rate": 0.0003387385601865345, | |
| "loss": 3.2344, | |
| "step": 74800 | |
| }, | |
| { | |
| "epoch": 21.803134467490096, | |
| "grad_norm": 0.39481014013290405, | |
| "learning_rate": 0.00033856368405712616, | |
| "loss": 3.2308, | |
| "step": 74850 | |
| }, | |
| { | |
| "epoch": 21.817699836867863, | |
| "grad_norm": 0.39908483624458313, | |
| "learning_rate": 0.00033838880792771785, | |
| "loss": 3.2348, | |
| "step": 74900 | |
| }, | |
| { | |
| "epoch": 21.83226520624563, | |
| "grad_norm": 0.37811824679374695, | |
| "learning_rate": 0.0003382139317983095, | |
| "loss": 3.2501, | |
| "step": 74950 | |
| }, | |
| { | |
| "epoch": 21.846830575623397, | |
| "grad_norm": 0.4090801775455475, | |
| "learning_rate": 0.0003380390556689012, | |
| "loss": 3.2353, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 21.846830575623397, | |
| "eval_accuracy": 0.37407236060499, | |
| "eval_loss": 3.5360472202301025, | |
| "eval_runtime": 180.4467, | |
| "eval_samples_per_second": 92.243, | |
| "eval_steps_per_second": 5.769, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 21.861395945001163, | |
| "grad_norm": 0.37465283274650574, | |
| "learning_rate": 0.0003378641795394928, | |
| "loss": 3.238, | |
| "step": 75050 | |
| }, | |
| { | |
| "epoch": 21.875961314378934, | |
| "grad_norm": 0.39519503712654114, | |
| "learning_rate": 0.00033768930341008445, | |
| "loss": 3.233, | |
| "step": 75100 | |
| }, | |
| { | |
| "epoch": 21.8905266837567, | |
| "grad_norm": 0.41474273800849915, | |
| "learning_rate": 0.0003375144272806762, | |
| "loss": 3.2464, | |
| "step": 75150 | |
| }, | |
| { | |
| "epoch": 21.905092053134467, | |
| "grad_norm": 0.42501193284988403, | |
| "learning_rate": 0.00033733955115126783, | |
| "loss": 3.2312, | |
| "step": 75200 | |
| }, | |
| { | |
| "epoch": 21.919657422512234, | |
| "grad_norm": 0.3937487006187439, | |
| "learning_rate": 0.0003371646750218595, | |
| "loss": 3.2464, | |
| "step": 75250 | |
| }, | |
| { | |
| "epoch": 21.93422279189, | |
| "grad_norm": 0.38560178875923157, | |
| "learning_rate": 0.00033698979889245116, | |
| "loss": 3.2429, | |
| "step": 75300 | |
| }, | |
| { | |
| "epoch": 21.94878816126777, | |
| "grad_norm": 0.40010857582092285, | |
| "learning_rate": 0.0003368149227630428, | |
| "loss": 3.2451, | |
| "step": 75350 | |
| }, | |
| { | |
| "epoch": 21.963353530645538, | |
| "grad_norm": 0.3742867410182953, | |
| "learning_rate": 0.0003366400466336345, | |
| "loss": 3.2484, | |
| "step": 75400 | |
| }, | |
| { | |
| "epoch": 21.977918900023305, | |
| "grad_norm": 0.3764432370662689, | |
| "learning_rate": 0.0003364651705042261, | |
| "loss": 3.2551, | |
| "step": 75450 | |
| }, | |
| { | |
| "epoch": 21.99248426940107, | |
| "grad_norm": 0.39993754029273987, | |
| "learning_rate": 0.0003362902943748178, | |
| "loss": 3.2462, | |
| "step": 75500 | |
| }, | |
| { | |
| "epoch": 22.00699137730133, | |
| "grad_norm": 0.415519654750824, | |
| "learning_rate": 0.00033611541824540945, | |
| "loss": 3.1819, | |
| "step": 75550 | |
| }, | |
| { | |
| "epoch": 22.021556746679096, | |
| "grad_norm": 0.4040631651878357, | |
| "learning_rate": 0.0003359405421160012, | |
| "loss": 3.1398, | |
| "step": 75600 | |
| }, | |
| { | |
| "epoch": 22.036122116056863, | |
| "grad_norm": 0.3827342689037323, | |
| "learning_rate": 0.0003357656659865928, | |
| "loss": 3.1443, | |
| "step": 75650 | |
| }, | |
| { | |
| "epoch": 22.05068748543463, | |
| "grad_norm": 0.42973458766937256, | |
| "learning_rate": 0.00033559078985718446, | |
| "loss": 3.1524, | |
| "step": 75700 | |
| }, | |
| { | |
| "epoch": 22.0652528548124, | |
| "grad_norm": 0.3999570906162262, | |
| "learning_rate": 0.00033541591372777615, | |
| "loss": 3.1535, | |
| "step": 75750 | |
| }, | |
| { | |
| "epoch": 22.079818224190166, | |
| "grad_norm": 0.4020233750343323, | |
| "learning_rate": 0.0003352410375983678, | |
| "loss": 3.1444, | |
| "step": 75800 | |
| }, | |
| { | |
| "epoch": 22.094383593567933, | |
| "grad_norm": 0.39696866273880005, | |
| "learning_rate": 0.0003350661614689595, | |
| "loss": 3.1719, | |
| "step": 75850 | |
| }, | |
| { | |
| "epoch": 22.1089489629457, | |
| "grad_norm": 0.3695342242717743, | |
| "learning_rate": 0.0003348912853395511, | |
| "loss": 3.1644, | |
| "step": 75900 | |
| }, | |
| { | |
| "epoch": 22.123514332323467, | |
| "grad_norm": 0.40176528692245483, | |
| "learning_rate": 0.00033471640921014275, | |
| "loss": 3.1743, | |
| "step": 75950 | |
| }, | |
| { | |
| "epoch": 22.138079701701233, | |
| "grad_norm": 0.39200496673583984, | |
| "learning_rate": 0.00033454153308073444, | |
| "loss": 3.1793, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 22.138079701701233, | |
| "eval_accuracy": 0.37305079773960553, | |
| "eval_loss": 3.5540237426757812, | |
| "eval_runtime": 180.2832, | |
| "eval_samples_per_second": 92.327, | |
| "eval_steps_per_second": 5.774, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 22.152645071079004, | |
| "grad_norm": 0.39109188318252563, | |
| "learning_rate": 0.0003343666569513261, | |
| "loss": 3.1723, | |
| "step": 76050 | |
| }, | |
| { | |
| "epoch": 22.16721044045677, | |
| "grad_norm": 0.4028526544570923, | |
| "learning_rate": 0.0003341917808219178, | |
| "loss": 3.1775, | |
| "step": 76100 | |
| }, | |
| { | |
| "epoch": 22.181775809834537, | |
| "grad_norm": 0.4098780155181885, | |
| "learning_rate": 0.00033401690469250946, | |
| "loss": 3.1614, | |
| "step": 76150 | |
| }, | |
| { | |
| "epoch": 22.196341179212304, | |
| "grad_norm": 0.4064423441886902, | |
| "learning_rate": 0.00033384202856310115, | |
| "loss": 3.1635, | |
| "step": 76200 | |
| }, | |
| { | |
| "epoch": 22.21090654859007, | |
| "grad_norm": 0.3825031816959381, | |
| "learning_rate": 0.0003336671524336928, | |
| "loss": 3.1861, | |
| "step": 76250 | |
| }, | |
| { | |
| "epoch": 22.22547191796784, | |
| "grad_norm": 0.41401293873786926, | |
| "learning_rate": 0.0003334922763042844, | |
| "loss": 3.183, | |
| "step": 76300 | |
| }, | |
| { | |
| "epoch": 22.240037287345608, | |
| "grad_norm": 0.404306560754776, | |
| "learning_rate": 0.0003333174001748761, | |
| "loss": 3.1818, | |
| "step": 76350 | |
| }, | |
| { | |
| "epoch": 22.254602656723375, | |
| "grad_norm": 0.43731221556663513, | |
| "learning_rate": 0.00033314252404546775, | |
| "loss": 3.1921, | |
| "step": 76400 | |
| }, | |
| { | |
| "epoch": 22.26916802610114, | |
| "grad_norm": 0.42374587059020996, | |
| "learning_rate": 0.00033296764791605944, | |
| "loss": 3.1832, | |
| "step": 76450 | |
| }, | |
| { | |
| "epoch": 22.283733395478908, | |
| "grad_norm": 0.4058934450149536, | |
| "learning_rate": 0.0003327927717866511, | |
| "loss": 3.1896, | |
| "step": 76500 | |
| }, | |
| { | |
| "epoch": 22.29829876485668, | |
| "grad_norm": 0.4124061167240143, | |
| "learning_rate": 0.0003326178956572427, | |
| "loss": 3.1953, | |
| "step": 76550 | |
| }, | |
| { | |
| "epoch": 22.312864134234445, | |
| "grad_norm": 0.37550002336502075, | |
| "learning_rate": 0.00033244301952783446, | |
| "loss": 3.1904, | |
| "step": 76600 | |
| }, | |
| { | |
| "epoch": 22.327429503612212, | |
| "grad_norm": 0.38566455245018005, | |
| "learning_rate": 0.0003322681433984261, | |
| "loss": 3.1946, | |
| "step": 76650 | |
| }, | |
| { | |
| "epoch": 22.34199487298998, | |
| "grad_norm": 0.41883614659309387, | |
| "learning_rate": 0.0003320932672690178, | |
| "loss": 3.1925, | |
| "step": 76700 | |
| }, | |
| { | |
| "epoch": 22.356560242367745, | |
| "grad_norm": 0.4119344651699066, | |
| "learning_rate": 0.0003319183911396094, | |
| "loss": 3.177, | |
| "step": 76750 | |
| }, | |
| { | |
| "epoch": 22.371125611745512, | |
| "grad_norm": 0.4048214852809906, | |
| "learning_rate": 0.0003317435150102011, | |
| "loss": 3.2043, | |
| "step": 76800 | |
| }, | |
| { | |
| "epoch": 22.385690981123282, | |
| "grad_norm": 0.43698737025260925, | |
| "learning_rate": 0.00033156863888079275, | |
| "loss": 3.1895, | |
| "step": 76850 | |
| }, | |
| { | |
| "epoch": 22.40025635050105, | |
| "grad_norm": 0.40300291776657104, | |
| "learning_rate": 0.0003313937627513844, | |
| "loss": 3.2016, | |
| "step": 76900 | |
| }, | |
| { | |
| "epoch": 22.414821719878816, | |
| "grad_norm": 0.39613571763038635, | |
| "learning_rate": 0.00033121888662197607, | |
| "loss": 3.1993, | |
| "step": 76950 | |
| }, | |
| { | |
| "epoch": 22.429387089256583, | |
| "grad_norm": 0.4303956627845764, | |
| "learning_rate": 0.0003310440104925677, | |
| "loss": 3.2118, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 22.429387089256583, | |
| "eval_accuracy": 0.3733571843004516, | |
| "eval_loss": 3.5473194122314453, | |
| "eval_runtime": 180.1499, | |
| "eval_samples_per_second": 92.395, | |
| "eval_steps_per_second": 5.779, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 22.44395245863435, | |
| "grad_norm": 0.40987488627433777, | |
| "learning_rate": 0.00033086913436315945, | |
| "loss": 3.2011, | |
| "step": 77050 | |
| }, | |
| { | |
| "epoch": 22.45851782801212, | |
| "grad_norm": 0.5552119612693787, | |
| "learning_rate": 0.0003306942582337511, | |
| "loss": 3.2065, | |
| "step": 77100 | |
| }, | |
| { | |
| "epoch": 22.473083197389887, | |
| "grad_norm": 0.4037717878818512, | |
| "learning_rate": 0.0003305193821043427, | |
| "loss": 3.1915, | |
| "step": 77150 | |
| }, | |
| { | |
| "epoch": 22.487648566767653, | |
| "grad_norm": 0.39489609003067017, | |
| "learning_rate": 0.0003303445059749344, | |
| "loss": 3.2056, | |
| "step": 77200 | |
| }, | |
| { | |
| "epoch": 22.50221393614542, | |
| "grad_norm": 0.4151366353034973, | |
| "learning_rate": 0.00033016962984552605, | |
| "loss": 3.2079, | |
| "step": 77250 | |
| }, | |
| { | |
| "epoch": 22.516779305523187, | |
| "grad_norm": 0.3901776373386383, | |
| "learning_rate": 0.00032999475371611774, | |
| "loss": 3.2011, | |
| "step": 77300 | |
| }, | |
| { | |
| "epoch": 22.531344674900957, | |
| "grad_norm": 0.39502573013305664, | |
| "learning_rate": 0.0003298198775867094, | |
| "loss": 3.2107, | |
| "step": 77350 | |
| }, | |
| { | |
| "epoch": 22.545910044278724, | |
| "grad_norm": 0.4188506305217743, | |
| "learning_rate": 0.000329645001457301, | |
| "loss": 3.2172, | |
| "step": 77400 | |
| }, | |
| { | |
| "epoch": 22.56047541365649, | |
| "grad_norm": 0.40013113617897034, | |
| "learning_rate": 0.0003294701253278927, | |
| "loss": 3.2247, | |
| "step": 77450 | |
| }, | |
| { | |
| "epoch": 22.575040783034257, | |
| "grad_norm": 0.42045021057128906, | |
| "learning_rate": 0.00032929524919848434, | |
| "loss": 3.2161, | |
| "step": 77500 | |
| }, | |
| { | |
| "epoch": 22.589606152412024, | |
| "grad_norm": 0.44547539949417114, | |
| "learning_rate": 0.0003291203730690761, | |
| "loss": 3.2248, | |
| "step": 77550 | |
| }, | |
| { | |
| "epoch": 22.60417152178979, | |
| "grad_norm": 0.4550730884075165, | |
| "learning_rate": 0.0003289454969396677, | |
| "loss": 3.219, | |
| "step": 77600 | |
| }, | |
| { | |
| "epoch": 22.61873689116756, | |
| "grad_norm": 0.40038976073265076, | |
| "learning_rate": 0.0003287706208102594, | |
| "loss": 3.2128, | |
| "step": 77650 | |
| }, | |
| { | |
| "epoch": 22.633302260545328, | |
| "grad_norm": 0.4054413139820099, | |
| "learning_rate": 0.00032859574468085105, | |
| "loss": 3.2153, | |
| "step": 77700 | |
| }, | |
| { | |
| "epoch": 22.647867629923095, | |
| "grad_norm": 0.39146846532821655, | |
| "learning_rate": 0.0003284208685514427, | |
| "loss": 3.2212, | |
| "step": 77750 | |
| }, | |
| { | |
| "epoch": 22.66243299930086, | |
| "grad_norm": 0.402034193277359, | |
| "learning_rate": 0.0003282459924220344, | |
| "loss": 3.2227, | |
| "step": 77800 | |
| }, | |
| { | |
| "epoch": 22.67699836867863, | |
| "grad_norm": 0.38476046919822693, | |
| "learning_rate": 0.000328071116292626, | |
| "loss": 3.2133, | |
| "step": 77850 | |
| }, | |
| { | |
| "epoch": 22.6915637380564, | |
| "grad_norm": 0.412173867225647, | |
| "learning_rate": 0.0003278962401632177, | |
| "loss": 3.2176, | |
| "step": 77900 | |
| }, | |
| { | |
| "epoch": 22.706129107434165, | |
| "grad_norm": 0.3986635208129883, | |
| "learning_rate": 0.00032772136403380934, | |
| "loss": 3.2216, | |
| "step": 77950 | |
| }, | |
| { | |
| "epoch": 22.720694476811932, | |
| "grad_norm": 0.4033834934234619, | |
| "learning_rate": 0.000327546487904401, | |
| "loss": 3.2325, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 22.720694476811932, | |
| "eval_accuracy": 0.37393880145950986, | |
| "eval_loss": 3.539693832397461, | |
| "eval_runtime": 180.3316, | |
| "eval_samples_per_second": 92.302, | |
| "eval_steps_per_second": 5.773, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 22.7352598461897, | |
| "grad_norm": 0.3904629945755005, | |
| "learning_rate": 0.0003273716117749927, | |
| "loss": 3.2313, | |
| "step": 78050 | |
| }, | |
| { | |
| "epoch": 22.749825215567466, | |
| "grad_norm": 0.4374755322933197, | |
| "learning_rate": 0.00032719673564558435, | |
| "loss": 3.2222, | |
| "step": 78100 | |
| }, | |
| { | |
| "epoch": 22.764390584945236, | |
| "grad_norm": 0.3812626302242279, | |
| "learning_rate": 0.00032702185951617605, | |
| "loss": 3.2216, | |
| "step": 78150 | |
| }, | |
| { | |
| "epoch": 22.778955954323003, | |
| "grad_norm": 0.41498416662216187, | |
| "learning_rate": 0.0003268469833867677, | |
| "loss": 3.2297, | |
| "step": 78200 | |
| }, | |
| { | |
| "epoch": 22.79352132370077, | |
| "grad_norm": 0.37831243872642517, | |
| "learning_rate": 0.00032667210725735937, | |
| "loss": 3.2222, | |
| "step": 78250 | |
| }, | |
| { | |
| "epoch": 22.808086693078536, | |
| "grad_norm": 0.4045485258102417, | |
| "learning_rate": 0.000326497231127951, | |
| "loss": 3.2285, | |
| "step": 78300 | |
| }, | |
| { | |
| "epoch": 22.822652062456303, | |
| "grad_norm": 0.4422939717769623, | |
| "learning_rate": 0.00032632235499854264, | |
| "loss": 3.2316, | |
| "step": 78350 | |
| }, | |
| { | |
| "epoch": 22.83721743183407, | |
| "grad_norm": 0.42193302512168884, | |
| "learning_rate": 0.00032614747886913433, | |
| "loss": 3.2325, | |
| "step": 78400 | |
| }, | |
| { | |
| "epoch": 22.85178280121184, | |
| "grad_norm": 0.40983685851097107, | |
| "learning_rate": 0.00032597260273972597, | |
| "loss": 3.2343, | |
| "step": 78450 | |
| }, | |
| { | |
| "epoch": 22.866348170589607, | |
| "grad_norm": 0.40650826692581177, | |
| "learning_rate": 0.0003257977266103177, | |
| "loss": 3.2459, | |
| "step": 78500 | |
| }, | |
| { | |
| "epoch": 22.880913539967374, | |
| "grad_norm": 0.4187788963317871, | |
| "learning_rate": 0.00032562285048090935, | |
| "loss": 3.2262, | |
| "step": 78550 | |
| }, | |
| { | |
| "epoch": 22.89547890934514, | |
| "grad_norm": 0.4191761314868927, | |
| "learning_rate": 0.000325447974351501, | |
| "loss": 3.2322, | |
| "step": 78600 | |
| }, | |
| { | |
| "epoch": 22.910044278722907, | |
| "grad_norm": 0.4233628511428833, | |
| "learning_rate": 0.0003252730982220927, | |
| "loss": 3.2224, | |
| "step": 78650 | |
| }, | |
| { | |
| "epoch": 22.924609648100677, | |
| "grad_norm": 0.406550794839859, | |
| "learning_rate": 0.0003250982220926843, | |
| "loss": 3.2343, | |
| "step": 78700 | |
| }, | |
| { | |
| "epoch": 22.939175017478444, | |
| "grad_norm": 0.4227450489997864, | |
| "learning_rate": 0.000324923345963276, | |
| "loss": 3.2362, | |
| "step": 78750 | |
| }, | |
| { | |
| "epoch": 22.95374038685621, | |
| "grad_norm": 0.4387967884540558, | |
| "learning_rate": 0.00032474846983386764, | |
| "loss": 3.2464, | |
| "step": 78800 | |
| }, | |
| { | |
| "epoch": 22.968305756233978, | |
| "grad_norm": 0.4019660949707031, | |
| "learning_rate": 0.00032457359370445933, | |
| "loss": 3.2288, | |
| "step": 78850 | |
| }, | |
| { | |
| "epoch": 22.982871125611744, | |
| "grad_norm": 0.3797237277030945, | |
| "learning_rate": 0.00032439871757505097, | |
| "loss": 3.2282, | |
| "step": 78900 | |
| }, | |
| { | |
| "epoch": 22.997436494989515, | |
| "grad_norm": 0.4136102497577667, | |
| "learning_rate": 0.0003242238414456426, | |
| "loss": 3.2305, | |
| "step": 78950 | |
| }, | |
| { | |
| "epoch": 23.01194360288977, | |
| "grad_norm": 0.3978010416030884, | |
| "learning_rate": 0.00032404896531623435, | |
| "loss": 3.1549, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 23.01194360288977, | |
| "eval_accuracy": 0.37382040880150064, | |
| "eval_loss": 3.54584002494812, | |
| "eval_runtime": 180.1859, | |
| "eval_samples_per_second": 92.377, | |
| "eval_steps_per_second": 5.777, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 23.026508972267536, | |
| "grad_norm": 0.4399334788322449, | |
| "learning_rate": 0.000323874089186826, | |
| "loss": 3.1371, | |
| "step": 79050 | |
| }, | |
| { | |
| "epoch": 23.041074341645306, | |
| "grad_norm": 0.4500805139541626, | |
| "learning_rate": 0.0003236992130574177, | |
| "loss": 3.1506, | |
| "step": 79100 | |
| }, | |
| { | |
| "epoch": 23.055639711023073, | |
| "grad_norm": 0.42291468381881714, | |
| "learning_rate": 0.0003235243369280093, | |
| "loss": 3.1273, | |
| "step": 79150 | |
| }, | |
| { | |
| "epoch": 23.07020508040084, | |
| "grad_norm": 0.3858034610748291, | |
| "learning_rate": 0.00032334946079860095, | |
| "loss": 3.1407, | |
| "step": 79200 | |
| }, | |
| { | |
| "epoch": 23.084770449778606, | |
| "grad_norm": 0.4042983651161194, | |
| "learning_rate": 0.00032317458466919264, | |
| "loss": 3.1527, | |
| "step": 79250 | |
| }, | |
| { | |
| "epoch": 23.099335819156373, | |
| "grad_norm": 0.39408230781555176, | |
| "learning_rate": 0.0003229997085397843, | |
| "loss": 3.1524, | |
| "step": 79300 | |
| }, | |
| { | |
| "epoch": 23.11390118853414, | |
| "grad_norm": 0.3985508680343628, | |
| "learning_rate": 0.00032282483241037596, | |
| "loss": 3.1637, | |
| "step": 79350 | |
| }, | |
| { | |
| "epoch": 23.12846655791191, | |
| "grad_norm": 0.4050256907939911, | |
| "learning_rate": 0.0003226499562809676, | |
| "loss": 3.1487, | |
| "step": 79400 | |
| }, | |
| { | |
| "epoch": 23.143031927289677, | |
| "grad_norm": 0.469816118478775, | |
| "learning_rate": 0.00032247508015155924, | |
| "loss": 3.1614, | |
| "step": 79450 | |
| }, | |
| { | |
| "epoch": 23.157597296667443, | |
| "grad_norm": 0.4118306636810303, | |
| "learning_rate": 0.000322300204022151, | |
| "loss": 3.1541, | |
| "step": 79500 | |
| }, | |
| { | |
| "epoch": 23.17216266604521, | |
| "grad_norm": 0.3904615640640259, | |
| "learning_rate": 0.0003221253278927426, | |
| "loss": 3.1667, | |
| "step": 79550 | |
| }, | |
| { | |
| "epoch": 23.186728035422977, | |
| "grad_norm": 0.40412455797195435, | |
| "learning_rate": 0.0003219504517633343, | |
| "loss": 3.1724, | |
| "step": 79600 | |
| }, | |
| { | |
| "epoch": 23.201293404800747, | |
| "grad_norm": 0.43276599049568176, | |
| "learning_rate": 0.00032177557563392594, | |
| "loss": 3.1651, | |
| "step": 79650 | |
| }, | |
| { | |
| "epoch": 23.215858774178514, | |
| "grad_norm": 0.38202792406082153, | |
| "learning_rate": 0.00032160069950451763, | |
| "loss": 3.1693, | |
| "step": 79700 | |
| }, | |
| { | |
| "epoch": 23.23042414355628, | |
| "grad_norm": 0.4077218472957611, | |
| "learning_rate": 0.00032142582337510927, | |
| "loss": 3.1715, | |
| "step": 79750 | |
| }, | |
| { | |
| "epoch": 23.244989512934048, | |
| "grad_norm": 0.42455849051475525, | |
| "learning_rate": 0.0003212509472457009, | |
| "loss": 3.1702, | |
| "step": 79800 | |
| }, | |
| { | |
| "epoch": 23.259554882311814, | |
| "grad_norm": 0.4102165400981903, | |
| "learning_rate": 0.0003210760711162926, | |
| "loss": 3.1808, | |
| "step": 79850 | |
| }, | |
| { | |
| "epoch": 23.27412025168958, | |
| "grad_norm": 0.4249970316886902, | |
| "learning_rate": 0.00032090119498688423, | |
| "loss": 3.1731, | |
| "step": 79900 | |
| }, | |
| { | |
| "epoch": 23.28868562106735, | |
| "grad_norm": 0.39110127091407776, | |
| "learning_rate": 0.0003207263188574759, | |
| "loss": 3.1802, | |
| "step": 79950 | |
| }, | |
| { | |
| "epoch": 23.303250990445118, | |
| "grad_norm": 0.43393391370773315, | |
| "learning_rate": 0.0003205514427280676, | |
| "loss": 3.1875, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 23.303250990445118, | |
| "eval_accuracy": 0.3732344415646407, | |
| "eval_loss": 3.552424907684326, | |
| "eval_runtime": 180.3512, | |
| "eval_samples_per_second": 92.292, | |
| "eval_steps_per_second": 5.772, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 23.317816359822885, | |
| "grad_norm": 0.3961530029773712, | |
| "learning_rate": 0.00032037656659865925, | |
| "loss": 3.128, | |
| "step": 80050 | |
| }, | |
| { | |
| "epoch": 23.33238172920065, | |
| "grad_norm": 0.43098706007003784, | |
| "learning_rate": 0.00032020169046925094, | |
| "loss": 3.1388, | |
| "step": 80100 | |
| }, | |
| { | |
| "epoch": 23.34694709857842, | |
| "grad_norm": 0.4158017337322235, | |
| "learning_rate": 0.0003200268143398426, | |
| "loss": 3.1452, | |
| "step": 80150 | |
| }, | |
| { | |
| "epoch": 23.36151246795619, | |
| "grad_norm": 0.40117818117141724, | |
| "learning_rate": 0.00031985193821043427, | |
| "loss": 3.1548, | |
| "step": 80200 | |
| }, | |
| { | |
| "epoch": 23.376077837333956, | |
| "grad_norm": 0.4437515139579773, | |
| "learning_rate": 0.0003196770620810259, | |
| "loss": 3.1432, | |
| "step": 80250 | |
| }, | |
| { | |
| "epoch": 23.390643206711722, | |
| "grad_norm": 0.41832467913627625, | |
| "learning_rate": 0.0003195021859516176, | |
| "loss": 3.1679, | |
| "step": 80300 | |
| }, | |
| { | |
| "epoch": 23.40520857608949, | |
| "grad_norm": 0.41394323110580444, | |
| "learning_rate": 0.00031932730982220923, | |
| "loss": 3.1668, | |
| "step": 80350 | |
| }, | |
| { | |
| "epoch": 23.419773945467256, | |
| "grad_norm": 0.4065753221511841, | |
| "learning_rate": 0.00031915243369280087, | |
| "loss": 3.1507, | |
| "step": 80400 | |
| }, | |
| { | |
| "epoch": 23.434339314845026, | |
| "grad_norm": 0.41089168190956116, | |
| "learning_rate": 0.00031897755756339256, | |
| "loss": 3.1685, | |
| "step": 80450 | |
| }, | |
| { | |
| "epoch": 23.448904684222793, | |
| "grad_norm": 0.4270135164260864, | |
| "learning_rate": 0.0003188026814339842, | |
| "loss": 3.1697, | |
| "step": 80500 | |
| }, | |
| { | |
| "epoch": 23.46347005360056, | |
| "grad_norm": 0.4168857932090759, | |
| "learning_rate": 0.00031862780530457594, | |
| "loss": 3.1673, | |
| "step": 80550 | |
| }, | |
| { | |
| "epoch": 23.478035422978326, | |
| "grad_norm": 0.39209380745887756, | |
| "learning_rate": 0.0003184529291751676, | |
| "loss": 3.1786, | |
| "step": 80600 | |
| }, | |
| { | |
| "epoch": 23.492600792356093, | |
| "grad_norm": 0.45542216300964355, | |
| "learning_rate": 0.0003182780530457592, | |
| "loss": 3.1646, | |
| "step": 80650 | |
| }, | |
| { | |
| "epoch": 23.50716616173386, | |
| "grad_norm": 0.45212727785110474, | |
| "learning_rate": 0.0003181031769163509, | |
| "loss": 3.1683, | |
| "step": 80700 | |
| }, | |
| { | |
| "epoch": 23.52173153111163, | |
| "grad_norm": 0.4164752960205078, | |
| "learning_rate": 0.00031792830078694254, | |
| "loss": 3.1663, | |
| "step": 80750 | |
| }, | |
| { | |
| "epoch": 23.536296900489397, | |
| "grad_norm": 0.4018906354904175, | |
| "learning_rate": 0.0003177534246575342, | |
| "loss": 3.1639, | |
| "step": 80800 | |
| }, | |
| { | |
| "epoch": 23.550862269867164, | |
| "grad_norm": 0.41436582803726196, | |
| "learning_rate": 0.00031757854852812586, | |
| "loss": 3.1684, | |
| "step": 80850 | |
| }, | |
| { | |
| "epoch": 23.56542763924493, | |
| "grad_norm": 0.4052976369857788, | |
| "learning_rate": 0.00031740367239871755, | |
| "loss": 3.1658, | |
| "step": 80900 | |
| }, | |
| { | |
| "epoch": 23.579993008622697, | |
| "grad_norm": 0.39007437229156494, | |
| "learning_rate": 0.0003172287962693092, | |
| "loss": 3.167, | |
| "step": 80950 | |
| }, | |
| { | |
| "epoch": 23.594558378000468, | |
| "grad_norm": 0.41448211669921875, | |
| "learning_rate": 0.0003170539201399008, | |
| "loss": 3.1815, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 23.594558378000468, | |
| "eval_accuracy": 0.3736459354107502, | |
| "eval_loss": 3.551520347595215, | |
| "eval_runtime": 180.7918, | |
| "eval_samples_per_second": 92.067, | |
| "eval_steps_per_second": 5.758, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 23.609123747378234, | |
| "grad_norm": 0.42460495233535767, | |
| "learning_rate": 0.00031687904401049257, | |
| "loss": 3.1911, | |
| "step": 81050 | |
| }, | |
| { | |
| "epoch": 23.623689116756, | |
| "grad_norm": 0.44785287976264954, | |
| "learning_rate": 0.0003167041678810842, | |
| "loss": 3.1938, | |
| "step": 81100 | |
| }, | |
| { | |
| "epoch": 23.638254486133768, | |
| "grad_norm": 0.4176446497440338, | |
| "learning_rate": 0.0003165292917516759, | |
| "loss": 3.188, | |
| "step": 81150 | |
| }, | |
| { | |
| "epoch": 23.652819855511535, | |
| "grad_norm": 0.40342187881469727, | |
| "learning_rate": 0.00031635441562226753, | |
| "loss": 3.1827, | |
| "step": 81200 | |
| }, | |
| { | |
| "epoch": 23.667385224889305, | |
| "grad_norm": 0.4379565119743347, | |
| "learning_rate": 0.00031617953949285917, | |
| "loss": 3.1919, | |
| "step": 81250 | |
| }, | |
| { | |
| "epoch": 23.68195059426707, | |
| "grad_norm": 0.4336557388305664, | |
| "learning_rate": 0.00031600466336345086, | |
| "loss": 3.1789, | |
| "step": 81300 | |
| }, | |
| { | |
| "epoch": 23.69651596364484, | |
| "grad_norm": 0.4131733775138855, | |
| "learning_rate": 0.0003158297872340425, | |
| "loss": 3.1905, | |
| "step": 81350 | |
| }, | |
| { | |
| "epoch": 23.711081333022605, | |
| "grad_norm": 0.4182552397251129, | |
| "learning_rate": 0.0003156549111046342, | |
| "loss": 3.1819, | |
| "step": 81400 | |
| }, | |
| { | |
| "epoch": 23.725646702400372, | |
| "grad_norm": 0.42562443017959595, | |
| "learning_rate": 0.0003154800349752258, | |
| "loss": 3.1956, | |
| "step": 81450 | |
| }, | |
| { | |
| "epoch": 23.74021207177814, | |
| "grad_norm": 0.40080058574676514, | |
| "learning_rate": 0.00031530515884581757, | |
| "loss": 3.2053, | |
| "step": 81500 | |
| }, | |
| { | |
| "epoch": 23.75477744115591, | |
| "grad_norm": 0.3980943560600281, | |
| "learning_rate": 0.0003151302827164092, | |
| "loss": 3.2019, | |
| "step": 81550 | |
| }, | |
| { | |
| "epoch": 23.769342810533676, | |
| "grad_norm": 0.41131657361984253, | |
| "learning_rate": 0.00031495540658700084, | |
| "loss": 3.1842, | |
| "step": 81600 | |
| }, | |
| { | |
| "epoch": 23.783908179911442, | |
| "grad_norm": 0.4096752405166626, | |
| "learning_rate": 0.00031478053045759253, | |
| "loss": 3.1931, | |
| "step": 81650 | |
| }, | |
| { | |
| "epoch": 23.79847354928921, | |
| "grad_norm": 0.4075625538825989, | |
| "learning_rate": 0.00031460565432818417, | |
| "loss": 3.1858, | |
| "step": 81700 | |
| }, | |
| { | |
| "epoch": 23.813038918666976, | |
| "grad_norm": 0.3922245502471924, | |
| "learning_rate": 0.00031443077819877586, | |
| "loss": 3.1979, | |
| "step": 81750 | |
| }, | |
| { | |
| "epoch": 23.827604288044746, | |
| "grad_norm": 0.4519116282463074, | |
| "learning_rate": 0.0003142559020693675, | |
| "loss": 3.2028, | |
| "step": 81800 | |
| }, | |
| { | |
| "epoch": 23.842169657422513, | |
| "grad_norm": 0.3990562856197357, | |
| "learning_rate": 0.00031408102593995913, | |
| "loss": 3.1981, | |
| "step": 81850 | |
| }, | |
| { | |
| "epoch": 23.85673502680028, | |
| "grad_norm": 0.4091752767562866, | |
| "learning_rate": 0.0003139061498105508, | |
| "loss": 3.2087, | |
| "step": 81900 | |
| }, | |
| { | |
| "epoch": 23.871300396178047, | |
| "grad_norm": 0.43344610929489136, | |
| "learning_rate": 0.00031373127368114245, | |
| "loss": 3.2012, | |
| "step": 81950 | |
| }, | |
| { | |
| "epoch": 23.885865765555813, | |
| "grad_norm": 0.42658382654190063, | |
| "learning_rate": 0.0003135563975517342, | |
| "loss": 3.2017, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 23.885865765555813, | |
| "eval_accuracy": 0.3737183583276655, | |
| "eval_loss": 3.5459020137786865, | |
| "eval_runtime": 180.2368, | |
| "eval_samples_per_second": 92.351, | |
| "eval_steps_per_second": 5.776, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 23.900431134933584, | |
| "grad_norm": 0.41615602374076843, | |
| "learning_rate": 0.00031338152142232584, | |
| "loss": 3.2067, | |
| "step": 82050 | |
| }, | |
| { | |
| "epoch": 23.91499650431135, | |
| "grad_norm": 0.4384036958217621, | |
| "learning_rate": 0.00031320664529291747, | |
| "loss": 3.2036, | |
| "step": 82100 | |
| }, | |
| { | |
| "epoch": 23.929561873689117, | |
| "grad_norm": 0.40373918414115906, | |
| "learning_rate": 0.00031303176916350916, | |
| "loss": 3.2073, | |
| "step": 82150 | |
| }, | |
| { | |
| "epoch": 23.944127243066884, | |
| "grad_norm": 0.4283479154109955, | |
| "learning_rate": 0.0003128568930341008, | |
| "loss": 3.2195, | |
| "step": 82200 | |
| }, | |
| { | |
| "epoch": 23.95869261244465, | |
| "grad_norm": 0.42980676889419556, | |
| "learning_rate": 0.0003126820169046925, | |
| "loss": 3.2104, | |
| "step": 82250 | |
| }, | |
| { | |
| "epoch": 23.973257981822417, | |
| "grad_norm": 0.4395185708999634, | |
| "learning_rate": 0.0003125071407752841, | |
| "loss": 3.2022, | |
| "step": 82300 | |
| }, | |
| { | |
| "epoch": 23.987823351200188, | |
| "grad_norm": 0.4349308907985687, | |
| "learning_rate": 0.0003123322646458758, | |
| "loss": 3.2058, | |
| "step": 82350 | |
| }, | |
| { | |
| "epoch": 24.002621766487998, | |
| "grad_norm": 0.37979841232299805, | |
| "learning_rate": 0.00031215738851646745, | |
| "loss": 3.2653, | |
| "step": 82400 | |
| }, | |
| { | |
| "epoch": 24.017187135865765, | |
| "grad_norm": 0.39357176423072815, | |
| "learning_rate": 0.0003119825123870591, | |
| "loss": 3.1359, | |
| "step": 82450 | |
| }, | |
| { | |
| "epoch": 24.03175250524353, | |
| "grad_norm": 0.40956372022628784, | |
| "learning_rate": 0.00031180763625765083, | |
| "loss": 3.1277, | |
| "step": 82500 | |
| }, | |
| { | |
| "epoch": 24.0463178746213, | |
| "grad_norm": 0.3902159631252289, | |
| "learning_rate": 0.00031163276012824247, | |
| "loss": 3.1391, | |
| "step": 82550 | |
| }, | |
| { | |
| "epoch": 24.06088324399907, | |
| "grad_norm": 0.43166887760162354, | |
| "learning_rate": 0.00031145788399883416, | |
| "loss": 3.1339, | |
| "step": 82600 | |
| }, | |
| { | |
| "epoch": 24.075448613376835, | |
| "grad_norm": 0.40408581495285034, | |
| "learning_rate": 0.0003112830078694258, | |
| "loss": 3.1298, | |
| "step": 82650 | |
| }, | |
| { | |
| "epoch": 24.090013982754602, | |
| "grad_norm": 0.44965606927871704, | |
| "learning_rate": 0.00031110813174001743, | |
| "loss": 3.1439, | |
| "step": 82700 | |
| }, | |
| { | |
| "epoch": 24.10457935213237, | |
| "grad_norm": 0.43857842683792114, | |
| "learning_rate": 0.0003109332556106091, | |
| "loss": 3.1466, | |
| "step": 82750 | |
| }, | |
| { | |
| "epoch": 24.11914472151014, | |
| "grad_norm": 0.42380571365356445, | |
| "learning_rate": 0.00031075837948120076, | |
| "loss": 3.1439, | |
| "step": 82800 | |
| }, | |
| { | |
| "epoch": 24.133710090887906, | |
| "grad_norm": 0.4155460298061371, | |
| "learning_rate": 0.00031058350335179245, | |
| "loss": 3.1448, | |
| "step": 82850 | |
| }, | |
| { | |
| "epoch": 24.148275460265673, | |
| "grad_norm": 0.42540451884269714, | |
| "learning_rate": 0.0003104086272223841, | |
| "loss": 3.1484, | |
| "step": 82900 | |
| }, | |
| { | |
| "epoch": 24.16284082964344, | |
| "grad_norm": 0.4119025468826294, | |
| "learning_rate": 0.00031023375109297583, | |
| "loss": 3.1508, | |
| "step": 82950 | |
| }, | |
| { | |
| "epoch": 24.177406199021206, | |
| "grad_norm": 0.4322400391101837, | |
| "learning_rate": 0.00031005887496356746, | |
| "loss": 3.1608, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 24.177406199021206, | |
| "eval_accuracy": 0.3733360217597946, | |
| "eval_loss": 3.553809642791748, | |
| "eval_runtime": 180.1586, | |
| "eval_samples_per_second": 92.391, | |
| "eval_steps_per_second": 5.778, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 24.191971568398973, | |
| "grad_norm": 0.416153222322464, | |
| "learning_rate": 0.0003098839988341591, | |
| "loss": 3.1627, | |
| "step": 83050 | |
| }, | |
| { | |
| "epoch": 24.206536937776743, | |
| "grad_norm": 0.4277220070362091, | |
| "learning_rate": 0.0003097091227047508, | |
| "loss": 3.1689, | |
| "step": 83100 | |
| }, | |
| { | |
| "epoch": 24.22110230715451, | |
| "grad_norm": 0.3882681727409363, | |
| "learning_rate": 0.00030953424657534243, | |
| "loss": 3.168, | |
| "step": 83150 | |
| }, | |
| { | |
| "epoch": 24.235667676532277, | |
| "grad_norm": 0.41531258821487427, | |
| "learning_rate": 0.0003093593704459341, | |
| "loss": 3.168, | |
| "step": 83200 | |
| }, | |
| { | |
| "epoch": 24.250233045910043, | |
| "grad_norm": 0.44125837087631226, | |
| "learning_rate": 0.00030918449431652575, | |
| "loss": 3.1814, | |
| "step": 83250 | |
| }, | |
| { | |
| "epoch": 24.26479841528781, | |
| "grad_norm": 0.4468991160392761, | |
| "learning_rate": 0.0003090096181871174, | |
| "loss": 3.1816, | |
| "step": 83300 | |
| }, | |
| { | |
| "epoch": 24.27936378466558, | |
| "grad_norm": 0.40799108147621155, | |
| "learning_rate": 0.0003088347420577091, | |
| "loss": 3.1606, | |
| "step": 83350 | |
| }, | |
| { | |
| "epoch": 24.293929154043347, | |
| "grad_norm": 0.4418475329875946, | |
| "learning_rate": 0.0003086598659283007, | |
| "loss": 3.1716, | |
| "step": 83400 | |
| }, | |
| { | |
| "epoch": 24.308494523421114, | |
| "grad_norm": 0.435165137052536, | |
| "learning_rate": 0.00030848498979889246, | |
| "loss": 3.1786, | |
| "step": 83450 | |
| }, | |
| { | |
| "epoch": 24.32305989279888, | |
| "grad_norm": 0.40027478337287903, | |
| "learning_rate": 0.0003083101136694841, | |
| "loss": 3.1831, | |
| "step": 83500 | |
| }, | |
| { | |
| "epoch": 24.337625262176648, | |
| "grad_norm": 0.4050601124763489, | |
| "learning_rate": 0.0003081352375400758, | |
| "loss": 3.169, | |
| "step": 83550 | |
| }, | |
| { | |
| "epoch": 24.352190631554418, | |
| "grad_norm": 0.4074668288230896, | |
| "learning_rate": 0.0003079603614106674, | |
| "loss": 3.1695, | |
| "step": 83600 | |
| }, | |
| { | |
| "epoch": 24.366756000932185, | |
| "grad_norm": 0.4010626971721649, | |
| "learning_rate": 0.00030778548528125906, | |
| "loss": 3.1853, | |
| "step": 83650 | |
| }, | |
| { | |
| "epoch": 24.38132137030995, | |
| "grad_norm": 0.44930556416511536, | |
| "learning_rate": 0.00030761060915185075, | |
| "loss": 3.1887, | |
| "step": 83700 | |
| }, | |
| { | |
| "epoch": 24.395886739687718, | |
| "grad_norm": 0.42292553186416626, | |
| "learning_rate": 0.0003074357330224424, | |
| "loss": 3.1904, | |
| "step": 83750 | |
| }, | |
| { | |
| "epoch": 24.410452109065485, | |
| "grad_norm": 0.4097443222999573, | |
| "learning_rate": 0.0003072608568930341, | |
| "loss": 3.1923, | |
| "step": 83800 | |
| }, | |
| { | |
| "epoch": 24.42501747844325, | |
| "grad_norm": 0.3933105766773224, | |
| "learning_rate": 0.0003070859807636257, | |
| "loss": 3.197, | |
| "step": 83850 | |
| }, | |
| { | |
| "epoch": 24.439582847821022, | |
| "grad_norm": 0.3940275311470032, | |
| "learning_rate": 0.00030691110463421735, | |
| "loss": 3.1935, | |
| "step": 83900 | |
| }, | |
| { | |
| "epoch": 24.45414821719879, | |
| "grad_norm": 0.4267207086086273, | |
| "learning_rate": 0.0003067362285048091, | |
| "loss": 3.1905, | |
| "step": 83950 | |
| }, | |
| { | |
| "epoch": 24.468713586576555, | |
| "grad_norm": 0.4132387936115265, | |
| "learning_rate": 0.00030656135237540073, | |
| "loss": 3.1977, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 24.468713586576555, | |
| "eval_accuracy": 0.37396490192632026, | |
| "eval_loss": 3.546501398086548, | |
| "eval_runtime": 180.189, | |
| "eval_samples_per_second": 92.375, | |
| "eval_steps_per_second": 5.777, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 24.483278955954322, | |
| "grad_norm": 0.4435485303401947, | |
| "learning_rate": 0.0003063864762459924, | |
| "loss": 3.1924, | |
| "step": 84050 | |
| }, | |
| { | |
| "epoch": 24.49784432533209, | |
| "grad_norm": 0.402010440826416, | |
| "learning_rate": 0.00030621160011658406, | |
| "loss": 3.1967, | |
| "step": 84100 | |
| }, | |
| { | |
| "epoch": 24.51240969470986, | |
| "grad_norm": 0.42332908511161804, | |
| "learning_rate": 0.0003060367239871757, | |
| "loss": 3.2062, | |
| "step": 84150 | |
| }, | |
| { | |
| "epoch": 24.526975064087626, | |
| "grad_norm": 0.40317755937576294, | |
| "learning_rate": 0.0003058618478577674, | |
| "loss": 3.1848, | |
| "step": 84200 | |
| }, | |
| { | |
| "epoch": 24.541540433465393, | |
| "grad_norm": 0.4025932848453522, | |
| "learning_rate": 0.000305686971728359, | |
| "loss": 3.2035, | |
| "step": 84250 | |
| }, | |
| { | |
| "epoch": 24.55610580284316, | |
| "grad_norm": 0.41158631443977356, | |
| "learning_rate": 0.0003055120955989507, | |
| "loss": 3.2026, | |
| "step": 84300 | |
| }, | |
| { | |
| "epoch": 24.570671172220926, | |
| "grad_norm": 0.4377208650112152, | |
| "learning_rate": 0.00030533721946954235, | |
| "loss": 3.2001, | |
| "step": 84350 | |
| }, | |
| { | |
| "epoch": 24.585236541598697, | |
| "grad_norm": 0.3986060917377472, | |
| "learning_rate": 0.0003051623433401341, | |
| "loss": 3.195, | |
| "step": 84400 | |
| }, | |
| { | |
| "epoch": 24.599801910976463, | |
| "grad_norm": 0.4415377080440521, | |
| "learning_rate": 0.00030498746721072573, | |
| "loss": 3.1899, | |
| "step": 84450 | |
| }, | |
| { | |
| "epoch": 24.61436728035423, | |
| "grad_norm": 0.43405723571777344, | |
| "learning_rate": 0.00030481259108131736, | |
| "loss": 3.2059, | |
| "step": 84500 | |
| }, | |
| { | |
| "epoch": 24.628932649731997, | |
| "grad_norm": 0.42428821325302124, | |
| "learning_rate": 0.00030463771495190905, | |
| "loss": 3.1971, | |
| "step": 84550 | |
| }, | |
| { | |
| "epoch": 24.643498019109764, | |
| "grad_norm": 0.40802910923957825, | |
| "learning_rate": 0.0003044628388225007, | |
| "loss": 3.1907, | |
| "step": 84600 | |
| }, | |
| { | |
| "epoch": 24.65806338848753, | |
| "grad_norm": 0.4088672995567322, | |
| "learning_rate": 0.0003042879626930924, | |
| "loss": 3.2058, | |
| "step": 84650 | |
| }, | |
| { | |
| "epoch": 24.6726287578653, | |
| "grad_norm": 0.44554194808006287, | |
| "learning_rate": 0.000304113086563684, | |
| "loss": 3.2052, | |
| "step": 84700 | |
| }, | |
| { | |
| "epoch": 24.687194127243067, | |
| "grad_norm": 0.40504729747772217, | |
| "learning_rate": 0.00030393821043427565, | |
| "loss": 3.2203, | |
| "step": 84750 | |
| }, | |
| { | |
| "epoch": 24.701759496620834, | |
| "grad_norm": 0.40326762199401855, | |
| "learning_rate": 0.00030376333430486734, | |
| "loss": 3.2086, | |
| "step": 84800 | |
| }, | |
| { | |
| "epoch": 24.7163248659986, | |
| "grad_norm": 0.4128248989582062, | |
| "learning_rate": 0.000303588458175459, | |
| "loss": 3.1938, | |
| "step": 84850 | |
| }, | |
| { | |
| "epoch": 24.730890235376368, | |
| "grad_norm": 0.4104618430137634, | |
| "learning_rate": 0.0003034135820460507, | |
| "loss": 3.2093, | |
| "step": 84900 | |
| }, | |
| { | |
| "epoch": 24.745455604754138, | |
| "grad_norm": 0.43903848528862, | |
| "learning_rate": 0.00030323870591664236, | |
| "loss": 3.2078, | |
| "step": 84950 | |
| }, | |
| { | |
| "epoch": 24.760020974131905, | |
| "grad_norm": 0.4496263861656189, | |
| "learning_rate": 0.00030306382978723405, | |
| "loss": 3.2052, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 24.760020974131905, | |
| "eval_accuracy": 0.3744878518198903, | |
| "eval_loss": 3.5395586490631104, | |
| "eval_runtime": 180.1456, | |
| "eval_samples_per_second": 92.397, | |
| "eval_steps_per_second": 5.779, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 24.77458634350967, | |
| "grad_norm": 0.39450597763061523, | |
| "learning_rate": 0.0003028889536578257, | |
| "loss": 3.1969, | |
| "step": 85050 | |
| }, | |
| { | |
| "epoch": 24.78915171288744, | |
| "grad_norm": 0.4104599058628082, | |
| "learning_rate": 0.0003027140775284173, | |
| "loss": 3.2143, | |
| "step": 85100 | |
| }, | |
| { | |
| "epoch": 24.803717082265205, | |
| "grad_norm": 0.435557097196579, | |
| "learning_rate": 0.000302539201399009, | |
| "loss": 3.2116, | |
| "step": 85150 | |
| }, | |
| { | |
| "epoch": 24.818282451642972, | |
| "grad_norm": 0.3877793848514557, | |
| "learning_rate": 0.00030236432526960065, | |
| "loss": 3.204, | |
| "step": 85200 | |
| }, | |
| { | |
| "epoch": 24.832847821020742, | |
| "grad_norm": 0.41204720735549927, | |
| "learning_rate": 0.00030218944914019234, | |
| "loss": 3.2179, | |
| "step": 85250 | |
| }, | |
| { | |
| "epoch": 24.84741319039851, | |
| "grad_norm": 0.4013805389404297, | |
| "learning_rate": 0.000302014573010784, | |
| "loss": 3.2141, | |
| "step": 85300 | |
| }, | |
| { | |
| "epoch": 24.861978559776276, | |
| "grad_norm": 0.42889299988746643, | |
| "learning_rate": 0.0003018396968813756, | |
| "loss": 3.212, | |
| "step": 85350 | |
| }, | |
| { | |
| "epoch": 24.876543929154042, | |
| "grad_norm": 0.403367280960083, | |
| "learning_rate": 0.00030166482075196736, | |
| "loss": 3.2101, | |
| "step": 85400 | |
| }, | |
| { | |
| "epoch": 24.89110929853181, | |
| "grad_norm": 0.4000912010669708, | |
| "learning_rate": 0.000301489944622559, | |
| "loss": 3.2086, | |
| "step": 85450 | |
| }, | |
| { | |
| "epoch": 24.90567466790958, | |
| "grad_norm": 0.43617841601371765, | |
| "learning_rate": 0.0003013150684931507, | |
| "loss": 3.2146, | |
| "step": 85500 | |
| }, | |
| { | |
| "epoch": 24.920240037287346, | |
| "grad_norm": 0.39104828238487244, | |
| "learning_rate": 0.0003011401923637423, | |
| "loss": 3.2112, | |
| "step": 85550 | |
| }, | |
| { | |
| "epoch": 24.934805406665113, | |
| "grad_norm": 0.3803448975086212, | |
| "learning_rate": 0.000300965316234334, | |
| "loss": 3.2246, | |
| "step": 85600 | |
| }, | |
| { | |
| "epoch": 24.94937077604288, | |
| "grad_norm": 0.40348923206329346, | |
| "learning_rate": 0.00030079044010492565, | |
| "loss": 3.2231, | |
| "step": 85650 | |
| }, | |
| { | |
| "epoch": 24.963936145420647, | |
| "grad_norm": 0.40725383162498474, | |
| "learning_rate": 0.0003006155639755173, | |
| "loss": 3.2106, | |
| "step": 85700 | |
| }, | |
| { | |
| "epoch": 24.978501514798417, | |
| "grad_norm": 1.2001488208770752, | |
| "learning_rate": 0.00030044068784610897, | |
| "loss": 3.2361, | |
| "step": 85750 | |
| }, | |
| { | |
| "epoch": 24.993066884176184, | |
| "grad_norm": 0.4001549780368805, | |
| "learning_rate": 0.0003002658117167006, | |
| "loss": 3.226, | |
| "step": 85800 | |
| }, | |
| { | |
| "epoch": 25.007573992076438, | |
| "grad_norm": 0.44032537937164307, | |
| "learning_rate": 0.00030009093558729235, | |
| "loss": 3.1571, | |
| "step": 85850 | |
| }, | |
| { | |
| "epoch": 25.022139361454208, | |
| "grad_norm": 0.3807680606842041, | |
| "learning_rate": 0.000299916059457884, | |
| "loss": 3.1191, | |
| "step": 85900 | |
| }, | |
| { | |
| "epoch": 25.036704730831975, | |
| "grad_norm": 0.412787526845932, | |
| "learning_rate": 0.0002997411833284756, | |
| "loss": 3.1055, | |
| "step": 85950 | |
| }, | |
| { | |
| "epoch": 25.05127010020974, | |
| "grad_norm": 0.4120820462703705, | |
| "learning_rate": 0.0002995663071990673, | |
| "loss": 3.122, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 25.05127010020974, | |
| "eval_accuracy": 0.37377279308502226, | |
| "eval_loss": 3.5503273010253906, | |
| "eval_runtime": 180.5607, | |
| "eval_samples_per_second": 92.185, | |
| "eval_steps_per_second": 5.765, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 25.06583546958751, | |
| "grad_norm": 0.4001462757587433, | |
| "learning_rate": 0.00029939143106965895, | |
| "loss": 3.123, | |
| "step": 86050 | |
| }, | |
| { | |
| "epoch": 25.080400838965275, | |
| "grad_norm": 0.38697749376296997, | |
| "learning_rate": 0.00029921655494025064, | |
| "loss": 3.1187, | |
| "step": 86100 | |
| }, | |
| { | |
| "epoch": 25.094966208343042, | |
| "grad_norm": 0.4431687891483307, | |
| "learning_rate": 0.0002990416788108423, | |
| "loss": 3.1332, | |
| "step": 86150 | |
| }, | |
| { | |
| "epoch": 25.109531577720812, | |
| "grad_norm": 0.39642491936683655, | |
| "learning_rate": 0.00029886680268143397, | |
| "loss": 3.1289, | |
| "step": 86200 | |
| }, | |
| { | |
| "epoch": 25.12409694709858, | |
| "grad_norm": 0.4230706989765167, | |
| "learning_rate": 0.0002986919265520256, | |
| "loss": 3.1309, | |
| "step": 86250 | |
| }, | |
| { | |
| "epoch": 25.138662316476346, | |
| "grad_norm": 0.41169217228889465, | |
| "learning_rate": 0.0002985170504226173, | |
| "loss": 3.1399, | |
| "step": 86300 | |
| }, | |
| { | |
| "epoch": 25.153227685854112, | |
| "grad_norm": 0.42555660009384155, | |
| "learning_rate": 0.00029834217429320893, | |
| "loss": 3.1517, | |
| "step": 86350 | |
| }, | |
| { | |
| "epoch": 25.16779305523188, | |
| "grad_norm": 0.4415445029735565, | |
| "learning_rate": 0.0002981672981638006, | |
| "loss": 3.138, | |
| "step": 86400 | |
| }, | |
| { | |
| "epoch": 25.18235842460965, | |
| "grad_norm": 0.3974837064743042, | |
| "learning_rate": 0.00029799242203439226, | |
| "loss": 3.154, | |
| "step": 86450 | |
| }, | |
| { | |
| "epoch": 25.196923793987416, | |
| "grad_norm": 0.3946952521800995, | |
| "learning_rate": 0.00029781754590498395, | |
| "loss": 3.1533, | |
| "step": 86500 | |
| }, | |
| { | |
| "epoch": 25.211489163365183, | |
| "grad_norm": 0.40269792079925537, | |
| "learning_rate": 0.00029764266977557564, | |
| "loss": 3.1555, | |
| "step": 86550 | |
| }, | |
| { | |
| "epoch": 25.22605453274295, | |
| "grad_norm": 0.42831283807754517, | |
| "learning_rate": 0.0002974677936461673, | |
| "loss": 3.16, | |
| "step": 86600 | |
| }, | |
| { | |
| "epoch": 25.240619902120716, | |
| "grad_norm": 0.4184077978134155, | |
| "learning_rate": 0.0002972929175167589, | |
| "loss": 3.1594, | |
| "step": 86650 | |
| }, | |
| { | |
| "epoch": 25.255185271498487, | |
| "grad_norm": 0.40978071093559265, | |
| "learning_rate": 0.0002971180413873506, | |
| "loss": 3.1551, | |
| "step": 86700 | |
| }, | |
| { | |
| "epoch": 25.269750640876254, | |
| "grad_norm": 0.4196624755859375, | |
| "learning_rate": 0.00029694316525794224, | |
| "loss": 3.164, | |
| "step": 86750 | |
| }, | |
| { | |
| "epoch": 25.28431601025402, | |
| "grad_norm": 0.4315820336341858, | |
| "learning_rate": 0.00029676828912853393, | |
| "loss": 3.1674, | |
| "step": 86800 | |
| }, | |
| { | |
| "epoch": 25.298881379631787, | |
| "grad_norm": 0.3970293700695038, | |
| "learning_rate": 0.0002965934129991256, | |
| "loss": 3.173, | |
| "step": 86850 | |
| }, | |
| { | |
| "epoch": 25.313446749009554, | |
| "grad_norm": 0.4287879168987274, | |
| "learning_rate": 0.00029641853686971726, | |
| "loss": 3.1667, | |
| "step": 86900 | |
| }, | |
| { | |
| "epoch": 25.32801211838732, | |
| "grad_norm": 0.4140596091747284, | |
| "learning_rate": 0.0002962436607403089, | |
| "loss": 3.1729, | |
| "step": 86950 | |
| }, | |
| { | |
| "epoch": 25.34257748776509, | |
| "grad_norm": 0.43017467856407166, | |
| "learning_rate": 0.0002960687846109006, | |
| "loss": 3.1656, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 25.34257748776509, | |
| "eval_accuracy": 0.3738958885298442, | |
| "eval_loss": 3.5481033325195312, | |
| "eval_runtime": 180.694, | |
| "eval_samples_per_second": 92.117, | |
| "eval_steps_per_second": 5.761, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 25.357142857142858, | |
| "grad_norm": 0.4702874422073364, | |
| "learning_rate": 0.00029589390848149227, | |
| "loss": 3.1753, | |
| "step": 87050 | |
| }, | |
| { | |
| "epoch": 25.371708226520624, | |
| "grad_norm": 0.4354191720485687, | |
| "learning_rate": 0.0002957190323520839, | |
| "loss": 3.1733, | |
| "step": 87100 | |
| }, | |
| { | |
| "epoch": 25.38627359589839, | |
| "grad_norm": 0.42500242590904236, | |
| "learning_rate": 0.0002955441562226756, | |
| "loss": 3.1809, | |
| "step": 87150 | |
| }, | |
| { | |
| "epoch": 25.400838965276158, | |
| "grad_norm": 0.41088682413101196, | |
| "learning_rate": 0.00029536928009326723, | |
| "loss": 3.1821, | |
| "step": 87200 | |
| }, | |
| { | |
| "epoch": 25.41540433465393, | |
| "grad_norm": 0.4109274446964264, | |
| "learning_rate": 0.00029519440396385887, | |
| "loss": 3.1791, | |
| "step": 87250 | |
| }, | |
| { | |
| "epoch": 25.429969704031695, | |
| "grad_norm": 0.41495808959007263, | |
| "learning_rate": 0.00029501952783445056, | |
| "loss": 3.1909, | |
| "step": 87300 | |
| }, | |
| { | |
| "epoch": 25.44453507340946, | |
| "grad_norm": 0.4120579659938812, | |
| "learning_rate": 0.00029484465170504225, | |
| "loss": 3.1717, | |
| "step": 87350 | |
| }, | |
| { | |
| "epoch": 25.45910044278723, | |
| "grad_norm": 0.4141676425933838, | |
| "learning_rate": 0.0002946697755756339, | |
| "loss": 3.1782, | |
| "step": 87400 | |
| }, | |
| { | |
| "epoch": 25.473665812164995, | |
| "grad_norm": 0.47025105357170105, | |
| "learning_rate": 0.0002944948994462256, | |
| "loss": 3.1839, | |
| "step": 87450 | |
| }, | |
| { | |
| "epoch": 25.488231181542766, | |
| "grad_norm": 0.40863239765167236, | |
| "learning_rate": 0.00029432002331681727, | |
| "loss": 3.1862, | |
| "step": 87500 | |
| }, | |
| { | |
| "epoch": 25.502796550920532, | |
| "grad_norm": 0.4443865120410919, | |
| "learning_rate": 0.0002941451471874089, | |
| "loss": 3.1814, | |
| "step": 87550 | |
| }, | |
| { | |
| "epoch": 25.5173619202983, | |
| "grad_norm": 0.416560560464859, | |
| "learning_rate": 0.00029397027105800054, | |
| "loss": 3.1751, | |
| "step": 87600 | |
| }, | |
| { | |
| "epoch": 25.531927289676066, | |
| "grad_norm": 0.4450867474079132, | |
| "learning_rate": 0.00029379539492859223, | |
| "loss": 3.1802, | |
| "step": 87650 | |
| }, | |
| { | |
| "epoch": 25.546492659053833, | |
| "grad_norm": 0.39942240715026855, | |
| "learning_rate": 0.00029362051879918387, | |
| "loss": 3.176, | |
| "step": 87700 | |
| }, | |
| { | |
| "epoch": 25.5610580284316, | |
| "grad_norm": 0.4375326931476593, | |
| "learning_rate": 0.00029344564266977556, | |
| "loss": 3.1886, | |
| "step": 87750 | |
| }, | |
| { | |
| "epoch": 25.57562339780937, | |
| "grad_norm": 0.4080294668674469, | |
| "learning_rate": 0.00029327076654036725, | |
| "loss": 3.194, | |
| "step": 87800 | |
| }, | |
| { | |
| "epoch": 25.590188767187136, | |
| "grad_norm": 0.438039630651474, | |
| "learning_rate": 0.0002930958904109589, | |
| "loss": 3.1934, | |
| "step": 87850 | |
| }, | |
| { | |
| "epoch": 25.604754136564903, | |
| "grad_norm": 0.3943357765674591, | |
| "learning_rate": 0.0002929210142815505, | |
| "loss": 3.1864, | |
| "step": 87900 | |
| }, | |
| { | |
| "epoch": 25.61931950594267, | |
| "grad_norm": 0.42448991537094116, | |
| "learning_rate": 0.0002927461381521422, | |
| "loss": 3.1733, | |
| "step": 87950 | |
| }, | |
| { | |
| "epoch": 25.633884875320437, | |
| "grad_norm": 0.4249454140663147, | |
| "learning_rate": 0.0002925712620227339, | |
| "loss": 3.1894, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 25.633884875320437, | |
| "eval_accuracy": 0.3742290809755226, | |
| "eval_loss": 3.5432162284851074, | |
| "eval_runtime": 180.1053, | |
| "eval_samples_per_second": 92.418, | |
| "eval_steps_per_second": 5.78, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 25.648450244698207, | |
| "grad_norm": 0.4210646152496338, | |
| "learning_rate": 0.00029239638589332554, | |
| "loss": 3.1882, | |
| "step": 88050 | |
| }, | |
| { | |
| "epoch": 25.663015614075974, | |
| "grad_norm": 0.423607736825943, | |
| "learning_rate": 0.0002922215097639172, | |
| "loss": 3.1924, | |
| "step": 88100 | |
| }, | |
| { | |
| "epoch": 25.67758098345374, | |
| "grad_norm": 0.40322986245155334, | |
| "learning_rate": 0.00029204663363450886, | |
| "loss": 3.1921, | |
| "step": 88150 | |
| }, | |
| { | |
| "epoch": 25.692146352831507, | |
| "grad_norm": 0.4051462709903717, | |
| "learning_rate": 0.0002918717575051005, | |
| "loss": 3.1993, | |
| "step": 88200 | |
| }, | |
| { | |
| "epoch": 25.706711722209274, | |
| "grad_norm": 0.46788325905799866, | |
| "learning_rate": 0.0002916968813756922, | |
| "loss": 3.196, | |
| "step": 88250 | |
| }, | |
| { | |
| "epoch": 25.721277091587044, | |
| "grad_norm": 0.42460089921951294, | |
| "learning_rate": 0.0002915220052462839, | |
| "loss": 3.1927, | |
| "step": 88300 | |
| }, | |
| { | |
| "epoch": 25.73584246096481, | |
| "grad_norm": 0.40352416038513184, | |
| "learning_rate": 0.0002913471291168755, | |
| "loss": 3.2058, | |
| "step": 88350 | |
| }, | |
| { | |
| "epoch": 25.750407830342578, | |
| "grad_norm": 0.4355858266353607, | |
| "learning_rate": 0.00029117225298746715, | |
| "loss": 3.1969, | |
| "step": 88400 | |
| }, | |
| { | |
| "epoch": 25.764973199720345, | |
| "grad_norm": 0.4477996528148651, | |
| "learning_rate": 0.00029099737685805884, | |
| "loss": 3.2066, | |
| "step": 88450 | |
| }, | |
| { | |
| "epoch": 25.77953856909811, | |
| "grad_norm": 0.41174259781837463, | |
| "learning_rate": 0.00029082250072865053, | |
| "loss": 3.1981, | |
| "step": 88500 | |
| }, | |
| { | |
| "epoch": 25.794103938475878, | |
| "grad_norm": 0.41174444556236267, | |
| "learning_rate": 0.00029064762459924217, | |
| "loss": 3.2023, | |
| "step": 88550 | |
| }, | |
| { | |
| "epoch": 25.80866930785365, | |
| "grad_norm": 0.426297664642334, | |
| "learning_rate": 0.00029047274846983386, | |
| "loss": 3.2076, | |
| "step": 88600 | |
| }, | |
| { | |
| "epoch": 25.823234677231415, | |
| "grad_norm": 0.4066264033317566, | |
| "learning_rate": 0.0002902978723404255, | |
| "loss": 3.2045, | |
| "step": 88650 | |
| }, | |
| { | |
| "epoch": 25.837800046609182, | |
| "grad_norm": 0.4121791124343872, | |
| "learning_rate": 0.00029012299621101713, | |
| "loss": 3.2059, | |
| "step": 88700 | |
| }, | |
| { | |
| "epoch": 25.85236541598695, | |
| "grad_norm": 0.3962363302707672, | |
| "learning_rate": 0.0002899481200816088, | |
| "loss": 3.2033, | |
| "step": 88750 | |
| }, | |
| { | |
| "epoch": 25.866930785364715, | |
| "grad_norm": 0.3978932201862335, | |
| "learning_rate": 0.0002897732439522005, | |
| "loss": 3.2063, | |
| "step": 88800 | |
| }, | |
| { | |
| "epoch": 25.881496154742486, | |
| "grad_norm": 0.4174967408180237, | |
| "learning_rate": 0.00028959836782279215, | |
| "loss": 3.1895, | |
| "step": 88850 | |
| }, | |
| { | |
| "epoch": 25.896061524120253, | |
| "grad_norm": 0.40950289368629456, | |
| "learning_rate": 0.00028942349169338384, | |
| "loss": 3.2054, | |
| "step": 88900 | |
| }, | |
| { | |
| "epoch": 25.91062689349802, | |
| "grad_norm": 0.4066673517227173, | |
| "learning_rate": 0.00028924861556397553, | |
| "loss": 3.1945, | |
| "step": 88950 | |
| }, | |
| { | |
| "epoch": 25.925192262875786, | |
| "grad_norm": 0.4098004102706909, | |
| "learning_rate": 0.00028907373943456717, | |
| "loss": 3.2088, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 25.925192262875786, | |
| "eval_accuracy": 0.37464927497723555, | |
| "eval_loss": 3.532310724258423, | |
| "eval_runtime": 180.277, | |
| "eval_samples_per_second": 92.33, | |
| "eval_steps_per_second": 5.774, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 25.939757632253553, | |
| "grad_norm": 0.41714373230934143, | |
| "learning_rate": 0.0002888988633051588, | |
| "loss": 3.2092, | |
| "step": 89050 | |
| }, | |
| { | |
| "epoch": 25.954323001631323, | |
| "grad_norm": 0.4226542115211487, | |
| "learning_rate": 0.0002887239871757505, | |
| "loss": 3.2017, | |
| "step": 89100 | |
| }, | |
| { | |
| "epoch": 25.96888837100909, | |
| "grad_norm": 0.4572153091430664, | |
| "learning_rate": 0.00028854911104634213, | |
| "loss": 3.2083, | |
| "step": 89150 | |
| }, | |
| { | |
| "epoch": 25.983453740386857, | |
| "grad_norm": 0.42790505290031433, | |
| "learning_rate": 0.0002883742349169338, | |
| "loss": 3.2144, | |
| "step": 89200 | |
| }, | |
| { | |
| "epoch": 25.998019109764623, | |
| "grad_norm": 0.4000281095504761, | |
| "learning_rate": 0.0002881993587875255, | |
| "loss": 3.2087, | |
| "step": 89250 | |
| }, | |
| { | |
| "epoch": 26.01252621766488, | |
| "grad_norm": 0.40959304571151733, | |
| "learning_rate": 0.00028802448265811715, | |
| "loss": 3.1098, | |
| "step": 89300 | |
| }, | |
| { | |
| "epoch": 26.027091587042648, | |
| "grad_norm": 0.43922939896583557, | |
| "learning_rate": 0.0002878496065287088, | |
| "loss": 3.1144, | |
| "step": 89350 | |
| }, | |
| { | |
| "epoch": 26.041656956420415, | |
| "grad_norm": 0.43358200788497925, | |
| "learning_rate": 0.0002876747303993005, | |
| "loss": 3.1118, | |
| "step": 89400 | |
| }, | |
| { | |
| "epoch": 26.05622232579818, | |
| "grad_norm": 0.4154283106327057, | |
| "learning_rate": 0.00028749985426989216, | |
| "loss": 3.1089, | |
| "step": 89450 | |
| }, | |
| { | |
| "epoch": 26.070787695175948, | |
| "grad_norm": 0.445342093706131, | |
| "learning_rate": 0.0002873249781404838, | |
| "loss": 3.1359, | |
| "step": 89500 | |
| }, | |
| { | |
| "epoch": 26.08535306455372, | |
| "grad_norm": 0.4268057644367218, | |
| "learning_rate": 0.0002871501020110755, | |
| "loss": 3.11, | |
| "step": 89550 | |
| }, | |
| { | |
| "epoch": 26.099918433931485, | |
| "grad_norm": 0.41992899775505066, | |
| "learning_rate": 0.0002869752258816671, | |
| "loss": 3.1328, | |
| "step": 89600 | |
| }, | |
| { | |
| "epoch": 26.114483803309252, | |
| "grad_norm": 0.4173597991466522, | |
| "learning_rate": 0.00028680034975225876, | |
| "loss": 3.1256, | |
| "step": 89650 | |
| }, | |
| { | |
| "epoch": 26.12904917268702, | |
| "grad_norm": 0.4342476427555084, | |
| "learning_rate": 0.00028662547362285045, | |
| "loss": 3.1181, | |
| "step": 89700 | |
| }, | |
| { | |
| "epoch": 26.143614542064785, | |
| "grad_norm": 0.4344112277030945, | |
| "learning_rate": 0.00028645059749344214, | |
| "loss": 3.1259, | |
| "step": 89750 | |
| }, | |
| { | |
| "epoch": 26.158179911442556, | |
| "grad_norm": 0.4283551573753357, | |
| "learning_rate": 0.0002862757213640338, | |
| "loss": 3.141, | |
| "step": 89800 | |
| }, | |
| { | |
| "epoch": 26.172745280820322, | |
| "grad_norm": 0.4363539516925812, | |
| "learning_rate": 0.00028610084523462547, | |
| "loss": 3.1399, | |
| "step": 89850 | |
| }, | |
| { | |
| "epoch": 26.18731065019809, | |
| "grad_norm": 0.403045654296875, | |
| "learning_rate": 0.0002859259691052171, | |
| "loss": 3.1433, | |
| "step": 89900 | |
| }, | |
| { | |
| "epoch": 26.201876019575856, | |
| "grad_norm": 0.39886170625686646, | |
| "learning_rate": 0.0002857510929758088, | |
| "loss": 3.1444, | |
| "step": 89950 | |
| }, | |
| { | |
| "epoch": 26.216441388953623, | |
| "grad_norm": 0.419612854719162, | |
| "learning_rate": 0.00028557621684640043, | |
| "loss": 3.1508, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 26.216441388953623, | |
| "eval_accuracy": 0.3737163596432701, | |
| "eval_loss": 3.5523500442504883, | |
| "eval_runtime": 180.3889, | |
| "eval_samples_per_second": 92.273, | |
| "eval_steps_per_second": 5.771, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 26.23100675833139, | |
| "grad_norm": 0.42642802000045776, | |
| "learning_rate": 0.0002854013407169921, | |
| "loss": 3.1504, | |
| "step": 90050 | |
| }, | |
| { | |
| "epoch": 26.24557212770916, | |
| "grad_norm": 0.4583907127380371, | |
| "learning_rate": 0.00028522646458758376, | |
| "loss": 3.1457, | |
| "step": 90100 | |
| }, | |
| { | |
| "epoch": 26.260137497086927, | |
| "grad_norm": 0.42439937591552734, | |
| "learning_rate": 0.00028505158845817545, | |
| "loss": 3.1559, | |
| "step": 90150 | |
| }, | |
| { | |
| "epoch": 26.274702866464693, | |
| "grad_norm": 0.44137027859687805, | |
| "learning_rate": 0.0002848767123287671, | |
| "loss": 3.1467, | |
| "step": 90200 | |
| }, | |
| { | |
| "epoch": 26.28926823584246, | |
| "grad_norm": 0.42748647928237915, | |
| "learning_rate": 0.0002847018361993588, | |
| "loss": 3.157, | |
| "step": 90250 | |
| }, | |
| { | |
| "epoch": 26.303833605220227, | |
| "grad_norm": 0.4357030391693115, | |
| "learning_rate": 0.0002845269600699504, | |
| "loss": 3.1568, | |
| "step": 90300 | |
| }, | |
| { | |
| "epoch": 26.318398974597997, | |
| "grad_norm": 0.4434165358543396, | |
| "learning_rate": 0.0002843520839405421, | |
| "loss": 3.1537, | |
| "step": 90350 | |
| }, | |
| { | |
| "epoch": 26.332964343975764, | |
| "grad_norm": 0.4163769781589508, | |
| "learning_rate": 0.0002841772078111338, | |
| "loss": 3.1612, | |
| "step": 90400 | |
| }, | |
| { | |
| "epoch": 26.34752971335353, | |
| "grad_norm": 0.4222979247570038, | |
| "learning_rate": 0.00028400233168172543, | |
| "loss": 3.1499, | |
| "step": 90450 | |
| }, | |
| { | |
| "epoch": 26.362095082731297, | |
| "grad_norm": 0.42296892404556274, | |
| "learning_rate": 0.00028382745555231707, | |
| "loss": 3.1515, | |
| "step": 90500 | |
| }, | |
| { | |
| "epoch": 26.376660452109064, | |
| "grad_norm": 0.4424600899219513, | |
| "learning_rate": 0.00028365257942290876, | |
| "loss": 3.1693, | |
| "step": 90550 | |
| }, | |
| { | |
| "epoch": 26.391225821486834, | |
| "grad_norm": 0.49197620153427124, | |
| "learning_rate": 0.0002834777032935004, | |
| "loss": 3.1578, | |
| "step": 90600 | |
| }, | |
| { | |
| "epoch": 26.4057911908646, | |
| "grad_norm": 0.42167529463768005, | |
| "learning_rate": 0.0002833028271640921, | |
| "loss": 3.16, | |
| "step": 90650 | |
| }, | |
| { | |
| "epoch": 26.420356560242368, | |
| "grad_norm": 0.41751259565353394, | |
| "learning_rate": 0.00028312795103468377, | |
| "loss": 3.1661, | |
| "step": 90700 | |
| }, | |
| { | |
| "epoch": 26.434921929620135, | |
| "grad_norm": 0.4054003357887268, | |
| "learning_rate": 0.0002829530749052754, | |
| "loss": 3.1695, | |
| "step": 90750 | |
| }, | |
| { | |
| "epoch": 26.4494872989979, | |
| "grad_norm": 0.4270538091659546, | |
| "learning_rate": 0.00028277819877586705, | |
| "loss": 3.1721, | |
| "step": 90800 | |
| }, | |
| { | |
| "epoch": 26.46405266837567, | |
| "grad_norm": 0.4425770342350006, | |
| "learning_rate": 0.00028260332264645874, | |
| "loss": 3.1652, | |
| "step": 90850 | |
| }, | |
| { | |
| "epoch": 26.47861803775344, | |
| "grad_norm": 0.45323696732521057, | |
| "learning_rate": 0.0002824284465170504, | |
| "loss": 3.1668, | |
| "step": 90900 | |
| }, | |
| { | |
| "epoch": 26.493183407131205, | |
| "grad_norm": 0.4280482232570648, | |
| "learning_rate": 0.00028225357038764206, | |
| "loss": 3.1749, | |
| "step": 90950 | |
| }, | |
| { | |
| "epoch": 26.507748776508972, | |
| "grad_norm": 0.40407219529151917, | |
| "learning_rate": 0.00028207869425823375, | |
| "loss": 3.1713, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 26.507748776508972, | |
| "eval_accuracy": 0.3742687019544194, | |
| "eval_loss": 3.5423803329467773, | |
| "eval_runtime": 180.32, | |
| "eval_samples_per_second": 92.308, | |
| "eval_steps_per_second": 5.773, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 26.52231414588674, | |
| "grad_norm": 0.44610825181007385, | |
| "learning_rate": 0.0002819038181288254, | |
| "loss": 3.1699, | |
| "step": 91050 | |
| }, | |
| { | |
| "epoch": 26.536879515264506, | |
| "grad_norm": 0.4127073585987091, | |
| "learning_rate": 0.000281728941999417, | |
| "loss": 3.1862, | |
| "step": 91100 | |
| }, | |
| { | |
| "epoch": 26.551444884642276, | |
| "grad_norm": 0.5033323764801025, | |
| "learning_rate": 0.0002815540658700087, | |
| "loss": 3.1827, | |
| "step": 91150 | |
| }, | |
| { | |
| "epoch": 26.566010254020043, | |
| "grad_norm": 0.40992677211761475, | |
| "learning_rate": 0.0002813791897406004, | |
| "loss": 3.1775, | |
| "step": 91200 | |
| }, | |
| { | |
| "epoch": 26.58057562339781, | |
| "grad_norm": 0.42564913630485535, | |
| "learning_rate": 0.00028120431361119204, | |
| "loss": 3.1795, | |
| "step": 91250 | |
| }, | |
| { | |
| "epoch": 26.595140992775576, | |
| "grad_norm": 0.4116276502609253, | |
| "learning_rate": 0.00028102943748178373, | |
| "loss": 3.1807, | |
| "step": 91300 | |
| }, | |
| { | |
| "epoch": 26.609706362153343, | |
| "grad_norm": 0.41622668504714966, | |
| "learning_rate": 0.00028085456135237537, | |
| "loss": 3.1831, | |
| "step": 91350 | |
| }, | |
| { | |
| "epoch": 26.624271731531113, | |
| "grad_norm": 0.409170925617218, | |
| "learning_rate": 0.00028067968522296706, | |
| "loss": 3.1773, | |
| "step": 91400 | |
| }, | |
| { | |
| "epoch": 26.63883710090888, | |
| "grad_norm": 0.43525442481040955, | |
| "learning_rate": 0.0002805048090935587, | |
| "loss": 3.1753, | |
| "step": 91450 | |
| }, | |
| { | |
| "epoch": 26.653402470286647, | |
| "grad_norm": 0.4610871970653534, | |
| "learning_rate": 0.0002803299329641504, | |
| "loss": 3.167, | |
| "step": 91500 | |
| }, | |
| { | |
| "epoch": 26.667967839664414, | |
| "grad_norm": 0.4206131100654602, | |
| "learning_rate": 0.000280155056834742, | |
| "loss": 3.192, | |
| "step": 91550 | |
| }, | |
| { | |
| "epoch": 26.68253320904218, | |
| "grad_norm": 0.41161221265792847, | |
| "learning_rate": 0.0002799801807053337, | |
| "loss": 3.1936, | |
| "step": 91600 | |
| }, | |
| { | |
| "epoch": 26.697098578419947, | |
| "grad_norm": 0.46236079931259155, | |
| "learning_rate": 0.00027980530457592535, | |
| "loss": 3.1873, | |
| "step": 91650 | |
| }, | |
| { | |
| "epoch": 26.711663947797717, | |
| "grad_norm": 0.43499431014060974, | |
| "learning_rate": 0.00027963042844651704, | |
| "loss": 3.1807, | |
| "step": 91700 | |
| }, | |
| { | |
| "epoch": 26.726229317175484, | |
| "grad_norm": 0.41808217763900757, | |
| "learning_rate": 0.0002794555523171087, | |
| "loss": 3.1958, | |
| "step": 91750 | |
| }, | |
| { | |
| "epoch": 26.74079468655325, | |
| "grad_norm": 0.4619711637496948, | |
| "learning_rate": 0.00027928067618770037, | |
| "loss": 3.2044, | |
| "step": 91800 | |
| }, | |
| { | |
| "epoch": 26.755360055931018, | |
| "grad_norm": 0.4205552935600281, | |
| "learning_rate": 0.00027910580005829206, | |
| "loss": 3.1972, | |
| "step": 91850 | |
| }, | |
| { | |
| "epoch": 26.769925425308784, | |
| "grad_norm": 0.448356568813324, | |
| "learning_rate": 0.0002789309239288837, | |
| "loss": 3.1929, | |
| "step": 91900 | |
| }, | |
| { | |
| "epoch": 26.784490794686555, | |
| "grad_norm": 0.443820595741272, | |
| "learning_rate": 0.00027875604779947533, | |
| "loss": 3.1916, | |
| "step": 91950 | |
| }, | |
| { | |
| "epoch": 26.79905616406432, | |
| "grad_norm": 0.39487943053245544, | |
| "learning_rate": 0.000278581171670067, | |
| "loss": 3.1843, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 26.79905616406432, | |
| "eval_accuracy": 0.37467796197679293, | |
| "eval_loss": 3.5352799892425537, | |
| "eval_runtime": 180.1788, | |
| "eval_samples_per_second": 92.38, | |
| "eval_steps_per_second": 5.778, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 26.813621533442088, | |
| "grad_norm": 0.42585235834121704, | |
| "learning_rate": 0.00027840629554065865, | |
| "loss": 3.2017, | |
| "step": 92050 | |
| }, | |
| { | |
| "epoch": 26.828186902819855, | |
| "grad_norm": 0.45854058861732483, | |
| "learning_rate": 0.00027823141941125034, | |
| "loss": 3.1888, | |
| "step": 92100 | |
| }, | |
| { | |
| "epoch": 26.84275227219762, | |
| "grad_norm": 0.4178307354450226, | |
| "learning_rate": 0.00027805654328184204, | |
| "loss": 3.1904, | |
| "step": 92150 | |
| }, | |
| { | |
| "epoch": 26.857317641575392, | |
| "grad_norm": 0.4266476631164551, | |
| "learning_rate": 0.00027788166715243367, | |
| "loss": 3.1942, | |
| "step": 92200 | |
| }, | |
| { | |
| "epoch": 26.87188301095316, | |
| "grad_norm": 0.42667528986930847, | |
| "learning_rate": 0.0002777067910230253, | |
| "loss": 3.1938, | |
| "step": 92250 | |
| }, | |
| { | |
| "epoch": 26.886448380330926, | |
| "grad_norm": 0.44742685556411743, | |
| "learning_rate": 0.000277531914893617, | |
| "loss": 3.2002, | |
| "step": 92300 | |
| }, | |
| { | |
| "epoch": 26.901013749708692, | |
| "grad_norm": 0.4107263386249542, | |
| "learning_rate": 0.00027735703876420863, | |
| "loss": 3.1956, | |
| "step": 92350 | |
| }, | |
| { | |
| "epoch": 26.91557911908646, | |
| "grad_norm": 0.4186893403530121, | |
| "learning_rate": 0.0002771821626348003, | |
| "loss": 3.2089, | |
| "step": 92400 | |
| }, | |
| { | |
| "epoch": 26.930144488464226, | |
| "grad_norm": 0.42867034673690796, | |
| "learning_rate": 0.000277007286505392, | |
| "loss": 3.1904, | |
| "step": 92450 | |
| }, | |
| { | |
| "epoch": 26.944709857841996, | |
| "grad_norm": 0.4075056314468384, | |
| "learning_rate": 0.00027683241037598365, | |
| "loss": 3.1973, | |
| "step": 92500 | |
| }, | |
| { | |
| "epoch": 26.959275227219763, | |
| "grad_norm": 0.435605525970459, | |
| "learning_rate": 0.0002766575342465753, | |
| "loss": 3.1938, | |
| "step": 92550 | |
| }, | |
| { | |
| "epoch": 26.97384059659753, | |
| "grad_norm": 0.4103272259235382, | |
| "learning_rate": 0.000276482658117167, | |
| "loss": 3.2019, | |
| "step": 92600 | |
| }, | |
| { | |
| "epoch": 26.988405965975296, | |
| "grad_norm": 0.42094647884368896, | |
| "learning_rate": 0.00027630778198775867, | |
| "loss": 3.1926, | |
| "step": 92650 | |
| }, | |
| { | |
| "epoch": 27.002913073875554, | |
| "grad_norm": 0.44302475452423096, | |
| "learning_rate": 0.0002761329058583503, | |
| "loss": 3.1699, | |
| "step": 92700 | |
| }, | |
| { | |
| "epoch": 27.01747844325332, | |
| "grad_norm": 0.41190701723098755, | |
| "learning_rate": 0.000275958029728942, | |
| "loss": 3.1075, | |
| "step": 92750 | |
| }, | |
| { | |
| "epoch": 27.032043812631088, | |
| "grad_norm": 0.40939658880233765, | |
| "learning_rate": 0.00027578315359953363, | |
| "loss": 3.0978, | |
| "step": 92800 | |
| }, | |
| { | |
| "epoch": 27.046609182008854, | |
| "grad_norm": 0.470214307308197, | |
| "learning_rate": 0.00027560827747012527, | |
| "loss": 3.1039, | |
| "step": 92850 | |
| }, | |
| { | |
| "epoch": 27.061174551386625, | |
| "grad_norm": 0.4300399720668793, | |
| "learning_rate": 0.00027543340134071696, | |
| "loss": 3.1086, | |
| "step": 92900 | |
| }, | |
| { | |
| "epoch": 27.07573992076439, | |
| "grad_norm": 0.4189159572124481, | |
| "learning_rate": 0.00027525852521130865, | |
| "loss": 3.1083, | |
| "step": 92950 | |
| }, | |
| { | |
| "epoch": 27.090305290142158, | |
| "grad_norm": 0.42655813694000244, | |
| "learning_rate": 0.0002750836490819003, | |
| "loss": 3.1242, | |
| "step": 93000 | |
| }, | |
| { | |
| "epoch": 27.090305290142158, | |
| "eval_accuracy": 0.3741485457513554, | |
| "eval_loss": 3.5492360591888428, | |
| "eval_runtime": 180.2167, | |
| "eval_samples_per_second": 92.361, | |
| "eval_steps_per_second": 5.776, | |
| "step": 93000 | |
| }, | |
| { | |
| "epoch": 27.104870659519925, | |
| "grad_norm": 0.42484626173973083, | |
| "learning_rate": 0.000274908772952492, | |
| "loss": 3.1262, | |
| "step": 93050 | |
| }, | |
| { | |
| "epoch": 27.11943602889769, | |
| "grad_norm": 0.4413082003593445, | |
| "learning_rate": 0.0002747338968230836, | |
| "loss": 3.119, | |
| "step": 93100 | |
| }, | |
| { | |
| "epoch": 27.134001398275462, | |
| "grad_norm": 0.467857301235199, | |
| "learning_rate": 0.0002745590206936753, | |
| "loss": 3.1178, | |
| "step": 93150 | |
| }, | |
| { | |
| "epoch": 27.14856676765323, | |
| "grad_norm": 0.44680628180503845, | |
| "learning_rate": 0.00027438414456426694, | |
| "loss": 3.1284, | |
| "step": 93200 | |
| }, | |
| { | |
| "epoch": 27.163132137030995, | |
| "grad_norm": 0.4301452040672302, | |
| "learning_rate": 0.00027420926843485863, | |
| "loss": 3.1252, | |
| "step": 93250 | |
| }, | |
| { | |
| "epoch": 27.177697506408762, | |
| "grad_norm": 0.4273969531059265, | |
| "learning_rate": 0.00027403439230545026, | |
| "loss": 3.1284, | |
| "step": 93300 | |
| }, | |
| { | |
| "epoch": 27.19226287578653, | |
| "grad_norm": 0.4401126801967621, | |
| "learning_rate": 0.00027385951617604195, | |
| "loss": 3.1346, | |
| "step": 93350 | |
| }, | |
| { | |
| "epoch": 27.206828245164296, | |
| "grad_norm": 0.4096077084541321, | |
| "learning_rate": 0.0002736846400466336, | |
| "loss": 3.1379, | |
| "step": 93400 | |
| }, | |
| { | |
| "epoch": 27.221393614542066, | |
| "grad_norm": 0.4174330532550812, | |
| "learning_rate": 0.0002735097639172253, | |
| "loss": 3.1322, | |
| "step": 93450 | |
| }, | |
| { | |
| "epoch": 27.235958983919833, | |
| "grad_norm": 0.4422391653060913, | |
| "learning_rate": 0.0002733348877878169, | |
| "loss": 3.1311, | |
| "step": 93500 | |
| }, | |
| { | |
| "epoch": 27.2505243532976, | |
| "grad_norm": 0.4375896453857422, | |
| "learning_rate": 0.0002731600116584086, | |
| "loss": 3.1417, | |
| "step": 93550 | |
| }, | |
| { | |
| "epoch": 27.265089722675366, | |
| "grad_norm": 0.4318002164363861, | |
| "learning_rate": 0.0002729851355290003, | |
| "loss": 3.1503, | |
| "step": 93600 | |
| }, | |
| { | |
| "epoch": 27.279655092053133, | |
| "grad_norm": 0.421117901802063, | |
| "learning_rate": 0.00027281025939959193, | |
| "loss": 3.1242, | |
| "step": 93650 | |
| }, | |
| { | |
| "epoch": 27.294220461430903, | |
| "grad_norm": 0.4362134635448456, | |
| "learning_rate": 0.00027263538327018357, | |
| "loss": 3.1582, | |
| "step": 93700 | |
| }, | |
| { | |
| "epoch": 27.30878583080867, | |
| "grad_norm": 0.40824374556541443, | |
| "learning_rate": 0.00027246050714077526, | |
| "loss": 3.1412, | |
| "step": 93750 | |
| }, | |
| { | |
| "epoch": 27.323351200186437, | |
| "grad_norm": 0.4372468590736389, | |
| "learning_rate": 0.0002722856310113669, | |
| "loss": 3.1593, | |
| "step": 93800 | |
| }, | |
| { | |
| "epoch": 27.337916569564204, | |
| "grad_norm": 0.4370090365409851, | |
| "learning_rate": 0.0002721107548819586, | |
| "loss": 3.1438, | |
| "step": 93850 | |
| }, | |
| { | |
| "epoch": 27.35248193894197, | |
| "grad_norm": 0.4227096736431122, | |
| "learning_rate": 0.0002719358787525503, | |
| "loss": 3.1577, | |
| "step": 93900 | |
| }, | |
| { | |
| "epoch": 27.36704730831974, | |
| "grad_norm": 0.44676777720451355, | |
| "learning_rate": 0.0002717610026231419, | |
| "loss": 3.1549, | |
| "step": 93950 | |
| }, | |
| { | |
| "epoch": 27.381612677697508, | |
| "grad_norm": 0.42205673456192017, | |
| "learning_rate": 0.00027158612649373355, | |
| "loss": 3.1564, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 27.381612677697508, | |
| "eval_accuracy": 0.37452947148318255, | |
| "eval_loss": 3.5454885959625244, | |
| "eval_runtime": 180.3004, | |
| "eval_samples_per_second": 92.318, | |
| "eval_steps_per_second": 5.774, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 27.396178047075274, | |
| "grad_norm": 0.41671884059906006, | |
| "learning_rate": 0.00027141125036432524, | |
| "loss": 3.1507, | |
| "step": 94050 | |
| }, | |
| { | |
| "epoch": 27.41074341645304, | |
| "grad_norm": 0.4220370352268219, | |
| "learning_rate": 0.00027123637423491693, | |
| "loss": 3.1518, | |
| "step": 94100 | |
| }, | |
| { | |
| "epoch": 27.425308785830808, | |
| "grad_norm": 0.41778427362442017, | |
| "learning_rate": 0.00027106149810550857, | |
| "loss": 3.1651, | |
| "step": 94150 | |
| }, | |
| { | |
| "epoch": 27.439874155208575, | |
| "grad_norm": 0.44522127509117126, | |
| "learning_rate": 0.00027088662197610026, | |
| "loss": 3.1566, | |
| "step": 94200 | |
| }, | |
| { | |
| "epoch": 27.454439524586345, | |
| "grad_norm": 0.41769787669181824, | |
| "learning_rate": 0.0002707117458466919, | |
| "loss": 3.1643, | |
| "step": 94250 | |
| }, | |
| { | |
| "epoch": 27.46900489396411, | |
| "grad_norm": 0.4191843569278717, | |
| "learning_rate": 0.00027053686971728353, | |
| "loss": 3.1574, | |
| "step": 94300 | |
| }, | |
| { | |
| "epoch": 27.48357026334188, | |
| "grad_norm": 0.3989506661891937, | |
| "learning_rate": 0.0002703619935878752, | |
| "loss": 3.1625, | |
| "step": 94350 | |
| }, | |
| { | |
| "epoch": 27.498135632719645, | |
| "grad_norm": 0.4043886661529541, | |
| "learning_rate": 0.0002701871174584669, | |
| "loss": 3.1676, | |
| "step": 94400 | |
| }, | |
| { | |
| "epoch": 27.512701002097412, | |
| "grad_norm": 0.4305398166179657, | |
| "learning_rate": 0.00027001224132905855, | |
| "loss": 3.1538, | |
| "step": 94450 | |
| }, | |
| { | |
| "epoch": 27.527266371475182, | |
| "grad_norm": 0.43826940655708313, | |
| "learning_rate": 0.00026983736519965024, | |
| "loss": 3.1715, | |
| "step": 94500 | |
| }, | |
| { | |
| "epoch": 27.54183174085295, | |
| "grad_norm": 0.41890406608581543, | |
| "learning_rate": 0.0002696624890702419, | |
| "loss": 3.1744, | |
| "step": 94550 | |
| }, | |
| { | |
| "epoch": 27.556397110230716, | |
| "grad_norm": 0.42319968342781067, | |
| "learning_rate": 0.00026948761294083356, | |
| "loss": 3.1698, | |
| "step": 94600 | |
| }, | |
| { | |
| "epoch": 27.570962479608482, | |
| "grad_norm": 0.45210587978363037, | |
| "learning_rate": 0.0002693127368114252, | |
| "loss": 3.1734, | |
| "step": 94650 | |
| }, | |
| { | |
| "epoch": 27.58552784898625, | |
| "grad_norm": 0.43893709778785706, | |
| "learning_rate": 0.0002691378606820169, | |
| "loss": 3.1757, | |
| "step": 94700 | |
| }, | |
| { | |
| "epoch": 27.600093218364016, | |
| "grad_norm": 0.41676828265190125, | |
| "learning_rate": 0.0002689629845526085, | |
| "loss": 3.1645, | |
| "step": 94750 | |
| }, | |
| { | |
| "epoch": 27.614658587741786, | |
| "grad_norm": 0.4389956295490265, | |
| "learning_rate": 0.0002687881084232002, | |
| "loss": 3.1572, | |
| "step": 94800 | |
| }, | |
| { | |
| "epoch": 27.629223957119553, | |
| "grad_norm": 0.45625096559524536, | |
| "learning_rate": 0.00026861323229379185, | |
| "loss": 3.1791, | |
| "step": 94850 | |
| }, | |
| { | |
| "epoch": 27.64378932649732, | |
| "grad_norm": 0.43963149189949036, | |
| "learning_rate": 0.00026843835616438354, | |
| "loss": 3.1701, | |
| "step": 94900 | |
| }, | |
| { | |
| "epoch": 27.658354695875087, | |
| "grad_norm": 0.4541909694671631, | |
| "learning_rate": 0.0002682634800349752, | |
| "loss": 3.1809, | |
| "step": 94950 | |
| }, | |
| { | |
| "epoch": 27.672920065252853, | |
| "grad_norm": 0.4407740831375122, | |
| "learning_rate": 0.00026808860390556687, | |
| "loss": 3.1625, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 27.672920065252853, | |
| "eval_accuracy": 0.3749729442796183, | |
| "eval_loss": 3.5385658740997314, | |
| "eval_runtime": 180.3064, | |
| "eval_samples_per_second": 92.315, | |
| "eval_steps_per_second": 5.774, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 27.687485434630624, | |
| "grad_norm": 0.4178466200828552, | |
| "learning_rate": 0.00026791372777615856, | |
| "loss": 3.1759, | |
| "step": 95050 | |
| }, | |
| { | |
| "epoch": 27.70205080400839, | |
| "grad_norm": 0.42705100774765015, | |
| "learning_rate": 0.0002677388516467502, | |
| "loss": 3.1759, | |
| "step": 95100 | |
| }, | |
| { | |
| "epoch": 27.716616173386157, | |
| "grad_norm": 0.4739421308040619, | |
| "learning_rate": 0.00026756397551734183, | |
| "loss": 3.1809, | |
| "step": 95150 | |
| }, | |
| { | |
| "epoch": 27.731181542763924, | |
| "grad_norm": 0.4194541871547699, | |
| "learning_rate": 0.0002673890993879335, | |
| "loss": 3.1845, | |
| "step": 95200 | |
| }, | |
| { | |
| "epoch": 27.74574691214169, | |
| "grad_norm": 0.440497487783432, | |
| "learning_rate": 0.00026721422325852516, | |
| "loss": 3.1882, | |
| "step": 95250 | |
| }, | |
| { | |
| "epoch": 27.76031228151946, | |
| "grad_norm": 0.42385539412498474, | |
| "learning_rate": 0.00026703934712911685, | |
| "loss": 3.1663, | |
| "step": 95300 | |
| }, | |
| { | |
| "epoch": 27.774877650897228, | |
| "grad_norm": 0.4445149898529053, | |
| "learning_rate": 0.00026686447099970854, | |
| "loss": 3.186, | |
| "step": 95350 | |
| }, | |
| { | |
| "epoch": 27.789443020274994, | |
| "grad_norm": 0.4726870059967041, | |
| "learning_rate": 0.0002666895948703002, | |
| "loss": 3.1859, | |
| "step": 95400 | |
| }, | |
| { | |
| "epoch": 27.80400838965276, | |
| "grad_norm": 0.44043588638305664, | |
| "learning_rate": 0.0002665147187408918, | |
| "loss": 3.1927, | |
| "step": 95450 | |
| }, | |
| { | |
| "epoch": 27.818573759030528, | |
| "grad_norm": 0.4248933792114258, | |
| "learning_rate": 0.0002663398426114835, | |
| "loss": 3.1934, | |
| "step": 95500 | |
| }, | |
| { | |
| "epoch": 27.833139128408295, | |
| "grad_norm": 0.4195023775100708, | |
| "learning_rate": 0.0002661649664820752, | |
| "loss": 3.1849, | |
| "step": 95550 | |
| }, | |
| { | |
| "epoch": 27.847704497786065, | |
| "grad_norm": 0.4301508963108063, | |
| "learning_rate": 0.00026599009035266683, | |
| "loss": 3.1771, | |
| "step": 95600 | |
| }, | |
| { | |
| "epoch": 27.862269867163832, | |
| "grad_norm": 0.4297967851161957, | |
| "learning_rate": 0.0002658152142232585, | |
| "loss": 3.1853, | |
| "step": 95650 | |
| }, | |
| { | |
| "epoch": 27.8768352365416, | |
| "grad_norm": 0.42951589822769165, | |
| "learning_rate": 0.00026564033809385016, | |
| "loss": 3.1878, | |
| "step": 95700 | |
| }, | |
| { | |
| "epoch": 27.891400605919365, | |
| "grad_norm": 0.39632225036621094, | |
| "learning_rate": 0.0002654654619644418, | |
| "loss": 3.1832, | |
| "step": 95750 | |
| }, | |
| { | |
| "epoch": 27.905965975297132, | |
| "grad_norm": 0.4643055200576782, | |
| "learning_rate": 0.0002652905858350335, | |
| "loss": 3.1983, | |
| "step": 95800 | |
| }, | |
| { | |
| "epoch": 27.920531344674902, | |
| "grad_norm": 0.41126561164855957, | |
| "learning_rate": 0.00026511570970562517, | |
| "loss": 3.1842, | |
| "step": 95850 | |
| }, | |
| { | |
| "epoch": 27.93509671405267, | |
| "grad_norm": 0.40602535009384155, | |
| "learning_rate": 0.0002649408335762168, | |
| "loss": 3.1804, | |
| "step": 95900 | |
| }, | |
| { | |
| "epoch": 27.949662083430436, | |
| "grad_norm": 0.42574018239974976, | |
| "learning_rate": 0.0002647659574468085, | |
| "loss": 3.1809, | |
| "step": 95950 | |
| }, | |
| { | |
| "epoch": 27.964227452808203, | |
| "grad_norm": 0.4175705313682556, | |
| "learning_rate": 0.0002645910813174002, | |
| "loss": 3.1875, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 27.964227452808203, | |
| "eval_accuracy": 0.37480446694205405, | |
| "eval_loss": 3.5358710289001465, | |
| "eval_runtime": 180.3859, | |
| "eval_samples_per_second": 92.274, | |
| "eval_steps_per_second": 5.771, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 27.97879282218597, | |
| "grad_norm": 0.4382724165916443, | |
| "learning_rate": 0.0002644162051879918, | |
| "loss": 3.1871, | |
| "step": 96050 | |
| }, | |
| { | |
| "epoch": 27.99335819156374, | |
| "grad_norm": 0.4022817611694336, | |
| "learning_rate": 0.00026424132905858346, | |
| "loss": 3.1892, | |
| "step": 96100 | |
| }, | |
| { | |
| "epoch": 28.007865299463994, | |
| "grad_norm": 0.41491270065307617, | |
| "learning_rate": 0.00026406645292917515, | |
| "loss": 3.1331, | |
| "step": 96150 | |
| }, | |
| { | |
| "epoch": 28.02243066884176, | |
| "grad_norm": 0.45356419682502747, | |
| "learning_rate": 0.0002638915767997668, | |
| "loss": 3.0958, | |
| "step": 96200 | |
| }, | |
| { | |
| "epoch": 28.03699603821953, | |
| "grad_norm": 0.4329795837402344, | |
| "learning_rate": 0.0002637167006703585, | |
| "loss": 3.0965, | |
| "step": 96250 | |
| }, | |
| { | |
| "epoch": 28.051561407597298, | |
| "grad_norm": 0.43281567096710205, | |
| "learning_rate": 0.00026354182454095017, | |
| "loss": 3.0937, | |
| "step": 96300 | |
| }, | |
| { | |
| "epoch": 28.066126776975064, | |
| "grad_norm": 0.42949551343917847, | |
| "learning_rate": 0.0002633669484115418, | |
| "loss": 3.0962, | |
| "step": 96350 | |
| }, | |
| { | |
| "epoch": 28.08069214635283, | |
| "grad_norm": 0.46434319019317627, | |
| "learning_rate": 0.00026319207228213344, | |
| "loss": 3.0916, | |
| "step": 96400 | |
| }, | |
| { | |
| "epoch": 28.095257515730598, | |
| "grad_norm": 0.43190398812294006, | |
| "learning_rate": 0.00026301719615272513, | |
| "loss": 3.1137, | |
| "step": 96450 | |
| }, | |
| { | |
| "epoch": 28.109822885108365, | |
| "grad_norm": 0.44686824083328247, | |
| "learning_rate": 0.0002628423200233168, | |
| "loss": 3.1218, | |
| "step": 96500 | |
| }, | |
| { | |
| "epoch": 28.124388254486135, | |
| "grad_norm": 0.4280760884284973, | |
| "learning_rate": 0.00026266744389390846, | |
| "loss": 3.1093, | |
| "step": 96550 | |
| }, | |
| { | |
| "epoch": 28.1389536238639, | |
| "grad_norm": 0.41021299362182617, | |
| "learning_rate": 0.00026249256776450015, | |
| "loss": 3.1239, | |
| "step": 96600 | |
| }, | |
| { | |
| "epoch": 28.15351899324167, | |
| "grad_norm": 0.4637337923049927, | |
| "learning_rate": 0.0002623176916350918, | |
| "loss": 3.1269, | |
| "step": 96650 | |
| }, | |
| { | |
| "epoch": 28.168084362619435, | |
| "grad_norm": 0.4205281436443329, | |
| "learning_rate": 0.0002621428155056834, | |
| "loss": 3.1244, | |
| "step": 96700 | |
| }, | |
| { | |
| "epoch": 28.182649731997202, | |
| "grad_norm": 0.43963325023651123, | |
| "learning_rate": 0.0002619679393762751, | |
| "loss": 3.1256, | |
| "step": 96750 | |
| }, | |
| { | |
| "epoch": 28.197215101374972, | |
| "grad_norm": 0.40746745467185974, | |
| "learning_rate": 0.0002617930632468668, | |
| "loss": 3.1245, | |
| "step": 96800 | |
| }, | |
| { | |
| "epoch": 28.21178047075274, | |
| "grad_norm": 0.43309372663497925, | |
| "learning_rate": 0.00026161818711745844, | |
| "loss": 3.1279, | |
| "step": 96850 | |
| }, | |
| { | |
| "epoch": 28.226345840130506, | |
| "grad_norm": 0.46507173776626587, | |
| "learning_rate": 0.0002614433109880501, | |
| "loss": 3.1247, | |
| "step": 96900 | |
| }, | |
| { | |
| "epoch": 28.240911209508273, | |
| "grad_norm": 0.4328254759311676, | |
| "learning_rate": 0.00026126843485864176, | |
| "loss": 3.1296, | |
| "step": 96950 | |
| }, | |
| { | |
| "epoch": 28.25547657888604, | |
| "grad_norm": 0.42537787556648254, | |
| "learning_rate": 0.00026109355872923345, | |
| "loss": 3.1266, | |
| "step": 97000 | |
| }, | |
| { | |
| "epoch": 28.25547657888604, | |
| "eval_accuracy": 0.37432748678957795, | |
| "eval_loss": 3.5481479167938232, | |
| "eval_runtime": 180.2202, | |
| "eval_samples_per_second": 92.359, | |
| "eval_steps_per_second": 5.776, | |
| "step": 97000 | |
| }, | |
| { | |
| "epoch": 28.27004194826381, | |
| "grad_norm": 0.4779875576496124, | |
| "learning_rate": 0.0002609186825998251, | |
| "loss": 3.1381, | |
| "step": 97050 | |
| }, | |
| { | |
| "epoch": 28.284607317641576, | |
| "grad_norm": 0.45657962560653687, | |
| "learning_rate": 0.0002607438064704168, | |
| "loss": 3.1405, | |
| "step": 97100 | |
| }, | |
| { | |
| "epoch": 28.299172687019343, | |
| "grad_norm": 0.44699814915657043, | |
| "learning_rate": 0.0002605689303410084, | |
| "loss": 3.1292, | |
| "step": 97150 | |
| }, | |
| { | |
| "epoch": 28.31373805639711, | |
| "grad_norm": 0.42782288789749146, | |
| "learning_rate": 0.00026039405421160005, | |
| "loss": 3.1424, | |
| "step": 97200 | |
| }, | |
| { | |
| "epoch": 28.328303425774877, | |
| "grad_norm": 0.43705010414123535, | |
| "learning_rate": 0.00026021917808219174, | |
| "loss": 3.1352, | |
| "step": 97250 | |
| }, | |
| { | |
| "epoch": 28.342868795152643, | |
| "grad_norm": 0.4696052074432373, | |
| "learning_rate": 0.00026004430195278343, | |
| "loss": 3.1416, | |
| "step": 97300 | |
| }, | |
| { | |
| "epoch": 28.357434164530414, | |
| "grad_norm": 0.4281631112098694, | |
| "learning_rate": 0.00025986942582337507, | |
| "loss": 3.1427, | |
| "step": 97350 | |
| }, | |
| { | |
| "epoch": 28.37199953390818, | |
| "grad_norm": 0.4597308337688446, | |
| "learning_rate": 0.00025969454969396676, | |
| "loss": 3.1431, | |
| "step": 97400 | |
| }, | |
| { | |
| "epoch": 28.386564903285947, | |
| "grad_norm": 0.43386340141296387, | |
| "learning_rate": 0.00025951967356455845, | |
| "loss": 3.1386, | |
| "step": 97450 | |
| }, | |
| { | |
| "epoch": 28.401130272663714, | |
| "grad_norm": 0.44734376668930054, | |
| "learning_rate": 0.0002593447974351501, | |
| "loss": 3.1467, | |
| "step": 97500 | |
| }, | |
| { | |
| "epoch": 28.41569564204148, | |
| "grad_norm": 0.4462273120880127, | |
| "learning_rate": 0.0002591699213057417, | |
| "loss": 3.1383, | |
| "step": 97550 | |
| }, | |
| { | |
| "epoch": 28.43026101141925, | |
| "grad_norm": 0.4251599907875061, | |
| "learning_rate": 0.0002589950451763334, | |
| "loss": 3.1555, | |
| "step": 97600 | |
| }, | |
| { | |
| "epoch": 28.444826380797018, | |
| "grad_norm": 0.42566782236099243, | |
| "learning_rate": 0.00025882016904692505, | |
| "loss": 3.1627, | |
| "step": 97650 | |
| }, | |
| { | |
| "epoch": 28.459391750174785, | |
| "grad_norm": 0.4228300452232361, | |
| "learning_rate": 0.00025864529291751674, | |
| "loss": 3.1494, | |
| "step": 97700 | |
| }, | |
| { | |
| "epoch": 28.47395711955255, | |
| "grad_norm": 0.44921255111694336, | |
| "learning_rate": 0.00025847041678810843, | |
| "loss": 3.1556, | |
| "step": 97750 | |
| }, | |
| { | |
| "epoch": 28.488522488930318, | |
| "grad_norm": 0.4171012043952942, | |
| "learning_rate": 0.00025829554065870007, | |
| "loss": 3.1618, | |
| "step": 97800 | |
| }, | |
| { | |
| "epoch": 28.503087858308085, | |
| "grad_norm": 0.4369617700576782, | |
| "learning_rate": 0.0002581206645292917, | |
| "loss": 3.157, | |
| "step": 97850 | |
| }, | |
| { | |
| "epoch": 28.517653227685855, | |
| "grad_norm": 0.4576139748096466, | |
| "learning_rate": 0.0002579457883998834, | |
| "loss": 3.1497, | |
| "step": 97900 | |
| }, | |
| { | |
| "epoch": 28.532218597063622, | |
| "grad_norm": 0.46645256876945496, | |
| "learning_rate": 0.0002577709122704751, | |
| "loss": 3.1585, | |
| "step": 97950 | |
| }, | |
| { | |
| "epoch": 28.54678396644139, | |
| "grad_norm": 0.4609380066394806, | |
| "learning_rate": 0.0002575960361410667, | |
| "loss": 3.1698, | |
| "step": 98000 | |
| }, | |
| { | |
| "epoch": 28.54678396644139, | |
| "eval_accuracy": 0.37467278891129896, | |
| "eval_loss": 3.543879747390747, | |
| "eval_runtime": 180.2696, | |
| "eval_samples_per_second": 92.334, | |
| "eval_steps_per_second": 5.775, | |
| "step": 98000 | |
| }, | |
| { | |
| "epoch": 28.561349335819155, | |
| "grad_norm": 0.4351586401462555, | |
| "learning_rate": 0.0002574211600116584, | |
| "loss": 3.1573, | |
| "step": 98050 | |
| }, | |
| { | |
| "epoch": 28.575914705196922, | |
| "grad_norm": 0.4162726402282715, | |
| "learning_rate": 0.00025724628388225005, | |
| "loss": 3.1631, | |
| "step": 98100 | |
| }, | |
| { | |
| "epoch": 28.590480074574693, | |
| "grad_norm": 0.44690102338790894, | |
| "learning_rate": 0.0002570714077528417, | |
| "loss": 3.1584, | |
| "step": 98150 | |
| }, | |
| { | |
| "epoch": 28.60504544395246, | |
| "grad_norm": 0.43216705322265625, | |
| "learning_rate": 0.0002568965316234334, | |
| "loss": 3.1693, | |
| "step": 98200 | |
| }, | |
| { | |
| "epoch": 28.619610813330226, | |
| "grad_norm": 0.45666563510894775, | |
| "learning_rate": 0.00025672165549402506, | |
| "loss": 3.1635, | |
| "step": 98250 | |
| }, | |
| { | |
| "epoch": 28.634176182707993, | |
| "grad_norm": 0.44742050766944885, | |
| "learning_rate": 0.0002565467793646167, | |
| "loss": 3.1559, | |
| "step": 98300 | |
| }, | |
| { | |
| "epoch": 28.64874155208576, | |
| "grad_norm": 0.4800066351890564, | |
| "learning_rate": 0.0002563719032352084, | |
| "loss": 3.1656, | |
| "step": 98350 | |
| }, | |
| { | |
| "epoch": 28.66330692146353, | |
| "grad_norm": 0.4308624267578125, | |
| "learning_rate": 0.0002561970271058, | |
| "loss": 3.1618, | |
| "step": 98400 | |
| }, | |
| { | |
| "epoch": 28.677872290841297, | |
| "grad_norm": 0.4453083276748657, | |
| "learning_rate": 0.0002560221509763917, | |
| "loss": 3.1673, | |
| "step": 98450 | |
| }, | |
| { | |
| "epoch": 28.692437660219063, | |
| "grad_norm": 0.4215611219406128, | |
| "learning_rate": 0.00025584727484698335, | |
| "loss": 3.1757, | |
| "step": 98500 | |
| }, | |
| { | |
| "epoch": 28.70700302959683, | |
| "grad_norm": 0.466304749250412, | |
| "learning_rate": 0.00025567239871757504, | |
| "loss": 3.1556, | |
| "step": 98550 | |
| }, | |
| { | |
| "epoch": 28.721568398974597, | |
| "grad_norm": 0.4174833297729492, | |
| "learning_rate": 0.0002554975225881667, | |
| "loss": 3.168, | |
| "step": 98600 | |
| }, | |
| { | |
| "epoch": 28.736133768352367, | |
| "grad_norm": 0.44293835759162903, | |
| "learning_rate": 0.00025532264645875837, | |
| "loss": 3.1706, | |
| "step": 98650 | |
| }, | |
| { | |
| "epoch": 28.750699137730134, | |
| "grad_norm": 0.4146621525287628, | |
| "learning_rate": 0.00025514777032935, | |
| "loss": 3.1773, | |
| "step": 98700 | |
| }, | |
| { | |
| "epoch": 28.7652645071079, | |
| "grad_norm": 0.42804571986198425, | |
| "learning_rate": 0.0002549728941999417, | |
| "loss": 3.1804, | |
| "step": 98750 | |
| }, | |
| { | |
| "epoch": 28.779829876485667, | |
| "grad_norm": 0.3940410315990448, | |
| "learning_rate": 0.00025479801807053333, | |
| "loss": 3.1746, | |
| "step": 98800 | |
| }, | |
| { | |
| "epoch": 28.794395245863434, | |
| "grad_norm": 0.4341360330581665, | |
| "learning_rate": 0.000254623141941125, | |
| "loss": 3.1759, | |
| "step": 98850 | |
| }, | |
| { | |
| "epoch": 28.8089606152412, | |
| "grad_norm": 0.4237947165966034, | |
| "learning_rate": 0.0002544482658117167, | |
| "loss": 3.1689, | |
| "step": 98900 | |
| }, | |
| { | |
| "epoch": 28.82352598461897, | |
| "grad_norm": 0.4510186016559601, | |
| "learning_rate": 0.00025427338968230835, | |
| "loss": 3.1761, | |
| "step": 98950 | |
| }, | |
| { | |
| "epoch": 28.838091353996738, | |
| "grad_norm": 0.47351089119911194, | |
| "learning_rate": 0.0002540985135529, | |
| "loss": 3.1624, | |
| "step": 99000 | |
| }, | |
| { | |
| "epoch": 28.838091353996738, | |
| "eval_accuracy": 0.3749749429640137, | |
| "eval_loss": 3.540158271789551, | |
| "eval_runtime": 180.0523, | |
| "eval_samples_per_second": 92.445, | |
| "eval_steps_per_second": 5.782, | |
| "step": 99000 | |
| }, | |
| { | |
| "epoch": 28.852656723374505, | |
| "grad_norm": 0.4320347011089325, | |
| "learning_rate": 0.0002539236374234917, | |
| "loss": 3.1782, | |
| "step": 99050 | |
| }, | |
| { | |
| "epoch": 28.86722209275227, | |
| "grad_norm": 0.4034181833267212, | |
| "learning_rate": 0.0002537487612940833, | |
| "loss": 3.1781, | |
| "step": 99100 | |
| }, | |
| { | |
| "epoch": 28.88178746213004, | |
| "grad_norm": 0.4507603943347931, | |
| "learning_rate": 0.000253573885164675, | |
| "loss": 3.1755, | |
| "step": 99150 | |
| }, | |
| { | |
| "epoch": 28.89635283150781, | |
| "grad_norm": 0.41703903675079346, | |
| "learning_rate": 0.0002533990090352667, | |
| "loss": 3.1741, | |
| "step": 99200 | |
| }, | |
| { | |
| "epoch": 28.910918200885575, | |
| "grad_norm": 0.4325399100780487, | |
| "learning_rate": 0.00025322413290585833, | |
| "loss": 3.1772, | |
| "step": 99250 | |
| }, | |
| { | |
| "epoch": 28.925483570263342, | |
| "grad_norm": 0.4348280131816864, | |
| "learning_rate": 0.00025304925677644997, | |
| "loss": 3.1738, | |
| "step": 99300 | |
| }, | |
| { | |
| "epoch": 28.94004893964111, | |
| "grad_norm": 0.423967182636261, | |
| "learning_rate": 0.00025287438064704166, | |
| "loss": 3.184, | |
| "step": 99350 | |
| }, | |
| { | |
| "epoch": 28.954614309018876, | |
| "grad_norm": 0.4055376350879669, | |
| "learning_rate": 0.00025269950451763335, | |
| "loss": 3.1833, | |
| "step": 99400 | |
| }, | |
| { | |
| "epoch": 28.969179678396642, | |
| "grad_norm": 0.40919673442840576, | |
| "learning_rate": 0.000252524628388225, | |
| "loss": 3.1864, | |
| "step": 99450 | |
| }, | |
| { | |
| "epoch": 28.983745047774413, | |
| "grad_norm": 0.45877301692962646, | |
| "learning_rate": 0.0002523497522588167, | |
| "loss": 3.177, | |
| "step": 99500 | |
| }, | |
| { | |
| "epoch": 28.99831041715218, | |
| "grad_norm": 0.4333646297454834, | |
| "learning_rate": 0.0002521748761294083, | |
| "loss": 3.1833, | |
| "step": 99550 | |
| }, | |
| { | |
| "epoch": 29.012817525052434, | |
| "grad_norm": 0.4438258409500122, | |
| "learning_rate": 0.00025199999999999995, | |
| "loss": 3.0928, | |
| "step": 99600 | |
| }, | |
| { | |
| "epoch": 29.027382894430204, | |
| "grad_norm": 0.44858071208000183, | |
| "learning_rate": 0.00025182512387059164, | |
| "loss": 3.092, | |
| "step": 99650 | |
| }, | |
| { | |
| "epoch": 29.04194826380797, | |
| "grad_norm": 0.4630739390850067, | |
| "learning_rate": 0.0002516502477411833, | |
| "loss": 3.0938, | |
| "step": 99700 | |
| }, | |
| { | |
| "epoch": 29.056513633185737, | |
| "grad_norm": 0.45132312178611755, | |
| "learning_rate": 0.00025147537161177496, | |
| "loss": 3.0957, | |
| "step": 99750 | |
| }, | |
| { | |
| "epoch": 29.071079002563504, | |
| "grad_norm": 0.44013890624046326, | |
| "learning_rate": 0.00025130049548236665, | |
| "loss": 3.092, | |
| "step": 99800 | |
| }, | |
| { | |
| "epoch": 29.08564437194127, | |
| "grad_norm": 0.4376552700996399, | |
| "learning_rate": 0.0002511256193529583, | |
| "loss": 3.0954, | |
| "step": 99850 | |
| }, | |
| { | |
| "epoch": 29.10020974131904, | |
| "grad_norm": 0.4498368799686432, | |
| "learning_rate": 0.00025095074322355, | |
| "loss": 3.1093, | |
| "step": 99900 | |
| }, | |
| { | |
| "epoch": 29.114775110696808, | |
| "grad_norm": 0.4516642093658447, | |
| "learning_rate": 0.0002507758670941416, | |
| "loss": 3.1087, | |
| "step": 99950 | |
| }, | |
| { | |
| "epoch": 29.129340480074575, | |
| "grad_norm": 0.46183279156684875, | |
| "learning_rate": 0.0002506009909647333, | |
| "loss": 3.1128, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 29.129340480074575, | |
| "eval_accuracy": 0.3746281124365785, | |
| "eval_loss": 3.5496273040771484, | |
| "eval_runtime": 180.3174, | |
| "eval_samples_per_second": 92.309, | |
| "eval_steps_per_second": 5.773, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 29.129340480074575, | |
| "step": 100000, | |
| "total_flos": 2.090213187452928e+18, | |
| "train_loss": 0.6336719000244141, | |
| "train_runtime": 39937.4764, | |
| "train_samples_per_second": 343.814, | |
| "train_steps_per_second": 4.298 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 171650, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 50, | |
| "save_steps": 10000, | |
| "stateful_callbacks": { | |
| "EarlyStoppingCallback": { | |
| "args": { | |
| "early_stopping_patience": 20, | |
| "early_stopping_threshold": 0.0 | |
| }, | |
| "attributes": { | |
| "early_stopping_patience_counter": 20 | |
| } | |
| }, | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.090213187452928e+18, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |