Invalid JSON: Unexpected token 'I', ..."ad_norm": Infinity,
"... is not valid JSON
| { | |
| "best_global_step": 65000, | |
| "best_metric": 3.518871307373047, | |
| "best_model_checkpoint": "/scratch/cl5625/exceptions/models/cost_to_hit_frequency_1001/checkpoint-30000", | |
| "epoch": 20.0, | |
| "eval_steps": 1000, | |
| "global_step": 68660, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.01456536937776742, | |
| "grad_norm": 1.0411887168884277, | |
| "learning_rate": 0.000294, | |
| "loss": 8.4313, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.02913073875553484, | |
| "grad_norm": 0.9934611916542053, | |
| "learning_rate": 0.0005939999999999999, | |
| "loss": 6.7347, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.04369610813330226, | |
| "grad_norm": 0.6710580587387085, | |
| "learning_rate": 0.0005995711785297549, | |
| "loss": 6.3491, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.05826147751106968, | |
| "grad_norm": 0.46758192777633667, | |
| "learning_rate": 0.0005991336056009335, | |
| "loss": 6.1272, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.0728268468888371, | |
| "grad_norm": 0.5224136710166931, | |
| "learning_rate": 0.000598696032672112, | |
| "loss": 5.9887, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.08739221626660452, | |
| "grad_norm": 0.5374292135238647, | |
| "learning_rate": 0.0005982584597432905, | |
| "loss": 5.8537, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.10195758564437195, | |
| "grad_norm": 0.42533859610557556, | |
| "learning_rate": 0.0005978208868144691, | |
| "loss": 5.7305, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.11652295502213936, | |
| "grad_norm": 0.5247730016708374, | |
| "learning_rate": 0.0005973833138856476, | |
| "loss": 5.6139, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.13108832439990678, | |
| "grad_norm": 0.5447224974632263, | |
| "learning_rate": 0.000596945740956826, | |
| "loss": 5.4885, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.1456536937776742, | |
| "grad_norm": 0.5301450490951538, | |
| "learning_rate": 0.0005965081680280046, | |
| "loss": 5.4119, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.16021906315544163, | |
| "grad_norm": 0.5106812119483948, | |
| "learning_rate": 0.0005960705950991831, | |
| "loss": 5.3254, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.17478443253320905, | |
| "grad_norm": 0.42297908663749695, | |
| "learning_rate": 0.0005956330221703616, | |
| "loss": 5.2446, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.18934980191097647, | |
| "grad_norm": 0.47052502632141113, | |
| "learning_rate": 0.0005951954492415402, | |
| "loss": 5.1939, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.2039151712887439, | |
| "grad_norm": 0.4460456371307373, | |
| "learning_rate": 0.0005947578763127188, | |
| "loss": 5.1229, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.2184805406665113, | |
| "grad_norm": 0.46692177653312683, | |
| "learning_rate": 0.0005943203033838973, | |
| "loss": 5.0837, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.23304591004427871, | |
| "grad_norm": 0.4475383758544922, | |
| "learning_rate": 0.0005938827304550758, | |
| "loss": 5.0189, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.24761127942204614, | |
| "grad_norm": 0.4715788960456848, | |
| "learning_rate": 0.0005934451575262544, | |
| "loss": 4.9712, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.26217664879981356, | |
| "grad_norm": 0.5530597567558289, | |
| "learning_rate": 0.0005930075845974328, | |
| "loss": 4.9247, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.276742018177581, | |
| "grad_norm": 0.5195161700248718, | |
| "learning_rate": 0.0005925700116686113, | |
| "loss": 4.8776, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.2913073875553484, | |
| "grad_norm": 0.46087169647216797, | |
| "learning_rate": 0.0005921324387397899, | |
| "loss": 4.8133, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.2913073875553484, | |
| "eval_accuracy": 0.2545722064591014, | |
| "eval_loss": 4.754235744476318, | |
| "eval_runtime": 180.1197, | |
| "eval_samples_per_second": 92.405, | |
| "eval_steps_per_second": 5.779, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.30587275693311583, | |
| "grad_norm": 0.44081172347068787, | |
| "learning_rate": 0.0005916948658109684, | |
| "loss": 4.7792, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.32043812631088325, | |
| "grad_norm": 0.48162946105003357, | |
| "learning_rate": 0.000591257292882147, | |
| "loss": 4.7359, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.3350034956886507, | |
| "grad_norm": 0.4235544204711914, | |
| "learning_rate": 0.0005908197199533255, | |
| "loss": 4.6992, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.3495688650664181, | |
| "grad_norm": 0.4740869402885437, | |
| "learning_rate": 0.0005903821470245041, | |
| "loss": 4.6654, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.3641342344441855, | |
| "grad_norm": 0.4276205897331238, | |
| "learning_rate": 0.0005899445740956826, | |
| "loss": 4.6244, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.37869960382195295, | |
| "grad_norm": 0.40895992517471313, | |
| "learning_rate": 0.0005895070011668611, | |
| "loss": 4.6068, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.39326497319972037, | |
| "grad_norm": 0.4188133478164673, | |
| "learning_rate": 0.0005890694282380397, | |
| "loss": 4.5584, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.4078303425774878, | |
| "grad_norm": 0.4317689538002014, | |
| "learning_rate": 0.0005886318553092181, | |
| "loss": 4.5422, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.42239571195525516, | |
| "grad_norm": 0.40392470359802246, | |
| "learning_rate": 0.0005881942823803966, | |
| "loss": 4.5333, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.4369610813330226, | |
| "grad_norm": 0.4244018793106079, | |
| "learning_rate": 0.0005877567094515752, | |
| "loss": 4.4986, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.45152645071079, | |
| "grad_norm": 0.44470831751823425, | |
| "learning_rate": 0.0005873191365227537, | |
| "loss": 4.4928, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.46609182008855743, | |
| "grad_norm": 0.4386588931083679, | |
| "learning_rate": 0.0005868815635939323, | |
| "loss": 4.4553, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.48065718946632485, | |
| "grad_norm": 0.42980971932411194, | |
| "learning_rate": 0.0005864439906651108, | |
| "loss": 4.4385, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.4952225588440923, | |
| "grad_norm": 0.3935016691684723, | |
| "learning_rate": 0.0005860064177362894, | |
| "loss": 4.4327, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.5097879282218597, | |
| "grad_norm": 0.4373241662979126, | |
| "learning_rate": 0.0005855688448074679, | |
| "loss": 4.4099, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.5243532975996271, | |
| "grad_norm": 0.4172551929950714, | |
| "learning_rate": 0.0005851312718786464, | |
| "loss": 4.3901, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.5389186669773945, | |
| "grad_norm": 0.40378788113594055, | |
| "learning_rate": 0.0005846936989498249, | |
| "loss": 4.383, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.553484036355162, | |
| "grad_norm": 0.38236093521118164, | |
| "learning_rate": 0.0005842561260210034, | |
| "loss": 4.3598, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.5680494057329294, | |
| "grad_norm": 0.381078839302063, | |
| "learning_rate": 0.000583818553092182, | |
| "loss": 4.3638, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.5826147751106968, | |
| "grad_norm": 0.4327857196331024, | |
| "learning_rate": 0.0005833809801633605, | |
| "loss": 4.3432, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.5826147751106968, | |
| "eval_accuracy": 0.29888864119390235, | |
| "eval_loss": 4.287432670593262, | |
| "eval_runtime": 180.4505, | |
| "eval_samples_per_second": 92.236, | |
| "eval_steps_per_second": 5.769, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.5971801444884642, | |
| "grad_norm": 0.4143087863922119, | |
| "learning_rate": 0.000582943407234539, | |
| "loss": 4.329, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.6117455138662317, | |
| "grad_norm": 0.3753448724746704, | |
| "learning_rate": 0.0005825058343057176, | |
| "loss": 4.3033, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.6263108832439991, | |
| "grad_norm": 0.40621188282966614, | |
| "learning_rate": 0.0005820682613768961, | |
| "loss": 4.3041, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.6408762526217665, | |
| "grad_norm": 0.40833911299705505, | |
| "learning_rate": 0.0005816306884480747, | |
| "loss": 4.2834, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.6554416219995339, | |
| "grad_norm": 0.4088577628135681, | |
| "learning_rate": 0.0005811931155192532, | |
| "loss": 4.2804, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.6700069913773014, | |
| "grad_norm": 0.3746855556964874, | |
| "learning_rate": 0.0005807555425904316, | |
| "loss": 4.2762, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.6845723607550688, | |
| "grad_norm": 0.3618931770324707, | |
| "learning_rate": 0.0005803179696616102, | |
| "loss": 4.2598, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.6991377301328362, | |
| "grad_norm": 0.3690814971923828, | |
| "learning_rate": 0.0005798803967327887, | |
| "loss": 4.2413, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.7137030995106036, | |
| "grad_norm": 0.40264639258384705, | |
| "learning_rate": 0.0005794428238039673, | |
| "loss": 4.2375, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.728268468888371, | |
| "grad_norm": 0.4249323606491089, | |
| "learning_rate": 0.0005790052508751458, | |
| "loss": 4.233, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.7428338382661385, | |
| "grad_norm": 0.39969372749328613, | |
| "learning_rate": 0.0005785676779463243, | |
| "loss": 4.2202, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.7573992076439059, | |
| "grad_norm": 0.3819160759449005, | |
| "learning_rate": 0.0005781301050175029, | |
| "loss": 4.2199, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.7719645770216733, | |
| "grad_norm": 0.361541211605072, | |
| "learning_rate": 0.0005776925320886814, | |
| "loss": 4.204, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.7865299463994407, | |
| "grad_norm": 0.3613761365413666, | |
| "learning_rate": 0.00057725495915986, | |
| "loss": 4.1961, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.8010953157772082, | |
| "grad_norm": 0.4024335741996765, | |
| "learning_rate": 0.0005768173862310384, | |
| "loss": 4.1899, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.8156606851549756, | |
| "grad_norm": 0.34226447343826294, | |
| "learning_rate": 0.0005763798133022169, | |
| "loss": 4.1648, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.8302260545327429, | |
| "grad_norm": 0.3609713315963745, | |
| "learning_rate": 0.0005759422403733955, | |
| "loss": 4.1756, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.8447914239105103, | |
| "grad_norm": 0.37800899147987366, | |
| "learning_rate": 0.000575504667444574, | |
| "loss": 4.1618, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.8593567932882777, | |
| "grad_norm": 0.35399892926216125, | |
| "learning_rate": 0.0005750670945157526, | |
| "loss": 4.151, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.8739221626660452, | |
| "grad_norm": 0.35685768723487854, | |
| "learning_rate": 0.0005746295215869311, | |
| "loss": 4.1316, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.8739221626660452, | |
| "eval_accuracy": 0.3165242106956263, | |
| "eval_loss": 4.094448566436768, | |
| "eval_runtime": 180.2904, | |
| "eval_samples_per_second": 92.318, | |
| "eval_steps_per_second": 5.774, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.8884875320438126, | |
| "grad_norm": 0.3725755214691162, | |
| "learning_rate": 0.0005741919486581096, | |
| "loss": 4.1422, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.90305290142158, | |
| "grad_norm": 0.3675600290298462, | |
| "learning_rate": 0.0005737543757292882, | |
| "loss": 4.1361, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.9176182707993474, | |
| "grad_norm": 0.3458426892757416, | |
| "learning_rate": 0.0005733168028004667, | |
| "loss": 4.1292, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.9321836401771149, | |
| "grad_norm": 0.35508471727371216, | |
| "learning_rate": 0.0005728792298716453, | |
| "loss": 4.1173, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.9467490095548823, | |
| "grad_norm": 0.3473420739173889, | |
| "learning_rate": 0.0005724416569428237, | |
| "loss": 4.1051, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.9613143789326497, | |
| "grad_norm": 0.38041558861732483, | |
| "learning_rate": 0.0005720040840140023, | |
| "loss": 4.1141, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.9758797483104171, | |
| "grad_norm": 0.36010000109672546, | |
| "learning_rate": 0.0005715665110851808, | |
| "loss": 4.1036, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.9904451176881846, | |
| "grad_norm": 0.35879573225975037, | |
| "learning_rate": 0.0005711289381563593, | |
| "loss": 4.099, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.0049522255884409, | |
| "grad_norm": 0.36708125472068787, | |
| "learning_rate": 0.0005706913652275379, | |
| "loss": 4.0613, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 1.0195175949662083, | |
| "grad_norm": 0.36130914092063904, | |
| "learning_rate": 0.0005702537922987164, | |
| "loss": 4.023, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.0340829643439757, | |
| "grad_norm": 0.3636951446533203, | |
| "learning_rate": 0.0005698162193698949, | |
| "loss": 4.0197, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 1.0486483337217432, | |
| "grad_norm": 0.3318287432193756, | |
| "learning_rate": 0.0005693786464410735, | |
| "loss": 4.009, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.0632137030995106, | |
| "grad_norm": 0.3470671474933624, | |
| "learning_rate": 0.000568941073512252, | |
| "loss": 3.9984, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 1.077779072477278, | |
| "grad_norm": 0.3891682028770447, | |
| "learning_rate": 0.0005685035005834305, | |
| "loss": 4.0067, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.0923444418550454, | |
| "grad_norm": 0.3590176999568939, | |
| "learning_rate": 0.000568065927654609, | |
| "loss": 4.011, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 1.1069098112328128, | |
| "grad_norm": 0.3605038821697235, | |
| "learning_rate": 0.0005676283547257876, | |
| "loss": 4.0014, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.1214751806105803, | |
| "grad_norm": 0.35803160071372986, | |
| "learning_rate": 0.0005671907817969661, | |
| "loss": 3.9972, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 1.1360405499883477, | |
| "grad_norm": 0.3601053059101105, | |
| "learning_rate": 0.0005667532088681446, | |
| "loss": 3.9894, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.1506059193661151, | |
| "grad_norm": 0.3761005699634552, | |
| "learning_rate": 0.0005663156359393232, | |
| "loss": 3.9892, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 1.1651712887438825, | |
| "grad_norm": 0.34091663360595703, | |
| "learning_rate": 0.0005658780630105017, | |
| "loss": 3.9856, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.1651712887438825, | |
| "eval_accuracy": 0.3262635618883952, | |
| "eval_loss": 3.9839839935302734, | |
| "eval_runtime": 180.3348, | |
| "eval_samples_per_second": 92.295, | |
| "eval_steps_per_second": 5.773, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.17973665812165, | |
| "grad_norm": 0.3509597182273865, | |
| "learning_rate": 0.0005654404900816802, | |
| "loss": 3.9952, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 1.1943020274994174, | |
| "grad_norm": 0.35156598687171936, | |
| "learning_rate": 0.0005650029171528588, | |
| "loss": 3.9769, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.2088673968771848, | |
| "grad_norm": 0.34221357107162476, | |
| "learning_rate": 0.0005645653442240373, | |
| "loss": 3.9839, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 1.2234327662549522, | |
| "grad_norm": 0.3706187307834625, | |
| "learning_rate": 0.0005641277712952158, | |
| "loss": 3.9845, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.2379981356327197, | |
| "grad_norm": 0.3384045660495758, | |
| "learning_rate": 0.0005636901983663943, | |
| "loss": 3.9635, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 1.252563505010487, | |
| "grad_norm": 0.36682382225990295, | |
| "learning_rate": 0.0005632526254375729, | |
| "loss": 3.9741, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.2671288743882545, | |
| "grad_norm": 0.3488970398902893, | |
| "learning_rate": 0.0005628150525087514, | |
| "loss": 3.9682, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 1.281694243766022, | |
| "grad_norm": 0.3281860053539276, | |
| "learning_rate": 0.0005623774795799299, | |
| "loss": 3.9527, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.2962596131437893, | |
| "grad_norm": 0.3465306758880615, | |
| "learning_rate": 0.0005619399066511085, | |
| "loss": 3.9529, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 1.3108249825215568, | |
| "grad_norm": 0.3299737870693207, | |
| "learning_rate": 0.000561502333722287, | |
| "loss": 3.9559, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.3253903518993242, | |
| "grad_norm": 0.3362690508365631, | |
| "learning_rate": 0.0005610647607934655, | |
| "loss": 3.9554, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 1.3399557212770916, | |
| "grad_norm": 0.34816160798072815, | |
| "learning_rate": 0.000560627187864644, | |
| "loss": 3.951, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.354521090654859, | |
| "grad_norm": 0.3506231904029846, | |
| "learning_rate": 0.0005601896149358226, | |
| "loss": 3.9438, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 1.3690864600326265, | |
| "grad_norm": 0.34532177448272705, | |
| "learning_rate": 0.0005597520420070011, | |
| "loss": 3.9489, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 1.3836518294103939, | |
| "grad_norm": 0.3467022180557251, | |
| "learning_rate": 0.0005593144690781796, | |
| "loss": 3.9438, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 1.3982171987881613, | |
| "grad_norm": 0.3695443272590637, | |
| "learning_rate": 0.0005588768961493582, | |
| "loss": 3.9331, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.4127825681659287, | |
| "grad_norm": 0.33018758893013, | |
| "learning_rate": 0.0005584393232205367, | |
| "loss": 3.9295, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 1.4273479375436962, | |
| "grad_norm": 0.3564456105232239, | |
| "learning_rate": 0.0005580017502917152, | |
| "loss": 3.9395, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.4419133069214636, | |
| "grad_norm": 0.3350387215614319, | |
| "learning_rate": 0.0005575641773628938, | |
| "loss": 3.9266, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 1.456478676299231, | |
| "grad_norm": 0.33804184198379517, | |
| "learning_rate": 0.0005571266044340723, | |
| "loss": 3.9435, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.456478676299231, | |
| "eval_accuracy": 0.33277813599489436, | |
| "eval_loss": 3.9095029830932617, | |
| "eval_runtime": 180.2432, | |
| "eval_samples_per_second": 92.342, | |
| "eval_steps_per_second": 5.776, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.4710440456769984, | |
| "grad_norm": 0.33603259921073914, | |
| "learning_rate": 0.0005566890315052507, | |
| "loss": 3.9152, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 1.4856094150547658, | |
| "grad_norm": 0.3263108730316162, | |
| "learning_rate": 0.0005562514585764293, | |
| "loss": 3.9295, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 1.500174784432533, | |
| "grad_norm": 0.3421551287174225, | |
| "learning_rate": 0.0005558138856476079, | |
| "loss": 3.9092, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 1.5147401538103007, | |
| "grad_norm": 0.3283444344997406, | |
| "learning_rate": 0.0005553763127187864, | |
| "loss": 3.9035, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 1.529305523188068, | |
| "grad_norm": 0.34648576378822327, | |
| "learning_rate": 0.0005549387397899649, | |
| "loss": 3.9155, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 1.5438708925658355, | |
| "grad_norm": 0.3276433050632477, | |
| "learning_rate": 0.0005545011668611435, | |
| "loss": 3.9183, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 1.5584362619436027, | |
| "grad_norm": 0.333248108625412, | |
| "learning_rate": 0.000554063593932322, | |
| "loss": 3.9177, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 1.5730016313213704, | |
| "grad_norm": 0.337734580039978, | |
| "learning_rate": 0.0005536260210035005, | |
| "loss": 3.9095, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 1.5875670006991376, | |
| "grad_norm": 0.3482300043106079, | |
| "learning_rate": 0.0005531884480746791, | |
| "loss": 3.8858, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 1.6021323700769052, | |
| "grad_norm": 0.3184448480606079, | |
| "learning_rate": 0.0005527508751458577, | |
| "loss": 3.8971, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.6166977394546724, | |
| "grad_norm": 0.3283264935016632, | |
| "learning_rate": 0.0005523133022170361, | |
| "loss": 3.877, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 1.63126310883244, | |
| "grad_norm": 0.34048277139663696, | |
| "learning_rate": 0.0005518757292882146, | |
| "loss": 3.8963, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 1.6458284782102073, | |
| "grad_norm": 0.32839393615722656, | |
| "learning_rate": 0.0005514381563593932, | |
| "loss": 3.8916, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 1.660393847587975, | |
| "grad_norm": 0.33673906326293945, | |
| "learning_rate": 0.0005510005834305717, | |
| "loss": 3.8848, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 1.6749592169657421, | |
| "grad_norm": 0.34184661507606506, | |
| "learning_rate": 0.0005505630105017502, | |
| "loss": 3.8913, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 1.6895245863435098, | |
| "grad_norm": 0.3328911364078522, | |
| "learning_rate": 0.0005501254375729288, | |
| "loss": 3.8844, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 1.704089955721277, | |
| "grad_norm": 0.34918013215065, | |
| "learning_rate": 0.0005496878646441073, | |
| "loss": 3.8826, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 1.7186553250990446, | |
| "grad_norm": 0.3278209865093231, | |
| "learning_rate": 0.0005492502917152858, | |
| "loss": 3.8737, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 1.7332206944768118, | |
| "grad_norm": 0.32764044404029846, | |
| "learning_rate": 0.0005488127187864644, | |
| "loss": 3.8653, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 1.7477860638545795, | |
| "grad_norm": 0.320593923330307, | |
| "learning_rate": 0.000548375145857643, | |
| "loss": 3.8665, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.7477860638545795, | |
| "eval_accuracy": 0.33797937798145206, | |
| "eval_loss": 3.8514480590820312, | |
| "eval_runtime": 180.396, | |
| "eval_samples_per_second": 92.264, | |
| "eval_steps_per_second": 5.771, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.7623514332323467, | |
| "grad_norm": 0.32193249464035034, | |
| "learning_rate": 0.0005479375729288214, | |
| "loss": 3.863, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 1.7769168026101143, | |
| "grad_norm": 0.35405752062797546, | |
| "learning_rate": 0.0005474999999999999, | |
| "loss": 3.8648, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 1.7914821719878815, | |
| "grad_norm": 0.3290136754512787, | |
| "learning_rate": 0.0005470624270711785, | |
| "loss": 3.8579, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 1.8060475413656492, | |
| "grad_norm": 0.3399069607257843, | |
| "learning_rate": 0.000546624854142357, | |
| "loss": 3.8711, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 1.8206129107434164, | |
| "grad_norm": 0.333492249250412, | |
| "learning_rate": 0.0005461872812135355, | |
| "loss": 3.8686, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 1.835178280121184, | |
| "grad_norm": 0.3360602557659149, | |
| "learning_rate": 0.0005457497082847141, | |
| "loss": 3.8639, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 1.8497436494989512, | |
| "grad_norm": 0.349657267332077, | |
| "learning_rate": 0.0005453121353558927, | |
| "loss": 3.8582, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 1.8643090188767188, | |
| "grad_norm": 0.31816044449806213, | |
| "learning_rate": 0.0005448745624270712, | |
| "loss": 3.8605, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 1.878874388254486, | |
| "grad_norm": 0.3400065004825592, | |
| "learning_rate": 0.0005444369894982496, | |
| "loss": 3.8617, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 1.8934397576322537, | |
| "grad_norm": 0.3279556632041931, | |
| "learning_rate": 0.0005439994165694282, | |
| "loss": 3.8528, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.908005127010021, | |
| "grad_norm": 0.33743831515312195, | |
| "learning_rate": 0.0005435618436406067, | |
| "loss": 3.8504, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 1.9225704963877885, | |
| "grad_norm": 0.3401290476322174, | |
| "learning_rate": 0.0005431242707117852, | |
| "loss": 3.8496, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 1.9371358657655557, | |
| "grad_norm": 0.3282126486301422, | |
| "learning_rate": 0.0005426866977829638, | |
| "loss": 3.8469, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 1.9517012351433234, | |
| "grad_norm": 0.3605695068836212, | |
| "learning_rate": 0.0005422491248541423, | |
| "loss": 3.858, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 1.9662666045210906, | |
| "grad_norm": 0.32521483302116394, | |
| "learning_rate": 0.0005418115519253208, | |
| "loss": 3.852, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 1.9808319738988582, | |
| "grad_norm": 0.33284640312194824, | |
| "learning_rate": 0.0005413739789964994, | |
| "loss": 3.8486, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 1.9953973432766254, | |
| "grad_norm": 0.3308689296245575, | |
| "learning_rate": 0.000540936406067678, | |
| "loss": 3.8482, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 2.0099044511768818, | |
| "grad_norm": 0.33800598978996277, | |
| "learning_rate": 0.0005404988331388564, | |
| "loss": 3.77, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 2.0244698205546494, | |
| "grad_norm": 0.3277951180934906, | |
| "learning_rate": 0.0005400612602100349, | |
| "loss": 3.7368, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 2.0390351899324166, | |
| "grad_norm": 0.3203679919242859, | |
| "learning_rate": 0.0005396236872812135, | |
| "loss": 3.7491, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.0390351899324166, | |
| "eval_accuracy": 0.34216722609676753, | |
| "eval_loss": 3.8109800815582275, | |
| "eval_runtime": 180.255, | |
| "eval_samples_per_second": 92.336, | |
| "eval_steps_per_second": 5.775, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.0536005593101843, | |
| "grad_norm": 0.35276371240615845, | |
| "learning_rate": 0.000539186114352392, | |
| "loss": 3.7337, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 2.0681659286879515, | |
| "grad_norm": 0.35237714648246765, | |
| "learning_rate": 0.0005387485414235705, | |
| "loss": 3.7357, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 2.082731298065719, | |
| "grad_norm": 0.3209347426891327, | |
| "learning_rate": 0.0005383109684947491, | |
| "loss": 3.7583, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 2.0972966674434863, | |
| "grad_norm": 0.32085931301116943, | |
| "learning_rate": 0.0005378733955659276, | |
| "loss": 3.7539, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 2.111862036821254, | |
| "grad_norm": 0.31919988989830017, | |
| "learning_rate": 0.0005374358226371061, | |
| "loss": 3.7393, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 2.126427406199021, | |
| "grad_norm": 0.3325698673725128, | |
| "learning_rate": 0.0005369982497082847, | |
| "loss": 3.751, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 2.140992775576789, | |
| "grad_norm": 0.32088345289230347, | |
| "learning_rate": 0.0005365606767794633, | |
| "loss": 3.7441, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 2.155558144954556, | |
| "grad_norm": 0.31885406374931335, | |
| "learning_rate": 0.0005361231038506417, | |
| "loss": 3.7468, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 2.1701235143323236, | |
| "grad_norm": 0.32321396470069885, | |
| "learning_rate": 0.0005356855309218202, | |
| "loss": 3.7511, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 2.184688883710091, | |
| "grad_norm": 0.339028924703598, | |
| "learning_rate": 0.0005352479579929988, | |
| "loss": 3.7568, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 2.1992542530878585, | |
| "grad_norm": 0.3378174901008606, | |
| "learning_rate": 0.0005348103850641773, | |
| "loss": 3.7351, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 2.2138196224656257, | |
| "grad_norm": 0.32842838764190674, | |
| "learning_rate": 0.0005343728121353558, | |
| "loss": 3.7614, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 2.2283849918433933, | |
| "grad_norm": 0.3337772488594055, | |
| "learning_rate": 0.0005339352392065344, | |
| "loss": 3.7502, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 2.2429503612211605, | |
| "grad_norm": 0.31574419140815735, | |
| "learning_rate": 0.000533497666277713, | |
| "loss": 3.7559, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 2.257515730598928, | |
| "grad_norm": 0.3204760253429413, | |
| "learning_rate": 0.0005330600933488915, | |
| "loss": 3.7382, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 2.2720810999766954, | |
| "grad_norm": 0.33120566606521606, | |
| "learning_rate": 0.00053262252042007, | |
| "loss": 3.7488, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 2.286646469354463, | |
| "grad_norm": 0.3328082263469696, | |
| "learning_rate": 0.0005321849474912485, | |
| "loss": 3.7518, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 2.3012118387322302, | |
| "grad_norm": 0.3446897268295288, | |
| "learning_rate": 0.000531747374562427, | |
| "loss": 3.7421, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 2.3157772081099974, | |
| "grad_norm": 0.3277474641799927, | |
| "learning_rate": 0.0005313098016336055, | |
| "loss": 3.7376, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 2.330342577487765, | |
| "grad_norm": 0.32570740580558777, | |
| "learning_rate": 0.0005308722287047841, | |
| "loss": 3.7416, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.330342577487765, | |
| "eval_accuracy": 0.3451675491976329, | |
| "eval_loss": 3.7787580490112305, | |
| "eval_runtime": 180.355, | |
| "eval_samples_per_second": 92.285, | |
| "eval_steps_per_second": 5.772, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.3449079468655327, | |
| "grad_norm": 0.3236296474933624, | |
| "learning_rate": 0.0005304346557759626, | |
| "loss": 3.737, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 2.3594733162433, | |
| "grad_norm": 0.3300062417984009, | |
| "learning_rate": 0.0005299970828471411, | |
| "loss": 3.74, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 2.374038685621067, | |
| "grad_norm": 0.3218088746070862, | |
| "learning_rate": 0.0005295595099183197, | |
| "loss": 3.7618, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 2.3886040549988348, | |
| "grad_norm": 0.32456105947494507, | |
| "learning_rate": 0.0005291219369894983, | |
| "loss": 3.7461, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 2.4031694243766024, | |
| "grad_norm": 0.3256712257862091, | |
| "learning_rate": 0.0005286843640606768, | |
| "loss": 3.7343, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 2.4177347937543696, | |
| "grad_norm": 0.3265218734741211, | |
| "learning_rate": 0.0005282467911318552, | |
| "loss": 3.7396, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 2.432300163132137, | |
| "grad_norm": 0.3039201498031616, | |
| "learning_rate": 0.0005278092182030338, | |
| "loss": 3.7405, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 2.4468655325099045, | |
| "grad_norm": 0.3367139995098114, | |
| "learning_rate": 0.0005273716452742123, | |
| "loss": 3.7426, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 2.461430901887672, | |
| "grad_norm": 0.314224511384964, | |
| "learning_rate": 0.0005269340723453908, | |
| "loss": 3.7412, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 2.4759962712654393, | |
| "grad_norm": 0.3330950140953064, | |
| "learning_rate": 0.0005264964994165694, | |
| "loss": 3.7463, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 2.4905616406432065, | |
| "grad_norm": 0.3340461552143097, | |
| "learning_rate": 0.000526058926487748, | |
| "loss": 3.7469, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 2.505127010020974, | |
| "grad_norm": 0.3362635672092438, | |
| "learning_rate": 0.0005256213535589265, | |
| "loss": 3.7338, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 2.519692379398742, | |
| "grad_norm": 0.3297460377216339, | |
| "learning_rate": 0.000525183780630105, | |
| "loss": 3.7498, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 2.534257748776509, | |
| "grad_norm": 0.3183857500553131, | |
| "learning_rate": 0.0005247462077012836, | |
| "loss": 3.7391, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 2.548823118154276, | |
| "grad_norm": 0.33508941531181335, | |
| "learning_rate": 0.000524308634772462, | |
| "loss": 3.7348, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 2.563388487532044, | |
| "grad_norm": 0.3083733022212982, | |
| "learning_rate": 0.0005238710618436405, | |
| "loss": 3.7285, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 2.5779538569098115, | |
| "grad_norm": 0.31876590847969055, | |
| "learning_rate": 0.0005234334889148191, | |
| "loss": 3.7443, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 2.5925192262875787, | |
| "grad_norm": 0.3193049430847168, | |
| "learning_rate": 0.0005229959159859976, | |
| "loss": 3.7422, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 2.607084595665346, | |
| "grad_norm": 0.32590124011039734, | |
| "learning_rate": 0.0005225583430571761, | |
| "loss": 3.7424, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 2.6216499650431135, | |
| "grad_norm": 0.3363872170448303, | |
| "learning_rate": 0.0005221207701283547, | |
| "loss": 3.728, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 2.6216499650431135, | |
| "eval_accuracy": 0.3480742812181514, | |
| "eval_loss": 3.7482504844665527, | |
| "eval_runtime": 180.2179, | |
| "eval_samples_per_second": 92.355, | |
| "eval_steps_per_second": 5.776, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 2.636215334420881, | |
| "grad_norm": 0.3201189935207367, | |
| "learning_rate": 0.0005216831971995333, | |
| "loss": 3.7477, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 2.6507807037986484, | |
| "grad_norm": 0.33287513256073, | |
| "learning_rate": 0.0005212456242707118, | |
| "loss": 3.7203, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 2.6653460731764156, | |
| "grad_norm": 0.3236483335494995, | |
| "learning_rate": 0.0005208080513418903, | |
| "loss": 3.7265, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 2.6799114425541832, | |
| "grad_norm": 0.3180456757545471, | |
| "learning_rate": 0.0005203704784130689, | |
| "loss": 3.7303, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 2.6944768119319504, | |
| "grad_norm": 0.3273324966430664, | |
| "learning_rate": 0.0005199329054842473, | |
| "loss": 3.7266, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 2.709042181309718, | |
| "grad_norm": 0.3243292272090912, | |
| "learning_rate": 0.0005194953325554258, | |
| "loss": 3.7301, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 2.7236075506874853, | |
| "grad_norm": 0.32646605372428894, | |
| "learning_rate": 0.0005190577596266044, | |
| "loss": 3.7284, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 2.738172920065253, | |
| "grad_norm": 0.3168424665927887, | |
| "learning_rate": 0.0005186201866977829, | |
| "loss": 3.7384, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 2.75273828944302, | |
| "grad_norm": 0.3341065049171448, | |
| "learning_rate": 0.0005181826137689614, | |
| "loss": 3.7279, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 2.7673036588207878, | |
| "grad_norm": 0.3197799623012543, | |
| "learning_rate": 0.00051774504084014, | |
| "loss": 3.7302, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 2.781869028198555, | |
| "grad_norm": 0.31474462151527405, | |
| "learning_rate": 0.0005173074679113186, | |
| "loss": 3.735, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 2.7964343975763226, | |
| "grad_norm": 0.3133241832256317, | |
| "learning_rate": 0.0005168698949824971, | |
| "loss": 3.7139, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 2.81099976695409, | |
| "grad_norm": 0.31363457441329956, | |
| "learning_rate": 0.0005164323220536755, | |
| "loss": 3.7076, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 2.8255651363318575, | |
| "grad_norm": 0.32894420623779297, | |
| "learning_rate": 0.0005159947491248541, | |
| "loss": 3.717, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 2.8401305057096247, | |
| "grad_norm": 0.33178263902664185, | |
| "learning_rate": 0.0005155571761960326, | |
| "loss": 3.7305, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 2.8546958750873923, | |
| "grad_norm": 0.31269919872283936, | |
| "learning_rate": 0.0005151196032672111, | |
| "loss": 3.7172, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 2.8692612444651595, | |
| "grad_norm": 0.32776308059692383, | |
| "learning_rate": 0.0005146820303383897, | |
| "loss": 3.7171, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 2.883826613842927, | |
| "grad_norm": 0.3176999092102051, | |
| "learning_rate": 0.0005142444574095682, | |
| "loss": 3.6958, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 2.8983919832206944, | |
| "grad_norm": 0.3453384339809418, | |
| "learning_rate": 0.0005138068844807468, | |
| "loss": 3.7037, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 2.912957352598462, | |
| "grad_norm": 0.31886717677116394, | |
| "learning_rate": 0.0005133693115519253, | |
| "loss": 3.6958, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 2.912957352598462, | |
| "eval_accuracy": 0.3504000665954622, | |
| "eval_loss": 3.722299575805664, | |
| "eval_runtime": 180.4701, | |
| "eval_samples_per_second": 92.226, | |
| "eval_steps_per_second": 5.768, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 2.927522721976229, | |
| "grad_norm": 0.3301357626914978, | |
| "learning_rate": 0.0005129317386231039, | |
| "loss": 3.7174, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 2.942088091353997, | |
| "grad_norm": 0.31266558170318604, | |
| "learning_rate": 0.0005124941656942824, | |
| "loss": 3.7087, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 2.956653460731764, | |
| "grad_norm": 0.2986539602279663, | |
| "learning_rate": 0.0005120565927654608, | |
| "loss": 3.7032, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 2.9712188301095317, | |
| "grad_norm": 0.3215900659561157, | |
| "learning_rate": 0.0005116190198366394, | |
| "loss": 3.7156, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 2.985784199487299, | |
| "grad_norm": 0.34506484866142273, | |
| "learning_rate": 0.0005111814469078179, | |
| "loss": 3.7182, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 3.0002913073875552, | |
| "grad_norm": 0.3209165036678314, | |
| "learning_rate": 0.0005107438739789964, | |
| "loss": 3.6979, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 3.014856676765323, | |
| "grad_norm": 0.3145550489425659, | |
| "learning_rate": 0.000510306301050175, | |
| "loss": 3.5923, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 3.02942204614309, | |
| "grad_norm": 0.33601146936416626, | |
| "learning_rate": 0.0005098687281213535, | |
| "loss": 3.6103, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 3.0439874155208577, | |
| "grad_norm": 0.31010255217552185, | |
| "learning_rate": 0.0005094311551925321, | |
| "loss": 3.6064, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 3.058552784898625, | |
| "grad_norm": 0.3235945701599121, | |
| "learning_rate": 0.0005089935822637106, | |
| "loss": 3.5974, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 3.0731181542763926, | |
| "grad_norm": 0.32663047313690186, | |
| "learning_rate": 0.0005085560093348892, | |
| "loss": 3.6164, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 3.0876835236541598, | |
| "grad_norm": 0.32186612486839294, | |
| "learning_rate": 0.0005081184364060676, | |
| "loss": 3.605, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 3.1022488930319274, | |
| "grad_norm": 0.3103710114955902, | |
| "learning_rate": 0.0005076808634772461, | |
| "loss": 3.622, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 3.1168142624096946, | |
| "grad_norm": 0.32506147027015686, | |
| "learning_rate": 0.0005072432905484247, | |
| "loss": 3.6183, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 3.1313796317874623, | |
| "grad_norm": 0.354626327753067, | |
| "learning_rate": 0.0005068057176196032, | |
| "loss": 3.6236, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 3.1459450011652295, | |
| "grad_norm": 0.31761565804481506, | |
| "learning_rate": 0.0005063681446907818, | |
| "loss": 3.6218, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 3.160510370542997, | |
| "grad_norm": 0.3158835172653198, | |
| "learning_rate": 0.0005059305717619603, | |
| "loss": 3.6275, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 3.1750757399207643, | |
| "grad_norm": 0.3345862925052643, | |
| "learning_rate": 0.0005054929988331388, | |
| "loss": 3.6209, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 3.189641109298532, | |
| "grad_norm": 0.33414244651794434, | |
| "learning_rate": 0.0005050554259043174, | |
| "loss": 3.6138, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 3.204206478676299, | |
| "grad_norm": 0.321621835231781, | |
| "learning_rate": 0.0005046178529754959, | |
| "loss": 3.6306, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 3.204206478676299, | |
| "eval_accuracy": 0.3523780599932934, | |
| "eval_loss": 3.7092323303222656, | |
| "eval_runtime": 181.978, | |
| "eval_samples_per_second": 91.462, | |
| "eval_steps_per_second": 5.72, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 3.218771848054067, | |
| "grad_norm": 0.3331759572029114, | |
| "learning_rate": 0.0005041802800466744, | |
| "loss": 3.6116, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 3.233337217431834, | |
| "grad_norm": 0.33480656147003174, | |
| "learning_rate": 0.0005037427071178529, | |
| "loss": 3.6186, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 3.2479025868096016, | |
| "grad_norm": 0.32737287878990173, | |
| "learning_rate": 0.0005033051341890314, | |
| "loss": 3.6176, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 3.262467956187369, | |
| "grad_norm": 0.33219143748283386, | |
| "learning_rate": 0.00050286756126021, | |
| "loss": 3.6299, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 3.2770333255651365, | |
| "grad_norm": 0.3134367763996124, | |
| "learning_rate": 0.0005024299883313885, | |
| "loss": 3.6269, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 3.2915986949429037, | |
| "grad_norm": 0.3368885815143585, | |
| "learning_rate": 0.0005019924154025671, | |
| "loss": 3.6383, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 3.3061640643206713, | |
| "grad_norm": 0.30437996983528137, | |
| "learning_rate": 0.0005015548424737456, | |
| "loss": 3.6245, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 3.3207294336984385, | |
| "grad_norm": 0.33528828620910645, | |
| "learning_rate": 0.0005011172695449241, | |
| "loss": 3.6251, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 3.335294803076206, | |
| "grad_norm": 0.33781698346138, | |
| "learning_rate": 0.0005006796966161027, | |
| "loss": 3.63, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 3.3498601724539734, | |
| "grad_norm": 0.329375296831131, | |
| "learning_rate": 0.0005002421236872811, | |
| "loss": 3.6387, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 3.364425541831741, | |
| "grad_norm": 0.31199130415916443, | |
| "learning_rate": 0.0004998045507584597, | |
| "loss": 3.6199, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 3.3789909112095082, | |
| "grad_norm": 0.31993409991264343, | |
| "learning_rate": 0.0004993669778296382, | |
| "loss": 3.6383, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 3.393556280587276, | |
| "grad_norm": 0.33537372946739197, | |
| "learning_rate": 0.0004989294049008167, | |
| "loss": 3.6409, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 3.408121649965043, | |
| "grad_norm": 0.3288818299770355, | |
| "learning_rate": 0.0004984918319719953, | |
| "loss": 3.6544, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 3.4226870193428107, | |
| "grad_norm": 0.3143393099308014, | |
| "learning_rate": 0.0004980542590431738, | |
| "loss": 3.632, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 3.437252388720578, | |
| "grad_norm": 0.3316044211387634, | |
| "learning_rate": 0.0004976166861143524, | |
| "loss": 3.6256, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 3.4518177580983456, | |
| "grad_norm": 0.3158373534679413, | |
| "learning_rate": 0.0004971791131855309, | |
| "loss": 3.6283, | |
| "step": 11850 | |
| }, | |
| { | |
| "epoch": 3.4663831274761128, | |
| "grad_norm": 0.3310090899467468, | |
| "learning_rate": 0.0004967415402567094, | |
| "loss": 3.6383, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 3.4809484968538804, | |
| "grad_norm": 0.3304344415664673, | |
| "learning_rate": 0.000496303967327888, | |
| "loss": 3.6364, | |
| "step": 11950 | |
| }, | |
| { | |
| "epoch": 3.4955138662316476, | |
| "grad_norm": 0.3196583390235901, | |
| "learning_rate": 0.0004958663943990664, | |
| "loss": 3.6245, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 3.4955138662316476, | |
| "eval_accuracy": 0.3539525300396798, | |
| "eval_loss": 3.6910691261291504, | |
| "eval_runtime": 180.2749, | |
| "eval_samples_per_second": 92.326, | |
| "eval_steps_per_second": 5.775, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 3.510079235609415, | |
| "grad_norm": 0.334721177816391, | |
| "learning_rate": 0.000495428821470245, | |
| "loss": 3.6405, | |
| "step": 12050 | |
| }, | |
| { | |
| "epoch": 3.5246446049871825, | |
| "grad_norm": 0.30898579955101013, | |
| "learning_rate": 0.0004949912485414235, | |
| "loss": 3.633, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 3.53920997436495, | |
| "grad_norm": 0.3296958804130554, | |
| "learning_rate": 0.0004945536756126021, | |
| "loss": 3.6336, | |
| "step": 12150 | |
| }, | |
| { | |
| "epoch": 3.5537753437427173, | |
| "grad_norm": 0.3156227469444275, | |
| "learning_rate": 0.0004941161026837806, | |
| "loss": 3.6264, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 3.5683407131204845, | |
| "grad_norm": 0.32900500297546387, | |
| "learning_rate": 0.0004936785297549591, | |
| "loss": 3.6286, | |
| "step": 12250 | |
| }, | |
| { | |
| "epoch": 3.582906082498252, | |
| "grad_norm": 0.33001989126205444, | |
| "learning_rate": 0.0004932409568261377, | |
| "loss": 3.6485, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 3.59747145187602, | |
| "grad_norm": 0.32858744263648987, | |
| "learning_rate": 0.0004928033838973162, | |
| "loss": 3.6323, | |
| "step": 12350 | |
| }, | |
| { | |
| "epoch": 3.612036821253787, | |
| "grad_norm": 0.35113999247550964, | |
| "learning_rate": 0.0004923658109684946, | |
| "loss": 3.647, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 3.626602190631554, | |
| "grad_norm": 0.3282478153705597, | |
| "learning_rate": 0.0004919282380396732, | |
| "loss": 3.6335, | |
| "step": 12450 | |
| }, | |
| { | |
| "epoch": 3.641167560009322, | |
| "grad_norm": 0.31611868739128113, | |
| "learning_rate": 0.0004914906651108517, | |
| "loss": 3.631, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 3.6557329293870895, | |
| "grad_norm": 0.33487775921821594, | |
| "learning_rate": 0.0004910530921820303, | |
| "loss": 3.6274, | |
| "step": 12550 | |
| }, | |
| { | |
| "epoch": 3.6702982987648567, | |
| "grad_norm": 0.33004793524742126, | |
| "learning_rate": 0.0004906155192532088, | |
| "loss": 3.618, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 3.684863668142624, | |
| "grad_norm": 0.30851587653160095, | |
| "learning_rate": 0.0004901779463243874, | |
| "loss": 3.6229, | |
| "step": 12650 | |
| }, | |
| { | |
| "epoch": 3.6994290375203915, | |
| "grad_norm": 0.325185090303421, | |
| "learning_rate": 0.0004897403733955659, | |
| "loss": 3.6289, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 3.713994406898159, | |
| "grad_norm": 0.3187962770462036, | |
| "learning_rate": 0.0004893028004667444, | |
| "loss": 3.6355, | |
| "step": 12750 | |
| }, | |
| { | |
| "epoch": 3.7285597762759264, | |
| "grad_norm": 0.32004639506340027, | |
| "learning_rate": 0.000488865227537923, | |
| "loss": 3.6424, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 3.7431251456536936, | |
| "grad_norm": 0.331478476524353, | |
| "learning_rate": 0.0004884276546091015, | |
| "loss": 3.624, | |
| "step": 12850 | |
| }, | |
| { | |
| "epoch": 3.7576905150314612, | |
| "grad_norm": 0.31720319390296936, | |
| "learning_rate": 0.00048799008168028, | |
| "loss": 3.6329, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 3.772255884409229, | |
| "grad_norm": 0.32388386130332947, | |
| "learning_rate": 0.00048755250875145853, | |
| "loss": 3.6237, | |
| "step": 12950 | |
| }, | |
| { | |
| "epoch": 3.786821253786996, | |
| "grad_norm": 0.326471209526062, | |
| "learning_rate": 0.0004871149358226371, | |
| "loss": 3.6365, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 3.786821253786996, | |
| "eval_accuracy": 0.35590465655600817, | |
| "eval_loss": 3.6707494258880615, | |
| "eval_runtime": 180.189, | |
| "eval_samples_per_second": 92.37, | |
| "eval_steps_per_second": 5.777, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 3.8013866231647633, | |
| "grad_norm": 0.3287231922149658, | |
| "learning_rate": 0.0004866773628938156, | |
| "loss": 3.6351, | |
| "step": 13050 | |
| }, | |
| { | |
| "epoch": 3.815951992542531, | |
| "grad_norm": 0.3224816620349884, | |
| "learning_rate": 0.0004862397899649941, | |
| "loss": 3.631, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 3.8305173619202986, | |
| "grad_norm": 0.34565699100494385, | |
| "learning_rate": 0.00048580221703617264, | |
| "loss": 3.6365, | |
| "step": 13150 | |
| }, | |
| { | |
| "epoch": 3.8450827312980658, | |
| "grad_norm": 0.31353557109832764, | |
| "learning_rate": 0.00048536464410735123, | |
| "loss": 3.6346, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 3.859648100675833, | |
| "grad_norm": 0.31035754084587097, | |
| "learning_rate": 0.00048492707117852966, | |
| "loss": 3.6353, | |
| "step": 13250 | |
| }, | |
| { | |
| "epoch": 3.8742134700536006, | |
| "grad_norm": 0.3304181694984436, | |
| "learning_rate": 0.00048448949824970826, | |
| "loss": 3.631, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 3.888778839431368, | |
| "grad_norm": 0.3305014669895172, | |
| "learning_rate": 0.0004840519253208868, | |
| "loss": 3.6234, | |
| "step": 13350 | |
| }, | |
| { | |
| "epoch": 3.9033442088091355, | |
| "grad_norm": 0.33002111315727234, | |
| "learning_rate": 0.0004836143523920653, | |
| "loss": 3.6389, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 3.9179095781869027, | |
| "grad_norm": 0.3106802701950073, | |
| "learning_rate": 0.0004831767794632438, | |
| "loss": 3.6413, | |
| "step": 13450 | |
| }, | |
| { | |
| "epoch": 3.9324749475646703, | |
| "grad_norm": 0.32683488726615906, | |
| "learning_rate": 0.00048273920653442236, | |
| "loss": 3.6114, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 3.9470403169424375, | |
| "grad_norm": 0.3140070140361786, | |
| "learning_rate": 0.0004823016336056009, | |
| "loss": 3.6246, | |
| "step": 13550 | |
| }, | |
| { | |
| "epoch": 3.961605686320205, | |
| "grad_norm": 0.3176632523536682, | |
| "learning_rate": 0.0004818640606767794, | |
| "loss": 3.6215, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 3.9761710556979724, | |
| "grad_norm": 0.33348730206489563, | |
| "learning_rate": 0.00048142648774795793, | |
| "loss": 3.6198, | |
| "step": 13650 | |
| }, | |
| { | |
| "epoch": 3.99073642507574, | |
| "grad_norm": 0.3215520679950714, | |
| "learning_rate": 0.0004809889148191365, | |
| "loss": 3.6326, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 4.005243532975996, | |
| "grad_norm": 0.3232531249523163, | |
| "learning_rate": 0.000480551341890315, | |
| "loss": 3.5772, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 4.0198089023537635, | |
| "grad_norm": 0.3443015515804291, | |
| "learning_rate": 0.00048011376896149355, | |
| "loss": 3.5136, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 4.034374271731531, | |
| "grad_norm": 0.3226703405380249, | |
| "learning_rate": 0.0004796761960326721, | |
| "loss": 3.5234, | |
| "step": 13850 | |
| }, | |
| { | |
| "epoch": 4.048939641109299, | |
| "grad_norm": 0.32913267612457275, | |
| "learning_rate": 0.0004792386231038506, | |
| "loss": 3.5215, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 4.063505010487066, | |
| "grad_norm": 0.3350991904735565, | |
| "learning_rate": 0.0004788010501750291, | |
| "loss": 3.5211, | |
| "step": 13950 | |
| }, | |
| { | |
| "epoch": 4.078070379864833, | |
| "grad_norm": 0.3143565058708191, | |
| "learning_rate": 0.00047836347724620766, | |
| "loss": 3.5138, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 4.078070379864833, | |
| "eval_accuracy": 0.3570977076769612, | |
| "eval_loss": 3.6638998985290527, | |
| "eval_runtime": 180.2734, | |
| "eval_samples_per_second": 92.326, | |
| "eval_steps_per_second": 5.775, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 4.092635749242601, | |
| "grad_norm": 0.318641722202301, | |
| "learning_rate": 0.0004779259043173862, | |
| "loss": 3.5201, | |
| "step": 14050 | |
| }, | |
| { | |
| "epoch": 4.1072011186203685, | |
| "grad_norm": 0.31839898228645325, | |
| "learning_rate": 0.0004774883313885647, | |
| "loss": 3.5223, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 4.121766487998135, | |
| "grad_norm": 0.3353429436683655, | |
| "learning_rate": 0.0004770507584597433, | |
| "loss": 3.5314, | |
| "step": 14150 | |
| }, | |
| { | |
| "epoch": 4.136331857375903, | |
| "grad_norm": 0.3297600746154785, | |
| "learning_rate": 0.0004766131855309218, | |
| "loss": 3.5386, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 4.150897226753671, | |
| "grad_norm": 0.35828185081481934, | |
| "learning_rate": 0.0004761756126021003, | |
| "loss": 3.5442, | |
| "step": 14250 | |
| }, | |
| { | |
| "epoch": 4.165462596131438, | |
| "grad_norm": 0.32543322443962097, | |
| "learning_rate": 0.00047573803967327884, | |
| "loss": 3.5498, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 4.180027965509205, | |
| "grad_norm": 0.33324652910232544, | |
| "learning_rate": 0.0004753004667444574, | |
| "loss": 3.5393, | |
| "step": 14350 | |
| }, | |
| { | |
| "epoch": 4.194593334886973, | |
| "grad_norm": 0.3401516079902649, | |
| "learning_rate": 0.00047486289381563587, | |
| "loss": 3.5485, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 4.20915870426474, | |
| "grad_norm": 0.34022200107574463, | |
| "learning_rate": 0.0004744253208868144, | |
| "loss": 3.5287, | |
| "step": 14450 | |
| }, | |
| { | |
| "epoch": 4.223724073642508, | |
| "grad_norm": 0.3375685214996338, | |
| "learning_rate": 0.00047398774795799295, | |
| "loss": 3.5567, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 4.238289443020275, | |
| "grad_norm": 0.32578080892562866, | |
| "learning_rate": 0.00047355017502917154, | |
| "loss": 3.5511, | |
| "step": 14550 | |
| }, | |
| { | |
| "epoch": 4.252854812398042, | |
| "grad_norm": 0.3124660551548004, | |
| "learning_rate": 0.00047311260210035, | |
| "loss": 3.5519, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 4.26742018177581, | |
| "grad_norm": 0.317643940448761, | |
| "learning_rate": 0.00047267502917152857, | |
| "loss": 3.5485, | |
| "step": 14650 | |
| }, | |
| { | |
| "epoch": 4.281985551153578, | |
| "grad_norm": 0.3317655026912689, | |
| "learning_rate": 0.0004722374562427071, | |
| "loss": 3.5541, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 4.296550920531344, | |
| "grad_norm": 0.32578787207603455, | |
| "learning_rate": 0.0004717998833138856, | |
| "loss": 3.5354, | |
| "step": 14750 | |
| }, | |
| { | |
| "epoch": 4.311116289909112, | |
| "grad_norm": 0.32401853799819946, | |
| "learning_rate": 0.00047136231038506413, | |
| "loss": 3.5608, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 4.32568165928688, | |
| "grad_norm": 0.33071812987327576, | |
| "learning_rate": 0.00047092473745624267, | |
| "loss": 3.5453, | |
| "step": 14850 | |
| }, | |
| { | |
| "epoch": 4.340247028664647, | |
| "grad_norm": 0.3195439577102661, | |
| "learning_rate": 0.00047048716452742116, | |
| "loss": 3.5509, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 4.354812398042414, | |
| "grad_norm": 0.32133200764656067, | |
| "learning_rate": 0.0004700495915985997, | |
| "loss": 3.5631, | |
| "step": 14950 | |
| }, | |
| { | |
| "epoch": 4.369377767420182, | |
| "grad_norm": 0.345612108707428, | |
| "learning_rate": 0.0004696120186697783, | |
| "loss": 3.5632, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 4.369377767420182, | |
| "eval_accuracy": 0.3582166854554288, | |
| "eval_loss": 3.653571128845215, | |
| "eval_runtime": 180.4957, | |
| "eval_samples_per_second": 92.213, | |
| "eval_steps_per_second": 5.767, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 4.383943136797949, | |
| "grad_norm": 0.33550721406936646, | |
| "learning_rate": 0.00046917444574095683, | |
| "loss": 3.5562, | |
| "step": 15050 | |
| }, | |
| { | |
| "epoch": 4.398508506175717, | |
| "grad_norm": 0.32593655586242676, | |
| "learning_rate": 0.0004687368728121353, | |
| "loss": 3.5542, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 4.413073875553484, | |
| "grad_norm": 0.32876867055892944, | |
| "learning_rate": 0.00046829929988331386, | |
| "loss": 3.5537, | |
| "step": 15150 | |
| }, | |
| { | |
| "epoch": 4.427639244931251, | |
| "grad_norm": 0.31340348720550537, | |
| "learning_rate": 0.0004678617269544924, | |
| "loss": 3.5547, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 4.442204614309019, | |
| "grad_norm": 0.325003981590271, | |
| "learning_rate": 0.0004674241540256709, | |
| "loss": 3.5638, | |
| "step": 15250 | |
| }, | |
| { | |
| "epoch": 4.456769983686787, | |
| "grad_norm": 0.31941288709640503, | |
| "learning_rate": 0.0004669865810968494, | |
| "loss": 3.5625, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 4.471335353064553, | |
| "grad_norm": 0.32604023814201355, | |
| "learning_rate": 0.00046654900816802796, | |
| "loss": 3.5542, | |
| "step": 15350 | |
| }, | |
| { | |
| "epoch": 4.485900722442321, | |
| "grad_norm": 0.3184167444705963, | |
| "learning_rate": 0.00046611143523920645, | |
| "loss": 3.5597, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 4.500466091820089, | |
| "grad_norm": 0.32676759362220764, | |
| "learning_rate": 0.00046567386231038504, | |
| "loss": 3.5518, | |
| "step": 15450 | |
| }, | |
| { | |
| "epoch": 4.515031461197856, | |
| "grad_norm": 0.3253229260444641, | |
| "learning_rate": 0.0004652362893815636, | |
| "loss": 3.5636, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 4.529596830575623, | |
| "grad_norm": 0.33474475145339966, | |
| "learning_rate": 0.0004647987164527421, | |
| "loss": 3.5638, | |
| "step": 15550 | |
| }, | |
| { | |
| "epoch": 4.544162199953391, | |
| "grad_norm": 0.34634941816329956, | |
| "learning_rate": 0.0004643611435239206, | |
| "loss": 3.5473, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 4.558727569331158, | |
| "grad_norm": 0.33891260623931885, | |
| "learning_rate": 0.00046392357059509915, | |
| "loss": 3.5675, | |
| "step": 15650 | |
| }, | |
| { | |
| "epoch": 4.573292938708926, | |
| "grad_norm": 0.32942262291908264, | |
| "learning_rate": 0.0004634859976662777, | |
| "loss": 3.5603, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 4.587858308086693, | |
| "grad_norm": 0.3374430239200592, | |
| "learning_rate": 0.0004630484247374562, | |
| "loss": 3.5538, | |
| "step": 15750 | |
| }, | |
| { | |
| "epoch": 4.6024236774644605, | |
| "grad_norm": 0.3401276767253876, | |
| "learning_rate": 0.0004626108518086347, | |
| "loss": 3.5644, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 4.616989046842228, | |
| "grad_norm": 0.3286304473876953, | |
| "learning_rate": 0.0004621732788798133, | |
| "loss": 3.5653, | |
| "step": 15850 | |
| }, | |
| { | |
| "epoch": 4.631554416219995, | |
| "grad_norm": 0.31420665979385376, | |
| "learning_rate": 0.00046173570595099174, | |
| "loss": 3.556, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 4.6461197855977625, | |
| "grad_norm": 0.3286356031894684, | |
| "learning_rate": 0.00046129813302217033, | |
| "loss": 3.552, | |
| "step": 15950 | |
| }, | |
| { | |
| "epoch": 4.66068515497553, | |
| "grad_norm": 0.33006393909454346, | |
| "learning_rate": 0.00046086056009334887, | |
| "loss": 3.5684, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 4.66068515497553, | |
| "eval_accuracy": 0.3596702866191563, | |
| "eval_loss": 3.640338897705078, | |
| "eval_runtime": 180.2507, | |
| "eval_samples_per_second": 92.338, | |
| "eval_steps_per_second": 5.775, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 4.675250524353298, | |
| "grad_norm": 0.3313292860984802, | |
| "learning_rate": 0.0004604229871645274, | |
| "loss": 3.5606, | |
| "step": 16050 | |
| }, | |
| { | |
| "epoch": 4.689815893731065, | |
| "grad_norm": 0.31922703981399536, | |
| "learning_rate": 0.0004599854142357059, | |
| "loss": 3.5739, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 4.704381263108832, | |
| "grad_norm": 0.3161007761955261, | |
| "learning_rate": 0.00045954784130688444, | |
| "loss": 3.5688, | |
| "step": 16150 | |
| }, | |
| { | |
| "epoch": 4.7189466324866, | |
| "grad_norm": 0.33094581961631775, | |
| "learning_rate": 0.000459110268378063, | |
| "loss": 3.564, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 4.7335120018643675, | |
| "grad_norm": 0.3282545804977417, | |
| "learning_rate": 0.00045867269544924146, | |
| "loss": 3.5759, | |
| "step": 16250 | |
| }, | |
| { | |
| "epoch": 4.748077371242134, | |
| "grad_norm": 0.32690319418907166, | |
| "learning_rate": 0.00045823512252042, | |
| "loss": 3.5601, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 4.762642740619902, | |
| "grad_norm": 0.3375246524810791, | |
| "learning_rate": 0.0004577975495915986, | |
| "loss": 3.5569, | |
| "step": 16350 | |
| }, | |
| { | |
| "epoch": 4.7772081099976695, | |
| "grad_norm": 0.3194766044616699, | |
| "learning_rate": 0.0004573599766627771, | |
| "loss": 3.5536, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 4.791773479375437, | |
| "grad_norm": 0.31809139251708984, | |
| "learning_rate": 0.0004569224037339556, | |
| "loss": 3.5626, | |
| "step": 16450 | |
| }, | |
| { | |
| "epoch": 4.806338848753205, | |
| "grad_norm": 0.3298538327217102, | |
| "learning_rate": 0.00045648483080513416, | |
| "loss": 3.5597, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 4.820904218130972, | |
| "grad_norm": 0.343118816614151, | |
| "learning_rate": 0.0004560472578763127, | |
| "loss": 3.563, | |
| "step": 16550 | |
| }, | |
| { | |
| "epoch": 4.835469587508739, | |
| "grad_norm": 0.32174625992774963, | |
| "learning_rate": 0.0004556096849474912, | |
| "loss": 3.5602, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 4.850034956886507, | |
| "grad_norm": 0.3458464741706848, | |
| "learning_rate": 0.00045517211201866973, | |
| "loss": 3.5485, | |
| "step": 16650 | |
| }, | |
| { | |
| "epoch": 4.864600326264274, | |
| "grad_norm": 0.3370623290538788, | |
| "learning_rate": 0.00045473453908984827, | |
| "loss": 3.5624, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 4.879165695642041, | |
| "grad_norm": 0.33553197979927063, | |
| "learning_rate": 0.00045429696616102675, | |
| "loss": 3.5675, | |
| "step": 16750 | |
| }, | |
| { | |
| "epoch": 4.893731065019809, | |
| "grad_norm": 0.3206152617931366, | |
| "learning_rate": 0.00045385939323220535, | |
| "loss": 3.5618, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 4.908296434397577, | |
| "grad_norm": 0.3171241581439972, | |
| "learning_rate": 0.0004534218203033839, | |
| "loss": 3.5692, | |
| "step": 16850 | |
| }, | |
| { | |
| "epoch": 4.922861803775344, | |
| "grad_norm": 0.3172144889831543, | |
| "learning_rate": 0.0004529842473745624, | |
| "loss": 3.5552, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 4.937427173153111, | |
| "grad_norm": 0.32273098826408386, | |
| "learning_rate": 0.0004525466744457409, | |
| "loss": 3.5712, | |
| "step": 16950 | |
| }, | |
| { | |
| "epoch": 4.951992542530879, | |
| "grad_norm": 0.3339548707008362, | |
| "learning_rate": 0.00045210910151691945, | |
| "loss": 3.5646, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 4.951992542530879, | |
| "eval_accuracy": 0.36071871835716146, | |
| "eval_loss": 3.625553846359253, | |
| "eval_runtime": 180.4809, | |
| "eval_samples_per_second": 92.22, | |
| "eval_steps_per_second": 5.768, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 4.966557911908646, | |
| "grad_norm": 0.3269366919994354, | |
| "learning_rate": 0.000451671528588098, | |
| "loss": 3.5542, | |
| "step": 17050 | |
| }, | |
| { | |
| "epoch": 4.981123281286413, | |
| "grad_norm": 0.31721675395965576, | |
| "learning_rate": 0.0004512339556592765, | |
| "loss": 3.5729, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 4.995688650664181, | |
| "grad_norm": 0.3314802050590515, | |
| "learning_rate": 0.000450796382730455, | |
| "loss": 3.5643, | |
| "step": 17150 | |
| }, | |
| { | |
| "epoch": 5.010195758564437, | |
| "grad_norm": 0.34938499331474304, | |
| "learning_rate": 0.0004503588098016336, | |
| "loss": 3.4822, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 5.024761127942204, | |
| "grad_norm": 0.3565429449081421, | |
| "learning_rate": 0.0004499212368728121, | |
| "loss": 3.4413, | |
| "step": 17250 | |
| }, | |
| { | |
| "epoch": 5.039326497319972, | |
| "grad_norm": 0.34626901149749756, | |
| "learning_rate": 0.00044948366394399064, | |
| "loss": 3.4539, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 5.0538918666977395, | |
| "grad_norm": 0.336347758769989, | |
| "learning_rate": 0.0004490460910151692, | |
| "loss": 3.4579, | |
| "step": 17350 | |
| }, | |
| { | |
| "epoch": 5.068457236075507, | |
| "grad_norm": 0.3387928605079651, | |
| "learning_rate": 0.00044860851808634767, | |
| "loss": 3.4767, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 5.083022605453274, | |
| "grad_norm": 0.3393719494342804, | |
| "learning_rate": 0.0004481709451575262, | |
| "loss": 3.4596, | |
| "step": 17450 | |
| }, | |
| { | |
| "epoch": 5.0975879748310415, | |
| "grad_norm": 0.3251345157623291, | |
| "learning_rate": 0.00044773337222870475, | |
| "loss": 3.4748, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 5.112153344208809, | |
| "grad_norm": 0.32468897104263306, | |
| "learning_rate": 0.0004472957992998833, | |
| "loss": 3.4805, | |
| "step": 17550 | |
| }, | |
| { | |
| "epoch": 5.126718713586577, | |
| "grad_norm": 0.3337823450565338, | |
| "learning_rate": 0.00044685822637106177, | |
| "loss": 3.4754, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 5.141284082964344, | |
| "grad_norm": 0.3582659959793091, | |
| "learning_rate": 0.00044642065344224037, | |
| "loss": 3.4752, | |
| "step": 17650 | |
| }, | |
| { | |
| "epoch": 5.155849452342111, | |
| "grad_norm": 0.3382004499435425, | |
| "learning_rate": 0.0004459830805134189, | |
| "loss": 3.4633, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 5.170414821719879, | |
| "grad_norm": 0.33493444323539734, | |
| "learning_rate": 0.0004455455075845974, | |
| "loss": 3.4896, | |
| "step": 17750 | |
| }, | |
| { | |
| "epoch": 5.1849801910976465, | |
| "grad_norm": 0.33412331342697144, | |
| "learning_rate": 0.00044510793465577593, | |
| "loss": 3.4854, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 5.199545560475413, | |
| "grad_norm": 0.3649858832359314, | |
| "learning_rate": 0.00044467036172695447, | |
| "loss": 3.4862, | |
| "step": 17850 | |
| }, | |
| { | |
| "epoch": 5.214110929853181, | |
| "grad_norm": 0.3273285925388336, | |
| "learning_rate": 0.00044423278879813296, | |
| "loss": 3.4898, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 5.228676299230949, | |
| "grad_norm": 0.36678996682167053, | |
| "learning_rate": 0.0004437952158693115, | |
| "loss": 3.4798, | |
| "step": 17950 | |
| }, | |
| { | |
| "epoch": 5.243241668608716, | |
| "grad_norm": 0.33765271306037903, | |
| "learning_rate": 0.00044335764294049004, | |
| "loss": 3.4892, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 5.243241668608716, | |
| "eval_accuracy": 0.3612939037403981, | |
| "eval_loss": 3.6294894218444824, | |
| "eval_runtime": 180.4902, | |
| "eval_samples_per_second": 92.216, | |
| "eval_steps_per_second": 5.768, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 5.257807037986483, | |
| "grad_norm": 0.35786503553390503, | |
| "learning_rate": 0.00044292007001166863, | |
| "loss": 3.4823, | |
| "step": 18050 | |
| }, | |
| { | |
| "epoch": 5.272372407364251, | |
| "grad_norm": 0.3416072130203247, | |
| "learning_rate": 0.00044248249708284706, | |
| "loss": 3.4879, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 5.286937776742018, | |
| "grad_norm": 0.34881216287612915, | |
| "learning_rate": 0.00044204492415402566, | |
| "loss": 3.4922, | |
| "step": 18150 | |
| }, | |
| { | |
| "epoch": 5.301503146119786, | |
| "grad_norm": 0.34528306126594543, | |
| "learning_rate": 0.0004416073512252042, | |
| "loss": 3.4949, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 5.316068515497553, | |
| "grad_norm": 0.328722208738327, | |
| "learning_rate": 0.0004411697782963827, | |
| "loss": 3.4898, | |
| "step": 18250 | |
| }, | |
| { | |
| "epoch": 5.33063388487532, | |
| "grad_norm": 0.3258577585220337, | |
| "learning_rate": 0.0004407322053675612, | |
| "loss": 3.4844, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 5.345199254253088, | |
| "grad_norm": 0.3687138855457306, | |
| "learning_rate": 0.00044029463243873976, | |
| "loss": 3.4868, | |
| "step": 18350 | |
| }, | |
| { | |
| "epoch": 5.359764623630856, | |
| "grad_norm": 0.3356603682041168, | |
| "learning_rate": 0.00043985705950991825, | |
| "loss": 3.4854, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 5.374329993008622, | |
| "grad_norm": 0.3445993661880493, | |
| "learning_rate": 0.0004394194865810968, | |
| "loss": 3.4981, | |
| "step": 18450 | |
| }, | |
| { | |
| "epoch": 5.38889536238639, | |
| "grad_norm": 0.328427791595459, | |
| "learning_rate": 0.00043898191365227533, | |
| "loss": 3.4887, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 5.403460731764158, | |
| "grad_norm": 0.3391731083393097, | |
| "learning_rate": 0.0004385443407234539, | |
| "loss": 3.5023, | |
| "step": 18550 | |
| }, | |
| { | |
| "epoch": 5.418026101141925, | |
| "grad_norm": 0.3405122458934784, | |
| "learning_rate": 0.0004381067677946324, | |
| "loss": 3.5082, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 5.432591470519692, | |
| "grad_norm": 0.32964596152305603, | |
| "learning_rate": 0.00043766919486581095, | |
| "loss": 3.5064, | |
| "step": 18650 | |
| }, | |
| { | |
| "epoch": 5.44715683989746, | |
| "grad_norm": 0.32743725180625916, | |
| "learning_rate": 0.0004372316219369895, | |
| "loss": 3.5069, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 5.461722209275227, | |
| "grad_norm": 0.33889785408973694, | |
| "learning_rate": 0.00043679404900816797, | |
| "loss": 3.4917, | |
| "step": 18750 | |
| }, | |
| { | |
| "epoch": 5.476287578652995, | |
| "grad_norm": 0.3374757468700409, | |
| "learning_rate": 0.0004363564760793465, | |
| "loss": 3.5129, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 5.490852948030762, | |
| "grad_norm": 0.32586970925331116, | |
| "learning_rate": 0.00043591890315052505, | |
| "loss": 3.4983, | |
| "step": 18850 | |
| }, | |
| { | |
| "epoch": 5.505418317408529, | |
| "grad_norm": 0.3159201443195343, | |
| "learning_rate": 0.00043548133022170354, | |
| "loss": 3.5049, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 5.519983686786297, | |
| "grad_norm": 0.3207235634326935, | |
| "learning_rate": 0.0004350437572928821, | |
| "loss": 3.5027, | |
| "step": 18950 | |
| }, | |
| { | |
| "epoch": 5.534549056164065, | |
| "grad_norm": 0.32409095764160156, | |
| "learning_rate": 0.00043460618436406067, | |
| "loss": 3.5037, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 5.534549056164065, | |
| "eval_accuracy": 0.36227261247507964, | |
| "eval_loss": 3.617015838623047, | |
| "eval_runtime": 180.3747, | |
| "eval_samples_per_second": 92.275, | |
| "eval_steps_per_second": 5.771, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 5.549114425541831, | |
| "grad_norm": 0.33343568444252014, | |
| "learning_rate": 0.0004341686114352392, | |
| "loss": 3.5044, | |
| "step": 19050 | |
| }, | |
| { | |
| "epoch": 5.563679794919599, | |
| "grad_norm": 0.3471834063529968, | |
| "learning_rate": 0.0004337310385064177, | |
| "loss": 3.4995, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 5.578245164297367, | |
| "grad_norm": 0.32965055108070374, | |
| "learning_rate": 0.00043329346557759624, | |
| "loss": 3.5091, | |
| "step": 19150 | |
| }, | |
| { | |
| "epoch": 5.592810533675134, | |
| "grad_norm": 0.32729023694992065, | |
| "learning_rate": 0.0004328558926487748, | |
| "loss": 3.4987, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 5.607375903052901, | |
| "grad_norm": 0.32407552003860474, | |
| "learning_rate": 0.00043241831971995326, | |
| "loss": 3.5105, | |
| "step": 19250 | |
| }, | |
| { | |
| "epoch": 5.621941272430669, | |
| "grad_norm": 0.3459337055683136, | |
| "learning_rate": 0.0004319807467911318, | |
| "loss": 3.5139, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 5.636506641808436, | |
| "grad_norm": 0.34581705927848816, | |
| "learning_rate": 0.00043154317386231034, | |
| "loss": 3.5169, | |
| "step": 19350 | |
| }, | |
| { | |
| "epoch": 5.651072011186204, | |
| "grad_norm": 0.323258638381958, | |
| "learning_rate": 0.00043110560093348883, | |
| "loss": 3.5006, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 5.665637380563971, | |
| "grad_norm": 0.3501630127429962, | |
| "learning_rate": 0.0004306680280046674, | |
| "loss": 3.5059, | |
| "step": 19450 | |
| }, | |
| { | |
| "epoch": 5.6802027499417385, | |
| "grad_norm": 0.3383364975452423, | |
| "learning_rate": 0.00043023045507584596, | |
| "loss": 3.5082, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 5.694768119319506, | |
| "grad_norm": 0.3391266465187073, | |
| "learning_rate": 0.0004297928821470245, | |
| "loss": 3.5073, | |
| "step": 19550 | |
| }, | |
| { | |
| "epoch": 5.709333488697274, | |
| "grad_norm": 0.33838364481925964, | |
| "learning_rate": 0.000429355309218203, | |
| "loss": 3.5057, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 5.7238988580750405, | |
| "grad_norm": 0.3325950801372528, | |
| "learning_rate": 0.00042891773628938153, | |
| "loss": 3.518, | |
| "step": 19650 | |
| }, | |
| { | |
| "epoch": 5.738464227452808, | |
| "grad_norm": 0.3349588215351105, | |
| "learning_rate": 0.00042848016336056007, | |
| "loss": 3.5155, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 5.753029596830576, | |
| "grad_norm": 0.33944258093833923, | |
| "learning_rate": 0.00042804259043173855, | |
| "loss": 3.5028, | |
| "step": 19750 | |
| }, | |
| { | |
| "epoch": 5.7675949662083426, | |
| "grad_norm": 0.3170711398124695, | |
| "learning_rate": 0.0004276050175029171, | |
| "loss": 3.5181, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 5.78216033558611, | |
| "grad_norm": 0.3340502083301544, | |
| "learning_rate": 0.0004271674445740957, | |
| "loss": 3.4968, | |
| "step": 19850 | |
| }, | |
| { | |
| "epoch": 5.796725704963878, | |
| "grad_norm": 0.34251272678375244, | |
| "learning_rate": 0.0004267298716452741, | |
| "loss": 3.5079, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 5.8112910743416455, | |
| "grad_norm": 0.3394465446472168, | |
| "learning_rate": 0.0004262922987164527, | |
| "loss": 3.5056, | |
| "step": 19950 | |
| }, | |
| { | |
| "epoch": 5.825856443719413, | |
| "grad_norm": 0.32446908950805664, | |
| "learning_rate": 0.00042585472578763125, | |
| "loss": 3.5029, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 5.825856443719413, | |
| "eval_accuracy": 0.36320252686510796, | |
| "eval_loss": 3.60426664352417, | |
| "eval_runtime": 180.5652, | |
| "eval_samples_per_second": 92.177, | |
| "eval_steps_per_second": 5.765, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 5.84042181309718, | |
| "grad_norm": 0.3498161733150482, | |
| "learning_rate": 0.0004254171528588098, | |
| "loss": 3.5086, | |
| "step": 20050 | |
| }, | |
| { | |
| "epoch": 5.8549871824749475, | |
| "grad_norm": 0.33967551589012146, | |
| "learning_rate": 0.0004249795799299883, | |
| "loss": 3.5126, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 5.869552551852715, | |
| "grad_norm": 0.3366953730583191, | |
| "learning_rate": 0.0004245420070011668, | |
| "loss": 3.5301, | |
| "step": 20150 | |
| }, | |
| { | |
| "epoch": 5.884117921230482, | |
| "grad_norm": 0.33286792039871216, | |
| "learning_rate": 0.00042410443407234536, | |
| "loss": 3.511, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 5.89868329060825, | |
| "grad_norm": 0.34662094712257385, | |
| "learning_rate": 0.00042366686114352385, | |
| "loss": 3.512, | |
| "step": 20250 | |
| }, | |
| { | |
| "epoch": 5.913248659986017, | |
| "grad_norm": 0.3202279508113861, | |
| "learning_rate": 0.0004232292882147024, | |
| "loss": 3.5006, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 5.927814029363785, | |
| "grad_norm": 0.34777122735977173, | |
| "learning_rate": 0.000422791715285881, | |
| "loss": 3.521, | |
| "step": 20350 | |
| }, | |
| { | |
| "epoch": 5.9423793987415525, | |
| "grad_norm": 0.34444618225097656, | |
| "learning_rate": 0.00042235414235705947, | |
| "loss": 3.5126, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 5.956944768119319, | |
| "grad_norm": 0.3303092122077942, | |
| "learning_rate": 0.000421916569428238, | |
| "loss": 3.516, | |
| "step": 20450 | |
| }, | |
| { | |
| "epoch": 5.971510137497087, | |
| "grad_norm": 0.34319791197776794, | |
| "learning_rate": 0.00042147899649941654, | |
| "loss": 3.5029, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 5.986075506874855, | |
| "grad_norm": 0.33462879061698914, | |
| "learning_rate": 0.0004210414235705951, | |
| "loss": 3.509, | |
| "step": 20550 | |
| }, | |
| { | |
| "epoch": 6.0005826147751105, | |
| "grad_norm": 0.332768976688385, | |
| "learning_rate": 0.00042060385064177357, | |
| "loss": 3.5038, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 6.015147984152878, | |
| "grad_norm": 0.32959234714508057, | |
| "learning_rate": 0.0004201662777129521, | |
| "loss": 3.3948, | |
| "step": 20650 | |
| }, | |
| { | |
| "epoch": 6.029713353530646, | |
| "grad_norm": 0.3324235677719116, | |
| "learning_rate": 0.00041972870478413065, | |
| "loss": 3.4112, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 6.044278722908413, | |
| "grad_norm": 0.3403053879737854, | |
| "learning_rate": 0.00041929113185530914, | |
| "loss": 3.4081, | |
| "step": 20750 | |
| }, | |
| { | |
| "epoch": 6.05884409228618, | |
| "grad_norm": 0.3473146855831146, | |
| "learning_rate": 0.00041885355892648773, | |
| "loss": 3.4049, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 6.073409461663948, | |
| "grad_norm": 0.34440669417381287, | |
| "learning_rate": 0.00041841598599766627, | |
| "loss": 3.4084, | |
| "step": 20850 | |
| }, | |
| { | |
| "epoch": 6.087974831041715, | |
| "grad_norm": 0.3304244875907898, | |
| "learning_rate": 0.00041797841306884476, | |
| "loss": 3.4264, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 6.102540200419483, | |
| "grad_norm": 0.33876416087150574, | |
| "learning_rate": 0.0004175408401400233, | |
| "loss": 3.415, | |
| "step": 20950 | |
| }, | |
| { | |
| "epoch": 6.11710556979725, | |
| "grad_norm": 0.3384806215763092, | |
| "learning_rate": 0.00041710326721120184, | |
| "loss": 3.4399, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 6.11710556979725, | |
| "eval_accuracy": 0.3637827680479111, | |
| "eval_loss": 3.6081302165985107, | |
| "eval_runtime": 180.2744, | |
| "eval_samples_per_second": 92.326, | |
| "eval_steps_per_second": 5.775, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 6.1316709391750175, | |
| "grad_norm": 0.363090455532074, | |
| "learning_rate": 0.0004166656942823804, | |
| "loss": 3.4368, | |
| "step": 21050 | |
| }, | |
| { | |
| "epoch": 6.146236308552785, | |
| "grad_norm": 0.3336644470691681, | |
| "learning_rate": 0.00041622812135355886, | |
| "loss": 3.4255, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 6.160801677930552, | |
| "grad_norm": 0.3573184311389923, | |
| "learning_rate": 0.0004157905484247374, | |
| "loss": 3.43, | |
| "step": 21150 | |
| }, | |
| { | |
| "epoch": 6.1753670473083195, | |
| "grad_norm": 0.3469174802303314, | |
| "learning_rate": 0.000415352975495916, | |
| "loss": 3.4248, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 6.189932416686087, | |
| "grad_norm": 0.33994483947753906, | |
| "learning_rate": 0.0004149154025670945, | |
| "loss": 3.4331, | |
| "step": 21250 | |
| }, | |
| { | |
| "epoch": 6.204497786063855, | |
| "grad_norm": 0.34334084391593933, | |
| "learning_rate": 0.000414477829638273, | |
| "loss": 3.4336, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 6.219063155441622, | |
| "grad_norm": 0.3307756185531616, | |
| "learning_rate": 0.00041404025670945156, | |
| "loss": 3.4457, | |
| "step": 21350 | |
| }, | |
| { | |
| "epoch": 6.233628524819389, | |
| "grad_norm": 0.3440045118331909, | |
| "learning_rate": 0.00041360268378063005, | |
| "loss": 3.4505, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 6.248193894197157, | |
| "grad_norm": 0.32408636808395386, | |
| "learning_rate": 0.0004131651108518086, | |
| "loss": 3.4495, | |
| "step": 21450 | |
| }, | |
| { | |
| "epoch": 6.2627592635749245, | |
| "grad_norm": 0.3418697714805603, | |
| "learning_rate": 0.0004127275379229871, | |
| "loss": 3.4413, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 6.277324632952691, | |
| "grad_norm": 0.3394606113433838, | |
| "learning_rate": 0.00041228996499416567, | |
| "loss": 3.4419, | |
| "step": 21550 | |
| }, | |
| { | |
| "epoch": 6.291890002330459, | |
| "grad_norm": 0.3462677299976349, | |
| "learning_rate": 0.00041185239206534415, | |
| "loss": 3.4454, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 6.306455371708227, | |
| "grad_norm": 0.33543628454208374, | |
| "learning_rate": 0.00041141481913652275, | |
| "loss": 3.4359, | |
| "step": 21650 | |
| }, | |
| { | |
| "epoch": 6.321020741085994, | |
| "grad_norm": 0.3553283214569092, | |
| "learning_rate": 0.0004109772462077013, | |
| "loss": 3.4364, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 6.335586110463761, | |
| "grad_norm": 0.3360411822795868, | |
| "learning_rate": 0.00041053967327887977, | |
| "loss": 3.4451, | |
| "step": 21750 | |
| }, | |
| { | |
| "epoch": 6.350151479841529, | |
| "grad_norm": 0.33588552474975586, | |
| "learning_rate": 0.0004101021003500583, | |
| "loss": 3.439, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 6.364716849219296, | |
| "grad_norm": 0.3321113884449005, | |
| "learning_rate": 0.00040966452742123685, | |
| "loss": 3.4385, | |
| "step": 21850 | |
| }, | |
| { | |
| "epoch": 6.379282218597064, | |
| "grad_norm": 0.3304464817047119, | |
| "learning_rate": 0.00040922695449241534, | |
| "loss": 3.4573, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 6.393847587974831, | |
| "grad_norm": 0.3388485014438629, | |
| "learning_rate": 0.0004087893815635939, | |
| "loss": 3.4549, | |
| "step": 21950 | |
| }, | |
| { | |
| "epoch": 6.408412957352598, | |
| "grad_norm": 0.36697396636009216, | |
| "learning_rate": 0.0004083518086347724, | |
| "loss": 3.4438, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 6.408412957352598, | |
| "eval_accuracy": 0.3643471363716102, | |
| "eval_loss": 3.600018262863159, | |
| "eval_runtime": 180.2847, | |
| "eval_samples_per_second": 92.321, | |
| "eval_steps_per_second": 5.774, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 6.422978326730366, | |
| "grad_norm": 0.3477044999599457, | |
| "learning_rate": 0.000407914235705951, | |
| "loss": 3.4598, | |
| "step": 22050 | |
| }, | |
| { | |
| "epoch": 6.437543696108134, | |
| "grad_norm": 0.32996484637260437, | |
| "learning_rate": 0.00040747666277712944, | |
| "loss": 3.4483, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 6.4521090654859, | |
| "grad_norm": 0.33145061135292053, | |
| "learning_rate": 0.00040703908984830804, | |
| "loss": 3.4543, | |
| "step": 22150 | |
| }, | |
| { | |
| "epoch": 6.466674434863668, | |
| "grad_norm": 0.33102595806121826, | |
| "learning_rate": 0.0004066015169194866, | |
| "loss": 3.437, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 6.481239804241436, | |
| "grad_norm": 0.34182071685791016, | |
| "learning_rate": 0.00040616394399066506, | |
| "loss": 3.4591, | |
| "step": 22250 | |
| }, | |
| { | |
| "epoch": 6.495805173619203, | |
| "grad_norm": 0.35360977053642273, | |
| "learning_rate": 0.0004057263710618436, | |
| "loss": 3.4661, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 6.51037054299697, | |
| "grad_norm": 0.34044864773750305, | |
| "learning_rate": 0.00040528879813302214, | |
| "loss": 3.4658, | |
| "step": 22350 | |
| }, | |
| { | |
| "epoch": 6.524935912374738, | |
| "grad_norm": 0.3496011793613434, | |
| "learning_rate": 0.00040485122520420063, | |
| "loss": 3.455, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 6.539501281752505, | |
| "grad_norm": 0.31914111971855164, | |
| "learning_rate": 0.00040441365227537917, | |
| "loss": 3.4606, | |
| "step": 22450 | |
| }, | |
| { | |
| "epoch": 6.554066651130273, | |
| "grad_norm": 0.32800233364105225, | |
| "learning_rate": 0.0004039760793465577, | |
| "loss": 3.4562, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 6.56863202050804, | |
| "grad_norm": 0.33165040612220764, | |
| "learning_rate": 0.0004035385064177363, | |
| "loss": 3.455, | |
| "step": 22550 | |
| }, | |
| { | |
| "epoch": 6.583197389885807, | |
| "grad_norm": 0.3741567134857178, | |
| "learning_rate": 0.0004031009334889148, | |
| "loss": 3.4562, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 6.597762759263575, | |
| "grad_norm": 0.35394638776779175, | |
| "learning_rate": 0.00040266336056009333, | |
| "loss": 3.4607, | |
| "step": 22650 | |
| }, | |
| { | |
| "epoch": 6.612328128641343, | |
| "grad_norm": 0.3237501084804535, | |
| "learning_rate": 0.00040222578763127187, | |
| "loss": 3.4658, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 6.626893498019109, | |
| "grad_norm": 0.34644386172294617, | |
| "learning_rate": 0.00040178821470245035, | |
| "loss": 3.4642, | |
| "step": 22750 | |
| }, | |
| { | |
| "epoch": 6.641458867396877, | |
| "grad_norm": 0.34503695368766785, | |
| "learning_rate": 0.0004013506417736289, | |
| "loss": 3.4739, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 6.656024236774645, | |
| "grad_norm": 0.3343126177787781, | |
| "learning_rate": 0.00040091306884480743, | |
| "loss": 3.4505, | |
| "step": 22850 | |
| }, | |
| { | |
| "epoch": 6.670589606152412, | |
| "grad_norm": 0.33412104845046997, | |
| "learning_rate": 0.0004004754959159859, | |
| "loss": 3.4603, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 6.685154975530179, | |
| "grad_norm": 0.32703226804733276, | |
| "learning_rate": 0.00040003792298716446, | |
| "loss": 3.4612, | |
| "step": 22950 | |
| }, | |
| { | |
| "epoch": 6.699720344907947, | |
| "grad_norm": 0.32835039496421814, | |
| "learning_rate": 0.00039960035005834305, | |
| "loss": 3.4622, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 6.699720344907947, | |
| "eval_accuracy": 0.36494254495311274, | |
| "eval_loss": 3.5912156105041504, | |
| "eval_runtime": 180.4184, | |
| "eval_samples_per_second": 92.252, | |
| "eval_steps_per_second": 5.77, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 6.714285714285714, | |
| "grad_norm": 0.3277016878128052, | |
| "learning_rate": 0.0003991627771295216, | |
| "loss": 3.4609, | |
| "step": 23050 | |
| }, | |
| { | |
| "epoch": 6.728851083663482, | |
| "grad_norm": 0.3436872363090515, | |
| "learning_rate": 0.0003987252042007001, | |
| "loss": 3.4605, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 6.743416453041249, | |
| "grad_norm": 0.32483038306236267, | |
| "learning_rate": 0.0003982876312718786, | |
| "loss": 3.468, | |
| "step": 23150 | |
| }, | |
| { | |
| "epoch": 6.7579818224190165, | |
| "grad_norm": 0.3559059500694275, | |
| "learning_rate": 0.00039785005834305716, | |
| "loss": 3.4694, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 6.772547191796784, | |
| "grad_norm": 0.34260398149490356, | |
| "learning_rate": 0.00039741248541423564, | |
| "loss": 3.4727, | |
| "step": 23250 | |
| }, | |
| { | |
| "epoch": 6.787112561174552, | |
| "grad_norm": 0.32523587346076965, | |
| "learning_rate": 0.0003969749124854142, | |
| "loss": 3.4571, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 6.8016779305523185, | |
| "grad_norm": 0.3347657322883606, | |
| "learning_rate": 0.0003965373395565927, | |
| "loss": 3.4717, | |
| "step": 23350 | |
| }, | |
| { | |
| "epoch": 6.816243299930086, | |
| "grad_norm": 0.33626583218574524, | |
| "learning_rate": 0.0003960997666277712, | |
| "loss": 3.4646, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 6.830808669307854, | |
| "grad_norm": 0.36179831624031067, | |
| "learning_rate": 0.0003956621936989498, | |
| "loss": 3.4717, | |
| "step": 23450 | |
| }, | |
| { | |
| "epoch": 6.845374038685621, | |
| "grad_norm": 0.34891805052757263, | |
| "learning_rate": 0.00039522462077012834, | |
| "loss": 3.4699, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 6.859939408063388, | |
| "grad_norm": 0.37656670808792114, | |
| "learning_rate": 0.0003947870478413069, | |
| "loss": 3.4674, | |
| "step": 23550 | |
| }, | |
| { | |
| "epoch": 6.874504777441156, | |
| "grad_norm": 0.3371601402759552, | |
| "learning_rate": 0.00039434947491248537, | |
| "loss": 3.4684, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 6.8890701468189235, | |
| "grad_norm": 0.3327315151691437, | |
| "learning_rate": 0.0003939119019836639, | |
| "loss": 3.4778, | |
| "step": 23650 | |
| }, | |
| { | |
| "epoch": 6.903635516196691, | |
| "grad_norm": 0.33458471298217773, | |
| "learning_rate": 0.00039347432905484245, | |
| "loss": 3.4688, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 6.918200885574458, | |
| "grad_norm": 0.3311387896537781, | |
| "learning_rate": 0.00039303675612602094, | |
| "loss": 3.4707, | |
| "step": 23750 | |
| }, | |
| { | |
| "epoch": 6.9327662549522255, | |
| "grad_norm": 0.33576178550720215, | |
| "learning_rate": 0.0003925991831971995, | |
| "loss": 3.4649, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 6.947331624329993, | |
| "grad_norm": 0.316501259803772, | |
| "learning_rate": 0.00039216161026837807, | |
| "loss": 3.4702, | |
| "step": 23850 | |
| }, | |
| { | |
| "epoch": 6.961896993707761, | |
| "grad_norm": 0.3234950006008148, | |
| "learning_rate": 0.00039172403733955656, | |
| "loss": 3.4551, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 6.976462363085528, | |
| "grad_norm": 0.34549012780189514, | |
| "learning_rate": 0.0003912864644107351, | |
| "loss": 3.4708, | |
| "step": 23950 | |
| }, | |
| { | |
| "epoch": 6.991027732463295, | |
| "grad_norm": 0.33629485964775085, | |
| "learning_rate": 0.00039084889148191364, | |
| "loss": 3.4761, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 6.991027732463295, | |
| "eval_accuracy": 0.3658422421224764, | |
| "eval_loss": 3.5796563625335693, | |
| "eval_runtime": 180.4423, | |
| "eval_samples_per_second": 92.24, | |
| "eval_steps_per_second": 5.769, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 7.005534840363552, | |
| "grad_norm": 0.3567199409008026, | |
| "learning_rate": 0.0003904113185530922, | |
| "loss": 3.4224, | |
| "step": 24050 | |
| }, | |
| { | |
| "epoch": 7.020100209741319, | |
| "grad_norm": 0.33759805560112, | |
| "learning_rate": 0.00038997374562427066, | |
| "loss": 3.3554, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 7.034665579119086, | |
| "grad_norm": 0.3463039696216583, | |
| "learning_rate": 0.0003895361726954492, | |
| "loss": 3.3629, | |
| "step": 24150 | |
| }, | |
| { | |
| "epoch": 7.049230948496854, | |
| "grad_norm": 0.34043920040130615, | |
| "learning_rate": 0.00038909859976662774, | |
| "loss": 3.3713, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 7.063796317874622, | |
| "grad_norm": 0.3372809886932373, | |
| "learning_rate": 0.0003886610268378062, | |
| "loss": 3.3729, | |
| "step": 24250 | |
| }, | |
| { | |
| "epoch": 7.0783616872523885, | |
| "grad_norm": 0.3626004159450531, | |
| "learning_rate": 0.0003882234539089848, | |
| "loss": 3.3779, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 7.092927056630156, | |
| "grad_norm": 0.3814680278301239, | |
| "learning_rate": 0.00038778588098016336, | |
| "loss": 3.3831, | |
| "step": 24350 | |
| }, | |
| { | |
| "epoch": 7.107492426007924, | |
| "grad_norm": 0.3421391248703003, | |
| "learning_rate": 0.00038734830805134185, | |
| "loss": 3.3799, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 7.122057795385691, | |
| "grad_norm": 0.34770506620407104, | |
| "learning_rate": 0.0003869107351225204, | |
| "loss": 3.3751, | |
| "step": 24450 | |
| }, | |
| { | |
| "epoch": 7.136623164763458, | |
| "grad_norm": 0.348093181848526, | |
| "learning_rate": 0.0003864731621936989, | |
| "loss": 3.3744, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 7.151188534141226, | |
| "grad_norm": 0.34899893403053284, | |
| "learning_rate": 0.00038603558926487747, | |
| "loss": 3.378, | |
| "step": 24550 | |
| }, | |
| { | |
| "epoch": 7.165753903518993, | |
| "grad_norm": 0.32636308670043945, | |
| "learning_rate": 0.00038559801633605595, | |
| "loss": 3.3811, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 7.180319272896761, | |
| "grad_norm": 0.32693901658058167, | |
| "learning_rate": 0.0003851604434072345, | |
| "loss": 3.3978, | |
| "step": 24650 | |
| }, | |
| { | |
| "epoch": 7.194884642274528, | |
| "grad_norm": 0.35337990522384644, | |
| "learning_rate": 0.0003847228704784131, | |
| "loss": 3.4016, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 7.2094500116522955, | |
| "grad_norm": 0.33998918533325195, | |
| "learning_rate": 0.0003842852975495915, | |
| "loss": 3.3913, | |
| "step": 24750 | |
| }, | |
| { | |
| "epoch": 7.224015381030063, | |
| "grad_norm": 0.34085580706596375, | |
| "learning_rate": 0.0003838477246207701, | |
| "loss": 3.3876, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 7.238580750407831, | |
| "grad_norm": 0.34505322575569153, | |
| "learning_rate": 0.00038341015169194865, | |
| "loss": 3.4013, | |
| "step": 24850 | |
| }, | |
| { | |
| "epoch": 7.2531461197855975, | |
| "grad_norm": 0.35665056109428406, | |
| "learning_rate": 0.00038297257876312714, | |
| "loss": 3.3945, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 7.267711489163365, | |
| "grad_norm": 0.33130306005477905, | |
| "learning_rate": 0.0003825350058343057, | |
| "loss": 3.3973, | |
| "step": 24950 | |
| }, | |
| { | |
| "epoch": 7.282276858541133, | |
| "grad_norm": 0.33717137575149536, | |
| "learning_rate": 0.0003820974329054842, | |
| "loss": 3.4035, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 7.282276858541133, | |
| "eval_accuracy": 0.36597134137652254, | |
| "eval_loss": 3.5882999897003174, | |
| "eval_runtime": 180.2331, | |
| "eval_samples_per_second": 92.347, | |
| "eval_steps_per_second": 5.776, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 7.2968422279189, | |
| "grad_norm": 0.343801349401474, | |
| "learning_rate": 0.00038165985997666276, | |
| "loss": 3.3994, | |
| "step": 25050 | |
| }, | |
| { | |
| "epoch": 7.311407597296667, | |
| "grad_norm": 0.34225597977638245, | |
| "learning_rate": 0.00038122228704784124, | |
| "loss": 3.4, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 7.325972966674435, | |
| "grad_norm": 0.3473186492919922, | |
| "learning_rate": 0.0003807847141190198, | |
| "loss": 3.397, | |
| "step": 25150 | |
| }, | |
| { | |
| "epoch": 7.3405383360522025, | |
| "grad_norm": 0.3287709653377533, | |
| "learning_rate": 0.0003803471411901984, | |
| "loss": 3.4058, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 7.35510370542997, | |
| "grad_norm": 0.351204514503479, | |
| "learning_rate": 0.00037990956826137686, | |
| "loss": 3.4163, | |
| "step": 25250 | |
| }, | |
| { | |
| "epoch": 7.369669074807737, | |
| "grad_norm": 0.35390347242355347, | |
| "learning_rate": 0.0003794719953325554, | |
| "loss": 3.4228, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 7.384234444185505, | |
| "grad_norm": 0.3401016891002655, | |
| "learning_rate": 0.00037903442240373394, | |
| "loss": 3.406, | |
| "step": 25350 | |
| }, | |
| { | |
| "epoch": 7.398799813563272, | |
| "grad_norm": 0.35391902923583984, | |
| "learning_rate": 0.00037859684947491243, | |
| "loss": 3.4097, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 7.413365182941039, | |
| "grad_norm": 0.342098206281662, | |
| "learning_rate": 0.00037815927654609097, | |
| "loss": 3.4266, | |
| "step": 25450 | |
| }, | |
| { | |
| "epoch": 7.427930552318807, | |
| "grad_norm": 0.34706324338912964, | |
| "learning_rate": 0.0003777217036172695, | |
| "loss": 3.4083, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 7.442495921696574, | |
| "grad_norm": 0.35251185297966003, | |
| "learning_rate": 0.00037728413068844805, | |
| "loss": 3.4125, | |
| "step": 25550 | |
| }, | |
| { | |
| "epoch": 7.457061291074342, | |
| "grad_norm": 0.3509294390678406, | |
| "learning_rate": 0.00037684655775962653, | |
| "loss": 3.4203, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 7.471626660452109, | |
| "grad_norm": 0.3560699224472046, | |
| "learning_rate": 0.00037640898483080513, | |
| "loss": 3.4272, | |
| "step": 25650 | |
| }, | |
| { | |
| "epoch": 7.486192029829876, | |
| "grad_norm": 0.34861257672309875, | |
| "learning_rate": 0.00037597141190198367, | |
| "loss": 3.4182, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 7.500757399207644, | |
| "grad_norm": 0.33859142661094666, | |
| "learning_rate": 0.00037553383897316215, | |
| "loss": 3.4299, | |
| "step": 25750 | |
| }, | |
| { | |
| "epoch": 7.515322768585412, | |
| "grad_norm": 0.35380759835243225, | |
| "learning_rate": 0.0003750962660443407, | |
| "loss": 3.4201, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 7.529888137963178, | |
| "grad_norm": 0.34941068291664124, | |
| "learning_rate": 0.00037465869311551923, | |
| "loss": 3.4167, | |
| "step": 25850 | |
| }, | |
| { | |
| "epoch": 7.544453507340946, | |
| "grad_norm": 0.35646477341651917, | |
| "learning_rate": 0.0003742211201866977, | |
| "loss": 3.4306, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 7.559018876718714, | |
| "grad_norm": 0.35378143191337585, | |
| "learning_rate": 0.00037378354725787626, | |
| "loss": 3.4086, | |
| "step": 25950 | |
| }, | |
| { | |
| "epoch": 7.573584246096481, | |
| "grad_norm": 0.3527311384677887, | |
| "learning_rate": 0.0003733459743290548, | |
| "loss": 3.4207, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 7.573584246096481, | |
| "eval_accuracy": 0.36644117800600207, | |
| "eval_loss": 3.5814883708953857, | |
| "eval_runtime": 180.4499, | |
| "eval_samples_per_second": 92.236, | |
| "eval_steps_per_second": 5.769, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 7.588149615474248, | |
| "grad_norm": 0.3543234169483185, | |
| "learning_rate": 0.0003729084014002334, | |
| "loss": 3.4278, | |
| "step": 26050 | |
| }, | |
| { | |
| "epoch": 7.602714984852016, | |
| "grad_norm": 0.35281285643577576, | |
| "learning_rate": 0.0003724708284714119, | |
| "loss": 3.421, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 7.617280354229783, | |
| "grad_norm": 0.3394710123538971, | |
| "learning_rate": 0.0003720332555425904, | |
| "loss": 3.4238, | |
| "step": 26150 | |
| }, | |
| { | |
| "epoch": 7.631845723607551, | |
| "grad_norm": 0.34304413199424744, | |
| "learning_rate": 0.00037159568261376896, | |
| "loss": 3.4145, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 7.646411092985318, | |
| "grad_norm": 0.3429325520992279, | |
| "learning_rate": 0.00037115810968494744, | |
| "loss": 3.4268, | |
| "step": 26250 | |
| }, | |
| { | |
| "epoch": 7.660976462363085, | |
| "grad_norm": 0.3383738100528717, | |
| "learning_rate": 0.000370720536756126, | |
| "loss": 3.4228, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 7.675541831740853, | |
| "grad_norm": 0.3476937413215637, | |
| "learning_rate": 0.0003702829638273045, | |
| "loss": 3.4207, | |
| "step": 26350 | |
| }, | |
| { | |
| "epoch": 7.690107201118621, | |
| "grad_norm": 0.342341810464859, | |
| "learning_rate": 0.000369845390898483, | |
| "loss": 3.4207, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 7.704672570496387, | |
| "grad_norm": 0.35490912199020386, | |
| "learning_rate": 0.00036940781796966155, | |
| "loss": 3.4281, | |
| "step": 26450 | |
| }, | |
| { | |
| "epoch": 7.719237939874155, | |
| "grad_norm": 0.34896060824394226, | |
| "learning_rate": 0.00036897024504084014, | |
| "loss": 3.4361, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 7.733803309251923, | |
| "grad_norm": 0.3462172746658325, | |
| "learning_rate": 0.0003685326721120187, | |
| "loss": 3.425, | |
| "step": 26550 | |
| }, | |
| { | |
| "epoch": 7.74836867862969, | |
| "grad_norm": 0.35829275846481323, | |
| "learning_rate": 0.00036809509918319717, | |
| "loss": 3.4346, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 7.762934048007457, | |
| "grad_norm": 0.3367747664451599, | |
| "learning_rate": 0.0003676575262543757, | |
| "loss": 3.4247, | |
| "step": 26650 | |
| }, | |
| { | |
| "epoch": 7.777499417385225, | |
| "grad_norm": 0.33087530732154846, | |
| "learning_rate": 0.00036721995332555425, | |
| "loss": 3.432, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 7.792064786762992, | |
| "grad_norm": 0.3543736934661865, | |
| "learning_rate": 0.00036678238039673274, | |
| "loss": 3.4379, | |
| "step": 26750 | |
| }, | |
| { | |
| "epoch": 7.80663015614076, | |
| "grad_norm": 0.3304196894168854, | |
| "learning_rate": 0.0003663448074679113, | |
| "loss": 3.4238, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 7.821195525518527, | |
| "grad_norm": 0.35223904252052307, | |
| "learning_rate": 0.0003659072345390898, | |
| "loss": 3.426, | |
| "step": 26850 | |
| }, | |
| { | |
| "epoch": 7.8357608948962945, | |
| "grad_norm": 0.34050217270851135, | |
| "learning_rate": 0.0003654696616102683, | |
| "loss": 3.4172, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 7.850326264274062, | |
| "grad_norm": 0.3450503349304199, | |
| "learning_rate": 0.00036503208868144684, | |
| "loss": 3.4337, | |
| "step": 26950 | |
| }, | |
| { | |
| "epoch": 7.86489163365183, | |
| "grad_norm": 0.3508300483226776, | |
| "learning_rate": 0.00036459451575262543, | |
| "loss": 3.4351, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 7.86489163365183, | |
| "eval_accuracy": 0.36722659058981666, | |
| "eval_loss": 3.571290969848633, | |
| "eval_runtime": 180.2666, | |
| "eval_samples_per_second": 92.33, | |
| "eval_steps_per_second": 5.775, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 7.8794570030295965, | |
| "grad_norm": 0.35574260354042053, | |
| "learning_rate": 0.000364156942823804, | |
| "loss": 3.4341, | |
| "step": 27050 | |
| }, | |
| { | |
| "epoch": 7.894022372407364, | |
| "grad_norm": 0.34330523014068604, | |
| "learning_rate": 0.00036371936989498246, | |
| "loss": 3.4396, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 7.908587741785132, | |
| "grad_norm": 0.35326018929481506, | |
| "learning_rate": 0.000363281796966161, | |
| "loss": 3.4192, | |
| "step": 27150 | |
| }, | |
| { | |
| "epoch": 7.923153111162899, | |
| "grad_norm": 0.356656938791275, | |
| "learning_rate": 0.00036284422403733954, | |
| "loss": 3.4359, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 7.937718480540666, | |
| "grad_norm": 0.32995936274528503, | |
| "learning_rate": 0.000362406651108518, | |
| "loss": 3.4363, | |
| "step": 27250 | |
| }, | |
| { | |
| "epoch": 7.952283849918434, | |
| "grad_norm": 0.3421317934989929, | |
| "learning_rate": 0.00036196907817969657, | |
| "loss": 3.4263, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 7.9668492192962015, | |
| "grad_norm": 0.33741775155067444, | |
| "learning_rate": 0.0003615315052508751, | |
| "loss": 3.4332, | |
| "step": 27350 | |
| }, | |
| { | |
| "epoch": 7.981414588673969, | |
| "grad_norm": 0.34820324182510376, | |
| "learning_rate": 0.0003610939323220536, | |
| "loss": 3.4311, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 7.995979958051736, | |
| "grad_norm": 0.36536943912506104, | |
| "learning_rate": 0.0003606563593932322, | |
| "loss": 3.4287, | |
| "step": 27450 | |
| }, | |
| { | |
| "epoch": 8.010487065951992, | |
| "grad_norm": 0.3467245399951935, | |
| "learning_rate": 0.0003602187864644107, | |
| "loss": 3.3578, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 8.02505243532976, | |
| "grad_norm": 0.36606982350349426, | |
| "learning_rate": 0.00035978121353558927, | |
| "loss": 3.3254, | |
| "step": 27550 | |
| }, | |
| { | |
| "epoch": 8.039617804707527, | |
| "grad_norm": 0.370090126991272, | |
| "learning_rate": 0.00035934364060676775, | |
| "loss": 3.3324, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 8.054183174085296, | |
| "grad_norm": 0.3692021667957306, | |
| "learning_rate": 0.0003589060676779463, | |
| "loss": 3.3385, | |
| "step": 27650 | |
| }, | |
| { | |
| "epoch": 8.068748543463062, | |
| "grad_norm": 0.35137107968330383, | |
| "learning_rate": 0.00035846849474912483, | |
| "loss": 3.3448, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 8.08331391284083, | |
| "grad_norm": 0.3459080755710602, | |
| "learning_rate": 0.0003580309218203033, | |
| "loss": 3.3318, | |
| "step": 27750 | |
| }, | |
| { | |
| "epoch": 8.097879282218598, | |
| "grad_norm": 0.35793742537498474, | |
| "learning_rate": 0.00035759334889148186, | |
| "loss": 3.3345, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 8.112444651596364, | |
| "grad_norm": 0.35751616954803467, | |
| "learning_rate": 0.00035715577596266045, | |
| "loss": 3.3613, | |
| "step": 27850 | |
| }, | |
| { | |
| "epoch": 8.127010020974131, | |
| "grad_norm": 0.3466125428676605, | |
| "learning_rate": 0.00035671820303383894, | |
| "loss": 3.3478, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 8.1415753903519, | |
| "grad_norm": 0.3528430759906769, | |
| "learning_rate": 0.0003562806301050175, | |
| "loss": 3.3564, | |
| "step": 27950 | |
| }, | |
| { | |
| "epoch": 8.156140759729666, | |
| "grad_norm": 0.36010900139808655, | |
| "learning_rate": 0.000355843057176196, | |
| "loss": 3.3456, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 8.156140759729666, | |
| "eval_accuracy": 0.36759131361900715, | |
| "eval_loss": 3.580268144607544, | |
| "eval_runtime": 180.2651, | |
| "eval_samples_per_second": 92.331, | |
| "eval_steps_per_second": 5.775, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 8.170706129107435, | |
| "grad_norm": 0.3616182804107666, | |
| "learning_rate": 0.00035540548424737456, | |
| "loss": 3.3554, | |
| "step": 28050 | |
| }, | |
| { | |
| "epoch": 8.185271498485202, | |
| "grad_norm": 0.3429206311702728, | |
| "learning_rate": 0.00035496791131855304, | |
| "loss": 3.3689, | |
| "step": 28100 | |
| }, | |
| { | |
| "epoch": 8.199836867862969, | |
| "grad_norm": 0.3601152300834656, | |
| "learning_rate": 0.0003545303383897316, | |
| "loss": 3.3647, | |
| "step": 28150 | |
| }, | |
| { | |
| "epoch": 8.214402237240737, | |
| "grad_norm": 0.346986323595047, | |
| "learning_rate": 0.0003540927654609101, | |
| "loss": 3.3639, | |
| "step": 28200 | |
| }, | |
| { | |
| "epoch": 8.228967606618504, | |
| "grad_norm": 0.3525499105453491, | |
| "learning_rate": 0.0003536551925320886, | |
| "loss": 3.3606, | |
| "step": 28250 | |
| }, | |
| { | |
| "epoch": 8.24353297599627, | |
| "grad_norm": 0.3487248420715332, | |
| "learning_rate": 0.0003532176196032672, | |
| "loss": 3.3582, | |
| "step": 28300 | |
| }, | |
| { | |
| "epoch": 8.258098345374039, | |
| "grad_norm": 0.3517068028450012, | |
| "learning_rate": 0.00035278004667444574, | |
| "loss": 3.3674, | |
| "step": 28350 | |
| }, | |
| { | |
| "epoch": 8.272663714751806, | |
| "grad_norm": 0.3672351837158203, | |
| "learning_rate": 0.00035234247374562423, | |
| "loss": 3.3631, | |
| "step": 28400 | |
| }, | |
| { | |
| "epoch": 8.287229084129574, | |
| "grad_norm": 0.3655698001384735, | |
| "learning_rate": 0.00035190490081680277, | |
| "loss": 3.3605, | |
| "step": 28450 | |
| }, | |
| { | |
| "epoch": 8.301794453507341, | |
| "grad_norm": 0.3492221534252167, | |
| "learning_rate": 0.0003514673278879813, | |
| "loss": 3.3721, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 8.316359822885108, | |
| "grad_norm": 0.33222460746765137, | |
| "learning_rate": 0.00035102975495915985, | |
| "loss": 3.3655, | |
| "step": 28550 | |
| }, | |
| { | |
| "epoch": 8.330925192262876, | |
| "grad_norm": 0.3473198115825653, | |
| "learning_rate": 0.00035059218203033833, | |
| "loss": 3.3765, | |
| "step": 28600 | |
| }, | |
| { | |
| "epoch": 8.345490561640643, | |
| "grad_norm": 0.3525267541408539, | |
| "learning_rate": 0.0003501546091015169, | |
| "loss": 3.3692, | |
| "step": 28650 | |
| }, | |
| { | |
| "epoch": 8.36005593101841, | |
| "grad_norm": 0.3442334234714508, | |
| "learning_rate": 0.00034971703617269547, | |
| "loss": 3.3892, | |
| "step": 28700 | |
| }, | |
| { | |
| "epoch": 8.374621300396178, | |
| "grad_norm": 0.35674968361854553, | |
| "learning_rate": 0.0003492794632438739, | |
| "loss": 3.378, | |
| "step": 28750 | |
| }, | |
| { | |
| "epoch": 8.389186669773945, | |
| "grad_norm": 0.34580376744270325, | |
| "learning_rate": 0.0003488418903150525, | |
| "loss": 3.3797, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 8.403752039151712, | |
| "grad_norm": 0.34604698419570923, | |
| "learning_rate": 0.00034840431738623103, | |
| "loss": 3.3674, | |
| "step": 28850 | |
| }, | |
| { | |
| "epoch": 8.41831740852948, | |
| "grad_norm": 0.3592158854007721, | |
| "learning_rate": 0.0003479667444574095, | |
| "loss": 3.377, | |
| "step": 28900 | |
| }, | |
| { | |
| "epoch": 8.432882777907247, | |
| "grad_norm": 0.359283447265625, | |
| "learning_rate": 0.00034752917152858806, | |
| "loss": 3.3884, | |
| "step": 28950 | |
| }, | |
| { | |
| "epoch": 8.447448147285016, | |
| "grad_norm": 0.34626027941703796, | |
| "learning_rate": 0.0003470915985997666, | |
| "loss": 3.379, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 8.447448147285016, | |
| "eval_accuracy": 0.3679311103805677, | |
| "eval_loss": 3.57328462600708, | |
| "eval_runtime": 180.157, | |
| "eval_samples_per_second": 92.386, | |
| "eval_steps_per_second": 5.778, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 8.462013516662783, | |
| "grad_norm": 0.3666757643222809, | |
| "learning_rate": 0.00034665402567094514, | |
| "loss": 3.3796, | |
| "step": 29050 | |
| }, | |
| { | |
| "epoch": 8.47657888604055, | |
| "grad_norm": 0.3576624393463135, | |
| "learning_rate": 0.0003462164527421236, | |
| "loss": 3.3846, | |
| "step": 29100 | |
| }, | |
| { | |
| "epoch": 8.491144255418318, | |
| "grad_norm": 0.36986008286476135, | |
| "learning_rate": 0.00034577887981330216, | |
| "loss": 3.3789, | |
| "step": 29150 | |
| }, | |
| { | |
| "epoch": 8.505709624796085, | |
| "grad_norm": 0.34708988666534424, | |
| "learning_rate": 0.00034534130688448076, | |
| "loss": 3.3752, | |
| "step": 29200 | |
| }, | |
| { | |
| "epoch": 8.520274994173853, | |
| "grad_norm": 0.36563989520072937, | |
| "learning_rate": 0.00034490373395565924, | |
| "loss": 3.3733, | |
| "step": 29250 | |
| }, | |
| { | |
| "epoch": 8.53484036355162, | |
| "grad_norm": 0.36509010195732117, | |
| "learning_rate": 0.0003444661610268378, | |
| "loss": 3.3893, | |
| "step": 29300 | |
| }, | |
| { | |
| "epoch": 8.549405732929387, | |
| "grad_norm": 0.3598864674568176, | |
| "learning_rate": 0.0003440285880980163, | |
| "loss": 3.3822, | |
| "step": 29350 | |
| }, | |
| { | |
| "epoch": 8.563971102307155, | |
| "grad_norm": 0.3573833405971527, | |
| "learning_rate": 0.0003435910151691948, | |
| "loss": 3.3875, | |
| "step": 29400 | |
| }, | |
| { | |
| "epoch": 8.578536471684922, | |
| "grad_norm": 0.33812621235847473, | |
| "learning_rate": 0.00034315344224037335, | |
| "loss": 3.3907, | |
| "step": 29450 | |
| }, | |
| { | |
| "epoch": 8.593101841062689, | |
| "grad_norm": 0.3446572422981262, | |
| "learning_rate": 0.0003427158693115519, | |
| "loss": 3.3985, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 8.607667210440457, | |
| "grad_norm": 0.34378212690353394, | |
| "learning_rate": 0.00034227829638273043, | |
| "loss": 3.3868, | |
| "step": 29550 | |
| }, | |
| { | |
| "epoch": 8.622232579818224, | |
| "grad_norm": 0.35360199213027954, | |
| "learning_rate": 0.0003418407234539089, | |
| "loss": 3.4024, | |
| "step": 29600 | |
| }, | |
| { | |
| "epoch": 8.63679794919599, | |
| "grad_norm": 0.35581034421920776, | |
| "learning_rate": 0.0003414031505250875, | |
| "loss": 3.3898, | |
| "step": 29650 | |
| }, | |
| { | |
| "epoch": 8.65136331857376, | |
| "grad_norm": 0.35615333914756775, | |
| "learning_rate": 0.00034096557759626605, | |
| "loss": 3.3959, | |
| "step": 29700 | |
| }, | |
| { | |
| "epoch": 8.665928687951526, | |
| "grad_norm": 0.35061442852020264, | |
| "learning_rate": 0.00034052800466744453, | |
| "loss": 3.39, | |
| "step": 29750 | |
| }, | |
| { | |
| "epoch": 8.680494057329295, | |
| "grad_norm": 0.3618820905685425, | |
| "learning_rate": 0.0003400904317386231, | |
| "loss": 3.3843, | |
| "step": 29800 | |
| }, | |
| { | |
| "epoch": 8.695059426707061, | |
| "grad_norm": 0.34694838523864746, | |
| "learning_rate": 0.0003396528588098016, | |
| "loss": 3.3842, | |
| "step": 29850 | |
| }, | |
| { | |
| "epoch": 8.709624796084828, | |
| "grad_norm": 0.34534159302711487, | |
| "learning_rate": 0.0003392152858809801, | |
| "loss": 3.392, | |
| "step": 29900 | |
| }, | |
| { | |
| "epoch": 8.724190165462597, | |
| "grad_norm": 0.35484299063682556, | |
| "learning_rate": 0.00033877771295215864, | |
| "loss": 3.3917, | |
| "step": 29950 | |
| }, | |
| { | |
| "epoch": 8.738755534840363, | |
| "grad_norm": 0.34446921944618225, | |
| "learning_rate": 0.0003383401400233372, | |
| "loss": 3.3926, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 8.738755534840363, | |
| "eval_accuracy": 0.3686057656808563, | |
| "eval_loss": 3.563647985458374, | |
| "eval_runtime": 180.6198, | |
| "eval_samples_per_second": 92.149, | |
| "eval_steps_per_second": 5.763, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 8.753320904218132, | |
| "grad_norm": 0.38022834062576294, | |
| "learning_rate": 0.0003379025670945158, | |
| "loss": 3.3971, | |
| "step": 30050 | |
| }, | |
| { | |
| "epoch": 8.767886273595899, | |
| "grad_norm": 0.3438722491264343, | |
| "learning_rate": 0.00033746499416569426, | |
| "loss": 3.3964, | |
| "step": 30100 | |
| }, | |
| { | |
| "epoch": 8.782451642973665, | |
| "grad_norm": 0.3680688142776489, | |
| "learning_rate": 0.0003370274212368728, | |
| "loss": 3.4007, | |
| "step": 30150 | |
| }, | |
| { | |
| "epoch": 8.797017012351434, | |
| "grad_norm": 0.3484303653240204, | |
| "learning_rate": 0.00033658984830805134, | |
| "loss": 3.3908, | |
| "step": 30200 | |
| }, | |
| { | |
| "epoch": 8.8115823817292, | |
| "grad_norm": 0.34887251257896423, | |
| "learning_rate": 0.0003361522753792298, | |
| "loss": 3.3864, | |
| "step": 30250 | |
| }, | |
| { | |
| "epoch": 8.826147751106967, | |
| "grad_norm": 0.38353490829467773, | |
| "learning_rate": 0.00033571470245040837, | |
| "loss": 3.3924, | |
| "step": 30300 | |
| }, | |
| { | |
| "epoch": 8.840713120484736, | |
| "grad_norm": 0.3619522750377655, | |
| "learning_rate": 0.0003352771295215869, | |
| "loss": 3.3943, | |
| "step": 30350 | |
| }, | |
| { | |
| "epoch": 8.855278489862503, | |
| "grad_norm": 0.34908801317214966, | |
| "learning_rate": 0.0003348395565927654, | |
| "loss": 3.4001, | |
| "step": 30400 | |
| }, | |
| { | |
| "epoch": 8.86984385924027, | |
| "grad_norm": 0.3426980972290039, | |
| "learning_rate": 0.00033440198366394393, | |
| "loss": 3.3924, | |
| "step": 30450 | |
| }, | |
| { | |
| "epoch": 8.884409228618038, | |
| "grad_norm": 0.3510425090789795, | |
| "learning_rate": 0.0003339644107351225, | |
| "loss": 3.3992, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 8.898974597995805, | |
| "grad_norm": 0.3653899133205414, | |
| "learning_rate": 0.00033352683780630107, | |
| "loss": 3.3945, | |
| "step": 30550 | |
| }, | |
| { | |
| "epoch": 8.913539967373573, | |
| "grad_norm": 0.3542352318763733, | |
| "learning_rate": 0.00033308926487747955, | |
| "loss": 3.4124, | |
| "step": 30600 | |
| }, | |
| { | |
| "epoch": 8.92810533675134, | |
| "grad_norm": 0.3585004508495331, | |
| "learning_rate": 0.0003326516919486581, | |
| "loss": 3.3896, | |
| "step": 30650 | |
| }, | |
| { | |
| "epoch": 8.942670706129107, | |
| "grad_norm": 0.3748304843902588, | |
| "learning_rate": 0.00033221411901983663, | |
| "loss": 3.3986, | |
| "step": 30700 | |
| }, | |
| { | |
| "epoch": 8.957236075506875, | |
| "grad_norm": 0.3452504277229309, | |
| "learning_rate": 0.0003317765460910151, | |
| "loss": 3.3955, | |
| "step": 30750 | |
| }, | |
| { | |
| "epoch": 8.971801444884642, | |
| "grad_norm": 0.3590230941772461, | |
| "learning_rate": 0.00033133897316219366, | |
| "loss": 3.396, | |
| "step": 30800 | |
| }, | |
| { | |
| "epoch": 8.986366814262409, | |
| "grad_norm": 0.3469237685203552, | |
| "learning_rate": 0.0003309014002333722, | |
| "loss": 3.3886, | |
| "step": 30850 | |
| }, | |
| { | |
| "epoch": 9.000873922162667, | |
| "grad_norm": 0.34952783584594727, | |
| "learning_rate": 0.0003304638273045507, | |
| "loss": 3.3905, | |
| "step": 30900 | |
| }, | |
| { | |
| "epoch": 9.015439291540433, | |
| "grad_norm": 0.3712822198867798, | |
| "learning_rate": 0.0003300262543757292, | |
| "loss": 3.2828, | |
| "step": 30950 | |
| }, | |
| { | |
| "epoch": 9.0300046609182, | |
| "grad_norm": 0.371115505695343, | |
| "learning_rate": 0.0003295886814469078, | |
| "loss": 3.305, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 9.0300046609182, | |
| "eval_accuracy": 0.36886643330036484, | |
| "eval_loss": 3.5686051845550537, | |
| "eval_runtime": 181.9038, | |
| "eval_samples_per_second": 91.499, | |
| "eval_steps_per_second": 5.723, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 9.044570030295969, | |
| "grad_norm": 0.3616848289966583, | |
| "learning_rate": 0.00032915110851808636, | |
| "loss": 3.2979, | |
| "step": 31050 | |
| }, | |
| { | |
| "epoch": 9.059135399673735, | |
| "grad_norm": 0.3446025550365448, | |
| "learning_rate": 0.00032871353558926484, | |
| "loss": 3.3074, | |
| "step": 31100 | |
| }, | |
| { | |
| "epoch": 9.073700769051504, | |
| "grad_norm": 0.36741337180137634, | |
| "learning_rate": 0.0003282759626604434, | |
| "loss": 3.2965, | |
| "step": 31150 | |
| }, | |
| { | |
| "epoch": 9.08826613842927, | |
| "grad_norm": 0.3401558995246887, | |
| "learning_rate": 0.0003278383897316219, | |
| "loss": 3.305, | |
| "step": 31200 | |
| }, | |
| { | |
| "epoch": 9.102831507807037, | |
| "grad_norm": 0.364580363035202, | |
| "learning_rate": 0.0003274008168028004, | |
| "loss": 3.3159, | |
| "step": 31250 | |
| }, | |
| { | |
| "epoch": 9.117396877184806, | |
| "grad_norm": 0.3511641025543213, | |
| "learning_rate": 0.00032696324387397895, | |
| "loss": 3.3232, | |
| "step": 31300 | |
| }, | |
| { | |
| "epoch": 9.131962246562573, | |
| "grad_norm": 0.3950608968734741, | |
| "learning_rate": 0.0003265256709451575, | |
| "loss": 3.3135, | |
| "step": 31350 | |
| }, | |
| { | |
| "epoch": 9.14652761594034, | |
| "grad_norm": 0.35723525285720825, | |
| "learning_rate": 0.00032608809801633597, | |
| "loss": 3.3117, | |
| "step": 31400 | |
| }, | |
| { | |
| "epoch": 9.161092985318108, | |
| "grad_norm": 0.38060232996940613, | |
| "learning_rate": 0.00032565052508751457, | |
| "loss": 3.3211, | |
| "step": 31450 | |
| }, | |
| { | |
| "epoch": 9.175658354695875, | |
| "grad_norm": 0.3600994944572449, | |
| "learning_rate": 0.0003252129521586931, | |
| "loss": 3.3195, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 9.190223724073643, | |
| "grad_norm": 0.38114050030708313, | |
| "learning_rate": 0.00032477537922987165, | |
| "loss": 3.3272, | |
| "step": 31550 | |
| }, | |
| { | |
| "epoch": 9.20478909345141, | |
| "grad_norm": 0.35321810841560364, | |
| "learning_rate": 0.00032433780630105013, | |
| "loss": 3.3198, | |
| "step": 31600 | |
| }, | |
| { | |
| "epoch": 9.219354462829177, | |
| "grad_norm": 0.3709951639175415, | |
| "learning_rate": 0.00032390023337222867, | |
| "loss": 3.3285, | |
| "step": 31650 | |
| }, | |
| { | |
| "epoch": 9.233919832206945, | |
| "grad_norm": 0.3782629668712616, | |
| "learning_rate": 0.0003234626604434072, | |
| "loss": 3.3391, | |
| "step": 31700 | |
| }, | |
| { | |
| "epoch": 9.248485201584712, | |
| "grad_norm": 0.3684213161468506, | |
| "learning_rate": 0.0003230250875145857, | |
| "loss": 3.3321, | |
| "step": 31750 | |
| }, | |
| { | |
| "epoch": 9.263050570962479, | |
| "grad_norm": 0.37817445397377014, | |
| "learning_rate": 0.00032258751458576424, | |
| "loss": 3.3253, | |
| "step": 31800 | |
| }, | |
| { | |
| "epoch": 9.277615940340247, | |
| "grad_norm": 0.37698403000831604, | |
| "learning_rate": 0.00032214994165694283, | |
| "loss": 3.3263, | |
| "step": 31850 | |
| }, | |
| { | |
| "epoch": 9.292181309718014, | |
| "grad_norm": 0.35898423194885254, | |
| "learning_rate": 0.0003217123687281213, | |
| "loss": 3.3407, | |
| "step": 31900 | |
| }, | |
| { | |
| "epoch": 9.306746679095783, | |
| "grad_norm": 0.37121668457984924, | |
| "learning_rate": 0.00032127479579929986, | |
| "loss": 3.3469, | |
| "step": 31950 | |
| }, | |
| { | |
| "epoch": 9.32131204847355, | |
| "grad_norm": 0.3602243959903717, | |
| "learning_rate": 0.0003208372228704784, | |
| "loss": 3.333, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 9.32131204847355, | |
| "eval_accuracy": 0.36881940260672325, | |
| "eval_loss": 3.568206787109375, | |
| "eval_runtime": 181.6399, | |
| "eval_samples_per_second": 91.632, | |
| "eval_steps_per_second": 5.731, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 9.335877417851316, | |
| "grad_norm": 0.3777805268764496, | |
| "learning_rate": 0.00032039964994165694, | |
| "loss": 3.3404, | |
| "step": 32050 | |
| }, | |
| { | |
| "epoch": 9.350442787229085, | |
| "grad_norm": 0.36622655391693115, | |
| "learning_rate": 0.0003199620770128354, | |
| "loss": 3.3462, | |
| "step": 32100 | |
| }, | |
| { | |
| "epoch": 9.365008156606851, | |
| "grad_norm": 0.3432258069515228, | |
| "learning_rate": 0.00031952450408401396, | |
| "loss": 3.3499, | |
| "step": 32150 | |
| }, | |
| { | |
| "epoch": 9.379573525984618, | |
| "grad_norm": 0.3571391999721527, | |
| "learning_rate": 0.0003190869311551925, | |
| "loss": 3.3437, | |
| "step": 32200 | |
| }, | |
| { | |
| "epoch": 9.394138895362387, | |
| "grad_norm": 0.3796580731868744, | |
| "learning_rate": 0.000318649358226371, | |
| "loss": 3.3445, | |
| "step": 32250 | |
| }, | |
| { | |
| "epoch": 9.408704264740154, | |
| "grad_norm": 0.3999924659729004, | |
| "learning_rate": 0.0003182117852975496, | |
| "loss": 3.3398, | |
| "step": 32300 | |
| }, | |
| { | |
| "epoch": 9.423269634117922, | |
| "grad_norm": 0.3521633744239807, | |
| "learning_rate": 0.0003177742123687281, | |
| "loss": 3.3518, | |
| "step": 32350 | |
| }, | |
| { | |
| "epoch": 9.437835003495689, | |
| "grad_norm": 0.34816059470176697, | |
| "learning_rate": 0.0003173366394399066, | |
| "loss": 3.3498, | |
| "step": 32400 | |
| }, | |
| { | |
| "epoch": 9.452400372873456, | |
| "grad_norm": 0.3519940674304962, | |
| "learning_rate": 0.00031689906651108515, | |
| "loss": 3.3491, | |
| "step": 32450 | |
| }, | |
| { | |
| "epoch": 9.466965742251224, | |
| "grad_norm": 0.366641104221344, | |
| "learning_rate": 0.0003164614935822637, | |
| "loss": 3.3575, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 9.48153111162899, | |
| "grad_norm": 0.3859027028083801, | |
| "learning_rate": 0.00031602392065344223, | |
| "loss": 3.3477, | |
| "step": 32550 | |
| }, | |
| { | |
| "epoch": 9.496096481006758, | |
| "grad_norm": 0.3514662981033325, | |
| "learning_rate": 0.0003155863477246207, | |
| "loss": 3.3536, | |
| "step": 32600 | |
| }, | |
| { | |
| "epoch": 9.510661850384526, | |
| "grad_norm": 0.37433597445487976, | |
| "learning_rate": 0.00031514877479579925, | |
| "loss": 3.3606, | |
| "step": 32650 | |
| }, | |
| { | |
| "epoch": 9.525227219762293, | |
| "grad_norm": 0.3747974932193756, | |
| "learning_rate": 0.00031471120186697785, | |
| "loss": 3.3621, | |
| "step": 32700 | |
| }, | |
| { | |
| "epoch": 9.53979258914006, | |
| "grad_norm": 0.38271790742874146, | |
| "learning_rate": 0.00031427362893815633, | |
| "loss": 3.3599, | |
| "step": 32750 | |
| }, | |
| { | |
| "epoch": 9.554357958517828, | |
| "grad_norm": 0.3738161027431488, | |
| "learning_rate": 0.0003138360560093349, | |
| "loss": 3.35, | |
| "step": 32800 | |
| }, | |
| { | |
| "epoch": 9.568923327895595, | |
| "grad_norm": 0.37082603573799133, | |
| "learning_rate": 0.0003133984830805134, | |
| "loss": 3.3593, | |
| "step": 32850 | |
| }, | |
| { | |
| "epoch": 9.583488697273363, | |
| "grad_norm": 0.38742882013320923, | |
| "learning_rate": 0.0003129609101516919, | |
| "loss": 3.3513, | |
| "step": 32900 | |
| }, | |
| { | |
| "epoch": 9.59805406665113, | |
| "grad_norm": 0.36848726868629456, | |
| "learning_rate": 0.00031252333722287044, | |
| "loss": 3.3619, | |
| "step": 32950 | |
| }, | |
| { | |
| "epoch": 9.612619436028897, | |
| "grad_norm": 0.34450680017471313, | |
| "learning_rate": 0.000312085764294049, | |
| "loss": 3.3523, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 9.612619436028897, | |
| "eval_accuracy": 0.36961469163620253, | |
| "eval_loss": 3.5594406127929688, | |
| "eval_runtime": 181.5027, | |
| "eval_samples_per_second": 91.701, | |
| "eval_steps_per_second": 5.735, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 9.627184805406666, | |
| "grad_norm": 0.34740373492240906, | |
| "learning_rate": 0.0003116481913652275, | |
| "loss": 3.3523, | |
| "step": 33050 | |
| }, | |
| { | |
| "epoch": 9.641750174784432, | |
| "grad_norm": 0.3639390468597412, | |
| "learning_rate": 0.000311210618436406, | |
| "loss": 3.3569, | |
| "step": 33100 | |
| }, | |
| { | |
| "epoch": 9.6563155441622, | |
| "grad_norm": 0.3668532073497772, | |
| "learning_rate": 0.0003107730455075846, | |
| "loss": 3.3582, | |
| "step": 33150 | |
| }, | |
| { | |
| "epoch": 9.670880913539968, | |
| "grad_norm": 0.3689277172088623, | |
| "learning_rate": 0.00031033547257876314, | |
| "loss": 3.3615, | |
| "step": 33200 | |
| }, | |
| { | |
| "epoch": 9.685446282917734, | |
| "grad_norm": 0.3605565130710602, | |
| "learning_rate": 0.0003098978996499416, | |
| "loss": 3.3501, | |
| "step": 33250 | |
| }, | |
| { | |
| "epoch": 9.700011652295503, | |
| "grad_norm": 0.3678613007068634, | |
| "learning_rate": 0.00030946032672112016, | |
| "loss": 3.3653, | |
| "step": 33300 | |
| }, | |
| { | |
| "epoch": 9.71457702167327, | |
| "grad_norm": 0.360675185918808, | |
| "learning_rate": 0.0003090227537922987, | |
| "loss": 3.3637, | |
| "step": 33350 | |
| }, | |
| { | |
| "epoch": 9.729142391051036, | |
| "grad_norm": 0.3719678819179535, | |
| "learning_rate": 0.0003085851808634772, | |
| "loss": 3.357, | |
| "step": 33400 | |
| }, | |
| { | |
| "epoch": 9.743707760428805, | |
| "grad_norm": 0.3562043607234955, | |
| "learning_rate": 0.00030814760793465573, | |
| "loss": 3.3531, | |
| "step": 33450 | |
| }, | |
| { | |
| "epoch": 9.758273129806572, | |
| "grad_norm": 0.37112271785736084, | |
| "learning_rate": 0.00030771003500583427, | |
| "loss": 3.3629, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 9.772838499184338, | |
| "grad_norm": 0.3823767900466919, | |
| "learning_rate": 0.00030727246207701286, | |
| "loss": 3.3711, | |
| "step": 33550 | |
| }, | |
| { | |
| "epoch": 9.787403868562107, | |
| "grad_norm": 0.3594043552875519, | |
| "learning_rate": 0.0003068348891481913, | |
| "loss": 3.3712, | |
| "step": 33600 | |
| }, | |
| { | |
| "epoch": 9.801969237939874, | |
| "grad_norm": 0.3566214442253113, | |
| "learning_rate": 0.0003063973162193699, | |
| "loss": 3.3842, | |
| "step": 33650 | |
| }, | |
| { | |
| "epoch": 9.816534607317642, | |
| "grad_norm": 0.36310461163520813, | |
| "learning_rate": 0.00030595974329054843, | |
| "loss": 3.3596, | |
| "step": 33700 | |
| }, | |
| { | |
| "epoch": 9.831099976695409, | |
| "grad_norm": 0.36038920283317566, | |
| "learning_rate": 0.0003055221703617269, | |
| "loss": 3.3677, | |
| "step": 33750 | |
| }, | |
| { | |
| "epoch": 9.845665346073176, | |
| "grad_norm": 0.34875422716140747, | |
| "learning_rate": 0.00030508459743290546, | |
| "loss": 3.3738, | |
| "step": 33800 | |
| }, | |
| { | |
| "epoch": 9.860230715450944, | |
| "grad_norm": 0.3687998056411743, | |
| "learning_rate": 0.000304647024504084, | |
| "loss": 3.3725, | |
| "step": 33850 | |
| }, | |
| { | |
| "epoch": 9.874796084828711, | |
| "grad_norm": 0.3492382764816284, | |
| "learning_rate": 0.0003042094515752625, | |
| "loss": 3.3642, | |
| "step": 33900 | |
| }, | |
| { | |
| "epoch": 9.88936145420648, | |
| "grad_norm": 0.34819406270980835, | |
| "learning_rate": 0.000303771878646441, | |
| "loss": 3.3783, | |
| "step": 33950 | |
| }, | |
| { | |
| "epoch": 9.903926823584246, | |
| "grad_norm": 0.36848151683807373, | |
| "learning_rate": 0.00030333430571761956, | |
| "loss": 3.3654, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 9.903926823584246, | |
| "eval_accuracy": 0.3703816446727628, | |
| "eval_loss": 3.5503089427948, | |
| "eval_runtime": 181.4493, | |
| "eval_samples_per_second": 91.728, | |
| "eval_steps_per_second": 5.737, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 9.918492192962013, | |
| "grad_norm": 0.39530327916145325, | |
| "learning_rate": 0.00030289673278879816, | |
| "loss": 3.3634, | |
| "step": 34050 | |
| }, | |
| { | |
| "epoch": 9.933057562339782, | |
| "grad_norm": 0.3620380759239197, | |
| "learning_rate": 0.00030245915985997664, | |
| "loss": 3.3729, | |
| "step": 34100 | |
| }, | |
| { | |
| "epoch": 9.947622931717548, | |
| "grad_norm": 0.356423020362854, | |
| "learning_rate": 0.0003020215869311552, | |
| "loss": 3.3823, | |
| "step": 34150 | |
| }, | |
| { | |
| "epoch": 9.962188301095315, | |
| "grad_norm": 0.35574576258659363, | |
| "learning_rate": 0.0003015840140023337, | |
| "loss": 3.3698, | |
| "step": 34200 | |
| }, | |
| { | |
| "epoch": 9.976753670473084, | |
| "grad_norm": 0.3700348734855652, | |
| "learning_rate": 0.0003011464410735122, | |
| "loss": 3.368, | |
| "step": 34250 | |
| }, | |
| { | |
| "epoch": 9.99131903985085, | |
| "grad_norm": 0.3582363724708557, | |
| "learning_rate": 0.00030070886814469075, | |
| "loss": 3.3747, | |
| "step": 34300 | |
| }, | |
| { | |
| "epoch": 10.005826147751106, | |
| "grad_norm": 0.3664577901363373, | |
| "learning_rate": 0.0003002712952158693, | |
| "loss": 3.3242, | |
| "step": 34350 | |
| }, | |
| { | |
| "epoch": 10.020391517128875, | |
| "grad_norm": 0.3791219890117645, | |
| "learning_rate": 0.0002998337222870478, | |
| "loss": 3.2693, | |
| "step": 34400 | |
| }, | |
| { | |
| "epoch": 10.034956886506642, | |
| "grad_norm": 0.3724304437637329, | |
| "learning_rate": 0.0002993961493582263, | |
| "loss": 3.2642, | |
| "step": 34450 | |
| }, | |
| { | |
| "epoch": 10.049522255884408, | |
| "grad_norm": 0.36520498991012573, | |
| "learning_rate": 0.0002989585764294049, | |
| "loss": 3.2756, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 10.064087625262177, | |
| "grad_norm": 0.3840792179107666, | |
| "learning_rate": 0.0002985210035005834, | |
| "loss": 3.28, | |
| "step": 34550 | |
| }, | |
| { | |
| "epoch": 10.078652994639944, | |
| "grad_norm": 0.3588644564151764, | |
| "learning_rate": 0.00029808343057176193, | |
| "loss": 3.2725, | |
| "step": 34600 | |
| }, | |
| { | |
| "epoch": 10.093218364017712, | |
| "grad_norm": 0.3608386516571045, | |
| "learning_rate": 0.00029764585764294047, | |
| "loss": 3.2845, | |
| "step": 34650 | |
| }, | |
| { | |
| "epoch": 10.107783733395479, | |
| "grad_norm": 0.37310636043548584, | |
| "learning_rate": 0.00029720828471411896, | |
| "loss": 3.2934, | |
| "step": 34700 | |
| }, | |
| { | |
| "epoch": 10.122349102773246, | |
| "grad_norm": 0.3664185404777527, | |
| "learning_rate": 0.00029677071178529755, | |
| "loss": 3.2953, | |
| "step": 34750 | |
| }, | |
| { | |
| "epoch": 10.136914472151014, | |
| "grad_norm": 0.3596240282058716, | |
| "learning_rate": 0.00029633313885647604, | |
| "loss": 3.304, | |
| "step": 34800 | |
| }, | |
| { | |
| "epoch": 10.151479841528781, | |
| "grad_norm": 0.3951849341392517, | |
| "learning_rate": 0.0002958955659276546, | |
| "loss": 3.2999, | |
| "step": 34850 | |
| }, | |
| { | |
| "epoch": 10.166045210906548, | |
| "grad_norm": 0.38322994112968445, | |
| "learning_rate": 0.0002954579929988331, | |
| "loss": 3.304, | |
| "step": 34900 | |
| }, | |
| { | |
| "epoch": 10.180610580284316, | |
| "grad_norm": 0.36491626501083374, | |
| "learning_rate": 0.00029502042007001166, | |
| "loss": 3.2911, | |
| "step": 34950 | |
| }, | |
| { | |
| "epoch": 10.195175949662083, | |
| "grad_norm": 0.37527546286582947, | |
| "learning_rate": 0.0002945828471411902, | |
| "loss": 3.2834, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 10.195175949662083, | |
| "eval_accuracy": 0.3699247414840347, | |
| "eval_loss": 3.5616824626922607, | |
| "eval_runtime": 180.3891, | |
| "eval_samples_per_second": 92.267, | |
| "eval_steps_per_second": 5.771, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 10.209741319039852, | |
| "grad_norm": 0.3800257742404938, | |
| "learning_rate": 0.0002941452742123687, | |
| "loss": 3.2981, | |
| "step": 35050 | |
| }, | |
| { | |
| "epoch": 10.224306688417618, | |
| "grad_norm": 0.3631065785884857, | |
| "learning_rate": 0.0002937077012835472, | |
| "loss": 3.3006, | |
| "step": 35100 | |
| }, | |
| { | |
| "epoch": 10.238872057795385, | |
| "grad_norm": 0.3786700963973999, | |
| "learning_rate": 0.00029327012835472576, | |
| "loss": 3.3138, | |
| "step": 35150 | |
| }, | |
| { | |
| "epoch": 10.253437427173154, | |
| "grad_norm": 0.35197684168815613, | |
| "learning_rate": 0.0002928325554259043, | |
| "loss": 3.3003, | |
| "step": 35200 | |
| }, | |
| { | |
| "epoch": 10.26800279655092, | |
| "grad_norm": 0.36957064270973206, | |
| "learning_rate": 0.00029239498249708284, | |
| "loss": 3.3105, | |
| "step": 35250 | |
| }, | |
| { | |
| "epoch": 10.282568165928687, | |
| "grad_norm": 0.37074217200279236, | |
| "learning_rate": 0.00029195740956826133, | |
| "loss": 3.3123, | |
| "step": 35300 | |
| }, | |
| { | |
| "epoch": 10.297133535306456, | |
| "grad_norm": 0.3711046278476715, | |
| "learning_rate": 0.0002915198366394399, | |
| "loss": 3.302, | |
| "step": 35350 | |
| }, | |
| { | |
| "epoch": 10.311698904684222, | |
| "grad_norm": 0.3888838589191437, | |
| "learning_rate": 0.0002910822637106184, | |
| "loss": 3.3077, | |
| "step": 35400 | |
| }, | |
| { | |
| "epoch": 10.326264274061991, | |
| "grad_norm": 0.3660491704940796, | |
| "learning_rate": 0.00029064469078179695, | |
| "loss": 3.3186, | |
| "step": 35450 | |
| }, | |
| { | |
| "epoch": 10.340829643439758, | |
| "grad_norm": 0.3750646412372589, | |
| "learning_rate": 0.0002902071178529755, | |
| "loss": 3.3088, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 10.355395012817525, | |
| "grad_norm": 0.3611460030078888, | |
| "learning_rate": 0.000289769544924154, | |
| "loss": 3.3126, | |
| "step": 35550 | |
| }, | |
| { | |
| "epoch": 10.369960382195293, | |
| "grad_norm": 0.3784548342227936, | |
| "learning_rate": 0.00028933197199533257, | |
| "loss": 3.3199, | |
| "step": 35600 | |
| }, | |
| { | |
| "epoch": 10.38452575157306, | |
| "grad_norm": 0.3654816746711731, | |
| "learning_rate": 0.00028889439906651105, | |
| "loss": 3.317, | |
| "step": 35650 | |
| }, | |
| { | |
| "epoch": 10.399091120950827, | |
| "grad_norm": 0.3819401264190674, | |
| "learning_rate": 0.0002884568261376896, | |
| "loss": 3.3174, | |
| "step": 35700 | |
| }, | |
| { | |
| "epoch": 10.413656490328595, | |
| "grad_norm": 0.3685275912284851, | |
| "learning_rate": 0.00028801925320886813, | |
| "loss": 3.3172, | |
| "step": 35750 | |
| }, | |
| { | |
| "epoch": 10.428221859706362, | |
| "grad_norm": 0.3687780201435089, | |
| "learning_rate": 0.0002875816802800466, | |
| "loss": 3.3185, | |
| "step": 35800 | |
| }, | |
| { | |
| "epoch": 10.44278722908413, | |
| "grad_norm": 0.3637807369232178, | |
| "learning_rate": 0.0002871441073512252, | |
| "loss": 3.3257, | |
| "step": 35850 | |
| }, | |
| { | |
| "epoch": 10.457352598461897, | |
| "grad_norm": 0.3877573013305664, | |
| "learning_rate": 0.0002867065344224037, | |
| "loss": 3.3316, | |
| "step": 35900 | |
| }, | |
| { | |
| "epoch": 10.471917967839664, | |
| "grad_norm": 0.37709304690361023, | |
| "learning_rate": 0.00028626896149358224, | |
| "loss": 3.3028, | |
| "step": 35950 | |
| }, | |
| { | |
| "epoch": 10.486483337217432, | |
| "grad_norm": 0.36883544921875, | |
| "learning_rate": 0.0002858313885647608, | |
| "loss": 3.3313, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 10.486483337217432, | |
| "eval_accuracy": 0.37047523575310953, | |
| "eval_loss": 3.555171489715576, | |
| "eval_runtime": 180.6124, | |
| "eval_samples_per_second": 92.153, | |
| "eval_steps_per_second": 5.764, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 10.5010487065952, | |
| "grad_norm": 0.37332433462142944, | |
| "learning_rate": 0.0002853938156359393, | |
| "loss": 3.3147, | |
| "step": 36050 | |
| }, | |
| { | |
| "epoch": 10.515614075972966, | |
| "grad_norm": 0.3772258758544922, | |
| "learning_rate": 0.00028495624270711786, | |
| "loss": 3.3325, | |
| "step": 36100 | |
| }, | |
| { | |
| "epoch": 10.530179445350734, | |
| "grad_norm": 0.35568490624427795, | |
| "learning_rate": 0.00028451866977829634, | |
| "loss": 3.3381, | |
| "step": 36150 | |
| }, | |
| { | |
| "epoch": 10.544744814728501, | |
| "grad_norm": 0.3858466148376465, | |
| "learning_rate": 0.0002840810968494749, | |
| "loss": 3.3242, | |
| "step": 36200 | |
| }, | |
| { | |
| "epoch": 10.55931018410627, | |
| "grad_norm": 0.3936407268047333, | |
| "learning_rate": 0.0002836435239206534, | |
| "loss": 3.3213, | |
| "step": 36250 | |
| }, | |
| { | |
| "epoch": 10.573875553484037, | |
| "grad_norm": 0.3783574104309082, | |
| "learning_rate": 0.00028320595099183196, | |
| "loss": 3.3174, | |
| "step": 36300 | |
| }, | |
| { | |
| "epoch": 10.588440922861803, | |
| "grad_norm": 0.3611924648284912, | |
| "learning_rate": 0.0002827683780630105, | |
| "loss": 3.3281, | |
| "step": 36350 | |
| }, | |
| { | |
| "epoch": 10.603006292239572, | |
| "grad_norm": 0.36673375964164734, | |
| "learning_rate": 0.000282330805134189, | |
| "loss": 3.3274, | |
| "step": 36400 | |
| }, | |
| { | |
| "epoch": 10.617571661617339, | |
| "grad_norm": 0.3864386975765228, | |
| "learning_rate": 0.00028189323220536753, | |
| "loss": 3.3263, | |
| "step": 36450 | |
| }, | |
| { | |
| "epoch": 10.632137030995105, | |
| "grad_norm": 0.37186652421951294, | |
| "learning_rate": 0.00028145565927654607, | |
| "loss": 3.3256, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 10.646702400372874, | |
| "grad_norm": 0.3645637333393097, | |
| "learning_rate": 0.0002810180863477246, | |
| "loss": 3.3304, | |
| "step": 36550 | |
| }, | |
| { | |
| "epoch": 10.66126776975064, | |
| "grad_norm": 0.3960283696651459, | |
| "learning_rate": 0.00028058051341890315, | |
| "loss": 3.3293, | |
| "step": 36600 | |
| }, | |
| { | |
| "epoch": 10.675833139128407, | |
| "grad_norm": 0.3968350291252136, | |
| "learning_rate": 0.00028014294049008164, | |
| "loss": 3.3289, | |
| "step": 36650 | |
| }, | |
| { | |
| "epoch": 10.690398508506176, | |
| "grad_norm": 0.3649657666683197, | |
| "learning_rate": 0.0002797053675612602, | |
| "loss": 3.326, | |
| "step": 36700 | |
| }, | |
| { | |
| "epoch": 10.704963877883943, | |
| "grad_norm": 0.366464227437973, | |
| "learning_rate": 0.0002792677946324387, | |
| "loss": 3.3453, | |
| "step": 36750 | |
| }, | |
| { | |
| "epoch": 10.719529247261711, | |
| "grad_norm": 0.36643803119659424, | |
| "learning_rate": 0.00027883022170361726, | |
| "loss": 3.3272, | |
| "step": 36800 | |
| }, | |
| { | |
| "epoch": 10.734094616639478, | |
| "grad_norm": 0.35845255851745605, | |
| "learning_rate": 0.0002783926487747958, | |
| "loss": 3.3341, | |
| "step": 36850 | |
| }, | |
| { | |
| "epoch": 10.748659986017245, | |
| "grad_norm": 0.3823663294315338, | |
| "learning_rate": 0.0002779550758459743, | |
| "loss": 3.3404, | |
| "step": 36900 | |
| }, | |
| { | |
| "epoch": 10.763225355395013, | |
| "grad_norm": 0.36972370743751526, | |
| "learning_rate": 0.0002775175029171528, | |
| "loss": 3.3372, | |
| "step": 36950 | |
| }, | |
| { | |
| "epoch": 10.77779072477278, | |
| "grad_norm": 0.3613353967666626, | |
| "learning_rate": 0.00027707992998833136, | |
| "loss": 3.3399, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 10.77779072477278, | |
| "eval_accuracy": 0.37111332468909186, | |
| "eval_loss": 3.54584002494812, | |
| "eval_runtime": 180.3064, | |
| "eval_samples_per_second": 92.31, | |
| "eval_steps_per_second": 5.774, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 10.792356094150549, | |
| "grad_norm": 0.37549999356269836, | |
| "learning_rate": 0.0002766423570595099, | |
| "loss": 3.3357, | |
| "step": 37050 | |
| }, | |
| { | |
| "epoch": 10.806921463528315, | |
| "grad_norm": 0.38684558868408203, | |
| "learning_rate": 0.00027620478413068844, | |
| "loss": 3.3365, | |
| "step": 37100 | |
| }, | |
| { | |
| "epoch": 10.821486832906082, | |
| "grad_norm": 0.3710017204284668, | |
| "learning_rate": 0.000275767211201867, | |
| "loss": 3.3438, | |
| "step": 37150 | |
| }, | |
| { | |
| "epoch": 10.83605220228385, | |
| "grad_norm": 0.3841908872127533, | |
| "learning_rate": 0.00027532963827304547, | |
| "loss": 3.3451, | |
| "step": 37200 | |
| }, | |
| { | |
| "epoch": 10.850617571661617, | |
| "grad_norm": 0.37406450510025024, | |
| "learning_rate": 0.000274892065344224, | |
| "loss": 3.3508, | |
| "step": 37250 | |
| }, | |
| { | |
| "epoch": 10.865182941039384, | |
| "grad_norm": 0.37421655654907227, | |
| "learning_rate": 0.00027445449241540255, | |
| "loss": 3.3346, | |
| "step": 37300 | |
| }, | |
| { | |
| "epoch": 10.879748310417153, | |
| "grad_norm": 0.3724828064441681, | |
| "learning_rate": 0.0002740169194865811, | |
| "loss": 3.3251, | |
| "step": 37350 | |
| }, | |
| { | |
| "epoch": 10.89431367979492, | |
| "grad_norm": 0.37316784262657166, | |
| "learning_rate": 0.0002735793465577596, | |
| "loss": 3.3407, | |
| "step": 37400 | |
| }, | |
| { | |
| "epoch": 10.908879049172686, | |
| "grad_norm": 0.35748419165611267, | |
| "learning_rate": 0.0002731417736289381, | |
| "loss": 3.3403, | |
| "step": 37450 | |
| }, | |
| { | |
| "epoch": 10.923444418550455, | |
| "grad_norm": 0.3694682717323303, | |
| "learning_rate": 0.00027270420070011665, | |
| "loss": 3.3413, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 10.938009787928221, | |
| "grad_norm": 0.37885811924934387, | |
| "learning_rate": 0.0002722666277712952, | |
| "loss": 3.3426, | |
| "step": 37550 | |
| }, | |
| { | |
| "epoch": 10.95257515730599, | |
| "grad_norm": 0.38499969244003296, | |
| "learning_rate": 0.00027182905484247373, | |
| "loss": 3.3389, | |
| "step": 37600 | |
| }, | |
| { | |
| "epoch": 10.967140526683757, | |
| "grad_norm": 0.36556389927864075, | |
| "learning_rate": 0.00027139148191365227, | |
| "loss": 3.3459, | |
| "step": 37650 | |
| }, | |
| { | |
| "epoch": 10.981705896061523, | |
| "grad_norm": 0.3706257939338684, | |
| "learning_rate": 0.00027095390898483076, | |
| "loss": 3.3406, | |
| "step": 37700 | |
| }, | |
| { | |
| "epoch": 10.996271265439292, | |
| "grad_norm": 0.36823248863220215, | |
| "learning_rate": 0.0002705163360560093, | |
| "loss": 3.3361, | |
| "step": 37750 | |
| }, | |
| { | |
| "epoch": 11.010778373339548, | |
| "grad_norm": 0.35125017166137695, | |
| "learning_rate": 0.00027007876312718784, | |
| "loss": 3.2779, | |
| "step": 37800 | |
| }, | |
| { | |
| "epoch": 11.025343742717315, | |
| "grad_norm": 0.3946673274040222, | |
| "learning_rate": 0.0002696411901983664, | |
| "loss": 3.2391, | |
| "step": 37850 | |
| }, | |
| { | |
| "epoch": 11.039909112095083, | |
| "grad_norm": 0.37063831090927124, | |
| "learning_rate": 0.0002692036172695449, | |
| "loss": 3.2478, | |
| "step": 37900 | |
| }, | |
| { | |
| "epoch": 11.05447448147285, | |
| "grad_norm": 0.36030757427215576, | |
| "learning_rate": 0.0002687660443407234, | |
| "loss": 3.2521, | |
| "step": 37950 | |
| }, | |
| { | |
| "epoch": 11.069039850850617, | |
| "grad_norm": 0.3777706027030945, | |
| "learning_rate": 0.00026832847141190194, | |
| "loss": 3.2605, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 11.069039850850617, | |
| "eval_accuracy": 0.370730142112647, | |
| "eval_loss": 3.558288097381592, | |
| "eval_runtime": 180.3247, | |
| "eval_samples_per_second": 92.3, | |
| "eval_steps_per_second": 5.773, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 11.083605220228385, | |
| "grad_norm": 0.3810880482196808, | |
| "learning_rate": 0.0002678908984830805, | |
| "loss": 3.2545, | |
| "step": 38050 | |
| }, | |
| { | |
| "epoch": 11.098170589606152, | |
| "grad_norm": 0.38010433316230774, | |
| "learning_rate": 0.000267453325554259, | |
| "loss": 3.2619, | |
| "step": 38100 | |
| }, | |
| { | |
| "epoch": 11.11273595898392, | |
| "grad_norm": 0.38559970259666443, | |
| "learning_rate": 0.00026701575262543756, | |
| "loss": 3.26, | |
| "step": 38150 | |
| }, | |
| { | |
| "epoch": 11.127301328361687, | |
| "grad_norm": 0.37837737798690796, | |
| "learning_rate": 0.00026657817969661605, | |
| "loss": 3.2626, | |
| "step": 38200 | |
| }, | |
| { | |
| "epoch": 11.141866697739454, | |
| "grad_norm": 0.3784601092338562, | |
| "learning_rate": 0.00026614060676779464, | |
| "loss": 3.258, | |
| "step": 38250 | |
| }, | |
| { | |
| "epoch": 11.156432067117223, | |
| "grad_norm": 0.35845887660980225, | |
| "learning_rate": 0.00026570303383897313, | |
| "loss": 3.2586, | |
| "step": 38300 | |
| }, | |
| { | |
| "epoch": 11.17099743649499, | |
| "grad_norm": 0.37323319911956787, | |
| "learning_rate": 0.00026526546091015167, | |
| "loss": 3.2596, | |
| "step": 38350 | |
| }, | |
| { | |
| "epoch": 11.185562805872756, | |
| "grad_norm": 0.4025420546531677, | |
| "learning_rate": 0.0002648278879813302, | |
| "loss": 3.2722, | |
| "step": 38400 | |
| }, | |
| { | |
| "epoch": 11.200128175250525, | |
| "grad_norm": 0.3721407651901245, | |
| "learning_rate": 0.0002643903150525087, | |
| "loss": 3.2737, | |
| "step": 38450 | |
| }, | |
| { | |
| "epoch": 11.214693544628291, | |
| "grad_norm": 0.3787324130535126, | |
| "learning_rate": 0.0002639527421236873, | |
| "loss": 3.267, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 11.22925891400606, | |
| "grad_norm": 0.385883092880249, | |
| "learning_rate": 0.0002635151691948658, | |
| "loss": 3.2813, | |
| "step": 38550 | |
| }, | |
| { | |
| "epoch": 11.243824283383827, | |
| "grad_norm": 0.3823045492172241, | |
| "learning_rate": 0.0002630775962660443, | |
| "loss": 3.2806, | |
| "step": 38600 | |
| }, | |
| { | |
| "epoch": 11.258389652761593, | |
| "grad_norm": 0.3888196349143982, | |
| "learning_rate": 0.00026264002333722285, | |
| "loss": 3.2669, | |
| "step": 38650 | |
| }, | |
| { | |
| "epoch": 11.272955022139362, | |
| "grad_norm": 0.4065677523612976, | |
| "learning_rate": 0.00026220245040840134, | |
| "loss": 3.2815, | |
| "step": 38700 | |
| }, | |
| { | |
| "epoch": 11.287520391517129, | |
| "grad_norm": 0.36197319626808167, | |
| "learning_rate": 0.00026176487747957993, | |
| "loss": 3.2846, | |
| "step": 38750 | |
| }, | |
| { | |
| "epoch": 11.302085760894895, | |
| "grad_norm": 0.379085510969162, | |
| "learning_rate": 0.0002613273045507584, | |
| "loss": 3.2861, | |
| "step": 38800 | |
| }, | |
| { | |
| "epoch": 11.316651130272664, | |
| "grad_norm": 0.39975711703300476, | |
| "learning_rate": 0.00026088973162193696, | |
| "loss": 3.3018, | |
| "step": 38850 | |
| }, | |
| { | |
| "epoch": 11.33121649965043, | |
| "grad_norm": 0.390550434589386, | |
| "learning_rate": 0.0002604521586931155, | |
| "loss": 3.2864, | |
| "step": 38900 | |
| }, | |
| { | |
| "epoch": 11.3457818690282, | |
| "grad_norm": 0.40258410573005676, | |
| "learning_rate": 0.00026001458576429404, | |
| "loss": 3.2845, | |
| "step": 38950 | |
| }, | |
| { | |
| "epoch": 11.360347238405966, | |
| "grad_norm": 0.36794647574424744, | |
| "learning_rate": 0.0002595770128354726, | |
| "loss": 3.2906, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 11.360347238405966, | |
| "eval_accuracy": 0.37129404012940964, | |
| "eval_loss": 3.552013397216797, | |
| "eval_runtime": 180.3711, | |
| "eval_samples_per_second": 92.276, | |
| "eval_steps_per_second": 5.771, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 11.374912607783733, | |
| "grad_norm": 0.3978714048862457, | |
| "learning_rate": 0.00025913943990665106, | |
| "loss": 3.2952, | |
| "step": 39050 | |
| }, | |
| { | |
| "epoch": 11.389477977161501, | |
| "grad_norm": 0.3712661862373352, | |
| "learning_rate": 0.0002587018669778296, | |
| "loss": 3.2887, | |
| "step": 39100 | |
| }, | |
| { | |
| "epoch": 11.404043346539268, | |
| "grad_norm": 0.3962024748325348, | |
| "learning_rate": 0.00025826429404900814, | |
| "loss": 3.2818, | |
| "step": 39150 | |
| }, | |
| { | |
| "epoch": 11.418608715917035, | |
| "grad_norm": 0.3791441321372986, | |
| "learning_rate": 0.0002578267211201867, | |
| "loss": 3.2947, | |
| "step": 39200 | |
| }, | |
| { | |
| "epoch": 11.433174085294803, | |
| "grad_norm": 0.38361239433288574, | |
| "learning_rate": 0.0002573891481913652, | |
| "loss": 3.3016, | |
| "step": 39250 | |
| }, | |
| { | |
| "epoch": 11.44773945467257, | |
| "grad_norm": 0.3753799498081207, | |
| "learning_rate": 0.0002569515752625437, | |
| "loss": 3.291, | |
| "step": 39300 | |
| }, | |
| { | |
| "epoch": 11.462304824050339, | |
| "grad_norm": 0.37291768193244934, | |
| "learning_rate": 0.0002565140023337223, | |
| "loss": 3.2858, | |
| "step": 39350 | |
| }, | |
| { | |
| "epoch": 11.476870193428105, | |
| "grad_norm": 0.382899671792984, | |
| "learning_rate": 0.0002560764294049008, | |
| "loss": 3.2902, | |
| "step": 39400 | |
| }, | |
| { | |
| "epoch": 11.491435562805872, | |
| "grad_norm": 0.38682591915130615, | |
| "learning_rate": 0.00025563885647607933, | |
| "loss": 3.3095, | |
| "step": 39450 | |
| }, | |
| { | |
| "epoch": 11.50600093218364, | |
| "grad_norm": 0.39052340388298035, | |
| "learning_rate": 0.00025520128354725787, | |
| "loss": 3.3024, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 11.520566301561407, | |
| "grad_norm": 0.38648220896720886, | |
| "learning_rate": 0.00025476371061843636, | |
| "loss": 3.3037, | |
| "step": 39550 | |
| }, | |
| { | |
| "epoch": 11.535131670939174, | |
| "grad_norm": 0.37323859333992004, | |
| "learning_rate": 0.00025432613768961495, | |
| "loss": 3.2976, | |
| "step": 39600 | |
| }, | |
| { | |
| "epoch": 11.549697040316943, | |
| "grad_norm": 0.3768618106842041, | |
| "learning_rate": 0.00025388856476079343, | |
| "loss": 3.3111, | |
| "step": 39650 | |
| }, | |
| { | |
| "epoch": 11.56426240969471, | |
| "grad_norm": 0.3987623155117035, | |
| "learning_rate": 0.000253450991831972, | |
| "loss": 3.301, | |
| "step": 39700 | |
| }, | |
| { | |
| "epoch": 11.578827779072478, | |
| "grad_norm": 0.42070692777633667, | |
| "learning_rate": 0.0002530134189031505, | |
| "loss": 3.3047, | |
| "step": 39750 | |
| }, | |
| { | |
| "epoch": 11.593393148450245, | |
| "grad_norm": 0.37431833148002625, | |
| "learning_rate": 0.000252575845974329, | |
| "loss": 3.3051, | |
| "step": 39800 | |
| }, | |
| { | |
| "epoch": 11.607958517828012, | |
| "grad_norm": 0.3832058310508728, | |
| "learning_rate": 0.0002521382730455076, | |
| "loss": 3.2968, | |
| "step": 39850 | |
| }, | |
| { | |
| "epoch": 11.62252388720578, | |
| "grad_norm": 0.3984127342700958, | |
| "learning_rate": 0.0002517007001166861, | |
| "loss": 3.3157, | |
| "step": 39900 | |
| }, | |
| { | |
| "epoch": 11.637089256583547, | |
| "grad_norm": 0.40139371156692505, | |
| "learning_rate": 0.0002512631271878646, | |
| "loss": 3.2996, | |
| "step": 39950 | |
| }, | |
| { | |
| "epoch": 11.651654625961314, | |
| "grad_norm": 0.3891284465789795, | |
| "learning_rate": 0.00025082555425904316, | |
| "loss": 3.3088, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 11.651654625961314, | |
| "eval_accuracy": 0.3715696399941494, | |
| "eval_loss": 3.5464000701904297, | |
| "eval_runtime": 180.4742, | |
| "eval_samples_per_second": 92.224, | |
| "eval_steps_per_second": 5.768, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 11.666219995339082, | |
| "grad_norm": 0.37767043709754944, | |
| "learning_rate": 0.0002503879813302217, | |
| "loss": 3.2951, | |
| "step": 40050 | |
| }, | |
| { | |
| "epoch": 11.680785364716849, | |
| "grad_norm": 0.40471839904785156, | |
| "learning_rate": 0.00024995040840140024, | |
| "loss": 3.2946, | |
| "step": 40100 | |
| }, | |
| { | |
| "epoch": 11.695350734094617, | |
| "grad_norm": 0.3795658051967621, | |
| "learning_rate": 0.0002495128354725787, | |
| "loss": 3.3187, | |
| "step": 40150 | |
| }, | |
| { | |
| "epoch": 11.709916103472384, | |
| "grad_norm": 0.3852717876434326, | |
| "learning_rate": 0.00024907526254375727, | |
| "loss": 3.2955, | |
| "step": 40200 | |
| }, | |
| { | |
| "epoch": 11.724481472850151, | |
| "grad_norm": 0.37112799286842346, | |
| "learning_rate": 0.0002486376896149358, | |
| "loss": 3.3023, | |
| "step": 40250 | |
| }, | |
| { | |
| "epoch": 11.73904684222792, | |
| "grad_norm": 0.37619829177856445, | |
| "learning_rate": 0.00024820011668611435, | |
| "loss": 3.3103, | |
| "step": 40300 | |
| }, | |
| { | |
| "epoch": 11.753612211605686, | |
| "grad_norm": 0.3923087418079376, | |
| "learning_rate": 0.0002477625437572929, | |
| "loss": 3.3131, | |
| "step": 40350 | |
| }, | |
| { | |
| "epoch": 11.768177580983453, | |
| "grad_norm": 0.3909642696380615, | |
| "learning_rate": 0.00024732497082847137, | |
| "loss": 3.3085, | |
| "step": 40400 | |
| }, | |
| { | |
| "epoch": 11.782742950361222, | |
| "grad_norm": 0.3891732096672058, | |
| "learning_rate": 0.0002468873978996499, | |
| "loss": 3.3118, | |
| "step": 40450 | |
| }, | |
| { | |
| "epoch": 11.797308319738988, | |
| "grad_norm": 0.39520296454429626, | |
| "learning_rate": 0.00024644982497082845, | |
| "loss": 3.3134, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 11.811873689116755, | |
| "grad_norm": 0.3944683074951172, | |
| "learning_rate": 0.000246012252042007, | |
| "loss": 3.3071, | |
| "step": 40550 | |
| }, | |
| { | |
| "epoch": 11.826439058494524, | |
| "grad_norm": 0.3806307315826416, | |
| "learning_rate": 0.00024557467911318553, | |
| "loss": 3.3058, | |
| "step": 40600 | |
| }, | |
| { | |
| "epoch": 11.84100442787229, | |
| "grad_norm": 0.38682928681373596, | |
| "learning_rate": 0.000245137106184364, | |
| "loss": 3.3067, | |
| "step": 40650 | |
| }, | |
| { | |
| "epoch": 11.855569797250059, | |
| "grad_norm": 0.3885536789894104, | |
| "learning_rate": 0.00024469953325554256, | |
| "loss": 3.3188, | |
| "step": 40700 | |
| }, | |
| { | |
| "epoch": 11.870135166627826, | |
| "grad_norm": 0.39508283138275146, | |
| "learning_rate": 0.0002442619603267211, | |
| "loss": 3.3167, | |
| "step": 40750 | |
| }, | |
| { | |
| "epoch": 11.884700536005592, | |
| "grad_norm": 0.37365779280662537, | |
| "learning_rate": 0.00024382438739789964, | |
| "loss": 3.2989, | |
| "step": 40800 | |
| }, | |
| { | |
| "epoch": 11.899265905383361, | |
| "grad_norm": 0.37982553243637085, | |
| "learning_rate": 0.00024338681446907818, | |
| "loss": 3.3162, | |
| "step": 40850 | |
| }, | |
| { | |
| "epoch": 11.913831274761128, | |
| "grad_norm": 0.3698308765888214, | |
| "learning_rate": 0.0002429492415402567, | |
| "loss": 3.3203, | |
| "step": 40900 | |
| }, | |
| { | |
| "epoch": 11.928396644138896, | |
| "grad_norm": 0.3770948052406311, | |
| "learning_rate": 0.0002425116686114352, | |
| "loss": 3.315, | |
| "step": 40950 | |
| }, | |
| { | |
| "epoch": 11.942962013516663, | |
| "grad_norm": 0.3643822968006134, | |
| "learning_rate": 0.00024207409568261377, | |
| "loss": 3.3188, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 11.942962013516663, | |
| "eval_accuracy": 0.37253788439949564, | |
| "eval_loss": 3.536294460296631, | |
| "eval_runtime": 180.6316, | |
| "eval_samples_per_second": 92.143, | |
| "eval_steps_per_second": 5.763, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 11.95752738289443, | |
| "grad_norm": 0.37310171127319336, | |
| "learning_rate": 0.00024163652275379228, | |
| "loss": 3.3067, | |
| "step": 41050 | |
| }, | |
| { | |
| "epoch": 11.972092752272198, | |
| "grad_norm": 0.3831028342247009, | |
| "learning_rate": 0.00024119894982497082, | |
| "loss": 3.3263, | |
| "step": 41100 | |
| }, | |
| { | |
| "epoch": 11.986658121649965, | |
| "grad_norm": 0.3900957405567169, | |
| "learning_rate": 0.00024076137689614933, | |
| "loss": 3.3106, | |
| "step": 41150 | |
| }, | |
| { | |
| "epoch": 12.001165229550221, | |
| "grad_norm": 0.3873760998249054, | |
| "learning_rate": 0.00024032380396732785, | |
| "loss": 3.3022, | |
| "step": 41200 | |
| }, | |
| { | |
| "epoch": 12.01573059892799, | |
| "grad_norm": 0.3788856863975525, | |
| "learning_rate": 0.00023988623103850641, | |
| "loss": 3.2189, | |
| "step": 41250 | |
| }, | |
| { | |
| "epoch": 12.030295968305756, | |
| "grad_norm": 0.4146612584590912, | |
| "learning_rate": 0.00023944865810968493, | |
| "loss": 3.2262, | |
| "step": 41300 | |
| }, | |
| { | |
| "epoch": 12.044861337683523, | |
| "grad_norm": 0.3976421356201172, | |
| "learning_rate": 0.00023901108518086347, | |
| "loss": 3.2397, | |
| "step": 41350 | |
| }, | |
| { | |
| "epoch": 12.059426707061291, | |
| "grad_norm": 0.3815682828426361, | |
| "learning_rate": 0.00023857351225204198, | |
| "loss": 3.2293, | |
| "step": 41400 | |
| }, | |
| { | |
| "epoch": 12.073992076439058, | |
| "grad_norm": 0.3816235363483429, | |
| "learning_rate": 0.0002381359393232205, | |
| "loss": 3.2318, | |
| "step": 41450 | |
| }, | |
| { | |
| "epoch": 12.088557445816827, | |
| "grad_norm": 0.3730505108833313, | |
| "learning_rate": 0.00023769836639439906, | |
| "loss": 3.2306, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 12.103122815194594, | |
| "grad_norm": 0.36907413601875305, | |
| "learning_rate": 0.00023726079346557757, | |
| "loss": 3.2172, | |
| "step": 41550 | |
| }, | |
| { | |
| "epoch": 12.11768818457236, | |
| "grad_norm": 0.3938505947589874, | |
| "learning_rate": 0.0002368232205367561, | |
| "loss": 3.2349, | |
| "step": 41600 | |
| }, | |
| { | |
| "epoch": 12.132253553950129, | |
| "grad_norm": 0.39459192752838135, | |
| "learning_rate": 0.00023638564760793463, | |
| "loss": 3.244, | |
| "step": 41650 | |
| }, | |
| { | |
| "epoch": 12.146818923327896, | |
| "grad_norm": 0.3762718141078949, | |
| "learning_rate": 0.00023594807467911317, | |
| "loss": 3.2336, | |
| "step": 41700 | |
| }, | |
| { | |
| "epoch": 12.161384292705662, | |
| "grad_norm": 0.38366296887397766, | |
| "learning_rate": 0.0002355105017502917, | |
| "loss": 3.2468, | |
| "step": 41750 | |
| }, | |
| { | |
| "epoch": 12.17594966208343, | |
| "grad_norm": 0.37330591678619385, | |
| "learning_rate": 0.00023507292882147022, | |
| "loss": 3.2367, | |
| "step": 41800 | |
| }, | |
| { | |
| "epoch": 12.190515031461198, | |
| "grad_norm": 0.39677342772483826, | |
| "learning_rate": 0.00023463535589264876, | |
| "loss": 3.2597, | |
| "step": 41850 | |
| }, | |
| { | |
| "epoch": 12.205080400838966, | |
| "grad_norm": 0.3829995393753052, | |
| "learning_rate": 0.0002341977829638273, | |
| "loss": 3.2453, | |
| "step": 41900 | |
| }, | |
| { | |
| "epoch": 12.219645770216733, | |
| "grad_norm": 0.40625911951065063, | |
| "learning_rate": 0.0002337602100350058, | |
| "loss": 3.2457, | |
| "step": 41950 | |
| }, | |
| { | |
| "epoch": 12.2342111395945, | |
| "grad_norm": 0.3920283019542694, | |
| "learning_rate": 0.00023332263710618435, | |
| "loss": 3.2678, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 12.2342111395945, | |
| "eval_accuracy": 0.3720842733593225, | |
| "eval_loss": 3.546815872192383, | |
| "eval_runtime": 180.3312, | |
| "eval_samples_per_second": 92.297, | |
| "eval_steps_per_second": 5.773, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 12.248776508972268, | |
| "grad_norm": 0.398946613073349, | |
| "learning_rate": 0.00023288506417736286, | |
| "loss": 3.245, | |
| "step": 42050 | |
| }, | |
| { | |
| "epoch": 12.263341878350035, | |
| "grad_norm": 0.3961947560310364, | |
| "learning_rate": 0.00023244749124854143, | |
| "loss": 3.2517, | |
| "step": 42100 | |
| }, | |
| { | |
| "epoch": 12.277907247727802, | |
| "grad_norm": 0.3835267722606659, | |
| "learning_rate": 0.00023200991831971994, | |
| "loss": 3.2522, | |
| "step": 42150 | |
| }, | |
| { | |
| "epoch": 12.29247261710557, | |
| "grad_norm": 0.42905566096305847, | |
| "learning_rate": 0.00023157234539089846, | |
| "loss": 3.2587, | |
| "step": 42200 | |
| }, | |
| { | |
| "epoch": 12.307037986483337, | |
| "grad_norm": 0.39819803833961487, | |
| "learning_rate": 0.000231134772462077, | |
| "loss": 3.2556, | |
| "step": 42250 | |
| }, | |
| { | |
| "epoch": 12.321603355861104, | |
| "grad_norm": 0.393216997385025, | |
| "learning_rate": 0.0002306971995332555, | |
| "loss": 3.2738, | |
| "step": 42300 | |
| }, | |
| { | |
| "epoch": 12.336168725238872, | |
| "grad_norm": 0.3680713176727295, | |
| "learning_rate": 0.00023025962660443408, | |
| "loss": 3.2608, | |
| "step": 42350 | |
| }, | |
| { | |
| "epoch": 12.350734094616639, | |
| "grad_norm": 0.3907005488872528, | |
| "learning_rate": 0.0002298220536756126, | |
| "loss": 3.2592, | |
| "step": 42400 | |
| }, | |
| { | |
| "epoch": 12.365299463994408, | |
| "grad_norm": 0.39694586396217346, | |
| "learning_rate": 0.0002293844807467911, | |
| "loss": 3.2655, | |
| "step": 42450 | |
| }, | |
| { | |
| "epoch": 12.379864833372174, | |
| "grad_norm": 0.3920033276081085, | |
| "learning_rate": 0.00022894690781796964, | |
| "loss": 3.2694, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 12.394430202749941, | |
| "grad_norm": 0.38897332549095154, | |
| "learning_rate": 0.00022850933488914815, | |
| "loss": 3.2831, | |
| "step": 42550 | |
| }, | |
| { | |
| "epoch": 12.40899557212771, | |
| "grad_norm": 0.3850444257259369, | |
| "learning_rate": 0.00022807176196032672, | |
| "loss": 3.2698, | |
| "step": 42600 | |
| }, | |
| { | |
| "epoch": 12.423560941505476, | |
| "grad_norm": 0.39484626054763794, | |
| "learning_rate": 0.00022763418903150523, | |
| "loss": 3.2621, | |
| "step": 42650 | |
| }, | |
| { | |
| "epoch": 12.438126310883243, | |
| "grad_norm": 0.4130299985408783, | |
| "learning_rate": 0.00022719661610268375, | |
| "loss": 3.2765, | |
| "step": 42700 | |
| }, | |
| { | |
| "epoch": 12.452691680261012, | |
| "grad_norm": 0.3730163872241974, | |
| "learning_rate": 0.0002267590431738623, | |
| "loss": 3.2681, | |
| "step": 42750 | |
| }, | |
| { | |
| "epoch": 12.467257049638778, | |
| "grad_norm": 0.3927021026611328, | |
| "learning_rate": 0.00022632147024504083, | |
| "loss": 3.2684, | |
| "step": 42800 | |
| }, | |
| { | |
| "epoch": 12.481822419016547, | |
| "grad_norm": 0.39260363578796387, | |
| "learning_rate": 0.00022588389731621937, | |
| "loss": 3.2652, | |
| "step": 42850 | |
| }, | |
| { | |
| "epoch": 12.496387788394314, | |
| "grad_norm": 0.3787255883216858, | |
| "learning_rate": 0.00022544632438739788, | |
| "loss": 3.2619, | |
| "step": 42900 | |
| }, | |
| { | |
| "epoch": 12.51095315777208, | |
| "grad_norm": 0.38174960017204285, | |
| "learning_rate": 0.0002250087514585764, | |
| "loss": 3.2742, | |
| "step": 42950 | |
| }, | |
| { | |
| "epoch": 12.525518527149849, | |
| "grad_norm": 0.39155444502830505, | |
| "learning_rate": 0.00022457117852975496, | |
| "loss": 3.2818, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 12.525518527149849, | |
| "eval_accuracy": 0.37251695574082516, | |
| "eval_loss": 3.541425943374634, | |
| "eval_runtime": 180.4575, | |
| "eval_samples_per_second": 92.232, | |
| "eval_steps_per_second": 5.769, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 12.540083896527616, | |
| "grad_norm": 0.3748781979084015, | |
| "learning_rate": 0.00022413360560093347, | |
| "loss": 3.28, | |
| "step": 43050 | |
| }, | |
| { | |
| "epoch": 12.554649265905383, | |
| "grad_norm": 0.3864782452583313, | |
| "learning_rate": 0.000223696032672112, | |
| "loss": 3.2746, | |
| "step": 43100 | |
| }, | |
| { | |
| "epoch": 12.569214635283151, | |
| "grad_norm": 0.39516115188598633, | |
| "learning_rate": 0.00022325845974329053, | |
| "loss": 3.2664, | |
| "step": 43150 | |
| }, | |
| { | |
| "epoch": 12.583780004660918, | |
| "grad_norm": 0.3874489367008209, | |
| "learning_rate": 0.00022282088681446904, | |
| "loss": 3.2765, | |
| "step": 43200 | |
| }, | |
| { | |
| "epoch": 12.598345374038686, | |
| "grad_norm": 0.4148963689804077, | |
| "learning_rate": 0.0002223833138856476, | |
| "loss": 3.2833, | |
| "step": 43250 | |
| }, | |
| { | |
| "epoch": 12.612910743416453, | |
| "grad_norm": 0.38245537877082825, | |
| "learning_rate": 0.00022194574095682612, | |
| "loss": 3.2826, | |
| "step": 43300 | |
| }, | |
| { | |
| "epoch": 12.62747611279422, | |
| "grad_norm": 0.3959484100341797, | |
| "learning_rate": 0.00022150816802800466, | |
| "loss": 3.2772, | |
| "step": 43350 | |
| }, | |
| { | |
| "epoch": 12.642041482171988, | |
| "grad_norm": 0.3956339359283447, | |
| "learning_rate": 0.00022107059509918317, | |
| "loss": 3.2741, | |
| "step": 43400 | |
| }, | |
| { | |
| "epoch": 12.656606851549755, | |
| "grad_norm": 0.3839803636074066, | |
| "learning_rate": 0.00022063302217036168, | |
| "loss": 3.2662, | |
| "step": 43450 | |
| }, | |
| { | |
| "epoch": 12.671172220927522, | |
| "grad_norm": 0.40059152245521545, | |
| "learning_rate": 0.00022019544924154025, | |
| "loss": 3.2851, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 12.68573759030529, | |
| "grad_norm": 0.3880845904350281, | |
| "learning_rate": 0.00021975787631271876, | |
| "loss": 3.2854, | |
| "step": 43550 | |
| }, | |
| { | |
| "epoch": 12.700302959683057, | |
| "grad_norm": 0.3912261128425598, | |
| "learning_rate": 0.0002193203033838973, | |
| "loss": 3.2838, | |
| "step": 43600 | |
| }, | |
| { | |
| "epoch": 12.714868329060826, | |
| "grad_norm": 0.41812238097190857, | |
| "learning_rate": 0.00021888273045507582, | |
| "loss": 3.2902, | |
| "step": 43650 | |
| }, | |
| { | |
| "epoch": 12.729433698438593, | |
| "grad_norm": 0.3847753703594208, | |
| "learning_rate": 0.00021844515752625436, | |
| "loss": 3.2726, | |
| "step": 43700 | |
| }, | |
| { | |
| "epoch": 12.74399906781636, | |
| "grad_norm": 0.3847730755805969, | |
| "learning_rate": 0.0002180075845974329, | |
| "loss": 3.2761, | |
| "step": 43750 | |
| }, | |
| { | |
| "epoch": 12.758564437194128, | |
| "grad_norm": 0.3814358413219452, | |
| "learning_rate": 0.0002175700116686114, | |
| "loss": 3.2816, | |
| "step": 43800 | |
| }, | |
| { | |
| "epoch": 12.773129806571895, | |
| "grad_norm": 0.39806804060935974, | |
| "learning_rate": 0.00021713243873978995, | |
| "loss": 3.2889, | |
| "step": 43850 | |
| }, | |
| { | |
| "epoch": 12.787695175949661, | |
| "grad_norm": 0.3872688412666321, | |
| "learning_rate": 0.0002166948658109685, | |
| "loss": 3.2889, | |
| "step": 43900 | |
| }, | |
| { | |
| "epoch": 12.80226054532743, | |
| "grad_norm": 0.3840930461883545, | |
| "learning_rate": 0.000216257292882147, | |
| "loss": 3.2767, | |
| "step": 43950 | |
| }, | |
| { | |
| "epoch": 12.816825914705197, | |
| "grad_norm": 0.3970656096935272, | |
| "learning_rate": 0.00021581971995332554, | |
| "loss": 3.2772, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 12.816825914705197, | |
| "eval_accuracy": 0.37332153333229867, | |
| "eval_loss": 3.5360467433929443, | |
| "eval_runtime": 180.2447, | |
| "eval_samples_per_second": 92.341, | |
| "eval_steps_per_second": 5.775, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 12.831391284082965, | |
| "grad_norm": 0.3944132328033447, | |
| "learning_rate": 0.00021538214702450405, | |
| "loss": 3.2893, | |
| "step": 44050 | |
| }, | |
| { | |
| "epoch": 12.845956653460732, | |
| "grad_norm": 0.40921568870544434, | |
| "learning_rate": 0.00021494457409568262, | |
| "loss": 3.2811, | |
| "step": 44100 | |
| }, | |
| { | |
| "epoch": 12.860522022838499, | |
| "grad_norm": 0.37589746713638306, | |
| "learning_rate": 0.00021450700116686113, | |
| "loss": 3.278, | |
| "step": 44150 | |
| }, | |
| { | |
| "epoch": 12.875087392216267, | |
| "grad_norm": 0.4068247377872467, | |
| "learning_rate": 0.00021406942823803965, | |
| "loss": 3.2848, | |
| "step": 44200 | |
| }, | |
| { | |
| "epoch": 12.889652761594034, | |
| "grad_norm": 0.41013479232788086, | |
| "learning_rate": 0.0002136318553092182, | |
| "loss": 3.2929, | |
| "step": 44250 | |
| }, | |
| { | |
| "epoch": 12.9042181309718, | |
| "grad_norm": 0.39379021525382996, | |
| "learning_rate": 0.0002131942823803967, | |
| "loss": 3.2898, | |
| "step": 44300 | |
| }, | |
| { | |
| "epoch": 12.91878350034957, | |
| "grad_norm": 0.38993388414382935, | |
| "learning_rate": 0.00021275670945157527, | |
| "loss": 3.287, | |
| "step": 44350 | |
| }, | |
| { | |
| "epoch": 12.933348869727336, | |
| "grad_norm": 0.4032069146633148, | |
| "learning_rate": 0.00021231913652275378, | |
| "loss": 3.2932, | |
| "step": 44400 | |
| }, | |
| { | |
| "epoch": 12.947914239105105, | |
| "grad_norm": 0.40004608035087585, | |
| "learning_rate": 0.0002118815635939323, | |
| "loss": 3.2954, | |
| "step": 44450 | |
| }, | |
| { | |
| "epoch": 12.962479608482871, | |
| "grad_norm": 0.39480239152908325, | |
| "learning_rate": 0.00021144399066511083, | |
| "loss": 3.2967, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 12.977044977860638, | |
| "grad_norm": 0.4099850058555603, | |
| "learning_rate": 0.00021100641773628935, | |
| "loss": 3.2809, | |
| "step": 44550 | |
| }, | |
| { | |
| "epoch": 12.991610347238407, | |
| "grad_norm": 0.38592153787612915, | |
| "learning_rate": 0.0002105688448074679, | |
| "loss": 3.2948, | |
| "step": 44600 | |
| }, | |
| { | |
| "epoch": 13.006117455138662, | |
| "grad_norm": 0.3878653049468994, | |
| "learning_rate": 0.00021013127187864643, | |
| "loss": 3.2447, | |
| "step": 44650 | |
| }, | |
| { | |
| "epoch": 13.02068282451643, | |
| "grad_norm": 0.40472573041915894, | |
| "learning_rate": 0.00020969369894982494, | |
| "loss": 3.1872, | |
| "step": 44700 | |
| }, | |
| { | |
| "epoch": 13.035248193894198, | |
| "grad_norm": 0.38480067253112793, | |
| "learning_rate": 0.00020925612602100348, | |
| "loss": 3.1966, | |
| "step": 44750 | |
| }, | |
| { | |
| "epoch": 13.049813563271965, | |
| "grad_norm": 0.4043852388858795, | |
| "learning_rate": 0.00020881855309218202, | |
| "loss": 3.1928, | |
| "step": 44800 | |
| }, | |
| { | |
| "epoch": 13.064378932649731, | |
| "grad_norm": 0.3920169174671173, | |
| "learning_rate": 0.00020838098016336056, | |
| "loss": 3.2142, | |
| "step": 44850 | |
| }, | |
| { | |
| "epoch": 13.0789443020275, | |
| "grad_norm": 0.4085189402103424, | |
| "learning_rate": 0.00020794340723453907, | |
| "loss": 3.1996, | |
| "step": 44900 | |
| }, | |
| { | |
| "epoch": 13.093509671405267, | |
| "grad_norm": 0.39081132411956787, | |
| "learning_rate": 0.00020750583430571758, | |
| "loss": 3.2162, | |
| "step": 44950 | |
| }, | |
| { | |
| "epoch": 13.108075040783035, | |
| "grad_norm": 0.4104847311973572, | |
| "learning_rate": 0.00020706826137689615, | |
| "loss": 3.215, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 13.108075040783035, | |
| "eval_accuracy": 0.3724899130919812, | |
| "eval_loss": 3.5466861724853516, | |
| "eval_runtime": 180.2123, | |
| "eval_samples_per_second": 92.358, | |
| "eval_steps_per_second": 5.777, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 13.122640410160802, | |
| "grad_norm": 0.40783169865608215, | |
| "learning_rate": 0.00020663068844807466, | |
| "loss": 3.217, | |
| "step": 45050 | |
| }, | |
| { | |
| "epoch": 13.137205779538569, | |
| "grad_norm": 0.3994167149066925, | |
| "learning_rate": 0.0002061931155192532, | |
| "loss": 3.214, | |
| "step": 45100 | |
| }, | |
| { | |
| "epoch": 13.151771148916337, | |
| "grad_norm": 0.41038912534713745, | |
| "learning_rate": 0.00020575554259043172, | |
| "loss": 3.2265, | |
| "step": 45150 | |
| }, | |
| { | |
| "epoch": 13.166336518294104, | |
| "grad_norm": 0.3970767557621002, | |
| "learning_rate": 0.00020531796966161023, | |
| "loss": 3.2219, | |
| "step": 45200 | |
| }, | |
| { | |
| "epoch": 13.18090188767187, | |
| "grad_norm": 0.4076697528362274, | |
| "learning_rate": 0.0002048803967327888, | |
| "loss": 3.2133, | |
| "step": 45250 | |
| }, | |
| { | |
| "epoch": 13.19546725704964, | |
| "grad_norm": 0.40613362193107605, | |
| "learning_rate": 0.0002044428238039673, | |
| "loss": 3.22, | |
| "step": 45300 | |
| }, | |
| { | |
| "epoch": 13.210032626427406, | |
| "grad_norm": 0.39395052194595337, | |
| "learning_rate": 0.00020400525087514585, | |
| "loss": 3.2279, | |
| "step": 45350 | |
| }, | |
| { | |
| "epoch": 13.224597995805174, | |
| "grad_norm": 0.3916940987110138, | |
| "learning_rate": 0.00020356767794632436, | |
| "loss": 3.2346, | |
| "step": 45400 | |
| }, | |
| { | |
| "epoch": 13.239163365182941, | |
| "grad_norm": 0.41231533885002136, | |
| "learning_rate": 0.00020313010501750287, | |
| "loss": 3.2357, | |
| "step": 45450 | |
| }, | |
| { | |
| "epoch": 13.253728734560708, | |
| "grad_norm": 0.4182799160480499, | |
| "learning_rate": 0.00020269253208868144, | |
| "loss": 3.2334, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 13.268294103938477, | |
| "grad_norm": 0.4099382162094116, | |
| "learning_rate": 0.00020225495915985995, | |
| "loss": 3.2341, | |
| "step": 45550 | |
| }, | |
| { | |
| "epoch": 13.282859473316243, | |
| "grad_norm": 0.4044232666492462, | |
| "learning_rate": 0.0002018173862310385, | |
| "loss": 3.2213, | |
| "step": 45600 | |
| }, | |
| { | |
| "epoch": 13.29742484269401, | |
| "grad_norm": 0.39154335856437683, | |
| "learning_rate": 0.000201379813302217, | |
| "loss": 3.237, | |
| "step": 45650 | |
| }, | |
| { | |
| "epoch": 13.311990212071779, | |
| "grad_norm": 0.4079340398311615, | |
| "learning_rate": 0.00020094224037339555, | |
| "loss": 3.2376, | |
| "step": 45700 | |
| }, | |
| { | |
| "epoch": 13.326555581449545, | |
| "grad_norm": 0.39542028307914734, | |
| "learning_rate": 0.0002005046674445741, | |
| "loss": 3.2315, | |
| "step": 45750 | |
| }, | |
| { | |
| "epoch": 13.341120950827314, | |
| "grad_norm": 0.39488768577575684, | |
| "learning_rate": 0.0002000670945157526, | |
| "loss": 3.2401, | |
| "step": 45800 | |
| }, | |
| { | |
| "epoch": 13.35568632020508, | |
| "grad_norm": 0.41860339045524597, | |
| "learning_rate": 0.00019962952158693114, | |
| "loss": 3.2385, | |
| "step": 45850 | |
| }, | |
| { | |
| "epoch": 13.370251689582847, | |
| "grad_norm": 0.4021410644054413, | |
| "learning_rate": 0.00019919194865810968, | |
| "loss": 3.2472, | |
| "step": 45900 | |
| }, | |
| { | |
| "epoch": 13.384817058960616, | |
| "grad_norm": 0.3935169279575348, | |
| "learning_rate": 0.0001987543757292882, | |
| "loss": 3.2474, | |
| "step": 45950 | |
| }, | |
| { | |
| "epoch": 13.399382428338383, | |
| "grad_norm": 0.4164498448371887, | |
| "learning_rate": 0.00019831680280046673, | |
| "loss": 3.2448, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 13.399382428338383, | |
| "eval_accuracy": 0.37297574015729884, | |
| "eval_loss": 3.5430777072906494, | |
| "eval_runtime": 180.1561, | |
| "eval_samples_per_second": 92.387, | |
| "eval_steps_per_second": 5.778, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 13.41394779771615, | |
| "grad_norm": 0.4161559045314789, | |
| "learning_rate": 0.00019787922987164524, | |
| "loss": 3.2397, | |
| "step": 46050 | |
| }, | |
| { | |
| "epoch": 13.428513167093918, | |
| "grad_norm": 0.40776827931404114, | |
| "learning_rate": 0.0001974416569428238, | |
| "loss": 3.2325, | |
| "step": 46100 | |
| }, | |
| { | |
| "epoch": 13.443078536471685, | |
| "grad_norm": 0.3878330886363983, | |
| "learning_rate": 0.00019700408401400232, | |
| "loss": 3.2609, | |
| "step": 46150 | |
| }, | |
| { | |
| "epoch": 13.457643905849451, | |
| "grad_norm": 0.40034887194633484, | |
| "learning_rate": 0.00019656651108518084, | |
| "loss": 3.2584, | |
| "step": 46200 | |
| }, | |
| { | |
| "epoch": 13.47220927522722, | |
| "grad_norm": 0.40647125244140625, | |
| "learning_rate": 0.00019612893815635938, | |
| "loss": 3.2431, | |
| "step": 46250 | |
| }, | |
| { | |
| "epoch": 13.486774644604987, | |
| "grad_norm": 0.3935099244117737, | |
| "learning_rate": 0.0001956913652275379, | |
| "loss": 3.2455, | |
| "step": 46300 | |
| }, | |
| { | |
| "epoch": 13.501340013982755, | |
| "grad_norm": 0.3952663540840149, | |
| "learning_rate": 0.00019525379229871646, | |
| "loss": 3.2482, | |
| "step": 46350 | |
| }, | |
| { | |
| "epoch": 13.515905383360522, | |
| "grad_norm": 0.390480637550354, | |
| "learning_rate": 0.00019481621936989497, | |
| "loss": 3.2544, | |
| "step": 46400 | |
| }, | |
| { | |
| "epoch": 13.530470752738289, | |
| "grad_norm": 0.40572217106819153, | |
| "learning_rate": 0.00019437864644107348, | |
| "loss": 3.2502, | |
| "step": 46450 | |
| }, | |
| { | |
| "epoch": 13.545036122116057, | |
| "grad_norm": 0.38214248418807983, | |
| "learning_rate": 0.00019394107351225202, | |
| "loss": 3.2427, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 13.559601491493824, | |
| "grad_norm": 0.4259106516838074, | |
| "learning_rate": 0.00019350350058343054, | |
| "loss": 3.2479, | |
| "step": 46550 | |
| }, | |
| { | |
| "epoch": 13.574166860871593, | |
| "grad_norm": 0.3941766917705536, | |
| "learning_rate": 0.0001930659276546091, | |
| "loss": 3.2628, | |
| "step": 46600 | |
| }, | |
| { | |
| "epoch": 13.58873223024936, | |
| "grad_norm": 0.40022504329681396, | |
| "learning_rate": 0.00019262835472578762, | |
| "loss": 3.2478, | |
| "step": 46650 | |
| }, | |
| { | |
| "epoch": 13.603297599627126, | |
| "grad_norm": 0.3927033841609955, | |
| "learning_rate": 0.00019219078179696613, | |
| "loss": 3.2597, | |
| "step": 46700 | |
| }, | |
| { | |
| "epoch": 13.617862969004895, | |
| "grad_norm": 0.4204312562942505, | |
| "learning_rate": 0.00019175320886814467, | |
| "loss": 3.2552, | |
| "step": 46750 | |
| }, | |
| { | |
| "epoch": 13.632428338382661, | |
| "grad_norm": 0.4014910161495209, | |
| "learning_rate": 0.0001913156359393232, | |
| "loss": 3.2582, | |
| "step": 46800 | |
| }, | |
| { | |
| "epoch": 13.646993707760428, | |
| "grad_norm": 0.3960302770137787, | |
| "learning_rate": 0.00019087806301050175, | |
| "loss": 3.247, | |
| "step": 46850 | |
| }, | |
| { | |
| "epoch": 13.661559077138197, | |
| "grad_norm": 0.40421754121780396, | |
| "learning_rate": 0.00019044049008168026, | |
| "loss": 3.2509, | |
| "step": 46900 | |
| }, | |
| { | |
| "epoch": 13.676124446515963, | |
| "grad_norm": 0.4028851091861725, | |
| "learning_rate": 0.00019000291715285877, | |
| "loss": 3.2603, | |
| "step": 46950 | |
| }, | |
| { | |
| "epoch": 13.69068981589373, | |
| "grad_norm": 0.4152960181236267, | |
| "learning_rate": 0.00018956534422403734, | |
| "loss": 3.2716, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 13.69068981589373, | |
| "eval_accuracy": 0.37343640580151827, | |
| "eval_loss": 3.536942481994629, | |
| "eval_runtime": 180.1541, | |
| "eval_samples_per_second": 92.388, | |
| "eval_steps_per_second": 5.778, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 13.705255185271499, | |
| "grad_norm": 0.40029028058052063, | |
| "learning_rate": 0.00018912777129521585, | |
| "loss": 3.2608, | |
| "step": 47050 | |
| }, | |
| { | |
| "epoch": 13.719820554649266, | |
| "grad_norm": 0.4005506634712219, | |
| "learning_rate": 0.0001886901983663944, | |
| "loss": 3.2562, | |
| "step": 47100 | |
| }, | |
| { | |
| "epoch": 13.734385924027034, | |
| "grad_norm": 0.4043956398963928, | |
| "learning_rate": 0.0001882526254375729, | |
| "loss": 3.2553, | |
| "step": 47150 | |
| }, | |
| { | |
| "epoch": 13.7489512934048, | |
| "grad_norm": 0.393660306930542, | |
| "learning_rate": 0.00018781505250875142, | |
| "loss": 3.2504, | |
| "step": 47200 | |
| }, | |
| { | |
| "epoch": 13.763516662782568, | |
| "grad_norm": 0.41873812675476074, | |
| "learning_rate": 0.00018737747957992999, | |
| "loss": 3.2641, | |
| "step": 47250 | |
| }, | |
| { | |
| "epoch": 13.778082032160336, | |
| "grad_norm": 0.39937934279441833, | |
| "learning_rate": 0.0001869399066511085, | |
| "loss": 3.2601, | |
| "step": 47300 | |
| }, | |
| { | |
| "epoch": 13.792647401538103, | |
| "grad_norm": 0.39644569158554077, | |
| "learning_rate": 0.00018650233372228704, | |
| "loss": 3.2579, | |
| "step": 47350 | |
| }, | |
| { | |
| "epoch": 13.80721277091587, | |
| "grad_norm": 0.4110250174999237, | |
| "learning_rate": 0.00018606476079346555, | |
| "loss": 3.2545, | |
| "step": 47400 | |
| }, | |
| { | |
| "epoch": 13.821778140293638, | |
| "grad_norm": 0.39572134613990784, | |
| "learning_rate": 0.00018562718786464406, | |
| "loss": 3.2551, | |
| "step": 47450 | |
| }, | |
| { | |
| "epoch": 13.836343509671405, | |
| "grad_norm": 0.40120694041252136, | |
| "learning_rate": 0.00018518961493582263, | |
| "loss": 3.2497, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 13.850908879049173, | |
| "grad_norm": 0.3942031264305115, | |
| "learning_rate": 0.00018475204200700114, | |
| "loss": 3.2592, | |
| "step": 47550 | |
| }, | |
| { | |
| "epoch": 13.86547424842694, | |
| "grad_norm": 0.4140487611293793, | |
| "learning_rate": 0.00018431446907817968, | |
| "loss": 3.2552, | |
| "step": 47600 | |
| }, | |
| { | |
| "epoch": 13.880039617804707, | |
| "grad_norm": 0.39110127091407776, | |
| "learning_rate": 0.0001838768961493582, | |
| "loss": 3.261, | |
| "step": 47650 | |
| }, | |
| { | |
| "epoch": 13.894604987182475, | |
| "grad_norm": 0.4091663360595703, | |
| "learning_rate": 0.00018343932322053674, | |
| "loss": 3.2709, | |
| "step": 47700 | |
| }, | |
| { | |
| "epoch": 13.909170356560242, | |
| "grad_norm": 0.39773812890052795, | |
| "learning_rate": 0.00018300175029171528, | |
| "loss": 3.2645, | |
| "step": 47750 | |
| }, | |
| { | |
| "epoch": 13.923735725938009, | |
| "grad_norm": 0.4022299647331238, | |
| "learning_rate": 0.0001825641773628938, | |
| "loss": 3.2597, | |
| "step": 47800 | |
| }, | |
| { | |
| "epoch": 13.938301095315778, | |
| "grad_norm": 0.3977898061275482, | |
| "learning_rate": 0.00018212660443407233, | |
| "loss": 3.2697, | |
| "step": 47850 | |
| }, | |
| { | |
| "epoch": 13.952866464693544, | |
| "grad_norm": 0.38834723830223083, | |
| "learning_rate": 0.00018168903150525087, | |
| "loss": 3.2585, | |
| "step": 47900 | |
| }, | |
| { | |
| "epoch": 13.967431834071313, | |
| "grad_norm": 0.3896270990371704, | |
| "learning_rate": 0.00018125145857642938, | |
| "loss": 3.2654, | |
| "step": 47950 | |
| }, | |
| { | |
| "epoch": 13.98199720344908, | |
| "grad_norm": 0.41397517919540405, | |
| "learning_rate": 0.00018081388564760792, | |
| "loss": 3.2672, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 13.98199720344908, | |
| "eval_accuracy": 0.37412834488172014, | |
| "eval_loss": 3.5272507667541504, | |
| "eval_runtime": 180.2142, | |
| "eval_samples_per_second": 92.357, | |
| "eval_steps_per_second": 5.776, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 13.996562572826846, | |
| "grad_norm": 0.3925948739051819, | |
| "learning_rate": 0.00018037631271878644, | |
| "loss": 3.2785, | |
| "step": 48050 | |
| }, | |
| { | |
| "epoch": 14.011069680727104, | |
| "grad_norm": 0.39326012134552, | |
| "learning_rate": 0.000179938739789965, | |
| "loss": 3.2021, | |
| "step": 48100 | |
| }, | |
| { | |
| "epoch": 14.02563505010487, | |
| "grad_norm": 0.40781304240226746, | |
| "learning_rate": 0.00017950116686114352, | |
| "loss": 3.1782, | |
| "step": 48150 | |
| }, | |
| { | |
| "epoch": 14.040200419482638, | |
| "grad_norm": 0.3889636695384979, | |
| "learning_rate": 0.00017906359393232203, | |
| "loss": 3.1885, | |
| "step": 48200 | |
| }, | |
| { | |
| "epoch": 14.054765788860406, | |
| "grad_norm": 0.4008404314517975, | |
| "learning_rate": 0.00017862602100350057, | |
| "loss": 3.1821, | |
| "step": 48250 | |
| }, | |
| { | |
| "epoch": 14.069331158238173, | |
| "grad_norm": 0.4058891832828522, | |
| "learning_rate": 0.00017818844807467908, | |
| "loss": 3.1926, | |
| "step": 48300 | |
| }, | |
| { | |
| "epoch": 14.08389652761594, | |
| "grad_norm": 0.3980492949485779, | |
| "learning_rate": 0.00017775087514585765, | |
| "loss": 3.1981, | |
| "step": 48350 | |
| }, | |
| { | |
| "epoch": 14.098461896993708, | |
| "grad_norm": 0.4085221588611603, | |
| "learning_rate": 0.00017731330221703616, | |
| "loss": 3.1935, | |
| "step": 48400 | |
| }, | |
| { | |
| "epoch": 14.113027266371475, | |
| "grad_norm": 0.41492384672164917, | |
| "learning_rate": 0.00017687572928821467, | |
| "loss": 3.194, | |
| "step": 48450 | |
| }, | |
| { | |
| "epoch": 14.127592635749243, | |
| "grad_norm": 0.4290497899055481, | |
| "learning_rate": 0.00017643815635939321, | |
| "loss": 3.1947, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 14.14215800512701, | |
| "grad_norm": 0.42287999391555786, | |
| "learning_rate": 0.00017600058343057173, | |
| "loss": 3.1991, | |
| "step": 48550 | |
| }, | |
| { | |
| "epoch": 14.156723374504777, | |
| "grad_norm": 0.39472466707229614, | |
| "learning_rate": 0.0001755630105017503, | |
| "loss": 3.2116, | |
| "step": 48600 | |
| }, | |
| { | |
| "epoch": 14.171288743882545, | |
| "grad_norm": 0.4188964068889618, | |
| "learning_rate": 0.0001751254375729288, | |
| "loss": 3.1942, | |
| "step": 48650 | |
| }, | |
| { | |
| "epoch": 14.185854113260312, | |
| "grad_norm": 0.4070267975330353, | |
| "learning_rate": 0.00017468786464410732, | |
| "loss": 3.1969, | |
| "step": 48700 | |
| }, | |
| { | |
| "epoch": 14.200419482638079, | |
| "grad_norm": 0.40462633967399597, | |
| "learning_rate": 0.00017425029171528586, | |
| "loss": 3.2017, | |
| "step": 48750 | |
| }, | |
| { | |
| "epoch": 14.214984852015847, | |
| "grad_norm": 0.40400370955467224, | |
| "learning_rate": 0.0001738127187864644, | |
| "loss": 3.2093, | |
| "step": 48800 | |
| }, | |
| { | |
| "epoch": 14.229550221393614, | |
| "grad_norm": 0.3998878002166748, | |
| "learning_rate": 0.00017337514585764294, | |
| "loss": 3.2056, | |
| "step": 48850 | |
| }, | |
| { | |
| "epoch": 14.244115590771383, | |
| "grad_norm": 0.3977794945240021, | |
| "learning_rate": 0.00017293757292882145, | |
| "loss": 3.2038, | |
| "step": 48900 | |
| }, | |
| { | |
| "epoch": 14.25868096014915, | |
| "grad_norm": 0.4316108226776123, | |
| "learning_rate": 0.00017249999999999996, | |
| "loss": 3.2037, | |
| "step": 48950 | |
| }, | |
| { | |
| "epoch": 14.273246329526916, | |
| "grad_norm": 0.41260573267936707, | |
| "learning_rate": 0.00017206242707117853, | |
| "loss": 3.2114, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 14.273246329526916, | |
| "eval_accuracy": 0.37353940302059335, | |
| "eval_loss": 3.538419723510742, | |
| "eval_runtime": 180.2527, | |
| "eval_samples_per_second": 92.337, | |
| "eval_steps_per_second": 5.775, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 14.287811698904685, | |
| "grad_norm": 0.4174029231071472, | |
| "learning_rate": 0.00017162485414235704, | |
| "loss": 3.2237, | |
| "step": 49050 | |
| }, | |
| { | |
| "epoch": 14.302377068282452, | |
| "grad_norm": 0.42132076621055603, | |
| "learning_rate": 0.00017118728121353558, | |
| "loss": 3.2049, | |
| "step": 49100 | |
| }, | |
| { | |
| "epoch": 14.316942437660218, | |
| "grad_norm": 0.41422000527381897, | |
| "learning_rate": 0.0001707497082847141, | |
| "loss": 3.208, | |
| "step": 49150 | |
| }, | |
| { | |
| "epoch": 14.331507807037987, | |
| "grad_norm": 0.4296468198299408, | |
| "learning_rate": 0.0001703121353558926, | |
| "loss": 3.2076, | |
| "step": 49200 | |
| }, | |
| { | |
| "epoch": 14.346073176415754, | |
| "grad_norm": 0.40375787019729614, | |
| "learning_rate": 0.00016987456242707118, | |
| "loss": 3.2195, | |
| "step": 49250 | |
| }, | |
| { | |
| "epoch": 14.360638545793522, | |
| "grad_norm": 0.4078134298324585, | |
| "learning_rate": 0.0001694369894982497, | |
| "loss": 3.2231, | |
| "step": 49300 | |
| }, | |
| { | |
| "epoch": 14.375203915171289, | |
| "grad_norm": 0.4103347063064575, | |
| "learning_rate": 0.00016899941656942823, | |
| "loss": 3.224, | |
| "step": 49350 | |
| }, | |
| { | |
| "epoch": 14.389769284549056, | |
| "grad_norm": 0.4056347906589508, | |
| "learning_rate": 0.00016856184364060674, | |
| "loss": 3.2211, | |
| "step": 49400 | |
| }, | |
| { | |
| "epoch": 14.404334653926824, | |
| "grad_norm": 0.43045109510421753, | |
| "learning_rate": 0.00016812427071178528, | |
| "loss": 3.2178, | |
| "step": 49450 | |
| }, | |
| { | |
| "epoch": 14.418900023304591, | |
| "grad_norm": 0.4060124158859253, | |
| "learning_rate": 0.00016768669778296382, | |
| "loss": 3.2132, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 14.433465392682358, | |
| "grad_norm": 0.40384456515312195, | |
| "learning_rate": 0.00016724912485414234, | |
| "loss": 3.2172, | |
| "step": 49550 | |
| }, | |
| { | |
| "epoch": 14.448030762060126, | |
| "grad_norm": 0.40116435289382935, | |
| "learning_rate": 0.00016681155192532088, | |
| "loss": 3.2099, | |
| "step": 49600 | |
| }, | |
| { | |
| "epoch": 14.462596131437893, | |
| "grad_norm": 0.4094943404197693, | |
| "learning_rate": 0.00016637397899649942, | |
| "loss": 3.2127, | |
| "step": 49650 | |
| }, | |
| { | |
| "epoch": 14.477161500815662, | |
| "grad_norm": 0.40145185589790344, | |
| "learning_rate": 0.00016593640606767793, | |
| "loss": 3.22, | |
| "step": 49700 | |
| }, | |
| { | |
| "epoch": 14.491726870193428, | |
| "grad_norm": 0.42102572321891785, | |
| "learning_rate": 0.00016549883313885647, | |
| "loss": 3.2253, | |
| "step": 49750 | |
| }, | |
| { | |
| "epoch": 14.506292239571195, | |
| "grad_norm": 0.41271886229515076, | |
| "learning_rate": 0.00016506126021003498, | |
| "loss": 3.207, | |
| "step": 49800 | |
| }, | |
| { | |
| "epoch": 14.520857608948964, | |
| "grad_norm": 0.41741323471069336, | |
| "learning_rate": 0.00016462368728121355, | |
| "loss": 3.2322, | |
| "step": 49850 | |
| }, | |
| { | |
| "epoch": 14.53542297832673, | |
| "grad_norm": 0.40796613693237305, | |
| "learning_rate": 0.00016418611435239206, | |
| "loss": 3.2178, | |
| "step": 49900 | |
| }, | |
| { | |
| "epoch": 14.549988347704497, | |
| "grad_norm": 0.4142317771911621, | |
| "learning_rate": 0.00016374854142357057, | |
| "loss": 3.2196, | |
| "step": 49950 | |
| }, | |
| { | |
| "epoch": 14.564553717082266, | |
| "grad_norm": 0.4134737253189087, | |
| "learning_rate": 0.0001633109684947491, | |
| "loss": 3.229, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 14.564553717082266, | |
| "eval_accuracy": 0.3739324620427029, | |
| "eval_loss": 3.535773515701294, | |
| "eval_runtime": 180.2641, | |
| "eval_samples_per_second": 92.331, | |
| "eval_steps_per_second": 5.775, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 14.579119086460032, | |
| "grad_norm": 0.42424920201301575, | |
| "learning_rate": 0.00016287339556592763, | |
| "loss": 3.2276, | |
| "step": 50050 | |
| }, | |
| { | |
| "epoch": 14.5936844558378, | |
| "grad_norm": 0.40107038617134094, | |
| "learning_rate": 0.0001624358226371062, | |
| "loss": 3.232, | |
| "step": 50100 | |
| }, | |
| { | |
| "epoch": 14.608249825215568, | |
| "grad_norm": 0.38732558488845825, | |
| "learning_rate": 0.0001619982497082847, | |
| "loss": 3.2306, | |
| "step": 50150 | |
| }, | |
| { | |
| "epoch": 14.622815194593334, | |
| "grad_norm": 0.42881304025650024, | |
| "learning_rate": 0.00016156067677946322, | |
| "loss": 3.2385, | |
| "step": 50200 | |
| }, | |
| { | |
| "epoch": 14.637380563971103, | |
| "grad_norm": 0.41138574481010437, | |
| "learning_rate": 0.00016112310385064176, | |
| "loss": 3.2379, | |
| "step": 50250 | |
| }, | |
| { | |
| "epoch": 14.65194593334887, | |
| "grad_norm": 0.41807669401168823, | |
| "learning_rate": 0.00016068553092182027, | |
| "loss": 3.2306, | |
| "step": 50300 | |
| }, | |
| { | |
| "epoch": 14.666511302726637, | |
| "grad_norm": 0.4081842005252838, | |
| "learning_rate": 0.00016024795799299884, | |
| "loss": 3.238, | |
| "step": 50350 | |
| }, | |
| { | |
| "epoch": 14.681076672104405, | |
| "grad_norm": 0.39501240849494934, | |
| "learning_rate": 0.00015981038506417735, | |
| "loss": 3.2364, | |
| "step": 50400 | |
| }, | |
| { | |
| "epoch": 14.695642041482172, | |
| "grad_norm": 0.4069629907608032, | |
| "learning_rate": 0.00015937281213535586, | |
| "loss": 3.2281, | |
| "step": 50450 | |
| }, | |
| { | |
| "epoch": 14.71020741085994, | |
| "grad_norm": 0.4017820656299591, | |
| "learning_rate": 0.0001589352392065344, | |
| "loss": 3.2359, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 14.724772780237707, | |
| "grad_norm": 0.430561900138855, | |
| "learning_rate": 0.00015849766627771294, | |
| "loss": 3.2218, | |
| "step": 50550 | |
| }, | |
| { | |
| "epoch": 14.739338149615474, | |
| "grad_norm": 0.4142705202102661, | |
| "learning_rate": 0.00015806009334889148, | |
| "loss": 3.2357, | |
| "step": 50600 | |
| }, | |
| { | |
| "epoch": 14.753903518993242, | |
| "grad_norm": 0.42446601390838623, | |
| "learning_rate": 0.00015762252042007, | |
| "loss": 3.2325, | |
| "step": 50650 | |
| }, | |
| { | |
| "epoch": 14.76846888837101, | |
| "grad_norm": 0.41279682517051697, | |
| "learning_rate": 0.0001571849474912485, | |
| "loss": 3.2272, | |
| "step": 50700 | |
| }, | |
| { | |
| "epoch": 14.783034257748776, | |
| "grad_norm": 0.4026637375354767, | |
| "learning_rate": 0.00015674737456242708, | |
| "loss": 3.2323, | |
| "step": 50750 | |
| }, | |
| { | |
| "epoch": 14.797599627126544, | |
| "grad_norm": 0.4120595157146454, | |
| "learning_rate": 0.0001563098016336056, | |
| "loss": 3.241, | |
| "step": 50800 | |
| }, | |
| { | |
| "epoch": 14.812164996504311, | |
| "grad_norm": 0.4040710926055908, | |
| "learning_rate": 0.00015587222870478413, | |
| "loss": 3.2423, | |
| "step": 50850 | |
| }, | |
| { | |
| "epoch": 14.826730365882078, | |
| "grad_norm": 0.4115070402622223, | |
| "learning_rate": 0.00015543465577596264, | |
| "loss": 3.2362, | |
| "step": 50900 | |
| }, | |
| { | |
| "epoch": 14.841295735259846, | |
| "grad_norm": 0.41906213760375977, | |
| "learning_rate": 0.00015499708284714116, | |
| "loss": 3.2435, | |
| "step": 50950 | |
| }, | |
| { | |
| "epoch": 14.855861104637613, | |
| "grad_norm": 0.42066025733947754, | |
| "learning_rate": 0.00015455950991831972, | |
| "loss": 3.2327, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 14.855861104637613, | |
| "eval_accuracy": 0.37450694196553497, | |
| "eval_loss": 3.529242515563965, | |
| "eval_runtime": 180.1778, | |
| "eval_samples_per_second": 92.375, | |
| "eval_steps_per_second": 5.778, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 14.870426474015382, | |
| "grad_norm": 0.4169136881828308, | |
| "learning_rate": 0.00015412193698949824, | |
| "loss": 3.2329, | |
| "step": 51050 | |
| }, | |
| { | |
| "epoch": 14.884991843393149, | |
| "grad_norm": 0.4026949405670166, | |
| "learning_rate": 0.00015368436406067677, | |
| "loss": 3.2343, | |
| "step": 51100 | |
| }, | |
| { | |
| "epoch": 14.899557212770915, | |
| "grad_norm": 0.4167788028717041, | |
| "learning_rate": 0.0001532467911318553, | |
| "loss": 3.2343, | |
| "step": 51150 | |
| }, | |
| { | |
| "epoch": 14.914122582148684, | |
| "grad_norm": 0.41968563199043274, | |
| "learning_rate": 0.0001528092182030338, | |
| "loss": 3.2343, | |
| "step": 51200 | |
| }, | |
| { | |
| "epoch": 14.92868795152645, | |
| "grad_norm": 0.4078276753425598, | |
| "learning_rate": 0.00015237164527421237, | |
| "loss": 3.2437, | |
| "step": 51250 | |
| }, | |
| { | |
| "epoch": 14.943253320904217, | |
| "grad_norm": 0.4137614667415619, | |
| "learning_rate": 0.00015193407234539088, | |
| "loss": 3.2419, | |
| "step": 51300 | |
| }, | |
| { | |
| "epoch": 14.957818690281986, | |
| "grad_norm": 0.4062293469905853, | |
| "learning_rate": 0.00015149649941656942, | |
| "loss": 3.2358, | |
| "step": 51350 | |
| }, | |
| { | |
| "epoch": 14.972384059659753, | |
| "grad_norm": 0.42490682005882263, | |
| "learning_rate": 0.00015105892648774793, | |
| "loss": 3.2424, | |
| "step": 51400 | |
| }, | |
| { | |
| "epoch": 14.986949429037521, | |
| "grad_norm": 0.4156704545021057, | |
| "learning_rate": 0.00015062135355892647, | |
| "loss": 3.2426, | |
| "step": 51450 | |
| }, | |
| { | |
| "epoch": 15.001456536937777, | |
| "grad_norm": 0.42323753237724304, | |
| "learning_rate": 0.000150183780630105, | |
| "loss": 3.2332, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 15.016021906315544, | |
| "grad_norm": 0.41732102632522583, | |
| "learning_rate": 0.00014974620770128353, | |
| "loss": 3.1514, | |
| "step": 51550 | |
| }, | |
| { | |
| "epoch": 15.030587275693312, | |
| "grad_norm": 0.407942533493042, | |
| "learning_rate": 0.00014930863477246207, | |
| "loss": 3.16, | |
| "step": 51600 | |
| }, | |
| { | |
| "epoch": 15.045152645071079, | |
| "grad_norm": 0.4309288561344147, | |
| "learning_rate": 0.0001488710618436406, | |
| "loss": 3.1614, | |
| "step": 51650 | |
| }, | |
| { | |
| "epoch": 15.059718014448846, | |
| "grad_norm": 0.42493224143981934, | |
| "learning_rate": 0.00014843348891481912, | |
| "loss": 3.1634, | |
| "step": 51700 | |
| }, | |
| { | |
| "epoch": 15.074283383826614, | |
| "grad_norm": 0.39733976125717163, | |
| "learning_rate": 0.00014799591598599766, | |
| "loss": 3.1639, | |
| "step": 51750 | |
| }, | |
| { | |
| "epoch": 15.088848753204381, | |
| "grad_norm": 0.42916181683540344, | |
| "learning_rate": 0.00014755834305717617, | |
| "loss": 3.1924, | |
| "step": 51800 | |
| }, | |
| { | |
| "epoch": 15.103414122582148, | |
| "grad_norm": 0.4103613495826721, | |
| "learning_rate": 0.0001471207701283547, | |
| "loss": 3.1701, | |
| "step": 51850 | |
| }, | |
| { | |
| "epoch": 15.117979491959916, | |
| "grad_norm": 0.409218430519104, | |
| "learning_rate": 0.00014668319719953325, | |
| "loss": 3.169, | |
| "step": 51900 | |
| }, | |
| { | |
| "epoch": 15.132544861337683, | |
| "grad_norm": 0.42851245403289795, | |
| "learning_rate": 0.00014624562427071176, | |
| "loss": 3.1857, | |
| "step": 51950 | |
| }, | |
| { | |
| "epoch": 15.147110230715452, | |
| "grad_norm": 0.41377684473991394, | |
| "learning_rate": 0.0001458080513418903, | |
| "loss": 3.1816, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 15.147110230715452, | |
| "eval_accuracy": 0.3739413978744948, | |
| "eval_loss": 3.5385658740997314, | |
| "eval_runtime": 180.1964, | |
| "eval_samples_per_second": 92.366, | |
| "eval_steps_per_second": 5.777, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 15.161675600093218, | |
| "grad_norm": 0.4140619933605194, | |
| "learning_rate": 0.00014537047841306882, | |
| "loss": 3.1848, | |
| "step": 52050 | |
| }, | |
| { | |
| "epoch": 15.176240969470985, | |
| "grad_norm": 0.4383089244365692, | |
| "learning_rate": 0.00014493290548424736, | |
| "loss": 3.1781, | |
| "step": 52100 | |
| }, | |
| { | |
| "epoch": 15.190806338848754, | |
| "grad_norm": 0.4208312928676605, | |
| "learning_rate": 0.0001444953325554259, | |
| "loss": 3.1806, | |
| "step": 52150 | |
| }, | |
| { | |
| "epoch": 15.20537170822652, | |
| "grad_norm": 0.4226909279823303, | |
| "learning_rate": 0.00014405775962660444, | |
| "loss": 3.1649, | |
| "step": 52200 | |
| }, | |
| { | |
| "epoch": 15.219937077604287, | |
| "grad_norm": null, | |
| "learning_rate": 0.00014362018669778295, | |
| "loss": 3.1823, | |
| "step": 52250 | |
| }, | |
| { | |
| "epoch": 15.234502446982056, | |
| "grad_norm": 0.40975677967071533, | |
| "learning_rate": 0.00014318261376896146, | |
| "loss": 3.183, | |
| "step": 52300 | |
| }, | |
| { | |
| "epoch": 15.249067816359823, | |
| "grad_norm": 0.4181591868400574, | |
| "learning_rate": 0.00014274504084014, | |
| "loss": 3.1905, | |
| "step": 52350 | |
| }, | |
| { | |
| "epoch": 15.263633185737591, | |
| "grad_norm": 0.41215869784355164, | |
| "learning_rate": 0.00014230746791131854, | |
| "loss": 3.1817, | |
| "step": 52400 | |
| }, | |
| { | |
| "epoch": 15.278198555115358, | |
| "grad_norm": 0.412744402885437, | |
| "learning_rate": 0.00014186989498249708, | |
| "loss": 3.195, | |
| "step": 52450 | |
| }, | |
| { | |
| "epoch": 15.292763924493125, | |
| "grad_norm": 0.42229944467544556, | |
| "learning_rate": 0.0001414323220536756, | |
| "loss": 3.1861, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 15.307329293870893, | |
| "grad_norm": 0.4238891899585724, | |
| "learning_rate": 0.00014099474912485413, | |
| "loss": 3.1816, | |
| "step": 52550 | |
| }, | |
| { | |
| "epoch": 15.32189466324866, | |
| "grad_norm": 0.40468868613243103, | |
| "learning_rate": 0.00014055717619603265, | |
| "loss": 3.2001, | |
| "step": 52600 | |
| }, | |
| { | |
| "epoch": 15.336460032626427, | |
| "grad_norm": 0.41819027066230774, | |
| "learning_rate": 0.0001401196032672112, | |
| "loss": 3.182, | |
| "step": 52650 | |
| }, | |
| { | |
| "epoch": 15.351025402004195, | |
| "grad_norm": 0.4215780794620514, | |
| "learning_rate": 0.00013968203033838973, | |
| "loss": 3.1968, | |
| "step": 52700 | |
| }, | |
| { | |
| "epoch": 15.365590771381962, | |
| "grad_norm": 0.4218878149986267, | |
| "learning_rate": 0.00013924445740956827, | |
| "loss": 3.1884, | |
| "step": 52750 | |
| }, | |
| { | |
| "epoch": 15.38015614075973, | |
| "grad_norm": 0.4183255434036255, | |
| "learning_rate": 0.00013880688448074678, | |
| "loss": 3.1919, | |
| "step": 52800 | |
| }, | |
| { | |
| "epoch": 15.394721510137497, | |
| "grad_norm": 0.42416125535964966, | |
| "learning_rate": 0.0001383693115519253, | |
| "loss": 3.1977, | |
| "step": 52850 | |
| }, | |
| { | |
| "epoch": 15.409286879515264, | |
| "grad_norm": 0.41610389947891235, | |
| "learning_rate": 0.00013793173862310383, | |
| "loss": 3.2003, | |
| "step": 52900 | |
| }, | |
| { | |
| "epoch": 15.423852248893033, | |
| "grad_norm": 0.4229474365711212, | |
| "learning_rate": 0.00013749416569428237, | |
| "loss": 3.1983, | |
| "step": 52950 | |
| }, | |
| { | |
| "epoch": 15.4384176182708, | |
| "grad_norm": 0.4127897024154663, | |
| "learning_rate": 0.0001370565927654609, | |
| "loss": 3.1996, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 15.4384176182708, | |
| "eval_accuracy": 0.3744517984772402, | |
| "eval_loss": 3.5346455574035645, | |
| "eval_runtime": 180.9262, | |
| "eval_samples_per_second": 91.993, | |
| "eval_steps_per_second": 5.754, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 15.452982987648566, | |
| "grad_norm": 0.4062581956386566, | |
| "learning_rate": 0.00013661901983663943, | |
| "loss": 3.2082, | |
| "step": 53050 | |
| }, | |
| { | |
| "epoch": 15.467548357026335, | |
| "grad_norm": 0.4083750545978546, | |
| "learning_rate": 0.00013618144690781797, | |
| "loss": 3.2046, | |
| "step": 53100 | |
| }, | |
| { | |
| "epoch": 15.482113726404101, | |
| "grad_norm": 0.44948476552963257, | |
| "learning_rate": 0.00013574387397899648, | |
| "loss": 3.1963, | |
| "step": 53150 | |
| }, | |
| { | |
| "epoch": 15.49667909578187, | |
| "grad_norm": 0.4223315715789795, | |
| "learning_rate": 0.00013530630105017502, | |
| "loss": 3.2006, | |
| "step": 53200 | |
| }, | |
| { | |
| "epoch": 15.511244465159637, | |
| "grad_norm": 0.43807777762413025, | |
| "learning_rate": 0.00013486872812135356, | |
| "loss": 3.19, | |
| "step": 53250 | |
| }, | |
| { | |
| "epoch": 15.525809834537403, | |
| "grad_norm": 0.4165053367614746, | |
| "learning_rate": 0.00013443115519253207, | |
| "loss": 3.2033, | |
| "step": 53300 | |
| }, | |
| { | |
| "epoch": 15.540375203915172, | |
| "grad_norm": 0.4165057837963104, | |
| "learning_rate": 0.0001339935822637106, | |
| "loss": 3.2031, | |
| "step": 53350 | |
| }, | |
| { | |
| "epoch": 15.554940573292939, | |
| "grad_norm": 0.4309650957584381, | |
| "learning_rate": 0.00013355600933488912, | |
| "loss": 3.2105, | |
| "step": 53400 | |
| }, | |
| { | |
| "epoch": 15.569505942670705, | |
| "grad_norm": 0.41557958722114563, | |
| "learning_rate": 0.00013311843640606766, | |
| "loss": 3.2082, | |
| "step": 53450 | |
| }, | |
| { | |
| "epoch": 15.584071312048474, | |
| "grad_norm": 0.44082722067832947, | |
| "learning_rate": 0.0001326808634772462, | |
| "loss": 3.2097, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 15.59863668142624, | |
| "grad_norm": 0.4245944619178772, | |
| "learning_rate": 0.00013224329054842472, | |
| "loss": 3.2059, | |
| "step": 53550 | |
| }, | |
| { | |
| "epoch": 15.61320205080401, | |
| "grad_norm": 0.41007092595100403, | |
| "learning_rate": 0.00013180571761960326, | |
| "loss": 3.1955, | |
| "step": 53600 | |
| }, | |
| { | |
| "epoch": 15.627767420181776, | |
| "grad_norm": 0.42977604269981384, | |
| "learning_rate": 0.0001313681446907818, | |
| "loss": 3.2066, | |
| "step": 53650 | |
| }, | |
| { | |
| "epoch": 15.642332789559543, | |
| "grad_norm": 0.40306100249290466, | |
| "learning_rate": 0.0001309305717619603, | |
| "loss": 3.2192, | |
| "step": 53700 | |
| }, | |
| { | |
| "epoch": 15.656898158937311, | |
| "grad_norm": 0.4248296916484833, | |
| "learning_rate": 0.00013049299883313885, | |
| "loss": 3.207, | |
| "step": 53750 | |
| }, | |
| { | |
| "epoch": 15.671463528315078, | |
| "grad_norm": 0.4259008765220642, | |
| "learning_rate": 0.00013005542590431736, | |
| "loss": 3.2088, | |
| "step": 53800 | |
| }, | |
| { | |
| "epoch": 15.686028897692845, | |
| "grad_norm": 0.43237951397895813, | |
| "learning_rate": 0.0001296178529754959, | |
| "loss": 3.2053, | |
| "step": 53850 | |
| }, | |
| { | |
| "epoch": 15.700594267070613, | |
| "grad_norm": 0.42358967661857605, | |
| "learning_rate": 0.00012918028004667444, | |
| "loss": 3.2073, | |
| "step": 53900 | |
| }, | |
| { | |
| "epoch": 15.71515963644838, | |
| "grad_norm": 0.42595556378364563, | |
| "learning_rate": 0.00012874270711785295, | |
| "loss": 3.2038, | |
| "step": 53950 | |
| }, | |
| { | |
| "epoch": 15.729725005826147, | |
| "grad_norm": 0.4171249568462372, | |
| "learning_rate": 0.0001283051341890315, | |
| "loss": 3.2029, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 15.729725005826147, | |
| "eval_accuracy": 0.3747556167581649, | |
| "eval_loss": 3.530050754547119, | |
| "eval_runtime": 180.9342, | |
| "eval_samples_per_second": 91.989, | |
| "eval_steps_per_second": 5.753, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 15.744290375203915, | |
| "grad_norm": 0.42589497566223145, | |
| "learning_rate": 0.00012786756126021, | |
| "loss": 3.2115, | |
| "step": 54050 | |
| }, | |
| { | |
| "epoch": 15.758855744581682, | |
| "grad_norm": 0.4358471632003784, | |
| "learning_rate": 0.00012742998833138855, | |
| "loss": 3.2096, | |
| "step": 54100 | |
| }, | |
| { | |
| "epoch": 15.77342111395945, | |
| "grad_norm": 0.4199720323085785, | |
| "learning_rate": 0.0001269924154025671, | |
| "loss": 3.1963, | |
| "step": 54150 | |
| }, | |
| { | |
| "epoch": 15.787986483337217, | |
| "grad_norm": 0.43083542585372925, | |
| "learning_rate": 0.00012655484247374563, | |
| "loss": 3.2158, | |
| "step": 54200 | |
| }, | |
| { | |
| "epoch": 15.802551852714984, | |
| "grad_norm": 0.42154741287231445, | |
| "learning_rate": 0.00012611726954492414, | |
| "loss": 3.2087, | |
| "step": 54250 | |
| }, | |
| { | |
| "epoch": 15.817117222092753, | |
| "grad_norm": 0.441120445728302, | |
| "learning_rate": 0.00012567969661610265, | |
| "loss": 3.2178, | |
| "step": 54300 | |
| }, | |
| { | |
| "epoch": 15.83168259147052, | |
| "grad_norm": 0.4148479402065277, | |
| "learning_rate": 0.0001252421236872812, | |
| "loss": 3.2268, | |
| "step": 54350 | |
| }, | |
| { | |
| "epoch": 15.846247960848288, | |
| "grad_norm": 0.419406920671463, | |
| "learning_rate": 0.00012480455075845973, | |
| "loss": 3.2142, | |
| "step": 54400 | |
| }, | |
| { | |
| "epoch": 15.860813330226055, | |
| "grad_norm": 0.42558178305625916, | |
| "learning_rate": 0.00012436697782963827, | |
| "loss": 3.2133, | |
| "step": 54450 | |
| }, | |
| { | |
| "epoch": 15.875378699603822, | |
| "grad_norm": 0.4201781153678894, | |
| "learning_rate": 0.00012392940490081679, | |
| "loss": 3.2105, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 15.88994406898159, | |
| "grad_norm": 0.4343336522579193, | |
| "learning_rate": 0.00012349183197199533, | |
| "loss": 3.2153, | |
| "step": 54550 | |
| }, | |
| { | |
| "epoch": 15.904509438359357, | |
| "grad_norm": 0.4077042043209076, | |
| "learning_rate": 0.00012305425904317384, | |
| "loss": 3.2177, | |
| "step": 54600 | |
| }, | |
| { | |
| "epoch": 15.919074807737124, | |
| "grad_norm": 0.4239185154438019, | |
| "learning_rate": 0.00012261668611435238, | |
| "loss": 3.219, | |
| "step": 54650 | |
| }, | |
| { | |
| "epoch": 15.933640177114892, | |
| "grad_norm": 0.41873300075531006, | |
| "learning_rate": 0.00012217911318553092, | |
| "loss": 3.21, | |
| "step": 54700 | |
| }, | |
| { | |
| "epoch": 15.948205546492659, | |
| "grad_norm": 0.41313689947128296, | |
| "learning_rate": 0.00012174154025670944, | |
| "loss": 3.204, | |
| "step": 54750 | |
| }, | |
| { | |
| "epoch": 15.962770915870426, | |
| "grad_norm": 0.41441574692726135, | |
| "learning_rate": 0.00012130396732788796, | |
| "loss": 3.2034, | |
| "step": 54800 | |
| }, | |
| { | |
| "epoch": 15.977336285248194, | |
| "grad_norm": 0.43360719084739685, | |
| "learning_rate": 0.0001208663943990665, | |
| "loss": 3.2189, | |
| "step": 54850 | |
| }, | |
| { | |
| "epoch": 15.991901654625961, | |
| "grad_norm": 0.4335940480232239, | |
| "learning_rate": 0.00012042882147024502, | |
| "loss": 3.2178, | |
| "step": 54900 | |
| }, | |
| { | |
| "epoch": 16.006408762526217, | |
| "grad_norm": 0.4299444258213043, | |
| "learning_rate": 0.00011999124854142356, | |
| "loss": 3.1882, | |
| "step": 54950 | |
| }, | |
| { | |
| "epoch": 16.020974131903984, | |
| "grad_norm": 0.4137655794620514, | |
| "learning_rate": 0.00011955367561260209, | |
| "loss": 3.151, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 16.020974131903984, | |
| "eval_accuracy": 0.37487225287839604, | |
| "eval_loss": 3.5326309204101562, | |
| "eval_runtime": 181.1353, | |
| "eval_samples_per_second": 91.887, | |
| "eval_steps_per_second": 5.747, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 16.035539501281754, | |
| "grad_norm": 0.4282309114933014, | |
| "learning_rate": 0.00011911610268378062, | |
| "loss": 3.1378, | |
| "step": 55050 | |
| }, | |
| { | |
| "epoch": 16.05010487065952, | |
| "grad_norm": 0.43013036251068115, | |
| "learning_rate": 0.00011867852975495914, | |
| "loss": 3.1477, | |
| "step": 55100 | |
| }, | |
| { | |
| "epoch": 16.064670240037287, | |
| "grad_norm": 0.4036613404750824, | |
| "learning_rate": 0.00011824095682613768, | |
| "loss": 3.1385, | |
| "step": 55150 | |
| }, | |
| { | |
| "epoch": 16.079235609415054, | |
| "grad_norm": 0.4135037660598755, | |
| "learning_rate": 0.00011780338389731621, | |
| "loss": 3.1613, | |
| "step": 55200 | |
| }, | |
| { | |
| "epoch": 16.09380097879282, | |
| "grad_norm": 0.4376969635486603, | |
| "learning_rate": 0.00011736581096849475, | |
| "loss": 3.1427, | |
| "step": 55250 | |
| }, | |
| { | |
| "epoch": 16.10836634817059, | |
| "grad_norm": 0.449358731508255, | |
| "learning_rate": 0.00011692823803967326, | |
| "loss": 3.1549, | |
| "step": 55300 | |
| }, | |
| { | |
| "epoch": 16.122931717548358, | |
| "grad_norm": 0.4239208698272705, | |
| "learning_rate": 0.00011649066511085179, | |
| "loss": 3.1624, | |
| "step": 55350 | |
| }, | |
| { | |
| "epoch": 16.137497086926125, | |
| "grad_norm": 0.42871618270874023, | |
| "learning_rate": 0.00011605309218203033, | |
| "loss": 3.1611, | |
| "step": 55400 | |
| }, | |
| { | |
| "epoch": 16.15206245630389, | |
| "grad_norm": 0.4187133014202118, | |
| "learning_rate": 0.00011561551925320885, | |
| "loss": 3.16, | |
| "step": 55450 | |
| }, | |
| { | |
| "epoch": 16.16662782568166, | |
| "grad_norm": 0.43837517499923706, | |
| "learning_rate": 0.0001151779463243874, | |
| "loss": 3.1494, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 16.181193195059425, | |
| "grad_norm": 0.4250280261039734, | |
| "learning_rate": 0.00011474037339556591, | |
| "loss": 3.1543, | |
| "step": 55550 | |
| }, | |
| { | |
| "epoch": 16.195758564437195, | |
| "grad_norm": 0.42530959844589233, | |
| "learning_rate": 0.00011430280046674445, | |
| "loss": 3.1667, | |
| "step": 55600 | |
| }, | |
| { | |
| "epoch": 16.210323933814962, | |
| "grad_norm": 0.4092886447906494, | |
| "learning_rate": 0.00011386522753792297, | |
| "loss": 3.1579, | |
| "step": 55650 | |
| }, | |
| { | |
| "epoch": 16.22488930319273, | |
| "grad_norm": 0.4281422793865204, | |
| "learning_rate": 0.00011342765460910151, | |
| "loss": 3.1603, | |
| "step": 55700 | |
| }, | |
| { | |
| "epoch": 16.239454672570496, | |
| "grad_norm": 0.41307032108306885, | |
| "learning_rate": 0.00011299008168028004, | |
| "loss": 3.1687, | |
| "step": 55750 | |
| }, | |
| { | |
| "epoch": 16.254020041948262, | |
| "grad_norm": 0.44364210963249207, | |
| "learning_rate": 0.00011255250875145855, | |
| "loss": 3.1769, | |
| "step": 55800 | |
| }, | |
| { | |
| "epoch": 16.268585411326033, | |
| "grad_norm": 0.42538779973983765, | |
| "learning_rate": 0.00011211493582263709, | |
| "loss": 3.1639, | |
| "step": 55850 | |
| }, | |
| { | |
| "epoch": 16.2831507807038, | |
| "grad_norm": 0.42053961753845215, | |
| "learning_rate": 0.00011167736289381562, | |
| "loss": 3.1692, | |
| "step": 55900 | |
| }, | |
| { | |
| "epoch": 16.297716150081566, | |
| "grad_norm": 0.41357651352882385, | |
| "learning_rate": 0.00011123978996499416, | |
| "loss": 3.1697, | |
| "step": 55950 | |
| }, | |
| { | |
| "epoch": 16.312281519459333, | |
| "grad_norm": 0.42656847834587097, | |
| "learning_rate": 0.00011080221703617269, | |
| "loss": 3.1679, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 16.312281519459333, | |
| "eval_accuracy": 0.374931041245448, | |
| "eval_loss": 3.5328683853149414, | |
| "eval_runtime": 180.861, | |
| "eval_samples_per_second": 92.026, | |
| "eval_steps_per_second": 5.756, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 16.3268468888371, | |
| "grad_norm": 0.4274371266365051, | |
| "learning_rate": 0.00011036464410735121, | |
| "loss": 3.1626, | |
| "step": 56050 | |
| }, | |
| { | |
| "epoch": 16.34141225821487, | |
| "grad_norm": 0.41279909014701843, | |
| "learning_rate": 0.00010992707117852974, | |
| "loss": 3.1719, | |
| "step": 56100 | |
| }, | |
| { | |
| "epoch": 16.355977627592637, | |
| "grad_norm": 0.420175701379776, | |
| "learning_rate": 0.00010948949824970828, | |
| "loss": 3.172, | |
| "step": 56150 | |
| }, | |
| { | |
| "epoch": 16.370542996970403, | |
| "grad_norm": 0.4424479603767395, | |
| "learning_rate": 0.0001090519253208868, | |
| "loss": 3.1707, | |
| "step": 56200 | |
| }, | |
| { | |
| "epoch": 16.38510836634817, | |
| "grad_norm": 0.42122504115104675, | |
| "learning_rate": 0.00010861435239206534, | |
| "loss": 3.1643, | |
| "step": 56250 | |
| }, | |
| { | |
| "epoch": 16.399673735725937, | |
| "grad_norm": 0.4253459572792053, | |
| "learning_rate": 0.00010817677946324386, | |
| "loss": 3.1608, | |
| "step": 56300 | |
| }, | |
| { | |
| "epoch": 16.414239105103704, | |
| "grad_norm": 0.4293667674064636, | |
| "learning_rate": 0.00010773920653442238, | |
| "loss": 3.181, | |
| "step": 56350 | |
| }, | |
| { | |
| "epoch": 16.428804474481474, | |
| "grad_norm": 0.42189478874206543, | |
| "learning_rate": 0.00010730163360560092, | |
| "loss": 3.1808, | |
| "step": 56400 | |
| }, | |
| { | |
| "epoch": 16.44336984385924, | |
| "grad_norm": 0.41094401478767395, | |
| "learning_rate": 0.00010686406067677945, | |
| "loss": 3.1792, | |
| "step": 56450 | |
| }, | |
| { | |
| "epoch": 16.457935213237008, | |
| "grad_norm": 0.43915942311286926, | |
| "learning_rate": 0.00010642648774795799, | |
| "loss": 3.1726, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 16.472500582614774, | |
| "grad_norm": 0.4174824357032776, | |
| "learning_rate": 0.0001059889148191365, | |
| "loss": 3.1749, | |
| "step": 56550 | |
| }, | |
| { | |
| "epoch": 16.48706595199254, | |
| "grad_norm": 0.4097413420677185, | |
| "learning_rate": 0.00010555134189031504, | |
| "loss": 3.1743, | |
| "step": 56600 | |
| }, | |
| { | |
| "epoch": 16.50163132137031, | |
| "grad_norm": 0.4119010269641876, | |
| "learning_rate": 0.00010511376896149357, | |
| "loss": 3.1766, | |
| "step": 56650 | |
| }, | |
| { | |
| "epoch": 16.516196690748078, | |
| "grad_norm": 0.4229290187358856, | |
| "learning_rate": 0.00010467619603267211, | |
| "loss": 3.1763, | |
| "step": 56700 | |
| }, | |
| { | |
| "epoch": 16.530762060125845, | |
| "grad_norm": 0.42102253437042236, | |
| "learning_rate": 0.00010423862310385064, | |
| "loss": 3.1693, | |
| "step": 56750 | |
| }, | |
| { | |
| "epoch": 16.54532742950361, | |
| "grad_norm": 0.40270158648490906, | |
| "learning_rate": 0.00010380105017502915, | |
| "loss": 3.1758, | |
| "step": 56800 | |
| }, | |
| { | |
| "epoch": 16.55989279888138, | |
| "grad_norm": 0.4175575375556946, | |
| "learning_rate": 0.00010336347724620769, | |
| "loss": 3.1769, | |
| "step": 56850 | |
| }, | |
| { | |
| "epoch": 16.57445816825915, | |
| "grad_norm": 0.43574297428131104, | |
| "learning_rate": 0.00010292590431738621, | |
| "loss": 3.1885, | |
| "step": 56900 | |
| }, | |
| { | |
| "epoch": 16.589023537636916, | |
| "grad_norm": 0.41739705204963684, | |
| "learning_rate": 0.00010248833138856475, | |
| "loss": 3.177, | |
| "step": 56950 | |
| }, | |
| { | |
| "epoch": 16.603588907014682, | |
| "grad_norm": 0.43442845344543457, | |
| "learning_rate": 0.00010205075845974328, | |
| "loss": 3.183, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 16.603588907014682, | |
| "eval_accuracy": 0.3752198097044074, | |
| "eval_loss": 3.528458595275879, | |
| "eval_runtime": 180.2565, | |
| "eval_samples_per_second": 92.335, | |
| "eval_steps_per_second": 5.775, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 16.61815427639245, | |
| "grad_norm": 0.4157189130783081, | |
| "learning_rate": 0.00010161318553092181, | |
| "loss": 3.1774, | |
| "step": 57050 | |
| }, | |
| { | |
| "epoch": 16.632719645770216, | |
| "grad_norm": 0.4215414822101593, | |
| "learning_rate": 0.00010117561260210033, | |
| "loss": 3.1864, | |
| "step": 57100 | |
| }, | |
| { | |
| "epoch": 16.647285015147983, | |
| "grad_norm": 0.42653682827949524, | |
| "learning_rate": 0.00010073803967327887, | |
| "loss": 3.185, | |
| "step": 57150 | |
| }, | |
| { | |
| "epoch": 16.661850384525753, | |
| "grad_norm": 0.43946152925491333, | |
| "learning_rate": 0.0001003004667444574, | |
| "loss": 3.191, | |
| "step": 57200 | |
| }, | |
| { | |
| "epoch": 16.67641575390352, | |
| "grad_norm": 0.4414174258708954, | |
| "learning_rate": 9.986289381563594e-05, | |
| "loss": 3.1942, | |
| "step": 57250 | |
| }, | |
| { | |
| "epoch": 16.690981123281286, | |
| "grad_norm": 0.4162786602973938, | |
| "learning_rate": 9.942532088681445e-05, | |
| "loss": 3.1902, | |
| "step": 57300 | |
| }, | |
| { | |
| "epoch": 16.705546492659053, | |
| "grad_norm": 0.4169985353946686, | |
| "learning_rate": 9.898774795799299e-05, | |
| "loss": 3.1915, | |
| "step": 57350 | |
| }, | |
| { | |
| "epoch": 16.72011186203682, | |
| "grad_norm": 0.4140052795410156, | |
| "learning_rate": 9.855017502917152e-05, | |
| "loss": 3.1698, | |
| "step": 57400 | |
| }, | |
| { | |
| "epoch": 16.73467723141459, | |
| "grad_norm": 0.45123833417892456, | |
| "learning_rate": 9.811260210035005e-05, | |
| "loss": 3.1909, | |
| "step": 57450 | |
| }, | |
| { | |
| "epoch": 16.749242600792357, | |
| "grad_norm": 0.4276806116104126, | |
| "learning_rate": 9.767502917152858e-05, | |
| "loss": 3.188, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 16.763807970170124, | |
| "grad_norm": 0.431024432182312, | |
| "learning_rate": 9.72374562427071e-05, | |
| "loss": 3.1845, | |
| "step": 57550 | |
| }, | |
| { | |
| "epoch": 16.77837333954789, | |
| "grad_norm": 0.418350487947464, | |
| "learning_rate": 9.679988331388564e-05, | |
| "loss": 3.1848, | |
| "step": 57600 | |
| }, | |
| { | |
| "epoch": 16.792938708925657, | |
| "grad_norm": 0.4283568561077118, | |
| "learning_rate": 9.636231038506416e-05, | |
| "loss": 3.1821, | |
| "step": 57650 | |
| }, | |
| { | |
| "epoch": 16.807504078303424, | |
| "grad_norm": 0.434356153011322, | |
| "learning_rate": 9.59247374562427e-05, | |
| "loss": 3.1919, | |
| "step": 57700 | |
| }, | |
| { | |
| "epoch": 16.822069447681194, | |
| "grad_norm": 0.43099457025527954, | |
| "learning_rate": 9.548716452742123e-05, | |
| "loss": 3.1865, | |
| "step": 57750 | |
| }, | |
| { | |
| "epoch": 16.83663481705896, | |
| "grad_norm": 0.4274565577507019, | |
| "learning_rate": 9.504959159859976e-05, | |
| "loss": 3.1867, | |
| "step": 57800 | |
| }, | |
| { | |
| "epoch": 16.851200186436728, | |
| "grad_norm": 0.42834699153900146, | |
| "learning_rate": 9.461201866977828e-05, | |
| "loss": 3.1868, | |
| "step": 57850 | |
| }, | |
| { | |
| "epoch": 16.865765555814495, | |
| "grad_norm": 0.4234914481639862, | |
| "learning_rate": 9.417444574095682e-05, | |
| "loss": 3.1787, | |
| "step": 57900 | |
| }, | |
| { | |
| "epoch": 16.88033092519226, | |
| "grad_norm": 0.42309653759002686, | |
| "learning_rate": 9.373687281213535e-05, | |
| "loss": 3.1881, | |
| "step": 57950 | |
| }, | |
| { | |
| "epoch": 16.89489629457003, | |
| "grad_norm": 0.4064479470252991, | |
| "learning_rate": 9.329929988331389e-05, | |
| "loss": 3.1985, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 16.89489629457003, | |
| "eval_accuracy": 0.37572386116351114, | |
| "eval_loss": 3.522873640060425, | |
| "eval_runtime": 180.6669, | |
| "eval_samples_per_second": 92.125, | |
| "eval_steps_per_second": 5.762, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 16.9094616639478, | |
| "grad_norm": 0.4073335528373718, | |
| "learning_rate": 9.28617269544924e-05, | |
| "loss": 3.193, | |
| "step": 58050 | |
| }, | |
| { | |
| "epoch": 16.924027033325565, | |
| "grad_norm": 0.42059651017189026, | |
| "learning_rate": 9.242415402567093e-05, | |
| "loss": 3.194, | |
| "step": 58100 | |
| }, | |
| { | |
| "epoch": 16.938592402703332, | |
| "grad_norm": 0.4243110716342926, | |
| "learning_rate": 9.198658109684947e-05, | |
| "loss": 3.1857, | |
| "step": 58150 | |
| }, | |
| { | |
| "epoch": 16.9531577720811, | |
| "grad_norm": 0.4175581932067871, | |
| "learning_rate": 9.1549008168028e-05, | |
| "loss": 3.1957, | |
| "step": 58200 | |
| }, | |
| { | |
| "epoch": 16.96772314145887, | |
| "grad_norm": 0.4244025647640228, | |
| "learning_rate": 9.111143523920653e-05, | |
| "loss": 3.1841, | |
| "step": 58250 | |
| }, | |
| { | |
| "epoch": 16.982288510836636, | |
| "grad_norm": 0.4220493733882904, | |
| "learning_rate": 9.067386231038505e-05, | |
| "loss": 3.1947, | |
| "step": 58300 | |
| }, | |
| { | |
| "epoch": 16.996853880214402, | |
| "grad_norm": 0.43514513969421387, | |
| "learning_rate": 9.023628938156359e-05, | |
| "loss": 3.185, | |
| "step": 58350 | |
| }, | |
| { | |
| "epoch": 17.01136098811466, | |
| "grad_norm": 0.44147107005119324, | |
| "learning_rate": 8.979871645274211e-05, | |
| "loss": 3.1409, | |
| "step": 58400 | |
| }, | |
| { | |
| "epoch": 17.025926357492427, | |
| "grad_norm": 0.41750553250312805, | |
| "learning_rate": 8.936114352392065e-05, | |
| "loss": 3.1336, | |
| "step": 58450 | |
| }, | |
| { | |
| "epoch": 17.040491726870194, | |
| "grad_norm": 0.43038827180862427, | |
| "learning_rate": 8.892357059509918e-05, | |
| "loss": 3.136, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 17.05505709624796, | |
| "grad_norm": 0.43373844027519226, | |
| "learning_rate": 8.848599766627769e-05, | |
| "loss": 3.1353, | |
| "step": 58550 | |
| }, | |
| { | |
| "epoch": 17.069622465625727, | |
| "grad_norm": 0.4187583923339844, | |
| "learning_rate": 8.804842473745623e-05, | |
| "loss": 3.1367, | |
| "step": 58600 | |
| }, | |
| { | |
| "epoch": 17.084187835003497, | |
| "grad_norm": 0.43599000573158264, | |
| "learning_rate": 8.761085180863476e-05, | |
| "loss": 3.1356, | |
| "step": 58650 | |
| }, | |
| { | |
| "epoch": 17.098753204381264, | |
| "grad_norm": 0.4198746085166931, | |
| "learning_rate": 8.71732788798133e-05, | |
| "loss": 3.1435, | |
| "step": 58700 | |
| }, | |
| { | |
| "epoch": 17.11331857375903, | |
| "grad_norm": 0.4513174891471863, | |
| "learning_rate": 8.673570595099183e-05, | |
| "loss": 3.136, | |
| "step": 58750 | |
| }, | |
| { | |
| "epoch": 17.127883943136798, | |
| "grad_norm": 0.43155333399772644, | |
| "learning_rate": 8.629813302217035e-05, | |
| "loss": 3.1333, | |
| "step": 58800 | |
| }, | |
| { | |
| "epoch": 17.142449312514564, | |
| "grad_norm": 0.4096786379814148, | |
| "learning_rate": 8.586056009334888e-05, | |
| "loss": 3.1427, | |
| "step": 58850 | |
| }, | |
| { | |
| "epoch": 17.15701468189233, | |
| "grad_norm": 0.43581393361091614, | |
| "learning_rate": 8.542298716452742e-05, | |
| "loss": 3.1465, | |
| "step": 58900 | |
| }, | |
| { | |
| "epoch": 17.1715800512701, | |
| "grad_norm": 0.4341733753681183, | |
| "learning_rate": 8.498541423570594e-05, | |
| "loss": 3.1471, | |
| "step": 58950 | |
| }, | |
| { | |
| "epoch": 17.18614542064787, | |
| "grad_norm": 0.4542756676673889, | |
| "learning_rate": 8.454784130688448e-05, | |
| "loss": 3.1424, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 17.18614542064787, | |
| "eval_accuracy": 0.37545496317261534, | |
| "eval_loss": 3.531616687774658, | |
| "eval_runtime": 180.3206, | |
| "eval_samples_per_second": 92.302, | |
| "eval_steps_per_second": 5.773, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 17.200710790025635, | |
| "grad_norm": 0.4263310134410858, | |
| "learning_rate": 8.4110268378063e-05, | |
| "loss": 3.1367, | |
| "step": 59050 | |
| }, | |
| { | |
| "epoch": 17.215276159403402, | |
| "grad_norm": 0.4178173840045929, | |
| "learning_rate": 8.367269544924152e-05, | |
| "loss": 3.1371, | |
| "step": 59100 | |
| }, | |
| { | |
| "epoch": 17.22984152878117, | |
| "grad_norm": 0.4277113676071167, | |
| "learning_rate": 8.323512252042006e-05, | |
| "loss": 3.1427, | |
| "step": 59150 | |
| }, | |
| { | |
| "epoch": 17.24440689815894, | |
| "grad_norm": 0.42608314752578735, | |
| "learning_rate": 8.279754959159859e-05, | |
| "loss": 3.1416, | |
| "step": 59200 | |
| }, | |
| { | |
| "epoch": 17.258972267536706, | |
| "grad_norm": 0.43384233117103577, | |
| "learning_rate": 8.235997666277713e-05, | |
| "loss": 3.1399, | |
| "step": 59250 | |
| }, | |
| { | |
| "epoch": 17.273537636914472, | |
| "grad_norm": 0.4369141459465027, | |
| "learning_rate": 8.192240373395564e-05, | |
| "loss": 3.1502, | |
| "step": 59300 | |
| }, | |
| { | |
| "epoch": 17.28810300629224, | |
| "grad_norm": 0.42754146456718445, | |
| "learning_rate": 8.148483080513418e-05, | |
| "loss": 3.1351, | |
| "step": 59350 | |
| }, | |
| { | |
| "epoch": 17.302668375670006, | |
| "grad_norm": 0.42747077345848083, | |
| "learning_rate": 8.104725787631271e-05, | |
| "loss": 3.1469, | |
| "step": 59400 | |
| }, | |
| { | |
| "epoch": 17.317233745047773, | |
| "grad_norm": 0.4387149512767792, | |
| "learning_rate": 8.060968494749125e-05, | |
| "loss": 3.1374, | |
| "step": 59450 | |
| }, | |
| { | |
| "epoch": 17.331799114425543, | |
| "grad_norm": 0.4392690658569336, | |
| "learning_rate": 8.017211201866978e-05, | |
| "loss": 3.1515, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 17.34636448380331, | |
| "grad_norm": 0.4403955042362213, | |
| "learning_rate": 7.973453908984829e-05, | |
| "loss": 3.1615, | |
| "step": 59550 | |
| }, | |
| { | |
| "epoch": 17.360929853181077, | |
| "grad_norm": 0.43617773056030273, | |
| "learning_rate": 7.929696616102683e-05, | |
| "loss": 3.1528, | |
| "step": 59600 | |
| }, | |
| { | |
| "epoch": 17.375495222558843, | |
| "grad_norm": 0.4383864998817444, | |
| "learning_rate": 7.885939323220535e-05, | |
| "loss": 3.1529, | |
| "step": 59650 | |
| }, | |
| { | |
| "epoch": 17.39006059193661, | |
| "grad_norm": 0.43385154008865356, | |
| "learning_rate": 7.84218203033839e-05, | |
| "loss": 3.1476, | |
| "step": 59700 | |
| }, | |
| { | |
| "epoch": 17.40462596131438, | |
| "grad_norm": 0.4384395182132721, | |
| "learning_rate": 7.798424737456242e-05, | |
| "loss": 3.1535, | |
| "step": 59750 | |
| }, | |
| { | |
| "epoch": 17.419191330692147, | |
| "grad_norm": 0.431538462638855, | |
| "learning_rate": 7.754667444574096e-05, | |
| "loss": 3.1563, | |
| "step": 59800 | |
| }, | |
| { | |
| "epoch": 17.433756700069914, | |
| "grad_norm": 0.42602774500846863, | |
| "learning_rate": 7.710910151691947e-05, | |
| "loss": 3.1524, | |
| "step": 59850 | |
| }, | |
| { | |
| "epoch": 17.44832206944768, | |
| "grad_norm": 0.4278332591056824, | |
| "learning_rate": 7.667152858809801e-05, | |
| "loss": 3.1448, | |
| "step": 59900 | |
| }, | |
| { | |
| "epoch": 17.462887438825447, | |
| "grad_norm": 0.4381519556045532, | |
| "learning_rate": 7.623395565927654e-05, | |
| "loss": 3.1589, | |
| "step": 59950 | |
| }, | |
| { | |
| "epoch": 17.477452808203218, | |
| "grad_norm": 0.4320782721042633, | |
| "learning_rate": 7.579638273045508e-05, | |
| "loss": 3.1539, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 17.477452808203218, | |
| "eval_accuracy": 0.37566424975932045, | |
| "eval_loss": 3.527134418487549, | |
| "eval_runtime": 181.5925, | |
| "eval_samples_per_second": 91.656, | |
| "eval_steps_per_second": 5.733, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 17.492018177580984, | |
| "grad_norm": 0.4376213848590851, | |
| "learning_rate": 7.53588098016336e-05, | |
| "loss": 3.1551, | |
| "step": 60050 | |
| }, | |
| { | |
| "epoch": 17.50658354695875, | |
| "grad_norm": 0.43057137727737427, | |
| "learning_rate": 7.492123687281213e-05, | |
| "loss": 3.1608, | |
| "step": 60100 | |
| }, | |
| { | |
| "epoch": 17.521148916336518, | |
| "grad_norm": 0.42992404103279114, | |
| "learning_rate": 7.448366394399066e-05, | |
| "loss": 3.1541, | |
| "step": 60150 | |
| }, | |
| { | |
| "epoch": 17.535714285714285, | |
| "grad_norm": 0.4488855302333832, | |
| "learning_rate": 7.404609101516919e-05, | |
| "loss": 3.1475, | |
| "step": 60200 | |
| }, | |
| { | |
| "epoch": 17.55027965509205, | |
| "grad_norm": 0.4396044909954071, | |
| "learning_rate": 7.360851808634771e-05, | |
| "loss": 3.1491, | |
| "step": 60250 | |
| }, | |
| { | |
| "epoch": 17.56484502446982, | |
| "grad_norm": 0.43914586305618286, | |
| "learning_rate": 7.317094515752625e-05, | |
| "loss": 3.1591, | |
| "step": 60300 | |
| }, | |
| { | |
| "epoch": 17.57941039384759, | |
| "grad_norm": 0.42674726247787476, | |
| "learning_rate": 7.273337222870478e-05, | |
| "loss": 3.1557, | |
| "step": 60350 | |
| }, | |
| { | |
| "epoch": 17.593975763225355, | |
| "grad_norm": 0.4434099495410919, | |
| "learning_rate": 7.22957992998833e-05, | |
| "loss": 3.1607, | |
| "step": 60400 | |
| }, | |
| { | |
| "epoch": 17.608541132603122, | |
| "grad_norm": 0.4289220869541168, | |
| "learning_rate": 7.185822637106184e-05, | |
| "loss": 3.1573, | |
| "step": 60450 | |
| }, | |
| { | |
| "epoch": 17.62310650198089, | |
| "grad_norm": 0.4319080412387848, | |
| "learning_rate": 7.142065344224036e-05, | |
| "loss": 3.1555, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 17.63767187135866, | |
| "grad_norm": 0.4276769459247589, | |
| "learning_rate": 7.09830805134189e-05, | |
| "loss": 3.1646, | |
| "step": 60550 | |
| }, | |
| { | |
| "epoch": 17.652237240736426, | |
| "grad_norm": 0.42025482654571533, | |
| "learning_rate": 7.054550758459742e-05, | |
| "loss": 3.1637, | |
| "step": 60600 | |
| }, | |
| { | |
| "epoch": 17.666802610114193, | |
| "grad_norm": 0.44079354405403137, | |
| "learning_rate": 7.010793465577595e-05, | |
| "loss": 3.1607, | |
| "step": 60650 | |
| }, | |
| { | |
| "epoch": 17.68136797949196, | |
| "grad_norm": 0.4618414342403412, | |
| "learning_rate": 6.967036172695449e-05, | |
| "loss": 3.1516, | |
| "step": 60700 | |
| }, | |
| { | |
| "epoch": 17.695933348869726, | |
| "grad_norm": 0.43492668867111206, | |
| "learning_rate": 6.923278879813302e-05, | |
| "loss": 3.159, | |
| "step": 60750 | |
| }, | |
| { | |
| "epoch": 17.710498718247496, | |
| "grad_norm": 0.4092758297920227, | |
| "learning_rate": 6.879521586931154e-05, | |
| "loss": 3.152, | |
| "step": 60800 | |
| }, | |
| { | |
| "epoch": 17.725064087625263, | |
| "grad_norm": 0.4256676137447357, | |
| "learning_rate": 6.835764294049008e-05, | |
| "loss": 3.1687, | |
| "step": 60850 | |
| }, | |
| { | |
| "epoch": 17.73962945700303, | |
| "grad_norm": 0.440639466047287, | |
| "learning_rate": 6.792007001166861e-05, | |
| "loss": 3.161, | |
| "step": 60900 | |
| }, | |
| { | |
| "epoch": 17.754194826380797, | |
| "grad_norm": 0.4303901493549347, | |
| "learning_rate": 6.748249708284714e-05, | |
| "loss": 3.1631, | |
| "step": 60950 | |
| }, | |
| { | |
| "epoch": 17.768760195758563, | |
| "grad_norm": 0.4405215084552765, | |
| "learning_rate": 6.704492415402566e-05, | |
| "loss": 3.1562, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 17.768760195758563, | |
| "eval_accuracy": 0.3759568982505052, | |
| "eval_loss": 3.524013042449951, | |
| "eval_runtime": 180.1467, | |
| "eval_samples_per_second": 92.391, | |
| "eval_steps_per_second": 5.779, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 17.78332556513633, | |
| "grad_norm": 0.4302792251110077, | |
| "learning_rate": 6.660735122520419e-05, | |
| "loss": 3.1568, | |
| "step": 61050 | |
| }, | |
| { | |
| "epoch": 17.7978909345141, | |
| "grad_norm": 0.4325387179851532, | |
| "learning_rate": 6.616977829638273e-05, | |
| "loss": 3.1577, | |
| "step": 61100 | |
| }, | |
| { | |
| "epoch": 17.812456303891867, | |
| "grad_norm": 0.43648290634155273, | |
| "learning_rate": 6.573220536756125e-05, | |
| "loss": 3.1602, | |
| "step": 61150 | |
| }, | |
| { | |
| "epoch": 17.827021673269634, | |
| "grad_norm": 0.4282855987548828, | |
| "learning_rate": 6.529463243873978e-05, | |
| "loss": 3.166, | |
| "step": 61200 | |
| }, | |
| { | |
| "epoch": 17.8415870426474, | |
| "grad_norm": 0.4502982497215271, | |
| "learning_rate": 6.485705950991831e-05, | |
| "loss": 3.1627, | |
| "step": 61250 | |
| }, | |
| { | |
| "epoch": 17.856152412025168, | |
| "grad_norm": 0.42750799655914307, | |
| "learning_rate": 6.441948658109685e-05, | |
| "loss": 3.1722, | |
| "step": 61300 | |
| }, | |
| { | |
| "epoch": 17.870717781402938, | |
| "grad_norm": 0.4294120669364929, | |
| "learning_rate": 6.398191365227537e-05, | |
| "loss": 3.1663, | |
| "step": 61350 | |
| }, | |
| { | |
| "epoch": 17.885283150780705, | |
| "grad_norm": 0.4335509240627289, | |
| "learning_rate": 6.35443407234539e-05, | |
| "loss": 3.1612, | |
| "step": 61400 | |
| }, | |
| { | |
| "epoch": 17.89984852015847, | |
| "grad_norm": 0.44340866804122925, | |
| "learning_rate": 6.310676779463244e-05, | |
| "loss": 3.1593, | |
| "step": 61450 | |
| }, | |
| { | |
| "epoch": 17.914413889536238, | |
| "grad_norm": 0.44225451350212097, | |
| "learning_rate": 6.266919486581095e-05, | |
| "loss": 3.1667, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 17.928979258914005, | |
| "grad_norm": 0.4261557459831238, | |
| "learning_rate": 6.223162193698949e-05, | |
| "loss": 3.1746, | |
| "step": 61550 | |
| }, | |
| { | |
| "epoch": 17.943544628291775, | |
| "grad_norm": 0.43242040276527405, | |
| "learning_rate": 6.179404900816802e-05, | |
| "loss": 3.1554, | |
| "step": 61600 | |
| }, | |
| { | |
| "epoch": 17.958109997669542, | |
| "grad_norm": 0.43706896901130676, | |
| "learning_rate": 6.135647607934655e-05, | |
| "loss": 3.1525, | |
| "step": 61650 | |
| }, | |
| { | |
| "epoch": 17.97267536704731, | |
| "grad_norm": 0.43134328722953796, | |
| "learning_rate": 6.0918903150525085e-05, | |
| "loss": 3.1647, | |
| "step": 61700 | |
| }, | |
| { | |
| "epoch": 17.987240736425075, | |
| "grad_norm": 0.4260832667350769, | |
| "learning_rate": 6.048133022170361e-05, | |
| "loss": 3.1699, | |
| "step": 61750 | |
| }, | |
| { | |
| "epoch": 18.001747844325333, | |
| "grad_norm": 0.4271147549152374, | |
| "learning_rate": 6.0043757292882145e-05, | |
| "loss": 3.1569, | |
| "step": 61800 | |
| }, | |
| { | |
| "epoch": 18.0163132137031, | |
| "grad_norm": 0.4241034984588623, | |
| "learning_rate": 5.960618436406067e-05, | |
| "loss": 3.1166, | |
| "step": 61850 | |
| }, | |
| { | |
| "epoch": 18.030878583080867, | |
| "grad_norm": 0.443732887506485, | |
| "learning_rate": 5.91686114352392e-05, | |
| "loss": 3.1261, | |
| "step": 61900 | |
| }, | |
| { | |
| "epoch": 18.045443952458633, | |
| "grad_norm": 0.4324038624763489, | |
| "learning_rate": 5.873103850641773e-05, | |
| "loss": 3.1091, | |
| "step": 61950 | |
| }, | |
| { | |
| "epoch": 18.0600093218364, | |
| "grad_norm": 0.43087828159332275, | |
| "learning_rate": 5.829346557759626e-05, | |
| "loss": 3.1148, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 18.0600093218364, | |
| "eval_accuracy": 0.3759096324033954, | |
| "eval_loss": 3.526399612426758, | |
| "eval_runtime": 180.2992, | |
| "eval_samples_per_second": 92.313, | |
| "eval_steps_per_second": 5.774, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 18.07457469121417, | |
| "grad_norm": 0.43519923090934753, | |
| "learning_rate": 5.785589264877479e-05, | |
| "loss": 3.1092, | |
| "step": 62050 | |
| }, | |
| { | |
| "epoch": 18.089140060591937, | |
| "grad_norm": 0.42430856823921204, | |
| "learning_rate": 5.7418319719953323e-05, | |
| "loss": 3.1264, | |
| "step": 62100 | |
| }, | |
| { | |
| "epoch": 18.103705429969704, | |
| "grad_norm": 0.43190985918045044, | |
| "learning_rate": 5.698074679113185e-05, | |
| "loss": 3.1285, | |
| "step": 62150 | |
| }, | |
| { | |
| "epoch": 18.11827079934747, | |
| "grad_norm": 0.4296327829360962, | |
| "learning_rate": 5.654317386231038e-05, | |
| "loss": 3.1147, | |
| "step": 62200 | |
| }, | |
| { | |
| "epoch": 18.132836168725238, | |
| "grad_norm": 0.42043790221214294, | |
| "learning_rate": 5.610560093348891e-05, | |
| "loss": 3.121, | |
| "step": 62250 | |
| }, | |
| { | |
| "epoch": 18.147401538103008, | |
| "grad_norm": 0.4322208762168884, | |
| "learning_rate": 5.566802800466744e-05, | |
| "loss": 3.1246, | |
| "step": 62300 | |
| }, | |
| { | |
| "epoch": 18.161966907480775, | |
| "grad_norm": 0.4340595006942749, | |
| "learning_rate": 5.5230455075845976e-05, | |
| "loss": 3.1167, | |
| "step": 62350 | |
| }, | |
| { | |
| "epoch": 18.17653227685854, | |
| "grad_norm": 0.4333188533782959, | |
| "learning_rate": 5.4792882147024495e-05, | |
| "loss": 3.1339, | |
| "step": 62400 | |
| }, | |
| { | |
| "epoch": 18.191097646236308, | |
| "grad_norm": 0.43425410985946655, | |
| "learning_rate": 5.435530921820303e-05, | |
| "loss": 3.1255, | |
| "step": 62450 | |
| }, | |
| { | |
| "epoch": 18.205663015614075, | |
| "grad_norm": 0.4476664960384369, | |
| "learning_rate": 5.3917736289381555e-05, | |
| "loss": 3.1252, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 18.22022838499184, | |
| "grad_norm": 0.4289979040622711, | |
| "learning_rate": 5.348016336056009e-05, | |
| "loss": 3.1282, | |
| "step": 62550 | |
| }, | |
| { | |
| "epoch": 18.234793754369612, | |
| "grad_norm": 0.4287092685699463, | |
| "learning_rate": 5.304259043173862e-05, | |
| "loss": 3.1205, | |
| "step": 62600 | |
| }, | |
| { | |
| "epoch": 18.24935912374738, | |
| "grad_norm": 0.44103768467903137, | |
| "learning_rate": 5.260501750291715e-05, | |
| "loss": 3.1336, | |
| "step": 62650 | |
| }, | |
| { | |
| "epoch": 18.263924493125145, | |
| "grad_norm": 0.45782148838043213, | |
| "learning_rate": 5.216744457409568e-05, | |
| "loss": 3.1296, | |
| "step": 62700 | |
| }, | |
| { | |
| "epoch": 18.278489862502912, | |
| "grad_norm": 0.4396607279777527, | |
| "learning_rate": 5.172987164527421e-05, | |
| "loss": 3.1249, | |
| "step": 62750 | |
| }, | |
| { | |
| "epoch": 18.29305523188068, | |
| "grad_norm": 0.4368782639503479, | |
| "learning_rate": 5.129229871645274e-05, | |
| "loss": 3.1074, | |
| "step": 62800 | |
| }, | |
| { | |
| "epoch": 18.30762060125845, | |
| "grad_norm": 0.4307306706905365, | |
| "learning_rate": 5.085472578763127e-05, | |
| "loss": 3.1278, | |
| "step": 62850 | |
| }, | |
| { | |
| "epoch": 18.322185970636216, | |
| "grad_norm": 0.45161545276641846, | |
| "learning_rate": 5.041715285880979e-05, | |
| "loss": 3.1397, | |
| "step": 62900 | |
| }, | |
| { | |
| "epoch": 18.336751340013983, | |
| "grad_norm": 0.42871034145355225, | |
| "learning_rate": 4.9979579929988326e-05, | |
| "loss": 3.1199, | |
| "step": 62950 | |
| }, | |
| { | |
| "epoch": 18.35131670939175, | |
| "grad_norm": 0.43537962436676025, | |
| "learning_rate": 4.954200700116685e-05, | |
| "loss": 3.1308, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 18.35131670939175, | |
| "eval_accuracy": 0.3758205092389446, | |
| "eval_loss": 3.527667284011841, | |
| "eval_runtime": 180.5705, | |
| "eval_samples_per_second": 92.175, | |
| "eval_steps_per_second": 5.765, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 18.365882078769516, | |
| "grad_norm": 0.43271610140800476, | |
| "learning_rate": 4.9104434072345386e-05, | |
| "loss": 3.1386, | |
| "step": 63050 | |
| }, | |
| { | |
| "epoch": 18.380447448147287, | |
| "grad_norm": 0.4385630190372467, | |
| "learning_rate": 4.866686114352392e-05, | |
| "loss": 3.1426, | |
| "step": 63100 | |
| }, | |
| { | |
| "epoch": 18.395012817525053, | |
| "grad_norm": 0.4242112934589386, | |
| "learning_rate": 4.8229288214702445e-05, | |
| "loss": 3.1247, | |
| "step": 63150 | |
| }, | |
| { | |
| "epoch": 18.40957818690282, | |
| "grad_norm": 0.43731290102005005, | |
| "learning_rate": 4.779171528588098e-05, | |
| "loss": 3.1273, | |
| "step": 63200 | |
| }, | |
| { | |
| "epoch": 18.424143556280587, | |
| "grad_norm": 0.43987950682640076, | |
| "learning_rate": 4.7354142357059505e-05, | |
| "loss": 3.1375, | |
| "step": 63250 | |
| }, | |
| { | |
| "epoch": 18.438708925658354, | |
| "grad_norm": 0.4338039755821228, | |
| "learning_rate": 4.691656942823804e-05, | |
| "loss": 3.1448, | |
| "step": 63300 | |
| }, | |
| { | |
| "epoch": 18.45327429503612, | |
| "grad_norm": 0.43445321917533875, | |
| "learning_rate": 4.647899649941657e-05, | |
| "loss": 3.1226, | |
| "step": 63350 | |
| }, | |
| { | |
| "epoch": 18.46783966441389, | |
| "grad_norm": 0.4330557882785797, | |
| "learning_rate": 4.604142357059509e-05, | |
| "loss": 3.1239, | |
| "step": 63400 | |
| }, | |
| { | |
| "epoch": 18.482405033791657, | |
| "grad_norm": 0.4354225993156433, | |
| "learning_rate": 4.5603850641773624e-05, | |
| "loss": 3.1444, | |
| "step": 63450 | |
| }, | |
| { | |
| "epoch": 18.496970403169424, | |
| "grad_norm": 0.4307340085506439, | |
| "learning_rate": 4.516627771295215e-05, | |
| "loss": 3.1399, | |
| "step": 63500 | |
| }, | |
| { | |
| "epoch": 18.51153577254719, | |
| "grad_norm": 0.4360226094722748, | |
| "learning_rate": 4.472870478413068e-05, | |
| "loss": 3.1405, | |
| "step": 63550 | |
| }, | |
| { | |
| "epoch": 18.526101141924958, | |
| "grad_norm": 0.4463973343372345, | |
| "learning_rate": 4.4291131855309216e-05, | |
| "loss": 3.1232, | |
| "step": 63600 | |
| }, | |
| { | |
| "epoch": 18.540666511302728, | |
| "grad_norm": 0.43620696663856506, | |
| "learning_rate": 4.385355892648774e-05, | |
| "loss": 3.1366, | |
| "step": 63650 | |
| }, | |
| { | |
| "epoch": 18.555231880680495, | |
| "grad_norm": 0.4371740221977234, | |
| "learning_rate": 4.3415985997666276e-05, | |
| "loss": 3.1259, | |
| "step": 63700 | |
| }, | |
| { | |
| "epoch": 18.56979725005826, | |
| "grad_norm": 0.43216463923454285, | |
| "learning_rate": 4.29784130688448e-05, | |
| "loss": 3.1226, | |
| "step": 63750 | |
| }, | |
| { | |
| "epoch": 18.58436261943603, | |
| "grad_norm": 0.4308457374572754, | |
| "learning_rate": 4.2540840140023335e-05, | |
| "loss": 3.1279, | |
| "step": 63800 | |
| }, | |
| { | |
| "epoch": 18.598927988813795, | |
| "grad_norm": 0.4411686360836029, | |
| "learning_rate": 4.210326721120187e-05, | |
| "loss": 3.1335, | |
| "step": 63850 | |
| }, | |
| { | |
| "epoch": 18.613493358191565, | |
| "grad_norm": 0.4220650792121887, | |
| "learning_rate": 4.166569428238039e-05, | |
| "loss": 3.1346, | |
| "step": 63900 | |
| }, | |
| { | |
| "epoch": 18.628058727569332, | |
| "grad_norm": 0.4371688961982727, | |
| "learning_rate": 4.122812135355892e-05, | |
| "loss": 3.1426, | |
| "step": 63950 | |
| }, | |
| { | |
| "epoch": 18.6426240969471, | |
| "grad_norm": 0.4275096356868744, | |
| "learning_rate": 4.079054842473745e-05, | |
| "loss": 3.1398, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 18.6426240969471, | |
| "eval_accuracy": 0.37634160932449345, | |
| "eval_loss": 3.5230259895324707, | |
| "eval_runtime": 180.4164, | |
| "eval_samples_per_second": 92.253, | |
| "eval_steps_per_second": 5.77, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 18.657189466324866, | |
| "grad_norm": 0.4456894099712372, | |
| "learning_rate": 4.035297549591598e-05, | |
| "loss": 3.1328, | |
| "step": 64050 | |
| }, | |
| { | |
| "epoch": 18.671754835702632, | |
| "grad_norm": 0.4323650896549225, | |
| "learning_rate": 3.9915402567094514e-05, | |
| "loss": 3.1381, | |
| "step": 64100 | |
| }, | |
| { | |
| "epoch": 18.6863202050804, | |
| "grad_norm": 0.4565901756286621, | |
| "learning_rate": 3.947782963827304e-05, | |
| "loss": 3.1454, | |
| "step": 64150 | |
| }, | |
| { | |
| "epoch": 18.70088557445817, | |
| "grad_norm": 0.4353121221065521, | |
| "learning_rate": 3.9040256709451574e-05, | |
| "loss": 3.1341, | |
| "step": 64200 | |
| }, | |
| { | |
| "epoch": 18.715450943835936, | |
| "grad_norm": 0.43359240889549255, | |
| "learning_rate": 3.86026837806301e-05, | |
| "loss": 3.1411, | |
| "step": 64250 | |
| }, | |
| { | |
| "epoch": 18.730016313213703, | |
| "grad_norm": 0.4561856985092163, | |
| "learning_rate": 3.816511085180863e-05, | |
| "loss": 3.1373, | |
| "step": 64300 | |
| }, | |
| { | |
| "epoch": 18.74458168259147, | |
| "grad_norm": 0.45646873116493225, | |
| "learning_rate": 3.7727537922987166e-05, | |
| "loss": 3.1408, | |
| "step": 64350 | |
| }, | |
| { | |
| "epoch": 18.759147051969236, | |
| "grad_norm": 0.4355615973472595, | |
| "learning_rate": 3.728996499416569e-05, | |
| "loss": 3.1335, | |
| "step": 64400 | |
| }, | |
| { | |
| "epoch": 18.773712421347007, | |
| "grad_norm": 0.4390393793582916, | |
| "learning_rate": 3.685239206534422e-05, | |
| "loss": 3.1276, | |
| "step": 64450 | |
| }, | |
| { | |
| "epoch": 18.788277790724774, | |
| "grad_norm": 0.4501241147518158, | |
| "learning_rate": 3.641481913652275e-05, | |
| "loss": 3.137, | |
| "step": 64500 | |
| }, | |
| { | |
| "epoch": 18.80284316010254, | |
| "grad_norm": 0.44744792580604553, | |
| "learning_rate": 3.597724620770128e-05, | |
| "loss": 3.1348, | |
| "step": 64550 | |
| }, | |
| { | |
| "epoch": 18.817408529480307, | |
| "grad_norm": 0.4351864755153656, | |
| "learning_rate": 3.553967327887981e-05, | |
| "loss": 3.1368, | |
| "step": 64600 | |
| }, | |
| { | |
| "epoch": 18.831973898858074, | |
| "grad_norm": 0.44498175382614136, | |
| "learning_rate": 3.5102100350058345e-05, | |
| "loss": 3.1375, | |
| "step": 64650 | |
| }, | |
| { | |
| "epoch": 18.846539268235844, | |
| "grad_norm": 0.4396421015262604, | |
| "learning_rate": 3.466452742123687e-05, | |
| "loss": 3.124, | |
| "step": 64700 | |
| }, | |
| { | |
| "epoch": 18.86110463761361, | |
| "grad_norm": 0.43623414635658264, | |
| "learning_rate": 3.42269544924154e-05, | |
| "loss": 3.1377, | |
| "step": 64750 | |
| }, | |
| { | |
| "epoch": 18.875670006991378, | |
| "grad_norm": 0.4486501216888428, | |
| "learning_rate": 3.378938156359393e-05, | |
| "loss": 3.1481, | |
| "step": 64800 | |
| }, | |
| { | |
| "epoch": 18.890235376369144, | |
| "grad_norm": 0.4401872754096985, | |
| "learning_rate": 3.335180863477246e-05, | |
| "loss": 3.135, | |
| "step": 64850 | |
| }, | |
| { | |
| "epoch": 18.90480074574691, | |
| "grad_norm": 0.43119847774505615, | |
| "learning_rate": 3.291423570595099e-05, | |
| "loss": 3.1324, | |
| "step": 64900 | |
| }, | |
| { | |
| "epoch": 18.919366115124678, | |
| "grad_norm": 0.44835567474365234, | |
| "learning_rate": 3.247666277712952e-05, | |
| "loss": 3.1322, | |
| "step": 64950 | |
| }, | |
| { | |
| "epoch": 18.93393148450245, | |
| "grad_norm": 0.4232603907585144, | |
| "learning_rate": 3.203908984830805e-05, | |
| "loss": 3.1284, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 18.93393148450245, | |
| "eval_accuracy": 0.3767057444700135, | |
| "eval_loss": 3.518871307373047, | |
| "eval_runtime": 180.463, | |
| "eval_samples_per_second": 92.229, | |
| "eval_steps_per_second": 5.768, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 18.948496853880215, | |
| "grad_norm": 0.4348292350769043, | |
| "learning_rate": 3.1601516919486576e-05, | |
| "loss": 3.1338, | |
| "step": 65050 | |
| }, | |
| { | |
| "epoch": 18.96306222325798, | |
| "grad_norm": 0.4335695207118988, | |
| "learning_rate": 3.116394399066511e-05, | |
| "loss": 3.1433, | |
| "step": 65100 | |
| }, | |
| { | |
| "epoch": 18.97762759263575, | |
| "grad_norm": 0.43353986740112305, | |
| "learning_rate": 3.072637106184364e-05, | |
| "loss": 3.1328, | |
| "step": 65150 | |
| }, | |
| { | |
| "epoch": 18.992192962013515, | |
| "grad_norm": 0.42962124943733215, | |
| "learning_rate": 3.028879813302217e-05, | |
| "loss": 3.1416, | |
| "step": 65200 | |
| }, | |
| { | |
| "epoch": 19.006700069913773, | |
| "grad_norm": 0.42514732480049133, | |
| "learning_rate": 2.98512252042007e-05, | |
| "loss": 3.1258, | |
| "step": 65250 | |
| }, | |
| { | |
| "epoch": 19.02126543929154, | |
| "grad_norm": 0.4310900866985321, | |
| "learning_rate": 2.9413652275379225e-05, | |
| "loss": 3.096, | |
| "step": 65300 | |
| }, | |
| { | |
| "epoch": 19.035830808669306, | |
| "grad_norm": 0.4301711618900299, | |
| "learning_rate": 2.8976079346557755e-05, | |
| "loss": 3.1224, | |
| "step": 65350 | |
| }, | |
| { | |
| "epoch": 19.050396178047077, | |
| "grad_norm": 0.45508843660354614, | |
| "learning_rate": 2.8538506417736288e-05, | |
| "loss": 3.105, | |
| "step": 65400 | |
| }, | |
| { | |
| "epoch": 19.064961547424844, | |
| "grad_norm": 0.4345990717411041, | |
| "learning_rate": 2.8100933488914818e-05, | |
| "loss": 3.0982, | |
| "step": 65450 | |
| }, | |
| { | |
| "epoch": 19.07952691680261, | |
| "grad_norm": 0.459573894739151, | |
| "learning_rate": 2.7663360560093347e-05, | |
| "loss": 3.1139, | |
| "step": 65500 | |
| }, | |
| { | |
| "epoch": 19.094092286180377, | |
| "grad_norm": 0.4281415045261383, | |
| "learning_rate": 2.7225787631271874e-05, | |
| "loss": 3.0963, | |
| "step": 65550 | |
| }, | |
| { | |
| "epoch": 19.108657655558144, | |
| "grad_norm": 0.43718597292900085, | |
| "learning_rate": 2.6788214702450404e-05, | |
| "loss": 3.0962, | |
| "step": 65600 | |
| }, | |
| { | |
| "epoch": 19.123223024935914, | |
| "grad_norm": 0.44362103939056396, | |
| "learning_rate": 2.6350641773628937e-05, | |
| "loss": 3.1158, | |
| "step": 65650 | |
| }, | |
| { | |
| "epoch": 19.13778839431368, | |
| "grad_norm": 0.4409373700618744, | |
| "learning_rate": 2.5913068844807467e-05, | |
| "loss": 3.1172, | |
| "step": 65700 | |
| }, | |
| { | |
| "epoch": 19.152353763691448, | |
| "grad_norm": 0.4337795078754425, | |
| "learning_rate": 2.5475495915985996e-05, | |
| "loss": 3.0972, | |
| "step": 65750 | |
| }, | |
| { | |
| "epoch": 19.166919133069214, | |
| "grad_norm": 0.41607293486595154, | |
| "learning_rate": 2.5037922987164523e-05, | |
| "loss": 3.0982, | |
| "step": 65800 | |
| }, | |
| { | |
| "epoch": 19.18148450244698, | |
| "grad_norm": 0.43771815299987793, | |
| "learning_rate": 2.4600350058343052e-05, | |
| "loss": 3.1119, | |
| "step": 65850 | |
| }, | |
| { | |
| "epoch": 19.196049871824748, | |
| "grad_norm": 0.43401068449020386, | |
| "learning_rate": 2.4162777129521586e-05, | |
| "loss": 3.1157, | |
| "step": 65900 | |
| }, | |
| { | |
| "epoch": 19.210615241202518, | |
| "grad_norm": 0.4301515817642212, | |
| "learning_rate": 2.3725204200700115e-05, | |
| "loss": 3.106, | |
| "step": 65950 | |
| }, | |
| { | |
| "epoch": 19.225180610580285, | |
| "grad_norm": 0.43195417523384094, | |
| "learning_rate": 2.3287631271878645e-05, | |
| "loss": 3.1103, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 19.225180610580285, | |
| "eval_accuracy": 0.37647012069486907, | |
| "eval_loss": 3.523946762084961, | |
| "eval_runtime": 180.3151, | |
| "eval_samples_per_second": 92.305, | |
| "eval_steps_per_second": 5.773, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 19.23974597995805, | |
| "grad_norm": 0.43875738978385925, | |
| "learning_rate": 2.2850058343057175e-05, | |
| "loss": 3.1139, | |
| "step": 66050 | |
| }, | |
| { | |
| "epoch": 19.25431134933582, | |
| "grad_norm": 0.4393763542175293, | |
| "learning_rate": 2.24124854142357e-05, | |
| "loss": 3.1166, | |
| "step": 66100 | |
| }, | |
| { | |
| "epoch": 19.268876718713585, | |
| "grad_norm": 0.4409915804862976, | |
| "learning_rate": 2.1974912485414234e-05, | |
| "loss": 3.1088, | |
| "step": 66150 | |
| }, | |
| { | |
| "epoch": 19.283442088091356, | |
| "grad_norm": 0.4492509961128235, | |
| "learning_rate": 2.1537339556592764e-05, | |
| "loss": 3.1115, | |
| "step": 66200 | |
| }, | |
| { | |
| "epoch": 19.298007457469122, | |
| "grad_norm": 0.44549205899238586, | |
| "learning_rate": 2.1099766627771294e-05, | |
| "loss": 3.1064, | |
| "step": 66250 | |
| }, | |
| { | |
| "epoch": 19.31257282684689, | |
| "grad_norm": 0.4245654046535492, | |
| "learning_rate": 2.0662193698949824e-05, | |
| "loss": 3.1117, | |
| "step": 66300 | |
| }, | |
| { | |
| "epoch": 19.327138196224656, | |
| "grad_norm": 0.44473904371261597, | |
| "learning_rate": 2.022462077012835e-05, | |
| "loss": 3.1209, | |
| "step": 66350 | |
| }, | |
| { | |
| "epoch": 19.341703565602423, | |
| "grad_norm": 0.42317965626716614, | |
| "learning_rate": 1.9787047841306883e-05, | |
| "loss": 3.1024, | |
| "step": 66400 | |
| }, | |
| { | |
| "epoch": 19.356268934980193, | |
| "grad_norm": 0.4334244728088379, | |
| "learning_rate": 1.9349474912485413e-05, | |
| "loss": 3.1172, | |
| "step": 66450 | |
| }, | |
| { | |
| "epoch": 19.37083430435796, | |
| "grad_norm": 0.44188904762268066, | |
| "learning_rate": 1.8911901983663943e-05, | |
| "loss": 3.1083, | |
| "step": 66500 | |
| }, | |
| { | |
| "epoch": 19.385399673735726, | |
| "grad_norm": 0.4287189245223999, | |
| "learning_rate": 1.8474329054842473e-05, | |
| "loss": 3.1162, | |
| "step": 66550 | |
| }, | |
| { | |
| "epoch": 19.399965043113493, | |
| "grad_norm": 0.4341401159763336, | |
| "learning_rate": 1.8036756126021002e-05, | |
| "loss": 3.1143, | |
| "step": 66600 | |
| }, | |
| { | |
| "epoch": 19.41453041249126, | |
| "grad_norm": 0.4297761619091034, | |
| "learning_rate": 1.7599183197199532e-05, | |
| "loss": 3.1174, | |
| "step": 66650 | |
| }, | |
| { | |
| "epoch": 19.429095781869027, | |
| "grad_norm": 0.4521428644657135, | |
| "learning_rate": 1.7161610268378062e-05, | |
| "loss": 3.0979, | |
| "step": 66700 | |
| }, | |
| { | |
| "epoch": 19.443661151246797, | |
| "grad_norm": 0.43041300773620605, | |
| "learning_rate": 1.672403733955659e-05, | |
| "loss": 3.1183, | |
| "step": 66750 | |
| }, | |
| { | |
| "epoch": 19.458226520624564, | |
| "grad_norm": 0.43775051832199097, | |
| "learning_rate": 1.628646441073512e-05, | |
| "loss": 3.1043, | |
| "step": 66800 | |
| }, | |
| { | |
| "epoch": 19.47279189000233, | |
| "grad_norm": 0.4432560205459595, | |
| "learning_rate": 1.584889148191365e-05, | |
| "loss": 3.1062, | |
| "step": 66850 | |
| }, | |
| { | |
| "epoch": 19.487357259380097, | |
| "grad_norm": 0.4381709396839142, | |
| "learning_rate": 1.541131855309218e-05, | |
| "loss": 3.1185, | |
| "step": 66900 | |
| }, | |
| { | |
| "epoch": 19.501922628757864, | |
| "grad_norm": 0.42818522453308105, | |
| "learning_rate": 1.497374562427071e-05, | |
| "loss": 3.115, | |
| "step": 66950 | |
| }, | |
| { | |
| "epoch": 19.516487998135634, | |
| "grad_norm": 0.43918377161026, | |
| "learning_rate": 1.453617269544924e-05, | |
| "loss": 3.1006, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 19.516487998135634, | |
| "eval_accuracy": 0.3768343734171232, | |
| "eval_loss": 3.5216574668884277, | |
| "eval_runtime": 180.3716, | |
| "eval_samples_per_second": 92.276, | |
| "eval_steps_per_second": 5.771, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 19.5310533675134, | |
| "grad_norm": 0.42537999153137207, | |
| "learning_rate": 1.409859976662777e-05, | |
| "loss": 3.1003, | |
| "step": 67050 | |
| }, | |
| { | |
| "epoch": 19.545618736891168, | |
| "grad_norm": 0.4393448531627655, | |
| "learning_rate": 1.36610268378063e-05, | |
| "loss": 3.1052, | |
| "step": 67100 | |
| }, | |
| { | |
| "epoch": 19.560184106268935, | |
| "grad_norm": 0.4335193336009979, | |
| "learning_rate": 1.322345390898483e-05, | |
| "loss": 3.1101, | |
| "step": 67150 | |
| }, | |
| { | |
| "epoch": 19.5747494756467, | |
| "grad_norm": 0.444814532995224, | |
| "learning_rate": 1.278588098016336e-05, | |
| "loss": 3.1153, | |
| "step": 67200 | |
| }, | |
| { | |
| "epoch": 19.589314845024468, | |
| "grad_norm": 0.43410614132881165, | |
| "learning_rate": 1.234830805134189e-05, | |
| "loss": 3.1015, | |
| "step": 67250 | |
| }, | |
| { | |
| "epoch": 19.60388021440224, | |
| "grad_norm": 0.4306865930557251, | |
| "learning_rate": 1.191073512252042e-05, | |
| "loss": 3.109, | |
| "step": 67300 | |
| }, | |
| { | |
| "epoch": 19.618445583780005, | |
| "grad_norm": 0.4378669857978821, | |
| "learning_rate": 1.1473162193698949e-05, | |
| "loss": 3.1203, | |
| "step": 67350 | |
| }, | |
| { | |
| "epoch": 19.633010953157772, | |
| "grad_norm": 0.4324400722980499, | |
| "learning_rate": 1.1035589264877479e-05, | |
| "loss": 3.1169, | |
| "step": 67400 | |
| }, | |
| { | |
| "epoch": 19.64757632253554, | |
| "grad_norm": 0.4444780945777893, | |
| "learning_rate": 1.0598016336056008e-05, | |
| "loss": 3.1076, | |
| "step": 67450 | |
| }, | |
| { | |
| "epoch": 19.662141691913305, | |
| "grad_norm": 0.45415130257606506, | |
| "learning_rate": 1.0160443407234538e-05, | |
| "loss": 3.1047, | |
| "step": 67500 | |
| }, | |
| { | |
| "epoch": 19.676707061291076, | |
| "grad_norm": 0.4280719459056854, | |
| "learning_rate": 9.72287047841307e-06, | |
| "loss": 3.1012, | |
| "step": 67550 | |
| }, | |
| { | |
| "epoch": 19.691272430668842, | |
| "grad_norm": 0.44762077927589417, | |
| "learning_rate": 9.285297549591598e-06, | |
| "loss": 3.1153, | |
| "step": 67600 | |
| }, | |
| { | |
| "epoch": 19.70583780004661, | |
| "grad_norm": 0.4268578290939331, | |
| "learning_rate": 8.847724620770127e-06, | |
| "loss": 3.1035, | |
| "step": 67650 | |
| }, | |
| { | |
| "epoch": 19.720403169424376, | |
| "grad_norm": 0.4514009356498718, | |
| "learning_rate": 8.410151691948657e-06, | |
| "loss": 3.1084, | |
| "step": 67700 | |
| }, | |
| { | |
| "epoch": 19.734968538802143, | |
| "grad_norm": 0.4248492121696472, | |
| "learning_rate": 7.972578763127187e-06, | |
| "loss": 3.1178, | |
| "step": 67750 | |
| }, | |
| { | |
| "epoch": 19.749533908179913, | |
| "grad_norm": 0.4309752881526947, | |
| "learning_rate": 7.535005834305717e-06, | |
| "loss": 3.1176, | |
| "step": 67800 | |
| }, | |
| { | |
| "epoch": 19.76409927755768, | |
| "grad_norm": 0.43412092328071594, | |
| "learning_rate": 7.0974329054842465e-06, | |
| "loss": 3.1268, | |
| "step": 67850 | |
| }, | |
| { | |
| "epoch": 19.778664646935447, | |
| "grad_norm": 0.42582693696022034, | |
| "learning_rate": 6.659859976662777e-06, | |
| "loss": 3.1031, | |
| "step": 67900 | |
| }, | |
| { | |
| "epoch": 19.793230016313213, | |
| "grad_norm": 0.4391798973083496, | |
| "learning_rate": 6.222287047841307e-06, | |
| "loss": 3.1095, | |
| "step": 67950 | |
| }, | |
| { | |
| "epoch": 19.80779538569098, | |
| "grad_norm": 0.4283028841018677, | |
| "learning_rate": 5.784714119019836e-06, | |
| "loss": 3.1085, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 19.80779538569098, | |
| "eval_accuracy": 0.3768690585536839, | |
| "eval_loss": 3.520920753479004, | |
| "eval_runtime": 180.5112, | |
| "eval_samples_per_second": 92.205, | |
| "eval_steps_per_second": 5.767, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 19.822360755068747, | |
| "grad_norm": 0.43811023235321045, | |
| "learning_rate": 5.3471411901983655e-06, | |
| "loss": 3.1013, | |
| "step": 68050 | |
| }, | |
| { | |
| "epoch": 19.836926124446517, | |
| "grad_norm": 0.4432801306247711, | |
| "learning_rate": 4.909568261376895e-06, | |
| "loss": 3.1118, | |
| "step": 68100 | |
| }, | |
| { | |
| "epoch": 19.851491493824284, | |
| "grad_norm": 0.433024525642395, | |
| "learning_rate": 4.471995332555426e-06, | |
| "loss": 3.1143, | |
| "step": 68150 | |
| }, | |
| { | |
| "epoch": 19.86605686320205, | |
| "grad_norm": 0.42264324426651, | |
| "learning_rate": 4.034422403733955e-06, | |
| "loss": 3.1129, | |
| "step": 68200 | |
| }, | |
| { | |
| "epoch": 19.880622232579817, | |
| "grad_norm": 0.42544034123420715, | |
| "learning_rate": 3.596849474912485e-06, | |
| "loss": 3.1102, | |
| "step": 68250 | |
| }, | |
| { | |
| "epoch": 19.895187601957584, | |
| "grad_norm": 0.4385799765586853, | |
| "learning_rate": 3.159276546091015e-06, | |
| "loss": 3.1155, | |
| "step": 68300 | |
| }, | |
| { | |
| "epoch": 19.909752971335354, | |
| "grad_norm": 0.4359734058380127, | |
| "learning_rate": 2.7217036172695445e-06, | |
| "loss": 3.0982, | |
| "step": 68350 | |
| }, | |
| { | |
| "epoch": 19.92431834071312, | |
| "grad_norm": 0.43336305022239685, | |
| "learning_rate": 2.2841306884480747e-06, | |
| "loss": 3.1107, | |
| "step": 68400 | |
| }, | |
| { | |
| "epoch": 19.938883710090888, | |
| "grad_norm": 0.4339163899421692, | |
| "learning_rate": 1.8465577596266043e-06, | |
| "loss": 3.1284, | |
| "step": 68450 | |
| }, | |
| { | |
| "epoch": 19.953449079468655, | |
| "grad_norm": 0.4268462359905243, | |
| "learning_rate": 1.408984830805134e-06, | |
| "loss": 3.1164, | |
| "step": 68500 | |
| }, | |
| { | |
| "epoch": 19.96801444884642, | |
| "grad_norm": 0.4449523091316223, | |
| "learning_rate": 9.714119019836638e-07, | |
| "loss": 3.1082, | |
| "step": 68550 | |
| }, | |
| { | |
| "epoch": 19.982579818224192, | |
| "grad_norm": 0.4338447153568268, | |
| "learning_rate": 5.338389731621937e-07, | |
| "loss": 3.1144, | |
| "step": 68600 | |
| }, | |
| { | |
| "epoch": 19.99714518760196, | |
| "grad_norm": 0.430207222700119, | |
| "learning_rate": 9.626604434072343e-08, | |
| "loss": 3.11, | |
| "step": 68650 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "step": 68660, | |
| "total_flos": 1.43513603407872e+18, | |
| "train_loss": 3.4363333413179364, | |
| "train_runtime": 137192.0021, | |
| "train_samples_per_second": 40.035, | |
| "train_steps_per_second": 0.5 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 68660, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 20, | |
| "save_steps": 10000, | |
| "stateful_callbacks": { | |
| "EarlyStoppingCallback": { | |
| "args": { | |
| "early_stopping_patience": 20, | |
| "early_stopping_threshold": 0.0 | |
| }, | |
| "attributes": { | |
| "early_stopping_patience_counter": 3 | |
| } | |
| }, | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.43513603407872e+18, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |