diff --git "a/cost_to_hit_frequency_2128/checkpoint-40000/trainer_state.json" "b/cost_to_hit_frequency_2128/checkpoint-40000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/cost_to_hit_frequency_2128/checkpoint-40000/trainer_state.json" @@ -0,0 +1,6003 @@ +{ + "best_global_step": 40000, + "best_metric": 3.5464437007904053, + "best_model_checkpoint": "/scratch/cl5625/exceptions/models/cost_to_hit_frequency_2128/checkpoint-40000", + "epoch": 11.651654625961314, + "eval_steps": 1000, + "global_step": 40000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01456536937776742, + "grad_norm": 1.1913238763809204, + "learning_rate": 0.000294, + "loss": 8.4438, + "step": 50 + }, + { + "epoch": 0.02913073875553484, + "grad_norm": 0.7407680749893188, + "learning_rate": 0.0005939999999999999, + "loss": 6.7092, + "step": 100 + }, + { + "epoch": 0.04369610813330226, + "grad_norm": 0.637363851070404, + "learning_rate": 0.0005995711785297549, + "loss": 6.3304, + "step": 150 + }, + { + "epoch": 0.05826147751106968, + "grad_norm": 0.5566583275794983, + "learning_rate": 0.0005991336056009335, + "loss": 6.1355, + "step": 200 + }, + { + "epoch": 0.0728268468888371, + "grad_norm": 0.47791680693626404, + "learning_rate": 0.000598696032672112, + "loss": 5.9734, + "step": 250 + }, + { + "epoch": 0.08739221626660452, + "grad_norm": 0.4901290833950043, + "learning_rate": 0.0005982584597432905, + "loss": 5.8595, + "step": 300 + }, + { + "epoch": 0.10195758564437195, + "grad_norm": 0.4541548788547516, + "learning_rate": 0.0005978208868144691, + "loss": 5.7462, + "step": 350 + }, + { + "epoch": 0.11652295502213936, + "grad_norm": 0.5347028374671936, + "learning_rate": 0.0005973833138856476, + "loss": 5.621, + "step": 400 + }, + { + "epoch": 0.13108832439990678, + "grad_norm": 0.5162431597709656, + "learning_rate": 0.000596945740956826, + "loss": 5.5125, + "step": 450 + }, + { + "epoch": 0.1456536937776742, + "grad_norm": 0.46286195516586304, + "learning_rate": 0.0005965081680280046, + "loss": 5.4126, + "step": 500 + }, + { + "epoch": 0.16021906315544163, + "grad_norm": 0.4214600622653961, + "learning_rate": 0.0005960705950991831, + "loss": 5.3353, + "step": 550 + }, + { + "epoch": 0.17478443253320905, + "grad_norm": 0.4491782486438751, + "learning_rate": 0.0005956330221703616, + "loss": 5.2624, + "step": 600 + }, + { + "epoch": 0.18934980191097647, + "grad_norm": 0.5439804792404175, + "learning_rate": 0.0005951954492415402, + "loss": 5.1937, + "step": 650 + }, + { + "epoch": 0.2039151712887439, + "grad_norm": 0.4217555522918701, + "learning_rate": 0.0005947578763127188, + "loss": 5.1259, + "step": 700 + }, + { + "epoch": 0.2184805406665113, + "grad_norm": 0.3988608121871948, + "learning_rate": 0.0005943203033838973, + "loss": 5.0741, + "step": 750 + }, + { + "epoch": 0.23304591004427871, + "grad_norm": 0.3983953297138214, + "learning_rate": 0.0005938827304550758, + "loss": 5.0375, + "step": 800 + }, + { + "epoch": 0.24761127942204614, + "grad_norm": 0.4305328130722046, + "learning_rate": 0.0005934451575262544, + "loss": 4.9659, + "step": 850 + }, + { + "epoch": 0.26217664879981356, + "grad_norm": 0.5058379173278809, + "learning_rate": 0.0005930075845974328, + "loss": 4.9392, + "step": 900 + }, + { + "epoch": 0.276742018177581, + "grad_norm": 0.4486232399940491, + "learning_rate": 0.0005925700116686113, + "loss": 4.888, + "step": 950 + }, + { + "epoch": 0.2913073875553484, + "grad_norm": 0.41395482420921326, + "learning_rate": 0.0005921324387397899, + "loss": 4.836, + "step": 1000 + }, + { + "epoch": 0.2913073875553484, + "eval_accuracy": 0.25472411559956376, + "eval_loss": 4.754175662994385, + "eval_runtime": 179.9073, + "eval_samples_per_second": 92.514, + "eval_steps_per_second": 5.786, + "step": 1000 + }, + { + "epoch": 0.30587275693311583, + "grad_norm": 0.4587019383907318, + "learning_rate": 0.0005916948658109684, + "loss": 4.7942, + "step": 1050 + }, + { + "epoch": 0.32043812631088325, + "grad_norm": 0.45875293016433716, + "learning_rate": 0.000591257292882147, + "loss": 4.7273, + "step": 1100 + }, + { + "epoch": 0.3350034956886507, + "grad_norm": 0.4781964123249054, + "learning_rate": 0.0005908197199533255, + "loss": 4.7014, + "step": 1150 + }, + { + "epoch": 0.3495688650664181, + "grad_norm": 0.492291122674942, + "learning_rate": 0.0005903821470245041, + "loss": 4.6737, + "step": 1200 + }, + { + "epoch": 0.3641342344441855, + "grad_norm": 0.42865535616874695, + "learning_rate": 0.0005899445740956826, + "loss": 4.6405, + "step": 1250 + }, + { + "epoch": 0.37869960382195295, + "grad_norm": 0.43935948610305786, + "learning_rate": 0.0005895070011668611, + "loss": 4.594, + "step": 1300 + }, + { + "epoch": 0.39326497319972037, + "grad_norm": 0.43477925658226013, + "learning_rate": 0.0005890694282380397, + "loss": 4.5799, + "step": 1350 + }, + { + "epoch": 0.4078303425774878, + "grad_norm": 0.43411099910736084, + "learning_rate": 0.0005886318553092181, + "loss": 4.5577, + "step": 1400 + }, + { + "epoch": 0.42239571195525516, + "grad_norm": 0.4020370841026306, + "learning_rate": 0.0005881942823803966, + "loss": 4.5398, + "step": 1450 + }, + { + "epoch": 0.4369610813330226, + "grad_norm": 0.4862031936645508, + "learning_rate": 0.0005877567094515752, + "loss": 4.5186, + "step": 1500 + }, + { + "epoch": 0.45152645071079, + "grad_norm": 0.3783005177974701, + "learning_rate": 0.0005873191365227537, + "loss": 4.4805, + "step": 1550 + }, + { + "epoch": 0.46609182008855743, + "grad_norm": 0.4555555582046509, + "learning_rate": 0.0005868815635939323, + "loss": 4.4699, + "step": 1600 + }, + { + "epoch": 0.48065718946632485, + "grad_norm": 0.3953476846218109, + "learning_rate": 0.0005864439906651108, + "loss": 4.4484, + "step": 1650 + }, + { + "epoch": 0.4952225588440923, + "grad_norm": 0.44499197602272034, + "learning_rate": 0.0005860064177362894, + "loss": 4.4183, + "step": 1700 + }, + { + "epoch": 0.5097879282218597, + "grad_norm": 0.45466503500938416, + "learning_rate": 0.0005855688448074679, + "loss": 4.4102, + "step": 1750 + }, + { + "epoch": 0.5243532975996271, + "grad_norm": 0.43747514486312866, + "learning_rate": 0.0005851312718786464, + "loss": 4.3962, + "step": 1800 + }, + { + "epoch": 0.5389186669773945, + "grad_norm": 0.3796531558036804, + "learning_rate": 0.0005846936989498249, + "loss": 4.3864, + "step": 1850 + }, + { + "epoch": 0.553484036355162, + "grad_norm": 0.4407300353050232, + "learning_rate": 0.0005842561260210034, + "loss": 4.3651, + "step": 1900 + }, + { + "epoch": 0.5680494057329294, + "grad_norm": 0.4483519196510315, + "learning_rate": 0.000583818553092182, + "loss": 4.3465, + "step": 1950 + }, + { + "epoch": 0.5826147751106968, + "grad_norm": 0.4081006348133087, + "learning_rate": 0.0005833809801633605, + "loss": 4.3423, + "step": 2000 + }, + { + "epoch": 0.5826147751106968, + "eval_accuracy": 0.29926300551528945, + "eval_loss": 4.2851409912109375, + "eval_runtime": 179.879, + "eval_samples_per_second": 92.529, + "eval_steps_per_second": 5.787, + "step": 2000 + }, + { + "epoch": 0.5971801444884642, + "grad_norm": 0.3948284685611725, + "learning_rate": 0.000582943407234539, + "loss": 4.3215, + "step": 2050 + }, + { + "epoch": 0.6117455138662317, + "grad_norm": 0.38788270950317383, + "learning_rate": 0.0005825058343057176, + "loss": 4.3212, + "step": 2100 + }, + { + "epoch": 0.6263108832439991, + "grad_norm": 0.37169507145881653, + "learning_rate": 0.0005820682613768961, + "loss": 4.3175, + "step": 2150 + }, + { + "epoch": 0.6408762526217665, + "grad_norm": 0.36698848009109497, + "learning_rate": 0.0005816306884480747, + "loss": 4.2876, + "step": 2200 + }, + { + "epoch": 0.6554416219995339, + "grad_norm": 0.42129504680633545, + "learning_rate": 0.0005811931155192532, + "loss": 4.2811, + "step": 2250 + }, + { + "epoch": 0.6700069913773014, + "grad_norm": 0.40725767612457275, + "learning_rate": 0.0005807555425904316, + "loss": 4.2676, + "step": 2300 + }, + { + "epoch": 0.6845723607550688, + "grad_norm": 0.3666594922542572, + "learning_rate": 0.0005803179696616102, + "loss": 4.259, + "step": 2350 + }, + { + "epoch": 0.6991377301328362, + "grad_norm": 0.411522775888443, + "learning_rate": 0.0005798803967327887, + "loss": 4.2528, + "step": 2400 + }, + { + "epoch": 0.7137030995106036, + "grad_norm": 0.343448668718338, + "learning_rate": 0.0005794428238039673, + "loss": 4.2396, + "step": 2450 + }, + { + "epoch": 0.728268468888371, + "grad_norm": 0.3675486445426941, + "learning_rate": 0.0005790052508751458, + "loss": 4.2248, + "step": 2500 + }, + { + "epoch": 0.7428338382661385, + "grad_norm": 0.36124101281166077, + "learning_rate": 0.0005785676779463243, + "loss": 4.2171, + "step": 2550 + }, + { + "epoch": 0.7573992076439059, + "grad_norm": 0.4069299101829529, + "learning_rate": 0.0005781301050175029, + "loss": 4.2053, + "step": 2600 + }, + { + "epoch": 0.7719645770216733, + "grad_norm": 0.39434754848480225, + "learning_rate": 0.0005776925320886814, + "loss": 4.21, + "step": 2650 + }, + { + "epoch": 0.7865299463994407, + "grad_norm": 0.3574545085430145, + "learning_rate": 0.00057725495915986, + "loss": 4.1949, + "step": 2700 + }, + { + "epoch": 0.8010953157772082, + "grad_norm": 0.36652427911758423, + "learning_rate": 0.0005768173862310384, + "loss": 4.1699, + "step": 2750 + }, + { + "epoch": 0.8156606851549756, + "grad_norm": 0.3449154496192932, + "learning_rate": 0.0005763798133022169, + "loss": 4.1699, + "step": 2800 + }, + { + "epoch": 0.8302260545327429, + "grad_norm": 0.3682800531387329, + "learning_rate": 0.0005759422403733955, + "loss": 4.1685, + "step": 2850 + }, + { + "epoch": 0.8447914239105103, + "grad_norm": 0.39873358607292175, + "learning_rate": 0.000575504667444574, + "loss": 4.1501, + "step": 2900 + }, + { + "epoch": 0.8593567932882777, + "grad_norm": 0.3736512362957001, + "learning_rate": 0.0005750670945157526, + "loss": 4.1473, + "step": 2950 + }, + { + "epoch": 0.8739221626660452, + "grad_norm": 0.38885951042175293, + "learning_rate": 0.0005746295215869311, + "loss": 4.1418, + "step": 3000 + }, + { + "epoch": 0.8739221626660452, + "eval_accuracy": 0.31548835966817024, + "eval_loss": 4.09907865524292, + "eval_runtime": 179.8464, + "eval_samples_per_second": 92.546, + "eval_steps_per_second": 5.788, + "step": 3000 + }, + { + "epoch": 0.8884875320438126, + "grad_norm": 0.34839940071105957, + "learning_rate": 0.0005741919486581096, + "loss": 4.1499, + "step": 3050 + }, + { + "epoch": 0.90305290142158, + "grad_norm": 0.3890705704689026, + "learning_rate": 0.0005737543757292882, + "loss": 4.1416, + "step": 3100 + }, + { + "epoch": 0.9176182707993474, + "grad_norm": 0.3607308566570282, + "learning_rate": 0.0005733168028004667, + "loss": 4.1086, + "step": 3150 + }, + { + "epoch": 0.9321836401771149, + "grad_norm": 0.351632297039032, + "learning_rate": 0.0005728792298716453, + "loss": 4.1185, + "step": 3200 + }, + { + "epoch": 0.9467490095548823, + "grad_norm": 0.3545263707637787, + "learning_rate": 0.0005724416569428237, + "loss": 4.1113, + "step": 3250 + }, + { + "epoch": 0.9613143789326497, + "grad_norm": 0.37550088763237, + "learning_rate": 0.0005720040840140023, + "loss": 4.107, + "step": 3300 + }, + { + "epoch": 0.9758797483104171, + "grad_norm": 0.40243715047836304, + "learning_rate": 0.0005715665110851808, + "loss": 4.1036, + "step": 3350 + }, + { + "epoch": 0.9904451176881846, + "grad_norm": 0.3520545959472656, + "learning_rate": 0.0005711289381563593, + "loss": 4.0915, + "step": 3400 + }, + { + "epoch": 1.0049522255884409, + "grad_norm": 0.3803820013999939, + "learning_rate": 0.0005706913652275379, + "loss": 4.0746, + "step": 3450 + }, + { + "epoch": 1.0195175949662083, + "grad_norm": 0.3791220784187317, + "learning_rate": 0.0005702537922987164, + "loss": 4.0221, + "step": 3500 + }, + { + "epoch": 1.0340829643439757, + "grad_norm": 0.3519602417945862, + "learning_rate": 0.0005698162193698949, + "loss": 4.0213, + "step": 3550 + }, + { + "epoch": 1.0486483337217432, + "grad_norm": 0.36723291873931885, + "learning_rate": 0.0005693786464410735, + "loss": 4.0157, + "step": 3600 + }, + { + "epoch": 1.0632137030995106, + "grad_norm": 0.3530062139034271, + "learning_rate": 0.000568941073512252, + "loss": 4.0094, + "step": 3650 + }, + { + "epoch": 1.077779072477278, + "grad_norm": 0.3847055435180664, + "learning_rate": 0.0005685035005834305, + "loss": 4.0082, + "step": 3700 + }, + { + "epoch": 1.0923444418550454, + "grad_norm": 0.3274436593055725, + "learning_rate": 0.000568065927654609, + "loss": 4.0044, + "step": 3750 + }, + { + "epoch": 1.1069098112328128, + "grad_norm": 0.34672999382019043, + "learning_rate": 0.0005676283547257876, + "loss": 4.0247, + "step": 3800 + }, + { + "epoch": 1.1214751806105803, + "grad_norm": 0.33540818095207214, + "learning_rate": 0.0005671907817969661, + "loss": 3.9935, + "step": 3850 + }, + { + "epoch": 1.1360405499883477, + "grad_norm": 0.3458578288555145, + "learning_rate": 0.0005667532088681446, + "loss": 3.9953, + "step": 3900 + }, + { + "epoch": 1.1506059193661151, + "grad_norm": 0.3823903799057007, + "learning_rate": 0.0005663156359393232, + "loss": 4.0052, + "step": 3950 + }, + { + "epoch": 1.1651712887438825, + "grad_norm": 0.37612202763557434, + "learning_rate": 0.0005658780630105017, + "loss": 3.991, + "step": 4000 + }, + { + "epoch": 1.1651712887438825, + "eval_accuracy": 0.3252509910542918, + "eval_loss": 3.993682384490967, + "eval_runtime": 179.8018, + "eval_samples_per_second": 92.569, + "eval_steps_per_second": 5.79, + "step": 4000 + }, + { + "epoch": 1.17973665812165, + "grad_norm": 0.36251288652420044, + "learning_rate": 0.0005654404900816802, + "loss": 3.986, + "step": 4050 + }, + { + "epoch": 1.1943020274994174, + "grad_norm": 0.3479040265083313, + "learning_rate": 0.0005650029171528588, + "loss": 3.9921, + "step": 4100 + }, + { + "epoch": 1.2088673968771848, + "grad_norm": 0.3300114870071411, + "learning_rate": 0.0005645653442240373, + "loss": 3.9802, + "step": 4150 + }, + { + "epoch": 1.2234327662549522, + "grad_norm": 0.33517342805862427, + "learning_rate": 0.0005641277712952158, + "loss": 3.9771, + "step": 4200 + }, + { + "epoch": 1.2379981356327197, + "grad_norm": 0.3363065719604492, + "learning_rate": 0.0005636901983663943, + "loss": 3.9818, + "step": 4250 + }, + { + "epoch": 1.252563505010487, + "grad_norm": 0.39410993456840515, + "learning_rate": 0.0005632526254375729, + "loss": 3.9672, + "step": 4300 + }, + { + "epoch": 1.2671288743882545, + "grad_norm": 0.3455098271369934, + "learning_rate": 0.0005628150525087514, + "loss": 3.9731, + "step": 4350 + }, + { + "epoch": 1.281694243766022, + "grad_norm": 0.3514918088912964, + "learning_rate": 0.0005623774795799299, + "loss": 3.9632, + "step": 4400 + }, + { + "epoch": 1.2962596131437893, + "grad_norm": 0.35402923822402954, + "learning_rate": 0.0005619399066511085, + "loss": 3.9597, + "step": 4450 + }, + { + "epoch": 1.3108249825215568, + "grad_norm": 0.331036239862442, + "learning_rate": 0.000561502333722287, + "loss": 3.9444, + "step": 4500 + }, + { + "epoch": 1.3253903518993242, + "grad_norm": 0.35744568705558777, + "learning_rate": 0.0005610647607934655, + "loss": 3.9531, + "step": 4550 + }, + { + "epoch": 1.3399557212770916, + "grad_norm": 0.3515322208404541, + "learning_rate": 0.000560627187864644, + "loss": 3.9612, + "step": 4600 + }, + { + "epoch": 1.354521090654859, + "grad_norm": 0.32326042652130127, + "learning_rate": 0.0005601896149358226, + "loss": 3.9558, + "step": 4650 + }, + { + "epoch": 1.3690864600326265, + "grad_norm": 0.32586470246315, + "learning_rate": 0.0005597520420070011, + "loss": 3.9452, + "step": 4700 + }, + { + "epoch": 1.3836518294103939, + "grad_norm": 0.3541634976863861, + "learning_rate": 0.0005593144690781796, + "loss": 3.9452, + "step": 4750 + }, + { + "epoch": 1.3982171987881613, + "grad_norm": 0.33788588643074036, + "learning_rate": 0.0005588768961493582, + "loss": 3.9384, + "step": 4800 + }, + { + "epoch": 1.4127825681659287, + "grad_norm": 0.3441423773765564, + "learning_rate": 0.0005584393232205367, + "loss": 3.936, + "step": 4850 + }, + { + "epoch": 1.4273479375436962, + "grad_norm": 0.3493860960006714, + "learning_rate": 0.0005580017502917152, + "loss": 3.939, + "step": 4900 + }, + { + "epoch": 1.4419133069214636, + "grad_norm": 0.3450307846069336, + "learning_rate": 0.0005575641773628938, + "loss": 3.9262, + "step": 4950 + }, + { + "epoch": 1.456478676299231, + "grad_norm": 0.3555508553981781, + "learning_rate": 0.0005571266044340723, + "loss": 3.9297, + "step": 5000 + }, + { + "epoch": 1.456478676299231, + "eval_accuracy": 0.33168749420934585, + "eval_loss": 3.914679527282715, + "eval_runtime": 179.6257, + "eval_samples_per_second": 92.659, + "eval_steps_per_second": 5.795, + "step": 5000 + }, + { + "epoch": 1.4710440456769984, + "grad_norm": 0.31701815128326416, + "learning_rate": 0.0005566890315052507, + "loss": 3.9287, + "step": 5050 + }, + { + "epoch": 1.4856094150547658, + "grad_norm": 0.34660544991493225, + "learning_rate": 0.0005562514585764293, + "loss": 3.9227, + "step": 5100 + }, + { + "epoch": 1.500174784432533, + "grad_norm": 0.31572017073631287, + "learning_rate": 0.0005558138856476079, + "loss": 3.9105, + "step": 5150 + }, + { + "epoch": 1.5147401538103007, + "grad_norm": 0.36169928312301636, + "learning_rate": 0.0005553763127187864, + "loss": 3.9195, + "step": 5200 + }, + { + "epoch": 1.529305523188068, + "grad_norm": 0.3572075068950653, + "learning_rate": 0.0005549387397899649, + "loss": 3.9057, + "step": 5250 + }, + { + "epoch": 1.5438708925658355, + "grad_norm": 0.35884082317352295, + "learning_rate": 0.0005545011668611435, + "loss": 3.9182, + "step": 5300 + }, + { + "epoch": 1.5584362619436027, + "grad_norm": 0.33528369665145874, + "learning_rate": 0.000554063593932322, + "loss": 3.9182, + "step": 5350 + }, + { + "epoch": 1.5730016313213704, + "grad_norm": 0.31933894753456116, + "learning_rate": 0.0005536260210035005, + "loss": 3.9024, + "step": 5400 + }, + { + "epoch": 1.5875670006991376, + "grad_norm": 0.34619244933128357, + "learning_rate": 0.0005531884480746791, + "loss": 3.9082, + "step": 5450 + }, + { + "epoch": 1.6021323700769052, + "grad_norm": 0.34431859850883484, + "learning_rate": 0.0005527508751458577, + "loss": 3.9046, + "step": 5500 + }, + { + "epoch": 1.6166977394546724, + "grad_norm": 0.3247990608215332, + "learning_rate": 0.0005523133022170361, + "loss": 3.8984, + "step": 5550 + }, + { + "epoch": 1.63126310883244, + "grad_norm": 0.33569249510765076, + "learning_rate": 0.0005518757292882146, + "loss": 3.8857, + "step": 5600 + }, + { + "epoch": 1.6458284782102073, + "grad_norm": 0.326007217168808, + "learning_rate": 0.0005514381563593932, + "loss": 3.8984, + "step": 5650 + }, + { + "epoch": 1.660393847587975, + "grad_norm": 0.3553823232650757, + "learning_rate": 0.0005510005834305717, + "loss": 3.8857, + "step": 5700 + }, + { + "epoch": 1.6749592169657421, + "grad_norm": 0.31800103187561035, + "learning_rate": 0.0005505630105017502, + "loss": 3.8956, + "step": 5750 + }, + { + "epoch": 1.6895245863435098, + "grad_norm": 0.3153764009475708, + "learning_rate": 0.0005501254375729288, + "loss": 3.8895, + "step": 5800 + }, + { + "epoch": 1.704089955721277, + "grad_norm": 0.3242632746696472, + "learning_rate": 0.0005496878646441073, + "loss": 3.888, + "step": 5850 + }, + { + "epoch": 1.7186553250990446, + "grad_norm": 0.33604300022125244, + "learning_rate": 0.0005492502917152858, + "loss": 3.886, + "step": 5900 + }, + { + "epoch": 1.7332206944768118, + "grad_norm": 0.35920587182044983, + "learning_rate": 0.0005488127187864644, + "loss": 3.8789, + "step": 5950 + }, + { + "epoch": 1.7477860638545795, + "grad_norm": 0.3230355381965637, + "learning_rate": 0.000548375145857643, + "loss": 3.8779, + "step": 6000 + }, + { + "epoch": 1.7477860638545795, + "eval_accuracy": 0.33706357279951615, + "eval_loss": 3.8543789386749268, + "eval_runtime": 179.7654, + "eval_samples_per_second": 92.587, + "eval_steps_per_second": 5.791, + "step": 6000 + }, + { + "epoch": 1.7623514332323467, + "grad_norm": 0.3344341814517975, + "learning_rate": 0.0005479375729288214, + "loss": 3.8676, + "step": 6050 + }, + { + "epoch": 1.7769168026101143, + "grad_norm": 0.3225545585155487, + "learning_rate": 0.0005474999999999999, + "loss": 3.8757, + "step": 6100 + }, + { + "epoch": 1.7914821719878815, + "grad_norm": 0.33318787813186646, + "learning_rate": 0.0005470624270711785, + "loss": 3.8667, + "step": 6150 + }, + { + "epoch": 1.8060475413656492, + "grad_norm": 0.3072070777416229, + "learning_rate": 0.000546624854142357, + "loss": 3.8627, + "step": 6200 + }, + { + "epoch": 1.8206129107434164, + "grad_norm": 0.32514503598213196, + "learning_rate": 0.0005461872812135355, + "loss": 3.8647, + "step": 6250 + }, + { + "epoch": 1.835178280121184, + "grad_norm": 0.3363496661186218, + "learning_rate": 0.0005457497082847141, + "loss": 3.8712, + "step": 6300 + }, + { + "epoch": 1.8497436494989512, + "grad_norm": 0.31175696849823, + "learning_rate": 0.0005453121353558927, + "loss": 3.8674, + "step": 6350 + }, + { + "epoch": 1.8643090188767188, + "grad_norm": 0.33162710070610046, + "learning_rate": 0.0005448745624270712, + "loss": 3.8721, + "step": 6400 + }, + { + "epoch": 1.878874388254486, + "grad_norm": 0.3097343146800995, + "learning_rate": 0.0005444369894982496, + "loss": 3.8637, + "step": 6450 + }, + { + "epoch": 1.8934397576322537, + "grad_norm": 0.33735862374305725, + "learning_rate": 0.0005439994165694282, + "loss": 3.8451, + "step": 6500 + }, + { + "epoch": 1.908005127010021, + "grad_norm": 0.3092336058616638, + "learning_rate": 0.0005435618436406067, + "loss": 3.8552, + "step": 6550 + }, + { + "epoch": 1.9225704963877885, + "grad_norm": 0.32548123598098755, + "learning_rate": 0.0005431242707117852, + "loss": 3.8537, + "step": 6600 + }, + { + "epoch": 1.9371358657655557, + "grad_norm": 0.3191821575164795, + "learning_rate": 0.0005426866977829638, + "loss": 3.846, + "step": 6650 + }, + { + "epoch": 1.9517012351433234, + "grad_norm": 0.33417537808418274, + "learning_rate": 0.0005422491248541423, + "loss": 3.8518, + "step": 6700 + }, + { + "epoch": 1.9662666045210906, + "grad_norm": 0.32248765230178833, + "learning_rate": 0.0005418115519253208, + "loss": 3.8579, + "step": 6750 + }, + { + "epoch": 1.9808319738988582, + "grad_norm": 0.33786964416503906, + "learning_rate": 0.0005413739789964994, + "loss": 3.8316, + "step": 6800 + }, + { + "epoch": 1.9953973432766254, + "grad_norm": 0.3355385661125183, + "learning_rate": 0.000540936406067678, + "loss": 3.8404, + "step": 6850 + }, + { + "epoch": 2.0099044511768818, + "grad_norm": 0.32335567474365234, + "learning_rate": 0.0005404988331388564, + "loss": 3.7795, + "step": 6900 + }, + { + "epoch": 2.0244698205546494, + "grad_norm": 0.3314116895198822, + "learning_rate": 0.0005400612602100349, + "loss": 3.7512, + "step": 6950 + }, + { + "epoch": 2.0390351899324166, + "grad_norm": 0.3404753804206848, + "learning_rate": 0.0005396236872812135, + "loss": 3.7427, + "step": 7000 + }, + { + "epoch": 2.0390351899324166, + "eval_accuracy": 0.34164365689980253, + "eval_loss": 3.8123531341552734, + "eval_runtime": 180.0803, + "eval_samples_per_second": 92.425, + "eval_steps_per_second": 5.781, + "step": 7000 + }, + { + "epoch": 2.0536005593101843, + "grad_norm": 0.35776785016059875, + "learning_rate": 0.000539186114352392, + "loss": 3.7381, + "step": 7050 + }, + { + "epoch": 2.0681659286879515, + "grad_norm": 0.3245725929737091, + "learning_rate": 0.0005387485414235705, + "loss": 3.7485, + "step": 7100 + }, + { + "epoch": 2.082731298065719, + "grad_norm": 0.3353221118450165, + "learning_rate": 0.0005383109684947491, + "loss": 3.7375, + "step": 7150 + }, + { + "epoch": 2.0972966674434863, + "grad_norm": 0.33368387818336487, + "learning_rate": 0.0005378733955659276, + "loss": 3.7618, + "step": 7200 + }, + { + "epoch": 2.111862036821254, + "grad_norm": 0.35302773118019104, + "learning_rate": 0.0005374358226371061, + "loss": 3.7519, + "step": 7250 + }, + { + "epoch": 2.126427406199021, + "grad_norm": 0.3177225589752197, + "learning_rate": 0.0005369982497082847, + "loss": 3.7598, + "step": 7300 + }, + { + "epoch": 2.140992775576789, + "grad_norm": 0.34392455220222473, + "learning_rate": 0.0005365606767794633, + "loss": 3.7631, + "step": 7350 + }, + { + "epoch": 2.155558144954556, + "grad_norm": 0.3233015835285187, + "learning_rate": 0.0005361231038506417, + "loss": 3.7618, + "step": 7400 + }, + { + "epoch": 2.1701235143323236, + "grad_norm": 0.31578004360198975, + "learning_rate": 0.0005356855309218202, + "loss": 3.7679, + "step": 7450 + }, + { + "epoch": 2.184688883710091, + "grad_norm": 0.3494773209095001, + "learning_rate": 0.0005352479579929988, + "loss": 3.7493, + "step": 7500 + }, + { + "epoch": 2.1992542530878585, + "grad_norm": 0.3228057026863098, + "learning_rate": 0.0005348103850641773, + "loss": 3.7475, + "step": 7550 + }, + { + "epoch": 2.2138196224656257, + "grad_norm": 0.3268294632434845, + "learning_rate": 0.0005343728121353558, + "loss": 3.7601, + "step": 7600 + }, + { + "epoch": 2.2283849918433933, + "grad_norm": 0.32106488943099976, + "learning_rate": 0.0005339352392065344, + "loss": 3.7661, + "step": 7650 + }, + { + "epoch": 2.2429503612211605, + "grad_norm": 0.3412560224533081, + "learning_rate": 0.000533497666277713, + "loss": 3.7514, + "step": 7700 + }, + { + "epoch": 2.257515730598928, + "grad_norm": 0.33492568135261536, + "learning_rate": 0.0005330600933488915, + "loss": 3.7667, + "step": 7750 + }, + { + "epoch": 2.2720810999766954, + "grad_norm": 0.34016239643096924, + "learning_rate": 0.00053262252042007, + "loss": 3.7499, + "step": 7800 + }, + { + "epoch": 2.286646469354463, + "grad_norm": 0.327343225479126, + "learning_rate": 0.0005321849474912485, + "loss": 3.7587, + "step": 7850 + }, + { + "epoch": 2.3012118387322302, + "grad_norm": 0.3204312026500702, + "learning_rate": 0.000531747374562427, + "loss": 3.7561, + "step": 7900 + }, + { + "epoch": 2.3157772081099974, + "grad_norm": 0.3505200445652008, + "learning_rate": 0.0005313098016336055, + "loss": 3.7534, + "step": 7950 + }, + { + "epoch": 2.330342577487765, + "grad_norm": 0.31797534227371216, + "learning_rate": 0.0005308722287047841, + "loss": 3.7492, + "step": 8000 + }, + { + "epoch": 2.330342577487765, + "eval_accuracy": 0.3447044144419973, + "eval_loss": 3.781189441680908, + "eval_runtime": 180.3098, + "eval_samples_per_second": 92.308, + "eval_steps_per_second": 5.773, + "step": 8000 + }, + { + "epoch": 2.3449079468655327, + "grad_norm": 0.3364429771900177, + "learning_rate": 0.0005304346557759626, + "loss": 3.7339, + "step": 8050 + }, + { + "epoch": 2.3594733162433, + "grad_norm": 0.31472164392471313, + "learning_rate": 0.0005299970828471411, + "loss": 3.73, + "step": 8100 + }, + { + "epoch": 2.374038685621067, + "grad_norm": 0.3065818250179291, + "learning_rate": 0.0005295595099183197, + "loss": 3.7486, + "step": 8150 + }, + { + "epoch": 2.3886040549988348, + "grad_norm": 0.3349589705467224, + "learning_rate": 0.0005291219369894983, + "loss": 3.7328, + "step": 8200 + }, + { + "epoch": 2.4031694243766024, + "grad_norm": 0.33365535736083984, + "learning_rate": 0.0005286843640606768, + "loss": 3.7319, + "step": 8250 + }, + { + "epoch": 2.4177347937543696, + "grad_norm": 0.34016332030296326, + "learning_rate": 0.0005282467911318552, + "loss": 3.7389, + "step": 8300 + }, + { + "epoch": 2.432300163132137, + "grad_norm": 0.32065296173095703, + "learning_rate": 0.0005278092182030338, + "loss": 3.7554, + "step": 8350 + }, + { + "epoch": 2.4468655325099045, + "grad_norm": 0.3385309875011444, + "learning_rate": 0.0005273716452742123, + "loss": 3.7452, + "step": 8400 + }, + { + "epoch": 2.461430901887672, + "grad_norm": 0.3271431028842926, + "learning_rate": 0.0005269340723453908, + "loss": 3.7506, + "step": 8450 + }, + { + "epoch": 2.4759962712654393, + "grad_norm": 0.3121528625488281, + "learning_rate": 0.0005264964994165694, + "loss": 3.7521, + "step": 8500 + }, + { + "epoch": 2.4905616406432065, + "grad_norm": 0.3151465654373169, + "learning_rate": 0.000526058926487748, + "loss": 3.7439, + "step": 8550 + }, + { + "epoch": 2.505127010020974, + "grad_norm": 0.3443325161933899, + "learning_rate": 0.0005256213535589265, + "loss": 3.7441, + "step": 8600 + }, + { + "epoch": 2.519692379398742, + "grad_norm": 0.32063865661621094, + "learning_rate": 0.000525183780630105, + "loss": 3.7501, + "step": 8650 + }, + { + "epoch": 2.534257748776509, + "grad_norm": 0.3402131497859955, + "learning_rate": 0.0005247462077012836, + "loss": 3.7476, + "step": 8700 + }, + { + "epoch": 2.548823118154276, + "grad_norm": 0.322399377822876, + "learning_rate": 0.000524308634772462, + "loss": 3.7502, + "step": 8750 + }, + { + "epoch": 2.563388487532044, + "grad_norm": 0.33992311358451843, + "learning_rate": 0.0005238710618436405, + "loss": 3.7434, + "step": 8800 + }, + { + "epoch": 2.5779538569098115, + "grad_norm": 0.31691116094589233, + "learning_rate": 0.0005234334889148191, + "loss": 3.7352, + "step": 8850 + }, + { + "epoch": 2.5925192262875787, + "grad_norm": 0.3292892277240753, + "learning_rate": 0.0005229959159859976, + "loss": 3.7338, + "step": 8900 + }, + { + "epoch": 2.607084595665346, + "grad_norm": 0.31549301743507385, + "learning_rate": 0.0005225583430571761, + "loss": 3.7453, + "step": 8950 + }, + { + "epoch": 2.6216499650431135, + "grad_norm": 0.3125064969062805, + "learning_rate": 0.0005221207701283547, + "loss": 3.7372, + "step": 9000 + }, + { + "epoch": 2.6216499650431135, + "eval_accuracy": 0.34737834452898997, + "eval_loss": 3.7514328956604004, + "eval_runtime": 179.7257, + "eval_samples_per_second": 92.608, + "eval_steps_per_second": 5.792, + "step": 9000 + }, + { + "epoch": 2.636215334420881, + "grad_norm": 0.3117610514163971, + "learning_rate": 0.0005216831971995333, + "loss": 3.7072, + "step": 9050 + }, + { + "epoch": 2.6507807037986484, + "grad_norm": 0.3186918795108795, + "learning_rate": 0.0005212456242707118, + "loss": 3.7313, + "step": 9100 + }, + { + "epoch": 2.6653460731764156, + "grad_norm": 0.33037465810775757, + "learning_rate": 0.0005208080513418903, + "loss": 3.7283, + "step": 9150 + }, + { + "epoch": 2.6799114425541832, + "grad_norm": 0.32975658774375916, + "learning_rate": 0.0005203704784130689, + "loss": 3.7291, + "step": 9200 + }, + { + "epoch": 2.6944768119319504, + "grad_norm": 0.3381806015968323, + "learning_rate": 0.0005199329054842473, + "loss": 3.7192, + "step": 9250 + }, + { + "epoch": 2.709042181309718, + "grad_norm": 0.3226553797721863, + "learning_rate": 0.0005194953325554258, + "loss": 3.7279, + "step": 9300 + }, + { + "epoch": 2.7236075506874853, + "grad_norm": 0.320027232170105, + "learning_rate": 0.0005190577596266044, + "loss": 3.7251, + "step": 9350 + }, + { + "epoch": 2.738172920065253, + "grad_norm": 0.3155761957168579, + "learning_rate": 0.0005186201866977829, + "loss": 3.7159, + "step": 9400 + }, + { + "epoch": 2.75273828944302, + "grad_norm": 0.3248502314090729, + "learning_rate": 0.0005181826137689614, + "loss": 3.7201, + "step": 9450 + }, + { + "epoch": 2.7673036588207878, + "grad_norm": 0.3296821117401123, + "learning_rate": 0.00051774504084014, + "loss": 3.7267, + "step": 9500 + }, + { + "epoch": 2.781869028198555, + "grad_norm": 0.3494844138622284, + "learning_rate": 0.0005173074679113186, + "loss": 3.725, + "step": 9550 + }, + { + "epoch": 2.7964343975763226, + "grad_norm": 0.3254939913749695, + "learning_rate": 0.0005168698949824971, + "loss": 3.7221, + "step": 9600 + }, + { + "epoch": 2.81099976695409, + "grad_norm": 0.3118003308773041, + "learning_rate": 0.0005164323220536755, + "loss": 3.719, + "step": 9650 + }, + { + "epoch": 2.8255651363318575, + "grad_norm": 0.3459826707839966, + "learning_rate": 0.0005159947491248541, + "loss": 3.7268, + "step": 9700 + }, + { + "epoch": 2.8401305057096247, + "grad_norm": 0.31478872895240784, + "learning_rate": 0.0005155571761960326, + "loss": 3.7241, + "step": 9750 + }, + { + "epoch": 2.8546958750873923, + "grad_norm": 0.3117355704307556, + "learning_rate": 0.0005151196032672111, + "loss": 3.7101, + "step": 9800 + }, + { + "epoch": 2.8692612444651595, + "grad_norm": 0.3150256872177124, + "learning_rate": 0.0005146820303383897, + "loss": 3.7186, + "step": 9850 + }, + { + "epoch": 2.883826613842927, + "grad_norm": 0.3244450092315674, + "learning_rate": 0.0005142444574095682, + "loss": 3.7062, + "step": 9900 + }, + { + "epoch": 2.8983919832206944, + "grad_norm": 0.3051128387451172, + "learning_rate": 0.0005138068844807468, + "loss": 3.7218, + "step": 9950 + }, + { + "epoch": 2.912957352598462, + "grad_norm": 0.35902562737464905, + "learning_rate": 0.0005133693115519253, + "loss": 3.7056, + "step": 10000 + }, + { + "epoch": 2.912957352598462, + "eval_accuracy": 0.3498066568184394, + "eval_loss": 3.726057767868042, + "eval_runtime": 180.1415, + "eval_samples_per_second": 92.394, + "eval_steps_per_second": 5.779, + "step": 10000 + }, + { + "epoch": 2.927522721976229, + "grad_norm": 0.31749260425567627, + "learning_rate": 0.0005129317386231039, + "loss": 3.7179, + "step": 10050 + }, + { + "epoch": 2.942088091353997, + "grad_norm": 0.3012363612651825, + "learning_rate": 0.0005124941656942824, + "loss": 3.712, + "step": 10100 + }, + { + "epoch": 2.956653460731764, + "grad_norm": 0.3233492076396942, + "learning_rate": 0.0005120565927654608, + "loss": 3.7127, + "step": 10150 + }, + { + "epoch": 2.9712188301095317, + "grad_norm": 0.3380107283592224, + "learning_rate": 0.0005116190198366394, + "loss": 3.7189, + "step": 10200 + }, + { + "epoch": 2.985784199487299, + "grad_norm": 0.3121177852153778, + "learning_rate": 0.0005111814469078179, + "loss": 3.7162, + "step": 10250 + }, + { + "epoch": 3.0002913073875552, + "grad_norm": 0.3301170766353607, + "learning_rate": 0.0005107438739789964, + "loss": 3.7098, + "step": 10300 + }, + { + "epoch": 3.014856676765323, + "grad_norm": 0.3270030617713928, + "learning_rate": 0.000510306301050175, + "loss": 3.6062, + "step": 10350 + }, + { + "epoch": 3.02942204614309, + "grad_norm": 0.3287683129310608, + "learning_rate": 0.0005098687281213535, + "loss": 3.597, + "step": 10400 + }, + { + "epoch": 3.0439874155208577, + "grad_norm": 0.3282028138637543, + "learning_rate": 0.0005094311551925321, + "loss": 3.6045, + "step": 10450 + }, + { + "epoch": 3.058552784898625, + "grad_norm": 0.34291520714759827, + "learning_rate": 0.0005089935822637106, + "loss": 3.6151, + "step": 10500 + }, + { + "epoch": 3.0731181542763926, + "grad_norm": 0.31705862283706665, + "learning_rate": 0.0005085560093348892, + "loss": 3.6231, + "step": 10550 + }, + { + "epoch": 3.0876835236541598, + "grad_norm": 0.3140444755554199, + "learning_rate": 0.0005081184364060676, + "loss": 3.6261, + "step": 10600 + }, + { + "epoch": 3.1022488930319274, + "grad_norm": 0.3536335527896881, + "learning_rate": 0.0005076808634772461, + "loss": 3.6241, + "step": 10650 + }, + { + "epoch": 3.1168142624096946, + "grad_norm": 0.33482107520103455, + "learning_rate": 0.0005072432905484247, + "loss": 3.6337, + "step": 10700 + }, + { + "epoch": 3.1313796317874623, + "grad_norm": 0.32872864603996277, + "learning_rate": 0.0005068057176196032, + "loss": 3.6181, + "step": 10750 + }, + { + "epoch": 3.1459450011652295, + "grad_norm": 0.326739639043808, + "learning_rate": 0.0005063681446907818, + "loss": 3.6269, + "step": 10800 + }, + { + "epoch": 3.160510370542997, + "grad_norm": 0.31593042612075806, + "learning_rate": 0.0005059305717619603, + "loss": 3.6352, + "step": 10850 + }, + { + "epoch": 3.1750757399207643, + "grad_norm": 0.3400070369243622, + "learning_rate": 0.0005054929988331388, + "loss": 3.6435, + "step": 10900 + }, + { + "epoch": 3.189641109298532, + "grad_norm": 0.33030685782432556, + "learning_rate": 0.0005050554259043174, + "loss": 3.6292, + "step": 10950 + }, + { + "epoch": 3.204206478676299, + "grad_norm": 0.3333072066307068, + "learning_rate": 0.0005046178529754959, + "loss": 3.6285, + "step": 11000 + }, + { + "epoch": 3.204206478676299, + "eval_accuracy": 0.35201263150369827, + "eval_loss": 3.7096149921417236, + "eval_runtime": 180.2303, + "eval_samples_per_second": 92.348, + "eval_steps_per_second": 5.776, + "step": 11000 + }, + { + "epoch": 3.218771848054067, + "grad_norm": 0.32576659321784973, + "learning_rate": 0.0005041802800466744, + "loss": 3.6225, + "step": 11050 + }, + { + "epoch": 3.233337217431834, + "grad_norm": 0.3293328881263733, + "learning_rate": 0.0005037427071178529, + "loss": 3.6389, + "step": 11100 + }, + { + "epoch": 3.2479025868096016, + "grad_norm": 0.30969351530075073, + "learning_rate": 0.0005033051341890314, + "loss": 3.6297, + "step": 11150 + }, + { + "epoch": 3.262467956187369, + "grad_norm": 0.3289102613925934, + "learning_rate": 0.00050286756126021, + "loss": 3.6321, + "step": 11200 + }, + { + "epoch": 3.2770333255651365, + "grad_norm": 0.340069979429245, + "learning_rate": 0.0005024299883313885, + "loss": 3.6328, + "step": 11250 + }, + { + "epoch": 3.2915986949429037, + "grad_norm": 0.3201046586036682, + "learning_rate": 0.0005019924154025671, + "loss": 3.6342, + "step": 11300 + }, + { + "epoch": 3.3061640643206713, + "grad_norm": 0.356716513633728, + "learning_rate": 0.0005015548424737456, + "loss": 3.6261, + "step": 11350 + }, + { + "epoch": 3.3207294336984385, + "grad_norm": 0.3404761254787445, + "learning_rate": 0.0005011172695449241, + "loss": 3.6347, + "step": 11400 + }, + { + "epoch": 3.335294803076206, + "grad_norm": 0.3204871714115143, + "learning_rate": 0.0005006796966161027, + "loss": 3.6267, + "step": 11450 + }, + { + "epoch": 3.3498601724539734, + "grad_norm": 0.33498939871788025, + "learning_rate": 0.0005002421236872811, + "loss": 3.6321, + "step": 11500 + }, + { + "epoch": 3.364425541831741, + "grad_norm": 0.33658871054649353, + "learning_rate": 0.0004998045507584597, + "loss": 3.6384, + "step": 11550 + }, + { + "epoch": 3.3789909112095082, + "grad_norm": 0.3517683148384094, + "learning_rate": 0.0004993669778296382, + "loss": 3.6373, + "step": 11600 + }, + { + "epoch": 3.393556280587276, + "grad_norm": 0.33312374353408813, + "learning_rate": 0.0004989294049008167, + "loss": 3.6344, + "step": 11650 + }, + { + "epoch": 3.408121649965043, + "grad_norm": 0.330685555934906, + "learning_rate": 0.0004984918319719953, + "loss": 3.633, + "step": 11700 + }, + { + "epoch": 3.4226870193428107, + "grad_norm": 0.3397407829761505, + "learning_rate": 0.0004980542590431738, + "loss": 3.6344, + "step": 11750 + }, + { + "epoch": 3.437252388720578, + "grad_norm": 0.3209087550640106, + "learning_rate": 0.0004976166861143524, + "loss": 3.612, + "step": 11800 + }, + { + "epoch": 3.4518177580983456, + "grad_norm": 0.3204724192619324, + "learning_rate": 0.0004971791131855309, + "loss": 3.6334, + "step": 11850 + }, + { + "epoch": 3.4663831274761128, + "grad_norm": 0.33590126037597656, + "learning_rate": 0.0004967415402567094, + "loss": 3.6389, + "step": 11900 + }, + { + "epoch": 3.4809484968538804, + "grad_norm": 0.34319397807121277, + "learning_rate": 0.000496303967327888, + "loss": 3.6334, + "step": 11950 + }, + { + "epoch": 3.4955138662316476, + "grad_norm": 0.34008243680000305, + "learning_rate": 0.0004958663943990664, + "loss": 3.638, + "step": 12000 + }, + { + "epoch": 3.4955138662316476, + "eval_accuracy": 0.35395158942580696, + "eval_loss": 3.6934237480163574, + "eval_runtime": 179.6583, + "eval_samples_per_second": 92.643, + "eval_steps_per_second": 5.794, + "step": 12000 + }, + { + "epoch": 3.510079235609415, + "grad_norm": 0.3379422724246979, + "learning_rate": 0.000495428821470245, + "loss": 3.6293, + "step": 12050 + }, + { + "epoch": 3.5246446049871825, + "grad_norm": 0.31869712471961975, + "learning_rate": 0.0004949912485414235, + "loss": 3.6372, + "step": 12100 + }, + { + "epoch": 3.53920997436495, + "grad_norm": 0.3277227282524109, + "learning_rate": 0.0004945536756126021, + "loss": 3.6288, + "step": 12150 + }, + { + "epoch": 3.5537753437427173, + "grad_norm": 0.3422459661960602, + "learning_rate": 0.0004941161026837806, + "loss": 3.6445, + "step": 12200 + }, + { + "epoch": 3.5683407131204845, + "grad_norm": 0.3337474465370178, + "learning_rate": 0.0004936785297549591, + "loss": 3.64, + "step": 12250 + }, + { + "epoch": 3.582906082498252, + "grad_norm": 0.3305768370628357, + "learning_rate": 0.0004932409568261377, + "loss": 3.6281, + "step": 12300 + }, + { + "epoch": 3.59747145187602, + "grad_norm": 0.332967609167099, + "learning_rate": 0.0004928033838973162, + "loss": 3.6366, + "step": 12350 + }, + { + "epoch": 3.612036821253787, + "grad_norm": 0.32114502787590027, + "learning_rate": 0.0004923658109684946, + "loss": 3.6434, + "step": 12400 + }, + { + "epoch": 3.626602190631554, + "grad_norm": 0.32796093821525574, + "learning_rate": 0.0004919282380396732, + "loss": 3.6469, + "step": 12450 + }, + { + "epoch": 3.641167560009322, + "grad_norm": 0.3242839574813843, + "learning_rate": 0.0004914906651108517, + "loss": 3.6295, + "step": 12500 + }, + { + "epoch": 3.6557329293870895, + "grad_norm": 0.35202574729919434, + "learning_rate": 0.0004910530921820303, + "loss": 3.6405, + "step": 12550 + }, + { + "epoch": 3.6702982987648567, + "grad_norm": 0.32159286737442017, + "learning_rate": 0.0004906155192532088, + "loss": 3.6284, + "step": 12600 + }, + { + "epoch": 3.684863668142624, + "grad_norm": 0.3278411030769348, + "learning_rate": 0.0004901779463243874, + "loss": 3.6271, + "step": 12650 + }, + { + "epoch": 3.6994290375203915, + "grad_norm": 0.35316187143325806, + "learning_rate": 0.0004897403733955659, + "loss": 3.6254, + "step": 12700 + }, + { + "epoch": 3.713994406898159, + "grad_norm": 0.31677183508872986, + "learning_rate": 0.0004893028004667444, + "loss": 3.6352, + "step": 12750 + }, + { + "epoch": 3.7285597762759264, + "grad_norm": 0.32869166135787964, + "learning_rate": 0.000488865227537923, + "loss": 3.6263, + "step": 12800 + }, + { + "epoch": 3.7431251456536936, + "grad_norm": 0.323147714138031, + "learning_rate": 0.0004884276546091015, + "loss": 3.6161, + "step": 12850 + }, + { + "epoch": 3.7576905150314612, + "grad_norm": 0.33107396960258484, + "learning_rate": 0.00048799008168028, + "loss": 3.6205, + "step": 12900 + }, + { + "epoch": 3.772255884409229, + "grad_norm": 0.3485564589500427, + "learning_rate": 0.00048755250875145853, + "loss": 3.6427, + "step": 12950 + }, + { + "epoch": 3.786821253786996, + "grad_norm": 0.31731998920440674, + "learning_rate": 0.0004871149358226371, + "loss": 3.616, + "step": 13000 + }, + { + "epoch": 3.786821253786996, + "eval_accuracy": 0.35548902280095057, + "eval_loss": 3.6741294860839844, + "eval_runtime": 179.6397, + "eval_samples_per_second": 92.652, + "eval_steps_per_second": 5.795, + "step": 13000 + }, + { + "epoch": 3.8013866231647633, + "grad_norm": 0.3320983052253723, + "learning_rate": 0.0004866773628938156, + "loss": 3.6431, + "step": 13050 + }, + { + "epoch": 3.815951992542531, + "grad_norm": 0.34173107147216797, + "learning_rate": 0.0004862397899649941, + "loss": 3.6304, + "step": 13100 + }, + { + "epoch": 3.8305173619202986, + "grad_norm": 0.3142543435096741, + "learning_rate": 0.00048580221703617264, + "loss": 3.644, + "step": 13150 + }, + { + "epoch": 3.8450827312980658, + "grad_norm": 0.3302863836288452, + "learning_rate": 0.00048536464410735123, + "loss": 3.6304, + "step": 13200 + }, + { + "epoch": 3.859648100675833, + "grad_norm": 0.3198452889919281, + "learning_rate": 0.00048492707117852966, + "loss": 3.6437, + "step": 13250 + }, + { + "epoch": 3.8742134700536006, + "grad_norm": 0.3371002674102783, + "learning_rate": 0.00048448949824970826, + "loss": 3.6221, + "step": 13300 + }, + { + "epoch": 3.888778839431368, + "grad_norm": 0.330727756023407, + "learning_rate": 0.0004840519253208868, + "loss": 3.629, + "step": 13350 + }, + { + "epoch": 3.9033442088091355, + "grad_norm": 0.33198925852775574, + "learning_rate": 0.0004836143523920653, + "loss": 3.6337, + "step": 13400 + }, + { + "epoch": 3.9179095781869027, + "grad_norm": 0.3273507356643677, + "learning_rate": 0.0004831767794632438, + "loss": 3.6255, + "step": 13450 + }, + { + "epoch": 3.9324749475646703, + "grad_norm": 0.31935301423072815, + "learning_rate": 0.00048273920653442236, + "loss": 3.6307, + "step": 13500 + }, + { + "epoch": 3.9470403169424375, + "grad_norm": 0.3273596167564392, + "learning_rate": 0.0004823016336056009, + "loss": 3.6276, + "step": 13550 + }, + { + "epoch": 3.961605686320205, + "grad_norm": 0.3235807716846466, + "learning_rate": 0.0004818640606767794, + "loss": 3.6299, + "step": 13600 + }, + { + "epoch": 3.9761710556979724, + "grad_norm": 0.3113962709903717, + "learning_rate": 0.00048142648774795793, + "loss": 3.6323, + "step": 13650 + }, + { + "epoch": 3.99073642507574, + "grad_norm": 0.3231075704097748, + "learning_rate": 0.0004809889148191365, + "loss": 3.6357, + "step": 13700 + }, + { + "epoch": 4.005243532975996, + "grad_norm": 0.35100987553596497, + "learning_rate": 0.000480551341890315, + "loss": 3.5912, + "step": 13750 + }, + { + "epoch": 4.0198089023537635, + "grad_norm": 0.37618836760520935, + "learning_rate": 0.00048011376896149355, + "loss": 3.5127, + "step": 13800 + }, + { + "epoch": 4.034374271731531, + "grad_norm": 0.35220426321029663, + "learning_rate": 0.0004796761960326721, + "loss": 3.534, + "step": 13850 + }, + { + "epoch": 4.048939641109299, + "grad_norm": 0.3237028419971466, + "learning_rate": 0.0004792386231038506, + "loss": 3.5277, + "step": 13900 + }, + { + "epoch": 4.063505010487066, + "grad_norm": 0.32475364208221436, + "learning_rate": 0.0004788010501750291, + "loss": 3.526, + "step": 13950 + }, + { + "epoch": 4.078070379864833, + "grad_norm": 0.3234672546386719, + "learning_rate": 0.00047836347724620766, + "loss": 3.5277, + "step": 14000 + }, + { + "epoch": 4.078070379864833, + "eval_accuracy": 0.3567962409307186, + "eval_loss": 3.6671650409698486, + "eval_runtime": 179.7799, + "eval_samples_per_second": 92.58, + "eval_steps_per_second": 5.79, + "step": 14000 + }, + { + "epoch": 4.092635749242601, + "grad_norm": 0.31527358293533325, + "learning_rate": 0.0004779259043173862, + "loss": 3.5321, + "step": 14050 + }, + { + "epoch": 4.1072011186203685, + "grad_norm": 0.32927918434143066, + "learning_rate": 0.0004774883313885647, + "loss": 3.5355, + "step": 14100 + }, + { + "epoch": 4.121766487998135, + "grad_norm": 0.33241936564445496, + "learning_rate": 0.0004770507584597433, + "loss": 3.5547, + "step": 14150 + }, + { + "epoch": 4.136331857375903, + "grad_norm": 0.32307010889053345, + "learning_rate": 0.0004766131855309218, + "loss": 3.5384, + "step": 14200 + }, + { + "epoch": 4.150897226753671, + "grad_norm": 0.31569403409957886, + "learning_rate": 0.0004761756126021003, + "loss": 3.5468, + "step": 14250 + }, + { + "epoch": 4.165462596131438, + "grad_norm": 0.3237732946872711, + "learning_rate": 0.00047573803967327884, + "loss": 3.5403, + "step": 14300 + }, + { + "epoch": 4.180027965509205, + "grad_norm": 0.33519721031188965, + "learning_rate": 0.0004753004667444574, + "loss": 3.5417, + "step": 14350 + }, + { + "epoch": 4.194593334886973, + "grad_norm": 0.3299810588359833, + "learning_rate": 0.00047486289381563587, + "loss": 3.5518, + "step": 14400 + }, + { + "epoch": 4.20915870426474, + "grad_norm": 0.3094955384731293, + "learning_rate": 0.0004744253208868144, + "loss": 3.5611, + "step": 14450 + }, + { + "epoch": 4.223724073642508, + "grad_norm": 0.3562442660331726, + "learning_rate": 0.00047398774795799295, + "loss": 3.5484, + "step": 14500 + }, + { + "epoch": 4.238289443020275, + "grad_norm": 0.31193310022354126, + "learning_rate": 0.00047355017502917154, + "loss": 3.5554, + "step": 14550 + }, + { + "epoch": 4.252854812398042, + "grad_norm": 0.32945749163627625, + "learning_rate": 0.00047311260210035, + "loss": 3.5588, + "step": 14600 + }, + { + "epoch": 4.26742018177581, + "grad_norm": 0.30832111835479736, + "learning_rate": 0.00047267502917152857, + "loss": 3.5549, + "step": 14650 + }, + { + "epoch": 4.281985551153578, + "grad_norm": 0.31878674030303955, + "learning_rate": 0.0004722374562427071, + "loss": 3.5508, + "step": 14700 + }, + { + "epoch": 4.296550920531344, + "grad_norm": 0.3359583914279938, + "learning_rate": 0.0004717998833138856, + "loss": 3.5533, + "step": 14750 + }, + { + "epoch": 4.311116289909112, + "grad_norm": 0.3579261302947998, + "learning_rate": 0.00047136231038506413, + "loss": 3.5548, + "step": 14800 + }, + { + "epoch": 4.32568165928688, + "grad_norm": 0.3416401147842407, + "learning_rate": 0.00047092473745624267, + "loss": 3.5548, + "step": 14850 + }, + { + "epoch": 4.340247028664647, + "grad_norm": 0.31429240107536316, + "learning_rate": 0.00047048716452742116, + "loss": 3.5629, + "step": 14900 + }, + { + "epoch": 4.354812398042414, + "grad_norm": 0.3537607192993164, + "learning_rate": 0.0004700495915985997, + "loss": 3.5561, + "step": 14950 + }, + { + "epoch": 4.369377767420182, + "grad_norm": 0.32719892263412476, + "learning_rate": 0.0004696120186697783, + "loss": 3.5578, + "step": 15000 + }, + { + "epoch": 4.369377767420182, + "eval_accuracy": 0.35763656184935977, + "eval_loss": 3.656822681427002, + "eval_runtime": 179.9827, + "eval_samples_per_second": 92.476, + "eval_steps_per_second": 5.784, + "step": 15000 + }, + { + "epoch": 4.383943136797949, + "grad_norm": 0.33471086621284485, + "learning_rate": 0.00046917444574095683, + "loss": 3.557, + "step": 15050 + }, + { + "epoch": 4.398508506175717, + "grad_norm": 0.33362334966659546, + "learning_rate": 0.0004687368728121353, + "loss": 3.5585, + "step": 15100 + }, + { + "epoch": 4.413073875553484, + "grad_norm": 0.32362911105155945, + "learning_rate": 0.00046829929988331386, + "loss": 3.546, + "step": 15150 + }, + { + "epoch": 4.427639244931251, + "grad_norm": 0.32808783650398254, + "learning_rate": 0.0004678617269544924, + "loss": 3.5373, + "step": 15200 + }, + { + "epoch": 4.442204614309019, + "grad_norm": 0.33728596568107605, + "learning_rate": 0.0004674241540256709, + "loss": 3.5617, + "step": 15250 + }, + { + "epoch": 4.456769983686787, + "grad_norm": 0.3254978358745575, + "learning_rate": 0.0004669865810968494, + "loss": 3.5573, + "step": 15300 + }, + { + "epoch": 4.471335353064553, + "grad_norm": 0.35697412490844727, + "learning_rate": 0.00046654900816802796, + "loss": 3.557, + "step": 15350 + }, + { + "epoch": 4.485900722442321, + "grad_norm": 0.36106449365615845, + "learning_rate": 0.00046611143523920645, + "loss": 3.5659, + "step": 15400 + }, + { + "epoch": 4.500466091820089, + "grad_norm": 0.3264298141002655, + "learning_rate": 0.00046567386231038504, + "loss": 3.5652, + "step": 15450 + }, + { + "epoch": 4.515031461197856, + "grad_norm": 0.3491400480270386, + "learning_rate": 0.0004652362893815636, + "loss": 3.5647, + "step": 15500 + }, + { + "epoch": 4.529596830575623, + "grad_norm": 0.36997708678245544, + "learning_rate": 0.0004647987164527421, + "loss": 3.5688, + "step": 15550 + }, + { + "epoch": 4.544162199953391, + "grad_norm": 0.3246486783027649, + "learning_rate": 0.0004643611435239206, + "loss": 3.5807, + "step": 15600 + }, + { + "epoch": 4.558727569331158, + "grad_norm": 0.3414935767650604, + "learning_rate": 0.00046392357059509915, + "loss": 3.5806, + "step": 15650 + }, + { + "epoch": 4.573292938708926, + "grad_norm": 0.33668363094329834, + "learning_rate": 0.0004634859976662777, + "loss": 3.5698, + "step": 15700 + }, + { + "epoch": 4.587858308086693, + "grad_norm": 0.3316305875778198, + "learning_rate": 0.0004630484247374562, + "loss": 3.5598, + "step": 15750 + }, + { + "epoch": 4.6024236774644605, + "grad_norm": 0.34208086133003235, + "learning_rate": 0.0004626108518086347, + "loss": 3.5574, + "step": 15800 + }, + { + "epoch": 4.616989046842228, + "grad_norm": 0.3184557259082794, + "learning_rate": 0.0004621732788798133, + "loss": 3.5573, + "step": 15850 + }, + { + "epoch": 4.631554416219995, + "grad_norm": 0.33165499567985535, + "learning_rate": 0.00046173570595099174, + "loss": 3.5583, + "step": 15900 + }, + { + "epoch": 4.6461197855977625, + "grad_norm": 0.3253953456878662, + "learning_rate": 0.00046129813302217033, + "loss": 3.5671, + "step": 15950 + }, + { + "epoch": 4.66068515497553, + "grad_norm": 0.33802923560142517, + "learning_rate": 0.00046086056009334887, + "loss": 3.5631, + "step": 16000 + }, + { + "epoch": 4.66068515497553, + "eval_accuracy": 0.3593661156080293, + "eval_loss": 3.642476797103882, + "eval_runtime": 179.5652, + "eval_samples_per_second": 92.691, + "eval_steps_per_second": 5.797, + "step": 16000 + }, + { + "epoch": 4.675250524353298, + "grad_norm": 0.3159468472003937, + "learning_rate": 0.0004604229871645274, + "loss": 3.5539, + "step": 16050 + }, + { + "epoch": 4.689815893731065, + "grad_norm": 0.31020215153694153, + "learning_rate": 0.0004599854142357059, + "loss": 3.5584, + "step": 16100 + }, + { + "epoch": 4.704381263108832, + "grad_norm": 0.32496050000190735, + "learning_rate": 0.00045954784130688444, + "loss": 3.5691, + "step": 16150 + }, + { + "epoch": 4.7189466324866, + "grad_norm": 0.3142569363117218, + "learning_rate": 0.000459110268378063, + "loss": 3.5447, + "step": 16200 + }, + { + "epoch": 4.7335120018643675, + "grad_norm": 0.32899901270866394, + "learning_rate": 0.00045867269544924146, + "loss": 3.5739, + "step": 16250 + }, + { + "epoch": 4.748077371242134, + "grad_norm": 0.3278380334377289, + "learning_rate": 0.00045823512252042, + "loss": 3.5614, + "step": 16300 + }, + { + "epoch": 4.762642740619902, + "grad_norm": 0.3125869631767273, + "learning_rate": 0.0004577975495915986, + "loss": 3.5607, + "step": 16350 + }, + { + "epoch": 4.7772081099976695, + "grad_norm": 0.34363460540771484, + "learning_rate": 0.0004573599766627771, + "loss": 3.5555, + "step": 16400 + }, + { + "epoch": 4.791773479375437, + "grad_norm": 0.4942171275615692, + "learning_rate": 0.0004569224037339556, + "loss": 3.5587, + "step": 16450 + }, + { + "epoch": 4.806338848753205, + "grad_norm": 0.3390342891216278, + "learning_rate": 0.00045648483080513416, + "loss": 3.5731, + "step": 16500 + }, + { + "epoch": 4.820904218130972, + "grad_norm": 0.3250505328178406, + "learning_rate": 0.0004560472578763127, + "loss": 3.5623, + "step": 16550 + }, + { + "epoch": 4.835469587508739, + "grad_norm": 0.3381325602531433, + "learning_rate": 0.0004556096849474912, + "loss": 3.5687, + "step": 16600 + }, + { + "epoch": 4.850034956886507, + "grad_norm": 0.3354376256465912, + "learning_rate": 0.00045517211201866973, + "loss": 3.5694, + "step": 16650 + }, + { + "epoch": 4.864600326264274, + "grad_norm": 0.3529720902442932, + "learning_rate": 0.00045473453908984827, + "loss": 3.5484, + "step": 16700 + }, + { + "epoch": 4.879165695642041, + "grad_norm": 0.32668349146842957, + "learning_rate": 0.00045429696616102675, + "loss": 3.5741, + "step": 16750 + }, + { + "epoch": 4.893731065019809, + "grad_norm": 0.31390804052352905, + "learning_rate": 0.00045385939323220535, + "loss": 3.5553, + "step": 16800 + }, + { + "epoch": 4.908296434397577, + "grad_norm": 0.3186156153678894, + "learning_rate": 0.0004534218203033839, + "loss": 3.5625, + "step": 16850 + }, + { + "epoch": 4.922861803775344, + "grad_norm": 0.32360565662384033, + "learning_rate": 0.0004529842473745624, + "loss": 3.5605, + "step": 16900 + }, + { + "epoch": 4.937427173153111, + "grad_norm": 0.3357037305831909, + "learning_rate": 0.0004525466744457409, + "loss": 3.5627, + "step": 16950 + }, + { + "epoch": 4.951992542530879, + "grad_norm": 0.3252997100353241, + "learning_rate": 0.00045210910151691945, + "loss": 3.5746, + "step": 17000 + }, + { + "epoch": 4.951992542530879, + "eval_accuracy": 0.3607067255302828, + "eval_loss": 3.6273179054260254, + "eval_runtime": 179.7214, + "eval_samples_per_second": 92.61, + "eval_steps_per_second": 5.792, + "step": 17000 + }, + { + "epoch": 4.966557911908646, + "grad_norm": 0.33965301513671875, + "learning_rate": 0.000451671528588098, + "loss": 3.564, + "step": 17050 + }, + { + "epoch": 4.981123281286413, + "grad_norm": 0.31816357374191284, + "learning_rate": 0.0004512339556592765, + "loss": 3.5581, + "step": 17100 + }, + { + "epoch": 4.995688650664181, + "grad_norm": 0.3402574360370636, + "learning_rate": 0.000450796382730455, + "loss": 3.5655, + "step": 17150 + }, + { + "epoch": 5.010195758564437, + "grad_norm": 0.33764371275901794, + "learning_rate": 0.0004503588098016336, + "loss": 3.4851, + "step": 17200 + }, + { + "epoch": 5.024761127942204, + "grad_norm": 0.3261650502681732, + "learning_rate": 0.0004499212368728121, + "loss": 3.4587, + "step": 17250 + }, + { + "epoch": 5.039326497319972, + "grad_norm": 0.33577781915664673, + "learning_rate": 0.00044948366394399064, + "loss": 3.4494, + "step": 17300 + }, + { + "epoch": 5.0538918666977395, + "grad_norm": 0.3431420922279358, + "learning_rate": 0.0004490460910151692, + "loss": 3.4595, + "step": 17350 + }, + { + "epoch": 5.068457236075507, + "grad_norm": 0.3210841119289398, + "learning_rate": 0.00044860851808634767, + "loss": 3.4537, + "step": 17400 + }, + { + "epoch": 5.083022605453274, + "grad_norm": 0.33073899149894714, + "learning_rate": 0.0004481709451575262, + "loss": 3.4588, + "step": 17450 + }, + { + "epoch": 5.0975879748310415, + "grad_norm": 0.32355406880378723, + "learning_rate": 0.00044773337222870475, + "loss": 3.4677, + "step": 17500 + }, + { + "epoch": 5.112153344208809, + "grad_norm": 0.3393101990222931, + "learning_rate": 0.0004472957992998833, + "loss": 3.4768, + "step": 17550 + }, + { + "epoch": 5.126718713586577, + "grad_norm": 0.3306352198123932, + "learning_rate": 0.00044685822637106177, + "loss": 3.4789, + "step": 17600 + }, + { + "epoch": 5.141284082964344, + "grad_norm": 0.32835835218429565, + "learning_rate": 0.00044642065344224037, + "loss": 3.4766, + "step": 17650 + }, + { + "epoch": 5.155849452342111, + "grad_norm": 0.3455768823623657, + "learning_rate": 0.0004459830805134189, + "loss": 3.4781, + "step": 17700 + }, + { + "epoch": 5.170414821719879, + "grad_norm": 0.33673617243766785, + "learning_rate": 0.0004455455075845974, + "loss": 3.4899, + "step": 17750 + }, + { + "epoch": 5.1849801910976465, + "grad_norm": 0.3145007789134979, + "learning_rate": 0.00044510793465577593, + "loss": 3.4957, + "step": 17800 + }, + { + "epoch": 5.199545560475413, + "grad_norm": 0.3510107696056366, + "learning_rate": 0.00044467036172695447, + "loss": 3.4766, + "step": 17850 + }, + { + "epoch": 5.214110929853181, + "grad_norm": 0.3493402302265167, + "learning_rate": 0.00044423278879813296, + "loss": 3.4842, + "step": 17900 + }, + { + "epoch": 5.228676299230949, + "grad_norm": 0.33777111768722534, + "learning_rate": 0.0004437952158693115, + "loss": 3.4779, + "step": 17950 + }, + { + "epoch": 5.243241668608716, + "grad_norm": 0.3274925947189331, + "learning_rate": 0.00044335764294049004, + "loss": 3.4835, + "step": 18000 + }, + { + "epoch": 5.243241668608716, + "eval_accuracy": 0.3611752688156872, + "eval_loss": 3.630121946334839, + "eval_runtime": 179.8645, + "eval_samples_per_second": 92.536, + "eval_steps_per_second": 5.788, + "step": 18000 + }, + { + "epoch": 5.257807037986483, + "grad_norm": 0.3470570743083954, + "learning_rate": 0.00044292007001166863, + "loss": 3.4807, + "step": 18050 + }, + { + "epoch": 5.272372407364251, + "grad_norm": 0.3509993255138397, + "learning_rate": 0.00044248249708284706, + "loss": 3.492, + "step": 18100 + }, + { + "epoch": 5.286937776742018, + "grad_norm": 0.3337821662425995, + "learning_rate": 0.00044204492415402566, + "loss": 3.4895, + "step": 18150 + }, + { + "epoch": 5.301503146119786, + "grad_norm": 0.3395850956439972, + "learning_rate": 0.0004416073512252042, + "loss": 3.488, + "step": 18200 + }, + { + "epoch": 5.316068515497553, + "grad_norm": 0.32552629709243774, + "learning_rate": 0.0004411697782963827, + "loss": 3.4976, + "step": 18250 + }, + { + "epoch": 5.33063388487532, + "grad_norm": 0.34504395723342896, + "learning_rate": 0.0004407322053675612, + "loss": 3.4972, + "step": 18300 + }, + { + "epoch": 5.345199254253088, + "grad_norm": 0.3230566084384918, + "learning_rate": 0.00044029463243873976, + "loss": 3.4979, + "step": 18350 + }, + { + "epoch": 5.359764623630856, + "grad_norm": 0.337710440158844, + "learning_rate": 0.00043985705950991825, + "loss": 3.5005, + "step": 18400 + }, + { + "epoch": 5.374329993008622, + "grad_norm": 0.3194814920425415, + "learning_rate": 0.0004394194865810968, + "loss": 3.5184, + "step": 18450 + }, + { + "epoch": 5.38889536238639, + "grad_norm": 0.3493628203868866, + "learning_rate": 0.00043898191365227533, + "loss": 3.5122, + "step": 18500 + }, + { + "epoch": 5.403460731764158, + "grad_norm": 0.3941914737224579, + "learning_rate": 0.0004385443407234539, + "loss": 3.5004, + "step": 18550 + }, + { + "epoch": 5.418026101141925, + "grad_norm": 0.35434696078300476, + "learning_rate": 0.0004381067677946324, + "loss": 3.5012, + "step": 18600 + }, + { + "epoch": 5.432591470519692, + "grad_norm": 0.34531646966934204, + "learning_rate": 0.00043766919486581095, + "loss": 3.493, + "step": 18650 + }, + { + "epoch": 5.44715683989746, + "grad_norm": 0.33792534470558167, + "learning_rate": 0.0004372316219369895, + "loss": 3.516, + "step": 18700 + }, + { + "epoch": 5.461722209275227, + "grad_norm": 0.3471498191356659, + "learning_rate": 0.00043679404900816797, + "loss": 3.4946, + "step": 18750 + }, + { + "epoch": 5.476287578652995, + "grad_norm": 0.32506677508354187, + "learning_rate": 0.0004363564760793465, + "loss": 3.5004, + "step": 18800 + }, + { + "epoch": 5.490852948030762, + "grad_norm": 0.3650604486465454, + "learning_rate": 0.00043591890315052505, + "loss": 3.4976, + "step": 18850 + }, + { + "epoch": 5.505418317408529, + "grad_norm": 0.3289859890937805, + "learning_rate": 0.00043548133022170354, + "loss": 3.5157, + "step": 18900 + }, + { + "epoch": 5.519983686786297, + "grad_norm": 0.3407946527004242, + "learning_rate": 0.0004350437572928821, + "loss": 3.5102, + "step": 18950 + }, + { + "epoch": 5.534549056164065, + "grad_norm": 0.3405243158340454, + "learning_rate": 0.00043460618436406067, + "loss": 3.5199, + "step": 19000 + }, + { + "epoch": 5.534549056164065, + "eval_accuracy": 0.36196820631048443, + "eval_loss": 3.6200644969940186, + "eval_runtime": 179.7535, + "eval_samples_per_second": 92.593, + "eval_steps_per_second": 5.791, + "step": 19000 + }, + { + "epoch": 5.549114425541831, + "grad_norm": 0.3322257995605469, + "learning_rate": 0.0004341686114352392, + "loss": 3.5128, + "step": 19050 + }, + { + "epoch": 5.563679794919599, + "grad_norm": 0.3334507346153259, + "learning_rate": 0.0004337310385064177, + "loss": 3.5019, + "step": 19100 + }, + { + "epoch": 5.578245164297367, + "grad_norm": 0.3119942247867584, + "learning_rate": 0.00043329346557759624, + "loss": 3.5248, + "step": 19150 + }, + { + "epoch": 5.592810533675134, + "grad_norm": 0.32865530252456665, + "learning_rate": 0.0004328558926487748, + "loss": 3.5017, + "step": 19200 + }, + { + "epoch": 5.607375903052901, + "grad_norm": 0.32390671968460083, + "learning_rate": 0.00043241831971995326, + "loss": 3.5087, + "step": 19250 + }, + { + "epoch": 5.621941272430669, + "grad_norm": 0.3391891419887543, + "learning_rate": 0.0004319807467911318, + "loss": 3.5117, + "step": 19300 + }, + { + "epoch": 5.636506641808436, + "grad_norm": 0.3408890664577484, + "learning_rate": 0.00043154317386231034, + "loss": 3.5164, + "step": 19350 + }, + { + "epoch": 5.651072011186204, + "grad_norm": 0.33605512976646423, + "learning_rate": 0.00043110560093348883, + "loss": 3.5009, + "step": 19400 + }, + { + "epoch": 5.665637380563971, + "grad_norm": 0.35284584760665894, + "learning_rate": 0.0004306680280046674, + "loss": 3.5017, + "step": 19450 + }, + { + "epoch": 5.6802027499417385, + "grad_norm": 0.35283035039901733, + "learning_rate": 0.00043023045507584596, + "loss": 3.5166, + "step": 19500 + }, + { + "epoch": 5.694768119319506, + "grad_norm": 0.33758777379989624, + "learning_rate": 0.0004297928821470245, + "loss": 3.5104, + "step": 19550 + }, + { + "epoch": 5.709333488697274, + "grad_norm": 0.3389108180999756, + "learning_rate": 0.000429355309218203, + "loss": 3.5154, + "step": 19600 + }, + { + "epoch": 5.7238988580750405, + "grad_norm": 0.43688979744911194, + "learning_rate": 0.00042891773628938153, + "loss": 3.5108, + "step": 19650 + }, + { + "epoch": 5.738464227452808, + "grad_norm": 0.33347204327583313, + "learning_rate": 0.00042848016336056007, + "loss": 3.511, + "step": 19700 + }, + { + "epoch": 5.753029596830576, + "grad_norm": 0.30911001563072205, + "learning_rate": 0.00042804259043173855, + "loss": 3.512, + "step": 19750 + }, + { + "epoch": 5.7675949662083426, + "grad_norm": 0.32809844613075256, + "learning_rate": 0.0004276050175029171, + "loss": 3.498, + "step": 19800 + }, + { + "epoch": 5.78216033558611, + "grad_norm": 0.3727506697177887, + "learning_rate": 0.0004271674445740957, + "loss": 3.5211, + "step": 19850 + }, + { + "epoch": 5.796725704963878, + "grad_norm": 0.33282268047332764, + "learning_rate": 0.0004267298716452741, + "loss": 3.5089, + "step": 19900 + }, + { + "epoch": 5.8112910743416455, + "grad_norm": 0.3280541002750397, + "learning_rate": 0.0004262922987164527, + "loss": 3.521, + "step": 19950 + }, + { + "epoch": 5.825856443719413, + "grad_norm": 0.3236095905303955, + "learning_rate": 0.00042585472578763125, + "loss": 3.5176, + "step": 20000 + }, + { + "epoch": 5.825856443719413, + "eval_accuracy": 0.36285532276929894, + "eval_loss": 3.609638214111328, + "eval_runtime": 179.7926, + "eval_samples_per_second": 92.573, + "eval_steps_per_second": 5.79, + "step": 20000 + }, + { + "epoch": 5.84042181309718, + "grad_norm": 0.3244244456291199, + "learning_rate": 0.0004254171528588098, + "loss": 3.5114, + "step": 20050 + }, + { + "epoch": 5.8549871824749475, + "grad_norm": 0.33964627981185913, + "learning_rate": 0.0004249795799299883, + "loss": 3.5003, + "step": 20100 + }, + { + "epoch": 5.869552551852715, + "grad_norm": 0.33158302307128906, + "learning_rate": 0.0004245420070011668, + "loss": 3.5042, + "step": 20150 + }, + { + "epoch": 5.884117921230482, + "grad_norm": 0.3141254186630249, + "learning_rate": 0.00042410443407234536, + "loss": 3.5203, + "step": 20200 + }, + { + "epoch": 5.89868329060825, + "grad_norm": 0.33124253153800964, + "learning_rate": 0.00042366686114352385, + "loss": 3.5171, + "step": 20250 + }, + { + "epoch": 5.913248659986017, + "grad_norm": 0.31624454259872437, + "learning_rate": 0.0004232292882147024, + "loss": 3.5317, + "step": 20300 + }, + { + "epoch": 5.927814029363785, + "grad_norm": 0.3152634799480438, + "learning_rate": 0.000422791715285881, + "loss": 3.5065, + "step": 20350 + }, + { + "epoch": 5.9423793987415525, + "grad_norm": 0.3478392958641052, + "learning_rate": 0.00042235414235705947, + "loss": 3.5132, + "step": 20400 + }, + { + "epoch": 5.956944768119319, + "grad_norm": 0.33496901392936707, + "learning_rate": 0.000421916569428238, + "loss": 3.5178, + "step": 20450 + }, + { + "epoch": 5.971510137497087, + "grad_norm": 0.3329765796661377, + "learning_rate": 0.00042147899649941654, + "loss": 3.5147, + "step": 20500 + }, + { + "epoch": 5.986075506874855, + "grad_norm": 0.3469819724559784, + "learning_rate": 0.0004210414235705951, + "loss": 3.5163, + "step": 20550 + }, + { + "epoch": 6.0005826147751105, + "grad_norm": 0.3333797752857208, + "learning_rate": 0.00042060385064177357, + "loss": 3.5088, + "step": 20600 + }, + { + "epoch": 6.015147984152878, + "grad_norm": 0.3532175123691559, + "learning_rate": 0.0004201662777129521, + "loss": 3.3952, + "step": 20650 + }, + { + "epoch": 6.029713353530646, + "grad_norm": 0.35990819334983826, + "learning_rate": 0.00041972870478413065, + "loss": 3.3972, + "step": 20700 + }, + { + "epoch": 6.044278722908413, + "grad_norm": 0.3456185758113861, + "learning_rate": 0.00041929113185530914, + "loss": 3.4024, + "step": 20750 + }, + { + "epoch": 6.05884409228618, + "grad_norm": 0.34514689445495605, + "learning_rate": 0.00041885355892648773, + "loss": 3.4088, + "step": 20800 + }, + { + "epoch": 6.073409461663948, + "grad_norm": 0.32557061314582825, + "learning_rate": 0.00041841598599766627, + "loss": 3.4169, + "step": 20850 + }, + { + "epoch": 6.087974831041715, + "grad_norm": 0.3310924172401428, + "learning_rate": 0.00041797841306884476, + "loss": 3.4211, + "step": 20900 + }, + { + "epoch": 6.102540200419483, + "grad_norm": 0.34286898374557495, + "learning_rate": 0.0004175408401400233, + "loss": 3.4188, + "step": 20950 + }, + { + "epoch": 6.11710556979725, + "grad_norm": 0.33326268196105957, + "learning_rate": 0.00041710326721120184, + "loss": 3.4233, + "step": 21000 + }, + { + "epoch": 6.11710556979725, + "eval_accuracy": 0.3633101095768131, + "eval_loss": 3.612283945083618, + "eval_runtime": 179.8679, + "eval_samples_per_second": 92.535, + "eval_steps_per_second": 5.788, + "step": 21000 + }, + { + "epoch": 6.1316709391750175, + "grad_norm": 0.3543689548969269, + "learning_rate": 0.0004166656942823804, + "loss": 3.4277, + "step": 21050 + }, + { + "epoch": 6.146236308552785, + "grad_norm": 0.3318440616130829, + "learning_rate": 0.00041622812135355886, + "loss": 3.4383, + "step": 21100 + }, + { + "epoch": 6.160801677930552, + "grad_norm": 0.3441929817199707, + "learning_rate": 0.0004157905484247374, + "loss": 3.4261, + "step": 21150 + }, + { + "epoch": 6.1753670473083195, + "grad_norm": 0.35300642251968384, + "learning_rate": 0.000415352975495916, + "loss": 3.4354, + "step": 21200 + }, + { + "epoch": 6.189932416686087, + "grad_norm": 0.3246099054813385, + "learning_rate": 0.0004149154025670945, + "loss": 3.4363, + "step": 21250 + }, + { + "epoch": 6.204497786063855, + "grad_norm": 0.34103959798812866, + "learning_rate": 0.000414477829638273, + "loss": 3.4373, + "step": 21300 + }, + { + "epoch": 6.219063155441622, + "grad_norm": 0.3544823229312897, + "learning_rate": 0.00041404025670945156, + "loss": 3.4519, + "step": 21350 + }, + { + "epoch": 6.233628524819389, + "grad_norm": 0.34362295269966125, + "learning_rate": 0.00041360268378063005, + "loss": 3.4422, + "step": 21400 + }, + { + "epoch": 6.248193894197157, + "grad_norm": 0.32189178466796875, + "learning_rate": 0.0004131651108518086, + "loss": 3.4465, + "step": 21450 + }, + { + "epoch": 6.2627592635749245, + "grad_norm": 0.336997389793396, + "learning_rate": 0.0004127275379229871, + "loss": 3.4324, + "step": 21500 + }, + { + "epoch": 6.277324632952691, + "grad_norm": 0.3384324610233307, + "learning_rate": 0.00041228996499416567, + "loss": 3.4471, + "step": 21550 + }, + { + "epoch": 6.291890002330459, + "grad_norm": 0.3506065607070923, + "learning_rate": 0.00041185239206534415, + "loss": 3.4416, + "step": 21600 + }, + { + "epoch": 6.306455371708227, + "grad_norm": 0.3377762734889984, + "learning_rate": 0.00041141481913652275, + "loss": 3.4461, + "step": 21650 + }, + { + "epoch": 6.321020741085994, + "grad_norm": 0.35600459575653076, + "learning_rate": 0.0004109772462077013, + "loss": 3.4513, + "step": 21700 + }, + { + "epoch": 6.335586110463761, + "grad_norm": 0.3653104305267334, + "learning_rate": 0.00041053967327887977, + "loss": 3.4612, + "step": 21750 + }, + { + "epoch": 6.350151479841529, + "grad_norm": 0.3408794403076172, + "learning_rate": 0.0004101021003500583, + "loss": 3.4538, + "step": 21800 + }, + { + "epoch": 6.364716849219296, + "grad_norm": 0.32172125577926636, + "learning_rate": 0.00040966452742123685, + "loss": 3.4484, + "step": 21850 + }, + { + "epoch": 6.379282218597064, + "grad_norm": 0.35422587394714355, + "learning_rate": 0.00040922695449241534, + "loss": 3.4553, + "step": 21900 + }, + { + "epoch": 6.393847587974831, + "grad_norm": 0.36388930678367615, + "learning_rate": 0.0004087893815635939, + "loss": 3.4626, + "step": 21950 + }, + { + "epoch": 6.408412957352598, + "grad_norm": 0.3398551940917969, + "learning_rate": 0.0004083518086347724, + "loss": 3.4484, + "step": 22000 + }, + { + "epoch": 6.408412957352598, + "eval_accuracy": 0.36410598648996295, + "eval_loss": 3.6026387214660645, + "eval_runtime": 179.6634, + "eval_samples_per_second": 92.64, + "eval_steps_per_second": 5.794, + "step": 22000 + }, + { + "epoch": 6.422978326730366, + "grad_norm": 0.3316047489643097, + "learning_rate": 0.000407914235705951, + "loss": 3.4635, + "step": 22050 + }, + { + "epoch": 6.437543696108134, + "grad_norm": 0.3395221531391144, + "learning_rate": 0.00040747666277712944, + "loss": 3.4541, + "step": 22100 + }, + { + "epoch": 6.4521090654859, + "grad_norm": 0.33023080229759216, + "learning_rate": 0.00040703908984830804, + "loss": 3.455, + "step": 22150 + }, + { + "epoch": 6.466674434863668, + "grad_norm": 0.3635895848274231, + "learning_rate": 0.0004066015169194866, + "loss": 3.454, + "step": 22200 + }, + { + "epoch": 6.481239804241436, + "grad_norm": 0.33634689450263977, + "learning_rate": 0.00040616394399066506, + "loss": 3.4543, + "step": 22250 + }, + { + "epoch": 6.495805173619203, + "grad_norm": 0.3248524069786072, + "learning_rate": 0.0004057263710618436, + "loss": 3.46, + "step": 22300 + }, + { + "epoch": 6.51037054299697, + "grad_norm": 0.35410505533218384, + "learning_rate": 0.00040528879813302214, + "loss": 3.4507, + "step": 22350 + }, + { + "epoch": 6.524935912374738, + "grad_norm": 0.3269991874694824, + "learning_rate": 0.00040485122520420063, + "loss": 3.4695, + "step": 22400 + }, + { + "epoch": 6.539501281752505, + "grad_norm": 0.35111793875694275, + "learning_rate": 0.00040441365227537917, + "loss": 3.4761, + "step": 22450 + }, + { + "epoch": 6.554066651130273, + "grad_norm": 0.33786168694496155, + "learning_rate": 0.0004039760793465577, + "loss": 3.4625, + "step": 22500 + }, + { + "epoch": 6.56863202050804, + "grad_norm": 0.34099647402763367, + "learning_rate": 0.0004035385064177363, + "loss": 3.4667, + "step": 22550 + }, + { + "epoch": 6.583197389885807, + "grad_norm": 0.33352482318878174, + "learning_rate": 0.0004031009334889148, + "loss": 3.4585, + "step": 22600 + }, + { + "epoch": 6.597762759263575, + "grad_norm": 0.3284354507923126, + "learning_rate": 0.00040266336056009333, + "loss": 3.4651, + "step": 22650 + }, + { + "epoch": 6.612328128641343, + "grad_norm": 0.3649536073207855, + "learning_rate": 0.00040222578763127187, + "loss": 3.4706, + "step": 22700 + }, + { + "epoch": 6.626893498019109, + "grad_norm": 0.32932668924331665, + "learning_rate": 0.00040178821470245035, + "loss": 3.4653, + "step": 22750 + }, + { + "epoch": 6.641458867396877, + "grad_norm": 0.3425554037094116, + "learning_rate": 0.0004013506417736289, + "loss": 3.455, + "step": 22800 + }, + { + "epoch": 6.656024236774645, + "grad_norm": 0.36532074213027954, + "learning_rate": 0.00040091306884480743, + "loss": 3.468, + "step": 22850 + }, + { + "epoch": 6.670589606152412, + "grad_norm": 0.33179134130477905, + "learning_rate": 0.0004004754959159859, + "loss": 3.4657, + "step": 22900 + }, + { + "epoch": 6.685154975530179, + "grad_norm": 0.3254914879798889, + "learning_rate": 0.00040003792298716446, + "loss": 3.4557, + "step": 22950 + }, + { + "epoch": 6.699720344907947, + "grad_norm": 0.3517943322658539, + "learning_rate": 0.00039960035005834305, + "loss": 3.4673, + "step": 23000 + }, + { + "epoch": 6.699720344907947, + "eval_accuracy": 0.36481873665210124, + "eval_loss": 3.593592405319214, + "eval_runtime": 179.8124, + "eval_samples_per_second": 92.563, + "eval_steps_per_second": 5.789, + "step": 23000 + }, + { + "epoch": 6.714285714285714, + "grad_norm": 0.35354673862457275, + "learning_rate": 0.0003991627771295216, + "loss": 3.4631, + "step": 23050 + }, + { + "epoch": 6.728851083663482, + "grad_norm": 0.3463275730609894, + "learning_rate": 0.0003987252042007001, + "loss": 3.4739, + "step": 23100 + }, + { + "epoch": 6.743416453041249, + "grad_norm": 0.33963003754615784, + "learning_rate": 0.0003982876312718786, + "loss": 3.4689, + "step": 23150 + }, + { + "epoch": 6.7579818224190165, + "grad_norm": 0.3357282280921936, + "learning_rate": 0.00039785005834305716, + "loss": 3.4717, + "step": 23200 + }, + { + "epoch": 6.772547191796784, + "grad_norm": 0.34744784235954285, + "learning_rate": 0.00039741248541423564, + "loss": 3.4744, + "step": 23250 + }, + { + "epoch": 6.787112561174552, + "grad_norm": 0.32183945178985596, + "learning_rate": 0.0003969749124854142, + "loss": 3.4596, + "step": 23300 + }, + { + "epoch": 6.8016779305523185, + "grad_norm": 0.33145228028297424, + "learning_rate": 0.0003965373395565927, + "loss": 3.4604, + "step": 23350 + }, + { + "epoch": 6.816243299930086, + "grad_norm": 0.32555410265922546, + "learning_rate": 0.0003960997666277712, + "loss": 3.4691, + "step": 23400 + }, + { + "epoch": 6.830808669307854, + "grad_norm": 0.3336983323097229, + "learning_rate": 0.0003956621936989498, + "loss": 3.4653, + "step": 23450 + }, + { + "epoch": 6.845374038685621, + "grad_norm": 0.3609044551849365, + "learning_rate": 0.00039522462077012834, + "loss": 3.4661, + "step": 23500 + }, + { + "epoch": 6.859939408063388, + "grad_norm": 0.3462687134742737, + "learning_rate": 0.0003947870478413069, + "loss": 3.4719, + "step": 23550 + }, + { + "epoch": 6.874504777441156, + "grad_norm": 0.3334873914718628, + "learning_rate": 0.00039434947491248537, + "loss": 3.4628, + "step": 23600 + }, + { + "epoch": 6.8890701468189235, + "grad_norm": 0.32324063777923584, + "learning_rate": 0.0003939119019836639, + "loss": 3.4662, + "step": 23650 + }, + { + "epoch": 6.903635516196691, + "grad_norm": 0.3384288251399994, + "learning_rate": 0.00039347432905484245, + "loss": 3.4789, + "step": 23700 + }, + { + "epoch": 6.918200885574458, + "grad_norm": 0.3569772243499756, + "learning_rate": 0.00039303675612602094, + "loss": 3.4659, + "step": 23750 + }, + { + "epoch": 6.9327662549522255, + "grad_norm": 0.3427547514438629, + "learning_rate": 0.0003925991831971995, + "loss": 3.4815, + "step": 23800 + }, + { + "epoch": 6.947331624329993, + "grad_norm": 0.33504244685173035, + "learning_rate": 0.00039216161026837807, + "loss": 3.4714, + "step": 23850 + }, + { + "epoch": 6.961896993707761, + "grad_norm": 0.3363841474056244, + "learning_rate": 0.00039172403733955656, + "loss": 3.4772, + "step": 23900 + }, + { + "epoch": 6.976462363085528, + "grad_norm": 0.34754055738449097, + "learning_rate": 0.0003912864644107351, + "loss": 3.4624, + "step": 23950 + }, + { + "epoch": 6.991027732463295, + "grad_norm": 0.332419216632843, + "learning_rate": 0.00039084889148191364, + "loss": 3.4728, + "step": 24000 + }, + { + "epoch": 6.991027732463295, + "eval_accuracy": 0.3658571743677076, + "eval_loss": 3.583200693130493, + "eval_runtime": 179.9492, + "eval_samples_per_second": 92.493, + "eval_steps_per_second": 5.785, + "step": 24000 + }, + { + "epoch": 7.005534840363552, + "grad_norm": 0.34445714950561523, + "learning_rate": 0.0003904113185530922, + "loss": 3.4335, + "step": 24050 + }, + { + "epoch": 7.020100209741319, + "grad_norm": 0.359764963388443, + "learning_rate": 0.00038997374562427066, + "loss": 3.357, + "step": 24100 + }, + { + "epoch": 7.034665579119086, + "grad_norm": 0.3404935598373413, + "learning_rate": 0.0003895361726954492, + "loss": 3.3683, + "step": 24150 + }, + { + "epoch": 7.049230948496854, + "grad_norm": 0.3708736300468445, + "learning_rate": 0.00038909859976662774, + "loss": 3.3762, + "step": 24200 + }, + { + "epoch": 7.063796317874622, + "grad_norm": 0.35109418630599976, + "learning_rate": 0.0003886610268378062, + "loss": 3.3722, + "step": 24250 + }, + { + "epoch": 7.0783616872523885, + "grad_norm": 0.38338735699653625, + "learning_rate": 0.0003882234539089848, + "loss": 3.3831, + "step": 24300 + }, + { + "epoch": 7.092927056630156, + "grad_norm": 0.34883055090904236, + "learning_rate": 0.00038778588098016336, + "loss": 3.3716, + "step": 24350 + }, + { + "epoch": 7.107492426007924, + "grad_norm": 0.3360002040863037, + "learning_rate": 0.00038734830805134185, + "loss": 3.3854, + "step": 24400 + }, + { + "epoch": 7.122057795385691, + "grad_norm": 0.36411646008491516, + "learning_rate": 0.0003869107351225204, + "loss": 3.3736, + "step": 24450 + }, + { + "epoch": 7.136623164763458, + "grad_norm": 0.3585089445114136, + "learning_rate": 0.0003864731621936989, + "loss": 3.3879, + "step": 24500 + }, + { + "epoch": 7.151188534141226, + "grad_norm": 0.34876003861427307, + "learning_rate": 0.00038603558926487747, + "loss": 3.3885, + "step": 24550 + }, + { + "epoch": 7.165753903518993, + "grad_norm": 0.34521597623825073, + "learning_rate": 0.00038559801633605595, + "loss": 3.3867, + "step": 24600 + }, + { + "epoch": 7.180319272896761, + "grad_norm": 0.3658317029476166, + "learning_rate": 0.0003851604434072345, + "loss": 3.3843, + "step": 24650 + }, + { + "epoch": 7.194884642274528, + "grad_norm": 0.3635459840297699, + "learning_rate": 0.0003847228704784131, + "loss": 3.3932, + "step": 24700 + }, + { + "epoch": 7.2094500116522955, + "grad_norm": 0.3436873257160187, + "learning_rate": 0.0003842852975495915, + "loss": 3.411, + "step": 24750 + }, + { + "epoch": 7.224015381030063, + "grad_norm": 0.3474794924259186, + "learning_rate": 0.0003838477246207701, + "loss": 3.3999, + "step": 24800 + }, + { + "epoch": 7.238580750407831, + "grad_norm": 0.3771800100803375, + "learning_rate": 0.00038341015169194865, + "loss": 3.4002, + "step": 24850 + }, + { + "epoch": 7.2531461197855975, + "grad_norm": 0.3322898745536804, + "learning_rate": 0.00038297257876312714, + "loss": 3.4123, + "step": 24900 + }, + { + "epoch": 7.267711489163365, + "grad_norm": 0.3354223072528839, + "learning_rate": 0.0003825350058343057, + "loss": 3.3915, + "step": 24950 + }, + { + "epoch": 7.282276858541133, + "grad_norm": 0.359828919172287, + "learning_rate": 0.0003820974329054842, + "loss": 3.4081, + "step": 25000 + }, + { + "epoch": 7.282276858541133, + "eval_accuracy": 0.36572290173736083, + "eval_loss": 3.591287136077881, + "eval_runtime": 179.8422, + "eval_samples_per_second": 92.548, + "eval_steps_per_second": 5.788, + "step": 25000 + }, + { + "epoch": 7.2968422279189, + "grad_norm": 0.3653123676776886, + "learning_rate": 0.00038165985997666276, + "loss": 3.4106, + "step": 25050 + }, + { + "epoch": 7.311407597296667, + "grad_norm": 0.38982659578323364, + "learning_rate": 0.00038122228704784124, + "loss": 3.408, + "step": 25100 + }, + { + "epoch": 7.325972966674435, + "grad_norm": 0.3451847732067108, + "learning_rate": 0.0003807847141190198, + "loss": 3.4184, + "step": 25150 + }, + { + "epoch": 7.3405383360522025, + "grad_norm": 0.33326441049575806, + "learning_rate": 0.0003803471411901984, + "loss": 3.4236, + "step": 25200 + }, + { + "epoch": 7.35510370542997, + "grad_norm": 0.3506428003311157, + "learning_rate": 0.00037990956826137686, + "loss": 3.4202, + "step": 25250 + }, + { + "epoch": 7.369669074807737, + "grad_norm": 0.35344335436820984, + "learning_rate": 0.0003794719953325554, + "loss": 3.4086, + "step": 25300 + }, + { + "epoch": 7.384234444185505, + "grad_norm": 0.35276249051094055, + "learning_rate": 0.00037903442240373394, + "loss": 3.4224, + "step": 25350 + }, + { + "epoch": 7.398799813563272, + "grad_norm": 0.34853968024253845, + "learning_rate": 0.00037859684947491243, + "loss": 3.4125, + "step": 25400 + }, + { + "epoch": 7.413365182941039, + "grad_norm": 0.3490748107433319, + "learning_rate": 0.00037815927654609097, + "loss": 3.4044, + "step": 25450 + }, + { + "epoch": 7.427930552318807, + "grad_norm": 0.34673872590065, + "learning_rate": 0.0003777217036172695, + "loss": 3.4131, + "step": 25500 + }, + { + "epoch": 7.442495921696574, + "grad_norm": 0.3763735592365265, + "learning_rate": 0.00037728413068844805, + "loss": 3.407, + "step": 25550 + }, + { + "epoch": 7.457061291074342, + "grad_norm": 0.3731195032596588, + "learning_rate": 0.00037684655775962653, + "loss": 3.4255, + "step": 25600 + }, + { + "epoch": 7.471626660452109, + "grad_norm": 0.34575656056404114, + "learning_rate": 0.00037640898483080513, + "loss": 3.4082, + "step": 25650 + }, + { + "epoch": 7.486192029829876, + "grad_norm": 0.3527657985687256, + "learning_rate": 0.00037597141190198367, + "loss": 3.4289, + "step": 25700 + }, + { + "epoch": 7.500757399207644, + "grad_norm": 0.3446659743785858, + "learning_rate": 0.00037553383897316215, + "loss": 3.4194, + "step": 25750 + }, + { + "epoch": 7.515322768585412, + "grad_norm": 0.3565196394920349, + "learning_rate": 0.0003750962660443407, + "loss": 3.416, + "step": 25800 + }, + { + "epoch": 7.529888137963178, + "grad_norm": 0.3380652666091919, + "learning_rate": 0.00037465869311551923, + "loss": 3.4235, + "step": 25850 + }, + { + "epoch": 7.544453507340946, + "grad_norm": 0.34874868392944336, + "learning_rate": 0.0003742211201866977, + "loss": 3.4153, + "step": 25900 + }, + { + "epoch": 7.559018876718714, + "grad_norm": 0.3525692820549011, + "learning_rate": 0.00037378354725787626, + "loss": 3.4213, + "step": 25950 + }, + { + "epoch": 7.573584246096481, + "grad_norm": 0.38578587770462036, + "learning_rate": 0.0003733459743290548, + "loss": 3.4258, + "step": 26000 + }, + { + "epoch": 7.573584246096481, + "eval_accuracy": 0.3662004984312912, + "eval_loss": 3.582775354385376, + "eval_runtime": 179.914, + "eval_samples_per_second": 92.511, + "eval_steps_per_second": 5.786, + "step": 26000 + }, + { + "epoch": 7.588149615474248, + "grad_norm": 0.36569854617118835, + "learning_rate": 0.0003729084014002334, + "loss": 3.4249, + "step": 26050 + }, + { + "epoch": 7.602714984852016, + "grad_norm": 0.35230007767677307, + "learning_rate": 0.0003724708284714119, + "loss": 3.4167, + "step": 26100 + }, + { + "epoch": 7.617280354229783, + "grad_norm": 0.3570927381515503, + "learning_rate": 0.0003720332555425904, + "loss": 3.4219, + "step": 26150 + }, + { + "epoch": 7.631845723607551, + "grad_norm": 0.3544664680957794, + "learning_rate": 0.00037159568261376896, + "loss": 3.4319, + "step": 26200 + }, + { + "epoch": 7.646411092985318, + "grad_norm": 0.32789161801338196, + "learning_rate": 0.00037115810968494744, + "loss": 3.4319, + "step": 26250 + }, + { + "epoch": 7.660976462363085, + "grad_norm": 0.38033467531204224, + "learning_rate": 0.000370720536756126, + "loss": 3.4309, + "step": 26300 + }, + { + "epoch": 7.675541831740853, + "grad_norm": 0.35393211245536804, + "learning_rate": 0.0003702829638273045, + "loss": 3.4309, + "step": 26350 + }, + { + "epoch": 7.690107201118621, + "grad_norm": 0.3252100348472595, + "learning_rate": 0.000369845390898483, + "loss": 3.4214, + "step": 26400 + }, + { + "epoch": 7.704672570496387, + "grad_norm": 0.34850725531578064, + "learning_rate": 0.00036940781796966155, + "loss": 3.4223, + "step": 26450 + }, + { + "epoch": 7.719237939874155, + "grad_norm": 0.36846786737442017, + "learning_rate": 0.00036897024504084014, + "loss": 3.4336, + "step": 26500 + }, + { + "epoch": 7.733803309251923, + "grad_norm": 0.36332714557647705, + "learning_rate": 0.0003685326721120187, + "loss": 3.4295, + "step": 26550 + }, + { + "epoch": 7.74836867862969, + "grad_norm": 0.34099632501602173, + "learning_rate": 0.00036809509918319717, + "loss": 3.4295, + "step": 26600 + }, + { + "epoch": 7.762934048007457, + "grad_norm": 0.3657893240451813, + "learning_rate": 0.0003676575262543757, + "loss": 3.4278, + "step": 26650 + }, + { + "epoch": 7.777499417385225, + "grad_norm": 0.3631269931793213, + "learning_rate": 0.00036721995332555425, + "loss": 3.436, + "step": 26700 + }, + { + "epoch": 7.792064786762992, + "grad_norm": 0.3591324985027313, + "learning_rate": 0.00036678238039673274, + "loss": 3.4196, + "step": 26750 + }, + { + "epoch": 7.80663015614076, + "grad_norm": 0.3412059247493744, + "learning_rate": 0.0003663448074679113, + "loss": 3.436, + "step": 26800 + }, + { + "epoch": 7.821195525518527, + "grad_norm": 0.3525434732437134, + "learning_rate": 0.0003659072345390898, + "loss": 3.4165, + "step": 26850 + }, + { + "epoch": 7.8357608948962945, + "grad_norm": 0.3410341441631317, + "learning_rate": 0.0003654696616102683, + "loss": 3.4301, + "step": 26900 + }, + { + "epoch": 7.850326264274062, + "grad_norm": 0.3529902696609497, + "learning_rate": 0.00036503208868144684, + "loss": 3.4276, + "step": 26950 + }, + { + "epoch": 7.86489163365183, + "grad_norm": 0.36684471368789673, + "learning_rate": 0.00036459451575262543, + "loss": 3.4357, + "step": 27000 + }, + { + "epoch": 7.86489163365183, + "eval_accuracy": 0.36754980903186846, + "eval_loss": 3.5753531455993652, + "eval_runtime": 179.9, + "eval_samples_per_second": 92.518, + "eval_steps_per_second": 5.787, + "step": 27000 + }, + { + "epoch": 7.8794570030295965, + "grad_norm": 0.33539390563964844, + "learning_rate": 0.000364156942823804, + "loss": 3.4317, + "step": 27050 + }, + { + "epoch": 7.894022372407364, + "grad_norm": 0.3382086157798767, + "learning_rate": 0.00036371936989498246, + "loss": 3.4443, + "step": 27100 + }, + { + "epoch": 7.908587741785132, + "grad_norm": 0.37185636162757874, + "learning_rate": 0.000363281796966161, + "loss": 3.4331, + "step": 27150 + }, + { + "epoch": 7.923153111162899, + "grad_norm": 0.3374182879924774, + "learning_rate": 0.00036284422403733954, + "loss": 3.4368, + "step": 27200 + }, + { + "epoch": 7.937718480540666, + "grad_norm": 0.3447705805301666, + "learning_rate": 0.000362406651108518, + "loss": 3.4381, + "step": 27250 + }, + { + "epoch": 7.952283849918434, + "grad_norm": 0.3456777334213257, + "learning_rate": 0.00036196907817969657, + "loss": 3.4348, + "step": 27300 + }, + { + "epoch": 7.9668492192962015, + "grad_norm": 0.34811699390411377, + "learning_rate": 0.0003615315052508751, + "loss": 3.4324, + "step": 27350 + }, + { + "epoch": 7.981414588673969, + "grad_norm": 0.350790798664093, + "learning_rate": 0.0003610939323220536, + "loss": 3.4395, + "step": 27400 + }, + { + "epoch": 7.995979958051736, + "grad_norm": 0.3383653163909912, + "learning_rate": 0.0003606563593932322, + "loss": 3.4285, + "step": 27450 + }, + { + "epoch": 8.010487065951992, + "grad_norm": 0.39396196603775024, + "learning_rate": 0.0003602187864644107, + "loss": 3.3725, + "step": 27500 + }, + { + "epoch": 8.02505243532976, + "grad_norm": 0.34884384274482727, + "learning_rate": 0.00035978121353558927, + "loss": 3.3305, + "step": 27550 + }, + { + "epoch": 8.039617804707527, + "grad_norm": 0.35126855969429016, + "learning_rate": 0.00035934364060676775, + "loss": 3.3353, + "step": 27600 + }, + { + "epoch": 8.054183174085296, + "grad_norm": 0.3708069324493408, + "learning_rate": 0.0003589060676779463, + "loss": 3.3266, + "step": 27650 + }, + { + "epoch": 8.068748543463062, + "grad_norm": 0.33984529972076416, + "learning_rate": 0.00035846849474912483, + "loss": 3.3367, + "step": 27700 + }, + { + "epoch": 8.08331391284083, + "grad_norm": 0.3667299449443817, + "learning_rate": 0.0003580309218203033, + "loss": 3.3359, + "step": 27750 + }, + { + "epoch": 8.097879282218598, + "grad_norm": 0.35364869236946106, + "learning_rate": 0.00035759334889148186, + "loss": 3.3403, + "step": 27800 + }, + { + "epoch": 8.112444651596364, + "grad_norm": 0.39318904280662537, + "learning_rate": 0.00035715577596266045, + "loss": 3.3463, + "step": 27850 + }, + { + "epoch": 8.127010020974131, + "grad_norm": 0.3744325041770935, + "learning_rate": 0.00035671820303383894, + "loss": 3.3489, + "step": 27900 + }, + { + "epoch": 8.1415753903519, + "grad_norm": 0.34499189257621765, + "learning_rate": 0.0003562806301050175, + "loss": 3.3631, + "step": 27950 + }, + { + "epoch": 8.156140759729666, + "grad_norm": 0.3449557423591614, + "learning_rate": 0.000355843057176196, + "loss": 3.3437, + "step": 28000 + }, + { + "epoch": 8.156140759729666, + "eval_accuracy": 0.3672165965674178, + "eval_loss": 3.5825846195220947, + "eval_runtime": 179.8108, + "eval_samples_per_second": 92.564, + "eval_steps_per_second": 5.789, + "step": 28000 + }, + { + "epoch": 8.170706129107435, + "grad_norm": 0.3479335606098175, + "learning_rate": 0.00035540548424737456, + "loss": 3.3574, + "step": 28050 + }, + { + "epoch": 8.185271498485202, + "grad_norm": 0.3682080805301666, + "learning_rate": 0.00035496791131855304, + "loss": 3.361, + "step": 28100 + }, + { + "epoch": 8.199836867862969, + "grad_norm": 0.36186373233795166, + "learning_rate": 0.0003545303383897316, + "loss": 3.3727, + "step": 28150 + }, + { + "epoch": 8.214402237240737, + "grad_norm": 0.3528668284416199, + "learning_rate": 0.0003540927654609101, + "loss": 3.3544, + "step": 28200 + }, + { + "epoch": 8.228967606618504, + "grad_norm": 0.3623187839984894, + "learning_rate": 0.0003536551925320886, + "loss": 3.3608, + "step": 28250 + }, + { + "epoch": 8.24353297599627, + "grad_norm": 0.36072465777397156, + "learning_rate": 0.0003532176196032672, + "loss": 3.3592, + "step": 28300 + }, + { + "epoch": 8.258098345374039, + "grad_norm": 0.35227975249290466, + "learning_rate": 0.00035278004667444574, + "loss": 3.3801, + "step": 28350 + }, + { + "epoch": 8.272663714751806, + "grad_norm": 0.3518006205558777, + "learning_rate": 0.00035234247374562423, + "loss": 3.3707, + "step": 28400 + }, + { + "epoch": 8.287229084129574, + "grad_norm": 0.36904311180114746, + "learning_rate": 0.00035190490081680277, + "loss": 3.3709, + "step": 28450 + }, + { + "epoch": 8.301794453507341, + "grad_norm": 0.3661719262599945, + "learning_rate": 0.0003514673278879813, + "loss": 3.3712, + "step": 28500 + }, + { + "epoch": 8.316359822885108, + "grad_norm": 0.3581629693508148, + "learning_rate": 0.00035102975495915985, + "loss": 3.3655, + "step": 28550 + }, + { + "epoch": 8.330925192262876, + "grad_norm": 0.34586071968078613, + "learning_rate": 0.00035059218203033833, + "loss": 3.3784, + "step": 28600 + }, + { + "epoch": 8.345490561640643, + "grad_norm": 0.3492356240749359, + "learning_rate": 0.0003501546091015169, + "loss": 3.3773, + "step": 28650 + }, + { + "epoch": 8.36005593101841, + "grad_norm": 0.35812652111053467, + "learning_rate": 0.00034971703617269547, + "loss": 3.3898, + "step": 28700 + }, + { + "epoch": 8.374621300396178, + "grad_norm": 0.3359387218952179, + "learning_rate": 0.0003492794632438739, + "loss": 3.3724, + "step": 28750 + }, + { + "epoch": 8.389186669773945, + "grad_norm": 0.3566528856754303, + "learning_rate": 0.0003488418903150525, + "loss": 3.3749, + "step": 28800 + }, + { + "epoch": 8.403752039151712, + "grad_norm": 0.3726493716239929, + "learning_rate": 0.00034840431738623103, + "loss": 3.3748, + "step": 28850 + }, + { + "epoch": 8.41831740852948, + "grad_norm": 0.36885538697242737, + "learning_rate": 0.0003479667444574095, + "loss": 3.382, + "step": 28900 + }, + { + "epoch": 8.432882777907247, + "grad_norm": 0.37485557794570923, + "learning_rate": 0.00034752917152858806, + "loss": 3.3809, + "step": 28950 + }, + { + "epoch": 8.447448147285016, + "grad_norm": 0.3613327443599701, + "learning_rate": 0.0003470915985997666, + "loss": 3.3846, + "step": 29000 + }, + { + "epoch": 8.447448147285016, + "eval_accuracy": 0.3677331111603366, + "eval_loss": 3.574125289916992, + "eval_runtime": 179.9699, + "eval_samples_per_second": 92.482, + "eval_steps_per_second": 5.784, + "step": 29000 + }, + { + "epoch": 8.462013516662783, + "grad_norm": 0.3570326566696167, + "learning_rate": 0.00034665402567094514, + "loss": 3.3894, + "step": 29050 + }, + { + "epoch": 8.47657888604055, + "grad_norm": 0.36068421602249146, + "learning_rate": 0.0003462164527421236, + "loss": 3.3895, + "step": 29100 + }, + { + "epoch": 8.491144255418318, + "grad_norm": 0.36702850461006165, + "learning_rate": 0.00034577887981330216, + "loss": 3.393, + "step": 29150 + }, + { + "epoch": 8.505709624796085, + "grad_norm": 0.3736766278743744, + "learning_rate": 0.00034534130688448076, + "loss": 3.3906, + "step": 29200 + }, + { + "epoch": 8.520274994173853, + "grad_norm": 0.3675585091114044, + "learning_rate": 0.00034490373395565924, + "loss": 3.3849, + "step": 29250 + }, + { + "epoch": 8.53484036355162, + "grad_norm": 0.3682818114757538, + "learning_rate": 0.0003444661610268378, + "loss": 3.381, + "step": 29300 + }, + { + "epoch": 8.549405732929387, + "grad_norm": 0.41155508160591125, + "learning_rate": 0.0003440285880980163, + "loss": 3.3882, + "step": 29350 + }, + { + "epoch": 8.563971102307155, + "grad_norm": 0.38835835456848145, + "learning_rate": 0.0003435910151691948, + "loss": 3.3873, + "step": 29400 + }, + { + "epoch": 8.578536471684922, + "grad_norm": 0.36495402455329895, + "learning_rate": 0.00034315344224037335, + "loss": 3.3954, + "step": 29450 + }, + { + "epoch": 8.593101841062689, + "grad_norm": 0.35648706555366516, + "learning_rate": 0.0003427158693115519, + "loss": 3.3944, + "step": 29500 + }, + { + "epoch": 8.607667210440457, + "grad_norm": 0.36086899042129517, + "learning_rate": 0.00034227829638273043, + "loss": 3.3892, + "step": 29550 + }, + { + "epoch": 8.622232579818224, + "grad_norm": 0.3824726343154907, + "learning_rate": 0.0003418407234539089, + "loss": 3.3972, + "step": 29600 + }, + { + "epoch": 8.63679794919599, + "grad_norm": 0.3354533910751343, + "learning_rate": 0.0003414031505250875, + "loss": 3.3857, + "step": 29650 + }, + { + "epoch": 8.65136331857376, + "grad_norm": 0.3533078730106354, + "learning_rate": 0.00034096557759626605, + "loss": 3.4028, + "step": 29700 + }, + { + "epoch": 8.665928687951526, + "grad_norm": 0.35228005051612854, + "learning_rate": 0.00034052800466744453, + "loss": 3.3961, + "step": 29750 + }, + { + "epoch": 8.680494057329295, + "grad_norm": 0.3567520081996918, + "learning_rate": 0.0003400904317386231, + "loss": 3.3975, + "step": 29800 + }, + { + "epoch": 8.695059426707061, + "grad_norm": 0.36763066053390503, + "learning_rate": 0.0003396528588098016, + "loss": 3.402, + "step": 29850 + }, + { + "epoch": 8.709624796084828, + "grad_norm": 0.35932788252830505, + "learning_rate": 0.0003392152858809801, + "loss": 3.3969, + "step": 29900 + }, + { + "epoch": 8.724190165462597, + "grad_norm": 0.3732490837574005, + "learning_rate": 0.00033877771295215864, + "loss": 3.3923, + "step": 29950 + }, + { + "epoch": 8.738755534840363, + "grad_norm": 0.35130372643470764, + "learning_rate": 0.0003383401400233372, + "loss": 3.397, + "step": 30000 + }, + { + "epoch": 8.738755534840363, + "eval_accuracy": 0.36869112638981577, + "eval_loss": 3.565412998199463, + "eval_runtime": 179.8685, + "eval_samples_per_second": 92.534, + "eval_steps_per_second": 5.788, + "step": 30000 + }, + { + "epoch": 8.753320904218132, + "grad_norm": 0.36606141924858093, + "learning_rate": 0.0003379025670945158, + "loss": 3.3837, + "step": 30050 + }, + { + "epoch": 8.767886273595899, + "grad_norm": 0.3647548258304596, + "learning_rate": 0.00033746499416569426, + "loss": 3.3854, + "step": 30100 + }, + { + "epoch": 8.782451642973665, + "grad_norm": 0.3695903718471527, + "learning_rate": 0.0003370274212368728, + "loss": 3.4027, + "step": 30150 + }, + { + "epoch": 8.797017012351434, + "grad_norm": 0.35933852195739746, + "learning_rate": 0.00033658984830805134, + "loss": 3.3907, + "step": 30200 + }, + { + "epoch": 8.8115823817292, + "grad_norm": 0.3846684396266937, + "learning_rate": 0.0003361522753792298, + "loss": 3.403, + "step": 30250 + }, + { + "epoch": 8.826147751106967, + "grad_norm": 0.35925909876823425, + "learning_rate": 0.00033571470245040837, + "loss": 3.3971, + "step": 30300 + }, + { + "epoch": 8.840713120484736, + "grad_norm": 0.3694295287132263, + "learning_rate": 0.0003352771295215869, + "loss": 3.4021, + "step": 30350 + }, + { + "epoch": 8.855278489862503, + "grad_norm": 0.352285236120224, + "learning_rate": 0.0003348395565927654, + "loss": 3.3962, + "step": 30400 + }, + { + "epoch": 8.86984385924027, + "grad_norm": 0.35616302490234375, + "learning_rate": 0.00033440198366394393, + "loss": 3.3957, + "step": 30450 + }, + { + "epoch": 8.884409228618038, + "grad_norm": 0.35298022627830505, + "learning_rate": 0.0003339644107351225, + "loss": 3.3837, + "step": 30500 + }, + { + "epoch": 8.898974597995805, + "grad_norm": 0.38288286328315735, + "learning_rate": 0.00033352683780630107, + "loss": 3.4005, + "step": 30550 + }, + { + "epoch": 8.913539967373573, + "grad_norm": 0.3534913659095764, + "learning_rate": 0.00033308926487747955, + "loss": 3.4102, + "step": 30600 + }, + { + "epoch": 8.92810533675134, + "grad_norm": 0.34419965744018555, + "learning_rate": 0.0003326516919486581, + "loss": 3.4074, + "step": 30650 + }, + { + "epoch": 8.942670706129107, + "grad_norm": 0.361150324344635, + "learning_rate": 0.00033221411901983663, + "loss": 3.3897, + "step": 30700 + }, + { + "epoch": 8.957236075506875, + "grad_norm": 0.3879886567592621, + "learning_rate": 0.0003317765460910151, + "loss": 3.405, + "step": 30750 + }, + { + "epoch": 8.971801444884642, + "grad_norm": 0.37308424711227417, + "learning_rate": 0.00033133897316219366, + "loss": 3.3974, + "step": 30800 + }, + { + "epoch": 8.986366814262409, + "grad_norm": 0.3615827262401581, + "learning_rate": 0.0003309014002333722, + "loss": 3.3938, + "step": 30850 + }, + { + "epoch": 9.000873922162667, + "grad_norm": 0.3499894142150879, + "learning_rate": 0.0003304638273045507, + "loss": 3.398, + "step": 30900 + }, + { + "epoch": 9.015439291540433, + "grad_norm": 0.3602531850337982, + "learning_rate": 0.0003300262543757292, + "loss": 3.2991, + "step": 30950 + }, + { + "epoch": 9.0300046609182, + "grad_norm": 0.3546774089336395, + "learning_rate": 0.0003295886814469078, + "loss": 3.2933, + "step": 31000 + }, + { + "epoch": 9.0300046609182, + "eval_accuracy": 0.36841858352016277, + "eval_loss": 3.570578098297119, + "eval_runtime": 181.1213, + "eval_samples_per_second": 91.894, + "eval_steps_per_second": 5.748, + "step": 31000 + }, + { + "epoch": 9.044570030295969, + "grad_norm": 0.3855380117893219, + "learning_rate": 0.00032915110851808636, + "loss": 3.3132, + "step": 31050 + }, + { + "epoch": 9.059135399673735, + "grad_norm": 0.35755249857902527, + "learning_rate": 0.00032871353558926484, + "loss": 3.3147, + "step": 31100 + }, + { + "epoch": 9.073700769051504, + "grad_norm": 0.34872257709503174, + "learning_rate": 0.0003282759626604434, + "loss": 3.3101, + "step": 31150 + }, + { + "epoch": 9.08826613842927, + "grad_norm": 0.36924898624420166, + "learning_rate": 0.0003278383897316219, + "loss": 3.3097, + "step": 31200 + }, + { + "epoch": 9.102831507807037, + "grad_norm": 0.353230357170105, + "learning_rate": 0.0003274008168028004, + "loss": 3.3117, + "step": 31250 + }, + { + "epoch": 9.117396877184806, + "grad_norm": 0.3573058247566223, + "learning_rate": 0.00032696324387397895, + "loss": 3.3038, + "step": 31300 + }, + { + "epoch": 9.131962246562573, + "grad_norm": 0.3910951316356659, + "learning_rate": 0.0003265256709451575, + "loss": 3.3208, + "step": 31350 + }, + { + "epoch": 9.14652761594034, + "grad_norm": 0.3972455859184265, + "learning_rate": 0.00032608809801633597, + "loss": 3.3242, + "step": 31400 + }, + { + "epoch": 9.161092985318108, + "grad_norm": 0.4021930694580078, + "learning_rate": 0.00032565052508751457, + "loss": 3.3291, + "step": 31450 + }, + { + "epoch": 9.175658354695875, + "grad_norm": 0.36197373270988464, + "learning_rate": 0.0003252129521586931, + "loss": 3.325, + "step": 31500 + }, + { + "epoch": 9.190223724073643, + "grad_norm": 0.363652765750885, + "learning_rate": 0.00032477537922987165, + "loss": 3.3357, + "step": 31550 + }, + { + "epoch": 9.20478909345141, + "grad_norm": 0.37855982780456543, + "learning_rate": 0.00032433780630105013, + "loss": 3.3269, + "step": 31600 + }, + { + "epoch": 9.219354462829177, + "grad_norm": 0.36747410893440247, + "learning_rate": 0.00032390023337222867, + "loss": 3.3331, + "step": 31650 + }, + { + "epoch": 9.233919832206945, + "grad_norm": 0.36995381116867065, + "learning_rate": 0.0003234626604434072, + "loss": 3.3424, + "step": 31700 + }, + { + "epoch": 9.248485201584712, + "grad_norm": 0.3854932188987732, + "learning_rate": 0.0003230250875145857, + "loss": 3.3534, + "step": 31750 + }, + { + "epoch": 9.263050570962479, + "grad_norm": 0.3708399534225464, + "learning_rate": 0.00032258751458576424, + "loss": 3.3303, + "step": 31800 + }, + { + "epoch": 9.277615940340247, + "grad_norm": 0.3602900803089142, + "learning_rate": 0.00032214994165694283, + "loss": 3.3368, + "step": 31850 + }, + { + "epoch": 9.292181309718014, + "grad_norm": 0.3634423017501831, + "learning_rate": 0.0003217123687281213, + "loss": 3.3472, + "step": 31900 + }, + { + "epoch": 9.306746679095783, + "grad_norm": 0.35544899106025696, + "learning_rate": 0.00032127479579929986, + "loss": 3.3291, + "step": 31950 + }, + { + "epoch": 9.32131204847355, + "grad_norm": 0.37147095799446106, + "learning_rate": 0.0003208372228704784, + "loss": 3.3448, + "step": 32000 + }, + { + "epoch": 9.32131204847355, + "eval_accuracy": 0.36875214871481576, + "eval_loss": 3.570387840270996, + "eval_runtime": 181.1619, + "eval_samples_per_second": 91.874, + "eval_steps_per_second": 5.746, + "step": 32000 + }, + { + "epoch": 9.335877417851316, + "grad_norm": 0.3772315979003906, + "learning_rate": 0.00032039964994165694, + "loss": 3.3357, + "step": 32050 + }, + { + "epoch": 9.350442787229085, + "grad_norm": 0.3781045973300934, + "learning_rate": 0.0003199620770128354, + "loss": 3.3417, + "step": 32100 + }, + { + "epoch": 9.365008156606851, + "grad_norm": 0.3656591773033142, + "learning_rate": 0.00031952450408401396, + "loss": 3.3503, + "step": 32150 + }, + { + "epoch": 9.379573525984618, + "grad_norm": 0.3560366928577423, + "learning_rate": 0.0003190869311551925, + "loss": 3.3543, + "step": 32200 + }, + { + "epoch": 9.394138895362387, + "grad_norm": 0.37504252791404724, + "learning_rate": 0.000318649358226371, + "loss": 3.3364, + "step": 32250 + }, + { + "epoch": 9.408704264740154, + "grad_norm": 0.3750084936618805, + "learning_rate": 0.0003182117852975496, + "loss": 3.3571, + "step": 32300 + }, + { + "epoch": 9.423269634117922, + "grad_norm": 0.3538859486579895, + "learning_rate": 0.0003177742123687281, + "loss": 3.3531, + "step": 32350 + }, + { + "epoch": 9.437835003495689, + "grad_norm": 0.3584098815917969, + "learning_rate": 0.0003173366394399066, + "loss": 3.3625, + "step": 32400 + }, + { + "epoch": 9.452400372873456, + "grad_norm": 0.373279333114624, + "learning_rate": 0.00031689906651108515, + "loss": 3.3453, + "step": 32450 + }, + { + "epoch": 9.466965742251224, + "grad_norm": 0.35882455110549927, + "learning_rate": 0.0003164614935822637, + "loss": 3.358, + "step": 32500 + }, + { + "epoch": 9.48153111162899, + "grad_norm": 0.3616470694541931, + "learning_rate": 0.00031602392065344223, + "loss": 3.3555, + "step": 32550 + }, + { + "epoch": 9.496096481006758, + "grad_norm": 0.3601871132850647, + "learning_rate": 0.0003155863477246207, + "loss": 3.3421, + "step": 32600 + }, + { + "epoch": 9.510661850384526, + "grad_norm": 0.3817446231842041, + "learning_rate": 0.00031514877479579925, + "loss": 3.3635, + "step": 32650 + }, + { + "epoch": 9.525227219762293, + "grad_norm": 0.3797430992126465, + "learning_rate": 0.00031471120186697785, + "loss": 3.3555, + "step": 32700 + }, + { + "epoch": 9.53979258914006, + "grad_norm": 0.3647226095199585, + "learning_rate": 0.00031427362893815633, + "loss": 3.3539, + "step": 32750 + }, + { + "epoch": 9.554357958517828, + "grad_norm": 0.38534408807754517, + "learning_rate": 0.0003138360560093349, + "loss": 3.353, + "step": 32800 + }, + { + "epoch": 9.568923327895595, + "grad_norm": 0.35576704144477844, + "learning_rate": 0.0003133984830805134, + "loss": 3.3503, + "step": 32850 + }, + { + "epoch": 9.583488697273363, + "grad_norm": 0.3623790740966797, + "learning_rate": 0.0003129609101516919, + "loss": 3.3567, + "step": 32900 + }, + { + "epoch": 9.59805406665113, + "grad_norm": 0.37745630741119385, + "learning_rate": 0.00031252333722287044, + "loss": 3.3555, + "step": 32950 + }, + { + "epoch": 9.612619436028897, + "grad_norm": 0.35436564683914185, + "learning_rate": 0.000312085764294049, + "loss": 3.362, + "step": 33000 + }, + { + "epoch": 9.612619436028897, + "eval_accuracy": 0.3694510248223298, + "eval_loss": 3.5591442584991455, + "eval_runtime": 181.163, + "eval_samples_per_second": 91.873, + "eval_steps_per_second": 5.746, + "step": 33000 + }, + { + "epoch": 9.627184805406666, + "grad_norm": 0.3626103401184082, + "learning_rate": 0.0003116481913652275, + "loss": 3.3583, + "step": 33050 + }, + { + "epoch": 9.641750174784432, + "grad_norm": 0.3621595501899719, + "learning_rate": 0.000311210618436406, + "loss": 3.3653, + "step": 33100 + }, + { + "epoch": 9.6563155441622, + "grad_norm": 0.37698203325271606, + "learning_rate": 0.0003107730455075846, + "loss": 3.369, + "step": 33150 + }, + { + "epoch": 9.670880913539968, + "grad_norm": 0.3685494661331177, + "learning_rate": 0.00031033547257876314, + "loss": 3.3695, + "step": 33200 + }, + { + "epoch": 9.685446282917734, + "grad_norm": 0.3542827367782593, + "learning_rate": 0.0003098978996499416, + "loss": 3.3554, + "step": 33250 + }, + { + "epoch": 9.700011652295503, + "grad_norm": 0.3731638491153717, + "learning_rate": 0.00030946032672112016, + "loss": 3.3616, + "step": 33300 + }, + { + "epoch": 9.71457702167327, + "grad_norm": 0.3674522638320923, + "learning_rate": 0.0003090227537922987, + "loss": 3.3553, + "step": 33350 + }, + { + "epoch": 9.729142391051036, + "grad_norm": 0.3795247972011566, + "learning_rate": 0.0003085851808634772, + "loss": 3.3591, + "step": 33400 + }, + { + "epoch": 9.743707760428805, + "grad_norm": 0.3751859664916992, + "learning_rate": 0.00030814760793465573, + "loss": 3.3737, + "step": 33450 + }, + { + "epoch": 9.758273129806572, + "grad_norm": 0.36385780572891235, + "learning_rate": 0.00030771003500583427, + "loss": 3.363, + "step": 33500 + }, + { + "epoch": 9.772838499184338, + "grad_norm": 0.3745647370815277, + "learning_rate": 0.00030727246207701286, + "loss": 3.365, + "step": 33550 + }, + { + "epoch": 9.787403868562107, + "grad_norm": 0.3658393621444702, + "learning_rate": 0.0003068348891481913, + "loss": 3.3603, + "step": 33600 + }, + { + "epoch": 9.801969237939874, + "grad_norm": 0.3640536963939667, + "learning_rate": 0.0003063973162193699, + "loss": 3.3559, + "step": 33650 + }, + { + "epoch": 9.816534607317642, + "grad_norm": 0.34730592370033264, + "learning_rate": 0.00030595974329054843, + "loss": 3.3719, + "step": 33700 + }, + { + "epoch": 9.831099976695409, + "grad_norm": 0.4093884229660034, + "learning_rate": 0.0003055221703617269, + "loss": 3.3566, + "step": 33750 + }, + { + "epoch": 9.845665346073176, + "grad_norm": 0.379686176776886, + "learning_rate": 0.00030508459743290546, + "loss": 3.3694, + "step": 33800 + }, + { + "epoch": 9.860230715450944, + "grad_norm": 0.3721369206905365, + "learning_rate": 0.000304647024504084, + "loss": 3.3565, + "step": 33850 + }, + { + "epoch": 9.874796084828711, + "grad_norm": 0.40420958399772644, + "learning_rate": 0.0003042094515752625, + "loss": 3.3739, + "step": 33900 + }, + { + "epoch": 9.88936145420648, + "grad_norm": 0.3653509020805359, + "learning_rate": 0.000303771878646441, + "loss": 3.3874, + "step": 33950 + }, + { + "epoch": 9.903926823584246, + "grad_norm": 0.3726848363876343, + "learning_rate": 0.00030333430571761956, + "loss": 3.3664, + "step": 34000 + }, + { + "epoch": 9.903926823584246, + "eval_accuracy": 0.37010945453331207, + "eval_loss": 3.552675724029541, + "eval_runtime": 180.1762, + "eval_samples_per_second": 92.376, + "eval_steps_per_second": 5.778, + "step": 34000 + }, + { + "epoch": 9.918492192962013, + "grad_norm": 0.37708285450935364, + "learning_rate": 0.00030289673278879816, + "loss": 3.3704, + "step": 34050 + }, + { + "epoch": 9.933057562339782, + "grad_norm": 0.3552982211112976, + "learning_rate": 0.00030245915985997664, + "loss": 3.3698, + "step": 34100 + }, + { + "epoch": 9.947622931717548, + "grad_norm": 0.36946627497673035, + "learning_rate": 0.0003020215869311552, + "loss": 3.3806, + "step": 34150 + }, + { + "epoch": 9.962188301095315, + "grad_norm": 0.35199493169784546, + "learning_rate": 0.0003015840140023337, + "loss": 3.3826, + "step": 34200 + }, + { + "epoch": 9.976753670473084, + "grad_norm": 0.3719441592693329, + "learning_rate": 0.0003011464410735122, + "loss": 3.3646, + "step": 34250 + }, + { + "epoch": 9.99131903985085, + "grad_norm": 0.363594651222229, + "learning_rate": 0.00030070886814469075, + "loss": 3.3622, + "step": 34300 + }, + { + "epoch": 10.005826147751106, + "grad_norm": 0.35639119148254395, + "learning_rate": 0.0003002712952158693, + "loss": 3.3249, + "step": 34350 + }, + { + "epoch": 10.020391517128875, + "grad_norm": 0.37611323595046997, + "learning_rate": 0.0002998337222870478, + "loss": 3.273, + "step": 34400 + }, + { + "epoch": 10.034956886506642, + "grad_norm": 0.37235331535339355, + "learning_rate": 0.0002993961493582263, + "loss": 3.2791, + "step": 34450 + }, + { + "epoch": 10.049522255884408, + "grad_norm": 0.37707796692848206, + "learning_rate": 0.0002989585764294049, + "loss": 3.2771, + "step": 34500 + }, + { + "epoch": 10.064087625262177, + "grad_norm": 0.3716845214366913, + "learning_rate": 0.0002985210035005834, + "loss": 3.2701, + "step": 34550 + }, + { + "epoch": 10.078652994639944, + "grad_norm": 0.36124205589294434, + "learning_rate": 0.00029808343057176193, + "loss": 3.2854, + "step": 34600 + }, + { + "epoch": 10.093218364017712, + "grad_norm": 0.35858553647994995, + "learning_rate": 0.00029764585764294047, + "loss": 3.2791, + "step": 34650 + }, + { + "epoch": 10.107783733395479, + "grad_norm": 0.37975695729255676, + "learning_rate": 0.00029720828471411896, + "loss": 3.2846, + "step": 34700 + }, + { + "epoch": 10.122349102773246, + "grad_norm": 0.3737419545650482, + "learning_rate": 0.00029677071178529755, + "loss": 3.2932, + "step": 34750 + }, + { + "epoch": 10.136914472151014, + "grad_norm": 0.3647433817386627, + "learning_rate": 0.00029633313885647604, + "loss": 3.2903, + "step": 34800 + }, + { + "epoch": 10.151479841528781, + "grad_norm": 0.3602680265903473, + "learning_rate": 0.0002958955659276546, + "loss": 3.2915, + "step": 34850 + }, + { + "epoch": 10.166045210906548, + "grad_norm": 0.3720751404762268, + "learning_rate": 0.0002954579929988331, + "loss": 3.2912, + "step": 34900 + }, + { + "epoch": 10.180610580284316, + "grad_norm": 0.36713486909866333, + "learning_rate": 0.00029502042007001166, + "loss": 3.3051, + "step": 34950 + }, + { + "epoch": 10.195175949662083, + "grad_norm": 0.3789300322532654, + "learning_rate": 0.0002945828471411902, + "loss": 3.3097, + "step": 35000 + }, + { + "epoch": 10.195175949662083, + "eval_accuracy": 0.3695753034302777, + "eval_loss": 3.563159227371216, + "eval_runtime": 179.9801, + "eval_samples_per_second": 92.477, + "eval_steps_per_second": 5.784, + "step": 35000 + }, + { + "epoch": 10.209741319039852, + "grad_norm": 0.3704666793346405, + "learning_rate": 0.0002941452742123687, + "loss": 3.305, + "step": 35050 + }, + { + "epoch": 10.224306688417618, + "grad_norm": 0.36152052879333496, + "learning_rate": 0.0002937077012835472, + "loss": 3.3008, + "step": 35100 + }, + { + "epoch": 10.238872057795385, + "grad_norm": 0.3830110728740692, + "learning_rate": 0.00029327012835472576, + "loss": 3.3154, + "step": 35150 + }, + { + "epoch": 10.253437427173154, + "grad_norm": 0.3606763482093811, + "learning_rate": 0.0002928325554259043, + "loss": 3.2964, + "step": 35200 + }, + { + "epoch": 10.26800279655092, + "grad_norm": 0.3661087453365326, + "learning_rate": 0.00029239498249708284, + "loss": 3.3058, + "step": 35250 + }, + { + "epoch": 10.282568165928687, + "grad_norm": 0.3702368438243866, + "learning_rate": 0.00029195740956826133, + "loss": 3.315, + "step": 35300 + }, + { + "epoch": 10.297133535306456, + "grad_norm": 0.3803708553314209, + "learning_rate": 0.0002915198366394399, + "loss": 3.3081, + "step": 35350 + }, + { + "epoch": 10.311698904684222, + "grad_norm": 0.36514583230018616, + "learning_rate": 0.0002910822637106184, + "loss": 3.301, + "step": 35400 + }, + { + "epoch": 10.326264274061991, + "grad_norm": 0.36688533425331116, + "learning_rate": 0.00029064469078179695, + "loss": 3.3136, + "step": 35450 + }, + { + "epoch": 10.340829643439758, + "grad_norm": 0.36582139134407043, + "learning_rate": 0.0002902071178529755, + "loss": 3.321, + "step": 35500 + }, + { + "epoch": 10.355395012817525, + "grad_norm": 0.36554041504859924, + "learning_rate": 0.000289769544924154, + "loss": 3.3259, + "step": 35550 + }, + { + "epoch": 10.369960382195293, + "grad_norm": 0.3956117331981659, + "learning_rate": 0.00028933197199533257, + "loss": 3.3208, + "step": 35600 + }, + { + "epoch": 10.38452575157306, + "grad_norm": 0.3716447949409485, + "learning_rate": 0.00028889439906651105, + "loss": 3.3143, + "step": 35650 + }, + { + "epoch": 10.399091120950827, + "grad_norm": 0.38861215114593506, + "learning_rate": 0.0002884568261376896, + "loss": 3.3173, + "step": 35700 + }, + { + "epoch": 10.413656490328595, + "grad_norm": 0.39447951316833496, + "learning_rate": 0.00028801925320886813, + "loss": 3.3316, + "step": 35750 + }, + { + "epoch": 10.428221859706362, + "grad_norm": 0.3649604618549347, + "learning_rate": 0.0002875816802800466, + "loss": 3.3131, + "step": 35800 + }, + { + "epoch": 10.44278722908413, + "grad_norm": 0.3737121522426605, + "learning_rate": 0.0002871441073512252, + "loss": 3.3156, + "step": 35850 + }, + { + "epoch": 10.457352598461897, + "grad_norm": 0.3692328929901123, + "learning_rate": 0.0002867065344224037, + "loss": 3.3158, + "step": 35900 + }, + { + "epoch": 10.471917967839664, + "grad_norm": 0.3931700587272644, + "learning_rate": 0.00028626896149358224, + "loss": 3.3158, + "step": 35950 + }, + { + "epoch": 10.486483337217432, + "grad_norm": 0.35947710275650024, + "learning_rate": 0.0002858313885647608, + "loss": 3.3296, + "step": 36000 + }, + { + "epoch": 10.486483337217432, + "eval_accuracy": 0.37002562232189595, + "eval_loss": 3.5596346855163574, + "eval_runtime": 201.7421, + "eval_samples_per_second": 82.501, + "eval_steps_per_second": 5.16, + "step": 36000 + }, + { + "epoch": 10.5010487065952, + "grad_norm": 0.3691442310810089, + "learning_rate": 0.0002853938156359393, + "loss": 3.3234, + "step": 36050 + }, + { + "epoch": 10.515614075972966, + "grad_norm": 0.36635181307792664, + "learning_rate": 0.00028495624270711786, + "loss": 3.3215, + "step": 36100 + }, + { + "epoch": 10.530179445350734, + "grad_norm": 0.3895931839942932, + "learning_rate": 0.00028451866977829634, + "loss": 3.327, + "step": 36150 + }, + { + "epoch": 10.544744814728501, + "grad_norm": 0.369814395904541, + "learning_rate": 0.0002840810968494749, + "loss": 3.3193, + "step": 36200 + }, + { + "epoch": 10.55931018410627, + "grad_norm": 0.3740673363208771, + "learning_rate": 0.0002836435239206534, + "loss": 3.3338, + "step": 36250 + }, + { + "epoch": 10.573875553484037, + "grad_norm": 0.3762377202510834, + "learning_rate": 0.00028320595099183196, + "loss": 3.3323, + "step": 36300 + }, + { + "epoch": 10.588440922861803, + "grad_norm": 0.3816418945789337, + "learning_rate": 0.0002827683780630105, + "loss": 3.326, + "step": 36350 + }, + { + "epoch": 10.603006292239572, + "grad_norm": 0.3672488331794739, + "learning_rate": 0.000282330805134189, + "loss": 3.3272, + "step": 36400 + }, + { + "epoch": 10.617571661617339, + "grad_norm": 0.3663332462310791, + "learning_rate": 0.00028189323220536753, + "loss": 3.3412, + "step": 36450 + }, + { + "epoch": 10.632137030995105, + "grad_norm": 0.36435452103614807, + "learning_rate": 0.00028145565927654607, + "loss": 3.3405, + "step": 36500 + }, + { + "epoch": 10.646702400372874, + "grad_norm": 0.3970140218734741, + "learning_rate": 0.0002810180863477246, + "loss": 3.3301, + "step": 36550 + }, + { + "epoch": 10.66126776975064, + "grad_norm": 0.37601590156555176, + "learning_rate": 0.00028058051341890315, + "loss": 3.3305, + "step": 36600 + }, + { + "epoch": 10.675833139128407, + "grad_norm": 0.38534775376319885, + "learning_rate": 0.00028014294049008164, + "loss": 3.3418, + "step": 36650 + }, + { + "epoch": 10.690398508506176, + "grad_norm": 0.3686717450618744, + "learning_rate": 0.0002797053675612602, + "loss": 3.334, + "step": 36700 + }, + { + "epoch": 10.704963877883943, + "grad_norm": 0.37328463792800903, + "learning_rate": 0.0002792677946324387, + "loss": 3.3337, + "step": 36750 + }, + { + "epoch": 10.719529247261711, + "grad_norm": 0.37012574076652527, + "learning_rate": 0.00027883022170361726, + "loss": 3.3439, + "step": 36800 + }, + { + "epoch": 10.734094616639478, + "grad_norm": 0.3841780722141266, + "learning_rate": 0.0002783926487747958, + "loss": 3.3365, + "step": 36850 + }, + { + "epoch": 10.748659986017245, + "grad_norm": 0.3672944903373718, + "learning_rate": 0.0002779550758459743, + "loss": 3.3333, + "step": 36900 + }, + { + "epoch": 10.763225355395013, + "grad_norm": 0.3785678744316101, + "learning_rate": 0.0002775175029171528, + "loss": 3.3393, + "step": 36950 + }, + { + "epoch": 10.77779072477278, + "grad_norm": 0.3595028817653656, + "learning_rate": 0.00027707992998833136, + "loss": 3.3485, + "step": 37000 + }, + { + "epoch": 10.77779072477278, + "eval_accuracy": 0.37081573797507467, + "eval_loss": 3.5510876178741455, + "eval_runtime": 180.0166, + "eval_samples_per_second": 92.458, + "eval_steps_per_second": 5.783, + "step": 37000 + }, + { + "epoch": 10.792356094150549, + "grad_norm": 0.37431466579437256, + "learning_rate": 0.0002766423570595099, + "loss": 3.3371, + "step": 37050 + }, + { + "epoch": 10.806921463528315, + "grad_norm": 0.3843117654323578, + "learning_rate": 0.00027620478413068844, + "loss": 3.3304, + "step": 37100 + }, + { + "epoch": 10.821486832906082, + "grad_norm": 0.3688303530216217, + "learning_rate": 0.000275767211201867, + "loss": 3.3318, + "step": 37150 + }, + { + "epoch": 10.83605220228385, + "grad_norm": 0.38997411727905273, + "learning_rate": 0.00027532963827304547, + "loss": 3.3445, + "step": 37200 + }, + { + "epoch": 10.850617571661617, + "grad_norm": 0.3861274719238281, + "learning_rate": 0.000274892065344224, + "loss": 3.3383, + "step": 37250 + }, + { + "epoch": 10.865182941039384, + "grad_norm": 0.3819262683391571, + "learning_rate": 0.00027445449241540255, + "loss": 3.349, + "step": 37300 + }, + { + "epoch": 10.879748310417153, + "grad_norm": 0.3718022406101227, + "learning_rate": 0.0002740169194865811, + "loss": 3.3363, + "step": 37350 + }, + { + "epoch": 10.89431367979492, + "grad_norm": 0.3736136257648468, + "learning_rate": 0.0002735793465577596, + "loss": 3.3459, + "step": 37400 + }, + { + "epoch": 10.908879049172686, + "grad_norm": 0.37865564227104187, + "learning_rate": 0.0002731417736289381, + "loss": 3.3505, + "step": 37450 + }, + { + "epoch": 10.923444418550455, + "grad_norm": 0.3839370310306549, + "learning_rate": 0.00027270420070011665, + "loss": 3.3508, + "step": 37500 + }, + { + "epoch": 10.938009787928221, + "grad_norm": 0.39603373408317566, + "learning_rate": 0.0002722666277712952, + "loss": 3.3416, + "step": 37550 + }, + { + "epoch": 10.95257515730599, + "grad_norm": 0.4063169062137604, + "learning_rate": 0.00027182905484247373, + "loss": 3.3462, + "step": 37600 + }, + { + "epoch": 10.967140526683757, + "grad_norm": 0.37765178084373474, + "learning_rate": 0.00027139148191365227, + "loss": 3.347, + "step": 37650 + }, + { + "epoch": 10.981705896061523, + "grad_norm": 0.3753688931465149, + "learning_rate": 0.00027095390898483076, + "loss": 3.3402, + "step": 37700 + }, + { + "epoch": 10.996271265439292, + "grad_norm": 0.39264529943466187, + "learning_rate": 0.0002705163360560093, + "loss": 3.3455, + "step": 37750 + }, + { + "epoch": 11.010778373339548, + "grad_norm": 0.3830313980579376, + "learning_rate": 0.00027007876312718784, + "loss": 3.2566, + "step": 37800 + }, + { + "epoch": 11.025343742717315, + "grad_norm": 0.39328911900520325, + "learning_rate": 0.0002696411901983664, + "loss": 3.2371, + "step": 37850 + }, + { + "epoch": 11.039909112095083, + "grad_norm": 0.39774519205093384, + "learning_rate": 0.0002692036172695449, + "loss": 3.2478, + "step": 37900 + }, + { + "epoch": 11.05447448147285, + "grad_norm": 0.36643147468566895, + "learning_rate": 0.0002687660443407234, + "loss": 3.2477, + "step": 37950 + }, + { + "epoch": 11.069039850850617, + "grad_norm": 0.39002707600593567, + "learning_rate": 0.00026832847141190194, + "loss": 3.2602, + "step": 38000 + }, + { + "epoch": 11.069039850850617, + "eval_accuracy": 0.3708169137424157, + "eval_loss": 3.5576553344726562, + "eval_runtime": 179.8126, + "eval_samples_per_second": 92.563, + "eval_steps_per_second": 5.789, + "step": 38000 + }, + { + "epoch": 11.083605220228385, + "grad_norm": 0.3755168318748474, + "learning_rate": 0.0002678908984830805, + "loss": 3.2608, + "step": 38050 + }, + { + "epoch": 11.098170589606152, + "grad_norm": 0.38174381852149963, + "learning_rate": 0.000267453325554259, + "loss": 3.2649, + "step": 38100 + }, + { + "epoch": 11.11273595898392, + "grad_norm": 0.372764527797699, + "learning_rate": 0.00026701575262543756, + "loss": 3.265, + "step": 38150 + }, + { + "epoch": 11.127301328361687, + "grad_norm": 0.3840652406215668, + "learning_rate": 0.00026657817969661605, + "loss": 3.2659, + "step": 38200 + }, + { + "epoch": 11.141866697739454, + "grad_norm": 0.3895314037799835, + "learning_rate": 0.00026614060676779464, + "loss": 3.2548, + "step": 38250 + }, + { + "epoch": 11.156432067117223, + "grad_norm": 0.3741670250892639, + "learning_rate": 0.00026570303383897313, + "loss": 3.2596, + "step": 38300 + }, + { + "epoch": 11.17099743649499, + "grad_norm": 0.4137849807739258, + "learning_rate": 0.00026526546091015167, + "loss": 3.2742, + "step": 38350 + }, + { + "epoch": 11.185562805872756, + "grad_norm": 0.40267008543014526, + "learning_rate": 0.0002648278879813302, + "loss": 3.2766, + "step": 38400 + }, + { + "epoch": 11.200128175250525, + "grad_norm": 0.38519883155822754, + "learning_rate": 0.0002643903150525087, + "loss": 3.2839, + "step": 38450 + }, + { + "epoch": 11.214693544628291, + "grad_norm": 0.37960830330848694, + "learning_rate": 0.0002639527421236873, + "loss": 3.2779, + "step": 38500 + }, + { + "epoch": 11.22925891400606, + "grad_norm": 0.40030044317245483, + "learning_rate": 0.0002635151691948658, + "loss": 3.2851, + "step": 38550 + }, + { + "epoch": 11.243824283383827, + "grad_norm": 0.37748202681541443, + "learning_rate": 0.0002630775962660443, + "loss": 3.2801, + "step": 38600 + }, + { + "epoch": 11.258389652761593, + "grad_norm": 0.393085241317749, + "learning_rate": 0.00026264002333722285, + "loss": 3.2861, + "step": 38650 + }, + { + "epoch": 11.272955022139362, + "grad_norm": 0.40957602858543396, + "learning_rate": 0.00026220245040840134, + "loss": 3.2805, + "step": 38700 + }, + { + "epoch": 11.287520391517129, + "grad_norm": 0.3886652886867523, + "learning_rate": 0.00026176487747957993, + "loss": 3.293, + "step": 38750 + }, + { + "epoch": 11.302085760894895, + "grad_norm": 0.3791932463645935, + "learning_rate": 0.0002613273045507584, + "loss": 3.2911, + "step": 38800 + }, + { + "epoch": 11.316651130272664, + "grad_norm": 0.3910757899284363, + "learning_rate": 0.00026088973162193696, + "loss": 3.2861, + "step": 38850 + }, + { + "epoch": 11.33121649965043, + "grad_norm": 0.38568395376205444, + "learning_rate": 0.0002604521586931155, + "loss": 3.2795, + "step": 38900 + }, + { + "epoch": 11.3457818690282, + "grad_norm": 0.3812981843948364, + "learning_rate": 0.00026001458576429404, + "loss": 3.2975, + "step": 38950 + }, + { + "epoch": 11.360347238405966, + "grad_norm": 0.4014233648777008, + "learning_rate": 0.0002595770128354726, + "loss": 3.2927, + "step": 39000 + }, + { + "epoch": 11.360347238405966, + "eval_accuracy": 0.3709491875682827, + "eval_loss": 3.5536797046661377, + "eval_runtime": 179.8, + "eval_samples_per_second": 92.57, + "eval_steps_per_second": 5.79, + "step": 39000 + }, + { + "epoch": 11.374912607783733, + "grad_norm": 0.4019148349761963, + "learning_rate": 0.00025913943990665106, + "loss": 3.2973, + "step": 39050 + }, + { + "epoch": 11.389477977161501, + "grad_norm": 0.386180579662323, + "learning_rate": 0.0002587018669778296, + "loss": 3.3019, + "step": 39100 + }, + { + "epoch": 11.404043346539268, + "grad_norm": 0.367136687040329, + "learning_rate": 0.00025826429404900814, + "loss": 3.2868, + "step": 39150 + }, + { + "epoch": 11.418608715917035, + "grad_norm": 0.38054171204566956, + "learning_rate": 0.0002578267211201867, + "loss": 3.2943, + "step": 39200 + }, + { + "epoch": 11.433174085294803, + "grad_norm": 0.38401472568511963, + "learning_rate": 0.0002573891481913652, + "loss": 3.2884, + "step": 39250 + }, + { + "epoch": 11.44773945467257, + "grad_norm": 0.3807559609413147, + "learning_rate": 0.0002569515752625437, + "loss": 3.3096, + "step": 39300 + }, + { + "epoch": 11.462304824050339, + "grad_norm": 0.3866496682167053, + "learning_rate": 0.0002565140023337223, + "loss": 3.2945, + "step": 39350 + }, + { + "epoch": 11.476870193428105, + "grad_norm": 0.39391186833381653, + "learning_rate": 0.0002560764294049008, + "loss": 3.3052, + "step": 39400 + }, + { + "epoch": 11.491435562805872, + "grad_norm": 0.3963194489479065, + "learning_rate": 0.00025563885647607933, + "loss": 3.307, + "step": 39450 + }, + { + "epoch": 11.50600093218364, + "grad_norm": 0.378620445728302, + "learning_rate": 0.00025520128354725787, + "loss": 3.2964, + "step": 39500 + }, + { + "epoch": 11.520566301561407, + "grad_norm": 0.3991275727748871, + "learning_rate": 0.00025476371061843636, + "loss": 3.3136, + "step": 39550 + }, + { + "epoch": 11.535131670939174, + "grad_norm": 0.403228759765625, + "learning_rate": 0.00025432613768961495, + "loss": 3.3107, + "step": 39600 + }, + { + "epoch": 11.549697040316943, + "grad_norm": 0.3849295377731323, + "learning_rate": 0.00025388856476079343, + "loss": 3.2815, + "step": 39650 + }, + { + "epoch": 11.56426240969471, + "grad_norm": 0.39309900999069214, + "learning_rate": 0.000253450991831972, + "loss": 3.2973, + "step": 39700 + }, + { + "epoch": 11.578827779072478, + "grad_norm": 0.376312255859375, + "learning_rate": 0.0002530134189031505, + "loss": 3.3027, + "step": 39750 + }, + { + "epoch": 11.593393148450245, + "grad_norm": 0.3923032283782959, + "learning_rate": 0.000252575845974329, + "loss": 3.2931, + "step": 39800 + }, + { + "epoch": 11.607958517828012, + "grad_norm": 0.37784478068351746, + "learning_rate": 0.0002521382730455076, + "loss": 3.3102, + "step": 39850 + }, + { + "epoch": 11.62252388720578, + "grad_norm": 0.3833751380443573, + "learning_rate": 0.0002517007001166861, + "loss": 3.3157, + "step": 39900 + }, + { + "epoch": 11.637089256583547, + "grad_norm": 0.38745492696762085, + "learning_rate": 0.0002512631271878646, + "loss": 3.3039, + "step": 39950 + }, + { + "epoch": 11.651654625961314, + "grad_norm": 0.3720225393772125, + "learning_rate": 0.00025082555425904316, + "loss": 3.2965, + "step": 40000 + }, + { + "epoch": 11.651654625961314, + "eval_accuracy": 0.3716299568587447, + "eval_loss": 3.5464437007904053, + "eval_runtime": 180.224, + "eval_samples_per_second": 92.352, + "eval_steps_per_second": 5.776, + "step": 40000 + } + ], + "logging_steps": 50, + "max_steps": 68660, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 10000, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 20, + "early_stopping_threshold": 0.0 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.36085640790016e+17, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}