| { |
| "best_global_step": 78000, |
| "best_metric": 3.2760121822357178, |
| "best_model_checkpoint": "/scratch/cl5625/exceptions/models/high_0_1208/checkpoint-70000", |
| "epoch": 29.602121016365203, |
| "eval_steps": 1000, |
| "global_step": 110000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.013458225667527994, |
| "grad_norm": 1.9294723272323608, |
| "learning_rate": 0.000294, |
| "loss": 8.5675, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.026916451335055987, |
| "grad_norm": 4.372699737548828, |
| "learning_rate": 0.0005939999999999999, |
| "loss": 6.7649, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.04037467700258398, |
| "grad_norm": 0.6613487601280212, |
| "learning_rate": 0.0005998020735155513, |
| "loss": 6.3115, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.053832902670111975, |
| "grad_norm": 1.9673986434936523, |
| "learning_rate": 0.0005996001077150935, |
| "loss": 6.0152, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.06729112833763996, |
| "grad_norm": 2.4221529960632324, |
| "learning_rate": 0.0005993981419146358, |
| "loss": 5.8697, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.08074935400516796, |
| "grad_norm": 1.3177257776260376, |
| "learning_rate": 0.0005991961761141779, |
| "loss": 5.7306, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.09420757967269595, |
| "grad_norm": 1.0655848979949951, |
| "learning_rate": 0.0005989942103137202, |
| "loss": 5.6299, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.10766580534022395, |
| "grad_norm": 1.2664433717727661, |
| "learning_rate": 0.0005987922445132624, |
| "loss": 5.5363, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.12112403100775193, |
| "grad_norm": 1.265374779701233, |
| "learning_rate": 0.0005985902787128047, |
| "loss": 5.3807, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.13458225667527993, |
| "grad_norm": 1.2221217155456543, |
| "learning_rate": 0.0005983883129123468, |
| "loss": 5.2511, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.14804048234280792, |
| "grad_norm": 1.1258769035339355, |
| "learning_rate": 0.000598186347111889, |
| "loss": 5.1464, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.16149870801033592, |
| "grad_norm": 1.3253281116485596, |
| "learning_rate": 0.0005979843813114312, |
| "loss": 5.0664, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.1749569336778639, |
| "grad_norm": 1.076049566268921, |
| "learning_rate": 0.0005977824155109734, |
| "loss": 5.013, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.1884151593453919, |
| "grad_norm": 1.1340619325637817, |
| "learning_rate": 0.0005975804497105157, |
| "loss": 4.9208, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.2018733850129199, |
| "grad_norm": 0.8552718758583069, |
| "learning_rate": 0.0005973784839100578, |
| "loss": 4.8547, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.2153316106804479, |
| "grad_norm": 1.0275166034698486, |
| "learning_rate": 0.0005971765181096001, |
| "loss": 4.7937, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.2287898363479759, |
| "grad_norm": 0.8931620121002197, |
| "learning_rate": 0.0005969745523091422, |
| "loss": 4.7444, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.24224806201550386, |
| "grad_norm": 0.7615459561347961, |
| "learning_rate": 0.0005967725865086845, |
| "loss": 4.6897, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.2557062876830319, |
| "grad_norm": 0.860148549079895, |
| "learning_rate": 0.0005965706207082267, |
| "loss": 4.669, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.26916451335055985, |
| "grad_norm": 1.0182037353515625, |
| "learning_rate": 0.0005963686549077689, |
| "loss": 4.6123, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.26916451335055985, |
| "eval_accuracy": 0.26530030208171723, |
| "eval_loss": 4.529453277587891, |
| "eval_runtime": 54.5835, |
| "eval_samples_per_second": 329.99, |
| "eval_steps_per_second": 20.629, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.2826227390180879, |
| "grad_norm": 0.9250805974006653, |
| "learning_rate": 0.0005961666891073111, |
| "loss": 4.5473, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.29608096468561584, |
| "grad_norm": 0.8115113973617554, |
| "learning_rate": 0.0005959647233068533, |
| "loss": 4.5016, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.30953919035314387, |
| "grad_norm": 0.8632411956787109, |
| "learning_rate": 0.0005957627575063955, |
| "loss": 4.4742, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.32299741602067183, |
| "grad_norm": 0.7869690656661987, |
| "learning_rate": 0.0005955607917059378, |
| "loss": 4.4344, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.3364556416881998, |
| "grad_norm": 0.6230737566947937, |
| "learning_rate": 0.00059535882590548, |
| "loss": 4.4066, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.3499138673557278, |
| "grad_norm": 0.935434103012085, |
| "learning_rate": 0.0005951568601050221, |
| "loss": 4.3607, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.3633720930232558, |
| "grad_norm": 0.8798397779464722, |
| "learning_rate": 0.0005949548943045644, |
| "loss": 4.3366, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.3768303186907838, |
| "grad_norm": 0.7297024726867676, |
| "learning_rate": 0.0005947529285041066, |
| "loss": 4.3083, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.3902885443583118, |
| "grad_norm": 0.7763248682022095, |
| "learning_rate": 0.0005945509627036488, |
| "loss": 4.2737, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.4037467700258398, |
| "grad_norm": 0.5607454776763916, |
| "learning_rate": 0.000594348996903191, |
| "loss": 4.2506, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.4172049956933678, |
| "grad_norm": 0.7158567309379578, |
| "learning_rate": 0.0005941470311027332, |
| "loss": 4.2292, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.4306632213608958, |
| "grad_norm": 0.7625264525413513, |
| "learning_rate": 0.0005939450653022754, |
| "loss": 4.2232, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.44412144702842377, |
| "grad_norm": 0.6740456819534302, |
| "learning_rate": 0.0005937430995018177, |
| "loss": 4.1809, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.4575796726959518, |
| "grad_norm": 0.7420955300331116, |
| "learning_rate": 0.0005935411337013598, |
| "loss": 4.1803, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.47103789836347976, |
| "grad_norm": 0.8082626461982727, |
| "learning_rate": 0.0005933391679009021, |
| "loss": 4.1628, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.4844961240310077, |
| "grad_norm": 0.6156010627746582, |
| "learning_rate": 0.0005931372021004442, |
| "loss": 4.1499, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.49795434969853575, |
| "grad_norm": 0.6687771677970886, |
| "learning_rate": 0.0005929352362999865, |
| "loss": 4.1337, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.5114125753660638, |
| "grad_norm": 0.67042076587677, |
| "learning_rate": 0.0005927332704995287, |
| "loss": 4.1201, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.5248708010335917, |
| "grad_norm": 0.6833565831184387, |
| "learning_rate": 0.0005925313046990709, |
| "loss": 4.0887, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.5383290267011197, |
| "grad_norm": 0.6024614572525024, |
| "learning_rate": 0.0005923293388986131, |
| "loss": 4.0841, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.5383290267011197, |
| "eval_accuracy": 0.31777640737877294, |
| "eval_loss": 4.024723052978516, |
| "eval_runtime": 53.7356, |
| "eval_samples_per_second": 335.197, |
| "eval_steps_per_second": 20.954, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.5517872523686477, |
| "grad_norm": 0.5663601160049438, |
| "learning_rate": 0.0005921273730981553, |
| "loss": 4.0555, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.5652454780361758, |
| "grad_norm": 0.6550332307815552, |
| "learning_rate": 0.0005919254072976975, |
| "loss": 4.0597, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.5787037037037037, |
| "grad_norm": 0.6065599322319031, |
| "learning_rate": 0.0005917234414972398, |
| "loss": 4.048, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.5921619293712317, |
| "grad_norm": 0.6482925415039062, |
| "learning_rate": 0.000591521475696782, |
| "loss": 4.0347, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.6056201550387597, |
| "grad_norm": 0.5995512008666992, |
| "learning_rate": 0.0005913195098963241, |
| "loss": 4.0295, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.6190783807062877, |
| "grad_norm": 0.6398453712463379, |
| "learning_rate": 0.0005911175440958664, |
| "loss": 4.0111, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.6325366063738157, |
| "grad_norm": 0.6352100968360901, |
| "learning_rate": 0.0005909155782954086, |
| "loss": 4.0079, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.6459948320413437, |
| "grad_norm": 0.4817008674144745, |
| "learning_rate": 0.0005907136124949508, |
| "loss": 4.0007, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.6594530577088716, |
| "grad_norm": 0.6795246005058289, |
| "learning_rate": 0.000590511646694493, |
| "loss": 3.9933, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.6729112833763996, |
| "grad_norm": 0.5192553997039795, |
| "learning_rate": 0.0005903096808940352, |
| "loss": 3.9658, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.6863695090439277, |
| "grad_norm": 0.5410998463630676, |
| "learning_rate": 0.0005901077150935774, |
| "loss": 3.9671, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.6998277347114557, |
| "grad_norm": 0.48503005504608154, |
| "learning_rate": 0.0005899057492931197, |
| "loss": 3.9541, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.7132859603789836, |
| "grad_norm": 0.474247545003891, |
| "learning_rate": 0.0005897037834926618, |
| "loss": 3.9415, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.7267441860465116, |
| "grad_norm": 0.5398220419883728, |
| "learning_rate": 0.0005895018176922041, |
| "loss": 3.9447, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.7402024117140397, |
| "grad_norm": 0.5455359816551208, |
| "learning_rate": 0.0005892998518917462, |
| "loss": 3.9324, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.7536606373815676, |
| "grad_norm": 0.556117832660675, |
| "learning_rate": 0.0005890978860912885, |
| "loss": 3.9207, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.7671188630490956, |
| "grad_norm": 0.550010621547699, |
| "learning_rate": 0.0005888959202908307, |
| "loss": 3.9291, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.7805770887166236, |
| "grad_norm": 0.6481958627700806, |
| "learning_rate": 0.0005886939544903729, |
| "loss": 3.9165, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.7940353143841516, |
| "grad_norm": 0.4899815320968628, |
| "learning_rate": 0.0005884919886899151, |
| "loss": 3.8995, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.8074935400516796, |
| "grad_norm": 0.5277990698814392, |
| "learning_rate": 0.0005882900228894573, |
| "loss": 3.9046, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.8074935400516796, |
| "eval_accuracy": 0.33397511030915245, |
| "eval_loss": 3.841794729232788, |
| "eval_runtime": 53.7996, |
| "eval_samples_per_second": 334.798, |
| "eval_steps_per_second": 20.93, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.8209517657192076, |
| "grad_norm": 0.477497935295105, |
| "learning_rate": 0.0005880880570889996, |
| "loss": 3.8943, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.8344099913867356, |
| "grad_norm": 0.4592600464820862, |
| "learning_rate": 0.0005878860912885418, |
| "loss": 3.8894, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.8478682170542635, |
| "grad_norm": 0.5376394987106323, |
| "learning_rate": 0.000587684125488084, |
| "loss": 3.8786, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.8613264427217916, |
| "grad_norm": 0.47509804368019104, |
| "learning_rate": 0.0005874821596876261, |
| "loss": 3.8695, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.8747846683893196, |
| "grad_norm": 0.590036928653717, |
| "learning_rate": 0.0005872801938871684, |
| "loss": 3.87, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.8882428940568475, |
| "grad_norm": 0.5190223455429077, |
| "learning_rate": 0.0005870782280867106, |
| "loss": 3.8557, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.9017011197243755, |
| "grad_norm": 0.5153225064277649, |
| "learning_rate": 0.0005868762622862528, |
| "loss": 3.852, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.9151593453919036, |
| "grad_norm": 0.42253756523132324, |
| "learning_rate": 0.000586674296485795, |
| "loss": 3.867, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.9286175710594315, |
| "grad_norm": 0.48400136828422546, |
| "learning_rate": 0.0005864723306853373, |
| "loss": 3.842, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.9420757967269595, |
| "grad_norm": 0.4630362093448639, |
| "learning_rate": 0.0005862703648848794, |
| "loss": 3.8455, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.9555340223944875, |
| "grad_norm": 0.526874840259552, |
| "learning_rate": 0.0005860683990844217, |
| "loss": 3.8223, |
| "step": 3550 |
| }, |
| { |
| "epoch": 0.9689922480620154, |
| "grad_norm": 0.5283749103546143, |
| "learning_rate": 0.0005858664332839638, |
| "loss": 3.8237, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.9824504737295435, |
| "grad_norm": 0.5012550950050354, |
| "learning_rate": 0.0005856644674835061, |
| "loss": 3.825, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.9959086993970715, |
| "grad_norm": 0.4376530051231384, |
| "learning_rate": 0.0005854625016830483, |
| "loss": 3.805, |
| "step": 3700 |
| }, |
| { |
| "epoch": 1.009151593453919, |
| "grad_norm": 0.4493418037891388, |
| "learning_rate": 0.0005852605358825905, |
| "loss": 3.7713, |
| "step": 3750 |
| }, |
| { |
| "epoch": 1.022609819121447, |
| "grad_norm": 0.44258996844291687, |
| "learning_rate": 0.0005850585700821327, |
| "loss": 3.7372, |
| "step": 3800 |
| }, |
| { |
| "epoch": 1.036068044788975, |
| "grad_norm": 0.5155314803123474, |
| "learning_rate": 0.0005848566042816749, |
| "loss": 3.7485, |
| "step": 3850 |
| }, |
| { |
| "epoch": 1.049526270456503, |
| "grad_norm": 0.4668378233909607, |
| "learning_rate": 0.0005846546384812171, |
| "loss": 3.739, |
| "step": 3900 |
| }, |
| { |
| "epoch": 1.062984496124031, |
| "grad_norm": 0.5066882371902466, |
| "learning_rate": 0.0005844526726807594, |
| "loss": 3.7466, |
| "step": 3950 |
| }, |
| { |
| "epoch": 1.076442721791559, |
| "grad_norm": 0.46061182022094727, |
| "learning_rate": 0.0005842507068803016, |
| "loss": 3.7493, |
| "step": 4000 |
| }, |
| { |
| "epoch": 1.076442721791559, |
| "eval_accuracy": 0.34418400344540906, |
| "eval_loss": 3.738100051879883, |
| "eval_runtime": 53.6959, |
| "eval_samples_per_second": 335.444, |
| "eval_steps_per_second": 20.97, |
| "step": 4000 |
| }, |
| { |
| "epoch": 1.089900947459087, |
| "grad_norm": 0.5034427642822266, |
| "learning_rate": 0.0005840487410798437, |
| "loss": 3.7378, |
| "step": 4050 |
| }, |
| { |
| "epoch": 1.103359173126615, |
| "grad_norm": 0.4937703609466553, |
| "learning_rate": 0.000583846775279386, |
| "loss": 3.7272, |
| "step": 4100 |
| }, |
| { |
| "epoch": 1.116817398794143, |
| "grad_norm": 0.43894490599632263, |
| "learning_rate": 0.0005836448094789282, |
| "loss": 3.7287, |
| "step": 4150 |
| }, |
| { |
| "epoch": 1.130275624461671, |
| "grad_norm": 0.48713067173957825, |
| "learning_rate": 0.0005834428436784704, |
| "loss": 3.7364, |
| "step": 4200 |
| }, |
| { |
| "epoch": 1.143733850129199, |
| "grad_norm": 0.4943729341030121, |
| "learning_rate": 0.0005832408778780126, |
| "loss": 3.739, |
| "step": 4250 |
| }, |
| { |
| "epoch": 1.157192075796727, |
| "grad_norm": 0.46905580163002014, |
| "learning_rate": 0.0005830389120775548, |
| "loss": 3.7298, |
| "step": 4300 |
| }, |
| { |
| "epoch": 1.1706503014642549, |
| "grad_norm": 0.4387616217136383, |
| "learning_rate": 0.000582836946277097, |
| "loss": 3.7308, |
| "step": 4350 |
| }, |
| { |
| "epoch": 1.1841085271317828, |
| "grad_norm": 0.455136239528656, |
| "learning_rate": 0.0005826349804766393, |
| "loss": 3.7206, |
| "step": 4400 |
| }, |
| { |
| "epoch": 1.197566752799311, |
| "grad_norm": 0.44222962856292725, |
| "learning_rate": 0.0005824330146761814, |
| "loss": 3.7197, |
| "step": 4450 |
| }, |
| { |
| "epoch": 1.211024978466839, |
| "grad_norm": 0.4487605392932892, |
| "learning_rate": 0.0005822310488757237, |
| "loss": 3.718, |
| "step": 4500 |
| }, |
| { |
| "epoch": 1.224483204134367, |
| "grad_norm": 0.4749026894569397, |
| "learning_rate": 0.0005820290830752658, |
| "loss": 3.7054, |
| "step": 4550 |
| }, |
| { |
| "epoch": 1.237941429801895, |
| "grad_norm": 0.44181352853775024, |
| "learning_rate": 0.0005818271172748081, |
| "loss": 3.701, |
| "step": 4600 |
| }, |
| { |
| "epoch": 1.251399655469423, |
| "grad_norm": 0.4515010714530945, |
| "learning_rate": 0.0005816251514743503, |
| "loss": 3.7134, |
| "step": 4650 |
| }, |
| { |
| "epoch": 1.2648578811369509, |
| "grad_norm": 0.4210042357444763, |
| "learning_rate": 0.0005814231856738925, |
| "loss": 3.6976, |
| "step": 4700 |
| }, |
| { |
| "epoch": 1.2783161068044788, |
| "grad_norm": 0.509992241859436, |
| "learning_rate": 0.0005812212198734347, |
| "loss": 3.6931, |
| "step": 4750 |
| }, |
| { |
| "epoch": 1.291774332472007, |
| "grad_norm": 0.42137229442596436, |
| "learning_rate": 0.0005810192540729769, |
| "loss": 3.7055, |
| "step": 4800 |
| }, |
| { |
| "epoch": 1.3052325581395348, |
| "grad_norm": 0.39782291650772095, |
| "learning_rate": 0.0005808172882725192, |
| "loss": 3.6862, |
| "step": 4850 |
| }, |
| { |
| "epoch": 1.318690783807063, |
| "grad_norm": 0.4171382188796997, |
| "learning_rate": 0.0005806153224720614, |
| "loss": 3.6812, |
| "step": 4900 |
| }, |
| { |
| "epoch": 1.332149009474591, |
| "grad_norm": 0.4413432478904724, |
| "learning_rate": 0.0005804133566716036, |
| "loss": 3.6806, |
| "step": 4950 |
| }, |
| { |
| "epoch": 1.3456072351421189, |
| "grad_norm": 0.42678385972976685, |
| "learning_rate": 0.0005802113908711457, |
| "loss": 3.6752, |
| "step": 5000 |
| }, |
| { |
| "epoch": 1.3456072351421189, |
| "eval_accuracy": 0.35101408802046735, |
| "eval_loss": 3.6677448749542236, |
| "eval_runtime": 53.6849, |
| "eval_samples_per_second": 335.514, |
| "eval_steps_per_second": 20.974, |
| "step": 5000 |
| }, |
| { |
| "epoch": 1.3590654608096469, |
| "grad_norm": 0.3610088527202606, |
| "learning_rate": 0.000580009425070688, |
| "loss": 3.6851, |
| "step": 5050 |
| }, |
| { |
| "epoch": 1.3725236864771748, |
| "grad_norm": 0.3927769064903259, |
| "learning_rate": 0.0005798074592702302, |
| "loss": 3.688, |
| "step": 5100 |
| }, |
| { |
| "epoch": 1.3859819121447028, |
| "grad_norm": 0.4166286289691925, |
| "learning_rate": 0.0005796054934697724, |
| "loss": 3.6718, |
| "step": 5150 |
| }, |
| { |
| "epoch": 1.3994401378122308, |
| "grad_norm": 0.39592215418815613, |
| "learning_rate": 0.0005794035276693146, |
| "loss": 3.6768, |
| "step": 5200 |
| }, |
| { |
| "epoch": 1.412898363479759, |
| "grad_norm": 0.4000367820262909, |
| "learning_rate": 0.0005792015618688568, |
| "loss": 3.6685, |
| "step": 5250 |
| }, |
| { |
| "epoch": 1.4263565891472867, |
| "grad_norm": 0.38646310567855835, |
| "learning_rate": 0.000578999596068399, |
| "loss": 3.674, |
| "step": 5300 |
| }, |
| { |
| "epoch": 1.4398148148148149, |
| "grad_norm": 0.37530556321144104, |
| "learning_rate": 0.0005787976302679413, |
| "loss": 3.6658, |
| "step": 5350 |
| }, |
| { |
| "epoch": 1.4532730404823428, |
| "grad_norm": 0.37342968583106995, |
| "learning_rate": 0.0005785956644674834, |
| "loss": 3.6638, |
| "step": 5400 |
| }, |
| { |
| "epoch": 1.4667312661498708, |
| "grad_norm": 0.48188892006874084, |
| "learning_rate": 0.0005783936986670257, |
| "loss": 3.65, |
| "step": 5450 |
| }, |
| { |
| "epoch": 1.4801894918173988, |
| "grad_norm": 0.4143880605697632, |
| "learning_rate": 0.0005781917328665678, |
| "loss": 3.6685, |
| "step": 5500 |
| }, |
| { |
| "epoch": 1.4936477174849268, |
| "grad_norm": 0.3935947120189667, |
| "learning_rate": 0.0005779897670661101, |
| "loss": 3.6693, |
| "step": 5550 |
| }, |
| { |
| "epoch": 1.507105943152455, |
| "grad_norm": 0.3887571692466736, |
| "learning_rate": 0.0005777878012656523, |
| "loss": 3.6481, |
| "step": 5600 |
| }, |
| { |
| "epoch": 1.5205641688199827, |
| "grad_norm": 0.38384705781936646, |
| "learning_rate": 0.0005775858354651945, |
| "loss": 3.6564, |
| "step": 5650 |
| }, |
| { |
| "epoch": 1.5340223944875109, |
| "grad_norm": 0.36646515130996704, |
| "learning_rate": 0.0005773838696647367, |
| "loss": 3.645, |
| "step": 5700 |
| }, |
| { |
| "epoch": 1.5474806201550386, |
| "grad_norm": 0.41495048999786377, |
| "learning_rate": 0.000577181903864279, |
| "loss": 3.6521, |
| "step": 5750 |
| }, |
| { |
| "epoch": 1.5609388458225668, |
| "grad_norm": 0.4048604369163513, |
| "learning_rate": 0.0005769799380638212, |
| "loss": 3.6421, |
| "step": 5800 |
| }, |
| { |
| "epoch": 1.5743970714900948, |
| "grad_norm": 0.4305686950683594, |
| "learning_rate": 0.0005767779722633633, |
| "loss": 3.6493, |
| "step": 5850 |
| }, |
| { |
| "epoch": 1.5878552971576227, |
| "grad_norm": 0.414792001247406, |
| "learning_rate": 0.0005765760064629056, |
| "loss": 3.6338, |
| "step": 5900 |
| }, |
| { |
| "epoch": 1.6013135228251507, |
| "grad_norm": 0.38997161388397217, |
| "learning_rate": 0.0005763740406624477, |
| "loss": 3.6387, |
| "step": 5950 |
| }, |
| { |
| "epoch": 1.6147717484926787, |
| "grad_norm": 0.37915465235710144, |
| "learning_rate": 0.00057617207486199, |
| "loss": 3.6324, |
| "step": 6000 |
| }, |
| { |
| "epoch": 1.6147717484926787, |
| "eval_accuracy": 0.3565350866328297, |
| "eval_loss": 3.6097068786621094, |
| "eval_runtime": 53.6997, |
| "eval_samples_per_second": 335.421, |
| "eval_steps_per_second": 20.968, |
| "step": 6000 |
| }, |
| { |
| "epoch": 1.6282299741602069, |
| "grad_norm": 0.4142931401729584, |
| "learning_rate": 0.0005759701090615322, |
| "loss": 3.6351, |
| "step": 6050 |
| }, |
| { |
| "epoch": 1.6416881998277346, |
| "grad_norm": 0.4205895960330963, |
| "learning_rate": 0.0005757681432610744, |
| "loss": 3.6229, |
| "step": 6100 |
| }, |
| { |
| "epoch": 1.6551464254952628, |
| "grad_norm": 0.42943212389945984, |
| "learning_rate": 0.0005755661774606166, |
| "loss": 3.6268, |
| "step": 6150 |
| }, |
| { |
| "epoch": 1.6686046511627906, |
| "grad_norm": 0.4431730806827545, |
| "learning_rate": 0.0005753642116601588, |
| "loss": 3.6232, |
| "step": 6200 |
| }, |
| { |
| "epoch": 1.6820628768303187, |
| "grad_norm": 0.4297159016132355, |
| "learning_rate": 0.000575162245859701, |
| "loss": 3.6256, |
| "step": 6250 |
| }, |
| { |
| "epoch": 1.6955211024978467, |
| "grad_norm": 0.3931505084037781, |
| "learning_rate": 0.0005749602800592433, |
| "loss": 3.6251, |
| "step": 6300 |
| }, |
| { |
| "epoch": 1.7089793281653747, |
| "grad_norm": 0.41490694880485535, |
| "learning_rate": 0.0005747583142587854, |
| "loss": 3.6127, |
| "step": 6350 |
| }, |
| { |
| "epoch": 1.7224375538329026, |
| "grad_norm": 0.3898763656616211, |
| "learning_rate": 0.0005745563484583277, |
| "loss": 3.6275, |
| "step": 6400 |
| }, |
| { |
| "epoch": 1.7358957795004306, |
| "grad_norm": 0.3727113902568817, |
| "learning_rate": 0.0005743543826578698, |
| "loss": 3.6163, |
| "step": 6450 |
| }, |
| { |
| "epoch": 1.7493540051679588, |
| "grad_norm": 0.3718154728412628, |
| "learning_rate": 0.0005741524168574121, |
| "loss": 3.6241, |
| "step": 6500 |
| }, |
| { |
| "epoch": 1.7628122308354865, |
| "grad_norm": 0.38011330366134644, |
| "learning_rate": 0.0005739504510569543, |
| "loss": 3.603, |
| "step": 6550 |
| }, |
| { |
| "epoch": 1.7762704565030147, |
| "grad_norm": 0.3921981155872345, |
| "learning_rate": 0.0005737484852564966, |
| "loss": 3.6211, |
| "step": 6600 |
| }, |
| { |
| "epoch": 1.7897286821705425, |
| "grad_norm": 0.3750942349433899, |
| "learning_rate": 0.0005735465194560387, |
| "loss": 3.6174, |
| "step": 6650 |
| }, |
| { |
| "epoch": 1.8031869078380707, |
| "grad_norm": 0.341286838054657, |
| "learning_rate": 0.000573344553655581, |
| "loss": 3.6087, |
| "step": 6700 |
| }, |
| { |
| "epoch": 1.8166451335055986, |
| "grad_norm": 0.4016365706920624, |
| "learning_rate": 0.0005731425878551232, |
| "loss": 3.6186, |
| "step": 6750 |
| }, |
| { |
| "epoch": 1.8301033591731266, |
| "grad_norm": 0.37889373302459717, |
| "learning_rate": 0.0005729406220546653, |
| "loss": 3.5999, |
| "step": 6800 |
| }, |
| { |
| "epoch": 1.8435615848406546, |
| "grad_norm": 0.392206609249115, |
| "learning_rate": 0.0005727386562542076, |
| "loss": 3.5989, |
| "step": 6850 |
| }, |
| { |
| "epoch": 1.8570198105081825, |
| "grad_norm": 0.39812007546424866, |
| "learning_rate": 0.0005725366904537497, |
| "loss": 3.587, |
| "step": 6900 |
| }, |
| { |
| "epoch": 1.8704780361757107, |
| "grad_norm": 0.37985455989837646, |
| "learning_rate": 0.000572334724653292, |
| "loss": 3.609, |
| "step": 6950 |
| }, |
| { |
| "epoch": 1.8839362618432385, |
| "grad_norm": 0.41674208641052246, |
| "learning_rate": 0.0005721327588528342, |
| "loss": 3.5971, |
| "step": 7000 |
| }, |
| { |
| "epoch": 1.8839362618432385, |
| "eval_accuracy": 0.3611523607006071, |
| "eval_loss": 3.5647761821746826, |
| "eval_runtime": 53.6742, |
| "eval_samples_per_second": 335.58, |
| "eval_steps_per_second": 20.978, |
| "step": 7000 |
| }, |
| { |
| "epoch": 1.8973944875107667, |
| "grad_norm": 0.4201742112636566, |
| "learning_rate": 0.0005719307930523764, |
| "loss": 3.6045, |
| "step": 7050 |
| }, |
| { |
| "epoch": 1.9108527131782944, |
| "grad_norm": 0.35860392451286316, |
| "learning_rate": 0.0005717288272519186, |
| "loss": 3.5862, |
| "step": 7100 |
| }, |
| { |
| "epoch": 1.9243109388458226, |
| "grad_norm": 0.42538923025131226, |
| "learning_rate": 0.0005715268614514608, |
| "loss": 3.5803, |
| "step": 7150 |
| }, |
| { |
| "epoch": 1.9377691645133506, |
| "grad_norm": 0.3780660331249237, |
| "learning_rate": 0.000571324895651003, |
| "loss": 3.5995, |
| "step": 7200 |
| }, |
| { |
| "epoch": 1.9512273901808785, |
| "grad_norm": 0.37989741563796997, |
| "learning_rate": 0.0005711229298505453, |
| "loss": 3.5968, |
| "step": 7250 |
| }, |
| { |
| "epoch": 1.9646856158484065, |
| "grad_norm": 0.43703949451446533, |
| "learning_rate": 0.0005709209640500874, |
| "loss": 3.5722, |
| "step": 7300 |
| }, |
| { |
| "epoch": 1.9781438415159345, |
| "grad_norm": 0.4558578431606293, |
| "learning_rate": 0.0005707189982496297, |
| "loss": 3.5875, |
| "step": 7350 |
| }, |
| { |
| "epoch": 1.9916020671834627, |
| "grad_norm": 0.35950618982315063, |
| "learning_rate": 0.0005705170324491719, |
| "loss": 3.5881, |
| "step": 7400 |
| }, |
| { |
| "epoch": 2.00484496124031, |
| "grad_norm": 0.39500686526298523, |
| "learning_rate": 0.0005703150666487141, |
| "loss": 3.5579, |
| "step": 7450 |
| }, |
| { |
| "epoch": 2.018303186907838, |
| "grad_norm": 0.341329425573349, |
| "learning_rate": 0.0005701131008482563, |
| "loss": 3.4853, |
| "step": 7500 |
| }, |
| { |
| "epoch": 2.0317614125753662, |
| "grad_norm": 0.35838985443115234, |
| "learning_rate": 0.0005699111350477986, |
| "loss": 3.4963, |
| "step": 7550 |
| }, |
| { |
| "epoch": 2.045219638242894, |
| "grad_norm": 0.3362147808074951, |
| "learning_rate": 0.0005697091692473408, |
| "loss": 3.4946, |
| "step": 7600 |
| }, |
| { |
| "epoch": 2.058677863910422, |
| "grad_norm": 0.40855300426483154, |
| "learning_rate": 0.000569507203446883, |
| "loss": 3.5035, |
| "step": 7650 |
| }, |
| { |
| "epoch": 2.07213608957795, |
| "grad_norm": 0.38907238841056824, |
| "learning_rate": 0.0005693052376464252, |
| "loss": 3.5011, |
| "step": 7700 |
| }, |
| { |
| "epoch": 2.085594315245478, |
| "grad_norm": 0.39034557342529297, |
| "learning_rate": 0.0005691032718459673, |
| "loss": 3.4969, |
| "step": 7750 |
| }, |
| { |
| "epoch": 2.099052540913006, |
| "grad_norm": 0.41712623834609985, |
| "learning_rate": 0.0005689013060455096, |
| "loss": 3.5032, |
| "step": 7800 |
| }, |
| { |
| "epoch": 2.112510766580534, |
| "grad_norm": 0.39129939675331116, |
| "learning_rate": 0.0005686993402450518, |
| "loss": 3.4977, |
| "step": 7850 |
| }, |
| { |
| "epoch": 2.125968992248062, |
| "grad_norm": 0.36673828959465027, |
| "learning_rate": 0.000568497374444594, |
| "loss": 3.5007, |
| "step": 7900 |
| }, |
| { |
| "epoch": 2.13942721791559, |
| "grad_norm": 0.37993109226226807, |
| "learning_rate": 0.0005682954086441362, |
| "loss": 3.5074, |
| "step": 7950 |
| }, |
| { |
| "epoch": 2.152885443583118, |
| "grad_norm": 0.37266653776168823, |
| "learning_rate": 0.0005680934428436784, |
| "loss": 3.5112, |
| "step": 8000 |
| }, |
| { |
| "epoch": 2.152885443583118, |
| "eval_accuracy": 0.3640361741878539, |
| "eval_loss": 3.537073850631714, |
| "eval_runtime": 53.612, |
| "eval_samples_per_second": 335.97, |
| "eval_steps_per_second": 21.003, |
| "step": 8000 |
| }, |
| { |
| "epoch": 2.166343669250646, |
| "grad_norm": 0.369495153427124, |
| "learning_rate": 0.0005678914770432206, |
| "loss": 3.504, |
| "step": 8050 |
| }, |
| { |
| "epoch": 2.179801894918174, |
| "grad_norm": 0.3906343877315521, |
| "learning_rate": 0.0005676895112427629, |
| "loss": 3.4954, |
| "step": 8100 |
| }, |
| { |
| "epoch": 2.193260120585702, |
| "grad_norm": 0.4126552641391754, |
| "learning_rate": 0.000567487545442305, |
| "loss": 3.5038, |
| "step": 8150 |
| }, |
| { |
| "epoch": 2.20671834625323, |
| "grad_norm": 0.3587755560874939, |
| "learning_rate": 0.0005672855796418473, |
| "loss": 3.5045, |
| "step": 8200 |
| }, |
| { |
| "epoch": 2.220176571920758, |
| "grad_norm": 0.3550557792186737, |
| "learning_rate": 0.0005670836138413894, |
| "loss": 3.5078, |
| "step": 8250 |
| }, |
| { |
| "epoch": 2.233634797588286, |
| "grad_norm": 0.38642674684524536, |
| "learning_rate": 0.0005668816480409317, |
| "loss": 3.506, |
| "step": 8300 |
| }, |
| { |
| "epoch": 2.2470930232558137, |
| "grad_norm": 0.3594475984573364, |
| "learning_rate": 0.0005666796822404739, |
| "loss": 3.4996, |
| "step": 8350 |
| }, |
| { |
| "epoch": 2.260551248923342, |
| "grad_norm": 0.3822736144065857, |
| "learning_rate": 0.0005664777164400162, |
| "loss": 3.5008, |
| "step": 8400 |
| }, |
| { |
| "epoch": 2.27400947459087, |
| "grad_norm": 0.3965492844581604, |
| "learning_rate": 0.0005662757506395583, |
| "loss": 3.5014, |
| "step": 8450 |
| }, |
| { |
| "epoch": 2.287467700258398, |
| "grad_norm": 0.3902672231197357, |
| "learning_rate": 0.0005660737848391006, |
| "loss": 3.5048, |
| "step": 8500 |
| }, |
| { |
| "epoch": 2.300925925925926, |
| "grad_norm": 0.38660308718681335, |
| "learning_rate": 0.0005658718190386428, |
| "loss": 3.5106, |
| "step": 8550 |
| }, |
| { |
| "epoch": 2.314384151593454, |
| "grad_norm": 0.38404619693756104, |
| "learning_rate": 0.000565669853238185, |
| "loss": 3.5119, |
| "step": 8600 |
| }, |
| { |
| "epoch": 2.327842377260982, |
| "grad_norm": 0.385078102350235, |
| "learning_rate": 0.0005654678874377272, |
| "loss": 3.5057, |
| "step": 8650 |
| }, |
| { |
| "epoch": 2.3413006029285097, |
| "grad_norm": 0.3999466300010681, |
| "learning_rate": 0.0005652659216372693, |
| "loss": 3.5044, |
| "step": 8700 |
| }, |
| { |
| "epoch": 2.354758828596038, |
| "grad_norm": 0.36590397357940674, |
| "learning_rate": 0.0005650639558368116, |
| "loss": 3.4961, |
| "step": 8750 |
| }, |
| { |
| "epoch": 2.3682170542635657, |
| "grad_norm": 0.38425323367118835, |
| "learning_rate": 0.0005648619900363538, |
| "loss": 3.5168, |
| "step": 8800 |
| }, |
| { |
| "epoch": 2.381675279931094, |
| "grad_norm": 0.3574206531047821, |
| "learning_rate": 0.000564660024235896, |
| "loss": 3.498, |
| "step": 8850 |
| }, |
| { |
| "epoch": 2.395133505598622, |
| "grad_norm": 0.3823363482952118, |
| "learning_rate": 0.0005644580584354382, |
| "loss": 3.4875, |
| "step": 8900 |
| }, |
| { |
| "epoch": 2.40859173126615, |
| "grad_norm": 0.3611487150192261, |
| "learning_rate": 0.0005642560926349804, |
| "loss": 3.4983, |
| "step": 8950 |
| }, |
| { |
| "epoch": 2.422049956933678, |
| "grad_norm": 0.38465991616249084, |
| "learning_rate": 0.0005640541268345226, |
| "loss": 3.4948, |
| "step": 9000 |
| }, |
| { |
| "epoch": 2.422049956933678, |
| "eval_accuracy": 0.36659046176217375, |
| "eval_loss": 3.5099785327911377, |
| "eval_runtime": 53.8716, |
| "eval_samples_per_second": 334.351, |
| "eval_steps_per_second": 20.902, |
| "step": 9000 |
| }, |
| { |
| "epoch": 2.4355081826012057, |
| "grad_norm": 0.38401660323143005, |
| "learning_rate": 0.0005638521610340649, |
| "loss": 3.5016, |
| "step": 9050 |
| }, |
| { |
| "epoch": 2.448966408268734, |
| "grad_norm": 0.36064672470092773, |
| "learning_rate": 0.000563650195233607, |
| "loss": 3.4943, |
| "step": 9100 |
| }, |
| { |
| "epoch": 2.4624246339362617, |
| "grad_norm": 0.39322274923324585, |
| "learning_rate": 0.0005634482294331493, |
| "loss": 3.4883, |
| "step": 9150 |
| }, |
| { |
| "epoch": 2.47588285960379, |
| "grad_norm": 0.36240771412849426, |
| "learning_rate": 0.0005632462636326914, |
| "loss": 3.4901, |
| "step": 9200 |
| }, |
| { |
| "epoch": 2.4893410852713176, |
| "grad_norm": 0.37943360209465027, |
| "learning_rate": 0.0005630442978322337, |
| "loss": 3.494, |
| "step": 9250 |
| }, |
| { |
| "epoch": 2.502799310938846, |
| "grad_norm": 0.36113718152046204, |
| "learning_rate": 0.0005628423320317759, |
| "loss": 3.4915, |
| "step": 9300 |
| }, |
| { |
| "epoch": 2.516257536606374, |
| "grad_norm": 0.3588191568851471, |
| "learning_rate": 0.0005626403662313182, |
| "loss": 3.4769, |
| "step": 9350 |
| }, |
| { |
| "epoch": 2.5297157622739017, |
| "grad_norm": 0.3674715459346771, |
| "learning_rate": 0.0005624384004308603, |
| "loss": 3.4824, |
| "step": 9400 |
| }, |
| { |
| "epoch": 2.54317398794143, |
| "grad_norm": 0.3795354962348938, |
| "learning_rate": 0.0005622364346304026, |
| "loss": 3.4977, |
| "step": 9450 |
| }, |
| { |
| "epoch": 2.5566322136089576, |
| "grad_norm": 0.3495054244995117, |
| "learning_rate": 0.0005620344688299448, |
| "loss": 3.4829, |
| "step": 9500 |
| }, |
| { |
| "epoch": 2.570090439276486, |
| "grad_norm": 0.3756536543369293, |
| "learning_rate": 0.0005618325030294869, |
| "loss": 3.4937, |
| "step": 9550 |
| }, |
| { |
| "epoch": 2.583548664944014, |
| "grad_norm": 0.33491694927215576, |
| "learning_rate": 0.0005616305372290292, |
| "loss": 3.4838, |
| "step": 9600 |
| }, |
| { |
| "epoch": 2.5970068906115418, |
| "grad_norm": 0.3515341877937317, |
| "learning_rate": 0.0005614285714285713, |
| "loss": 3.4769, |
| "step": 9650 |
| }, |
| { |
| "epoch": 2.6104651162790695, |
| "grad_norm": 0.35633155703544617, |
| "learning_rate": 0.0005612266056281136, |
| "loss": 3.482, |
| "step": 9700 |
| }, |
| { |
| "epoch": 2.6239233419465977, |
| "grad_norm": 0.4096840023994446, |
| "learning_rate": 0.0005610246398276558, |
| "loss": 3.4919, |
| "step": 9750 |
| }, |
| { |
| "epoch": 2.637381567614126, |
| "grad_norm": 0.38962793350219727, |
| "learning_rate": 0.000560822674027198, |
| "loss": 3.4777, |
| "step": 9800 |
| }, |
| { |
| "epoch": 2.6508397932816536, |
| "grad_norm": 0.35541465878486633, |
| "learning_rate": 0.0005606207082267402, |
| "loss": 3.4946, |
| "step": 9850 |
| }, |
| { |
| "epoch": 2.664298018949182, |
| "grad_norm": 0.36673033237457275, |
| "learning_rate": 0.0005604187424262824, |
| "loss": 3.478, |
| "step": 9900 |
| }, |
| { |
| "epoch": 2.6777562446167096, |
| "grad_norm": 0.38117632269859314, |
| "learning_rate": 0.0005602167766258246, |
| "loss": 3.4992, |
| "step": 9950 |
| }, |
| { |
| "epoch": 2.6912144702842378, |
| "grad_norm": 0.3540132939815521, |
| "learning_rate": 0.0005600148108253669, |
| "loss": 3.4936, |
| "step": 10000 |
| }, |
| { |
| "epoch": 2.6912144702842378, |
| "eval_accuracy": 0.36949480950512226, |
| "eval_loss": 3.4849045276641846, |
| "eval_runtime": 53.8648, |
| "eval_samples_per_second": 334.393, |
| "eval_steps_per_second": 20.904, |
| "step": 10000 |
| }, |
| { |
| "epoch": 2.704672695951766, |
| "grad_norm": 0.3549436330795288, |
| "learning_rate": 0.000559812845024909, |
| "loss": 3.482, |
| "step": 10050 |
| }, |
| { |
| "epoch": 2.7181309216192937, |
| "grad_norm": 0.3870752155780792, |
| "learning_rate": 0.0005596108792244513, |
| "loss": 3.4803, |
| "step": 10100 |
| }, |
| { |
| "epoch": 2.7315891472868215, |
| "grad_norm": 0.3666519522666931, |
| "learning_rate": 0.0005594089134239935, |
| "loss": 3.4813, |
| "step": 10150 |
| }, |
| { |
| "epoch": 2.7450473729543496, |
| "grad_norm": 0.3458859622478485, |
| "learning_rate": 0.0005592069476235358, |
| "loss": 3.4725, |
| "step": 10200 |
| }, |
| { |
| "epoch": 2.758505598621878, |
| "grad_norm": 0.3320566415786743, |
| "learning_rate": 0.0005590049818230779, |
| "loss": 3.4812, |
| "step": 10250 |
| }, |
| { |
| "epoch": 2.7719638242894056, |
| "grad_norm": 0.354028582572937, |
| "learning_rate": 0.0005588030160226202, |
| "loss": 3.4935, |
| "step": 10300 |
| }, |
| { |
| "epoch": 2.7854220499569338, |
| "grad_norm": 0.38568511605262756, |
| "learning_rate": 0.0005586010502221623, |
| "loss": 3.4781, |
| "step": 10350 |
| }, |
| { |
| "epoch": 2.7988802756244615, |
| "grad_norm": 0.4278452694416046, |
| "learning_rate": 0.0005583990844217045, |
| "loss": 3.4732, |
| "step": 10400 |
| }, |
| { |
| "epoch": 2.8123385012919897, |
| "grad_norm": 0.33488065004348755, |
| "learning_rate": 0.0005581971186212468, |
| "loss": 3.4761, |
| "step": 10450 |
| }, |
| { |
| "epoch": 2.825796726959518, |
| "grad_norm": 0.37788429856300354, |
| "learning_rate": 0.0005579951528207889, |
| "loss": 3.4835, |
| "step": 10500 |
| }, |
| { |
| "epoch": 2.8392549526270456, |
| "grad_norm": 0.3877081573009491, |
| "learning_rate": 0.0005577931870203312, |
| "loss": 3.4722, |
| "step": 10550 |
| }, |
| { |
| "epoch": 2.8527131782945734, |
| "grad_norm": 0.37572547793388367, |
| "learning_rate": 0.0005575912212198733, |
| "loss": 3.4701, |
| "step": 10600 |
| }, |
| { |
| "epoch": 2.8661714039621016, |
| "grad_norm": 0.38266733288764954, |
| "learning_rate": 0.0005573892554194156, |
| "loss": 3.4855, |
| "step": 10650 |
| }, |
| { |
| "epoch": 2.8796296296296298, |
| "grad_norm": 0.3513830304145813, |
| "learning_rate": 0.0005571872896189578, |
| "loss": 3.4572, |
| "step": 10700 |
| }, |
| { |
| "epoch": 2.8930878552971575, |
| "grad_norm": 0.3514174818992615, |
| "learning_rate": 0.0005569853238185, |
| "loss": 3.4694, |
| "step": 10750 |
| }, |
| { |
| "epoch": 2.9065460809646857, |
| "grad_norm": 0.3365142345428467, |
| "learning_rate": 0.0005567833580180422, |
| "loss": 3.4767, |
| "step": 10800 |
| }, |
| { |
| "epoch": 2.9200043066322134, |
| "grad_norm": 0.3384622037410736, |
| "learning_rate": 0.0005565813922175844, |
| "loss": 3.4679, |
| "step": 10850 |
| }, |
| { |
| "epoch": 2.9334625322997416, |
| "grad_norm": 0.3428475260734558, |
| "learning_rate": 0.0005563794264171266, |
| "loss": 3.4704, |
| "step": 10900 |
| }, |
| { |
| "epoch": 2.94692075796727, |
| "grad_norm": 0.3406570553779602, |
| "learning_rate": 0.0005561774606166689, |
| "loss": 3.4632, |
| "step": 10950 |
| }, |
| { |
| "epoch": 2.9603789836347976, |
| "grad_norm": 0.36909055709838867, |
| "learning_rate": 0.000555975494816211, |
| "loss": 3.4683, |
| "step": 11000 |
| }, |
| { |
| "epoch": 2.9603789836347976, |
| "eval_accuracy": 0.37134626057079584, |
| "eval_loss": 3.4629483222961426, |
| "eval_runtime": 53.8353, |
| "eval_samples_per_second": 334.576, |
| "eval_steps_per_second": 20.916, |
| "step": 11000 |
| }, |
| { |
| "epoch": 2.9738372093023253, |
| "grad_norm": 0.36576735973358154, |
| "learning_rate": 0.0005557735290157534, |
| "loss": 3.4682, |
| "step": 11050 |
| }, |
| { |
| "epoch": 2.9872954349698535, |
| "grad_norm": 0.3771063983440399, |
| "learning_rate": 0.0005555715632152955, |
| "loss": 3.4609, |
| "step": 11100 |
| }, |
| { |
| "epoch": 3.000538329026701, |
| "grad_norm": 0.3679291009902954, |
| "learning_rate": 0.0005553695974148378, |
| "loss": 3.4581, |
| "step": 11150 |
| }, |
| { |
| "epoch": 3.013996554694229, |
| "grad_norm": 0.4077318012714386, |
| "learning_rate": 0.0005551676316143799, |
| "loss": 3.3805, |
| "step": 11200 |
| }, |
| { |
| "epoch": 3.027454780361757, |
| "grad_norm": 0.3519296646118164, |
| "learning_rate": 0.0005549656658139222, |
| "loss": 3.3762, |
| "step": 11250 |
| }, |
| { |
| "epoch": 3.0409130060292853, |
| "grad_norm": 0.3560413718223572, |
| "learning_rate": 0.0005547637000134644, |
| "loss": 3.3698, |
| "step": 11300 |
| }, |
| { |
| "epoch": 3.054371231696813, |
| "grad_norm": 0.4313643276691437, |
| "learning_rate": 0.0005545617342130065, |
| "loss": 3.3794, |
| "step": 11350 |
| }, |
| { |
| "epoch": 3.067829457364341, |
| "grad_norm": 0.3495795726776123, |
| "learning_rate": 0.0005543597684125488, |
| "loss": 3.3729, |
| "step": 11400 |
| }, |
| { |
| "epoch": 3.081287683031869, |
| "grad_norm": 0.37050846219062805, |
| "learning_rate": 0.0005541578026120909, |
| "loss": 3.3754, |
| "step": 11450 |
| }, |
| { |
| "epoch": 3.094745908699397, |
| "grad_norm": 0.364422470331192, |
| "learning_rate": 0.0005539558368116332, |
| "loss": 3.3805, |
| "step": 11500 |
| }, |
| { |
| "epoch": 3.108204134366925, |
| "grad_norm": 0.39504632353782654, |
| "learning_rate": 0.0005537538710111754, |
| "loss": 3.39, |
| "step": 11550 |
| }, |
| { |
| "epoch": 3.121662360034453, |
| "grad_norm": 0.3495160937309265, |
| "learning_rate": 0.0005535519052107176, |
| "loss": 3.3926, |
| "step": 11600 |
| }, |
| { |
| "epoch": 3.135120585701981, |
| "grad_norm": 0.3640858232975006, |
| "learning_rate": 0.0005533499394102598, |
| "loss": 3.3842, |
| "step": 11650 |
| }, |
| { |
| "epoch": 3.148578811369509, |
| "grad_norm": 0.35223206877708435, |
| "learning_rate": 0.000553147973609802, |
| "loss": 3.3832, |
| "step": 11700 |
| }, |
| { |
| "epoch": 3.162037037037037, |
| "grad_norm": 0.3473789691925049, |
| "learning_rate": 0.0005529460078093442, |
| "loss": 3.3901, |
| "step": 11750 |
| }, |
| { |
| "epoch": 3.175495262704565, |
| "grad_norm": 0.3820473253726959, |
| "learning_rate": 0.0005527440420088865, |
| "loss": 3.4031, |
| "step": 11800 |
| }, |
| { |
| "epoch": 3.188953488372093, |
| "grad_norm": 0.36691343784332275, |
| "learning_rate": 0.0005525420762084286, |
| "loss": 3.3887, |
| "step": 11850 |
| }, |
| { |
| "epoch": 3.202411714039621, |
| "grad_norm": 0.3371462821960449, |
| "learning_rate": 0.0005523401104079709, |
| "loss": 3.3819, |
| "step": 11900 |
| }, |
| { |
| "epoch": 3.215869939707149, |
| "grad_norm": 0.34302136301994324, |
| "learning_rate": 0.0005521381446075131, |
| "loss": 3.4062, |
| "step": 11950 |
| }, |
| { |
| "epoch": 3.229328165374677, |
| "grad_norm": 0.3512645363807678, |
| "learning_rate": 0.0005519361788070554, |
| "loss": 3.3893, |
| "step": 12000 |
| }, |
| { |
| "epoch": 3.229328165374677, |
| "eval_accuracy": 0.373407834655131, |
| "eval_loss": 3.449765205383301, |
| "eval_runtime": 53.7784, |
| "eval_samples_per_second": 334.93, |
| "eval_steps_per_second": 20.938, |
| "step": 12000 |
| }, |
| { |
| "epoch": 3.242786391042205, |
| "grad_norm": 0.3671615719795227, |
| "learning_rate": 0.0005517342130065975, |
| "loss": 3.4015, |
| "step": 12050 |
| }, |
| { |
| "epoch": 3.2562446167097328, |
| "grad_norm": 0.33293598890304565, |
| "learning_rate": 0.0005515322472061398, |
| "loss": 3.3848, |
| "step": 12100 |
| }, |
| { |
| "epoch": 3.269702842377261, |
| "grad_norm": 0.3794623017311096, |
| "learning_rate": 0.0005513302814056819, |
| "loss": 3.3976, |
| "step": 12150 |
| }, |
| { |
| "epoch": 3.283161068044789, |
| "grad_norm": 0.3835780918598175, |
| "learning_rate": 0.0005511283156052242, |
| "loss": 3.3949, |
| "step": 12200 |
| }, |
| { |
| "epoch": 3.296619293712317, |
| "grad_norm": 0.35747066140174866, |
| "learning_rate": 0.0005509263498047664, |
| "loss": 3.3953, |
| "step": 12250 |
| }, |
| { |
| "epoch": 3.310077519379845, |
| "grad_norm": 0.34848782420158386, |
| "learning_rate": 0.0005507243840043085, |
| "loss": 3.3953, |
| "step": 12300 |
| }, |
| { |
| "epoch": 3.323535745047373, |
| "grad_norm": 0.34142157435417175, |
| "learning_rate": 0.0005505224182038508, |
| "loss": 3.4, |
| "step": 12350 |
| }, |
| { |
| "epoch": 3.336993970714901, |
| "grad_norm": 0.3376274108886719, |
| "learning_rate": 0.0005503204524033929, |
| "loss": 3.39, |
| "step": 12400 |
| }, |
| { |
| "epoch": 3.3504521963824287, |
| "grad_norm": 0.3726096749305725, |
| "learning_rate": 0.0005501184866029352, |
| "loss": 3.3992, |
| "step": 12450 |
| }, |
| { |
| "epoch": 3.363910422049957, |
| "grad_norm": 0.37750956416130066, |
| "learning_rate": 0.0005499165208024774, |
| "loss": 3.3888, |
| "step": 12500 |
| }, |
| { |
| "epoch": 3.3773686477174847, |
| "grad_norm": 0.35686615109443665, |
| "learning_rate": 0.0005497145550020196, |
| "loss": 3.3925, |
| "step": 12550 |
| }, |
| { |
| "epoch": 3.390826873385013, |
| "grad_norm": 0.3921195864677429, |
| "learning_rate": 0.0005495125892015618, |
| "loss": 3.3896, |
| "step": 12600 |
| }, |
| { |
| "epoch": 3.404285099052541, |
| "grad_norm": 0.3748328387737274, |
| "learning_rate": 0.000549310623401104, |
| "loss": 3.3959, |
| "step": 12650 |
| }, |
| { |
| "epoch": 3.417743324720069, |
| "grad_norm": 0.35698792338371277, |
| "learning_rate": 0.0005491086576006462, |
| "loss": 3.4172, |
| "step": 12700 |
| }, |
| { |
| "epoch": 3.431201550387597, |
| "grad_norm": 0.34931182861328125, |
| "learning_rate": 0.0005489066918001885, |
| "loss": 3.4007, |
| "step": 12750 |
| }, |
| { |
| "epoch": 3.4446597760551247, |
| "grad_norm": 0.36840617656707764, |
| "learning_rate": 0.0005487047259997306, |
| "loss": 3.3995, |
| "step": 12800 |
| }, |
| { |
| "epoch": 3.458118001722653, |
| "grad_norm": 0.37831541895866394, |
| "learning_rate": 0.0005485027601992729, |
| "loss": 3.4005, |
| "step": 12850 |
| }, |
| { |
| "epoch": 3.471576227390181, |
| "grad_norm": 0.3618316054344177, |
| "learning_rate": 0.0005483007943988151, |
| "loss": 3.4051, |
| "step": 12900 |
| }, |
| { |
| "epoch": 3.485034453057709, |
| "grad_norm": 0.3680399954319, |
| "learning_rate": 0.0005480988285983574, |
| "loss": 3.3998, |
| "step": 12950 |
| }, |
| { |
| "epoch": 3.4984926787252366, |
| "grad_norm": 0.36591285467147827, |
| "learning_rate": 0.0005478968627978995, |
| "loss": 3.3915, |
| "step": 13000 |
| }, |
| { |
| "epoch": 3.4984926787252366, |
| "eval_accuracy": 0.37471398715272664, |
| "eval_loss": 3.436342239379883, |
| "eval_runtime": 53.7514, |
| "eval_samples_per_second": 335.098, |
| "eval_steps_per_second": 20.948, |
| "step": 13000 |
| }, |
| { |
| "epoch": 3.511950904392765, |
| "grad_norm": 0.34238138794898987, |
| "learning_rate": 0.0005476948969974418, |
| "loss": 3.4124, |
| "step": 13050 |
| }, |
| { |
| "epoch": 3.525409130060293, |
| "grad_norm": 0.36642181873321533, |
| "learning_rate": 0.0005474929311969839, |
| "loss": 3.3979, |
| "step": 13100 |
| }, |
| { |
| "epoch": 3.5388673557278207, |
| "grad_norm": 0.378031462430954, |
| "learning_rate": 0.0005472909653965261, |
| "loss": 3.3842, |
| "step": 13150 |
| }, |
| { |
| "epoch": 3.552325581395349, |
| "grad_norm": 0.34340566396713257, |
| "learning_rate": 0.0005470889995960684, |
| "loss": 3.3945, |
| "step": 13200 |
| }, |
| { |
| "epoch": 3.5657838070628767, |
| "grad_norm": 0.37123072147369385, |
| "learning_rate": 0.0005468870337956105, |
| "loss": 3.4008, |
| "step": 13250 |
| }, |
| { |
| "epoch": 3.579242032730405, |
| "grad_norm": 0.38739728927612305, |
| "learning_rate": 0.0005466850679951528, |
| "loss": 3.4049, |
| "step": 13300 |
| }, |
| { |
| "epoch": 3.592700258397933, |
| "grad_norm": 0.36094415187835693, |
| "learning_rate": 0.0005464831021946949, |
| "loss": 3.408, |
| "step": 13350 |
| }, |
| { |
| "epoch": 3.606158484065461, |
| "grad_norm": 0.35295525193214417, |
| "learning_rate": 0.0005462811363942372, |
| "loss": 3.3991, |
| "step": 13400 |
| }, |
| { |
| "epoch": 3.6196167097329885, |
| "grad_norm": 0.35480549931526184, |
| "learning_rate": 0.0005460791705937794, |
| "loss": 3.4071, |
| "step": 13450 |
| }, |
| { |
| "epoch": 3.6330749354005167, |
| "grad_norm": 0.35453832149505615, |
| "learning_rate": 0.0005458772047933216, |
| "loss": 3.3911, |
| "step": 13500 |
| }, |
| { |
| "epoch": 3.646533161068045, |
| "grad_norm": 0.37023717164993286, |
| "learning_rate": 0.0005456752389928638, |
| "loss": 3.3983, |
| "step": 13550 |
| }, |
| { |
| "epoch": 3.6599913867355727, |
| "grad_norm": 0.35451245307922363, |
| "learning_rate": 0.000545473273192406, |
| "loss": 3.3998, |
| "step": 13600 |
| }, |
| { |
| "epoch": 3.673449612403101, |
| "grad_norm": 0.35649922490119934, |
| "learning_rate": 0.0005452713073919482, |
| "loss": 3.3961, |
| "step": 13650 |
| }, |
| { |
| "epoch": 3.6869078380706286, |
| "grad_norm": 0.36602070927619934, |
| "learning_rate": 0.0005450693415914905, |
| "loss": 3.3947, |
| "step": 13700 |
| }, |
| { |
| "epoch": 3.700366063738157, |
| "grad_norm": 0.37440329790115356, |
| "learning_rate": 0.0005448673757910327, |
| "loss": 3.3986, |
| "step": 13750 |
| }, |
| { |
| "epoch": 3.713824289405685, |
| "grad_norm": 0.35884660482406616, |
| "learning_rate": 0.0005446654099905749, |
| "loss": 3.3951, |
| "step": 13800 |
| }, |
| { |
| "epoch": 3.7272825150732127, |
| "grad_norm": 0.3605027496814728, |
| "learning_rate": 0.0005444634441901171, |
| "loss": 3.3956, |
| "step": 13850 |
| }, |
| { |
| "epoch": 3.7407407407407405, |
| "grad_norm": 0.3374119997024536, |
| "learning_rate": 0.0005442614783896594, |
| "loss": 3.3976, |
| "step": 13900 |
| }, |
| { |
| "epoch": 3.7541989664082687, |
| "grad_norm": 0.34979817271232605, |
| "learning_rate": 0.0005440595125892015, |
| "loss": 3.3934, |
| "step": 13950 |
| }, |
| { |
| "epoch": 3.767657192075797, |
| "grad_norm": 0.38936689496040344, |
| "learning_rate": 0.0005438575467887438, |
| "loss": 3.3858, |
| "step": 14000 |
| }, |
| { |
| "epoch": 3.767657192075797, |
| "eval_accuracy": 0.37646504852385865, |
| "eval_loss": 3.4231066703796387, |
| "eval_runtime": 53.7446, |
| "eval_samples_per_second": 335.141, |
| "eval_steps_per_second": 20.951, |
| "step": 14000 |
| }, |
| { |
| "epoch": 3.7811154177433246, |
| "grad_norm": 0.3927522301673889, |
| "learning_rate": 0.0005436555809882859, |
| "loss": 3.3889, |
| "step": 14050 |
| }, |
| { |
| "epoch": 3.794573643410853, |
| "grad_norm": 0.35339125990867615, |
| "learning_rate": 0.0005434536151878281, |
| "loss": 3.4018, |
| "step": 14100 |
| }, |
| { |
| "epoch": 3.8080318690783805, |
| "grad_norm": 0.3210137188434601, |
| "learning_rate": 0.0005432516493873704, |
| "loss": 3.3906, |
| "step": 14150 |
| }, |
| { |
| "epoch": 3.8214900947459087, |
| "grad_norm": 0.35777273774147034, |
| "learning_rate": 0.0005430496835869125, |
| "loss": 3.3963, |
| "step": 14200 |
| }, |
| { |
| "epoch": 3.834948320413437, |
| "grad_norm": 0.358101487159729, |
| "learning_rate": 0.0005428477177864548, |
| "loss": 3.4034, |
| "step": 14250 |
| }, |
| { |
| "epoch": 3.8484065460809647, |
| "grad_norm": 0.35471367835998535, |
| "learning_rate": 0.0005426457519859969, |
| "loss": 3.4062, |
| "step": 14300 |
| }, |
| { |
| "epoch": 3.8618647717484924, |
| "grad_norm": 0.3749210834503174, |
| "learning_rate": 0.0005424437861855392, |
| "loss": 3.3965, |
| "step": 14350 |
| }, |
| { |
| "epoch": 3.8753229974160206, |
| "grad_norm": 0.3748970627784729, |
| "learning_rate": 0.0005422418203850814, |
| "loss": 3.3907, |
| "step": 14400 |
| }, |
| { |
| "epoch": 3.888781223083549, |
| "grad_norm": 0.3373413681983948, |
| "learning_rate": 0.0005420398545846236, |
| "loss": 3.4029, |
| "step": 14450 |
| }, |
| { |
| "epoch": 3.9022394487510765, |
| "grad_norm": 0.3378717005252838, |
| "learning_rate": 0.0005418378887841658, |
| "loss": 3.3926, |
| "step": 14500 |
| }, |
| { |
| "epoch": 3.9156976744186047, |
| "grad_norm": 0.33396196365356445, |
| "learning_rate": 0.000541635922983708, |
| "loss": 3.4127, |
| "step": 14550 |
| }, |
| { |
| "epoch": 3.9291559000861325, |
| "grad_norm": 0.3631775379180908, |
| "learning_rate": 0.0005414339571832503, |
| "loss": 3.3993, |
| "step": 14600 |
| }, |
| { |
| "epoch": 3.9426141257536607, |
| "grad_norm": 0.3275887966156006, |
| "learning_rate": 0.0005412319913827925, |
| "loss": 3.3836, |
| "step": 14650 |
| }, |
| { |
| "epoch": 3.956072351421189, |
| "grad_norm": 0.336851567029953, |
| "learning_rate": 0.0005410300255823347, |
| "loss": 3.3843, |
| "step": 14700 |
| }, |
| { |
| "epoch": 3.9695305770887166, |
| "grad_norm": 0.3692421019077301, |
| "learning_rate": 0.0005408280597818769, |
| "loss": 3.389, |
| "step": 14750 |
| }, |
| { |
| "epoch": 3.9829888027562443, |
| "grad_norm": 0.35839107632637024, |
| "learning_rate": 0.0005406260939814191, |
| "loss": 3.3787, |
| "step": 14800 |
| }, |
| { |
| "epoch": 3.9964470284237725, |
| "grad_norm": 0.34140458703041077, |
| "learning_rate": 0.0005404241281809614, |
| "loss": 3.385, |
| "step": 14850 |
| }, |
| { |
| "epoch": 4.00968992248062, |
| "grad_norm": 0.37650322914123535, |
| "learning_rate": 0.0005402221623805035, |
| "loss": 3.3252, |
| "step": 14900 |
| }, |
| { |
| "epoch": 4.023148148148148, |
| "grad_norm": 0.3383863866329193, |
| "learning_rate": 0.0005400201965800457, |
| "loss": 3.2848, |
| "step": 14950 |
| }, |
| { |
| "epoch": 4.036606373815676, |
| "grad_norm": 0.35266733169555664, |
| "learning_rate": 0.000539818230779588, |
| "loss": 3.3038, |
| "step": 15000 |
| }, |
| { |
| "epoch": 4.036606373815676, |
| "eval_accuracy": 0.3777076426109491, |
| "eval_loss": 3.412649154663086, |
| "eval_runtime": 53.7186, |
| "eval_samples_per_second": 335.303, |
| "eval_steps_per_second": 20.961, |
| "step": 15000 |
| }, |
| { |
| "epoch": 4.050064599483204, |
| "grad_norm": 0.3664638102054596, |
| "learning_rate": 0.0005396162649791301, |
| "loss": 3.3013, |
| "step": 15050 |
| }, |
| { |
| "epoch": 4.0635228251507325, |
| "grad_norm": 0.3443576991558075, |
| "learning_rate": 0.0005394142991786724, |
| "loss": 3.306, |
| "step": 15100 |
| }, |
| { |
| "epoch": 4.07698105081826, |
| "grad_norm": 0.3623943030834198, |
| "learning_rate": 0.0005392123333782145, |
| "loss": 3.3083, |
| "step": 15150 |
| }, |
| { |
| "epoch": 4.090439276485788, |
| "grad_norm": 0.345225989818573, |
| "learning_rate": 0.0005390103675777568, |
| "loss": 3.3135, |
| "step": 15200 |
| }, |
| { |
| "epoch": 4.103897502153316, |
| "grad_norm": 0.3902343809604645, |
| "learning_rate": 0.000538808401777299, |
| "loss": 3.3166, |
| "step": 15250 |
| }, |
| { |
| "epoch": 4.117355727820844, |
| "grad_norm": 0.3565817177295685, |
| "learning_rate": 0.0005386064359768412, |
| "loss": 3.3177, |
| "step": 15300 |
| }, |
| { |
| "epoch": 4.1308139534883725, |
| "grad_norm": 0.33238816261291504, |
| "learning_rate": 0.0005384044701763834, |
| "loss": 3.3145, |
| "step": 15350 |
| }, |
| { |
| "epoch": 4.1442721791559, |
| "grad_norm": 0.36902859807014465, |
| "learning_rate": 0.0005382025043759256, |
| "loss": 3.3058, |
| "step": 15400 |
| }, |
| { |
| "epoch": 4.157730404823428, |
| "grad_norm": 0.349520742893219, |
| "learning_rate": 0.0005380005385754678, |
| "loss": 3.3161, |
| "step": 15450 |
| }, |
| { |
| "epoch": 4.171188630490956, |
| "grad_norm": 0.39129889011383057, |
| "learning_rate": 0.0005377985727750101, |
| "loss": 3.3209, |
| "step": 15500 |
| }, |
| { |
| "epoch": 4.184646856158484, |
| "grad_norm": 0.36261945962905884, |
| "learning_rate": 0.0005375966069745523, |
| "loss": 3.3263, |
| "step": 15550 |
| }, |
| { |
| "epoch": 4.198105081826012, |
| "grad_norm": 0.3307056725025177, |
| "learning_rate": 0.0005373946411740945, |
| "loss": 3.3133, |
| "step": 15600 |
| }, |
| { |
| "epoch": 4.21156330749354, |
| "grad_norm": 0.3555365800857544, |
| "learning_rate": 0.0005371926753736367, |
| "loss": 3.3177, |
| "step": 15650 |
| }, |
| { |
| "epoch": 4.225021533161068, |
| "grad_norm": 0.3683795630931854, |
| "learning_rate": 0.000536990709573179, |
| "loss": 3.3199, |
| "step": 15700 |
| }, |
| { |
| "epoch": 4.238479758828596, |
| "grad_norm": 0.3561367988586426, |
| "learning_rate": 0.0005367887437727211, |
| "loss": 3.3146, |
| "step": 15750 |
| }, |
| { |
| "epoch": 4.251937984496124, |
| "grad_norm": 0.3561237156391144, |
| "learning_rate": 0.0005365867779722634, |
| "loss": 3.3396, |
| "step": 15800 |
| }, |
| { |
| "epoch": 4.265396210163652, |
| "grad_norm": 0.3543408215045929, |
| "learning_rate": 0.0005363848121718055, |
| "loss": 3.3308, |
| "step": 15850 |
| }, |
| { |
| "epoch": 4.27885443583118, |
| "grad_norm": 0.36262819170951843, |
| "learning_rate": 0.0005361828463713477, |
| "loss": 3.3253, |
| "step": 15900 |
| }, |
| { |
| "epoch": 4.292312661498708, |
| "grad_norm": 0.36303988099098206, |
| "learning_rate": 0.00053598088057089, |
| "loss": 3.3338, |
| "step": 15950 |
| }, |
| { |
| "epoch": 4.305770887166236, |
| "grad_norm": 0.34338095784187317, |
| "learning_rate": 0.0005357789147704321, |
| "loss": 3.3297, |
| "step": 16000 |
| }, |
| { |
| "epoch": 4.305770887166236, |
| "eval_accuracy": 0.3787833551278926, |
| "eval_loss": 3.404193162918091, |
| "eval_runtime": 53.6662, |
| "eval_samples_per_second": 335.63, |
| "eval_steps_per_second": 20.982, |
| "step": 16000 |
| }, |
| { |
| "epoch": 4.319229112833764, |
| "grad_norm": 0.3415316939353943, |
| "learning_rate": 0.0005355769489699744, |
| "loss": 3.3426, |
| "step": 16050 |
| }, |
| { |
| "epoch": 4.332687338501292, |
| "grad_norm": 0.34920433163642883, |
| "learning_rate": 0.0005353749831695165, |
| "loss": 3.3309, |
| "step": 16100 |
| }, |
| { |
| "epoch": 4.34614556416882, |
| "grad_norm": 0.3775346875190735, |
| "learning_rate": 0.0005351730173690588, |
| "loss": 3.329, |
| "step": 16150 |
| }, |
| { |
| "epoch": 4.359603789836348, |
| "grad_norm": 0.34089094400405884, |
| "learning_rate": 0.000534971051568601, |
| "loss": 3.311, |
| "step": 16200 |
| }, |
| { |
| "epoch": 4.373062015503876, |
| "grad_norm": 0.3913591206073761, |
| "learning_rate": 0.0005347690857681432, |
| "loss": 3.3375, |
| "step": 16250 |
| }, |
| { |
| "epoch": 4.386520241171404, |
| "grad_norm": 0.3345584273338318, |
| "learning_rate": 0.0005345671199676854, |
| "loss": 3.3189, |
| "step": 16300 |
| }, |
| { |
| "epoch": 4.399978466838932, |
| "grad_norm": 0.3587518632411957, |
| "learning_rate": 0.0005343651541672276, |
| "loss": 3.3273, |
| "step": 16350 |
| }, |
| { |
| "epoch": 4.41343669250646, |
| "grad_norm": 0.35501107573509216, |
| "learning_rate": 0.0005341631883667699, |
| "loss": 3.3295, |
| "step": 16400 |
| }, |
| { |
| "epoch": 4.426894918173988, |
| "grad_norm": 0.36069580912590027, |
| "learning_rate": 0.0005339612225663121, |
| "loss": 3.3267, |
| "step": 16450 |
| }, |
| { |
| "epoch": 4.440353143841516, |
| "grad_norm": 0.36240604519844055, |
| "learning_rate": 0.0005337592567658543, |
| "loss": 3.3333, |
| "step": 16500 |
| }, |
| { |
| "epoch": 4.453811369509044, |
| "grad_norm": 0.35791751742362976, |
| "learning_rate": 0.0005335572909653965, |
| "loss": 3.3298, |
| "step": 16550 |
| }, |
| { |
| "epoch": 4.467269595176572, |
| "grad_norm": 0.3455749750137329, |
| "learning_rate": 0.0005333553251649387, |
| "loss": 3.3251, |
| "step": 16600 |
| }, |
| { |
| "epoch": 4.4807278208441, |
| "grad_norm": 0.3759973347187042, |
| "learning_rate": 0.000533153359364481, |
| "loss": 3.3341, |
| "step": 16650 |
| }, |
| { |
| "epoch": 4.4941860465116275, |
| "grad_norm": 0.3809243440628052, |
| "learning_rate": 0.0005329513935640231, |
| "loss": 3.3308, |
| "step": 16700 |
| }, |
| { |
| "epoch": 4.507644272179156, |
| "grad_norm": 0.3631037771701813, |
| "learning_rate": 0.0005327494277635654, |
| "loss": 3.3494, |
| "step": 16750 |
| }, |
| { |
| "epoch": 4.521102497846684, |
| "grad_norm": 0.3409591317176819, |
| "learning_rate": 0.0005325474619631075, |
| "loss": 3.3306, |
| "step": 16800 |
| }, |
| { |
| "epoch": 4.534560723514212, |
| "grad_norm": 0.3606366813182831, |
| "learning_rate": 0.0005323454961626497, |
| "loss": 3.3328, |
| "step": 16850 |
| }, |
| { |
| "epoch": 4.54801894918174, |
| "grad_norm": 0.33501338958740234, |
| "learning_rate": 0.000532143530362192, |
| "loss": 3.3409, |
| "step": 16900 |
| }, |
| { |
| "epoch": 4.5614771748492675, |
| "grad_norm": 0.3617742657661438, |
| "learning_rate": 0.0005319415645617341, |
| "loss": 3.3329, |
| "step": 16950 |
| }, |
| { |
| "epoch": 4.574935400516796, |
| "grad_norm": 0.8550599813461304, |
| "learning_rate": 0.0005317395987612764, |
| "loss": 3.3282, |
| "step": 17000 |
| }, |
| { |
| "epoch": 4.574935400516796, |
| "eval_accuracy": 0.3796109182267269, |
| "eval_loss": 3.39310884475708, |
| "eval_runtime": 53.8374, |
| "eval_samples_per_second": 334.563, |
| "eval_steps_per_second": 20.915, |
| "step": 17000 |
| }, |
| { |
| "epoch": 4.588393626184324, |
| "grad_norm": 0.35077497363090515, |
| "learning_rate": 0.0005315376329608185, |
| "loss": 3.3273, |
| "step": 17050 |
| }, |
| { |
| "epoch": 4.601851851851852, |
| "grad_norm": 0.3806801438331604, |
| "learning_rate": 0.0005313356671603608, |
| "loss": 3.3338, |
| "step": 17100 |
| }, |
| { |
| "epoch": 4.61531007751938, |
| "grad_norm": 0.3507062792778015, |
| "learning_rate": 0.000531133701359903, |
| "loss": 3.3406, |
| "step": 17150 |
| }, |
| { |
| "epoch": 4.628768303186908, |
| "grad_norm": 0.35898759961128235, |
| "learning_rate": 0.0005309317355594452, |
| "loss": 3.3315, |
| "step": 17200 |
| }, |
| { |
| "epoch": 4.642226528854436, |
| "grad_norm": 0.3401118814945221, |
| "learning_rate": 0.0005307297697589874, |
| "loss": 3.329, |
| "step": 17250 |
| }, |
| { |
| "epoch": 4.655684754521964, |
| "grad_norm": 0.38454142212867737, |
| "learning_rate": 0.0005305278039585297, |
| "loss": 3.3373, |
| "step": 17300 |
| }, |
| { |
| "epoch": 4.669142980189492, |
| "grad_norm": 0.3205120265483856, |
| "learning_rate": 0.0005303258381580719, |
| "loss": 3.331, |
| "step": 17350 |
| }, |
| { |
| "epoch": 4.682601205857019, |
| "grad_norm": 0.37140244245529175, |
| "learning_rate": 0.0005301238723576141, |
| "loss": 3.3317, |
| "step": 17400 |
| }, |
| { |
| "epoch": 4.696059431524548, |
| "grad_norm": 0.34272000193595886, |
| "learning_rate": 0.0005299219065571563, |
| "loss": 3.3464, |
| "step": 17450 |
| }, |
| { |
| "epoch": 4.709517657192076, |
| "grad_norm": 0.3573205769062042, |
| "learning_rate": 0.0005297199407566985, |
| "loss": 3.3392, |
| "step": 17500 |
| }, |
| { |
| "epoch": 4.722975882859604, |
| "grad_norm": 0.3326584994792938, |
| "learning_rate": 0.0005295179749562407, |
| "loss": 3.3355, |
| "step": 17550 |
| }, |
| { |
| "epoch": 4.736434108527131, |
| "grad_norm": 0.33970969915390015, |
| "learning_rate": 0.000529316009155783, |
| "loss": 3.3304, |
| "step": 17600 |
| }, |
| { |
| "epoch": 4.7498923341946595, |
| "grad_norm": 0.34760308265686035, |
| "learning_rate": 0.0005291140433553251, |
| "loss": 3.3451, |
| "step": 17650 |
| }, |
| { |
| "epoch": 4.763350559862188, |
| "grad_norm": 0.34035566449165344, |
| "learning_rate": 0.0005289120775548673, |
| "loss": 3.3336, |
| "step": 17700 |
| }, |
| { |
| "epoch": 4.776808785529716, |
| "grad_norm": 0.36723145842552185, |
| "learning_rate": 0.0005287101117544095, |
| "loss": 3.3358, |
| "step": 17750 |
| }, |
| { |
| "epoch": 4.790267011197244, |
| "grad_norm": 0.3488785922527313, |
| "learning_rate": 0.0005285081459539517, |
| "loss": 3.3446, |
| "step": 17800 |
| }, |
| { |
| "epoch": 4.803725236864771, |
| "grad_norm": 0.33435118198394775, |
| "learning_rate": 0.000528306180153494, |
| "loss": 3.3367, |
| "step": 17850 |
| }, |
| { |
| "epoch": 4.8171834625323, |
| "grad_norm": 0.35024383664131165, |
| "learning_rate": 0.0005281042143530361, |
| "loss": 3.3474, |
| "step": 17900 |
| }, |
| { |
| "epoch": 4.830641688199828, |
| "grad_norm": 0.32620319724082947, |
| "learning_rate": 0.0005279022485525784, |
| "loss": 3.3375, |
| "step": 17950 |
| }, |
| { |
| "epoch": 4.844099913867356, |
| "grad_norm": 0.34913358092308044, |
| "learning_rate": 0.0005277002827521205, |
| "loss": 3.335, |
| "step": 18000 |
| }, |
| { |
| "epoch": 4.844099913867356, |
| "eval_accuracy": 0.3806844578065591, |
| "eval_loss": 3.3818206787109375, |
| "eval_runtime": 54.2008, |
| "eval_samples_per_second": 332.32, |
| "eval_steps_per_second": 20.775, |
| "step": 18000 |
| }, |
| { |
| "epoch": 4.857558139534884, |
| "grad_norm": 0.3346174955368042, |
| "learning_rate": 0.0005274983169516628, |
| "loss": 3.334, |
| "step": 18050 |
| }, |
| { |
| "epoch": 4.871016365202411, |
| "grad_norm": 0.3737140893936157, |
| "learning_rate": 0.000527296351151205, |
| "loss": 3.3378, |
| "step": 18100 |
| }, |
| { |
| "epoch": 4.88447459086994, |
| "grad_norm": 0.3536180853843689, |
| "learning_rate": 0.0005270943853507472, |
| "loss": 3.3514, |
| "step": 18150 |
| }, |
| { |
| "epoch": 4.897932816537468, |
| "grad_norm": 0.3579419255256653, |
| "learning_rate": 0.0005268924195502894, |
| "loss": 3.3251, |
| "step": 18200 |
| }, |
| { |
| "epoch": 4.911391042204996, |
| "grad_norm": 0.3723459541797638, |
| "learning_rate": 0.0005266904537498317, |
| "loss": 3.3348, |
| "step": 18250 |
| }, |
| { |
| "epoch": 4.924849267872523, |
| "grad_norm": 0.3395371735095978, |
| "learning_rate": 0.0005264884879493739, |
| "loss": 3.3409, |
| "step": 18300 |
| }, |
| { |
| "epoch": 4.9383074935400515, |
| "grad_norm": 0.3212871253490448, |
| "learning_rate": 0.0005262865221489161, |
| "loss": 3.3269, |
| "step": 18350 |
| }, |
| { |
| "epoch": 4.95176571920758, |
| "grad_norm": 0.33720704913139343, |
| "learning_rate": 0.0005260845563484583, |
| "loss": 3.3345, |
| "step": 18400 |
| }, |
| { |
| "epoch": 4.965223944875108, |
| "grad_norm": 0.38691216707229614, |
| "learning_rate": 0.0005258825905480005, |
| "loss": 3.3425, |
| "step": 18450 |
| }, |
| { |
| "epoch": 4.978682170542635, |
| "grad_norm": 0.3235993981361389, |
| "learning_rate": 0.0005256806247475427, |
| "loss": 3.3389, |
| "step": 18500 |
| }, |
| { |
| "epoch": 4.992140396210163, |
| "grad_norm": 0.33104822039604187, |
| "learning_rate": 0.000525478658947085, |
| "loss": 3.3442, |
| "step": 18550 |
| }, |
| { |
| "epoch": 5.0053832902670115, |
| "grad_norm": 0.3393186032772064, |
| "learning_rate": 0.0005252766931466271, |
| "loss": 3.2944, |
| "step": 18600 |
| }, |
| { |
| "epoch": 5.018841515934539, |
| "grad_norm": 0.37101826071739197, |
| "learning_rate": 0.0005250747273461693, |
| "loss": 3.2355, |
| "step": 18650 |
| }, |
| { |
| "epoch": 5.032299741602067, |
| "grad_norm": 0.3399945795536041, |
| "learning_rate": 0.0005248727615457115, |
| "loss": 3.2344, |
| "step": 18700 |
| }, |
| { |
| "epoch": 5.045757967269595, |
| "grad_norm": 0.35850459337234497, |
| "learning_rate": 0.0005246707957452537, |
| "loss": 3.2436, |
| "step": 18750 |
| }, |
| { |
| "epoch": 5.059216192937123, |
| "grad_norm": 0.35731905698776245, |
| "learning_rate": 0.000524468829944796, |
| "loss": 3.252, |
| "step": 18800 |
| }, |
| { |
| "epoch": 5.0726744186046515, |
| "grad_norm": 0.35466647148132324, |
| "learning_rate": 0.0005242668641443381, |
| "loss": 3.2462, |
| "step": 18850 |
| }, |
| { |
| "epoch": 5.086132644272179, |
| "grad_norm": 0.3400084376335144, |
| "learning_rate": 0.0005240648983438804, |
| "loss": 3.253, |
| "step": 18900 |
| }, |
| { |
| "epoch": 5.099590869939707, |
| "grad_norm": 0.3490995466709137, |
| "learning_rate": 0.0005238629325434225, |
| "loss": 3.2463, |
| "step": 18950 |
| }, |
| { |
| "epoch": 5.113049095607235, |
| "grad_norm": 0.3619450032711029, |
| "learning_rate": 0.0005236609667429648, |
| "loss": 3.2534, |
| "step": 19000 |
| }, |
| { |
| "epoch": 5.113049095607235, |
| "eval_accuracy": 0.38128494897726367, |
| "eval_loss": 3.382563591003418, |
| "eval_runtime": 53.8344, |
| "eval_samples_per_second": 334.582, |
| "eval_steps_per_second": 20.916, |
| "step": 19000 |
| }, |
| { |
| "epoch": 5.126507321274763, |
| "grad_norm": 0.36200740933418274, |
| "learning_rate": 0.000523459000942507, |
| "loss": 3.2613, |
| "step": 19050 |
| }, |
| { |
| "epoch": 5.139965546942291, |
| "grad_norm": 0.3502586781978607, |
| "learning_rate": 0.0005232570351420493, |
| "loss": 3.2516, |
| "step": 19100 |
| }, |
| { |
| "epoch": 5.153423772609819, |
| "grad_norm": 0.3829094171524048, |
| "learning_rate": 0.0005230550693415915, |
| "loss": 3.2667, |
| "step": 19150 |
| }, |
| { |
| "epoch": 5.166881998277347, |
| "grad_norm": 0.38154736161231995, |
| "learning_rate": 0.0005228531035411337, |
| "loss": 3.257, |
| "step": 19200 |
| }, |
| { |
| "epoch": 5.180340223944875, |
| "grad_norm": 0.3990512192249298, |
| "learning_rate": 0.0005226511377406759, |
| "loss": 3.2685, |
| "step": 19250 |
| }, |
| { |
| "epoch": 5.1937984496124034, |
| "grad_norm": 0.35598447918891907, |
| "learning_rate": 0.0005224491719402181, |
| "loss": 3.2561, |
| "step": 19300 |
| }, |
| { |
| "epoch": 5.207256675279931, |
| "grad_norm": 0.3600831925868988, |
| "learning_rate": 0.0005222472061397603, |
| "loss": 3.2696, |
| "step": 19350 |
| }, |
| { |
| "epoch": 5.220714900947459, |
| "grad_norm": 0.340609610080719, |
| "learning_rate": 0.0005220452403393026, |
| "loss": 3.2744, |
| "step": 19400 |
| }, |
| { |
| "epoch": 5.234173126614987, |
| "grad_norm": 0.32513388991355896, |
| "learning_rate": 0.0005218432745388447, |
| "loss": 3.2763, |
| "step": 19450 |
| }, |
| { |
| "epoch": 5.247631352282515, |
| "grad_norm": 0.33820098638534546, |
| "learning_rate": 0.000521641308738387, |
| "loss": 3.2638, |
| "step": 19500 |
| }, |
| { |
| "epoch": 5.2610895779500435, |
| "grad_norm": 0.3647639751434326, |
| "learning_rate": 0.0005214393429379291, |
| "loss": 3.2744, |
| "step": 19550 |
| }, |
| { |
| "epoch": 5.274547803617571, |
| "grad_norm": 0.36164987087249756, |
| "learning_rate": 0.0005212373771374713, |
| "loss": 3.2734, |
| "step": 19600 |
| }, |
| { |
| "epoch": 5.288006029285099, |
| "grad_norm": 0.3829108774662018, |
| "learning_rate": 0.0005210354113370136, |
| "loss": 3.2782, |
| "step": 19650 |
| }, |
| { |
| "epoch": 5.301464254952627, |
| "grad_norm": 0.3650548458099365, |
| "learning_rate": 0.0005208334455365557, |
| "loss": 3.2766, |
| "step": 19700 |
| }, |
| { |
| "epoch": 5.314922480620155, |
| "grad_norm": 0.37036004662513733, |
| "learning_rate": 0.000520631479736098, |
| "loss": 3.2846, |
| "step": 19750 |
| }, |
| { |
| "epoch": 5.328380706287683, |
| "grad_norm": 0.3291724920272827, |
| "learning_rate": 0.0005204295139356401, |
| "loss": 3.2703, |
| "step": 19800 |
| }, |
| { |
| "epoch": 5.341838931955211, |
| "grad_norm": 0.3196430206298828, |
| "learning_rate": 0.0005202275481351824, |
| "loss": 3.2797, |
| "step": 19850 |
| }, |
| { |
| "epoch": 5.355297157622739, |
| "grad_norm": 0.3584645092487335, |
| "learning_rate": 0.0005200255823347246, |
| "loss": 3.2801, |
| "step": 19900 |
| }, |
| { |
| "epoch": 5.368755383290267, |
| "grad_norm": 0.34693044424057007, |
| "learning_rate": 0.0005198236165342669, |
| "loss": 3.2844, |
| "step": 19950 |
| }, |
| { |
| "epoch": 5.3822136089577945, |
| "grad_norm": 0.3746398091316223, |
| "learning_rate": 0.000519621650733809, |
| "loss": 3.2736, |
| "step": 20000 |
| }, |
| { |
| "epoch": 5.3822136089577945, |
| "eval_accuracy": 0.38213033016041054, |
| "eval_loss": 3.3756988048553467, |
| "eval_runtime": 53.7995, |
| "eval_samples_per_second": 334.799, |
| "eval_steps_per_second": 20.93, |
| "step": 20000 |
| }, |
| { |
| "epoch": 5.395671834625323, |
| "grad_norm": 0.348362535238266, |
| "learning_rate": 0.0005194196849333513, |
| "loss": 3.2986, |
| "step": 20050 |
| }, |
| { |
| "epoch": 5.409130060292851, |
| "grad_norm": 0.37827175855636597, |
| "learning_rate": 0.0005192177191328935, |
| "loss": 3.2875, |
| "step": 20100 |
| }, |
| { |
| "epoch": 5.422588285960379, |
| "grad_norm": 0.3439246714115143, |
| "learning_rate": 0.0005190157533324357, |
| "loss": 3.2836, |
| "step": 20150 |
| }, |
| { |
| "epoch": 5.436046511627907, |
| "grad_norm": 0.37693148851394653, |
| "learning_rate": 0.0005188137875319779, |
| "loss": 3.2799, |
| "step": 20200 |
| }, |
| { |
| "epoch": 5.449504737295435, |
| "grad_norm": 0.3539809584617615, |
| "learning_rate": 0.00051861182173152, |
| "loss": 3.2826, |
| "step": 20250 |
| }, |
| { |
| "epoch": 5.462962962962963, |
| "grad_norm": 0.37011033296585083, |
| "learning_rate": 0.0005184098559310623, |
| "loss": 3.2864, |
| "step": 20300 |
| }, |
| { |
| "epoch": 5.476421188630491, |
| "grad_norm": 0.371745765209198, |
| "learning_rate": 0.0005182078901306046, |
| "loss": 3.2917, |
| "step": 20350 |
| }, |
| { |
| "epoch": 5.489879414298019, |
| "grad_norm": 0.3410128951072693, |
| "learning_rate": 0.0005180059243301467, |
| "loss": 3.2862, |
| "step": 20400 |
| }, |
| { |
| "epoch": 5.503337639965547, |
| "grad_norm": 0.33623674511909485, |
| "learning_rate": 0.000517803958529689, |
| "loss": 3.2809, |
| "step": 20450 |
| }, |
| { |
| "epoch": 5.516795865633075, |
| "grad_norm": 0.37878212332725525, |
| "learning_rate": 0.0005176019927292311, |
| "loss": 3.2855, |
| "step": 20500 |
| }, |
| { |
| "epoch": 5.530254091300603, |
| "grad_norm": 0.35254615545272827, |
| "learning_rate": 0.0005174000269287733, |
| "loss": 3.2833, |
| "step": 20550 |
| }, |
| { |
| "epoch": 5.543712316968131, |
| "grad_norm": 0.340909481048584, |
| "learning_rate": 0.0005171980611283156, |
| "loss": 3.2962, |
| "step": 20600 |
| }, |
| { |
| "epoch": 5.557170542635659, |
| "grad_norm": 0.3362419307231903, |
| "learning_rate": 0.0005169960953278577, |
| "loss": 3.2961, |
| "step": 20650 |
| }, |
| { |
| "epoch": 5.5706287683031865, |
| "grad_norm": 0.3751087486743927, |
| "learning_rate": 0.0005167941295274, |
| "loss": 3.2949, |
| "step": 20700 |
| }, |
| { |
| "epoch": 5.584086993970715, |
| "grad_norm": 0.3268432021141052, |
| "learning_rate": 0.0005165921637269421, |
| "loss": 3.2864, |
| "step": 20750 |
| }, |
| { |
| "epoch": 5.597545219638243, |
| "grad_norm": 0.33362266421318054, |
| "learning_rate": 0.0005163901979264844, |
| "loss": 3.2922, |
| "step": 20800 |
| }, |
| { |
| "epoch": 5.611003445305771, |
| "grad_norm": 0.3502647876739502, |
| "learning_rate": 0.0005161882321260266, |
| "loss": 3.2945, |
| "step": 20850 |
| }, |
| { |
| "epoch": 5.624461670973298, |
| "grad_norm": 0.3486431837081909, |
| "learning_rate": 0.0005159862663255689, |
| "loss": 3.2938, |
| "step": 20900 |
| }, |
| { |
| "epoch": 5.637919896640827, |
| "grad_norm": 0.38896527886390686, |
| "learning_rate": 0.000515784300525111, |
| "loss": 3.2806, |
| "step": 20950 |
| }, |
| { |
| "epoch": 5.651378122308355, |
| "grad_norm": 0.35072192549705505, |
| "learning_rate": 0.0005155823347246533, |
| "loss": 3.2934, |
| "step": 21000 |
| }, |
| { |
| "epoch": 5.651378122308355, |
| "eval_accuracy": 0.38315747753291673, |
| "eval_loss": 3.3653366565704346, |
| "eval_runtime": 53.8377, |
| "eval_samples_per_second": 334.561, |
| "eval_steps_per_second": 20.915, |
| "step": 21000 |
| }, |
| { |
| "epoch": 5.664836347975883, |
| "grad_norm": 0.33917105197906494, |
| "learning_rate": 0.0005153803689241955, |
| "loss": 3.2948, |
| "step": 21050 |
| }, |
| { |
| "epoch": 5.678294573643411, |
| "grad_norm": 0.39053285121917725, |
| "learning_rate": 0.0005151784031237377, |
| "loss": 3.2933, |
| "step": 21100 |
| }, |
| { |
| "epoch": 5.6917527993109385, |
| "grad_norm": 0.34076425433158875, |
| "learning_rate": 0.0005149764373232799, |
| "loss": 3.284, |
| "step": 21150 |
| }, |
| { |
| "epoch": 5.705211024978467, |
| "grad_norm": 0.3462291657924652, |
| "learning_rate": 0.000514774471522822, |
| "loss": 3.2927, |
| "step": 21200 |
| }, |
| { |
| "epoch": 5.718669250645995, |
| "grad_norm": 0.34847456216812134, |
| "learning_rate": 0.0005145725057223643, |
| "loss": 3.2929, |
| "step": 21250 |
| }, |
| { |
| "epoch": 5.732127476313523, |
| "grad_norm": 0.35302457213401794, |
| "learning_rate": 0.0005143705399219066, |
| "loss": 3.2986, |
| "step": 21300 |
| }, |
| { |
| "epoch": 5.745585701981051, |
| "grad_norm": 0.3497825562953949, |
| "learning_rate": 0.0005141685741214487, |
| "loss": 3.2843, |
| "step": 21350 |
| }, |
| { |
| "epoch": 5.7590439276485785, |
| "grad_norm": 0.34568512439727783, |
| "learning_rate": 0.0005139666083209909, |
| "loss": 3.286, |
| "step": 21400 |
| }, |
| { |
| "epoch": 5.772502153316107, |
| "grad_norm": 0.3420438766479492, |
| "learning_rate": 0.0005137646425205331, |
| "loss": 3.2816, |
| "step": 21450 |
| }, |
| { |
| "epoch": 5.785960378983635, |
| "grad_norm": 0.372644305229187, |
| "learning_rate": 0.0005135626767200753, |
| "loss": 3.2862, |
| "step": 21500 |
| }, |
| { |
| "epoch": 5.799418604651163, |
| "grad_norm": 0.3587517738342285, |
| "learning_rate": 0.0005133607109196176, |
| "loss": 3.2871, |
| "step": 21550 |
| }, |
| { |
| "epoch": 5.81287683031869, |
| "grad_norm": 0.353718638420105, |
| "learning_rate": 0.0005131587451191597, |
| "loss": 3.2912, |
| "step": 21600 |
| }, |
| { |
| "epoch": 5.826335055986219, |
| "grad_norm": 0.33927828073501587, |
| "learning_rate": 0.000512956779318702, |
| "loss": 3.2915, |
| "step": 21650 |
| }, |
| { |
| "epoch": 5.839793281653747, |
| "grad_norm": 0.34568026661872864, |
| "learning_rate": 0.0005127548135182441, |
| "loss": 3.3025, |
| "step": 21700 |
| }, |
| { |
| "epoch": 5.853251507321275, |
| "grad_norm": 0.3527640402317047, |
| "learning_rate": 0.0005125528477177865, |
| "loss": 3.2974, |
| "step": 21750 |
| }, |
| { |
| "epoch": 5.866709732988802, |
| "grad_norm": 0.3370378911495209, |
| "learning_rate": 0.0005123508819173286, |
| "loss": 3.2858, |
| "step": 21800 |
| }, |
| { |
| "epoch": 5.8801679586563305, |
| "grad_norm": 0.3530554175376892, |
| "learning_rate": 0.0005121489161168709, |
| "loss": 3.2923, |
| "step": 21850 |
| }, |
| { |
| "epoch": 5.893626184323859, |
| "grad_norm": 0.3588743507862091, |
| "learning_rate": 0.000511946950316413, |
| "loss": 3.2968, |
| "step": 21900 |
| }, |
| { |
| "epoch": 5.907084409991387, |
| "grad_norm": 0.3334029018878937, |
| "learning_rate": 0.0005117449845159553, |
| "loss": 3.292, |
| "step": 21950 |
| }, |
| { |
| "epoch": 5.920542635658915, |
| "grad_norm": 0.3445538878440857, |
| "learning_rate": 0.0005115430187154975, |
| "loss": 3.2915, |
| "step": 22000 |
| }, |
| { |
| "epoch": 5.920542635658915, |
| "eval_accuracy": 0.3842970744009321, |
| "eval_loss": 3.3547863960266113, |
| "eval_runtime": 53.6807, |
| "eval_samples_per_second": 335.54, |
| "eval_steps_per_second": 20.976, |
| "step": 22000 |
| }, |
| { |
| "epoch": 5.934000861326442, |
| "grad_norm": 0.35701045393943787, |
| "learning_rate": 0.0005113410529150397, |
| "loss": 3.2871, |
| "step": 22050 |
| }, |
| { |
| "epoch": 5.9474590869939705, |
| "grad_norm": 0.3606407046318054, |
| "learning_rate": 0.0005111390871145819, |
| "loss": 3.2977, |
| "step": 22100 |
| }, |
| { |
| "epoch": 5.960917312661499, |
| "grad_norm": 0.3338751494884491, |
| "learning_rate": 0.000510937121314124, |
| "loss": 3.2957, |
| "step": 22150 |
| }, |
| { |
| "epoch": 5.974375538329027, |
| "grad_norm": 0.3611808717250824, |
| "learning_rate": 0.0005107351555136663, |
| "loss": 3.286, |
| "step": 22200 |
| }, |
| { |
| "epoch": 5.987833763996555, |
| "grad_norm": 0.34626203775405884, |
| "learning_rate": 0.0005105331897132085, |
| "loss": 3.2976, |
| "step": 22250 |
| }, |
| { |
| "epoch": 6.001076658053402, |
| "grad_norm": 0.37035489082336426, |
| "learning_rate": 0.0005103312239127507, |
| "loss": 3.2967, |
| "step": 22300 |
| }, |
| { |
| "epoch": 6.0145348837209305, |
| "grad_norm": 0.36504310369491577, |
| "learning_rate": 0.0005101292581122929, |
| "loss": 3.19, |
| "step": 22350 |
| }, |
| { |
| "epoch": 6.027993109388458, |
| "grad_norm": 0.37124723196029663, |
| "learning_rate": 0.0005099272923118351, |
| "loss": 3.2022, |
| "step": 22400 |
| }, |
| { |
| "epoch": 6.041451335055986, |
| "grad_norm": 0.3739500343799591, |
| "learning_rate": 0.0005097253265113773, |
| "loss": 3.1889, |
| "step": 22450 |
| }, |
| { |
| "epoch": 6.054909560723514, |
| "grad_norm": 0.3512820303440094, |
| "learning_rate": 0.0005095233607109196, |
| "loss": 3.1982, |
| "step": 22500 |
| }, |
| { |
| "epoch": 6.068367786391042, |
| "grad_norm": 0.36028966307640076, |
| "learning_rate": 0.0005093213949104617, |
| "loss": 3.2053, |
| "step": 22550 |
| }, |
| { |
| "epoch": 6.0818260120585705, |
| "grad_norm": 0.3446792662143707, |
| "learning_rate": 0.000509119429110004, |
| "loss": 3.211, |
| "step": 22600 |
| }, |
| { |
| "epoch": 6.095284237726098, |
| "grad_norm": 0.33103981614112854, |
| "learning_rate": 0.0005089174633095462, |
| "loss": 3.2067, |
| "step": 22650 |
| }, |
| { |
| "epoch": 6.108742463393626, |
| "grad_norm": 0.3572562038898468, |
| "learning_rate": 0.0005087154975090885, |
| "loss": 3.2146, |
| "step": 22700 |
| }, |
| { |
| "epoch": 6.122200689061154, |
| "grad_norm": 0.340128093957901, |
| "learning_rate": 0.0005085135317086306, |
| "loss": 3.2187, |
| "step": 22750 |
| }, |
| { |
| "epoch": 6.135658914728682, |
| "grad_norm": 0.3321945071220398, |
| "learning_rate": 0.0005083115659081729, |
| "loss": 3.2173, |
| "step": 22800 |
| }, |
| { |
| "epoch": 6.149117140396211, |
| "grad_norm": 0.34041503071784973, |
| "learning_rate": 0.0005081096001077151, |
| "loss": 3.2212, |
| "step": 22850 |
| }, |
| { |
| "epoch": 6.162575366063738, |
| "grad_norm": 0.33608099818229675, |
| "learning_rate": 0.0005079076343072573, |
| "loss": 3.2335, |
| "step": 22900 |
| }, |
| { |
| "epoch": 6.176033591731266, |
| "grad_norm": 0.3682544529438019, |
| "learning_rate": 0.0005077056685067995, |
| "loss": 3.2197, |
| "step": 22950 |
| }, |
| { |
| "epoch": 6.189491817398794, |
| "grad_norm": 0.3501201570034027, |
| "learning_rate": 0.0005075037027063417, |
| "loss": 3.2243, |
| "step": 23000 |
| }, |
| { |
| "epoch": 6.189491817398794, |
| "eval_accuracy": 0.3844461378867665, |
| "eval_loss": 3.3621091842651367, |
| "eval_runtime": 53.6366, |
| "eval_samples_per_second": 335.815, |
| "eval_steps_per_second": 20.993, |
| "step": 23000 |
| }, |
| { |
| "epoch": 6.2029500430663225, |
| "grad_norm": 0.3694973289966583, |
| "learning_rate": 0.0005073017369058839, |
| "loss": 3.2362, |
| "step": 23050 |
| }, |
| { |
| "epoch": 6.21640826873385, |
| "grad_norm": 0.35488101840019226, |
| "learning_rate": 0.0005070997711054262, |
| "loss": 3.2353, |
| "step": 23100 |
| }, |
| { |
| "epoch": 6.229866494401378, |
| "grad_norm": 0.3596543073654175, |
| "learning_rate": 0.0005068978053049683, |
| "loss": 3.226, |
| "step": 23150 |
| }, |
| { |
| "epoch": 6.243324720068906, |
| "grad_norm": 0.36401212215423584, |
| "learning_rate": 0.0005066958395045105, |
| "loss": 3.2394, |
| "step": 23200 |
| }, |
| { |
| "epoch": 6.256782945736434, |
| "grad_norm": 0.3978697657585144, |
| "learning_rate": 0.0005064938737040527, |
| "loss": 3.2418, |
| "step": 23250 |
| }, |
| { |
| "epoch": 6.270241171403962, |
| "grad_norm": 0.33834999799728394, |
| "learning_rate": 0.0005062919079035949, |
| "loss": 3.2326, |
| "step": 23300 |
| }, |
| { |
| "epoch": 6.28369939707149, |
| "grad_norm": 0.3549429178237915, |
| "learning_rate": 0.0005060899421031372, |
| "loss": 3.2363, |
| "step": 23350 |
| }, |
| { |
| "epoch": 6.297157622739018, |
| "grad_norm": 0.3379305899143219, |
| "learning_rate": 0.0005058879763026793, |
| "loss": 3.2259, |
| "step": 23400 |
| }, |
| { |
| "epoch": 6.310615848406546, |
| "grad_norm": 0.3503647446632385, |
| "learning_rate": 0.0005056860105022216, |
| "loss": 3.2491, |
| "step": 23450 |
| }, |
| { |
| "epoch": 6.324074074074074, |
| "grad_norm": 0.3466089963912964, |
| "learning_rate": 0.0005054840447017637, |
| "loss": 3.2312, |
| "step": 23500 |
| }, |
| { |
| "epoch": 6.337532299741602, |
| "grad_norm": 0.3370702862739563, |
| "learning_rate": 0.0005052820789013061, |
| "loss": 3.2449, |
| "step": 23550 |
| }, |
| { |
| "epoch": 6.35099052540913, |
| "grad_norm": 0.3398071825504303, |
| "learning_rate": 0.0005050801131008482, |
| "loss": 3.2457, |
| "step": 23600 |
| }, |
| { |
| "epoch": 6.364448751076658, |
| "grad_norm": 0.3322971761226654, |
| "learning_rate": 0.0005048781473003905, |
| "loss": 3.2388, |
| "step": 23650 |
| }, |
| { |
| "epoch": 6.377906976744186, |
| "grad_norm": 0.33924156427383423, |
| "learning_rate": 0.0005046761814999326, |
| "loss": 3.2471, |
| "step": 23700 |
| }, |
| { |
| "epoch": 6.3913652024117145, |
| "grad_norm": 0.3498065173625946, |
| "learning_rate": 0.0005044742156994749, |
| "loss": 3.2248, |
| "step": 23750 |
| }, |
| { |
| "epoch": 6.404823428079242, |
| "grad_norm": 0.3730420768260956, |
| "learning_rate": 0.0005042722498990171, |
| "loss": 3.2432, |
| "step": 23800 |
| }, |
| { |
| "epoch": 6.41828165374677, |
| "grad_norm": 0.35002046823501587, |
| "learning_rate": 0.0005040702840985593, |
| "loss": 3.2248, |
| "step": 23850 |
| }, |
| { |
| "epoch": 6.431739879414298, |
| "grad_norm": 0.37649956345558167, |
| "learning_rate": 0.0005038683182981015, |
| "loss": 3.24, |
| "step": 23900 |
| }, |
| { |
| "epoch": 6.445198105081826, |
| "grad_norm": 0.3526284992694855, |
| "learning_rate": 0.0005036663524976436, |
| "loss": 3.2304, |
| "step": 23950 |
| }, |
| { |
| "epoch": 6.458656330749354, |
| "grad_norm": 0.3461792767047882, |
| "learning_rate": 0.0005034643866971859, |
| "loss": 3.2413, |
| "step": 24000 |
| }, |
| { |
| "epoch": 6.458656330749354, |
| "eval_accuracy": 0.3850296801480031, |
| "eval_loss": 3.3537731170654297, |
| "eval_runtime": 53.6425, |
| "eval_samples_per_second": 335.778, |
| "eval_steps_per_second": 20.991, |
| "step": 24000 |
| }, |
| { |
| "epoch": 6.472114556416882, |
| "grad_norm": 0.36732858419418335, |
| "learning_rate": 0.0005032624208967281, |
| "loss": 3.2483, |
| "step": 24050 |
| }, |
| { |
| "epoch": 6.48557278208441, |
| "grad_norm": 0.3334197998046875, |
| "learning_rate": 0.0005030604550962703, |
| "loss": 3.243, |
| "step": 24100 |
| }, |
| { |
| "epoch": 6.499031007751938, |
| "grad_norm": 0.3647319972515106, |
| "learning_rate": 0.0005028584892958125, |
| "loss": 3.2491, |
| "step": 24150 |
| }, |
| { |
| "epoch": 6.5124892334194655, |
| "grad_norm": 0.3465515077114105, |
| "learning_rate": 0.0005026565234953547, |
| "loss": 3.2504, |
| "step": 24200 |
| }, |
| { |
| "epoch": 6.525947459086994, |
| "grad_norm": 0.36729180812835693, |
| "learning_rate": 0.0005024545576948969, |
| "loss": 3.2485, |
| "step": 24250 |
| }, |
| { |
| "epoch": 6.539405684754522, |
| "grad_norm": 0.3396286964416504, |
| "learning_rate": 0.0005022525918944392, |
| "loss": 3.2503, |
| "step": 24300 |
| }, |
| { |
| "epoch": 6.55286391042205, |
| "grad_norm": 0.4158382713794708, |
| "learning_rate": 0.0005020506260939813, |
| "loss": 3.2527, |
| "step": 24350 |
| }, |
| { |
| "epoch": 6.566322136089578, |
| "grad_norm": 0.35420548915863037, |
| "learning_rate": 0.0005018486602935236, |
| "loss": 3.2511, |
| "step": 24400 |
| }, |
| { |
| "epoch": 6.579780361757106, |
| "grad_norm": 0.3404446840286255, |
| "learning_rate": 0.0005016466944930658, |
| "loss": 3.2485, |
| "step": 24450 |
| }, |
| { |
| "epoch": 6.593238587424634, |
| "grad_norm": 0.3330139219760895, |
| "learning_rate": 0.0005014447286926081, |
| "loss": 3.2608, |
| "step": 24500 |
| }, |
| { |
| "epoch": 6.606696813092162, |
| "grad_norm": 0.32995307445526123, |
| "learning_rate": 0.0005012427628921502, |
| "loss": 3.2426, |
| "step": 24550 |
| }, |
| { |
| "epoch": 6.62015503875969, |
| "grad_norm": 0.33813372254371643, |
| "learning_rate": 0.0005010407970916925, |
| "loss": 3.256, |
| "step": 24600 |
| }, |
| { |
| "epoch": 6.633613264427218, |
| "grad_norm": 0.3447318971157074, |
| "learning_rate": 0.0005008388312912346, |
| "loss": 3.2448, |
| "step": 24650 |
| }, |
| { |
| "epoch": 6.647071490094746, |
| "grad_norm": 0.34425389766693115, |
| "learning_rate": 0.0005006368654907769, |
| "loss": 3.253, |
| "step": 24700 |
| }, |
| { |
| "epoch": 6.660529715762274, |
| "grad_norm": 0.34749671816825867, |
| "learning_rate": 0.0005004348996903191, |
| "loss": 3.2517, |
| "step": 24750 |
| }, |
| { |
| "epoch": 6.673987941429802, |
| "grad_norm": 0.33765843510627747, |
| "learning_rate": 0.0005002329338898613, |
| "loss": 3.2594, |
| "step": 24800 |
| }, |
| { |
| "epoch": 6.68744616709733, |
| "grad_norm": 0.34231141209602356, |
| "learning_rate": 0.0005000309680894035, |
| "loss": 3.2659, |
| "step": 24850 |
| }, |
| { |
| "epoch": 6.7009043927648575, |
| "grad_norm": 0.3280162811279297, |
| "learning_rate": 0.0004998290022889456, |
| "loss": 3.2453, |
| "step": 24900 |
| }, |
| { |
| "epoch": 6.714362618432386, |
| "grad_norm": 0.34672361612319946, |
| "learning_rate": 0.0004996270364884879, |
| "loss": 3.2519, |
| "step": 24950 |
| }, |
| { |
| "epoch": 6.727820844099914, |
| "grad_norm": 0.36213499307632446, |
| "learning_rate": 0.0004994250706880301, |
| "loss": 3.2643, |
| "step": 25000 |
| }, |
| { |
| "epoch": 6.727820844099914, |
| "eval_accuracy": 0.38583725222541354, |
| "eval_loss": 3.3438923358917236, |
| "eval_runtime": 53.6863, |
| "eval_samples_per_second": 335.505, |
| "eval_steps_per_second": 20.974, |
| "step": 25000 |
| }, |
| { |
| "epoch": 6.741279069767442, |
| "grad_norm": 0.34953513741493225, |
| "learning_rate": 0.0004992231048875723, |
| "loss": 3.2652, |
| "step": 25050 |
| }, |
| { |
| "epoch": 6.754737295434969, |
| "grad_norm": 0.3408108949661255, |
| "learning_rate": 0.0004990211390871145, |
| "loss": 3.2599, |
| "step": 25100 |
| }, |
| { |
| "epoch": 6.768195521102498, |
| "grad_norm": 0.3221488893032074, |
| "learning_rate": 0.0004988191732866567, |
| "loss": 3.2672, |
| "step": 25150 |
| }, |
| { |
| "epoch": 6.781653746770026, |
| "grad_norm": 0.36635658144950867, |
| "learning_rate": 0.0004986172074861989, |
| "loss": 3.2519, |
| "step": 25200 |
| }, |
| { |
| "epoch": 6.795111972437554, |
| "grad_norm": 0.395259827375412, |
| "learning_rate": 0.0004984152416857412, |
| "loss": 3.2548, |
| "step": 25250 |
| }, |
| { |
| "epoch": 6.808570198105082, |
| "grad_norm": 0.3348065912723541, |
| "learning_rate": 0.0004982132758852834, |
| "loss": 3.258, |
| "step": 25300 |
| }, |
| { |
| "epoch": 6.822028423772609, |
| "grad_norm": 0.37395408749580383, |
| "learning_rate": 0.0004980113100848256, |
| "loss": 3.2477, |
| "step": 25350 |
| }, |
| { |
| "epoch": 6.835486649440138, |
| "grad_norm": 0.32347390055656433, |
| "learning_rate": 0.0004978093442843678, |
| "loss": 3.2514, |
| "step": 25400 |
| }, |
| { |
| "epoch": 6.848944875107666, |
| "grad_norm": 0.34834301471710205, |
| "learning_rate": 0.0004976073784839101, |
| "loss": 3.2541, |
| "step": 25450 |
| }, |
| { |
| "epoch": 6.862403100775194, |
| "grad_norm": 0.3680538535118103, |
| "learning_rate": 0.0004974054126834522, |
| "loss": 3.2515, |
| "step": 25500 |
| }, |
| { |
| "epoch": 6.875861326442722, |
| "grad_norm": 0.3374391496181488, |
| "learning_rate": 0.0004972034468829945, |
| "loss": 3.2601, |
| "step": 25550 |
| }, |
| { |
| "epoch": 6.8893195521102495, |
| "grad_norm": 0.37110382318496704, |
| "learning_rate": 0.0004970014810825366, |
| "loss": 3.2651, |
| "step": 25600 |
| }, |
| { |
| "epoch": 6.902777777777778, |
| "grad_norm": 0.33359917998313904, |
| "learning_rate": 0.0004967995152820789, |
| "loss": 3.2554, |
| "step": 25650 |
| }, |
| { |
| "epoch": 6.916236003445306, |
| "grad_norm": 0.3703191578388214, |
| "learning_rate": 0.0004965975494816211, |
| "loss": 3.262, |
| "step": 25700 |
| }, |
| { |
| "epoch": 6.929694229112834, |
| "grad_norm": 0.3683694303035736, |
| "learning_rate": 0.0004963955836811633, |
| "loss": 3.2602, |
| "step": 25750 |
| }, |
| { |
| "epoch": 6.943152454780362, |
| "grad_norm": 0.34347638487815857, |
| "learning_rate": 0.0004961936178807055, |
| "loss": 3.2569, |
| "step": 25800 |
| }, |
| { |
| "epoch": 6.9566106804478895, |
| "grad_norm": 0.3468749225139618, |
| "learning_rate": 0.0004959916520802476, |
| "loss": 3.2657, |
| "step": 25850 |
| }, |
| { |
| "epoch": 6.970068906115418, |
| "grad_norm": 0.31962108612060547, |
| "learning_rate": 0.0004957896862797899, |
| "loss": 3.2497, |
| "step": 25900 |
| }, |
| { |
| "epoch": 6.983527131782946, |
| "grad_norm": 0.3544672727584839, |
| "learning_rate": 0.0004955877204793321, |
| "loss": 3.2562, |
| "step": 25950 |
| }, |
| { |
| "epoch": 6.996985357450473, |
| "grad_norm": 0.35126814246177673, |
| "learning_rate": 0.0004953857546788743, |
| "loss": 3.2581, |
| "step": 26000 |
| }, |
| { |
| "epoch": 6.996985357450473, |
| "eval_accuracy": 0.3859895751169149, |
| "eval_loss": 3.33788800239563, |
| "eval_runtime": 53.8045, |
| "eval_samples_per_second": 334.767, |
| "eval_steps_per_second": 20.928, |
| "step": 26000 |
| }, |
| { |
| "epoch": 7.010228251507321, |
| "grad_norm": 0.3702124059200287, |
| "learning_rate": 0.0004951837888784165, |
| "loss": 3.1869, |
| "step": 26050 |
| }, |
| { |
| "epoch": 7.0236864771748495, |
| "grad_norm": 0.36202797293663025, |
| "learning_rate": 0.0004949818230779587, |
| "loss": 3.1512, |
| "step": 26100 |
| }, |
| { |
| "epoch": 7.037144702842378, |
| "grad_norm": 0.3474515676498413, |
| "learning_rate": 0.0004947798572775009, |
| "loss": 3.1569, |
| "step": 26150 |
| }, |
| { |
| "epoch": 7.050602928509905, |
| "grad_norm": 0.3655959963798523, |
| "learning_rate": 0.0004945778914770432, |
| "loss": 3.1773, |
| "step": 26200 |
| }, |
| { |
| "epoch": 7.064061154177433, |
| "grad_norm": 0.34737685322761536, |
| "learning_rate": 0.0004943759256765854, |
| "loss": 3.171, |
| "step": 26250 |
| }, |
| { |
| "epoch": 7.077519379844961, |
| "grad_norm": 0.377200186252594, |
| "learning_rate": 0.0004941739598761276, |
| "loss": 3.1669, |
| "step": 26300 |
| }, |
| { |
| "epoch": 7.09097760551249, |
| "grad_norm": 0.3763810694217682, |
| "learning_rate": 0.0004939719940756698, |
| "loss": 3.1819, |
| "step": 26350 |
| }, |
| { |
| "epoch": 7.104435831180017, |
| "grad_norm": 0.3821322023868561, |
| "learning_rate": 0.0004937700282752121, |
| "loss": 3.1822, |
| "step": 26400 |
| }, |
| { |
| "epoch": 7.117894056847545, |
| "grad_norm": 0.36671724915504456, |
| "learning_rate": 0.0004935680624747542, |
| "loss": 3.1802, |
| "step": 26450 |
| }, |
| { |
| "epoch": 7.131352282515073, |
| "grad_norm": 0.3423325717449188, |
| "learning_rate": 0.0004933660966742965, |
| "loss": 3.1887, |
| "step": 26500 |
| }, |
| { |
| "epoch": 7.144810508182601, |
| "grad_norm": 0.34335920214653015, |
| "learning_rate": 0.0004931641308738386, |
| "loss": 3.1763, |
| "step": 26550 |
| }, |
| { |
| "epoch": 7.15826873385013, |
| "grad_norm": 0.32979974150657654, |
| "learning_rate": 0.0004929621650733809, |
| "loss": 3.1907, |
| "step": 26600 |
| }, |
| { |
| "epoch": 7.171726959517657, |
| "grad_norm": 0.3438250720500946, |
| "learning_rate": 0.0004927601992729231, |
| "loss": 3.1968, |
| "step": 26650 |
| }, |
| { |
| "epoch": 7.185185185185185, |
| "grad_norm": 0.3741178512573242, |
| "learning_rate": 0.0004925582334724652, |
| "loss": 3.1963, |
| "step": 26700 |
| }, |
| { |
| "epoch": 7.198643410852713, |
| "grad_norm": 0.34840378165245056, |
| "learning_rate": 0.0004923562676720075, |
| "loss": 3.1908, |
| "step": 26750 |
| }, |
| { |
| "epoch": 7.2121016365202415, |
| "grad_norm": 0.3589928150177002, |
| "learning_rate": 0.0004921543018715497, |
| "loss": 3.1877, |
| "step": 26800 |
| }, |
| { |
| "epoch": 7.225559862187769, |
| "grad_norm": 0.3323129415512085, |
| "learning_rate": 0.0004919523360710919, |
| "loss": 3.1912, |
| "step": 26850 |
| }, |
| { |
| "epoch": 7.239018087855297, |
| "grad_norm": 0.3843107223510742, |
| "learning_rate": 0.0004917503702706341, |
| "loss": 3.1922, |
| "step": 26900 |
| }, |
| { |
| "epoch": 7.252476313522825, |
| "grad_norm": 0.32951635122299194, |
| "learning_rate": 0.0004915484044701763, |
| "loss": 3.1885, |
| "step": 26950 |
| }, |
| { |
| "epoch": 7.265934539190353, |
| "grad_norm": 0.35862505435943604, |
| "learning_rate": 0.0004913464386697185, |
| "loss": 3.198, |
| "step": 27000 |
| }, |
| { |
| "epoch": 7.265934539190353, |
| "eval_accuracy": 0.38612983820744856, |
| "eval_loss": 3.345175266265869, |
| "eval_runtime": 53.7729, |
| "eval_samples_per_second": 334.964, |
| "eval_steps_per_second": 20.94, |
| "step": 27000 |
| }, |
| { |
| "epoch": 7.279392764857882, |
| "grad_norm": 0.3674672842025757, |
| "learning_rate": 0.0004911444728692608, |
| "loss": 3.1976, |
| "step": 27050 |
| }, |
| { |
| "epoch": 7.292850990525409, |
| "grad_norm": 0.35447216033935547, |
| "learning_rate": 0.000490942507068803, |
| "loss": 3.2064, |
| "step": 27100 |
| }, |
| { |
| "epoch": 7.306309216192937, |
| "grad_norm": 0.36305779218673706, |
| "learning_rate": 0.0004907405412683452, |
| "loss": 3.2001, |
| "step": 27150 |
| }, |
| { |
| "epoch": 7.319767441860465, |
| "grad_norm": 0.3749120831489563, |
| "learning_rate": 0.0004905385754678874, |
| "loss": 3.2083, |
| "step": 27200 |
| }, |
| { |
| "epoch": 7.333225667527993, |
| "grad_norm": 0.3669654428958893, |
| "learning_rate": 0.0004903366096674297, |
| "loss": 3.2036, |
| "step": 27250 |
| }, |
| { |
| "epoch": 7.346683893195521, |
| "grad_norm": 0.3420581817626953, |
| "learning_rate": 0.0004901346438669718, |
| "loss": 3.2078, |
| "step": 27300 |
| }, |
| { |
| "epoch": 7.360142118863049, |
| "grad_norm": 0.4257405400276184, |
| "learning_rate": 0.0004899326780665141, |
| "loss": 3.2083, |
| "step": 27350 |
| }, |
| { |
| "epoch": 7.373600344530577, |
| "grad_norm": 0.3865572512149811, |
| "learning_rate": 0.0004897307122660562, |
| "loss": 3.2109, |
| "step": 27400 |
| }, |
| { |
| "epoch": 7.387058570198105, |
| "grad_norm": 0.3718532621860504, |
| "learning_rate": 0.0004895287464655985, |
| "loss": 3.207, |
| "step": 27450 |
| }, |
| { |
| "epoch": 7.4005167958656335, |
| "grad_norm": 0.33398640155792236, |
| "learning_rate": 0.0004893267806651407, |
| "loss": 3.2055, |
| "step": 27500 |
| }, |
| { |
| "epoch": 7.413975021533161, |
| "grad_norm": 0.3843555152416229, |
| "learning_rate": 0.0004891248148646829, |
| "loss": 3.2098, |
| "step": 27550 |
| }, |
| { |
| "epoch": 7.427433247200689, |
| "grad_norm": 0.3726537227630615, |
| "learning_rate": 0.0004889228490642251, |
| "loss": 3.2024, |
| "step": 27600 |
| }, |
| { |
| "epoch": 7.440891472868217, |
| "grad_norm": 0.37347468733787537, |
| "learning_rate": 0.0004887208832637672, |
| "loss": 3.2197, |
| "step": 27650 |
| }, |
| { |
| "epoch": 7.454349698535745, |
| "grad_norm": 0.3620690107345581, |
| "learning_rate": 0.0004885189174633095, |
| "loss": 3.1987, |
| "step": 27700 |
| }, |
| { |
| "epoch": 7.467807924203273, |
| "grad_norm": 0.35882100462913513, |
| "learning_rate": 0.0004883169516628517, |
| "loss": 3.2174, |
| "step": 27750 |
| }, |
| { |
| "epoch": 7.481266149870801, |
| "grad_norm": 0.36509430408477783, |
| "learning_rate": 0.0004881149858623939, |
| "loss": 3.2147, |
| "step": 27800 |
| }, |
| { |
| "epoch": 7.494724375538329, |
| "grad_norm": 0.35377126932144165, |
| "learning_rate": 0.00048791302006193614, |
| "loss": 3.2112, |
| "step": 27850 |
| }, |
| { |
| "epoch": 7.508182601205857, |
| "grad_norm": 0.35412663221359253, |
| "learning_rate": 0.00048771105426147833, |
| "loss": 3.2221, |
| "step": 27900 |
| }, |
| { |
| "epoch": 7.521640826873385, |
| "grad_norm": 0.3604266941547394, |
| "learning_rate": 0.00048750908846102053, |
| "loss": 3.2176, |
| "step": 27950 |
| }, |
| { |
| "epoch": 7.535099052540913, |
| "grad_norm": 0.36388853192329407, |
| "learning_rate": 0.0004873071226605628, |
| "loss": 3.2262, |
| "step": 28000 |
| }, |
| { |
| "epoch": 7.535099052540913, |
| "eval_accuracy": 0.3867142496435297, |
| "eval_loss": 3.3380982875823975, |
| "eval_runtime": 53.7774, |
| "eval_samples_per_second": 334.936, |
| "eval_steps_per_second": 20.938, |
| "step": 28000 |
| }, |
| { |
| "epoch": 7.548557278208441, |
| "grad_norm": 0.350801557302475, |
| "learning_rate": 0.00048710515686010503, |
| "loss": 3.2109, |
| "step": 28050 |
| }, |
| { |
| "epoch": 7.562015503875969, |
| "grad_norm": 0.34448304772377014, |
| "learning_rate": 0.0004869031910596472, |
| "loss": 3.2345, |
| "step": 28100 |
| }, |
| { |
| "epoch": 7.575473729543497, |
| "grad_norm": 0.3337467908859253, |
| "learning_rate": 0.0004867012252591894, |
| "loss": 3.2291, |
| "step": 28150 |
| }, |
| { |
| "epoch": 7.588931955211025, |
| "grad_norm": 0.37222522497177124, |
| "learning_rate": 0.0004864992594587316, |
| "loss": 3.222, |
| "step": 28200 |
| }, |
| { |
| "epoch": 7.602390180878553, |
| "grad_norm": 0.34009498357772827, |
| "learning_rate": 0.0004862972936582738, |
| "loss": 3.2206, |
| "step": 28250 |
| }, |
| { |
| "epoch": 7.615848406546081, |
| "grad_norm": 0.3641204535961151, |
| "learning_rate": 0.00048609532785781606, |
| "loss": 3.2147, |
| "step": 28300 |
| }, |
| { |
| "epoch": 7.629306632213609, |
| "grad_norm": 0.3570398986339569, |
| "learning_rate": 0.00048589336205735826, |
| "loss": 3.2101, |
| "step": 28350 |
| }, |
| { |
| "epoch": 7.6427648578811365, |
| "grad_norm": 0.3372342586517334, |
| "learning_rate": 0.00048569139625690046, |
| "loss": 3.226, |
| "step": 28400 |
| }, |
| { |
| "epoch": 7.656223083548665, |
| "grad_norm": 0.34732359647750854, |
| "learning_rate": 0.00048548943045644265, |
| "loss": 3.2353, |
| "step": 28450 |
| }, |
| { |
| "epoch": 7.669681309216193, |
| "grad_norm": 0.38042765855789185, |
| "learning_rate": 0.00048528746465598485, |
| "loss": 3.2211, |
| "step": 28500 |
| }, |
| { |
| "epoch": 7.683139534883721, |
| "grad_norm": 0.35742899775505066, |
| "learning_rate": 0.0004850854988555271, |
| "loss": 3.2208, |
| "step": 28550 |
| }, |
| { |
| "epoch": 7.696597760551249, |
| "grad_norm": 0.37352654337882996, |
| "learning_rate": 0.0004848835330550693, |
| "loss": 3.2207, |
| "step": 28600 |
| }, |
| { |
| "epoch": 7.7100559862187765, |
| "grad_norm": 0.35837510228157043, |
| "learning_rate": 0.0004846815672546115, |
| "loss": 3.2348, |
| "step": 28650 |
| }, |
| { |
| "epoch": 7.723514211886305, |
| "grad_norm": 0.32850074768066406, |
| "learning_rate": 0.0004844796014541537, |
| "loss": 3.2138, |
| "step": 28700 |
| }, |
| { |
| "epoch": 7.736972437553833, |
| "grad_norm": 0.373390257358551, |
| "learning_rate": 0.0004842776356536959, |
| "loss": 3.2324, |
| "step": 28750 |
| }, |
| { |
| "epoch": 7.750430663221361, |
| "grad_norm": 0.3398002088069916, |
| "learning_rate": 0.00048407566985323813, |
| "loss": 3.2306, |
| "step": 28800 |
| }, |
| { |
| "epoch": 7.763888888888889, |
| "grad_norm": 0.4056737422943115, |
| "learning_rate": 0.0004838737040527804, |
| "loss": 3.217, |
| "step": 28850 |
| }, |
| { |
| "epoch": 7.777347114556417, |
| "grad_norm": 0.3630368113517761, |
| "learning_rate": 0.0004836717382523226, |
| "loss": 3.2092, |
| "step": 28900 |
| }, |
| { |
| "epoch": 7.790805340223945, |
| "grad_norm": 0.36135610938072205, |
| "learning_rate": 0.0004834697724518648, |
| "loss": 3.219, |
| "step": 28950 |
| }, |
| { |
| "epoch": 7.804263565891473, |
| "grad_norm": 0.33417677879333496, |
| "learning_rate": 0.000483267806651407, |
| "loss": 3.2175, |
| "step": 29000 |
| }, |
| { |
| "epoch": 7.804263565891473, |
| "eval_accuracy": 0.38752779729799613, |
| "eval_loss": 3.331707000732422, |
| "eval_runtime": 53.6927, |
| "eval_samples_per_second": 335.465, |
| "eval_steps_per_second": 20.971, |
| "step": 29000 |
| }, |
| { |
| "epoch": 7.817721791559001, |
| "grad_norm": 0.3444252014160156, |
| "learning_rate": 0.0004830658408509492, |
| "loss": 3.2306, |
| "step": 29050 |
| }, |
| { |
| "epoch": 7.831180017226529, |
| "grad_norm": 0.34740421175956726, |
| "learning_rate": 0.0004828638750504914, |
| "loss": 3.2341, |
| "step": 29100 |
| }, |
| { |
| "epoch": 7.844638242894057, |
| "grad_norm": 0.37734100222587585, |
| "learning_rate": 0.0004826619092500336, |
| "loss": 3.2191, |
| "step": 29150 |
| }, |
| { |
| "epoch": 7.858096468561585, |
| "grad_norm": 0.3513396680355072, |
| "learning_rate": 0.0004824599434495758, |
| "loss": 3.2282, |
| "step": 29200 |
| }, |
| { |
| "epoch": 7.871554694229113, |
| "grad_norm": 0.3746366798877716, |
| "learning_rate": 0.00048225797764911806, |
| "loss": 3.2241, |
| "step": 29250 |
| }, |
| { |
| "epoch": 7.885012919896641, |
| "grad_norm": 0.3567333519458771, |
| "learning_rate": 0.00048205601184866026, |
| "loss": 3.2393, |
| "step": 29300 |
| }, |
| { |
| "epoch": 7.8984711455641685, |
| "grad_norm": 0.33819180727005005, |
| "learning_rate": 0.00048185404604820245, |
| "loss": 3.2353, |
| "step": 29350 |
| }, |
| { |
| "epoch": 7.911929371231697, |
| "grad_norm": 0.38296690583229065, |
| "learning_rate": 0.00048165208024774465, |
| "loss": 3.2292, |
| "step": 29400 |
| }, |
| { |
| "epoch": 7.925387596899225, |
| "grad_norm": 0.3398057222366333, |
| "learning_rate": 0.00048145011444728685, |
| "loss": 3.2312, |
| "step": 29450 |
| }, |
| { |
| "epoch": 7.938845822566753, |
| "grad_norm": 0.35328567028045654, |
| "learning_rate": 0.0004812481486468291, |
| "loss": 3.231, |
| "step": 29500 |
| }, |
| { |
| "epoch": 7.95230404823428, |
| "grad_norm": 0.3437725603580475, |
| "learning_rate": 0.0004810461828463713, |
| "loss": 3.2242, |
| "step": 29550 |
| }, |
| { |
| "epoch": 7.965762273901809, |
| "grad_norm": 0.342734158039093, |
| "learning_rate": 0.0004808442170459135, |
| "loss": 3.2265, |
| "step": 29600 |
| }, |
| { |
| "epoch": 7.979220499569337, |
| "grad_norm": 0.33320966362953186, |
| "learning_rate": 0.0004806422512454557, |
| "loss": 3.2315, |
| "step": 29650 |
| }, |
| { |
| "epoch": 7.992678725236865, |
| "grad_norm": 0.3411356508731842, |
| "learning_rate": 0.00048044028544499793, |
| "loss": 3.232, |
| "step": 29700 |
| }, |
| { |
| "epoch": 8.005921619293712, |
| "grad_norm": 0.40168818831443787, |
| "learning_rate": 0.0004802383196445402, |
| "loss": 3.1846, |
| "step": 29750 |
| }, |
| { |
| "epoch": 8.01937984496124, |
| "grad_norm": 0.35845109820365906, |
| "learning_rate": 0.0004800363538440824, |
| "loss": 3.1328, |
| "step": 29800 |
| }, |
| { |
| "epoch": 8.032838070628769, |
| "grad_norm": 0.34396156668663025, |
| "learning_rate": 0.0004798343880436246, |
| "loss": 3.126, |
| "step": 29850 |
| }, |
| { |
| "epoch": 8.046296296296296, |
| "grad_norm": 0.3609023094177246, |
| "learning_rate": 0.00047963242224316683, |
| "loss": 3.1423, |
| "step": 29900 |
| }, |
| { |
| "epoch": 8.059754521963825, |
| "grad_norm": 0.34926462173461914, |
| "learning_rate": 0.000479430456442709, |
| "loss": 3.1416, |
| "step": 29950 |
| }, |
| { |
| "epoch": 8.073212747631352, |
| "grad_norm": 0.3574993312358856, |
| "learning_rate": 0.0004792284906422512, |
| "loss": 3.146, |
| "step": 30000 |
| }, |
| { |
| "epoch": 8.073212747631352, |
| "eval_accuracy": 0.3879771606926107, |
| "eval_loss": 3.3365345001220703, |
| "eval_runtime": 53.7752, |
| "eval_samples_per_second": 334.95, |
| "eval_steps_per_second": 20.939, |
| "step": 30000 |
| }, |
| { |
| "epoch": 8.08667097329888, |
| "grad_norm": 0.3560766875743866, |
| "learning_rate": 0.0004790265248417934, |
| "loss": 3.1485, |
| "step": 30050 |
| }, |
| { |
| "epoch": 8.100129198966409, |
| "grad_norm": 0.3525884747505188, |
| "learning_rate": 0.0004788245590413356, |
| "loss": 3.1476, |
| "step": 30100 |
| }, |
| { |
| "epoch": 8.113587424633936, |
| "grad_norm": 0.3512996435165405, |
| "learning_rate": 0.00047862259324087786, |
| "loss": 3.1545, |
| "step": 30150 |
| }, |
| { |
| "epoch": 8.127045650301465, |
| "grad_norm": 0.3729488253593445, |
| "learning_rate": 0.00047842062744042006, |
| "loss": 3.1522, |
| "step": 30200 |
| }, |
| { |
| "epoch": 8.140503875968992, |
| "grad_norm": 0.3845618963241577, |
| "learning_rate": 0.00047821866163996225, |
| "loss": 3.1521, |
| "step": 30250 |
| }, |
| { |
| "epoch": 8.15396210163652, |
| "grad_norm": 0.36919400095939636, |
| "learning_rate": 0.00047801669583950445, |
| "loss": 3.1622, |
| "step": 30300 |
| }, |
| { |
| "epoch": 8.167420327304049, |
| "grad_norm": 0.35437729954719543, |
| "learning_rate": 0.00047781473003904665, |
| "loss": 3.1565, |
| "step": 30350 |
| }, |
| { |
| "epoch": 8.180878552971576, |
| "grad_norm": 0.37341639399528503, |
| "learning_rate": 0.0004776127642385889, |
| "loss": 3.169, |
| "step": 30400 |
| }, |
| { |
| "epoch": 8.194336778639105, |
| "grad_norm": 0.3353579342365265, |
| "learning_rate": 0.0004774107984381311, |
| "loss": 3.1711, |
| "step": 30450 |
| }, |
| { |
| "epoch": 8.207795004306632, |
| "grad_norm": 0.36150503158569336, |
| "learning_rate": 0.0004772088326376733, |
| "loss": 3.1728, |
| "step": 30500 |
| }, |
| { |
| "epoch": 8.22125322997416, |
| "grad_norm": 0.36009085178375244, |
| "learning_rate": 0.0004770068668372155, |
| "loss": 3.1609, |
| "step": 30550 |
| }, |
| { |
| "epoch": 8.234711455641689, |
| "grad_norm": 0.33860263228416443, |
| "learning_rate": 0.0004768049010367577, |
| "loss": 3.168, |
| "step": 30600 |
| }, |
| { |
| "epoch": 8.248169681309216, |
| "grad_norm": 0.392787367105484, |
| "learning_rate": 0.0004766029352363, |
| "loss": 3.1719, |
| "step": 30650 |
| }, |
| { |
| "epoch": 8.261627906976745, |
| "grad_norm": 0.3563990592956543, |
| "learning_rate": 0.0004764009694358422, |
| "loss": 3.1695, |
| "step": 30700 |
| }, |
| { |
| "epoch": 8.275086132644272, |
| "grad_norm": 0.3538586497306824, |
| "learning_rate": 0.0004761990036353844, |
| "loss": 3.1694, |
| "step": 30750 |
| }, |
| { |
| "epoch": 8.2885443583118, |
| "grad_norm": 0.37058621644973755, |
| "learning_rate": 0.0004759970378349266, |
| "loss": 3.1726, |
| "step": 30800 |
| }, |
| { |
| "epoch": 8.302002583979329, |
| "grad_norm": 0.34298816323280334, |
| "learning_rate": 0.0004757950720344688, |
| "loss": 3.1585, |
| "step": 30850 |
| }, |
| { |
| "epoch": 8.315460809646856, |
| "grad_norm": 0.35922834277153015, |
| "learning_rate": 0.000475593106234011, |
| "loss": 3.1808, |
| "step": 30900 |
| }, |
| { |
| "epoch": 8.328919035314383, |
| "grad_norm": 0.3559612035751343, |
| "learning_rate": 0.0004753911404335532, |
| "loss": 3.1752, |
| "step": 30950 |
| }, |
| { |
| "epoch": 8.342377260981912, |
| "grad_norm": 0.38853004574775696, |
| "learning_rate": 0.0004751891746330954, |
| "loss": 3.1766, |
| "step": 31000 |
| }, |
| { |
| "epoch": 8.342377260981912, |
| "eval_accuracy": 0.38834275736158497, |
| "eval_loss": 3.3302693367004395, |
| "eval_runtime": 53.7698, |
| "eval_samples_per_second": 334.984, |
| "eval_steps_per_second": 20.941, |
| "step": 31000 |
| }, |
| { |
| "epoch": 8.35583548664944, |
| "grad_norm": 0.3335705101490021, |
| "learning_rate": 0.0004749872088326376, |
| "loss": 3.1689, |
| "step": 31050 |
| }, |
| { |
| "epoch": 8.369293712316969, |
| "grad_norm": 0.34073248505592346, |
| "learning_rate": 0.00047478524303217986, |
| "loss": 3.1727, |
| "step": 31100 |
| }, |
| { |
| "epoch": 8.382751937984496, |
| "grad_norm": 0.3616897165775299, |
| "learning_rate": 0.00047458327723172206, |
| "loss": 3.1768, |
| "step": 31150 |
| }, |
| { |
| "epoch": 8.396210163652023, |
| "grad_norm": 0.3575231432914734, |
| "learning_rate": 0.00047438131143126425, |
| "loss": 3.1921, |
| "step": 31200 |
| }, |
| { |
| "epoch": 8.409668389319553, |
| "grad_norm": 0.3506259620189667, |
| "learning_rate": 0.00047417934563080645, |
| "loss": 3.1823, |
| "step": 31250 |
| }, |
| { |
| "epoch": 8.42312661498708, |
| "grad_norm": 0.36722439527511597, |
| "learning_rate": 0.00047397737983034864, |
| "loss": 3.1934, |
| "step": 31300 |
| }, |
| { |
| "epoch": 8.436584840654609, |
| "grad_norm": 0.38640642166137695, |
| "learning_rate": 0.0004737754140298909, |
| "loss": 3.1819, |
| "step": 31350 |
| }, |
| { |
| "epoch": 8.450043066322136, |
| "grad_norm": 0.3896099328994751, |
| "learning_rate": 0.0004735734482294331, |
| "loss": 3.1933, |
| "step": 31400 |
| }, |
| { |
| "epoch": 8.463501291989663, |
| "grad_norm": 0.39949628710746765, |
| "learning_rate": 0.0004733714824289753, |
| "loss": 3.1912, |
| "step": 31450 |
| }, |
| { |
| "epoch": 8.476959517657193, |
| "grad_norm": 0.3628818690776825, |
| "learning_rate": 0.0004731695166285175, |
| "loss": 3.1906, |
| "step": 31500 |
| }, |
| { |
| "epoch": 8.49041774332472, |
| "grad_norm": 0.3500027060508728, |
| "learning_rate": 0.0004729675508280598, |
| "loss": 3.1806, |
| "step": 31550 |
| }, |
| { |
| "epoch": 8.503875968992247, |
| "grad_norm": 0.37505653500556946, |
| "learning_rate": 0.000472765585027602, |
| "loss": 3.1896, |
| "step": 31600 |
| }, |
| { |
| "epoch": 8.517334194659776, |
| "grad_norm": 0.34353429079055786, |
| "learning_rate": 0.0004725636192271442, |
| "loss": 3.1806, |
| "step": 31650 |
| }, |
| { |
| "epoch": 8.530792420327304, |
| "grad_norm": 0.3562263250350952, |
| "learning_rate": 0.0004723616534266864, |
| "loss": 3.1983, |
| "step": 31700 |
| }, |
| { |
| "epoch": 8.544250645994833, |
| "grad_norm": 0.36144253611564636, |
| "learning_rate": 0.0004721596876262286, |
| "loss": 3.1921, |
| "step": 31750 |
| }, |
| { |
| "epoch": 8.55770887166236, |
| "grad_norm": 0.3487912118434906, |
| "learning_rate": 0.0004719577218257708, |
| "loss": 3.1899, |
| "step": 31800 |
| }, |
| { |
| "epoch": 8.571167097329887, |
| "grad_norm": 0.3909497559070587, |
| "learning_rate": 0.000471755756025313, |
| "loss": 3.1832, |
| "step": 31850 |
| }, |
| { |
| "epoch": 8.584625322997416, |
| "grad_norm": 0.358192503452301, |
| "learning_rate": 0.0004715537902248552, |
| "loss": 3.202, |
| "step": 31900 |
| }, |
| { |
| "epoch": 8.598083548664944, |
| "grad_norm": 0.37098532915115356, |
| "learning_rate": 0.0004713518244243974, |
| "loss": 3.1867, |
| "step": 31950 |
| }, |
| { |
| "epoch": 8.611541774332473, |
| "grad_norm": 0.36137476563453674, |
| "learning_rate": 0.00047114985862393966, |
| "loss": 3.2014, |
| "step": 32000 |
| }, |
| { |
| "epoch": 8.611541774332473, |
| "eval_accuracy": 0.3883745908902654, |
| "eval_loss": 3.3255698680877686, |
| "eval_runtime": 53.8982, |
| "eval_samples_per_second": 334.185, |
| "eval_steps_per_second": 20.891, |
| "step": 32000 |
| }, |
| { |
| "epoch": 8.625, |
| "grad_norm": 0.35992470383644104, |
| "learning_rate": 0.00047094789282348186, |
| "loss": 3.1849, |
| "step": 32050 |
| }, |
| { |
| "epoch": 8.638458225667527, |
| "grad_norm": 0.33300545811653137, |
| "learning_rate": 0.00047074592702302405, |
| "loss": 3.1975, |
| "step": 32100 |
| }, |
| { |
| "epoch": 8.651916451335056, |
| "grad_norm": 0.34040096402168274, |
| "learning_rate": 0.00047054396122256625, |
| "loss": 3.1932, |
| "step": 32150 |
| }, |
| { |
| "epoch": 8.665374677002584, |
| "grad_norm": 0.36708134412765503, |
| "learning_rate": 0.00047034199542210844, |
| "loss": 3.1929, |
| "step": 32200 |
| }, |
| { |
| "epoch": 8.678832902670113, |
| "grad_norm": 0.37494978308677673, |
| "learning_rate": 0.0004701400296216507, |
| "loss": 3.1904, |
| "step": 32250 |
| }, |
| { |
| "epoch": 8.69229112833764, |
| "grad_norm": 0.3530576825141907, |
| "learning_rate": 0.0004699380638211929, |
| "loss": 3.2015, |
| "step": 32300 |
| }, |
| { |
| "epoch": 8.705749354005167, |
| "grad_norm": 0.3551010489463806, |
| "learning_rate": 0.0004697360980207351, |
| "loss": 3.2013, |
| "step": 32350 |
| }, |
| { |
| "epoch": 8.719207579672696, |
| "grad_norm": 0.3860146105289459, |
| "learning_rate": 0.0004695341322202773, |
| "loss": 3.1963, |
| "step": 32400 |
| }, |
| { |
| "epoch": 8.732665805340224, |
| "grad_norm": 0.3952493965625763, |
| "learning_rate": 0.0004693321664198196, |
| "loss": 3.2054, |
| "step": 32450 |
| }, |
| { |
| "epoch": 8.746124031007753, |
| "grad_norm": 0.35887420177459717, |
| "learning_rate": 0.0004691302006193618, |
| "loss": 3.2014, |
| "step": 32500 |
| }, |
| { |
| "epoch": 8.75958225667528, |
| "grad_norm": 0.3608771860599518, |
| "learning_rate": 0.000468928234818904, |
| "loss": 3.2075, |
| "step": 32550 |
| }, |
| { |
| "epoch": 8.773040482342807, |
| "grad_norm": 0.3411955237388611, |
| "learning_rate": 0.0004687262690184462, |
| "loss": 3.1987, |
| "step": 32600 |
| }, |
| { |
| "epoch": 8.786498708010337, |
| "grad_norm": 0.3674717843532562, |
| "learning_rate": 0.00046852430321798837, |
| "loss": 3.2043, |
| "step": 32650 |
| }, |
| { |
| "epoch": 8.799956933677864, |
| "grad_norm": 0.34659647941589355, |
| "learning_rate": 0.0004683223374175306, |
| "loss": 3.1958, |
| "step": 32700 |
| }, |
| { |
| "epoch": 8.813415159345391, |
| "grad_norm": 0.3701222538948059, |
| "learning_rate": 0.0004681203716170728, |
| "loss": 3.183, |
| "step": 32750 |
| }, |
| { |
| "epoch": 8.82687338501292, |
| "grad_norm": 0.355498731136322, |
| "learning_rate": 0.000467918405816615, |
| "loss": 3.1937, |
| "step": 32800 |
| }, |
| { |
| "epoch": 8.840331610680447, |
| "grad_norm": 0.3362954556941986, |
| "learning_rate": 0.0004677164400161572, |
| "loss": 3.1952, |
| "step": 32850 |
| }, |
| { |
| "epoch": 8.853789836347977, |
| "grad_norm": 0.3454212248325348, |
| "learning_rate": 0.0004675144742156994, |
| "loss": 3.1975, |
| "step": 32900 |
| }, |
| { |
| "epoch": 8.867248062015504, |
| "grad_norm": 0.3511376976966858, |
| "learning_rate": 0.00046731250841524166, |
| "loss": 3.1951, |
| "step": 32950 |
| }, |
| { |
| "epoch": 8.880706287683031, |
| "grad_norm": 0.3271363377571106, |
| "learning_rate": 0.00046711054261478385, |
| "loss": 3.1996, |
| "step": 33000 |
| }, |
| { |
| "epoch": 8.880706287683031, |
| "eval_accuracy": 0.38930091398080774, |
| "eval_loss": 3.3175978660583496, |
| "eval_runtime": 53.6786, |
| "eval_samples_per_second": 335.553, |
| "eval_steps_per_second": 20.977, |
| "step": 33000 |
| }, |
| { |
| "epoch": 8.89416451335056, |
| "grad_norm": 0.3616025149822235, |
| "learning_rate": 0.00046690857681432605, |
| "loss": 3.1985, |
| "step": 33050 |
| }, |
| { |
| "epoch": 8.907622739018088, |
| "grad_norm": 0.33482256531715393, |
| "learning_rate": 0.00046670661101386825, |
| "loss": 3.2106, |
| "step": 33100 |
| }, |
| { |
| "epoch": 8.921080964685617, |
| "grad_norm": 0.35267388820648193, |
| "learning_rate": 0.00046650464521341044, |
| "loss": 3.1885, |
| "step": 33150 |
| }, |
| { |
| "epoch": 8.934539190353144, |
| "grad_norm": 0.387184202671051, |
| "learning_rate": 0.0004663026794129527, |
| "loss": 3.2031, |
| "step": 33200 |
| }, |
| { |
| "epoch": 8.947997416020671, |
| "grad_norm": 0.34149935841560364, |
| "learning_rate": 0.0004661007136124949, |
| "loss": 3.2134, |
| "step": 33250 |
| }, |
| { |
| "epoch": 8.9614556416882, |
| "grad_norm": 0.34807565808296204, |
| "learning_rate": 0.00046589874781203714, |
| "loss": 3.206, |
| "step": 33300 |
| }, |
| { |
| "epoch": 8.974913867355728, |
| "grad_norm": 0.3618689775466919, |
| "learning_rate": 0.0004656967820115794, |
| "loss": 3.2077, |
| "step": 33350 |
| }, |
| { |
| "epoch": 8.988372093023255, |
| "grad_norm": 0.35687363147735596, |
| "learning_rate": 0.0004654948162111216, |
| "loss": 3.2034, |
| "step": 33400 |
| }, |
| { |
| "epoch": 9.001614987080103, |
| "grad_norm": 0.3490736484527588, |
| "learning_rate": 0.0004652928504106638, |
| "loss": 3.1925, |
| "step": 33450 |
| }, |
| { |
| "epoch": 9.015073212747632, |
| "grad_norm": 0.3292122185230255, |
| "learning_rate": 0.000465090884610206, |
| "loss": 3.0937, |
| "step": 33500 |
| }, |
| { |
| "epoch": 9.02853143841516, |
| "grad_norm": 0.38094958662986755, |
| "learning_rate": 0.00046488891880974817, |
| "loss": 3.1038, |
| "step": 33550 |
| }, |
| { |
| "epoch": 9.041989664082687, |
| "grad_norm": 0.3638545274734497, |
| "learning_rate": 0.0004646869530092904, |
| "loss": 3.1066, |
| "step": 33600 |
| }, |
| { |
| "epoch": 9.055447889750216, |
| "grad_norm": 0.3722701668739319, |
| "learning_rate": 0.0004644849872088326, |
| "loss": 3.1154, |
| "step": 33650 |
| }, |
| { |
| "epoch": 9.068906115417743, |
| "grad_norm": 0.39622655510902405, |
| "learning_rate": 0.0004642830214083748, |
| "loss": 3.1048, |
| "step": 33700 |
| }, |
| { |
| "epoch": 9.082364341085272, |
| "grad_norm": 0.3642023801803589, |
| "learning_rate": 0.000464081055607917, |
| "loss": 3.1062, |
| "step": 33750 |
| }, |
| { |
| "epoch": 9.0958225667528, |
| "grad_norm": 0.36213210225105286, |
| "learning_rate": 0.0004638790898074592, |
| "loss": 3.1246, |
| "step": 33800 |
| }, |
| { |
| "epoch": 9.109280792420327, |
| "grad_norm": 0.3247149884700775, |
| "learning_rate": 0.00046367712400700146, |
| "loss": 3.118, |
| "step": 33850 |
| }, |
| { |
| "epoch": 9.122739018087856, |
| "grad_norm": 0.37343013286590576, |
| "learning_rate": 0.00046347515820654365, |
| "loss": 3.1159, |
| "step": 33900 |
| }, |
| { |
| "epoch": 9.136197243755383, |
| "grad_norm": 0.36302250623703003, |
| "learning_rate": 0.00046327319240608585, |
| "loss": 3.1286, |
| "step": 33950 |
| }, |
| { |
| "epoch": 9.14965546942291, |
| "grad_norm": 0.38092878460884094, |
| "learning_rate": 0.00046307122660562805, |
| "loss": 3.137, |
| "step": 34000 |
| }, |
| { |
| "epoch": 9.14965546942291, |
| "eval_accuracy": 0.38890011573063055, |
| "eval_loss": 3.3283283710479736, |
| "eval_runtime": 53.7606, |
| "eval_samples_per_second": 335.041, |
| "eval_steps_per_second": 20.945, |
| "step": 34000 |
| }, |
| { |
| "epoch": 9.16311369509044, |
| "grad_norm": 0.35080328583717346, |
| "learning_rate": 0.00046286926080517024, |
| "loss": 3.1347, |
| "step": 34050 |
| }, |
| { |
| "epoch": 9.176571920757967, |
| "grad_norm": 0.34843164682388306, |
| "learning_rate": 0.0004626672950047125, |
| "loss": 3.1405, |
| "step": 34100 |
| }, |
| { |
| "epoch": 9.190030146425496, |
| "grad_norm": 0.4157335162162781, |
| "learning_rate": 0.0004624653292042547, |
| "loss": 3.136, |
| "step": 34150 |
| }, |
| { |
| "epoch": 9.203488372093023, |
| "grad_norm": 0.3641476333141327, |
| "learning_rate": 0.00046226336340379694, |
| "loss": 3.1512, |
| "step": 34200 |
| }, |
| { |
| "epoch": 9.21694659776055, |
| "grad_norm": 0.341251015663147, |
| "learning_rate": 0.00046206139760333913, |
| "loss": 3.1524, |
| "step": 34250 |
| }, |
| { |
| "epoch": 9.23040482342808, |
| "grad_norm": 0.35386136174201965, |
| "learning_rate": 0.0004618594318028814, |
| "loss": 3.1352, |
| "step": 34300 |
| }, |
| { |
| "epoch": 9.243863049095607, |
| "grad_norm": 0.3733835816383362, |
| "learning_rate": 0.0004616574660024236, |
| "loss": 3.1424, |
| "step": 34350 |
| }, |
| { |
| "epoch": 9.257321274763136, |
| "grad_norm": 0.33062437176704407, |
| "learning_rate": 0.0004614555002019658, |
| "loss": 3.153, |
| "step": 34400 |
| }, |
| { |
| "epoch": 9.270779500430663, |
| "grad_norm": 0.34706413745880127, |
| "learning_rate": 0.000461253534401508, |
| "loss": 3.1459, |
| "step": 34450 |
| }, |
| { |
| "epoch": 9.28423772609819, |
| "grad_norm": 0.35080230236053467, |
| "learning_rate": 0.00046105156860105017, |
| "loss": 3.1433, |
| "step": 34500 |
| }, |
| { |
| "epoch": 9.29769595176572, |
| "grad_norm": 0.37283027172088623, |
| "learning_rate": 0.0004608496028005924, |
| "loss": 3.145, |
| "step": 34550 |
| }, |
| { |
| "epoch": 9.311154177433247, |
| "grad_norm": 0.37630441784858704, |
| "learning_rate": 0.0004606476370001346, |
| "loss": 3.1535, |
| "step": 34600 |
| }, |
| { |
| "epoch": 9.324612403100776, |
| "grad_norm": 0.34792250394821167, |
| "learning_rate": 0.0004604456711996768, |
| "loss": 3.1609, |
| "step": 34650 |
| }, |
| { |
| "epoch": 9.338070628768303, |
| "grad_norm": 0.35801804065704346, |
| "learning_rate": 0.000460243705399219, |
| "loss": 3.1578, |
| "step": 34700 |
| }, |
| { |
| "epoch": 9.35152885443583, |
| "grad_norm": 0.40342605113983154, |
| "learning_rate": 0.0004600417395987612, |
| "loss": 3.1523, |
| "step": 34750 |
| }, |
| { |
| "epoch": 9.36498708010336, |
| "grad_norm": 0.3323036730289459, |
| "learning_rate": 0.00045983977379830345, |
| "loss": 3.1518, |
| "step": 34800 |
| }, |
| { |
| "epoch": 9.378445305770887, |
| "grad_norm": 0.3701625466346741, |
| "learning_rate": 0.00045963780799784565, |
| "loss": 3.1529, |
| "step": 34850 |
| }, |
| { |
| "epoch": 9.391903531438416, |
| "grad_norm": 0.34849992394447327, |
| "learning_rate": 0.00045943584219738785, |
| "loss": 3.159, |
| "step": 34900 |
| }, |
| { |
| "epoch": 9.405361757105943, |
| "grad_norm": 0.3489476144313812, |
| "learning_rate": 0.00045923387639693004, |
| "loss": 3.1557, |
| "step": 34950 |
| }, |
| { |
| "epoch": 9.41881998277347, |
| "grad_norm": 0.3345106542110443, |
| "learning_rate": 0.00045903191059647224, |
| "loss": 3.1566, |
| "step": 35000 |
| }, |
| { |
| "epoch": 9.41881998277347, |
| "eval_accuracy": 0.38953950247562724, |
| "eval_loss": 3.321983575820923, |
| "eval_runtime": 53.7047, |
| "eval_samples_per_second": 335.389, |
| "eval_steps_per_second": 20.966, |
| "step": 35000 |
| }, |
| { |
| "epoch": 9.432278208441, |
| "grad_norm": 0.3547540605068207, |
| "learning_rate": 0.0004588299447960145, |
| "loss": 3.1598, |
| "step": 35050 |
| }, |
| { |
| "epoch": 9.445736434108527, |
| "grad_norm": 0.35543152689933777, |
| "learning_rate": 0.00045862797899555674, |
| "loss": 3.1773, |
| "step": 35100 |
| }, |
| { |
| "epoch": 9.459194659776054, |
| "grad_norm": 0.3571447432041168, |
| "learning_rate": 0.00045842601319509894, |
| "loss": 3.1631, |
| "step": 35150 |
| }, |
| { |
| "epoch": 9.472652885443583, |
| "grad_norm": 0.35717859864234924, |
| "learning_rate": 0.00045822404739464113, |
| "loss": 3.1635, |
| "step": 35200 |
| }, |
| { |
| "epoch": 9.48611111111111, |
| "grad_norm": 0.36354541778564453, |
| "learning_rate": 0.0004580220815941834, |
| "loss": 3.1556, |
| "step": 35250 |
| }, |
| { |
| "epoch": 9.49956933677864, |
| "grad_norm": 0.36084792017936707, |
| "learning_rate": 0.0004578201157937256, |
| "loss": 3.1599, |
| "step": 35300 |
| }, |
| { |
| "epoch": 9.513027562446167, |
| "grad_norm": 0.3501897156238556, |
| "learning_rate": 0.0004576181499932678, |
| "loss": 3.1769, |
| "step": 35350 |
| }, |
| { |
| "epoch": 9.526485788113694, |
| "grad_norm": 0.3838970959186554, |
| "learning_rate": 0.00045741618419280997, |
| "loss": 3.1579, |
| "step": 35400 |
| }, |
| { |
| "epoch": 9.539944013781223, |
| "grad_norm": 0.38200482726097107, |
| "learning_rate": 0.0004572142183923522, |
| "loss": 3.1472, |
| "step": 35450 |
| }, |
| { |
| "epoch": 9.55340223944875, |
| "grad_norm": 0.3594954311847687, |
| "learning_rate": 0.0004570122525918944, |
| "loss": 3.1716, |
| "step": 35500 |
| }, |
| { |
| "epoch": 9.56686046511628, |
| "grad_norm": 0.3559810519218445, |
| "learning_rate": 0.0004568102867914366, |
| "loss": 3.1719, |
| "step": 35550 |
| }, |
| { |
| "epoch": 9.580318690783807, |
| "grad_norm": 0.356738418340683, |
| "learning_rate": 0.0004566083209909788, |
| "loss": 3.1838, |
| "step": 35600 |
| }, |
| { |
| "epoch": 9.593776916451334, |
| "grad_norm": 0.3445727229118347, |
| "learning_rate": 0.000456406355190521, |
| "loss": 3.1601, |
| "step": 35650 |
| }, |
| { |
| "epoch": 9.607235142118864, |
| "grad_norm": 0.35387566685676575, |
| "learning_rate": 0.00045620438939006326, |
| "loss": 3.1663, |
| "step": 35700 |
| }, |
| { |
| "epoch": 9.62069336778639, |
| "grad_norm": 0.3698170483112335, |
| "learning_rate": 0.00045600242358960545, |
| "loss": 3.173, |
| "step": 35750 |
| }, |
| { |
| "epoch": 9.634151593453918, |
| "grad_norm": 0.35212114453315735, |
| "learning_rate": 0.00045580045778914765, |
| "loss": 3.1671, |
| "step": 35800 |
| }, |
| { |
| "epoch": 9.647609819121447, |
| "grad_norm": 0.39634934067726135, |
| "learning_rate": 0.00045559849198868984, |
| "loss": 3.1782, |
| "step": 35850 |
| }, |
| { |
| "epoch": 9.661068044788975, |
| "grad_norm": 0.3627624809741974, |
| "learning_rate": 0.00045539652618823204, |
| "loss": 3.1689, |
| "step": 35900 |
| }, |
| { |
| "epoch": 9.674526270456504, |
| "grad_norm": 0.3928319215774536, |
| "learning_rate": 0.0004551945603877743, |
| "loss": 3.1757, |
| "step": 35950 |
| }, |
| { |
| "epoch": 9.687984496124031, |
| "grad_norm": 0.37066513299942017, |
| "learning_rate": 0.00045499259458731654, |
| "loss": 3.1774, |
| "step": 36000 |
| }, |
| { |
| "epoch": 9.687984496124031, |
| "eval_accuracy": 0.39003808289581243, |
| "eval_loss": 3.3119335174560547, |
| "eval_runtime": 53.867, |
| "eval_samples_per_second": 334.379, |
| "eval_steps_per_second": 20.903, |
| "step": 36000 |
| }, |
| { |
| "epoch": 9.701442721791558, |
| "grad_norm": 0.36574748158454895, |
| "learning_rate": 0.00045479062878685874, |
| "loss": 3.169, |
| "step": 36050 |
| }, |
| { |
| "epoch": 9.714900947459087, |
| "grad_norm": 0.3367103040218353, |
| "learning_rate": 0.00045458866298640093, |
| "loss": 3.1758, |
| "step": 36100 |
| }, |
| { |
| "epoch": 9.728359173126615, |
| "grad_norm": 0.3776065707206726, |
| "learning_rate": 0.0004543866971859432, |
| "loss": 3.1861, |
| "step": 36150 |
| }, |
| { |
| "epoch": 9.741817398794144, |
| "grad_norm": 0.3644520044326782, |
| "learning_rate": 0.0004541847313854854, |
| "loss": 3.1765, |
| "step": 36200 |
| }, |
| { |
| "epoch": 9.755275624461671, |
| "grad_norm": 0.3726387917995453, |
| "learning_rate": 0.0004539827655850276, |
| "loss": 3.1772, |
| "step": 36250 |
| }, |
| { |
| "epoch": 9.768733850129198, |
| "grad_norm": 0.3458203375339508, |
| "learning_rate": 0.00045378079978456977, |
| "loss": 3.1767, |
| "step": 36300 |
| }, |
| { |
| "epoch": 9.782192075796727, |
| "grad_norm": 0.3517729640007019, |
| "learning_rate": 0.00045357883398411197, |
| "loss": 3.1872, |
| "step": 36350 |
| }, |
| { |
| "epoch": 9.795650301464255, |
| "grad_norm": 0.37035584449768066, |
| "learning_rate": 0.0004533768681836542, |
| "loss": 3.176, |
| "step": 36400 |
| }, |
| { |
| "epoch": 9.809108527131784, |
| "grad_norm": 0.331911563873291, |
| "learning_rate": 0.0004531749023831964, |
| "loss": 3.1869, |
| "step": 36450 |
| }, |
| { |
| "epoch": 9.822566752799311, |
| "grad_norm": 0.378213107585907, |
| "learning_rate": 0.0004529729365827386, |
| "loss": 3.1803, |
| "step": 36500 |
| }, |
| { |
| "epoch": 9.836024978466838, |
| "grad_norm": 0.36638858914375305, |
| "learning_rate": 0.0004527709707822808, |
| "loss": 3.1897, |
| "step": 36550 |
| }, |
| { |
| "epoch": 9.849483204134367, |
| "grad_norm": 0.35065630078315735, |
| "learning_rate": 0.000452569004981823, |
| "loss": 3.1761, |
| "step": 36600 |
| }, |
| { |
| "epoch": 9.862941429801895, |
| "grad_norm": 0.3395127058029175, |
| "learning_rate": 0.00045236703918136525, |
| "loss": 3.1697, |
| "step": 36650 |
| }, |
| { |
| "epoch": 9.876399655469424, |
| "grad_norm": 0.3822707235813141, |
| "learning_rate": 0.00045216507338090745, |
| "loss": 3.1762, |
| "step": 36700 |
| }, |
| { |
| "epoch": 9.889857881136951, |
| "grad_norm": 0.363520085811615, |
| "learning_rate": 0.00045196310758044964, |
| "loss": 3.1838, |
| "step": 36750 |
| }, |
| { |
| "epoch": 9.903316106804478, |
| "grad_norm": 0.3515053391456604, |
| "learning_rate": 0.00045176114177999184, |
| "loss": 3.1785, |
| "step": 36800 |
| }, |
| { |
| "epoch": 9.916774332472007, |
| "grad_norm": 0.349691778421402, |
| "learning_rate": 0.00045155917597953404, |
| "loss": 3.1721, |
| "step": 36850 |
| }, |
| { |
| "epoch": 9.930232558139535, |
| "grad_norm": 0.35931867361068726, |
| "learning_rate": 0.00045135721017907634, |
| "loss": 3.1714, |
| "step": 36900 |
| }, |
| { |
| "epoch": 9.943690783807062, |
| "grad_norm": 0.34536248445510864, |
| "learning_rate": 0.00045115524437861854, |
| "loss": 3.1884, |
| "step": 36950 |
| }, |
| { |
| "epoch": 9.957149009474591, |
| "grad_norm": 0.36185145378112793, |
| "learning_rate": 0.00045095327857816073, |
| "loss": 3.1915, |
| "step": 37000 |
| }, |
| { |
| "epoch": 9.957149009474591, |
| "eval_accuracy": 0.3906682346580862, |
| "eval_loss": 3.307512044906616, |
| "eval_runtime": 53.9755, |
| "eval_samples_per_second": 333.707, |
| "eval_steps_per_second": 20.861, |
| "step": 37000 |
| }, |
| { |
| "epoch": 9.970607235142118, |
| "grad_norm": 0.37505844235420227, |
| "learning_rate": 0.00045075131277770293, |
| "loss": 3.1809, |
| "step": 37050 |
| }, |
| { |
| "epoch": 9.984065460809648, |
| "grad_norm": 0.38229846954345703, |
| "learning_rate": 0.0004505493469772452, |
| "loss": 3.1617, |
| "step": 37100 |
| }, |
| { |
| "epoch": 9.997523686477175, |
| "grad_norm": 0.3569982051849365, |
| "learning_rate": 0.0004503473811767874, |
| "loss": 3.181, |
| "step": 37150 |
| }, |
| { |
| "epoch": 10.010766580534023, |
| "grad_norm": 0.3492891490459442, |
| "learning_rate": 0.00045014541537632957, |
| "loss": 3.0966, |
| "step": 37200 |
| }, |
| { |
| "epoch": 10.02422480620155, |
| "grad_norm": 0.3511972427368164, |
| "learning_rate": 0.00044994344957587177, |
| "loss": 3.0796, |
| "step": 37250 |
| }, |
| { |
| "epoch": 10.037683031869078, |
| "grad_norm": 0.37117940187454224, |
| "learning_rate": 0.00044974148377541396, |
| "loss": 3.0834, |
| "step": 37300 |
| }, |
| { |
| "epoch": 10.051141257536607, |
| "grad_norm": 0.35840341448783875, |
| "learning_rate": 0.0004495395179749562, |
| "loss": 3.0828, |
| "step": 37350 |
| }, |
| { |
| "epoch": 10.064599483204134, |
| "grad_norm": 0.34897172451019287, |
| "learning_rate": 0.0004493375521744984, |
| "loss": 3.0958, |
| "step": 37400 |
| }, |
| { |
| "epoch": 10.078057708871663, |
| "grad_norm": 0.39569246768951416, |
| "learning_rate": 0.0004491355863740406, |
| "loss": 3.089, |
| "step": 37450 |
| }, |
| { |
| "epoch": 10.09151593453919, |
| "grad_norm": 0.3689843416213989, |
| "learning_rate": 0.0004489336205735828, |
| "loss": 3.0977, |
| "step": 37500 |
| }, |
| { |
| "epoch": 10.104974160206718, |
| "grad_norm": 0.34813550114631653, |
| "learning_rate": 0.00044873165477312505, |
| "loss": 3.1025, |
| "step": 37550 |
| }, |
| { |
| "epoch": 10.118432385874247, |
| "grad_norm": 0.3824092447757721, |
| "learning_rate": 0.00044852968897266725, |
| "loss": 3.0957, |
| "step": 37600 |
| }, |
| { |
| "epoch": 10.131890611541774, |
| "grad_norm": 0.38753488659858704, |
| "learning_rate": 0.00044832772317220945, |
| "loss": 3.0925, |
| "step": 37650 |
| }, |
| { |
| "epoch": 10.145348837209303, |
| "grad_norm": 0.3686502277851105, |
| "learning_rate": 0.00044812575737175164, |
| "loss": 3.0996, |
| "step": 37700 |
| }, |
| { |
| "epoch": 10.15880706287683, |
| "grad_norm": 0.37750762701034546, |
| "learning_rate": 0.00044792379157129384, |
| "loss": 3.1156, |
| "step": 37750 |
| }, |
| { |
| "epoch": 10.172265288544358, |
| "grad_norm": 0.3830547332763672, |
| "learning_rate": 0.00044772182577083614, |
| "loss": 3.1072, |
| "step": 37800 |
| }, |
| { |
| "epoch": 10.185723514211887, |
| "grad_norm": 0.40359804034233093, |
| "learning_rate": 0.00044751985997037834, |
| "loss": 3.1251, |
| "step": 37850 |
| }, |
| { |
| "epoch": 10.199181739879414, |
| "grad_norm": 0.35740941762924194, |
| "learning_rate": 0.00044731789416992053, |
| "loss": 3.1123, |
| "step": 37900 |
| }, |
| { |
| "epoch": 10.212639965546943, |
| "grad_norm": 0.3627743124961853, |
| "learning_rate": 0.00044711592836946273, |
| "loss": 3.1226, |
| "step": 37950 |
| }, |
| { |
| "epoch": 10.22609819121447, |
| "grad_norm": 0.4064336121082306, |
| "learning_rate": 0.000446913962569005, |
| "loss": 3.1193, |
| "step": 38000 |
| }, |
| { |
| "epoch": 10.22609819121447, |
| "eval_accuracy": 0.39022843218676134, |
| "eval_loss": 3.3206238746643066, |
| "eval_runtime": 53.6924, |
| "eval_samples_per_second": 335.467, |
| "eval_steps_per_second": 20.971, |
| "step": 38000 |
| }, |
| { |
| "epoch": 10.239556416881998, |
| "grad_norm": 0.40242040157318115, |
| "learning_rate": 0.0004467119967685472, |
| "loss": 3.112, |
| "step": 38050 |
| }, |
| { |
| "epoch": 10.253014642549527, |
| "grad_norm": 0.3557272255420685, |
| "learning_rate": 0.00044651003096808937, |
| "loss": 3.1364, |
| "step": 38100 |
| }, |
| { |
| "epoch": 10.266472868217054, |
| "grad_norm": 0.3683841824531555, |
| "learning_rate": 0.00044630806516763157, |
| "loss": 3.1146, |
| "step": 38150 |
| }, |
| { |
| "epoch": 10.279931093884581, |
| "grad_norm": 0.33976754546165466, |
| "learning_rate": 0.00044610609936717376, |
| "loss": 3.1277, |
| "step": 38200 |
| }, |
| { |
| "epoch": 10.29338931955211, |
| "grad_norm": 0.3842891752719879, |
| "learning_rate": 0.000445904133566716, |
| "loss": 3.128, |
| "step": 38250 |
| }, |
| { |
| "epoch": 10.306847545219638, |
| "grad_norm": 0.3514265716075897, |
| "learning_rate": 0.0004457021677662582, |
| "loss": 3.1333, |
| "step": 38300 |
| }, |
| { |
| "epoch": 10.320305770887167, |
| "grad_norm": 0.34577810764312744, |
| "learning_rate": 0.0004455002019658004, |
| "loss": 3.1148, |
| "step": 38350 |
| }, |
| { |
| "epoch": 10.333763996554694, |
| "grad_norm": 0.37134677171707153, |
| "learning_rate": 0.0004452982361653426, |
| "loss": 3.1305, |
| "step": 38400 |
| }, |
| { |
| "epoch": 10.347222222222221, |
| "grad_norm": 0.39002493023872375, |
| "learning_rate": 0.0004450962703648848, |
| "loss": 3.1265, |
| "step": 38450 |
| }, |
| { |
| "epoch": 10.36068044788975, |
| "grad_norm": 0.377231240272522, |
| "learning_rate": 0.00044489430456442705, |
| "loss": 3.1283, |
| "step": 38500 |
| }, |
| { |
| "epoch": 10.374138673557278, |
| "grad_norm": 0.36426547169685364, |
| "learning_rate": 0.00044469233876396925, |
| "loss": 3.1356, |
| "step": 38550 |
| }, |
| { |
| "epoch": 10.387596899224807, |
| "grad_norm": 0.3630325198173523, |
| "learning_rate": 0.00044449037296351144, |
| "loss": 3.1403, |
| "step": 38600 |
| }, |
| { |
| "epoch": 10.401055124892334, |
| "grad_norm": 0.3570690453052521, |
| "learning_rate": 0.0004442884071630537, |
| "loss": 3.1327, |
| "step": 38650 |
| }, |
| { |
| "epoch": 10.414513350559861, |
| "grad_norm": 0.3494502604007721, |
| "learning_rate": 0.00044408644136259594, |
| "loss": 3.1401, |
| "step": 38700 |
| }, |
| { |
| "epoch": 10.42797157622739, |
| "grad_norm": 0.3937465250492096, |
| "learning_rate": 0.00044388447556213814, |
| "loss": 3.1323, |
| "step": 38750 |
| }, |
| { |
| "epoch": 10.441429801894918, |
| "grad_norm": 0.3534800410270691, |
| "learning_rate": 0.00044368250976168033, |
| "loss": 3.1363, |
| "step": 38800 |
| }, |
| { |
| "epoch": 10.454888027562447, |
| "grad_norm": 0.35517048835754395, |
| "learning_rate": 0.00044348054396122253, |
| "loss": 3.1398, |
| "step": 38850 |
| }, |
| { |
| "epoch": 10.468346253229974, |
| "grad_norm": 0.3748292028903961, |
| "learning_rate": 0.0004432785781607647, |
| "loss": 3.1397, |
| "step": 38900 |
| }, |
| { |
| "epoch": 10.481804478897502, |
| "grad_norm": 0.3736768066883087, |
| "learning_rate": 0.000443076612360307, |
| "loss": 3.1524, |
| "step": 38950 |
| }, |
| { |
| "epoch": 10.49526270456503, |
| "grad_norm": 0.3461846709251404, |
| "learning_rate": 0.0004428746465598492, |
| "loss": 3.1495, |
| "step": 39000 |
| }, |
| { |
| "epoch": 10.49526270456503, |
| "eval_accuracy": 0.3906297736712164, |
| "eval_loss": 3.3105146884918213, |
| "eval_runtime": 53.6571, |
| "eval_samples_per_second": 335.687, |
| "eval_steps_per_second": 20.985, |
| "step": 39000 |
| }, |
| { |
| "epoch": 10.508720930232558, |
| "grad_norm": 0.3518693149089813, |
| "learning_rate": 0.00044267268075939137, |
| "loss": 3.1377, |
| "step": 39050 |
| }, |
| { |
| "epoch": 10.522179155900087, |
| "grad_norm": 0.35536989569664, |
| "learning_rate": 0.00044247071495893357, |
| "loss": 3.1556, |
| "step": 39100 |
| }, |
| { |
| "epoch": 10.535637381567614, |
| "grad_norm": 0.3642809987068176, |
| "learning_rate": 0.00044226874915847576, |
| "loss": 3.151, |
| "step": 39150 |
| }, |
| { |
| "epoch": 10.549095607235142, |
| "grad_norm": 0.3890918791294098, |
| "learning_rate": 0.000442066783358018, |
| "loss": 3.1532, |
| "step": 39200 |
| }, |
| { |
| "epoch": 10.56255383290267, |
| "grad_norm": 0.37581443786621094, |
| "learning_rate": 0.0004418648175575602, |
| "loss": 3.1472, |
| "step": 39250 |
| }, |
| { |
| "epoch": 10.576012058570198, |
| "grad_norm": 0.3536636233329773, |
| "learning_rate": 0.0004416628517571024, |
| "loss": 3.1446, |
| "step": 39300 |
| }, |
| { |
| "epoch": 10.589470284237725, |
| "grad_norm": 0.3826201856136322, |
| "learning_rate": 0.0004414608859566446, |
| "loss": 3.1564, |
| "step": 39350 |
| }, |
| { |
| "epoch": 10.602928509905254, |
| "grad_norm": 0.40004265308380127, |
| "learning_rate": 0.00044125892015618685, |
| "loss": 3.1413, |
| "step": 39400 |
| }, |
| { |
| "epoch": 10.616386735572782, |
| "grad_norm": 0.3891655504703522, |
| "learning_rate": 0.00044105695435572905, |
| "loss": 3.1628, |
| "step": 39450 |
| }, |
| { |
| "epoch": 10.62984496124031, |
| "grad_norm": 0.3744836747646332, |
| "learning_rate": 0.00044085498855527124, |
| "loss": 3.1475, |
| "step": 39500 |
| }, |
| { |
| "epoch": 10.643303186907838, |
| "grad_norm": 0.3514010012149811, |
| "learning_rate": 0.0004406530227548135, |
| "loss": 3.161, |
| "step": 39550 |
| }, |
| { |
| "epoch": 10.656761412575365, |
| "grad_norm": 0.35587388277053833, |
| "learning_rate": 0.00044045105695435574, |
| "loss": 3.1597, |
| "step": 39600 |
| }, |
| { |
| "epoch": 10.670219638242894, |
| "grad_norm": 0.36579716205596924, |
| "learning_rate": 0.00044024909115389794, |
| "loss": 3.154, |
| "step": 39650 |
| }, |
| { |
| "epoch": 10.683677863910422, |
| "grad_norm": 0.3504290282726288, |
| "learning_rate": 0.00044004712535344014, |
| "loss": 3.1476, |
| "step": 39700 |
| }, |
| { |
| "epoch": 10.69713608957795, |
| "grad_norm": 0.4184782803058624, |
| "learning_rate": 0.00043984515955298233, |
| "loss": 3.1578, |
| "step": 39750 |
| }, |
| { |
| "epoch": 10.710594315245478, |
| "grad_norm": 0.40172553062438965, |
| "learning_rate": 0.00043964319375252453, |
| "loss": 3.1515, |
| "step": 39800 |
| }, |
| { |
| "epoch": 10.724052540913005, |
| "grad_norm": 0.35225343704223633, |
| "learning_rate": 0.0004394412279520668, |
| "loss": 3.1479, |
| "step": 39850 |
| }, |
| { |
| "epoch": 10.737510766580534, |
| "grad_norm": 0.37683871388435364, |
| "learning_rate": 0.000439239262151609, |
| "loss": 3.1586, |
| "step": 39900 |
| }, |
| { |
| "epoch": 10.750968992248062, |
| "grad_norm": 0.36777010560035706, |
| "learning_rate": 0.00043903729635115117, |
| "loss": 3.1583, |
| "step": 39950 |
| }, |
| { |
| "epoch": 10.764427217915589, |
| "grad_norm": 0.36609140038490295, |
| "learning_rate": 0.00043883533055069337, |
| "loss": 3.1636, |
| "step": 40000 |
| }, |
| { |
| "epoch": 10.764427217915589, |
| "eval_accuracy": 0.3912790472800694, |
| "eval_loss": 3.3053524494171143, |
| "eval_runtime": 53.8099, |
| "eval_samples_per_second": 334.734, |
| "eval_steps_per_second": 20.926, |
| "step": 40000 |
| }, |
| { |
| "epoch": 10.777885443583118, |
| "grad_norm": 0.3523204028606415, |
| "learning_rate": 0.00043863336475023556, |
| "loss": 3.1587, |
| "step": 40050 |
| }, |
| { |
| "epoch": 10.791343669250645, |
| "grad_norm": 0.3931622803211212, |
| "learning_rate": 0.0004384313989497778, |
| "loss": 3.157, |
| "step": 40100 |
| }, |
| { |
| "epoch": 10.804801894918175, |
| "grad_norm": 0.3558056354522705, |
| "learning_rate": 0.00043822943314932, |
| "loss": 3.1674, |
| "step": 40150 |
| }, |
| { |
| "epoch": 10.818260120585702, |
| "grad_norm": 0.3576938807964325, |
| "learning_rate": 0.0004380274673488622, |
| "loss": 3.156, |
| "step": 40200 |
| }, |
| { |
| "epoch": 10.83171834625323, |
| "grad_norm": 0.3403548002243042, |
| "learning_rate": 0.0004378255015484044, |
| "loss": 3.1594, |
| "step": 40250 |
| }, |
| { |
| "epoch": 10.845176571920758, |
| "grad_norm": 0.3665982186794281, |
| "learning_rate": 0.0004376235357479466, |
| "loss": 3.1629, |
| "step": 40300 |
| }, |
| { |
| "epoch": 10.858634797588286, |
| "grad_norm": 0.3475422263145447, |
| "learning_rate": 0.00043742156994748885, |
| "loss": 3.1553, |
| "step": 40350 |
| }, |
| { |
| "epoch": 10.872093023255815, |
| "grad_norm": 0.33809924125671387, |
| "learning_rate": 0.00043721960414703104, |
| "loss": 3.1654, |
| "step": 40400 |
| }, |
| { |
| "epoch": 10.885551248923342, |
| "grad_norm": 0.3743918836116791, |
| "learning_rate": 0.0004370176383465733, |
| "loss": 3.1581, |
| "step": 40450 |
| }, |
| { |
| "epoch": 10.89900947459087, |
| "grad_norm": 0.36682623624801636, |
| "learning_rate": 0.0004368156725461155, |
| "loss": 3.1621, |
| "step": 40500 |
| }, |
| { |
| "epoch": 10.912467700258398, |
| "grad_norm": 0.3797398507595062, |
| "learning_rate": 0.00043661370674565774, |
| "loss": 3.1628, |
| "step": 40550 |
| }, |
| { |
| "epoch": 10.925925925925926, |
| "grad_norm": 0.3800196051597595, |
| "learning_rate": 0.00043641174094519994, |
| "loss": 3.1664, |
| "step": 40600 |
| }, |
| { |
| "epoch": 10.939384151593455, |
| "grad_norm": 0.3516307473182678, |
| "learning_rate": 0.00043620977514474213, |
| "loss": 3.1608, |
| "step": 40650 |
| }, |
| { |
| "epoch": 10.952842377260982, |
| "grad_norm": 0.37099623680114746, |
| "learning_rate": 0.00043600780934428433, |
| "loss": 3.1647, |
| "step": 40700 |
| }, |
| { |
| "epoch": 10.96630060292851, |
| "grad_norm": 0.38898375630378723, |
| "learning_rate": 0.0004358058435438265, |
| "loss": 3.1516, |
| "step": 40750 |
| }, |
| { |
| "epoch": 10.979758828596038, |
| "grad_norm": 0.3787337839603424, |
| "learning_rate": 0.0004356038777433688, |
| "loss": 3.1438, |
| "step": 40800 |
| }, |
| { |
| "epoch": 10.993217054263566, |
| "grad_norm": 0.3420003354549408, |
| "learning_rate": 0.00043540191194291097, |
| "loss": 3.1638, |
| "step": 40850 |
| }, |
| { |
| "epoch": 11.006459948320414, |
| "grad_norm": 0.373717337846756, |
| "learning_rate": 0.00043519994614245317, |
| "loss": 3.0953, |
| "step": 40900 |
| }, |
| { |
| "epoch": 11.019918173987941, |
| "grad_norm": 0.36699631810188293, |
| "learning_rate": 0.00043499798034199536, |
| "loss": 3.0586, |
| "step": 40950 |
| }, |
| { |
| "epoch": 11.03337639965547, |
| "grad_norm": 0.351595401763916, |
| "learning_rate": 0.00043479601454153756, |
| "loss": 3.0765, |
| "step": 41000 |
| }, |
| { |
| "epoch": 11.03337639965547, |
| "eval_accuracy": 0.3908649941135134, |
| "eval_loss": 3.3149876594543457, |
| "eval_runtime": 53.8322, |
| "eval_samples_per_second": 334.596, |
| "eval_steps_per_second": 20.917, |
| "step": 41000 |
| }, |
| { |
| "epoch": 11.046834625322997, |
| "grad_norm": 0.36418431997299194, |
| "learning_rate": 0.0004345940487410798, |
| "loss": 3.0751, |
| "step": 41050 |
| }, |
| { |
| "epoch": 11.060292850990525, |
| "grad_norm": 0.3695267140865326, |
| "learning_rate": 0.000434392082940622, |
| "loss": 3.0719, |
| "step": 41100 |
| }, |
| { |
| "epoch": 11.073751076658054, |
| "grad_norm": 0.3487664461135864, |
| "learning_rate": 0.0004341901171401642, |
| "loss": 3.0753, |
| "step": 41150 |
| }, |
| { |
| "epoch": 11.087209302325581, |
| "grad_norm": 0.3637808859348297, |
| "learning_rate": 0.0004339881513397064, |
| "loss": 3.0803, |
| "step": 41200 |
| }, |
| { |
| "epoch": 11.10066752799311, |
| "grad_norm": 0.3343772888183594, |
| "learning_rate": 0.0004337861855392486, |
| "loss": 3.0836, |
| "step": 41250 |
| }, |
| { |
| "epoch": 11.114125753660637, |
| "grad_norm": 0.3723909854888916, |
| "learning_rate": 0.00043358421973879084, |
| "loss": 3.0935, |
| "step": 41300 |
| }, |
| { |
| "epoch": 11.127583979328165, |
| "grad_norm": 0.34839126467704773, |
| "learning_rate": 0.0004333822539383331, |
| "loss": 3.0799, |
| "step": 41350 |
| }, |
| { |
| "epoch": 11.141042204995694, |
| "grad_norm": 0.40357962250709534, |
| "learning_rate": 0.0004331802881378753, |
| "loss": 3.0947, |
| "step": 41400 |
| }, |
| { |
| "epoch": 11.154500430663221, |
| "grad_norm": 0.3761771619319916, |
| "learning_rate": 0.00043297832233741754, |
| "loss": 3.0852, |
| "step": 41450 |
| }, |
| { |
| "epoch": 11.16795865633075, |
| "grad_norm": 0.3732476532459259, |
| "learning_rate": 0.00043277635653695974, |
| "loss": 3.1048, |
| "step": 41500 |
| }, |
| { |
| "epoch": 11.181416881998278, |
| "grad_norm": 0.3786817789077759, |
| "learning_rate": 0.00043257439073650193, |
| "loss": 3.098, |
| "step": 41550 |
| }, |
| { |
| "epoch": 11.194875107665805, |
| "grad_norm": 0.359264999628067, |
| "learning_rate": 0.00043237242493604413, |
| "loss": 3.0977, |
| "step": 41600 |
| }, |
| { |
| "epoch": 11.208333333333334, |
| "grad_norm": 0.38861802220344543, |
| "learning_rate": 0.0004321704591355863, |
| "loss": 3.1024, |
| "step": 41650 |
| }, |
| { |
| "epoch": 11.221791559000861, |
| "grad_norm": 0.3696124255657196, |
| "learning_rate": 0.0004319684933351286, |
| "loss": 3.1105, |
| "step": 41700 |
| }, |
| { |
| "epoch": 11.235249784668389, |
| "grad_norm": 0.3674962520599365, |
| "learning_rate": 0.00043176652753467077, |
| "loss": 3.1026, |
| "step": 41750 |
| }, |
| { |
| "epoch": 11.248708010335918, |
| "grad_norm": 0.3563523292541504, |
| "learning_rate": 0.00043156456173421297, |
| "loss": 3.0967, |
| "step": 41800 |
| }, |
| { |
| "epoch": 11.262166236003445, |
| "grad_norm": 0.36378583312034607, |
| "learning_rate": 0.00043136259593375516, |
| "loss": 3.1061, |
| "step": 41850 |
| }, |
| { |
| "epoch": 11.275624461670974, |
| "grad_norm": 0.3854440152645111, |
| "learning_rate": 0.00043116063013329736, |
| "loss": 3.0973, |
| "step": 41900 |
| }, |
| { |
| "epoch": 11.289082687338501, |
| "grad_norm": 0.37868574261665344, |
| "learning_rate": 0.0004309586643328396, |
| "loss": 3.1045, |
| "step": 41950 |
| }, |
| { |
| "epoch": 11.302540913006029, |
| "grad_norm": 0.37912723422050476, |
| "learning_rate": 0.0004307566985323818, |
| "loss": 3.1059, |
| "step": 42000 |
| }, |
| { |
| "epoch": 11.302540913006029, |
| "eval_accuracy": 0.3909425679683864, |
| "eval_loss": 3.3138086795806885, |
| "eval_runtime": 54.0416, |
| "eval_samples_per_second": 333.299, |
| "eval_steps_per_second": 20.836, |
| "step": 42000 |
| }, |
| { |
| "epoch": 11.315999138673558, |
| "grad_norm": 0.37423864006996155, |
| "learning_rate": 0.000430554732731924, |
| "loss": 3.1092, |
| "step": 42050 |
| }, |
| { |
| "epoch": 11.329457364341085, |
| "grad_norm": 0.3635264039039612, |
| "learning_rate": 0.0004303527669314662, |
| "loss": 3.1067, |
| "step": 42100 |
| }, |
| { |
| "epoch": 11.342915590008614, |
| "grad_norm": 0.37760043144226074, |
| "learning_rate": 0.0004301508011310084, |
| "loss": 3.1138, |
| "step": 42150 |
| }, |
| { |
| "epoch": 11.356373815676141, |
| "grad_norm": 0.35156896710395813, |
| "learning_rate": 0.00042994883533055065, |
| "loss": 3.1106, |
| "step": 42200 |
| }, |
| { |
| "epoch": 11.369832041343669, |
| "grad_norm": 0.366276353597641, |
| "learning_rate": 0.0004297468695300929, |
| "loss": 3.1133, |
| "step": 42250 |
| }, |
| { |
| "epoch": 11.383290267011198, |
| "grad_norm": 0.35794273018836975, |
| "learning_rate": 0.0004295449037296351, |
| "loss": 3.122, |
| "step": 42300 |
| }, |
| { |
| "epoch": 11.396748492678725, |
| "grad_norm": 0.3940405547618866, |
| "learning_rate": 0.0004293429379291773, |
| "loss": 3.1278, |
| "step": 42350 |
| }, |
| { |
| "epoch": 11.410206718346252, |
| "grad_norm": 0.369793176651001, |
| "learning_rate": 0.00042914097212871954, |
| "loss": 3.111, |
| "step": 42400 |
| }, |
| { |
| "epoch": 11.423664944013781, |
| "grad_norm": 0.40873241424560547, |
| "learning_rate": 0.00042893900632826173, |
| "loss": 3.1187, |
| "step": 42450 |
| }, |
| { |
| "epoch": 11.437123169681309, |
| "grad_norm": 0.3555212616920471, |
| "learning_rate": 0.00042873704052780393, |
| "loss": 3.1219, |
| "step": 42500 |
| }, |
| { |
| "epoch": 11.450581395348838, |
| "grad_norm": 0.3871249556541443, |
| "learning_rate": 0.0004285350747273461, |
| "loss": 3.1233, |
| "step": 42550 |
| }, |
| { |
| "epoch": 11.464039621016365, |
| "grad_norm": 0.36317795515060425, |
| "learning_rate": 0.0004283331089268883, |
| "loss": 3.1194, |
| "step": 42600 |
| }, |
| { |
| "epoch": 11.477497846683892, |
| "grad_norm": 0.36784833669662476, |
| "learning_rate": 0.00042813114312643057, |
| "loss": 3.1254, |
| "step": 42650 |
| }, |
| { |
| "epoch": 11.490956072351421, |
| "grad_norm": 0.38175830245018005, |
| "learning_rate": 0.00042792917732597277, |
| "loss": 3.1371, |
| "step": 42700 |
| }, |
| { |
| "epoch": 11.504414298018949, |
| "grad_norm": 0.3648242652416229, |
| "learning_rate": 0.00042772721152551496, |
| "loss": 3.1285, |
| "step": 42750 |
| }, |
| { |
| "epoch": 11.517872523686478, |
| "grad_norm": 0.37506625056266785, |
| "learning_rate": 0.00042752524572505716, |
| "loss": 3.1295, |
| "step": 42800 |
| }, |
| { |
| "epoch": 11.531330749354005, |
| "grad_norm": 0.37387633323669434, |
| "learning_rate": 0.00042732327992459936, |
| "loss": 3.1248, |
| "step": 42850 |
| }, |
| { |
| "epoch": 11.544788975021532, |
| "grad_norm": 0.3871805667877197, |
| "learning_rate": 0.0004271213141241416, |
| "loss": 3.1234, |
| "step": 42900 |
| }, |
| { |
| "epoch": 11.558247200689062, |
| "grad_norm": 0.35133877396583557, |
| "learning_rate": 0.0004269193483236838, |
| "loss": 3.1269, |
| "step": 42950 |
| }, |
| { |
| "epoch": 11.571705426356589, |
| "grad_norm": 0.36315521597862244, |
| "learning_rate": 0.000426717382523226, |
| "loss": 3.1231, |
| "step": 43000 |
| }, |
| { |
| "epoch": 11.571705426356589, |
| "eval_accuracy": 0.39158260659451644, |
| "eval_loss": 3.3041725158691406, |
| "eval_runtime": 53.8797, |
| "eval_samples_per_second": 334.3, |
| "eval_steps_per_second": 20.898, |
| "step": 43000 |
| }, |
| { |
| "epoch": 11.585163652024118, |
| "grad_norm": 0.367910772562027, |
| "learning_rate": 0.0004265154167227682, |
| "loss": 3.123, |
| "step": 43050 |
| }, |
| { |
| "epoch": 11.598621877691645, |
| "grad_norm": 0.38121262192726135, |
| "learning_rate": 0.0004263134509223105, |
| "loss": 3.1253, |
| "step": 43100 |
| }, |
| { |
| "epoch": 11.612080103359173, |
| "grad_norm": 0.7838655114173889, |
| "learning_rate": 0.0004261114851218527, |
| "loss": 3.1172, |
| "step": 43150 |
| }, |
| { |
| "epoch": 11.625538329026702, |
| "grad_norm": 0.3997848629951477, |
| "learning_rate": 0.0004259095193213949, |
| "loss": 3.1365, |
| "step": 43200 |
| }, |
| { |
| "epoch": 11.638996554694229, |
| "grad_norm": 0.34914714097976685, |
| "learning_rate": 0.0004257075535209371, |
| "loss": 3.1371, |
| "step": 43250 |
| }, |
| { |
| "epoch": 11.652454780361758, |
| "grad_norm": 0.3584500849246979, |
| "learning_rate": 0.00042550558772047934, |
| "loss": 3.1419, |
| "step": 43300 |
| }, |
| { |
| "epoch": 11.665913006029285, |
| "grad_norm": 0.3795340359210968, |
| "learning_rate": 0.00042530362192002153, |
| "loss": 3.1181, |
| "step": 43350 |
| }, |
| { |
| "epoch": 11.679371231696813, |
| "grad_norm": 0.3768688142299652, |
| "learning_rate": 0.00042510165611956373, |
| "loss": 3.1343, |
| "step": 43400 |
| }, |
| { |
| "epoch": 11.692829457364342, |
| "grad_norm": 0.37140101194381714, |
| "learning_rate": 0.0004248996903191059, |
| "loss": 3.1356, |
| "step": 43450 |
| }, |
| { |
| "epoch": 11.706287683031869, |
| "grad_norm": 0.34071606397628784, |
| "learning_rate": 0.0004246977245186481, |
| "loss": 3.1279, |
| "step": 43500 |
| }, |
| { |
| "epoch": 11.719745908699396, |
| "grad_norm": 0.39517703652381897, |
| "learning_rate": 0.0004244957587181904, |
| "loss": 3.1408, |
| "step": 43550 |
| }, |
| { |
| "epoch": 11.733204134366925, |
| "grad_norm": 0.3629872500896454, |
| "learning_rate": 0.00042429379291773257, |
| "loss": 3.1351, |
| "step": 43600 |
| }, |
| { |
| "epoch": 11.746662360034453, |
| "grad_norm": 0.4008491635322571, |
| "learning_rate": 0.00042409182711727477, |
| "loss": 3.137, |
| "step": 43650 |
| }, |
| { |
| "epoch": 11.760120585701982, |
| "grad_norm": 0.3763497471809387, |
| "learning_rate": 0.00042388986131681696, |
| "loss": 3.1371, |
| "step": 43700 |
| }, |
| { |
| "epoch": 11.773578811369509, |
| "grad_norm": 0.37213990092277527, |
| "learning_rate": 0.00042368789551635916, |
| "loss": 3.1363, |
| "step": 43750 |
| }, |
| { |
| "epoch": 11.787037037037036, |
| "grad_norm": 0.3561500906944275, |
| "learning_rate": 0.0004234859297159014, |
| "loss": 3.141, |
| "step": 43800 |
| }, |
| { |
| "epoch": 11.800495262704565, |
| "grad_norm": 0.3931141793727875, |
| "learning_rate": 0.0004232839639154436, |
| "loss": 3.1416, |
| "step": 43850 |
| }, |
| { |
| "epoch": 11.813953488372093, |
| "grad_norm": 0.35057225823402405, |
| "learning_rate": 0.0004230819981149858, |
| "loss": 3.1379, |
| "step": 43900 |
| }, |
| { |
| "epoch": 11.827411714039622, |
| "grad_norm": 0.35676315426826477, |
| "learning_rate": 0.000422880032314528, |
| "loss": 3.1338, |
| "step": 43950 |
| }, |
| { |
| "epoch": 11.840869939707149, |
| "grad_norm": 0.35738396644592285, |
| "learning_rate": 0.0004226780665140703, |
| "loss": 3.1332, |
| "step": 44000 |
| }, |
| { |
| "epoch": 11.840869939707149, |
| "eval_accuracy": 0.39190430993384273, |
| "eval_loss": 3.3014116287231445, |
| "eval_runtime": 53.7607, |
| "eval_samples_per_second": 335.041, |
| "eval_steps_per_second": 20.945, |
| "step": 44000 |
| }, |
| { |
| "epoch": 11.854328165374676, |
| "grad_norm": 0.3747558891773224, |
| "learning_rate": 0.0004224761007136125, |
| "loss": 3.1428, |
| "step": 44050 |
| }, |
| { |
| "epoch": 11.867786391042205, |
| "grad_norm": 0.3758200407028198, |
| "learning_rate": 0.0004222741349131547, |
| "loss": 3.1328, |
| "step": 44100 |
| }, |
| { |
| "epoch": 11.881244616709733, |
| "grad_norm": 0.3885456621646881, |
| "learning_rate": 0.0004220721691126969, |
| "loss": 3.1333, |
| "step": 44150 |
| }, |
| { |
| "epoch": 11.89470284237726, |
| "grad_norm": 0.35970941185951233, |
| "learning_rate": 0.0004218702033122391, |
| "loss": 3.1387, |
| "step": 44200 |
| }, |
| { |
| "epoch": 11.90816106804479, |
| "grad_norm": 0.3670229911804199, |
| "learning_rate": 0.00042166823751178134, |
| "loss": 3.139, |
| "step": 44250 |
| }, |
| { |
| "epoch": 11.921619293712316, |
| "grad_norm": 0.3680804967880249, |
| "learning_rate": 0.00042146627171132353, |
| "loss": 3.1381, |
| "step": 44300 |
| }, |
| { |
| "epoch": 11.935077519379846, |
| "grad_norm": 0.367384135723114, |
| "learning_rate": 0.00042126430591086573, |
| "loss": 3.1405, |
| "step": 44350 |
| }, |
| { |
| "epoch": 11.948535745047373, |
| "grad_norm": 0.3529140055179596, |
| "learning_rate": 0.0004210623401104079, |
| "loss": 3.1475, |
| "step": 44400 |
| }, |
| { |
| "epoch": 11.9619939707149, |
| "grad_norm": 0.34686964750289917, |
| "learning_rate": 0.0004208603743099501, |
| "loss": 3.1463, |
| "step": 44450 |
| }, |
| { |
| "epoch": 11.97545219638243, |
| "grad_norm": 0.3600054085254669, |
| "learning_rate": 0.00042065840850949237, |
| "loss": 3.149, |
| "step": 44500 |
| }, |
| { |
| "epoch": 11.988910422049956, |
| "grad_norm": 0.37668830156326294, |
| "learning_rate": 0.00042045644270903457, |
| "loss": 3.1494, |
| "step": 44550 |
| }, |
| { |
| "epoch": 12.002153316106805, |
| "grad_norm": 0.39227724075317383, |
| "learning_rate": 0.00042025447690857676, |
| "loss": 3.12, |
| "step": 44600 |
| }, |
| { |
| "epoch": 12.015611541774332, |
| "grad_norm": 0.3704946041107178, |
| "learning_rate": 0.00042005251110811896, |
| "loss": 3.0452, |
| "step": 44650 |
| }, |
| { |
| "epoch": 12.029069767441861, |
| "grad_norm": 0.3810844421386719, |
| "learning_rate": 0.00041985054530766115, |
| "loss": 3.0469, |
| "step": 44700 |
| }, |
| { |
| "epoch": 12.042527993109388, |
| "grad_norm": 0.4110226035118103, |
| "learning_rate": 0.0004196485795072034, |
| "loss": 3.053, |
| "step": 44750 |
| }, |
| { |
| "epoch": 12.055986218776916, |
| "grad_norm": 0.38302749395370483, |
| "learning_rate": 0.0004194466137067456, |
| "loss": 3.0359, |
| "step": 44800 |
| }, |
| { |
| "epoch": 12.069444444444445, |
| "grad_norm": 0.38793662190437317, |
| "learning_rate": 0.0004192446479062878, |
| "loss": 3.0611, |
| "step": 44850 |
| }, |
| { |
| "epoch": 12.082902670111972, |
| "grad_norm": 0.3869202136993408, |
| "learning_rate": 0.00041904268210583005, |
| "loss": 3.0617, |
| "step": 44900 |
| }, |
| { |
| "epoch": 12.096360895779501, |
| "grad_norm": 0.40806224942207336, |
| "learning_rate": 0.0004188407163053723, |
| "loss": 3.0621, |
| "step": 44950 |
| }, |
| { |
| "epoch": 12.109819121447028, |
| "grad_norm": 0.3624691367149353, |
| "learning_rate": 0.0004186387505049145, |
| "loss": 3.0655, |
| "step": 45000 |
| }, |
| { |
| "epoch": 12.109819121447028, |
| "eval_accuracy": 0.3916841914044692, |
| "eval_loss": 3.3107516765594482, |
| "eval_runtime": 53.6121, |
| "eval_samples_per_second": 335.969, |
| "eval_steps_per_second": 21.003, |
| "step": 45000 |
| }, |
| { |
| "epoch": 12.123277347114556, |
| "grad_norm": 0.38657304644584656, |
| "learning_rate": 0.0004184367847044567, |
| "loss": 3.0653, |
| "step": 45050 |
| }, |
| { |
| "epoch": 12.136735572782085, |
| "grad_norm": 0.42197513580322266, |
| "learning_rate": 0.0004182348189039989, |
| "loss": 3.0755, |
| "step": 45100 |
| }, |
| { |
| "epoch": 12.150193798449612, |
| "grad_norm": 0.41983577609062195, |
| "learning_rate": 0.00041803285310354114, |
| "loss": 3.0759, |
| "step": 45150 |
| }, |
| { |
| "epoch": 12.163652024117141, |
| "grad_norm": 0.352384090423584, |
| "learning_rate": 0.00041783088730308333, |
| "loss": 3.0761, |
| "step": 45200 |
| }, |
| { |
| "epoch": 12.177110249784668, |
| "grad_norm": 0.42857182025909424, |
| "learning_rate": 0.00041762892150262553, |
| "loss": 3.0656, |
| "step": 45250 |
| }, |
| { |
| "epoch": 12.190568475452196, |
| "grad_norm": 0.35385847091674805, |
| "learning_rate": 0.0004174269557021677, |
| "loss": 3.0833, |
| "step": 45300 |
| }, |
| { |
| "epoch": 12.204026701119725, |
| "grad_norm": 0.40004876255989075, |
| "learning_rate": 0.0004172249899017099, |
| "loss": 3.0804, |
| "step": 45350 |
| }, |
| { |
| "epoch": 12.217484926787252, |
| "grad_norm": 0.36738428473472595, |
| "learning_rate": 0.00041702302410125217, |
| "loss": 3.0868, |
| "step": 45400 |
| }, |
| { |
| "epoch": 12.230943152454781, |
| "grad_norm": 0.3519749939441681, |
| "learning_rate": 0.00041682105830079437, |
| "loss": 3.0793, |
| "step": 45450 |
| }, |
| { |
| "epoch": 12.244401378122308, |
| "grad_norm": 0.3788878917694092, |
| "learning_rate": 0.00041661909250033656, |
| "loss": 3.0942, |
| "step": 45500 |
| }, |
| { |
| "epoch": 12.257859603789836, |
| "grad_norm": 0.3716530501842499, |
| "learning_rate": 0.00041641712669987876, |
| "loss": 3.0917, |
| "step": 45550 |
| }, |
| { |
| "epoch": 12.271317829457365, |
| "grad_norm": 0.3835557699203491, |
| "learning_rate": 0.00041621516089942096, |
| "loss": 3.0832, |
| "step": 45600 |
| }, |
| { |
| "epoch": 12.284776055124892, |
| "grad_norm": 0.42075875401496887, |
| "learning_rate": 0.0004160131950989632, |
| "loss": 3.0948, |
| "step": 45650 |
| }, |
| { |
| "epoch": 12.298234280792421, |
| "grad_norm": 0.372883677482605, |
| "learning_rate": 0.0004158112292985054, |
| "loss": 3.0969, |
| "step": 45700 |
| }, |
| { |
| "epoch": 12.311692506459949, |
| "grad_norm": 0.35647454857826233, |
| "learning_rate": 0.0004156092634980476, |
| "loss": 3.0887, |
| "step": 45750 |
| }, |
| { |
| "epoch": 12.325150732127476, |
| "grad_norm": 0.3993690013885498, |
| "learning_rate": 0.00041540729769758985, |
| "loss": 3.0938, |
| "step": 45800 |
| }, |
| { |
| "epoch": 12.338608957795005, |
| "grad_norm": 0.36539211869239807, |
| "learning_rate": 0.0004152053318971321, |
| "loss": 3.1063, |
| "step": 45850 |
| }, |
| { |
| "epoch": 12.352067183462532, |
| "grad_norm": 0.38160133361816406, |
| "learning_rate": 0.0004150033660966743, |
| "loss": 3.0854, |
| "step": 45900 |
| }, |
| { |
| "epoch": 12.36552540913006, |
| "grad_norm": 0.34852227568626404, |
| "learning_rate": 0.0004148014002962165, |
| "loss": 3.0945, |
| "step": 45950 |
| }, |
| { |
| "epoch": 12.378983634797589, |
| "grad_norm": 0.38215118646621704, |
| "learning_rate": 0.0004145994344957587, |
| "loss": 3.0919, |
| "step": 46000 |
| }, |
| { |
| "epoch": 12.378983634797589, |
| "eval_accuracy": 0.3920929208751026, |
| "eval_loss": 3.30391001701355, |
| "eval_runtime": 53.9181, |
| "eval_samples_per_second": 334.063, |
| "eval_steps_per_second": 20.884, |
| "step": 46000 |
| }, |
| { |
| "epoch": 12.392441860465116, |
| "grad_norm": 0.40114298462867737, |
| "learning_rate": 0.0004143974686953009, |
| "loss": 3.1014, |
| "step": 46050 |
| }, |
| { |
| "epoch": 12.405900086132645, |
| "grad_norm": 0.40130308270454407, |
| "learning_rate": 0.00041419550289484313, |
| "loss": 3.1058, |
| "step": 46100 |
| }, |
| { |
| "epoch": 12.419358311800172, |
| "grad_norm": 0.37803414463996887, |
| "learning_rate": 0.00041399353709438533, |
| "loss": 3.0961, |
| "step": 46150 |
| }, |
| { |
| "epoch": 12.4328165374677, |
| "grad_norm": 0.4074687957763672, |
| "learning_rate": 0.0004137915712939275, |
| "loss": 3.1013, |
| "step": 46200 |
| }, |
| { |
| "epoch": 12.446274763135229, |
| "grad_norm": 0.35416853427886963, |
| "learning_rate": 0.0004135896054934697, |
| "loss": 3.107, |
| "step": 46250 |
| }, |
| { |
| "epoch": 12.459732988802756, |
| "grad_norm": 0.394949734210968, |
| "learning_rate": 0.0004133876396930119, |
| "loss": 3.1127, |
| "step": 46300 |
| }, |
| { |
| "epoch": 12.473191214470285, |
| "grad_norm": 0.3806135654449463, |
| "learning_rate": 0.00041318567389255417, |
| "loss": 3.1096, |
| "step": 46350 |
| }, |
| { |
| "epoch": 12.486649440137812, |
| "grad_norm": 0.3582363724708557, |
| "learning_rate": 0.00041298370809209636, |
| "loss": 3.0983, |
| "step": 46400 |
| }, |
| { |
| "epoch": 12.50010766580534, |
| "grad_norm": 0.36571231484413147, |
| "learning_rate": 0.00041278174229163856, |
| "loss": 3.0931, |
| "step": 46450 |
| }, |
| { |
| "epoch": 12.513565891472869, |
| "grad_norm": 0.37472379207611084, |
| "learning_rate": 0.00041257977649118076, |
| "loss": 3.1149, |
| "step": 46500 |
| }, |
| { |
| "epoch": 12.527024117140396, |
| "grad_norm": 0.3692905306816101, |
| "learning_rate": 0.00041237781069072295, |
| "loss": 3.1073, |
| "step": 46550 |
| }, |
| { |
| "epoch": 12.540482342807923, |
| "grad_norm": 0.4097956418991089, |
| "learning_rate": 0.0004121758448902652, |
| "loss": 3.1159, |
| "step": 46600 |
| }, |
| { |
| "epoch": 12.553940568475452, |
| "grad_norm": 0.38633161783218384, |
| "learning_rate": 0.0004119738790898074, |
| "loss": 3.1091, |
| "step": 46650 |
| }, |
| { |
| "epoch": 12.56739879414298, |
| "grad_norm": 0.3908534049987793, |
| "learning_rate": 0.00041177191328934965, |
| "loss": 3.1229, |
| "step": 46700 |
| }, |
| { |
| "epoch": 12.580857019810509, |
| "grad_norm": 0.4074409306049347, |
| "learning_rate": 0.00041156994748889185, |
| "loss": 3.1045, |
| "step": 46750 |
| }, |
| { |
| "epoch": 12.594315245478036, |
| "grad_norm": 0.3800044655799866, |
| "learning_rate": 0.0004113679816884341, |
| "loss": 3.1112, |
| "step": 46800 |
| }, |
| { |
| "epoch": 12.607773471145563, |
| "grad_norm": 0.34563085436820984, |
| "learning_rate": 0.0004111660158879763, |
| "loss": 3.1177, |
| "step": 46850 |
| }, |
| { |
| "epoch": 12.621231696813092, |
| "grad_norm": 0.36784085631370544, |
| "learning_rate": 0.0004109640500875185, |
| "loss": 3.1148, |
| "step": 46900 |
| }, |
| { |
| "epoch": 12.63468992248062, |
| "grad_norm": 0.39189931750297546, |
| "learning_rate": 0.0004107620842870607, |
| "loss": 3.1105, |
| "step": 46950 |
| }, |
| { |
| "epoch": 12.648148148148149, |
| "grad_norm": 0.36800920963287354, |
| "learning_rate": 0.0004105601184866029, |
| "loss": 3.1198, |
| "step": 47000 |
| }, |
| { |
| "epoch": 12.648148148148149, |
| "eval_accuracy": 0.3922400287175369, |
| "eval_loss": 3.3033642768859863, |
| "eval_runtime": 53.9166, |
| "eval_samples_per_second": 334.071, |
| "eval_steps_per_second": 20.884, |
| "step": 47000 |
| }, |
| { |
| "epoch": 12.661606373815676, |
| "grad_norm": 0.3976839780807495, |
| "learning_rate": 0.00041035815268614513, |
| "loss": 3.1156, |
| "step": 47050 |
| }, |
| { |
| "epoch": 12.675064599483203, |
| "grad_norm": 0.39268913865089417, |
| "learning_rate": 0.0004101561868856873, |
| "loss": 3.1203, |
| "step": 47100 |
| }, |
| { |
| "epoch": 12.688522825150732, |
| "grad_norm": 0.36905089020729065, |
| "learning_rate": 0.0004099542210852295, |
| "loss": 3.1219, |
| "step": 47150 |
| }, |
| { |
| "epoch": 12.70198105081826, |
| "grad_norm": 0.3765939772129059, |
| "learning_rate": 0.0004097522552847717, |
| "loss": 3.1171, |
| "step": 47200 |
| }, |
| { |
| "epoch": 12.715439276485789, |
| "grad_norm": 0.37911146879196167, |
| "learning_rate": 0.00040955028948431397, |
| "loss": 3.1116, |
| "step": 47250 |
| }, |
| { |
| "epoch": 12.728897502153316, |
| "grad_norm": 0.39323848485946655, |
| "learning_rate": 0.00040934832368385616, |
| "loss": 3.1152, |
| "step": 47300 |
| }, |
| { |
| "epoch": 12.742355727820843, |
| "grad_norm": 0.3719523251056671, |
| "learning_rate": 0.00040914635788339836, |
| "loss": 3.107, |
| "step": 47350 |
| }, |
| { |
| "epoch": 12.755813953488373, |
| "grad_norm": 0.358672559261322, |
| "learning_rate": 0.00040894439208294056, |
| "loss": 3.1216, |
| "step": 47400 |
| }, |
| { |
| "epoch": 12.7692721791559, |
| "grad_norm": 0.3891298174858093, |
| "learning_rate": 0.00040874242628248275, |
| "loss": 3.1341, |
| "step": 47450 |
| }, |
| { |
| "epoch": 12.782730404823429, |
| "grad_norm": 0.37830036878585815, |
| "learning_rate": 0.000408540460482025, |
| "loss": 3.1233, |
| "step": 47500 |
| }, |
| { |
| "epoch": 12.796188630490956, |
| "grad_norm": 0.3913838565349579, |
| "learning_rate": 0.00040833849468156725, |
| "loss": 3.1199, |
| "step": 47550 |
| }, |
| { |
| "epoch": 12.809646856158484, |
| "grad_norm": 0.3707485795021057, |
| "learning_rate": 0.00040813652888110945, |
| "loss": 3.118, |
| "step": 47600 |
| }, |
| { |
| "epoch": 12.823105081826013, |
| "grad_norm": 0.42476046085357666, |
| "learning_rate": 0.00040793456308065165, |
| "loss": 3.1103, |
| "step": 47650 |
| }, |
| { |
| "epoch": 12.83656330749354, |
| "grad_norm": 0.41528019309043884, |
| "learning_rate": 0.0004077325972801939, |
| "loss": 3.1213, |
| "step": 47700 |
| }, |
| { |
| "epoch": 12.850021533161069, |
| "grad_norm": 0.3821558654308319, |
| "learning_rate": 0.0004075306314797361, |
| "loss": 3.1253, |
| "step": 47750 |
| }, |
| { |
| "epoch": 12.863479758828596, |
| "grad_norm": 0.3485643267631531, |
| "learning_rate": 0.0004073286656792783, |
| "loss": 3.1242, |
| "step": 47800 |
| }, |
| { |
| "epoch": 12.876937984496124, |
| "grad_norm": 0.3739717900753021, |
| "learning_rate": 0.0004071266998788205, |
| "loss": 3.1311, |
| "step": 47850 |
| }, |
| { |
| "epoch": 12.890396210163653, |
| "grad_norm": 0.36342811584472656, |
| "learning_rate": 0.0004069247340783627, |
| "loss": 3.1201, |
| "step": 47900 |
| }, |
| { |
| "epoch": 12.90385443583118, |
| "grad_norm": 0.3751324713230133, |
| "learning_rate": 0.00040672276827790493, |
| "loss": 3.127, |
| "step": 47950 |
| }, |
| { |
| "epoch": 12.917312661498707, |
| "grad_norm": 0.3644934594631195, |
| "learning_rate": 0.0004065208024774471, |
| "loss": 3.1283, |
| "step": 48000 |
| }, |
| { |
| "epoch": 12.917312661498707, |
| "eval_accuracy": 0.39249176348187964, |
| "eval_loss": 3.295912504196167, |
| "eval_runtime": 54.0321, |
| "eval_samples_per_second": 333.358, |
| "eval_steps_per_second": 20.839, |
| "step": 48000 |
| }, |
| { |
| "epoch": 12.930770887166236, |
| "grad_norm": 0.3778104782104492, |
| "learning_rate": 0.0004063188366769893, |
| "loss": 3.1223, |
| "step": 48050 |
| }, |
| { |
| "epoch": 12.944229112833764, |
| "grad_norm": 0.3311636447906494, |
| "learning_rate": 0.0004061168708765315, |
| "loss": 3.1324, |
| "step": 48100 |
| }, |
| { |
| "epoch": 12.957687338501293, |
| "grad_norm": 0.36411207914352417, |
| "learning_rate": 0.0004059149050760737, |
| "loss": 3.1244, |
| "step": 48150 |
| }, |
| { |
| "epoch": 12.97114556416882, |
| "grad_norm": 0.3728819191455841, |
| "learning_rate": 0.00040571293927561597, |
| "loss": 3.13, |
| "step": 48200 |
| }, |
| { |
| "epoch": 12.984603789836347, |
| "grad_norm": 0.3733992278575897, |
| "learning_rate": 0.00040551097347515816, |
| "loss": 3.1203, |
| "step": 48250 |
| }, |
| { |
| "epoch": 12.998062015503876, |
| "grad_norm": 0.34325742721557617, |
| "learning_rate": 0.00040530900767470036, |
| "loss": 3.1313, |
| "step": 48300 |
| }, |
| { |
| "epoch": 13.011304909560723, |
| "grad_norm": 0.36356621980667114, |
| "learning_rate": 0.00040510704187424255, |
| "loss": 3.043, |
| "step": 48350 |
| }, |
| { |
| "epoch": 13.024763135228252, |
| "grad_norm": 0.37825655937194824, |
| "learning_rate": 0.00040490507607378475, |
| "loss": 3.0414, |
| "step": 48400 |
| }, |
| { |
| "epoch": 13.038221360895779, |
| "grad_norm": 0.4046306908130646, |
| "learning_rate": 0.00040470311027332705, |
| "loss": 3.0372, |
| "step": 48450 |
| }, |
| { |
| "epoch": 13.051679586563308, |
| "grad_norm": 0.36423417925834656, |
| "learning_rate": 0.00040450114447286925, |
| "loss": 3.0349, |
| "step": 48500 |
| }, |
| { |
| "epoch": 13.065137812230835, |
| "grad_norm": 0.3580648899078369, |
| "learning_rate": 0.00040429917867241145, |
| "loss": 3.036, |
| "step": 48550 |
| }, |
| { |
| "epoch": 13.078596037898363, |
| "grad_norm": 0.3821873664855957, |
| "learning_rate": 0.00040409721287195364, |
| "loss": 3.0446, |
| "step": 48600 |
| }, |
| { |
| "epoch": 13.092054263565892, |
| "grad_norm": 0.38778361678123474, |
| "learning_rate": 0.0004038952470714959, |
| "loss": 3.0454, |
| "step": 48650 |
| }, |
| { |
| "epoch": 13.10551248923342, |
| "grad_norm": 0.3847378194332123, |
| "learning_rate": 0.0004036932812710381, |
| "loss": 3.0419, |
| "step": 48700 |
| }, |
| { |
| "epoch": 13.118970714900948, |
| "grad_norm": 0.36374303698539734, |
| "learning_rate": 0.0004034913154705803, |
| "loss": 3.0542, |
| "step": 48750 |
| }, |
| { |
| "epoch": 13.132428940568476, |
| "grad_norm": 0.394603967666626, |
| "learning_rate": 0.0004032893496701225, |
| "loss": 3.047, |
| "step": 48800 |
| }, |
| { |
| "epoch": 13.145887166236003, |
| "grad_norm": 0.4271048903465271, |
| "learning_rate": 0.0004030873838696647, |
| "loss": 3.0521, |
| "step": 48850 |
| }, |
| { |
| "epoch": 13.159345391903532, |
| "grad_norm": 0.38402384519577026, |
| "learning_rate": 0.00040288541806920693, |
| "loss": 3.057, |
| "step": 48900 |
| }, |
| { |
| "epoch": 13.17280361757106, |
| "grad_norm": 0.38647517561912537, |
| "learning_rate": 0.0004026834522687491, |
| "loss": 3.0643, |
| "step": 48950 |
| }, |
| { |
| "epoch": 13.186261843238588, |
| "grad_norm": 0.372152715921402, |
| "learning_rate": 0.0004024814864682913, |
| "loss": 3.0591, |
| "step": 49000 |
| }, |
| { |
| "epoch": 13.186261843238588, |
| "eval_accuracy": 0.39232064468436567, |
| "eval_loss": 3.306389570236206, |
| "eval_runtime": 53.7783, |
| "eval_samples_per_second": 334.93, |
| "eval_steps_per_second": 20.938, |
| "step": 49000 |
| }, |
| { |
| "epoch": 13.199720068906116, |
| "grad_norm": 0.3545859456062317, |
| "learning_rate": 0.0004022795206678335, |
| "loss": 3.0649, |
| "step": 49050 |
| }, |
| { |
| "epoch": 13.213178294573643, |
| "grad_norm": 0.3903788924217224, |
| "learning_rate": 0.00040207755486737577, |
| "loss": 3.063, |
| "step": 49100 |
| }, |
| { |
| "epoch": 13.226636520241172, |
| "grad_norm": 0.3961975574493408, |
| "learning_rate": 0.00040187558906691796, |
| "loss": 3.0666, |
| "step": 49150 |
| }, |
| { |
| "epoch": 13.2400947459087, |
| "grad_norm": 0.3676101863384247, |
| "learning_rate": 0.00040167362326646016, |
| "loss": 3.0687, |
| "step": 49200 |
| }, |
| { |
| "epoch": 13.253552971576227, |
| "grad_norm": 0.3698784112930298, |
| "learning_rate": 0.00040147165746600235, |
| "loss": 3.0722, |
| "step": 49250 |
| }, |
| { |
| "epoch": 13.267011197243756, |
| "grad_norm": 0.3777436316013336, |
| "learning_rate": 0.00040126969166554455, |
| "loss": 3.0763, |
| "step": 49300 |
| }, |
| { |
| "epoch": 13.280469422911283, |
| "grad_norm": 0.3777233958244324, |
| "learning_rate": 0.00040106772586508686, |
| "loss": 3.0669, |
| "step": 49350 |
| }, |
| { |
| "epoch": 13.293927648578812, |
| "grad_norm": 0.3873639404773712, |
| "learning_rate": 0.00040086576006462905, |
| "loss": 3.0709, |
| "step": 49400 |
| }, |
| { |
| "epoch": 13.30738587424634, |
| "grad_norm": 0.36705899238586426, |
| "learning_rate": 0.00040066379426417125, |
| "loss": 3.0626, |
| "step": 49450 |
| }, |
| { |
| "epoch": 13.320844099913867, |
| "grad_norm": 0.35446983575820923, |
| "learning_rate": 0.00040046182846371344, |
| "loss": 3.076, |
| "step": 49500 |
| }, |
| { |
| "epoch": 13.334302325581396, |
| "grad_norm": 0.393531858921051, |
| "learning_rate": 0.0004002598626632557, |
| "loss": 3.0867, |
| "step": 49550 |
| }, |
| { |
| "epoch": 13.347760551248923, |
| "grad_norm": 0.4465448260307312, |
| "learning_rate": 0.0004000578968627979, |
| "loss": 3.0735, |
| "step": 49600 |
| }, |
| { |
| "epoch": 13.361218776916452, |
| "grad_norm": 0.3967099189758301, |
| "learning_rate": 0.0003998559310623401, |
| "loss": 3.0843, |
| "step": 49650 |
| }, |
| { |
| "epoch": 13.37467700258398, |
| "grad_norm": 0.3714440166950226, |
| "learning_rate": 0.0003996539652618823, |
| "loss": 3.0741, |
| "step": 49700 |
| }, |
| { |
| "epoch": 13.388135228251507, |
| "grad_norm": 0.4029998481273651, |
| "learning_rate": 0.0003994519994614245, |
| "loss": 3.0777, |
| "step": 49750 |
| }, |
| { |
| "epoch": 13.401593453919036, |
| "grad_norm": 0.39566588401794434, |
| "learning_rate": 0.00039925003366096673, |
| "loss": 3.0849, |
| "step": 49800 |
| }, |
| { |
| "epoch": 13.415051679586563, |
| "grad_norm": 0.3758895993232727, |
| "learning_rate": 0.0003990480678605089, |
| "loss": 3.0984, |
| "step": 49850 |
| }, |
| { |
| "epoch": 13.428509905254092, |
| "grad_norm": 0.3822093605995178, |
| "learning_rate": 0.0003988461020600511, |
| "loss": 3.0859, |
| "step": 49900 |
| }, |
| { |
| "epoch": 13.44196813092162, |
| "grad_norm": 0.35837680101394653, |
| "learning_rate": 0.0003986441362595933, |
| "loss": 3.0958, |
| "step": 49950 |
| }, |
| { |
| "epoch": 13.455426356589147, |
| "grad_norm": 0.42340973019599915, |
| "learning_rate": 0.0003984421704591355, |
| "loss": 3.0828, |
| "step": 50000 |
| }, |
| { |
| "epoch": 13.455426356589147, |
| "eval_accuracy": 0.39285638232915393, |
| "eval_loss": 3.3012328147888184, |
| "eval_runtime": 53.8418, |
| "eval_samples_per_second": 334.536, |
| "eval_steps_per_second": 20.913, |
| "step": 50000 |
| }, |
| { |
| "epoch": 13.468884582256676, |
| "grad_norm": 0.4042765200138092, |
| "learning_rate": 0.00039824020465867776, |
| "loss": 3.0829, |
| "step": 50050 |
| }, |
| { |
| "epoch": 13.482342807924203, |
| "grad_norm": 0.42345064878463745, |
| "learning_rate": 0.00039803823885821996, |
| "loss": 3.0972, |
| "step": 50100 |
| }, |
| { |
| "epoch": 13.49580103359173, |
| "grad_norm": 0.4129413366317749, |
| "learning_rate": 0.00039783627305776216, |
| "loss": 3.0934, |
| "step": 50150 |
| }, |
| { |
| "epoch": 13.50925925925926, |
| "grad_norm": 0.4112797677516937, |
| "learning_rate": 0.00039763430725730435, |
| "loss": 3.0947, |
| "step": 50200 |
| }, |
| { |
| "epoch": 13.522717484926787, |
| "grad_norm": 0.36507686972618103, |
| "learning_rate": 0.00039743234145684666, |
| "loss": 3.0975, |
| "step": 50250 |
| }, |
| { |
| "epoch": 13.536175710594316, |
| "grad_norm": 0.3800624907016754, |
| "learning_rate": 0.00039723037565638885, |
| "loss": 3.1025, |
| "step": 50300 |
| }, |
| { |
| "epoch": 13.549633936261843, |
| "grad_norm": 0.40299904346466064, |
| "learning_rate": 0.00039702840985593105, |
| "loss": 3.0832, |
| "step": 50350 |
| }, |
| { |
| "epoch": 13.56309216192937, |
| "grad_norm": 0.38481107354164124, |
| "learning_rate": 0.00039682644405547324, |
| "loss": 3.0907, |
| "step": 50400 |
| }, |
| { |
| "epoch": 13.5765503875969, |
| "grad_norm": 0.3586687445640564, |
| "learning_rate": 0.00039662447825501544, |
| "loss": 3.1058, |
| "step": 50450 |
| }, |
| { |
| "epoch": 13.590008613264427, |
| "grad_norm": 0.37668395042419434, |
| "learning_rate": 0.0003964225124545577, |
| "loss": 3.0936, |
| "step": 50500 |
| }, |
| { |
| "epoch": 13.603466838931956, |
| "grad_norm": 0.4070712625980377, |
| "learning_rate": 0.0003962205466540999, |
| "loss": 3.1099, |
| "step": 50550 |
| }, |
| { |
| "epoch": 13.616925064599483, |
| "grad_norm": 0.3600846230983734, |
| "learning_rate": 0.0003960185808536421, |
| "loss": 3.0942, |
| "step": 50600 |
| }, |
| { |
| "epoch": 13.63038329026701, |
| "grad_norm": 0.3763042390346527, |
| "learning_rate": 0.0003958166150531843, |
| "loss": 3.1044, |
| "step": 50650 |
| }, |
| { |
| "epoch": 13.64384151593454, |
| "grad_norm": 0.3834032714366913, |
| "learning_rate": 0.0003956146492527265, |
| "loss": 3.1118, |
| "step": 50700 |
| }, |
| { |
| "epoch": 13.657299741602067, |
| "grad_norm": 0.3641510009765625, |
| "learning_rate": 0.0003954126834522687, |
| "loss": 3.0999, |
| "step": 50750 |
| }, |
| { |
| "epoch": 13.670757967269594, |
| "grad_norm": 0.3956216275691986, |
| "learning_rate": 0.0003952107176518109, |
| "loss": 3.0896, |
| "step": 50800 |
| }, |
| { |
| "epoch": 13.684216192937123, |
| "grad_norm": 0.39530178904533386, |
| "learning_rate": 0.0003950087518513531, |
| "loss": 3.1057, |
| "step": 50850 |
| }, |
| { |
| "epoch": 13.69767441860465, |
| "grad_norm": 0.38161706924438477, |
| "learning_rate": 0.0003948067860508953, |
| "loss": 3.1006, |
| "step": 50900 |
| }, |
| { |
| "epoch": 13.71113264427218, |
| "grad_norm": 0.3645673990249634, |
| "learning_rate": 0.0003946048202504375, |
| "loss": 3.1121, |
| "step": 50950 |
| }, |
| { |
| "epoch": 13.724590869939707, |
| "grad_norm": 0.37178608775138855, |
| "learning_rate": 0.00039440285444997976, |
| "loss": 3.1093, |
| "step": 51000 |
| }, |
| { |
| "epoch": 13.724590869939707, |
| "eval_accuracy": 0.3933404040706935, |
| "eval_loss": 3.2926268577575684, |
| "eval_runtime": 53.8199, |
| "eval_samples_per_second": 334.672, |
| "eval_steps_per_second": 20.922, |
| "step": 51000 |
| }, |
| { |
| "epoch": 13.738049095607234, |
| "grad_norm": 0.3872733414173126, |
| "learning_rate": 0.00039420088864952196, |
| "loss": 3.0982, |
| "step": 51050 |
| }, |
| { |
| "epoch": 13.751507321274763, |
| "grad_norm": 0.3768724203109741, |
| "learning_rate": 0.00039399892284906415, |
| "loss": 3.1018, |
| "step": 51100 |
| }, |
| { |
| "epoch": 13.76496554694229, |
| "grad_norm": 0.3499247431755066, |
| "learning_rate": 0.00039379695704860646, |
| "loss": 3.1137, |
| "step": 51150 |
| }, |
| { |
| "epoch": 13.77842377260982, |
| "grad_norm": 0.3912452757358551, |
| "learning_rate": 0.00039359499124814865, |
| "loss": 3.1003, |
| "step": 51200 |
| }, |
| { |
| "epoch": 13.791881998277347, |
| "grad_norm": 0.40462765097618103, |
| "learning_rate": 0.00039339302544769085, |
| "loss": 3.109, |
| "step": 51250 |
| }, |
| { |
| "epoch": 13.805340223944874, |
| "grad_norm": 0.38249385356903076, |
| "learning_rate": 0.00039319105964723305, |
| "loss": 3.0964, |
| "step": 51300 |
| }, |
| { |
| "epoch": 13.818798449612403, |
| "grad_norm": 0.38811343908309937, |
| "learning_rate": 0.00039298909384677524, |
| "loss": 3.1095, |
| "step": 51350 |
| }, |
| { |
| "epoch": 13.83225667527993, |
| "grad_norm": 0.3731904625892639, |
| "learning_rate": 0.0003927871280463175, |
| "loss": 3.1067, |
| "step": 51400 |
| }, |
| { |
| "epoch": 13.84571490094746, |
| "grad_norm": 0.35964900255203247, |
| "learning_rate": 0.0003925851622458597, |
| "loss": 3.1094, |
| "step": 51450 |
| }, |
| { |
| "epoch": 13.859173126614987, |
| "grad_norm": 0.3533209562301636, |
| "learning_rate": 0.0003923831964454019, |
| "loss": 3.1167, |
| "step": 51500 |
| }, |
| { |
| "epoch": 13.872631352282514, |
| "grad_norm": 0.368437796831131, |
| "learning_rate": 0.0003921812306449441, |
| "loss": 3.1096, |
| "step": 51550 |
| }, |
| { |
| "epoch": 13.886089577950044, |
| "grad_norm": 0.39540213346481323, |
| "learning_rate": 0.0003919792648444863, |
| "loss": 3.1123, |
| "step": 51600 |
| }, |
| { |
| "epoch": 13.89954780361757, |
| "grad_norm": 0.38563069701194763, |
| "learning_rate": 0.0003917772990440285, |
| "loss": 3.1218, |
| "step": 51650 |
| }, |
| { |
| "epoch": 13.9130060292851, |
| "grad_norm": 0.3552553057670593, |
| "learning_rate": 0.0003915753332435707, |
| "loss": 3.1082, |
| "step": 51700 |
| }, |
| { |
| "epoch": 13.926464254952627, |
| "grad_norm": 0.3842296600341797, |
| "learning_rate": 0.0003913733674431129, |
| "loss": 3.097, |
| "step": 51750 |
| }, |
| { |
| "epoch": 13.939922480620154, |
| "grad_norm": 0.3653877377510071, |
| "learning_rate": 0.0003911714016426551, |
| "loss": 3.1161, |
| "step": 51800 |
| }, |
| { |
| "epoch": 13.953380706287684, |
| "grad_norm": 0.3652689754962921, |
| "learning_rate": 0.0003909694358421973, |
| "loss": 3.096, |
| "step": 51850 |
| }, |
| { |
| "epoch": 13.96683893195521, |
| "grad_norm": 0.3673049211502075, |
| "learning_rate": 0.00039076747004173956, |
| "loss": 3.1018, |
| "step": 51900 |
| }, |
| { |
| "epoch": 13.98029715762274, |
| "grad_norm": 0.3873627185821533, |
| "learning_rate": 0.00039056550424128176, |
| "loss": 3.1102, |
| "step": 51950 |
| }, |
| { |
| "epoch": 13.993755383290267, |
| "grad_norm": 0.3623334467411041, |
| "learning_rate": 0.00039036353844082395, |
| "loss": 3.1115, |
| "step": 52000 |
| }, |
| { |
| "epoch": 13.993755383290267, |
| "eval_accuracy": 0.39365352430843015, |
| "eval_loss": 3.287444591522217, |
| "eval_runtime": 54.0713, |
| "eval_samples_per_second": 333.115, |
| "eval_steps_per_second": 20.824, |
| "step": 52000 |
| }, |
| { |
| "epoch": 14.006998277347115, |
| "grad_norm": 0.39216455817222595, |
| "learning_rate": 0.0003901615726403662, |
| "loss": 3.0531, |
| "step": 52050 |
| }, |
| { |
| "epoch": 14.020456503014643, |
| "grad_norm": 0.39898359775543213, |
| "learning_rate": 0.00038995960683990845, |
| "loss": 3.0271, |
| "step": 52100 |
| }, |
| { |
| "epoch": 14.03391472868217, |
| "grad_norm": 0.3908878564834595, |
| "learning_rate": 0.00038975764103945065, |
| "loss": 3.0249, |
| "step": 52150 |
| }, |
| { |
| "epoch": 14.047372954349699, |
| "grad_norm": 0.3882172703742981, |
| "learning_rate": 0.00038955567523899285, |
| "loss": 3.0202, |
| "step": 52200 |
| }, |
| { |
| "epoch": 14.060831180017226, |
| "grad_norm": 0.3726351261138916, |
| "learning_rate": 0.00038935370943853504, |
| "loss": 3.0342, |
| "step": 52250 |
| }, |
| { |
| "epoch": 14.074289405684755, |
| "grad_norm": 0.3756456971168518, |
| "learning_rate": 0.00038915174363807724, |
| "loss": 3.0319, |
| "step": 52300 |
| }, |
| { |
| "epoch": 14.087747631352283, |
| "grad_norm": 0.39908862113952637, |
| "learning_rate": 0.0003889497778376195, |
| "loss": 3.0268, |
| "step": 52350 |
| }, |
| { |
| "epoch": 14.10120585701981, |
| "grad_norm": 0.3887988030910492, |
| "learning_rate": 0.0003887478120371617, |
| "loss": 3.0341, |
| "step": 52400 |
| }, |
| { |
| "epoch": 14.114664082687339, |
| "grad_norm": 0.38865944743156433, |
| "learning_rate": 0.0003885458462367039, |
| "loss": 3.038, |
| "step": 52450 |
| }, |
| { |
| "epoch": 14.128122308354866, |
| "grad_norm": 0.37640053033828735, |
| "learning_rate": 0.0003883438804362461, |
| "loss": 3.0369, |
| "step": 52500 |
| }, |
| { |
| "epoch": 14.141580534022394, |
| "grad_norm": 0.3813260793685913, |
| "learning_rate": 0.0003881419146357883, |
| "loss": 3.045, |
| "step": 52550 |
| }, |
| { |
| "epoch": 14.155038759689923, |
| "grad_norm": 0.3812572658061981, |
| "learning_rate": 0.0003879399488353305, |
| "loss": 3.06, |
| "step": 52600 |
| }, |
| { |
| "epoch": 14.16849698535745, |
| "grad_norm": 0.3978724777698517, |
| "learning_rate": 0.0003877379830348727, |
| "loss": 3.0308, |
| "step": 52650 |
| }, |
| { |
| "epoch": 14.18195521102498, |
| "grad_norm": 0.40393364429473877, |
| "learning_rate": 0.0003875360172344149, |
| "loss": 3.0453, |
| "step": 52700 |
| }, |
| { |
| "epoch": 14.195413436692506, |
| "grad_norm": 0.3880857229232788, |
| "learning_rate": 0.0003873340514339571, |
| "loss": 3.0465, |
| "step": 52750 |
| }, |
| { |
| "epoch": 14.208871662360034, |
| "grad_norm": 0.39600270986557007, |
| "learning_rate": 0.0003871320856334993, |
| "loss": 3.0441, |
| "step": 52800 |
| }, |
| { |
| "epoch": 14.222329888027563, |
| "grad_norm": 0.42481228709220886, |
| "learning_rate": 0.00038693011983304156, |
| "loss": 3.0545, |
| "step": 52850 |
| }, |
| { |
| "epoch": 14.23578811369509, |
| "grad_norm": 0.39608439803123474, |
| "learning_rate": 0.0003867281540325838, |
| "loss": 3.0596, |
| "step": 52900 |
| }, |
| { |
| "epoch": 14.24924633936262, |
| "grad_norm": 0.35995277762413025, |
| "learning_rate": 0.000386526188232126, |
| "loss": 3.0558, |
| "step": 52950 |
| }, |
| { |
| "epoch": 14.262704565030146, |
| "grad_norm": 0.3757532835006714, |
| "learning_rate": 0.00038632422243166825, |
| "loss": 3.0686, |
| "step": 53000 |
| }, |
| { |
| "epoch": 14.262704565030146, |
| "eval_accuracy": 0.3930424943927358, |
| "eval_loss": 3.301429271697998, |
| "eval_runtime": 53.8948, |
| "eval_samples_per_second": 334.207, |
| "eval_steps_per_second": 20.893, |
| "step": 53000 |
| }, |
| { |
| "epoch": 14.276162790697674, |
| "grad_norm": 0.3709987998008728, |
| "learning_rate": 0.00038612225663121045, |
| "loss": 3.0512, |
| "step": 53050 |
| }, |
| { |
| "epoch": 14.289621016365203, |
| "grad_norm": 0.40017008781433105, |
| "learning_rate": 0.00038592029083075265, |
| "loss": 3.0583, |
| "step": 53100 |
| }, |
| { |
| "epoch": 14.30307924203273, |
| "grad_norm": 0.37422966957092285, |
| "learning_rate": 0.00038571832503029484, |
| "loss": 3.0723, |
| "step": 53150 |
| }, |
| { |
| "epoch": 14.31653746770026, |
| "grad_norm": 0.3892733156681061, |
| "learning_rate": 0.00038551635922983704, |
| "loss": 3.0606, |
| "step": 53200 |
| }, |
| { |
| "epoch": 14.329995693367787, |
| "grad_norm": 0.39466214179992676, |
| "learning_rate": 0.0003853143934293793, |
| "loss": 3.0575, |
| "step": 53250 |
| }, |
| { |
| "epoch": 14.343453919035314, |
| "grad_norm": 0.3730536103248596, |
| "learning_rate": 0.0003851124276289215, |
| "loss": 3.0646, |
| "step": 53300 |
| }, |
| { |
| "epoch": 14.356912144702843, |
| "grad_norm": 0.37154144048690796, |
| "learning_rate": 0.0003849104618284637, |
| "loss": 3.0496, |
| "step": 53350 |
| }, |
| { |
| "epoch": 14.37037037037037, |
| "grad_norm": 0.3818061351776123, |
| "learning_rate": 0.0003847084960280059, |
| "loss": 3.0638, |
| "step": 53400 |
| }, |
| { |
| "epoch": 14.383828596037898, |
| "grad_norm": 0.38460275530815125, |
| "learning_rate": 0.0003845065302275481, |
| "loss": 3.0647, |
| "step": 53450 |
| }, |
| { |
| "epoch": 14.397286821705427, |
| "grad_norm": 0.37454620003700256, |
| "learning_rate": 0.0003843045644270903, |
| "loss": 3.0702, |
| "step": 53500 |
| }, |
| { |
| "epoch": 14.410745047372954, |
| "grad_norm": 0.3800989091396332, |
| "learning_rate": 0.0003841025986266325, |
| "loss": 3.0693, |
| "step": 53550 |
| }, |
| { |
| "epoch": 14.424203273040483, |
| "grad_norm": 0.40141618251800537, |
| "learning_rate": 0.0003839006328261747, |
| "loss": 3.0764, |
| "step": 53600 |
| }, |
| { |
| "epoch": 14.43766149870801, |
| "grad_norm": 0.40033701062202454, |
| "learning_rate": 0.0003836986670257169, |
| "loss": 3.0703, |
| "step": 53650 |
| }, |
| { |
| "epoch": 14.451119724375538, |
| "grad_norm": 0.37985333800315857, |
| "learning_rate": 0.0003834967012252591, |
| "loss": 3.0714, |
| "step": 53700 |
| }, |
| { |
| "epoch": 14.464577950043067, |
| "grad_norm": 0.368487685918808, |
| "learning_rate": 0.00038329473542480136, |
| "loss": 3.0718, |
| "step": 53750 |
| }, |
| { |
| "epoch": 14.478036175710594, |
| "grad_norm": 0.3814280331134796, |
| "learning_rate": 0.0003830927696243436, |
| "loss": 3.0695, |
| "step": 53800 |
| }, |
| { |
| "epoch": 14.491494401378123, |
| "grad_norm": 0.3550291061401367, |
| "learning_rate": 0.0003828908038238858, |
| "loss": 3.0714, |
| "step": 53850 |
| }, |
| { |
| "epoch": 14.50495262704565, |
| "grad_norm": 0.38511765003204346, |
| "learning_rate": 0.000382688838023428, |
| "loss": 3.0839, |
| "step": 53900 |
| }, |
| { |
| "epoch": 14.518410852713178, |
| "grad_norm": 0.40868499875068665, |
| "learning_rate": 0.00038248687222297025, |
| "loss": 3.0847, |
| "step": 53950 |
| }, |
| { |
| "epoch": 14.531869078380707, |
| "grad_norm": 0.41092634201049805, |
| "learning_rate": 0.00038228490642251245, |
| "loss": 3.0762, |
| "step": 54000 |
| }, |
| { |
| "epoch": 14.531869078380707, |
| "eval_accuracy": 0.393471975412782, |
| "eval_loss": 3.2951552867889404, |
| "eval_runtime": 53.9287, |
| "eval_samples_per_second": 333.997, |
| "eval_steps_per_second": 20.879, |
| "step": 54000 |
| }, |
| { |
| "epoch": 14.545327304048234, |
| "grad_norm": 0.37341392040252686, |
| "learning_rate": 0.00038208294062205464, |
| "loss": 3.0923, |
| "step": 54050 |
| }, |
| { |
| "epoch": 14.558785529715763, |
| "grad_norm": 0.40408065915107727, |
| "learning_rate": 0.00038188097482159684, |
| "loss": 3.0862, |
| "step": 54100 |
| }, |
| { |
| "epoch": 14.57224375538329, |
| "grad_norm": 0.3749300241470337, |
| "learning_rate": 0.00038167900902113904, |
| "loss": 3.0847, |
| "step": 54150 |
| }, |
| { |
| "epoch": 14.585701981050818, |
| "grad_norm": 0.3992476463317871, |
| "learning_rate": 0.0003814770432206813, |
| "loss": 3.0899, |
| "step": 54200 |
| }, |
| { |
| "epoch": 14.599160206718347, |
| "grad_norm": 0.41293609142303467, |
| "learning_rate": 0.0003812750774202235, |
| "loss": 3.0751, |
| "step": 54250 |
| }, |
| { |
| "epoch": 14.612618432385874, |
| "grad_norm": 0.3896488547325134, |
| "learning_rate": 0.0003810731116197657, |
| "loss": 3.0844, |
| "step": 54300 |
| }, |
| { |
| "epoch": 14.626076658053403, |
| "grad_norm": 0.3871552348136902, |
| "learning_rate": 0.0003808711458193079, |
| "loss": 3.0875, |
| "step": 54350 |
| }, |
| { |
| "epoch": 14.63953488372093, |
| "grad_norm": 0.3741537034511566, |
| "learning_rate": 0.00038066918001885007, |
| "loss": 3.0883, |
| "step": 54400 |
| }, |
| { |
| "epoch": 14.652993109388458, |
| "grad_norm": 0.40278348326683044, |
| "learning_rate": 0.0003804672142183923, |
| "loss": 3.0921, |
| "step": 54450 |
| }, |
| { |
| "epoch": 14.666451335055987, |
| "grad_norm": 0.40198415517807007, |
| "learning_rate": 0.0003802652484179345, |
| "loss": 3.094, |
| "step": 54500 |
| }, |
| { |
| "epoch": 14.679909560723514, |
| "grad_norm": 0.36237287521362305, |
| "learning_rate": 0.0003800632826174767, |
| "loss": 3.0858, |
| "step": 54550 |
| }, |
| { |
| "epoch": 14.693367786391041, |
| "grad_norm": 0.38690119981765747, |
| "learning_rate": 0.0003798613168170189, |
| "loss": 3.0753, |
| "step": 54600 |
| }, |
| { |
| "epoch": 14.70682601205857, |
| "grad_norm": 0.39163732528686523, |
| "learning_rate": 0.0003796593510165611, |
| "loss": 3.0878, |
| "step": 54650 |
| }, |
| { |
| "epoch": 14.720284237726098, |
| "grad_norm": 0.38994866609573364, |
| "learning_rate": 0.0003794573852161034, |
| "loss": 3.0708, |
| "step": 54700 |
| }, |
| { |
| "epoch": 14.733742463393627, |
| "grad_norm": 0.3824878931045532, |
| "learning_rate": 0.0003792554194156456, |
| "loss": 3.0862, |
| "step": 54750 |
| }, |
| { |
| "epoch": 14.747200689061154, |
| "grad_norm": 0.3619995415210724, |
| "learning_rate": 0.0003790534536151878, |
| "loss": 3.0844, |
| "step": 54800 |
| }, |
| { |
| "epoch": 14.760658914728682, |
| "grad_norm": 0.40019071102142334, |
| "learning_rate": 0.00037885148781473005, |
| "loss": 3.0864, |
| "step": 54850 |
| }, |
| { |
| "epoch": 14.77411714039621, |
| "grad_norm": 0.36875399947166443, |
| "learning_rate": 0.00037864952201427225, |
| "loss": 3.0859, |
| "step": 54900 |
| }, |
| { |
| "epoch": 14.787575366063738, |
| "grad_norm": 0.3560780882835388, |
| "learning_rate": 0.00037844755621381444, |
| "loss": 3.0997, |
| "step": 54950 |
| }, |
| { |
| "epoch": 14.801033591731267, |
| "grad_norm": 0.42874544858932495, |
| "learning_rate": 0.00037824559041335664, |
| "loss": 3.0797, |
| "step": 55000 |
| }, |
| { |
| "epoch": 14.801033591731267, |
| "eval_accuracy": 0.3938084547244651, |
| "eval_loss": 3.2878448963165283, |
| "eval_runtime": 53.7813, |
| "eval_samples_per_second": 334.912, |
| "eval_steps_per_second": 20.937, |
| "step": 55000 |
| }, |
| { |
| "epoch": 14.814491817398794, |
| "grad_norm": 0.3775944709777832, |
| "learning_rate": 0.00037804362461289884, |
| "loss": 3.0984, |
| "step": 55050 |
| }, |
| { |
| "epoch": 14.827950043066322, |
| "grad_norm": 0.3809267282485962, |
| "learning_rate": 0.0003778416588124411, |
| "loss": 3.0919, |
| "step": 55100 |
| }, |
| { |
| "epoch": 14.84140826873385, |
| "grad_norm": 0.3942723274230957, |
| "learning_rate": 0.0003776396930119833, |
| "loss": 3.1, |
| "step": 55150 |
| }, |
| { |
| "epoch": 14.854866494401378, |
| "grad_norm": 0.3672431707382202, |
| "learning_rate": 0.0003774377272115255, |
| "loss": 3.0864, |
| "step": 55200 |
| }, |
| { |
| "epoch": 14.868324720068905, |
| "grad_norm": 0.38476642966270447, |
| "learning_rate": 0.0003772357614110677, |
| "loss": 3.0911, |
| "step": 55250 |
| }, |
| { |
| "epoch": 14.881782945736434, |
| "grad_norm": 0.3824423849582672, |
| "learning_rate": 0.00037703379561060987, |
| "loss": 3.0828, |
| "step": 55300 |
| }, |
| { |
| "epoch": 14.895241171403962, |
| "grad_norm": 0.352585107088089, |
| "learning_rate": 0.0003768318298101521, |
| "loss": 3.0994, |
| "step": 55350 |
| }, |
| { |
| "epoch": 14.90869939707149, |
| "grad_norm": 0.4190617501735687, |
| "learning_rate": 0.0003766298640096943, |
| "loss": 3.0963, |
| "step": 55400 |
| }, |
| { |
| "epoch": 14.922157622739018, |
| "grad_norm": 0.3812563419342041, |
| "learning_rate": 0.0003764278982092365, |
| "loss": 3.0994, |
| "step": 55450 |
| }, |
| { |
| "epoch": 14.935615848406545, |
| "grad_norm": 0.37324008345603943, |
| "learning_rate": 0.0003762259324087787, |
| "loss": 3.0875, |
| "step": 55500 |
| }, |
| { |
| "epoch": 14.949074074074074, |
| "grad_norm": 0.3767263889312744, |
| "learning_rate": 0.0003760239666083209, |
| "loss": 3.0994, |
| "step": 55550 |
| }, |
| { |
| "epoch": 14.962532299741602, |
| "grad_norm": 0.39034217596054077, |
| "learning_rate": 0.0003758220008078632, |
| "loss": 3.103, |
| "step": 55600 |
| }, |
| { |
| "epoch": 14.97599052540913, |
| "grad_norm": 0.4098099172115326, |
| "learning_rate": 0.0003756200350074054, |
| "loss": 3.0952, |
| "step": 55650 |
| }, |
| { |
| "epoch": 14.989448751076658, |
| "grad_norm": 0.36337199807167053, |
| "learning_rate": 0.0003754180692069476, |
| "loss": 3.0987, |
| "step": 55700 |
| }, |
| { |
| "epoch": 15.002691645133506, |
| "grad_norm": 0.41866105794906616, |
| "learning_rate": 0.0003752161034064898, |
| "loss": 3.0784, |
| "step": 55750 |
| }, |
| { |
| "epoch": 15.016149870801033, |
| "grad_norm": 0.3725742995738983, |
| "learning_rate": 0.00037501413760603205, |
| "loss": 3.0073, |
| "step": 55800 |
| }, |
| { |
| "epoch": 15.02960809646856, |
| "grad_norm": 0.37522321939468384, |
| "learning_rate": 0.00037481217180557425, |
| "loss": 3.0118, |
| "step": 55850 |
| }, |
| { |
| "epoch": 15.04306632213609, |
| "grad_norm": 0.39416176080703735, |
| "learning_rate": 0.00037461020600511644, |
| "loss": 3.0055, |
| "step": 55900 |
| }, |
| { |
| "epoch": 15.056524547803617, |
| "grad_norm": 0.3889634311199188, |
| "learning_rate": 0.00037440824020465864, |
| "loss": 3.015, |
| "step": 55950 |
| }, |
| { |
| "epoch": 15.069982773471146, |
| "grad_norm": 0.4061291217803955, |
| "learning_rate": 0.00037420627440420083, |
| "loss": 3.0089, |
| "step": 56000 |
| }, |
| { |
| "epoch": 15.069982773471146, |
| "eval_accuracy": 0.39340830835542123, |
| "eval_loss": 3.298887252807617, |
| "eval_runtime": 53.7996, |
| "eval_samples_per_second": 334.798, |
| "eval_steps_per_second": 20.93, |
| "step": 56000 |
| }, |
| { |
| "epoch": 15.083440999138674, |
| "grad_norm": 0.3685813546180725, |
| "learning_rate": 0.0003740043086037431, |
| "loss": 3.0137, |
| "step": 56050 |
| }, |
| { |
| "epoch": 15.0968992248062, |
| "grad_norm": 0.3749956786632538, |
| "learning_rate": 0.0003738023428032853, |
| "loss": 3.0182, |
| "step": 56100 |
| }, |
| { |
| "epoch": 15.11035745047373, |
| "grad_norm": 0.3896404802799225, |
| "learning_rate": 0.0003736003770028275, |
| "loss": 3.0167, |
| "step": 56150 |
| }, |
| { |
| "epoch": 15.123815676141257, |
| "grad_norm": 0.40831461548805237, |
| "learning_rate": 0.00037339841120236967, |
| "loss": 3.0115, |
| "step": 56200 |
| }, |
| { |
| "epoch": 15.137273901808786, |
| "grad_norm": 0.36938607692718506, |
| "learning_rate": 0.00037319644540191187, |
| "loss": 3.0303, |
| "step": 56250 |
| }, |
| { |
| "epoch": 15.150732127476314, |
| "grad_norm": 0.4110319912433624, |
| "learning_rate": 0.0003729944796014541, |
| "loss": 3.0322, |
| "step": 56300 |
| }, |
| { |
| "epoch": 15.164190353143841, |
| "grad_norm": 0.383331835269928, |
| "learning_rate": 0.0003727925138009963, |
| "loss": 3.0447, |
| "step": 56350 |
| }, |
| { |
| "epoch": 15.17764857881137, |
| "grad_norm": 0.371985524892807, |
| "learning_rate": 0.0003725905480005385, |
| "loss": 3.0273, |
| "step": 56400 |
| }, |
| { |
| "epoch": 15.191106804478897, |
| "grad_norm": 0.3929747939109802, |
| "learning_rate": 0.0003723885822000807, |
| "loss": 3.0346, |
| "step": 56450 |
| }, |
| { |
| "epoch": 15.204565030146426, |
| "grad_norm": 0.3700959086418152, |
| "learning_rate": 0.000372186616399623, |
| "loss": 3.0414, |
| "step": 56500 |
| }, |
| { |
| "epoch": 15.218023255813954, |
| "grad_norm": 0.3915750980377197, |
| "learning_rate": 0.0003719846505991652, |
| "loss": 3.0328, |
| "step": 56550 |
| }, |
| { |
| "epoch": 15.231481481481481, |
| "grad_norm": 0.39838945865631104, |
| "learning_rate": 0.0003717826847987074, |
| "loss": 3.0331, |
| "step": 56600 |
| }, |
| { |
| "epoch": 15.24493970714901, |
| "grad_norm": 0.3869384825229645, |
| "learning_rate": 0.0003715807189982496, |
| "loss": 3.0534, |
| "step": 56650 |
| }, |
| { |
| "epoch": 15.258397932816537, |
| "grad_norm": 0.394001305103302, |
| "learning_rate": 0.00037137875319779185, |
| "loss": 3.0312, |
| "step": 56700 |
| }, |
| { |
| "epoch": 15.271856158484065, |
| "grad_norm": 0.4037325084209442, |
| "learning_rate": 0.00037117678739733405, |
| "loss": 3.051, |
| "step": 56750 |
| }, |
| { |
| "epoch": 15.285314384151594, |
| "grad_norm": 0.4047858715057373, |
| "learning_rate": 0.00037097482159687624, |
| "loss": 3.0406, |
| "step": 56800 |
| }, |
| { |
| "epoch": 15.298772609819121, |
| "grad_norm": 0.39127621054649353, |
| "learning_rate": 0.00037077285579641844, |
| "loss": 3.0564, |
| "step": 56850 |
| }, |
| { |
| "epoch": 15.31223083548665, |
| "grad_norm": 0.4089362323284149, |
| "learning_rate": 0.00037057088999596063, |
| "loss": 3.0495, |
| "step": 56900 |
| }, |
| { |
| "epoch": 15.325689061154177, |
| "grad_norm": 0.4318329095840454, |
| "learning_rate": 0.0003703689241955029, |
| "loss": 3.0614, |
| "step": 56950 |
| }, |
| { |
| "epoch": 15.339147286821705, |
| "grad_norm": 0.3942766487598419, |
| "learning_rate": 0.0003701669583950451, |
| "loss": 3.0535, |
| "step": 57000 |
| }, |
| { |
| "epoch": 15.339147286821705, |
| "eval_accuracy": 0.39346882465397065, |
| "eval_loss": 3.296555280685425, |
| "eval_runtime": 53.8696, |
| "eval_samples_per_second": 334.363, |
| "eval_steps_per_second": 20.902, |
| "step": 57000 |
| }, |
| { |
| "epoch": 15.352605512489234, |
| "grad_norm": 0.45541346073150635, |
| "learning_rate": 0.0003699649925945873, |
| "loss": 3.0515, |
| "step": 57050 |
| }, |
| { |
| "epoch": 15.366063738156761, |
| "grad_norm": 0.40245404839515686, |
| "learning_rate": 0.0003697630267941295, |
| "loss": 3.0566, |
| "step": 57100 |
| }, |
| { |
| "epoch": 15.37952196382429, |
| "grad_norm": 0.3851865231990814, |
| "learning_rate": 0.00036956106099367167, |
| "loss": 3.0555, |
| "step": 57150 |
| }, |
| { |
| "epoch": 15.392980189491817, |
| "grad_norm": 0.3949066996574402, |
| "learning_rate": 0.0003693590951932139, |
| "loss": 3.0416, |
| "step": 57200 |
| }, |
| { |
| "epoch": 15.406438415159345, |
| "grad_norm": 0.38792628049850464, |
| "learning_rate": 0.0003691571293927561, |
| "loss": 3.0546, |
| "step": 57250 |
| }, |
| { |
| "epoch": 15.419896640826874, |
| "grad_norm": 0.41789019107818604, |
| "learning_rate": 0.0003689551635922983, |
| "loss": 3.0661, |
| "step": 57300 |
| }, |
| { |
| "epoch": 15.433354866494401, |
| "grad_norm": 0.38184309005737305, |
| "learning_rate": 0.00036875319779184056, |
| "loss": 3.0535, |
| "step": 57350 |
| }, |
| { |
| "epoch": 15.44681309216193, |
| "grad_norm": 0.4123072326183319, |
| "learning_rate": 0.0003685512319913828, |
| "loss": 3.0622, |
| "step": 57400 |
| }, |
| { |
| "epoch": 15.460271317829458, |
| "grad_norm": 0.38529086112976074, |
| "learning_rate": 0.000368349266190925, |
| "loss": 3.0553, |
| "step": 57450 |
| }, |
| { |
| "epoch": 15.473729543496985, |
| "grad_norm": 0.37527531385421753, |
| "learning_rate": 0.0003681473003904672, |
| "loss": 3.0544, |
| "step": 57500 |
| }, |
| { |
| "epoch": 15.487187769164514, |
| "grad_norm": 0.38482967019081116, |
| "learning_rate": 0.0003679453345900094, |
| "loss": 3.063, |
| "step": 57550 |
| }, |
| { |
| "epoch": 15.500645994832041, |
| "grad_norm": 0.3870854377746582, |
| "learning_rate": 0.0003677433687895516, |
| "loss": 3.0678, |
| "step": 57600 |
| }, |
| { |
| "epoch": 15.514104220499568, |
| "grad_norm": 0.4207375645637512, |
| "learning_rate": 0.00036754140298909385, |
| "loss": 3.0578, |
| "step": 57650 |
| }, |
| { |
| "epoch": 15.527562446167098, |
| "grad_norm": 0.38036173582077026, |
| "learning_rate": 0.00036733943718863604, |
| "loss": 3.0655, |
| "step": 57700 |
| }, |
| { |
| "epoch": 15.541020671834625, |
| "grad_norm": 0.37135618925094604, |
| "learning_rate": 0.00036713747138817824, |
| "loss": 3.0561, |
| "step": 57750 |
| }, |
| { |
| "epoch": 15.554478897502154, |
| "grad_norm": 0.4144805073738098, |
| "learning_rate": 0.00036693550558772044, |
| "loss": 3.0701, |
| "step": 57800 |
| }, |
| { |
| "epoch": 15.567937123169681, |
| "grad_norm": 0.3989149332046509, |
| "learning_rate": 0.00036673353978726263, |
| "loss": 3.0672, |
| "step": 57850 |
| }, |
| { |
| "epoch": 15.581395348837209, |
| "grad_norm": 0.4064973294734955, |
| "learning_rate": 0.0003665315739868049, |
| "loss": 3.0692, |
| "step": 57900 |
| }, |
| { |
| "epoch": 15.594853574504738, |
| "grad_norm": 0.3864794373512268, |
| "learning_rate": 0.0003663296081863471, |
| "loss": 3.0669, |
| "step": 57950 |
| }, |
| { |
| "epoch": 15.608311800172265, |
| "grad_norm": 0.3883097767829895, |
| "learning_rate": 0.0003661276423858893, |
| "loss": 3.0641, |
| "step": 58000 |
| }, |
| { |
| "epoch": 15.608311800172265, |
| "eval_accuracy": 0.393839636372012, |
| "eval_loss": 3.2892065048217773, |
| "eval_runtime": 53.801, |
| "eval_samples_per_second": 334.789, |
| "eval_steps_per_second": 20.929, |
| "step": 58000 |
| }, |
| { |
| "epoch": 15.621770025839794, |
| "grad_norm": 0.4003264605998993, |
| "learning_rate": 0.00036592567658543147, |
| "loss": 3.0776, |
| "step": 58050 |
| }, |
| { |
| "epoch": 15.635228251507321, |
| "grad_norm": 0.40963396430015564, |
| "learning_rate": 0.00036572371078497367, |
| "loss": 3.0615, |
| "step": 58100 |
| }, |
| { |
| "epoch": 15.648686477174849, |
| "grad_norm": 0.3997986912727356, |
| "learning_rate": 0.0003655217449845159, |
| "loss": 3.0714, |
| "step": 58150 |
| }, |
| { |
| "epoch": 15.662144702842378, |
| "grad_norm": 0.40300729870796204, |
| "learning_rate": 0.0003653197791840581, |
| "loss": 3.0714, |
| "step": 58200 |
| }, |
| { |
| "epoch": 15.675602928509905, |
| "grad_norm": 0.4134303033351898, |
| "learning_rate": 0.00036511781338360036, |
| "loss": 3.0696, |
| "step": 58250 |
| }, |
| { |
| "epoch": 15.689061154177434, |
| "grad_norm": 0.36091458797454834, |
| "learning_rate": 0.00036491584758314256, |
| "loss": 3.0863, |
| "step": 58300 |
| }, |
| { |
| "epoch": 15.702519379844961, |
| "grad_norm": 0.380769819021225, |
| "learning_rate": 0.0003647138817826848, |
| "loss": 3.0681, |
| "step": 58350 |
| }, |
| { |
| "epoch": 15.715977605512489, |
| "grad_norm": 0.3838481605052948, |
| "learning_rate": 0.000364511915982227, |
| "loss": 3.0671, |
| "step": 58400 |
| }, |
| { |
| "epoch": 15.729435831180018, |
| "grad_norm": 0.4186742901802063, |
| "learning_rate": 0.0003643099501817692, |
| "loss": 3.0736, |
| "step": 58450 |
| }, |
| { |
| "epoch": 15.742894056847545, |
| "grad_norm": 0.4187677800655365, |
| "learning_rate": 0.0003641079843813114, |
| "loss": 3.0783, |
| "step": 58500 |
| }, |
| { |
| "epoch": 15.756352282515074, |
| "grad_norm": 0.3893994987010956, |
| "learning_rate": 0.0003639060185808536, |
| "loss": 3.074, |
| "step": 58550 |
| }, |
| { |
| "epoch": 15.769810508182601, |
| "grad_norm": 0.41808056831359863, |
| "learning_rate": 0.00036370405278039584, |
| "loss": 3.0731, |
| "step": 58600 |
| }, |
| { |
| "epoch": 15.783268733850129, |
| "grad_norm": 0.38791757822036743, |
| "learning_rate": 0.00036350208697993804, |
| "loss": 3.0766, |
| "step": 58650 |
| }, |
| { |
| "epoch": 15.796726959517658, |
| "grad_norm": 0.3836047649383545, |
| "learning_rate": 0.00036330012117948024, |
| "loss": 3.0732, |
| "step": 58700 |
| }, |
| { |
| "epoch": 15.810185185185185, |
| "grad_norm": 0.3814838230609894, |
| "learning_rate": 0.00036309815537902243, |
| "loss": 3.086, |
| "step": 58750 |
| }, |
| { |
| "epoch": 15.823643410852712, |
| "grad_norm": 0.3762393891811371, |
| "learning_rate": 0.0003628961895785647, |
| "loss": 3.0793, |
| "step": 58800 |
| }, |
| { |
| "epoch": 15.837101636520241, |
| "grad_norm": 0.4067472517490387, |
| "learning_rate": 0.0003626942237781069, |
| "loss": 3.084, |
| "step": 58850 |
| }, |
| { |
| "epoch": 15.850559862187769, |
| "grad_norm": 0.3889180123806, |
| "learning_rate": 0.0003624922579776491, |
| "loss": 3.0767, |
| "step": 58900 |
| }, |
| { |
| "epoch": 15.864018087855298, |
| "grad_norm": 0.36162832379341125, |
| "learning_rate": 0.00036229029217719127, |
| "loss": 3.0917, |
| "step": 58950 |
| }, |
| { |
| "epoch": 15.877476313522825, |
| "grad_norm": 0.41509488224983215, |
| "learning_rate": 0.00036208832637673347, |
| "loss": 3.082, |
| "step": 59000 |
| }, |
| { |
| "epoch": 15.877476313522825, |
| "eval_accuracy": 0.39411407832916784, |
| "eval_loss": 3.2861170768737793, |
| "eval_runtime": 53.7782, |
| "eval_samples_per_second": 334.931, |
| "eval_steps_per_second": 20.938, |
| "step": 59000 |
| }, |
| { |
| "epoch": 15.890934539190352, |
| "grad_norm": 0.38733652234077454, |
| "learning_rate": 0.0003618863605762757, |
| "loss": 3.0888, |
| "step": 59050 |
| }, |
| { |
| "epoch": 15.904392764857882, |
| "grad_norm": 0.3730623126029968, |
| "learning_rate": 0.0003616843947758179, |
| "loss": 3.0815, |
| "step": 59100 |
| }, |
| { |
| "epoch": 15.917850990525409, |
| "grad_norm": 0.4350191056728363, |
| "learning_rate": 0.00036148242897536016, |
| "loss": 3.0842, |
| "step": 59150 |
| }, |
| { |
| "epoch": 15.931309216192938, |
| "grad_norm": 0.3823108971118927, |
| "learning_rate": 0.00036128046317490236, |
| "loss": 3.0824, |
| "step": 59200 |
| }, |
| { |
| "epoch": 15.944767441860465, |
| "grad_norm": 0.3937095105648041, |
| "learning_rate": 0.0003610784973744446, |
| "loss": 3.0881, |
| "step": 59250 |
| }, |
| { |
| "epoch": 15.958225667527993, |
| "grad_norm": 0.43180474638938904, |
| "learning_rate": 0.0003608765315739868, |
| "loss": 3.0817, |
| "step": 59300 |
| }, |
| { |
| "epoch": 15.971683893195522, |
| "grad_norm": 0.37747758626937866, |
| "learning_rate": 0.000360674565773529, |
| "loss": 3.0744, |
| "step": 59350 |
| }, |
| { |
| "epoch": 15.985142118863049, |
| "grad_norm": 0.38189202547073364, |
| "learning_rate": 0.0003604725999730712, |
| "loss": 3.0809, |
| "step": 59400 |
| }, |
| { |
| "epoch": 15.998600344530576, |
| "grad_norm": 0.38622957468032837, |
| "learning_rate": 0.0003602706341726134, |
| "loss": 3.0887, |
| "step": 59450 |
| }, |
| { |
| "epoch": 16.011843238587424, |
| "grad_norm": 0.39193543791770935, |
| "learning_rate": 0.00036006866837215564, |
| "loss": 3.0044, |
| "step": 59500 |
| }, |
| { |
| "epoch": 16.02530146425495, |
| "grad_norm": 0.36499685049057007, |
| "learning_rate": 0.00035986670257169784, |
| "loss": 2.9916, |
| "step": 59550 |
| }, |
| { |
| "epoch": 16.03875968992248, |
| "grad_norm": 0.4056093096733093, |
| "learning_rate": 0.00035966473677124004, |
| "loss": 2.9866, |
| "step": 59600 |
| }, |
| { |
| "epoch": 16.05221791559001, |
| "grad_norm": 0.3871045708656311, |
| "learning_rate": 0.00035946277097078223, |
| "loss": 2.999, |
| "step": 59650 |
| }, |
| { |
| "epoch": 16.065676141257537, |
| "grad_norm": 0.40806224942207336, |
| "learning_rate": 0.00035926080517032443, |
| "loss": 3.0065, |
| "step": 59700 |
| }, |
| { |
| "epoch": 16.079134366925064, |
| "grad_norm": 0.3846936523914337, |
| "learning_rate": 0.0003590588393698667, |
| "loss": 3.0043, |
| "step": 59750 |
| }, |
| { |
| "epoch": 16.09259259259259, |
| "grad_norm": 0.41798853874206543, |
| "learning_rate": 0.0003588568735694089, |
| "loss": 3.005, |
| "step": 59800 |
| }, |
| { |
| "epoch": 16.10605081826012, |
| "grad_norm": 0.38085705041885376, |
| "learning_rate": 0.00035865490776895107, |
| "loss": 2.99, |
| "step": 59850 |
| }, |
| { |
| "epoch": 16.11950904392765, |
| "grad_norm": 0.37874123454093933, |
| "learning_rate": 0.00035845294196849327, |
| "loss": 3.0201, |
| "step": 59900 |
| }, |
| { |
| "epoch": 16.132967269595177, |
| "grad_norm": 0.38884493708610535, |
| "learning_rate": 0.00035825097616803546, |
| "loss": 2.9971, |
| "step": 59950 |
| }, |
| { |
| "epoch": 16.146425495262704, |
| "grad_norm": 0.39228013157844543, |
| "learning_rate": 0.0003580490103675777, |
| "loss": 3.0178, |
| "step": 60000 |
| }, |
| { |
| "epoch": 16.146425495262704, |
| "eval_accuracy": 0.3935985489995146, |
| "eval_loss": 3.300384283065796, |
| "eval_runtime": 53.764, |
| "eval_samples_per_second": 335.02, |
| "eval_steps_per_second": 20.943, |
| "step": 60000 |
| }, |
| { |
| "epoch": 16.15988372093023, |
| "grad_norm": 0.37571796774864197, |
| "learning_rate": 0.00035784704456711996, |
| "loss": 3.0206, |
| "step": 60050 |
| }, |
| { |
| "epoch": 16.17334194659776, |
| "grad_norm": 0.3760213255882263, |
| "learning_rate": 0.00035764507876666216, |
| "loss": 3.024, |
| "step": 60100 |
| }, |
| { |
| "epoch": 16.18680017226529, |
| "grad_norm": 0.4139486849308014, |
| "learning_rate": 0.00035744311296620436, |
| "loss": 3.0225, |
| "step": 60150 |
| }, |
| { |
| "epoch": 16.200258397932817, |
| "grad_norm": 0.41361162066459656, |
| "learning_rate": 0.0003572411471657466, |
| "loss": 3.0177, |
| "step": 60200 |
| }, |
| { |
| "epoch": 16.213716623600344, |
| "grad_norm": 0.4155104160308838, |
| "learning_rate": 0.0003570391813652888, |
| "loss": 3.0305, |
| "step": 60250 |
| }, |
| { |
| "epoch": 16.227174849267872, |
| "grad_norm": 0.4072454869747162, |
| "learning_rate": 0.000356837215564831, |
| "loss": 3.0319, |
| "step": 60300 |
| }, |
| { |
| "epoch": 16.2406330749354, |
| "grad_norm": 0.3964226543903351, |
| "learning_rate": 0.0003566352497643732, |
| "loss": 3.0271, |
| "step": 60350 |
| }, |
| { |
| "epoch": 16.25409130060293, |
| "grad_norm": 0.40195783972740173, |
| "learning_rate": 0.0003564332839639154, |
| "loss": 3.0424, |
| "step": 60400 |
| }, |
| { |
| "epoch": 16.267549526270457, |
| "grad_norm": 0.39406704902648926, |
| "learning_rate": 0.00035623131816345764, |
| "loss": 3.0325, |
| "step": 60450 |
| }, |
| { |
| "epoch": 16.281007751937985, |
| "grad_norm": 0.3998791575431824, |
| "learning_rate": 0.00035602935236299984, |
| "loss": 3.0188, |
| "step": 60500 |
| }, |
| { |
| "epoch": 16.294465977605512, |
| "grad_norm": 0.4291344881057739, |
| "learning_rate": 0.00035582738656254203, |
| "loss": 3.0384, |
| "step": 60550 |
| }, |
| { |
| "epoch": 16.30792420327304, |
| "grad_norm": 0.42071303725242615, |
| "learning_rate": 0.00035562542076208423, |
| "loss": 3.04, |
| "step": 60600 |
| }, |
| { |
| "epoch": 16.32138242894057, |
| "grad_norm": 0.40110665559768677, |
| "learning_rate": 0.0003554234549616264, |
| "loss": 3.0441, |
| "step": 60650 |
| }, |
| { |
| "epoch": 16.334840654608097, |
| "grad_norm": 0.4020436704158783, |
| "learning_rate": 0.0003552214891611687, |
| "loss": 3.035, |
| "step": 60700 |
| }, |
| { |
| "epoch": 16.348298880275625, |
| "grad_norm": 0.39630720019340515, |
| "learning_rate": 0.00035501952336071087, |
| "loss": 3.0418, |
| "step": 60750 |
| }, |
| { |
| "epoch": 16.361757105943152, |
| "grad_norm": 0.39753347635269165, |
| "learning_rate": 0.00035481755756025307, |
| "loss": 3.0448, |
| "step": 60800 |
| }, |
| { |
| "epoch": 16.37521533161068, |
| "grad_norm": 0.39796334505081177, |
| "learning_rate": 0.00035461559175979526, |
| "loss": 3.0384, |
| "step": 60850 |
| }, |
| { |
| "epoch": 16.38867355727821, |
| "grad_norm": 0.4023888111114502, |
| "learning_rate": 0.0003544136259593375, |
| "loss": 3.0512, |
| "step": 60900 |
| }, |
| { |
| "epoch": 16.402131782945737, |
| "grad_norm": 0.3885258436203003, |
| "learning_rate": 0.00035421166015887977, |
| "loss": 3.0415, |
| "step": 60950 |
| }, |
| { |
| "epoch": 16.415590008613265, |
| "grad_norm": 0.39868536591529846, |
| "learning_rate": 0.00035400969435842196, |
| "loss": 3.0405, |
| "step": 61000 |
| }, |
| { |
| "epoch": 16.415590008613265, |
| "eval_accuracy": 0.39402955107553866, |
| "eval_loss": 3.2945430278778076, |
| "eval_runtime": 53.9312, |
| "eval_samples_per_second": 333.981, |
| "eval_steps_per_second": 20.878, |
| "step": 61000 |
| }, |
| { |
| "epoch": 16.429048234280792, |
| "grad_norm": 0.4093899130821228, |
| "learning_rate": 0.00035380772855796416, |
| "loss": 3.0436, |
| "step": 61050 |
| }, |
| { |
| "epoch": 16.44250645994832, |
| "grad_norm": 0.44660383462905884, |
| "learning_rate": 0.0003536057627575064, |
| "loss": 3.041, |
| "step": 61100 |
| }, |
| { |
| "epoch": 16.45596468561585, |
| "grad_norm": 0.39885213971138, |
| "learning_rate": 0.0003534037969570486, |
| "loss": 3.05, |
| "step": 61150 |
| }, |
| { |
| "epoch": 16.469422911283377, |
| "grad_norm": 0.3952399492263794, |
| "learning_rate": 0.0003532018311565908, |
| "loss": 3.0525, |
| "step": 61200 |
| }, |
| { |
| "epoch": 16.482881136950905, |
| "grad_norm": 0.3856141269207001, |
| "learning_rate": 0.000352999865356133, |
| "loss": 3.0495, |
| "step": 61250 |
| }, |
| { |
| "epoch": 16.496339362618432, |
| "grad_norm": 0.4019649028778076, |
| "learning_rate": 0.0003527978995556752, |
| "loss": 3.0524, |
| "step": 61300 |
| }, |
| { |
| "epoch": 16.50979758828596, |
| "grad_norm": 0.40786245465278625, |
| "learning_rate": 0.00035259593375521744, |
| "loss": 3.0536, |
| "step": 61350 |
| }, |
| { |
| "epoch": 16.52325581395349, |
| "grad_norm": 0.39933323860168457, |
| "learning_rate": 0.00035239396795475964, |
| "loss": 3.0513, |
| "step": 61400 |
| }, |
| { |
| "epoch": 16.536714039621017, |
| "grad_norm": 0.4114389717578888, |
| "learning_rate": 0.00035219200215430183, |
| "loss": 3.0541, |
| "step": 61450 |
| }, |
| { |
| "epoch": 16.550172265288545, |
| "grad_norm": 0.378556489944458, |
| "learning_rate": 0.00035199003635384403, |
| "loss": 3.0464, |
| "step": 61500 |
| }, |
| { |
| "epoch": 16.563630490956072, |
| "grad_norm": 0.3831023573875427, |
| "learning_rate": 0.0003517880705533862, |
| "loss": 3.0496, |
| "step": 61550 |
| }, |
| { |
| "epoch": 16.5770887166236, |
| "grad_norm": 0.41064491868019104, |
| "learning_rate": 0.0003515861047529285, |
| "loss": 3.0466, |
| "step": 61600 |
| }, |
| { |
| "epoch": 16.590546942291127, |
| "grad_norm": 0.3654628098011017, |
| "learning_rate": 0.0003513841389524707, |
| "loss": 3.0648, |
| "step": 61650 |
| }, |
| { |
| "epoch": 16.604005167958658, |
| "grad_norm": 0.4042568504810333, |
| "learning_rate": 0.00035118217315201287, |
| "loss": 3.046, |
| "step": 61700 |
| }, |
| { |
| "epoch": 16.617463393626185, |
| "grad_norm": 0.3990447223186493, |
| "learning_rate": 0.00035098020735155507, |
| "loss": 3.0588, |
| "step": 61750 |
| }, |
| { |
| "epoch": 16.630921619293712, |
| "grad_norm": 0.3792515695095062, |
| "learning_rate": 0.00035077824155109737, |
| "loss": 3.0543, |
| "step": 61800 |
| }, |
| { |
| "epoch": 16.64437984496124, |
| "grad_norm": 0.40316513180732727, |
| "learning_rate": 0.00035057627575063957, |
| "loss": 3.0559, |
| "step": 61850 |
| }, |
| { |
| "epoch": 16.657838070628767, |
| "grad_norm": 0.4269405007362366, |
| "learning_rate": 0.00035037430995018176, |
| "loss": 3.0573, |
| "step": 61900 |
| }, |
| { |
| "epoch": 16.671296296296298, |
| "grad_norm": 0.3970986008644104, |
| "learning_rate": 0.00035017234414972396, |
| "loss": 3.0569, |
| "step": 61950 |
| }, |
| { |
| "epoch": 16.684754521963825, |
| "grad_norm": 0.40799281001091003, |
| "learning_rate": 0.00034997037834926615, |
| "loss": 3.0625, |
| "step": 62000 |
| }, |
| { |
| "epoch": 16.684754521963825, |
| "eval_accuracy": 0.3946576385475567, |
| "eval_loss": 3.286635637283325, |
| "eval_runtime": 53.7597, |
| "eval_samples_per_second": 335.047, |
| "eval_steps_per_second": 20.945, |
| "step": 62000 |
| }, |
| { |
| "epoch": 16.698212747631352, |
| "grad_norm": 0.43438488245010376, |
| "learning_rate": 0.0003497684125488084, |
| "loss": 3.0667, |
| "step": 62050 |
| }, |
| { |
| "epoch": 16.71167097329888, |
| "grad_norm": 0.385447233915329, |
| "learning_rate": 0.0003495664467483506, |
| "loss": 3.0618, |
| "step": 62100 |
| }, |
| { |
| "epoch": 16.725129198966407, |
| "grad_norm": 0.4281361699104309, |
| "learning_rate": 0.0003493644809478928, |
| "loss": 3.0627, |
| "step": 62150 |
| }, |
| { |
| "epoch": 16.738587424633938, |
| "grad_norm": 0.41003167629241943, |
| "learning_rate": 0.000349162515147435, |
| "loss": 3.0556, |
| "step": 62200 |
| }, |
| { |
| "epoch": 16.752045650301465, |
| "grad_norm": 0.40472331643104553, |
| "learning_rate": 0.0003489605493469772, |
| "loss": 3.0607, |
| "step": 62250 |
| }, |
| { |
| "epoch": 16.765503875968992, |
| "grad_norm": 0.39419615268707275, |
| "learning_rate": 0.00034875858354651944, |
| "loss": 3.0669, |
| "step": 62300 |
| }, |
| { |
| "epoch": 16.77896210163652, |
| "grad_norm": 0.40481939911842346, |
| "learning_rate": 0.00034855661774606164, |
| "loss": 3.0657, |
| "step": 62350 |
| }, |
| { |
| "epoch": 16.792420327304047, |
| "grad_norm": 0.4163655936717987, |
| "learning_rate": 0.00034835465194560383, |
| "loss": 3.0697, |
| "step": 62400 |
| }, |
| { |
| "epoch": 16.805878552971578, |
| "grad_norm": 0.3779263496398926, |
| "learning_rate": 0.00034815268614514603, |
| "loss": 3.0607, |
| "step": 62450 |
| }, |
| { |
| "epoch": 16.819336778639105, |
| "grad_norm": 0.3963411748409271, |
| "learning_rate": 0.0003479507203446882, |
| "loss": 3.0616, |
| "step": 62500 |
| }, |
| { |
| "epoch": 16.832795004306632, |
| "grad_norm": 0.3897510766983032, |
| "learning_rate": 0.0003477487545442305, |
| "loss": 3.0729, |
| "step": 62550 |
| }, |
| { |
| "epoch": 16.84625322997416, |
| "grad_norm": 0.3864540159702301, |
| "learning_rate": 0.00034754678874377267, |
| "loss": 3.0724, |
| "step": 62600 |
| }, |
| { |
| "epoch": 16.859711455641687, |
| "grad_norm": 0.42142173647880554, |
| "learning_rate": 0.00034734482294331487, |
| "loss": 3.0703, |
| "step": 62650 |
| }, |
| { |
| "epoch": 16.873169681309218, |
| "grad_norm": 0.3723422884941101, |
| "learning_rate": 0.00034714285714285717, |
| "loss": 3.0714, |
| "step": 62700 |
| }, |
| { |
| "epoch": 16.886627906976745, |
| "grad_norm": 0.39334335923194885, |
| "learning_rate": 0.00034694089134239937, |
| "loss": 3.0651, |
| "step": 62750 |
| }, |
| { |
| "epoch": 16.900086132644272, |
| "grad_norm": 0.408385306596756, |
| "learning_rate": 0.00034673892554194156, |
| "loss": 3.0698, |
| "step": 62800 |
| }, |
| { |
| "epoch": 16.9135443583118, |
| "grad_norm": 0.40279293060302734, |
| "learning_rate": 0.00034653695974148376, |
| "loss": 3.0715, |
| "step": 62850 |
| }, |
| { |
| "epoch": 16.927002583979327, |
| "grad_norm": 0.39347708225250244, |
| "learning_rate": 0.00034633499394102595, |
| "loss": 3.0641, |
| "step": 62900 |
| }, |
| { |
| "epoch": 16.940460809646858, |
| "grad_norm": 0.3958011269569397, |
| "learning_rate": 0.0003461330281405682, |
| "loss": 3.0754, |
| "step": 62950 |
| }, |
| { |
| "epoch": 16.953919035314385, |
| "grad_norm": 0.3987782895565033, |
| "learning_rate": 0.0003459310623401104, |
| "loss": 3.0662, |
| "step": 63000 |
| }, |
| { |
| "epoch": 16.953919035314385, |
| "eval_accuracy": 0.39508483798363603, |
| "eval_loss": 3.2805721759796143, |
| "eval_runtime": 53.7065, |
| "eval_samples_per_second": 335.378, |
| "eval_steps_per_second": 20.966, |
| "step": 63000 |
| }, |
| { |
| "epoch": 16.967377260981912, |
| "grad_norm": 0.4031330645084381, |
| "learning_rate": 0.0003457290965396526, |
| "loss": 3.0679, |
| "step": 63050 |
| }, |
| { |
| "epoch": 16.98083548664944, |
| "grad_norm": 0.410900741815567, |
| "learning_rate": 0.0003455271307391948, |
| "loss": 3.072, |
| "step": 63100 |
| }, |
| { |
| "epoch": 16.994293712316967, |
| "grad_norm": 0.3965557813644409, |
| "learning_rate": 0.000345325164938737, |
| "loss": 3.0664, |
| "step": 63150 |
| }, |
| { |
| "epoch": 17.007536606373815, |
| "grad_norm": 0.41293439269065857, |
| "learning_rate": 0.00034512319913827924, |
| "loss": 3.013, |
| "step": 63200 |
| }, |
| { |
| "epoch": 17.020994832041342, |
| "grad_norm": 0.414392352104187, |
| "learning_rate": 0.00034492123333782144, |
| "loss": 2.9733, |
| "step": 63250 |
| }, |
| { |
| "epoch": 17.034453057708873, |
| "grad_norm": 0.4076540470123291, |
| "learning_rate": 0.00034471926753736363, |
| "loss": 2.9819, |
| "step": 63300 |
| }, |
| { |
| "epoch": 17.0479112833764, |
| "grad_norm": 0.4018961489200592, |
| "learning_rate": 0.00034451730173690583, |
| "loss": 2.9865, |
| "step": 63350 |
| }, |
| { |
| "epoch": 17.061369509043928, |
| "grad_norm": 0.4264177978038788, |
| "learning_rate": 0.000344315335936448, |
| "loss": 2.9894, |
| "step": 63400 |
| }, |
| { |
| "epoch": 17.074827734711455, |
| "grad_norm": 0.43880993127822876, |
| "learning_rate": 0.0003441133701359903, |
| "loss": 2.997, |
| "step": 63450 |
| }, |
| { |
| "epoch": 17.088285960378983, |
| "grad_norm": 0.4118068218231201, |
| "learning_rate": 0.00034391140433553247, |
| "loss": 2.9963, |
| "step": 63500 |
| }, |
| { |
| "epoch": 17.101744186046513, |
| "grad_norm": 0.37996986508369446, |
| "learning_rate": 0.00034370943853507467, |
| "loss": 2.9818, |
| "step": 63550 |
| }, |
| { |
| "epoch": 17.11520241171404, |
| "grad_norm": 0.41833794116973877, |
| "learning_rate": 0.0003435074727346169, |
| "loss": 3.0017, |
| "step": 63600 |
| }, |
| { |
| "epoch": 17.128660637381568, |
| "grad_norm": 0.39799895882606506, |
| "learning_rate": 0.00034330550693415917, |
| "loss": 2.9962, |
| "step": 63650 |
| }, |
| { |
| "epoch": 17.142118863049095, |
| "grad_norm": 0.41376993060112, |
| "learning_rate": 0.00034310354113370136, |
| "loss": 3.0022, |
| "step": 63700 |
| }, |
| { |
| "epoch": 17.155577088716623, |
| "grad_norm": 0.37583568692207336, |
| "learning_rate": 0.00034290157533324356, |
| "loss": 3.0003, |
| "step": 63750 |
| }, |
| { |
| "epoch": 17.16903531438415, |
| "grad_norm": 0.4121802747249603, |
| "learning_rate": 0.00034269960953278576, |
| "loss": 3.0103, |
| "step": 63800 |
| }, |
| { |
| "epoch": 17.18249354005168, |
| "grad_norm": 0.41035139560699463, |
| "learning_rate": 0.00034249764373232795, |
| "loss": 3.0181, |
| "step": 63850 |
| }, |
| { |
| "epoch": 17.195951765719208, |
| "grad_norm": 0.4261062741279602, |
| "learning_rate": 0.0003422956779318702, |
| "loss": 3.0084, |
| "step": 63900 |
| }, |
| { |
| "epoch": 17.209409991386735, |
| "grad_norm": 0.40924733877182007, |
| "learning_rate": 0.0003420937121314124, |
| "loss": 2.9905, |
| "step": 63950 |
| }, |
| { |
| "epoch": 17.222868217054263, |
| "grad_norm": 0.40608179569244385, |
| "learning_rate": 0.0003418917463309546, |
| "loss": 3.0215, |
| "step": 64000 |
| }, |
| { |
| "epoch": 17.222868217054263, |
| "eval_accuracy": 0.39425086472032345, |
| "eval_loss": 3.298827886581421, |
| "eval_runtime": 53.7996, |
| "eval_samples_per_second": 334.798, |
| "eval_steps_per_second": 20.93, |
| "step": 64000 |
| }, |
| { |
| "epoch": 17.23632644272179, |
| "grad_norm": 0.39848390221595764, |
| "learning_rate": 0.0003416897805304968, |
| "loss": 3.0149, |
| "step": 64050 |
| }, |
| { |
| "epoch": 17.24978466838932, |
| "grad_norm": 0.40141576528549194, |
| "learning_rate": 0.000341487814730039, |
| "loss": 3.016, |
| "step": 64100 |
| }, |
| { |
| "epoch": 17.263242894056848, |
| "grad_norm": 0.40395596623420715, |
| "learning_rate": 0.00034128584892958124, |
| "loss": 3.0128, |
| "step": 64150 |
| }, |
| { |
| "epoch": 17.276701119724375, |
| "grad_norm": 0.3964282274246216, |
| "learning_rate": 0.00034108388312912343, |
| "loss": 3.0192, |
| "step": 64200 |
| }, |
| { |
| "epoch": 17.290159345391903, |
| "grad_norm": 0.3771671950817108, |
| "learning_rate": 0.00034088191732866563, |
| "loss": 3.026, |
| "step": 64250 |
| }, |
| { |
| "epoch": 17.30361757105943, |
| "grad_norm": 0.4377134442329407, |
| "learning_rate": 0.0003406799515282078, |
| "loss": 3.0216, |
| "step": 64300 |
| }, |
| { |
| "epoch": 17.31707579672696, |
| "grad_norm": 0.4398791790008545, |
| "learning_rate": 0.00034047798572775, |
| "loss": 3.024, |
| "step": 64350 |
| }, |
| { |
| "epoch": 17.330534022394488, |
| "grad_norm": 0.4335786998271942, |
| "learning_rate": 0.00034027601992729227, |
| "loss": 3.0369, |
| "step": 64400 |
| }, |
| { |
| "epoch": 17.343992248062015, |
| "grad_norm": 0.43530410528182983, |
| "learning_rate": 0.00034007405412683447, |
| "loss": 3.0229, |
| "step": 64450 |
| }, |
| { |
| "epoch": 17.357450473729543, |
| "grad_norm": 0.38939180970191956, |
| "learning_rate": 0.0003398720883263767, |
| "loss": 3.0386, |
| "step": 64500 |
| }, |
| { |
| "epoch": 17.37090869939707, |
| "grad_norm": 0.4043862521648407, |
| "learning_rate": 0.00033967012252591897, |
| "loss": 3.0247, |
| "step": 64550 |
| }, |
| { |
| "epoch": 17.3843669250646, |
| "grad_norm": 0.40165212750434875, |
| "learning_rate": 0.00033946815672546116, |
| "loss": 3.0206, |
| "step": 64600 |
| }, |
| { |
| "epoch": 17.397825150732128, |
| "grad_norm": 0.40205591917037964, |
| "learning_rate": 0.00033926619092500336, |
| "loss": 3.031, |
| "step": 64650 |
| }, |
| { |
| "epoch": 17.411283376399656, |
| "grad_norm": 0.40525221824645996, |
| "learning_rate": 0.00033906422512454556, |
| "loss": 3.0279, |
| "step": 64700 |
| }, |
| { |
| "epoch": 17.424741602067183, |
| "grad_norm": 0.3944297134876251, |
| "learning_rate": 0.00033886225932408775, |
| "loss": 3.045, |
| "step": 64750 |
| }, |
| { |
| "epoch": 17.43819982773471, |
| "grad_norm": 0.4010119140148163, |
| "learning_rate": 0.00033866029352363, |
| "loss": 3.0263, |
| "step": 64800 |
| }, |
| { |
| "epoch": 17.45165805340224, |
| "grad_norm": 0.41308003664016724, |
| "learning_rate": 0.0003384583277231722, |
| "loss": 3.0358, |
| "step": 64850 |
| }, |
| { |
| "epoch": 17.46511627906977, |
| "grad_norm": 0.39949336647987366, |
| "learning_rate": 0.0003382563619227144, |
| "loss": 3.0366, |
| "step": 64900 |
| }, |
| { |
| "epoch": 17.478574504737296, |
| "grad_norm": 0.39073440432548523, |
| "learning_rate": 0.0003380543961222566, |
| "loss": 3.0358, |
| "step": 64950 |
| }, |
| { |
| "epoch": 17.492032730404823, |
| "grad_norm": 0.4053451716899872, |
| "learning_rate": 0.0003378524303217988, |
| "loss": 3.0386, |
| "step": 65000 |
| }, |
| { |
| "epoch": 17.492032730404823, |
| "eval_accuracy": 0.39411951067194606, |
| "eval_loss": 3.292527675628662, |
| "eval_runtime": 53.6555, |
| "eval_samples_per_second": 335.697, |
| "eval_steps_per_second": 20.986, |
| "step": 65000 |
| }, |
| { |
| "epoch": 17.50549095607235, |
| "grad_norm": 0.38078823685646057, |
| "learning_rate": 0.00033765046452134104, |
| "loss": 3.0278, |
| "step": 65050 |
| }, |
| { |
| "epoch": 17.51894918173988, |
| "grad_norm": 0.4012027084827423, |
| "learning_rate": 0.00033744849872088323, |
| "loss": 3.0415, |
| "step": 65100 |
| }, |
| { |
| "epoch": 17.53240740740741, |
| "grad_norm": 0.4206222891807556, |
| "learning_rate": 0.00033724653292042543, |
| "loss": 3.0358, |
| "step": 65150 |
| }, |
| { |
| "epoch": 17.545865633074936, |
| "grad_norm": 0.4432675838470459, |
| "learning_rate": 0.0003370445671199676, |
| "loss": 3.0415, |
| "step": 65200 |
| }, |
| { |
| "epoch": 17.559323858742463, |
| "grad_norm": 0.4019043445587158, |
| "learning_rate": 0.0003368426013195098, |
| "loss": 3.0508, |
| "step": 65250 |
| }, |
| { |
| "epoch": 17.57278208440999, |
| "grad_norm": 0.4079228639602661, |
| "learning_rate": 0.00033664063551905207, |
| "loss": 3.038, |
| "step": 65300 |
| }, |
| { |
| "epoch": 17.58624031007752, |
| "grad_norm": 0.404167115688324, |
| "learning_rate": 0.00033643866971859427, |
| "loss": 3.0394, |
| "step": 65350 |
| }, |
| { |
| "epoch": 17.59969853574505, |
| "grad_norm": 0.37649834156036377, |
| "learning_rate": 0.0003362367039181365, |
| "loss": 3.0455, |
| "step": 65400 |
| }, |
| { |
| "epoch": 17.613156761412576, |
| "grad_norm": 0.406495600938797, |
| "learning_rate": 0.0003360347381176787, |
| "loss": 3.0486, |
| "step": 65450 |
| }, |
| { |
| "epoch": 17.626614987080103, |
| "grad_norm": 0.3711218237876892, |
| "learning_rate": 0.00033583277231722097, |
| "loss": 3.0505, |
| "step": 65500 |
| }, |
| { |
| "epoch": 17.64007321274763, |
| "grad_norm": 0.4153537154197693, |
| "learning_rate": 0.00033563080651676316, |
| "loss": 3.0454, |
| "step": 65550 |
| }, |
| { |
| "epoch": 17.653531438415158, |
| "grad_norm": 0.4080434739589691, |
| "learning_rate": 0.00033542884071630536, |
| "loss": 3.0557, |
| "step": 65600 |
| }, |
| { |
| "epoch": 17.66698966408269, |
| "grad_norm": 0.403916597366333, |
| "learning_rate": 0.00033522687491584755, |
| "loss": 3.0523, |
| "step": 65650 |
| }, |
| { |
| "epoch": 17.680447889750216, |
| "grad_norm": 0.40088391304016113, |
| "learning_rate": 0.00033502490911538975, |
| "loss": 3.0431, |
| "step": 65700 |
| }, |
| { |
| "epoch": 17.693906115417743, |
| "grad_norm": 0.4177820086479187, |
| "learning_rate": 0.000334822943314932, |
| "loss": 3.0592, |
| "step": 65750 |
| }, |
| { |
| "epoch": 17.70736434108527, |
| "grad_norm": 0.4082454741001129, |
| "learning_rate": 0.0003346209775144742, |
| "loss": 3.0556, |
| "step": 65800 |
| }, |
| { |
| "epoch": 17.720822566752798, |
| "grad_norm": 0.4115118086338043, |
| "learning_rate": 0.0003344190117140164, |
| "loss": 3.0538, |
| "step": 65850 |
| }, |
| { |
| "epoch": 17.73428079242033, |
| "grad_norm": 0.4031737744808197, |
| "learning_rate": 0.0003342170459135586, |
| "loss": 3.047, |
| "step": 65900 |
| }, |
| { |
| "epoch": 17.747739018087856, |
| "grad_norm": 0.3955917954444885, |
| "learning_rate": 0.0003340150801131008, |
| "loss": 3.0408, |
| "step": 65950 |
| }, |
| { |
| "epoch": 17.761197243755383, |
| "grad_norm": 0.4012543559074402, |
| "learning_rate": 0.00033381311431264303, |
| "loss": 3.0558, |
| "step": 66000 |
| }, |
| { |
| "epoch": 17.761197243755383, |
| "eval_accuracy": 0.3950584367977339, |
| "eval_loss": 3.285689115524292, |
| "eval_runtime": 53.7002, |
| "eval_samples_per_second": 335.418, |
| "eval_steps_per_second": 20.968, |
| "step": 66000 |
| }, |
| { |
| "epoch": 17.77465546942291, |
| "grad_norm": 0.4043067693710327, |
| "learning_rate": 0.00033361114851218523, |
| "loss": 3.0489, |
| "step": 66050 |
| }, |
| { |
| "epoch": 17.788113695090438, |
| "grad_norm": 0.39902979135513306, |
| "learning_rate": 0.0003334091827117274, |
| "loss": 3.0574, |
| "step": 66100 |
| }, |
| { |
| "epoch": 17.80157192075797, |
| "grad_norm": 0.38922590017318726, |
| "learning_rate": 0.0003332072169112696, |
| "loss": 3.0436, |
| "step": 66150 |
| }, |
| { |
| "epoch": 17.815030146425496, |
| "grad_norm": 0.40953120589256287, |
| "learning_rate": 0.0003330052511108118, |
| "loss": 3.0573, |
| "step": 66200 |
| }, |
| { |
| "epoch": 17.828488372093023, |
| "grad_norm": 0.3957615792751312, |
| "learning_rate": 0.0003328032853103541, |
| "loss": 3.0469, |
| "step": 66250 |
| }, |
| { |
| "epoch": 17.84194659776055, |
| "grad_norm": 0.3910478949546814, |
| "learning_rate": 0.0003326013195098963, |
| "loss": 3.0572, |
| "step": 66300 |
| }, |
| { |
| "epoch": 17.855404823428078, |
| "grad_norm": 0.3920937776565552, |
| "learning_rate": 0.0003323993537094385, |
| "loss": 3.0609, |
| "step": 66350 |
| }, |
| { |
| "epoch": 17.86886304909561, |
| "grad_norm": 0.4142955243587494, |
| "learning_rate": 0.00033219738790898077, |
| "loss": 3.0574, |
| "step": 66400 |
| }, |
| { |
| "epoch": 17.882321274763136, |
| "grad_norm": 0.405781090259552, |
| "learning_rate": 0.00033199542210852296, |
| "loss": 3.0626, |
| "step": 66450 |
| }, |
| { |
| "epoch": 17.895779500430663, |
| "grad_norm": 0.4257899522781372, |
| "learning_rate": 0.00033179345630806516, |
| "loss": 3.0595, |
| "step": 66500 |
| }, |
| { |
| "epoch": 17.90923772609819, |
| "grad_norm": 0.39462465047836304, |
| "learning_rate": 0.00033159149050760735, |
| "loss": 3.0525, |
| "step": 66550 |
| }, |
| { |
| "epoch": 17.922695951765718, |
| "grad_norm": 0.4202898442745209, |
| "learning_rate": 0.00033138952470714955, |
| "loss": 3.0605, |
| "step": 66600 |
| }, |
| { |
| "epoch": 17.93615417743325, |
| "grad_norm": 0.40152961015701294, |
| "learning_rate": 0.0003311875589066918, |
| "loss": 3.0562, |
| "step": 66650 |
| }, |
| { |
| "epoch": 17.949612403100776, |
| "grad_norm": 0.40167486667633057, |
| "learning_rate": 0.000330985593106234, |
| "loss": 3.0591, |
| "step": 66700 |
| }, |
| { |
| "epoch": 17.963070628768303, |
| "grad_norm": 0.4044479727745056, |
| "learning_rate": 0.0003307836273057762, |
| "loss": 3.0511, |
| "step": 66750 |
| }, |
| { |
| "epoch": 17.97652885443583, |
| "grad_norm": 0.4120575785636902, |
| "learning_rate": 0.0003305816615053184, |
| "loss": 3.0666, |
| "step": 66800 |
| }, |
| { |
| "epoch": 17.989987080103358, |
| "grad_norm": 0.4056987762451172, |
| "learning_rate": 0.0003303796957048606, |
| "loss": 3.0567, |
| "step": 66850 |
| }, |
| { |
| "epoch": 18.003229974160206, |
| "grad_norm": 0.3993924558162689, |
| "learning_rate": 0.00033017772990440284, |
| "loss": 3.025, |
| "step": 66900 |
| }, |
| { |
| "epoch": 18.016688199827733, |
| "grad_norm": 0.4064941704273224, |
| "learning_rate": 0.00032997576410394503, |
| "loss": 2.9688, |
| "step": 66950 |
| }, |
| { |
| "epoch": 18.030146425495264, |
| "grad_norm": 0.41245749592781067, |
| "learning_rate": 0.00032977379830348723, |
| "loss": 2.9668, |
| "step": 67000 |
| }, |
| { |
| "epoch": 18.030146425495264, |
| "eval_accuracy": 0.39444577717920604, |
| "eval_loss": 3.294330358505249, |
| "eval_runtime": 53.7092, |
| "eval_samples_per_second": 335.361, |
| "eval_steps_per_second": 20.965, |
| "step": 67000 |
| }, |
| { |
| "epoch": 18.04360465116279, |
| "grad_norm": 0.40845730900764465, |
| "learning_rate": 0.0003295718325030294, |
| "loss": 2.9719, |
| "step": 67050 |
| }, |
| { |
| "epoch": 18.05706287683032, |
| "grad_norm": 0.3882702887058258, |
| "learning_rate": 0.0003293698667025716, |
| "loss": 2.9772, |
| "step": 67100 |
| }, |
| { |
| "epoch": 18.070521102497846, |
| "grad_norm": 0.41161108016967773, |
| "learning_rate": 0.0003291679009021139, |
| "loss": 2.9702, |
| "step": 67150 |
| }, |
| { |
| "epoch": 18.083979328165373, |
| "grad_norm": 0.40619149804115295, |
| "learning_rate": 0.0003289659351016561, |
| "loss": 2.9882, |
| "step": 67200 |
| }, |
| { |
| "epoch": 18.097437553832904, |
| "grad_norm": 0.4130079448223114, |
| "learning_rate": 0.0003287639693011983, |
| "loss": 2.9782, |
| "step": 67250 |
| }, |
| { |
| "epoch": 18.11089577950043, |
| "grad_norm": 0.4122353196144104, |
| "learning_rate": 0.0003285620035007405, |
| "loss": 2.9827, |
| "step": 67300 |
| }, |
| { |
| "epoch": 18.12435400516796, |
| "grad_norm": 0.42838728427886963, |
| "learning_rate": 0.00032836003770028276, |
| "loss": 2.9885, |
| "step": 67350 |
| }, |
| { |
| "epoch": 18.137812230835486, |
| "grad_norm": 0.4504906237125397, |
| "learning_rate": 0.00032815807189982496, |
| "loss": 3.0084, |
| "step": 67400 |
| }, |
| { |
| "epoch": 18.151270456503013, |
| "grad_norm": 0.4178699254989624, |
| "learning_rate": 0.00032795610609936715, |
| "loss": 2.9822, |
| "step": 67450 |
| }, |
| { |
| "epoch": 18.164728682170544, |
| "grad_norm": 0.4099842309951782, |
| "learning_rate": 0.00032775414029890935, |
| "loss": 3.001, |
| "step": 67500 |
| }, |
| { |
| "epoch": 18.17818690783807, |
| "grad_norm": 0.4077083170413971, |
| "learning_rate": 0.00032755217449845155, |
| "loss": 2.9985, |
| "step": 67550 |
| }, |
| { |
| "epoch": 18.1916451335056, |
| "grad_norm": 0.40748822689056396, |
| "learning_rate": 0.0003273502086979938, |
| "loss": 2.9892, |
| "step": 67600 |
| }, |
| { |
| "epoch": 18.205103359173126, |
| "grad_norm": 0.3962273597717285, |
| "learning_rate": 0.000327148242897536, |
| "loss": 2.9994, |
| "step": 67650 |
| }, |
| { |
| "epoch": 18.218561584840653, |
| "grad_norm": 0.4352484345436096, |
| "learning_rate": 0.0003269462770970782, |
| "loss": 3.0084, |
| "step": 67700 |
| }, |
| { |
| "epoch": 18.232019810508184, |
| "grad_norm": 0.393781453371048, |
| "learning_rate": 0.0003267443112966204, |
| "loss": 3.0002, |
| "step": 67750 |
| }, |
| { |
| "epoch": 18.24547803617571, |
| "grad_norm": 0.4139980673789978, |
| "learning_rate": 0.0003265423454961626, |
| "loss": 3.0153, |
| "step": 67800 |
| }, |
| { |
| "epoch": 18.25893626184324, |
| "grad_norm": 0.5772366523742676, |
| "learning_rate": 0.00032634037969570483, |
| "loss": 3.0038, |
| "step": 67850 |
| }, |
| { |
| "epoch": 18.272394487510766, |
| "grad_norm": 0.40956127643585205, |
| "learning_rate": 0.00032613841389524703, |
| "loss": 3.0068, |
| "step": 67900 |
| }, |
| { |
| "epoch": 18.285852713178294, |
| "grad_norm": 0.417221337556839, |
| "learning_rate": 0.0003259364480947892, |
| "loss": 3.0055, |
| "step": 67950 |
| }, |
| { |
| "epoch": 18.29931093884582, |
| "grad_norm": 0.4156797230243683, |
| "learning_rate": 0.0003257344822943314, |
| "loss": 3.006, |
| "step": 68000 |
| }, |
| { |
| "epoch": 18.29931093884582, |
| "eval_accuracy": 0.394656552079001, |
| "eval_loss": 3.295771360397339, |
| "eval_runtime": 53.9336, |
| "eval_samples_per_second": 333.966, |
| "eval_steps_per_second": 20.878, |
| "step": 68000 |
| }, |
| { |
| "epoch": 18.31276916451335, |
| "grad_norm": 0.3963870108127594, |
| "learning_rate": 0.0003255325164938737, |
| "loss": 3.0138, |
| "step": 68050 |
| }, |
| { |
| "epoch": 18.32622739018088, |
| "grad_norm": 0.39834779500961304, |
| "learning_rate": 0.0003253305506934159, |
| "loss": 3.0103, |
| "step": 68100 |
| }, |
| { |
| "epoch": 18.339685615848406, |
| "grad_norm": 0.43453970551490784, |
| "learning_rate": 0.0003251285848929581, |
| "loss": 3.0127, |
| "step": 68150 |
| }, |
| { |
| "epoch": 18.353143841515934, |
| "grad_norm": 0.40500518679618835, |
| "learning_rate": 0.0003249266190925003, |
| "loss": 3.0052, |
| "step": 68200 |
| }, |
| { |
| "epoch": 18.36660206718346, |
| "grad_norm": 0.4253470003604889, |
| "learning_rate": 0.0003247246532920425, |
| "loss": 3.0107, |
| "step": 68250 |
| }, |
| { |
| "epoch": 18.38006029285099, |
| "grad_norm": 0.39556410908699036, |
| "learning_rate": 0.00032452268749158476, |
| "loss": 3.0187, |
| "step": 68300 |
| }, |
| { |
| "epoch": 18.39351851851852, |
| "grad_norm": 0.41645529866218567, |
| "learning_rate": 0.00032432072169112696, |
| "loss": 3.0237, |
| "step": 68350 |
| }, |
| { |
| "epoch": 18.406976744186046, |
| "grad_norm": 0.4263641834259033, |
| "learning_rate": 0.00032411875589066915, |
| "loss": 3.0154, |
| "step": 68400 |
| }, |
| { |
| "epoch": 18.420434969853574, |
| "grad_norm": 0.4121094048023224, |
| "learning_rate": 0.00032391679009021135, |
| "loss": 3.0223, |
| "step": 68450 |
| }, |
| { |
| "epoch": 18.4338931955211, |
| "grad_norm": 0.41028955578804016, |
| "learning_rate": 0.0003237148242897536, |
| "loss": 3.0178, |
| "step": 68500 |
| }, |
| { |
| "epoch": 18.447351421188632, |
| "grad_norm": 0.4091287851333618, |
| "learning_rate": 0.0003235128584892958, |
| "loss": 3.0271, |
| "step": 68550 |
| }, |
| { |
| "epoch": 18.46080964685616, |
| "grad_norm": 0.4232977032661438, |
| "learning_rate": 0.000323310892688838, |
| "loss": 3.0182, |
| "step": 68600 |
| }, |
| { |
| "epoch": 18.474267872523686, |
| "grad_norm": 0.42817702889442444, |
| "learning_rate": 0.0003231089268883802, |
| "loss": 3.0154, |
| "step": 68650 |
| }, |
| { |
| "epoch": 18.487726098191214, |
| "grad_norm": 0.40232494473457336, |
| "learning_rate": 0.0003229069610879224, |
| "loss": 3.0274, |
| "step": 68700 |
| }, |
| { |
| "epoch": 18.50118432385874, |
| "grad_norm": 0.44148021936416626, |
| "learning_rate": 0.00032270499528746463, |
| "loss": 3.0238, |
| "step": 68750 |
| }, |
| { |
| "epoch": 18.514642549526272, |
| "grad_norm": 0.4260505437850952, |
| "learning_rate": 0.00032250302948700683, |
| "loss": 3.0144, |
| "step": 68800 |
| }, |
| { |
| "epoch": 18.5281007751938, |
| "grad_norm": 0.4293977916240692, |
| "learning_rate": 0.000322301063686549, |
| "loss": 3.0332, |
| "step": 68850 |
| }, |
| { |
| "epoch": 18.541559000861326, |
| "grad_norm": 0.40571919083595276, |
| "learning_rate": 0.0003220990978860912, |
| "loss": 3.0276, |
| "step": 68900 |
| }, |
| { |
| "epoch": 18.555017226528854, |
| "grad_norm": 0.42529913783073425, |
| "learning_rate": 0.0003218971320856335, |
| "loss": 3.0315, |
| "step": 68950 |
| }, |
| { |
| "epoch": 18.56847545219638, |
| "grad_norm": 0.41675060987472534, |
| "learning_rate": 0.0003216951662851757, |
| "loss": 3.033, |
| "step": 69000 |
| }, |
| { |
| "epoch": 18.56847545219638, |
| "eval_accuracy": 0.3946870818454146, |
| "eval_loss": 3.29016375541687, |
| "eval_runtime": 53.8549, |
| "eval_samples_per_second": 334.455, |
| "eval_steps_per_second": 20.908, |
| "step": 69000 |
| }, |
| { |
| "epoch": 18.581933677863912, |
| "grad_norm": 0.4186652898788452, |
| "learning_rate": 0.0003214932004847179, |
| "loss": 3.0313, |
| "step": 69050 |
| }, |
| { |
| "epoch": 18.59539190353144, |
| "grad_norm": 0.38911905884742737, |
| "learning_rate": 0.0003212912346842601, |
| "loss": 3.0319, |
| "step": 69100 |
| }, |
| { |
| "epoch": 18.608850129198967, |
| "grad_norm": 0.39345499873161316, |
| "learning_rate": 0.0003210892688838023, |
| "loss": 3.0337, |
| "step": 69150 |
| }, |
| { |
| "epoch": 18.622308354866494, |
| "grad_norm": 0.40627118945121765, |
| "learning_rate": 0.00032088730308334456, |
| "loss": 3.0292, |
| "step": 69200 |
| }, |
| { |
| "epoch": 18.63576658053402, |
| "grad_norm": 0.4483228623867035, |
| "learning_rate": 0.00032068533728288676, |
| "loss": 3.0318, |
| "step": 69250 |
| }, |
| { |
| "epoch": 18.649224806201552, |
| "grad_norm": 0.46649301052093506, |
| "learning_rate": 0.00032048337148242895, |
| "loss": 3.033, |
| "step": 69300 |
| }, |
| { |
| "epoch": 18.66268303186908, |
| "grad_norm": 0.4463086426258087, |
| "learning_rate": 0.00032028140568197115, |
| "loss": 3.0324, |
| "step": 69350 |
| }, |
| { |
| "epoch": 18.676141257536607, |
| "grad_norm": 0.38854748010635376, |
| "learning_rate": 0.00032007943988151334, |
| "loss": 3.0428, |
| "step": 69400 |
| }, |
| { |
| "epoch": 18.689599483204134, |
| "grad_norm": 0.39255139231681824, |
| "learning_rate": 0.0003198774740810556, |
| "loss": 3.0289, |
| "step": 69450 |
| }, |
| { |
| "epoch": 18.70305770887166, |
| "grad_norm": 0.3995414972305298, |
| "learning_rate": 0.0003196755082805978, |
| "loss": 3.0391, |
| "step": 69500 |
| }, |
| { |
| "epoch": 18.716515934539192, |
| "grad_norm": 0.391249418258667, |
| "learning_rate": 0.00031947354248014, |
| "loss": 3.047, |
| "step": 69550 |
| }, |
| { |
| "epoch": 18.72997416020672, |
| "grad_norm": 0.4146568179130554, |
| "learning_rate": 0.0003192715766796822, |
| "loss": 3.0407, |
| "step": 69600 |
| }, |
| { |
| "epoch": 18.743432385874247, |
| "grad_norm": 0.40213024616241455, |
| "learning_rate": 0.0003190696108792244, |
| "loss": 3.0424, |
| "step": 69650 |
| }, |
| { |
| "epoch": 18.756890611541774, |
| "grad_norm": 0.4518072009086609, |
| "learning_rate": 0.00031886764507876663, |
| "loss": 3.0393, |
| "step": 69700 |
| }, |
| { |
| "epoch": 18.7703488372093, |
| "grad_norm": 0.4141794741153717, |
| "learning_rate": 0.0003186656792783088, |
| "loss": 3.0423, |
| "step": 69750 |
| }, |
| { |
| "epoch": 18.783807062876832, |
| "grad_norm": 0.40530627965927124, |
| "learning_rate": 0.000318463713477851, |
| "loss": 3.0372, |
| "step": 69800 |
| }, |
| { |
| "epoch": 18.79726528854436, |
| "grad_norm": 0.4177812933921814, |
| "learning_rate": 0.00031826174767739327, |
| "loss": 3.0382, |
| "step": 69850 |
| }, |
| { |
| "epoch": 18.810723514211887, |
| "grad_norm": 0.41861647367477417, |
| "learning_rate": 0.0003180597818769355, |
| "loss": 3.0386, |
| "step": 69900 |
| }, |
| { |
| "epoch": 18.824181739879414, |
| "grad_norm": 0.41492319107055664, |
| "learning_rate": 0.0003178578160764777, |
| "loss": 3.0423, |
| "step": 69950 |
| }, |
| { |
| "epoch": 18.83763996554694, |
| "grad_norm": 0.41332483291625977, |
| "learning_rate": 0.0003176558502760199, |
| "loss": 3.0561, |
| "step": 70000 |
| }, |
| { |
| "epoch": 18.83763996554694, |
| "eval_accuracy": 0.3955084520734818, |
| "eval_loss": 3.280143976211548, |
| "eval_runtime": 55.4316, |
| "eval_samples_per_second": 324.941, |
| "eval_steps_per_second": 20.313, |
| "step": 70000 |
| }, |
| { |
| "epoch": 18.85109819121447, |
| "grad_norm": 0.40138715505599976, |
| "learning_rate": 0.0003174538844755621, |
| "loss": 3.0547, |
| "step": 70050 |
| }, |
| { |
| "epoch": 18.864556416882, |
| "grad_norm": 0.451656311750412, |
| "learning_rate": 0.0003172519186751043, |
| "loss": 3.0475, |
| "step": 70100 |
| }, |
| { |
| "epoch": 18.878014642549527, |
| "grad_norm": 0.4022556245326996, |
| "learning_rate": 0.00031704995287464656, |
| "loss": 3.0491, |
| "step": 70150 |
| }, |
| { |
| "epoch": 18.891472868217054, |
| "grad_norm": 0.43887948989868164, |
| "learning_rate": 0.00031684798707418875, |
| "loss": 3.055, |
| "step": 70200 |
| }, |
| { |
| "epoch": 18.90493109388458, |
| "grad_norm": 0.4141213595867157, |
| "learning_rate": 0.00031664602127373095, |
| "loss": 3.0429, |
| "step": 70250 |
| }, |
| { |
| "epoch": 18.91838931955211, |
| "grad_norm": 0.43518728017807007, |
| "learning_rate": 0.00031644405547327315, |
| "loss": 3.0606, |
| "step": 70300 |
| }, |
| { |
| "epoch": 18.93184754521964, |
| "grad_norm": 0.45341476798057556, |
| "learning_rate": 0.00031624208967281534, |
| "loss": 3.0438, |
| "step": 70350 |
| }, |
| { |
| "epoch": 18.945305770887167, |
| "grad_norm": 0.3871983587741852, |
| "learning_rate": 0.0003160401238723576, |
| "loss": 3.0371, |
| "step": 70400 |
| }, |
| { |
| "epoch": 18.958763996554694, |
| "grad_norm": 0.38054120540618896, |
| "learning_rate": 0.0003158381580718998, |
| "loss": 3.0451, |
| "step": 70450 |
| }, |
| { |
| "epoch": 18.97222222222222, |
| "grad_norm": 0.4231681227684021, |
| "learning_rate": 0.000315636192271442, |
| "loss": 3.0478, |
| "step": 70500 |
| }, |
| { |
| "epoch": 18.98568044788975, |
| "grad_norm": 0.4074268043041229, |
| "learning_rate": 0.0003154342264709842, |
| "loss": 3.0456, |
| "step": 70550 |
| }, |
| { |
| "epoch": 18.99913867355728, |
| "grad_norm": 0.4081641137599945, |
| "learning_rate": 0.00031523226067052643, |
| "loss": 3.0445, |
| "step": 70600 |
| }, |
| { |
| "epoch": 19.012381567614124, |
| "grad_norm": 0.40542617440223694, |
| "learning_rate": 0.0003150302948700686, |
| "loss": 2.9738, |
| "step": 70650 |
| }, |
| { |
| "epoch": 19.025839793281655, |
| "grad_norm": 0.4279390275478363, |
| "learning_rate": 0.0003148283290696108, |
| "loss": 2.9535, |
| "step": 70700 |
| }, |
| { |
| "epoch": 19.039298018949182, |
| "grad_norm": 0.4095859229564667, |
| "learning_rate": 0.0003146263632691531, |
| "loss": 2.9532, |
| "step": 70750 |
| }, |
| { |
| "epoch": 19.05275624461671, |
| "grad_norm": 0.39089035987854004, |
| "learning_rate": 0.0003144243974686953, |
| "loss": 2.9654, |
| "step": 70800 |
| }, |
| { |
| "epoch": 19.066214470284237, |
| "grad_norm": 0.42628535628318787, |
| "learning_rate": 0.0003142224316682375, |
| "loss": 2.9724, |
| "step": 70850 |
| }, |
| { |
| "epoch": 19.079672695951764, |
| "grad_norm": 0.42668628692626953, |
| "learning_rate": 0.0003140204658677797, |
| "loss": 2.9648, |
| "step": 70900 |
| }, |
| { |
| "epoch": 19.093130921619295, |
| "grad_norm": 0.44374603033065796, |
| "learning_rate": 0.0003138185000673219, |
| "loss": 2.9687, |
| "step": 70950 |
| }, |
| { |
| "epoch": 19.106589147286822, |
| "grad_norm": 0.4030226767063141, |
| "learning_rate": 0.0003136165342668641, |
| "loss": 2.9788, |
| "step": 71000 |
| }, |
| { |
| "epoch": 19.106589147286822, |
| "eval_accuracy": 0.3948139813727139, |
| "eval_loss": 3.2952170372009277, |
| "eval_runtime": 55.0491, |
| "eval_samples_per_second": 327.199, |
| "eval_steps_per_second": 20.454, |
| "step": 71000 |
| }, |
| { |
| "epoch": 19.12004737295435, |
| "grad_norm": 0.44227829575538635, |
| "learning_rate": 0.00031341456846640636, |
| "loss": 2.9831, |
| "step": 71050 |
| }, |
| { |
| "epoch": 19.133505598621877, |
| "grad_norm": 0.42472559213638306, |
| "learning_rate": 0.00031321260266594855, |
| "loss": 2.9756, |
| "step": 71100 |
| }, |
| { |
| "epoch": 19.146963824289404, |
| "grad_norm": 0.4363155961036682, |
| "learning_rate": 0.00031301063686549075, |
| "loss": 2.9772, |
| "step": 71150 |
| }, |
| { |
| "epoch": 19.160422049956935, |
| "grad_norm": 0.387218177318573, |
| "learning_rate": 0.00031280867106503295, |
| "loss": 2.9856, |
| "step": 71200 |
| }, |
| { |
| "epoch": 19.173880275624462, |
| "grad_norm": 0.43504536151885986, |
| "learning_rate": 0.00031260670526457514, |
| "loss": 2.9884, |
| "step": 71250 |
| }, |
| { |
| "epoch": 19.18733850129199, |
| "grad_norm": 0.4170171618461609, |
| "learning_rate": 0.0003124047394641174, |
| "loss": 2.9937, |
| "step": 71300 |
| }, |
| { |
| "epoch": 19.200796726959517, |
| "grad_norm": 0.4231520891189575, |
| "learning_rate": 0.0003122027736636596, |
| "loss": 2.988, |
| "step": 71350 |
| }, |
| { |
| "epoch": 19.214254952627044, |
| "grad_norm": 0.4176693260669708, |
| "learning_rate": 0.0003120008078632018, |
| "loss": 2.9804, |
| "step": 71400 |
| }, |
| { |
| "epoch": 19.227713178294575, |
| "grad_norm": 0.41988492012023926, |
| "learning_rate": 0.000311798842062744, |
| "loss": 2.9922, |
| "step": 71450 |
| }, |
| { |
| "epoch": 19.241171403962102, |
| "grad_norm": 0.4519420862197876, |
| "learning_rate": 0.0003115968762622862, |
| "loss": 2.9901, |
| "step": 71500 |
| }, |
| { |
| "epoch": 19.25462962962963, |
| "grad_norm": 0.4187524914741516, |
| "learning_rate": 0.00031139491046182843, |
| "loss": 2.993, |
| "step": 71550 |
| }, |
| { |
| "epoch": 19.268087855297157, |
| "grad_norm": 0.41358649730682373, |
| "learning_rate": 0.0003111929446613707, |
| "loss": 2.9979, |
| "step": 71600 |
| }, |
| { |
| "epoch": 19.281546080964684, |
| "grad_norm": 0.44196566939353943, |
| "learning_rate": 0.0003109909788609129, |
| "loss": 3.0035, |
| "step": 71650 |
| }, |
| { |
| "epoch": 19.295004306632215, |
| "grad_norm": 0.44156354665756226, |
| "learning_rate": 0.00031078901306045507, |
| "loss": 2.9925, |
| "step": 71700 |
| }, |
| { |
| "epoch": 19.308462532299743, |
| "grad_norm": 0.41277235746383667, |
| "learning_rate": 0.0003105870472599973, |
| "loss": 2.9963, |
| "step": 71750 |
| }, |
| { |
| "epoch": 19.32192075796727, |
| "grad_norm": 0.4259941577911377, |
| "learning_rate": 0.0003103850814595395, |
| "loss": 3.0017, |
| "step": 71800 |
| }, |
| { |
| "epoch": 19.335378983634797, |
| "grad_norm": 0.4394524097442627, |
| "learning_rate": 0.0003101831156590817, |
| "loss": 3.0053, |
| "step": 71850 |
| }, |
| { |
| "epoch": 19.348837209302324, |
| "grad_norm": 0.40097951889038086, |
| "learning_rate": 0.0003099811498586239, |
| "loss": 3.0032, |
| "step": 71900 |
| }, |
| { |
| "epoch": 19.362295434969855, |
| "grad_norm": 0.4460085928440094, |
| "learning_rate": 0.0003097791840581661, |
| "loss": 3.005, |
| "step": 71950 |
| }, |
| { |
| "epoch": 19.375753660637383, |
| "grad_norm": 0.4067431092262268, |
| "learning_rate": 0.00030957721825770836, |
| "loss": 3.0036, |
| "step": 72000 |
| }, |
| { |
| "epoch": 19.375753660637383, |
| "eval_accuracy": 0.39494533542109134, |
| "eval_loss": 3.2903969287872314, |
| "eval_runtime": 55.4344, |
| "eval_samples_per_second": 324.925, |
| "eval_steps_per_second": 20.312, |
| "step": 72000 |
| }, |
| { |
| "epoch": 19.38921188630491, |
| "grad_norm": 0.42211583256721497, |
| "learning_rate": 0.00030937525245725055, |
| "loss": 3.0002, |
| "step": 72050 |
| }, |
| { |
| "epoch": 19.402670111972437, |
| "grad_norm": 0.4175995886325836, |
| "learning_rate": 0.00030917328665679275, |
| "loss": 3.0021, |
| "step": 72100 |
| }, |
| { |
| "epoch": 19.416128337639964, |
| "grad_norm": 0.4194367229938507, |
| "learning_rate": 0.00030897132085633494, |
| "loss": 3.0103, |
| "step": 72150 |
| }, |
| { |
| "epoch": 19.429586563307495, |
| "grad_norm": 0.42266571521759033, |
| "learning_rate": 0.00030876935505587714, |
| "loss": 3.0068, |
| "step": 72200 |
| }, |
| { |
| "epoch": 19.443044788975023, |
| "grad_norm": 0.41117116808891296, |
| "learning_rate": 0.0003085673892554194, |
| "loss": 3.0042, |
| "step": 72250 |
| }, |
| { |
| "epoch": 19.45650301464255, |
| "grad_norm": 0.43733495473861694, |
| "learning_rate": 0.0003083654234549616, |
| "loss": 3.0077, |
| "step": 72300 |
| }, |
| { |
| "epoch": 19.469961240310077, |
| "grad_norm": 0.4256683588027954, |
| "learning_rate": 0.0003081634576545038, |
| "loss": 3.0079, |
| "step": 72350 |
| }, |
| { |
| "epoch": 19.483419465977605, |
| "grad_norm": 0.4018738865852356, |
| "learning_rate": 0.000307961491854046, |
| "loss": 3.0079, |
| "step": 72400 |
| }, |
| { |
| "epoch": 19.496877691645132, |
| "grad_norm": 0.41685667634010315, |
| "learning_rate": 0.00030775952605358823, |
| "loss": 3.0111, |
| "step": 72450 |
| }, |
| { |
| "epoch": 19.510335917312663, |
| "grad_norm": 0.3987107574939728, |
| "learning_rate": 0.0003075575602531305, |
| "loss": 3.0036, |
| "step": 72500 |
| }, |
| { |
| "epoch": 19.52379414298019, |
| "grad_norm": 0.45025312900543213, |
| "learning_rate": 0.0003073555944526727, |
| "loss": 3.0131, |
| "step": 72550 |
| }, |
| { |
| "epoch": 19.537252368647717, |
| "grad_norm": 0.43616893887519836, |
| "learning_rate": 0.00030715362865221487, |
| "loss": 3.014, |
| "step": 72600 |
| }, |
| { |
| "epoch": 19.550710594315245, |
| "grad_norm": 0.39793211221694946, |
| "learning_rate": 0.0003069516628517571, |
| "loss": 3.0105, |
| "step": 72650 |
| }, |
| { |
| "epoch": 19.564168819982772, |
| "grad_norm": 0.438885360956192, |
| "learning_rate": 0.0003067496970512993, |
| "loss": 3.017, |
| "step": 72700 |
| }, |
| { |
| "epoch": 19.577627045650303, |
| "grad_norm": 0.4122118055820465, |
| "learning_rate": 0.0003065477312508415, |
| "loss": 3.0191, |
| "step": 72750 |
| }, |
| { |
| "epoch": 19.59108527131783, |
| "grad_norm": 0.42040184140205383, |
| "learning_rate": 0.0003063457654503837, |
| "loss": 3.0237, |
| "step": 72800 |
| }, |
| { |
| "epoch": 19.604543496985357, |
| "grad_norm": 0.4153655767440796, |
| "learning_rate": 0.0003061437996499259, |
| "loss": 3.0278, |
| "step": 72850 |
| }, |
| { |
| "epoch": 19.618001722652885, |
| "grad_norm": 0.4188932478427887, |
| "learning_rate": 0.00030594183384946816, |
| "loss": 3.0302, |
| "step": 72900 |
| }, |
| { |
| "epoch": 19.631459948320412, |
| "grad_norm": 0.41482555866241455, |
| "learning_rate": 0.00030573986804901035, |
| "loss": 3.0148, |
| "step": 72950 |
| }, |
| { |
| "epoch": 19.644918173987943, |
| "grad_norm": 0.3990839421749115, |
| "learning_rate": 0.00030553790224855255, |
| "loss": 3.0251, |
| "step": 73000 |
| }, |
| { |
| "epoch": 19.644918173987943, |
| "eval_accuracy": 0.3955169265282158, |
| "eval_loss": 3.2822790145874023, |
| "eval_runtime": 55.3891, |
| "eval_samples_per_second": 325.19, |
| "eval_steps_per_second": 20.329, |
| "step": 73000 |
| }, |
| { |
| "epoch": 19.65837639965547, |
| "grad_norm": 0.4105415940284729, |
| "learning_rate": 0.00030533593644809474, |
| "loss": 3.035, |
| "step": 73050 |
| }, |
| { |
| "epoch": 19.671834625322997, |
| "grad_norm": 0.40522801876068115, |
| "learning_rate": 0.00030513397064763694, |
| "loss": 3.0141, |
| "step": 73100 |
| }, |
| { |
| "epoch": 19.685292850990525, |
| "grad_norm": 0.43990591168403625, |
| "learning_rate": 0.0003049320048471792, |
| "loss": 3.0327, |
| "step": 73150 |
| }, |
| { |
| "epoch": 19.698751076658052, |
| "grad_norm": 0.41591787338256836, |
| "learning_rate": 0.0003047300390467214, |
| "loss": 3.0286, |
| "step": 73200 |
| }, |
| { |
| "epoch": 19.712209302325583, |
| "grad_norm": 0.4017585217952728, |
| "learning_rate": 0.0003045280732462636, |
| "loss": 3.0219, |
| "step": 73250 |
| }, |
| { |
| "epoch": 19.72566752799311, |
| "grad_norm": 0.42853009700775146, |
| "learning_rate": 0.0003043261074458058, |
| "loss": 3.0272, |
| "step": 73300 |
| }, |
| { |
| "epoch": 19.739125753660637, |
| "grad_norm": 0.43009036779403687, |
| "learning_rate": 0.000304124141645348, |
| "loss": 3.0274, |
| "step": 73350 |
| }, |
| { |
| "epoch": 19.752583979328165, |
| "grad_norm": 0.383428692817688, |
| "learning_rate": 0.0003039221758448903, |
| "loss": 3.0356, |
| "step": 73400 |
| }, |
| { |
| "epoch": 19.766042204995692, |
| "grad_norm": 0.42961567640304565, |
| "learning_rate": 0.0003037202100444325, |
| "loss": 3.0221, |
| "step": 73450 |
| }, |
| { |
| "epoch": 19.779500430663223, |
| "grad_norm": 0.4073396623134613, |
| "learning_rate": 0.00030351824424397467, |
| "loss": 3.0323, |
| "step": 73500 |
| }, |
| { |
| "epoch": 19.79295865633075, |
| "grad_norm": 0.4351046681404114, |
| "learning_rate": 0.00030331627844351687, |
| "loss": 3.0328, |
| "step": 73550 |
| }, |
| { |
| "epoch": 19.806416881998278, |
| "grad_norm": 0.42110610008239746, |
| "learning_rate": 0.0003031143126430591, |
| "loss": 3.0431, |
| "step": 73600 |
| }, |
| { |
| "epoch": 19.819875107665805, |
| "grad_norm": 0.39867541193962097, |
| "learning_rate": 0.0003029123468426013, |
| "loss": 3.0301, |
| "step": 73650 |
| }, |
| { |
| "epoch": 19.833333333333332, |
| "grad_norm": 0.42126715183258057, |
| "learning_rate": 0.0003027103810421435, |
| "loss": 3.0318, |
| "step": 73700 |
| }, |
| { |
| "epoch": 19.846791559000863, |
| "grad_norm": 0.4099518954753876, |
| "learning_rate": 0.0003025084152416857, |
| "loss": 3.0313, |
| "step": 73750 |
| }, |
| { |
| "epoch": 19.86024978466839, |
| "grad_norm": 0.43412184715270996, |
| "learning_rate": 0.0003023064494412279, |
| "loss": 3.0272, |
| "step": 73800 |
| }, |
| { |
| "epoch": 19.873708010335918, |
| "grad_norm": 0.3960249125957489, |
| "learning_rate": 0.00030210448364077015, |
| "loss": 3.0411, |
| "step": 73850 |
| }, |
| { |
| "epoch": 19.887166236003445, |
| "grad_norm": 0.4127246141433716, |
| "learning_rate": 0.00030190251784031235, |
| "loss": 3.0356, |
| "step": 73900 |
| }, |
| { |
| "epoch": 19.900624461670972, |
| "grad_norm": 0.4184736907482147, |
| "learning_rate": 0.00030170055203985454, |
| "loss": 3.0384, |
| "step": 73950 |
| }, |
| { |
| "epoch": 19.9140826873385, |
| "grad_norm": 0.4029821753501892, |
| "learning_rate": 0.00030149858623939674, |
| "loss": 3.0421, |
| "step": 74000 |
| }, |
| { |
| "epoch": 19.9140826873385, |
| "eval_accuracy": 0.3958698115150891, |
| "eval_loss": 3.27805757522583, |
| "eval_runtime": 55.2649, |
| "eval_samples_per_second": 325.921, |
| "eval_steps_per_second": 20.375, |
| "step": 74000 |
| }, |
| { |
| "epoch": 19.92754091300603, |
| "grad_norm": 0.41179102659225464, |
| "learning_rate": 0.00030129662043893894, |
| "loss": 3.0358, |
| "step": 74050 |
| }, |
| { |
| "epoch": 19.940999138673558, |
| "grad_norm": 0.4140405058860779, |
| "learning_rate": 0.0003010946546384812, |
| "loss": 3.0384, |
| "step": 74100 |
| }, |
| { |
| "epoch": 19.954457364341085, |
| "grad_norm": 0.4397096335887909, |
| "learning_rate": 0.0003008926888380234, |
| "loss": 3.0453, |
| "step": 74150 |
| }, |
| { |
| "epoch": 19.967915590008612, |
| "grad_norm": 0.410493940114975, |
| "learning_rate": 0.0003006907230375656, |
| "loss": 3.0347, |
| "step": 74200 |
| }, |
| { |
| "epoch": 19.98137381567614, |
| "grad_norm": 0.4063168466091156, |
| "learning_rate": 0.0003004887572371078, |
| "loss": 3.0357, |
| "step": 74250 |
| }, |
| { |
| "epoch": 19.99483204134367, |
| "grad_norm": 0.4114035964012146, |
| "learning_rate": 0.0003002867914366501, |
| "loss": 3.0458, |
| "step": 74300 |
| }, |
| { |
| "epoch": 20.00807493540052, |
| "grad_norm": 0.4396079480648041, |
| "learning_rate": 0.0003000848256361923, |
| "loss": 2.9857, |
| "step": 74350 |
| }, |
| { |
| "epoch": 20.021533161068046, |
| "grad_norm": 0.41727790236473083, |
| "learning_rate": 0.00029988285983573447, |
| "loss": 2.9466, |
| "step": 74400 |
| }, |
| { |
| "epoch": 20.034991386735573, |
| "grad_norm": 0.41952675580978394, |
| "learning_rate": 0.00029968089403527667, |
| "loss": 2.9494, |
| "step": 74450 |
| }, |
| { |
| "epoch": 20.0484496124031, |
| "grad_norm": 0.48774582147598267, |
| "learning_rate": 0.0002994789282348189, |
| "loss": 2.9569, |
| "step": 74500 |
| }, |
| { |
| "epoch": 20.061907838070628, |
| "grad_norm": 0.41048768162727356, |
| "learning_rate": 0.0002992769624343611, |
| "loss": 2.9548, |
| "step": 74550 |
| }, |
| { |
| "epoch": 20.075366063738155, |
| "grad_norm": 0.4365278482437134, |
| "learning_rate": 0.0002990749966339033, |
| "loss": 2.9549, |
| "step": 74600 |
| }, |
| { |
| "epoch": 20.088824289405686, |
| "grad_norm": 0.4222956597805023, |
| "learning_rate": 0.0002988730308334455, |
| "loss": 2.9585, |
| "step": 74650 |
| }, |
| { |
| "epoch": 20.102282515073213, |
| "grad_norm": 0.40463319420814514, |
| "learning_rate": 0.0002986710650329877, |
| "loss": 2.9616, |
| "step": 74700 |
| }, |
| { |
| "epoch": 20.11574074074074, |
| "grad_norm": 0.4185885488986969, |
| "learning_rate": 0.00029846909923252995, |
| "loss": 2.9653, |
| "step": 74750 |
| }, |
| { |
| "epoch": 20.129198966408268, |
| "grad_norm": 0.4026871621608734, |
| "learning_rate": 0.00029826713343207215, |
| "loss": 2.9658, |
| "step": 74800 |
| }, |
| { |
| "epoch": 20.142657192075795, |
| "grad_norm": 0.43527159094810486, |
| "learning_rate": 0.00029806516763161435, |
| "loss": 2.9718, |
| "step": 74850 |
| }, |
| { |
| "epoch": 20.156115417743326, |
| "grad_norm": 0.4299376308917999, |
| "learning_rate": 0.00029786320183115654, |
| "loss": 2.9793, |
| "step": 74900 |
| }, |
| { |
| "epoch": 20.169573643410853, |
| "grad_norm": 0.4304753541946411, |
| "learning_rate": 0.0002976612360306988, |
| "loss": 2.978, |
| "step": 74950 |
| }, |
| { |
| "epoch": 20.18303186907838, |
| "grad_norm": 0.4182604253292084, |
| "learning_rate": 0.000297459270230241, |
| "loss": 2.9677, |
| "step": 75000 |
| }, |
| { |
| "epoch": 20.18303186907838, |
| "eval_accuracy": 0.39509135679496993, |
| "eval_loss": 3.2936933040618896, |
| "eval_runtime": 55.0997, |
| "eval_samples_per_second": 326.898, |
| "eval_steps_per_second": 20.436, |
| "step": 75000 |
| }, |
| { |
| "epoch": 20.196490094745908, |
| "grad_norm": 0.4104526937007904, |
| "learning_rate": 0.0002972573044297832, |
| "loss": 2.9908, |
| "step": 75050 |
| }, |
| { |
| "epoch": 20.209948320413435, |
| "grad_norm": 0.4046146273612976, |
| "learning_rate": 0.00029705533862932543, |
| "loss": 2.9804, |
| "step": 75100 |
| }, |
| { |
| "epoch": 20.223406546080966, |
| "grad_norm": 0.4415130615234375, |
| "learning_rate": 0.00029685337282886763, |
| "loss": 2.9857, |
| "step": 75150 |
| }, |
| { |
| "epoch": 20.236864771748493, |
| "grad_norm": 0.39041709899902344, |
| "learning_rate": 0.0002966514070284098, |
| "loss": 2.975, |
| "step": 75200 |
| }, |
| { |
| "epoch": 20.25032299741602, |
| "grad_norm": 0.4639005959033966, |
| "learning_rate": 0.000296449441227952, |
| "loss": 2.9851, |
| "step": 75250 |
| }, |
| { |
| "epoch": 20.263781223083548, |
| "grad_norm": 0.39461809396743774, |
| "learning_rate": 0.0002962474754274942, |
| "loss": 2.9904, |
| "step": 75300 |
| }, |
| { |
| "epoch": 20.277239448751075, |
| "grad_norm": 0.43576109409332275, |
| "learning_rate": 0.00029604550962703647, |
| "loss": 2.9891, |
| "step": 75350 |
| }, |
| { |
| "epoch": 20.290697674418606, |
| "grad_norm": 0.4303674101829529, |
| "learning_rate": 0.00029584354382657867, |
| "loss": 2.9851, |
| "step": 75400 |
| }, |
| { |
| "epoch": 20.304155900086133, |
| "grad_norm": 0.49273186922073364, |
| "learning_rate": 0.0002956415780261209, |
| "loss": 2.9925, |
| "step": 75450 |
| }, |
| { |
| "epoch": 20.31761412575366, |
| "grad_norm": 0.420663982629776, |
| "learning_rate": 0.0002954396122256631, |
| "loss": 2.9861, |
| "step": 75500 |
| }, |
| { |
| "epoch": 20.331072351421188, |
| "grad_norm": 0.436514675617218, |
| "learning_rate": 0.0002952376464252053, |
| "loss": 2.9873, |
| "step": 75550 |
| }, |
| { |
| "epoch": 20.344530577088715, |
| "grad_norm": 0.43746113777160645, |
| "learning_rate": 0.0002950356806247475, |
| "loss": 3.0009, |
| "step": 75600 |
| }, |
| { |
| "epoch": 20.357988802756246, |
| "grad_norm": 0.4135793447494507, |
| "learning_rate": 0.0002948337148242897, |
| "loss": 2.9983, |
| "step": 75650 |
| }, |
| { |
| "epoch": 20.371447028423773, |
| "grad_norm": 0.4526557922363281, |
| "learning_rate": 0.00029463174902383195, |
| "loss": 2.9872, |
| "step": 75700 |
| }, |
| { |
| "epoch": 20.3849052540913, |
| "grad_norm": 0.4632878303527832, |
| "learning_rate": 0.00029442978322337415, |
| "loss": 2.9979, |
| "step": 75750 |
| }, |
| { |
| "epoch": 20.398363479758828, |
| "grad_norm": 0.40115952491760254, |
| "learning_rate": 0.00029422781742291634, |
| "loss": 2.9991, |
| "step": 75800 |
| }, |
| { |
| "epoch": 20.411821705426355, |
| "grad_norm": 0.4160281717777252, |
| "learning_rate": 0.0002940258516224586, |
| "loss": 2.9976, |
| "step": 75850 |
| }, |
| { |
| "epoch": 20.425279931093886, |
| "grad_norm": 0.40379562973976135, |
| "learning_rate": 0.0002938238858220008, |
| "loss": 3.006, |
| "step": 75900 |
| }, |
| { |
| "epoch": 20.438738156761413, |
| "grad_norm": 0.45283401012420654, |
| "learning_rate": 0.000293621920021543, |
| "loss": 3.0042, |
| "step": 75950 |
| }, |
| { |
| "epoch": 20.45219638242894, |
| "grad_norm": 0.421700119972229, |
| "learning_rate": 0.0002934199542210852, |
| "loss": 2.9962, |
| "step": 76000 |
| }, |
| { |
| "epoch": 20.45219638242894, |
| "eval_accuracy": 0.3956393715344369, |
| "eval_loss": 3.287869691848755, |
| "eval_runtime": 55.497, |
| "eval_samples_per_second": 324.558, |
| "eval_steps_per_second": 20.289, |
| "step": 76000 |
| }, |
| { |
| "epoch": 20.465654608096468, |
| "grad_norm": 0.44577735662460327, |
| "learning_rate": 0.00029321798842062743, |
| "loss": 3.008, |
| "step": 76050 |
| }, |
| { |
| "epoch": 20.479112833763995, |
| "grad_norm": 0.4258463680744171, |
| "learning_rate": 0.00029301602262016963, |
| "loss": 3.0077, |
| "step": 76100 |
| }, |
| { |
| "epoch": 20.492571059431526, |
| "grad_norm": 0.4366303086280823, |
| "learning_rate": 0.0002928140568197118, |
| "loss": 3.0012, |
| "step": 76150 |
| }, |
| { |
| "epoch": 20.506029285099054, |
| "grad_norm": 0.4077489674091339, |
| "learning_rate": 0.000292612091019254, |
| "loss": 3.001, |
| "step": 76200 |
| }, |
| { |
| "epoch": 20.51948751076658, |
| "grad_norm": 0.4265643358230591, |
| "learning_rate": 0.0002924101252187962, |
| "loss": 3.0123, |
| "step": 76250 |
| }, |
| { |
| "epoch": 20.532945736434108, |
| "grad_norm": 0.42429250478744507, |
| "learning_rate": 0.00029220815941833847, |
| "loss": 3.0062, |
| "step": 76300 |
| }, |
| { |
| "epoch": 20.546403962101635, |
| "grad_norm": 0.41154927015304565, |
| "learning_rate": 0.0002920061936178807, |
| "loss": 3.0039, |
| "step": 76350 |
| }, |
| { |
| "epoch": 20.559862187769163, |
| "grad_norm": 0.41237393021583557, |
| "learning_rate": 0.0002918042278174229, |
| "loss": 3.0127, |
| "step": 76400 |
| }, |
| { |
| "epoch": 20.573320413436694, |
| "grad_norm": 0.42659783363342285, |
| "learning_rate": 0.0002916022620169651, |
| "loss": 3.0032, |
| "step": 76450 |
| }, |
| { |
| "epoch": 20.58677863910422, |
| "grad_norm": 0.4047008454799652, |
| "learning_rate": 0.0002914002962165073, |
| "loss": 3.0177, |
| "step": 76500 |
| }, |
| { |
| "epoch": 20.600236864771748, |
| "grad_norm": 0.39722734689712524, |
| "learning_rate": 0.0002911983304160495, |
| "loss": 3.005, |
| "step": 76550 |
| }, |
| { |
| "epoch": 20.613695090439276, |
| "grad_norm": 0.4279921054840088, |
| "learning_rate": 0.00029099636461559175, |
| "loss": 2.9999, |
| "step": 76600 |
| }, |
| { |
| "epoch": 20.627153316106803, |
| "grad_norm": 0.42196568846702576, |
| "learning_rate": 0.00029079439881513395, |
| "loss": 3.0085, |
| "step": 76650 |
| }, |
| { |
| "epoch": 20.640611541774334, |
| "grad_norm": 0.42284271121025085, |
| "learning_rate": 0.00029059243301467614, |
| "loss": 3.0077, |
| "step": 76700 |
| }, |
| { |
| "epoch": 20.65406976744186, |
| "grad_norm": 0.4308563768863678, |
| "learning_rate": 0.0002903904672142184, |
| "loss": 3.0001, |
| "step": 76750 |
| }, |
| { |
| "epoch": 20.66752799310939, |
| "grad_norm": 0.399814248085022, |
| "learning_rate": 0.0002901885014137606, |
| "loss": 3.0182, |
| "step": 76800 |
| }, |
| { |
| "epoch": 20.680986218776916, |
| "grad_norm": 0.4300035834312439, |
| "learning_rate": 0.0002899865356133028, |
| "loss": 3.0036, |
| "step": 76850 |
| }, |
| { |
| "epoch": 20.694444444444443, |
| "grad_norm": 0.4693319499492645, |
| "learning_rate": 0.000289784569812845, |
| "loss": 3.0158, |
| "step": 76900 |
| }, |
| { |
| "epoch": 20.707902670111974, |
| "grad_norm": 0.43014174699783325, |
| "learning_rate": 0.00028958260401238723, |
| "loss": 3.0057, |
| "step": 76950 |
| }, |
| { |
| "epoch": 20.7213608957795, |
| "grad_norm": 0.43209654092788696, |
| "learning_rate": 0.00028938063821192943, |
| "loss": 3.028, |
| "step": 77000 |
| }, |
| { |
| "epoch": 20.7213608957795, |
| "eval_accuracy": 0.3957726812262145, |
| "eval_loss": 3.284038543701172, |
| "eval_runtime": 55.0721, |
| "eval_samples_per_second": 327.062, |
| "eval_steps_per_second": 20.446, |
| "step": 77000 |
| }, |
| { |
| "epoch": 20.73481912144703, |
| "grad_norm": 0.4174690842628479, |
| "learning_rate": 0.0002891786724114716, |
| "loss": 3.0144, |
| "step": 77050 |
| }, |
| { |
| "epoch": 20.748277347114556, |
| "grad_norm": 0.42950111627578735, |
| "learning_rate": 0.0002889767066110138, |
| "loss": 3.01, |
| "step": 77100 |
| }, |
| { |
| "epoch": 20.761735572782083, |
| "grad_norm": 0.4183488190174103, |
| "learning_rate": 0.000288774740810556, |
| "loss": 3.0134, |
| "step": 77150 |
| }, |
| { |
| "epoch": 20.775193798449614, |
| "grad_norm": 0.43711057305336, |
| "learning_rate": 0.00028857277501009827, |
| "loss": 3.0213, |
| "step": 77200 |
| }, |
| { |
| "epoch": 20.78865202411714, |
| "grad_norm": 0.46745753288269043, |
| "learning_rate": 0.00028837080920964046, |
| "loss": 3.0255, |
| "step": 77250 |
| }, |
| { |
| "epoch": 20.80211024978467, |
| "grad_norm": 0.4243837594985962, |
| "learning_rate": 0.0002881688434091827, |
| "loss": 3.0196, |
| "step": 77300 |
| }, |
| { |
| "epoch": 20.815568475452196, |
| "grad_norm": 0.4431048631668091, |
| "learning_rate": 0.0002879668776087249, |
| "loss": 3.0138, |
| "step": 77350 |
| }, |
| { |
| "epoch": 20.829026701119723, |
| "grad_norm": 0.42100629210472107, |
| "learning_rate": 0.0002877649118082671, |
| "loss": 3.0169, |
| "step": 77400 |
| }, |
| { |
| "epoch": 20.842484926787254, |
| "grad_norm": 0.40253376960754395, |
| "learning_rate": 0.0002875629460078093, |
| "loss": 3.0119, |
| "step": 77450 |
| }, |
| { |
| "epoch": 20.85594315245478, |
| "grad_norm": 0.4260178804397583, |
| "learning_rate": 0.0002873609802073515, |
| "loss": 3.0266, |
| "step": 77500 |
| }, |
| { |
| "epoch": 20.86940137812231, |
| "grad_norm": 0.4209248423576355, |
| "learning_rate": 0.00028715901440689375, |
| "loss": 3.0247, |
| "step": 77550 |
| }, |
| { |
| "epoch": 20.882859603789836, |
| "grad_norm": 0.4190898537635803, |
| "learning_rate": 0.00028695704860643594, |
| "loss": 3.0384, |
| "step": 77600 |
| }, |
| { |
| "epoch": 20.896317829457363, |
| "grad_norm": 0.42939960956573486, |
| "learning_rate": 0.0002867550828059782, |
| "loss": 3.0209, |
| "step": 77650 |
| }, |
| { |
| "epoch": 20.909776055124894, |
| "grad_norm": 0.4232606589794159, |
| "learning_rate": 0.0002865531170055204, |
| "loss": 3.0296, |
| "step": 77700 |
| }, |
| { |
| "epoch": 20.92323428079242, |
| "grad_norm": 0.4421567916870117, |
| "learning_rate": 0.0002863511512050626, |
| "loss": 3.0192, |
| "step": 77750 |
| }, |
| { |
| "epoch": 20.93669250645995, |
| "grad_norm": 0.4117579162120819, |
| "learning_rate": 0.0002861491854046048, |
| "loss": 3.0199, |
| "step": 77800 |
| }, |
| { |
| "epoch": 20.950150732127476, |
| "grad_norm": 0.43439605832099915, |
| "learning_rate": 0.000285947219604147, |
| "loss": 3.0265, |
| "step": 77850 |
| }, |
| { |
| "epoch": 20.963608957795003, |
| "grad_norm": 0.44588276743888855, |
| "learning_rate": 0.00028574525380368923, |
| "loss": 3.0231, |
| "step": 77900 |
| }, |
| { |
| "epoch": 20.977067183462534, |
| "grad_norm": 0.43251749873161316, |
| "learning_rate": 0.0002855432880032314, |
| "loss": 3.0243, |
| "step": 77950 |
| }, |
| { |
| "epoch": 20.99052540913006, |
| "grad_norm": 0.4446578323841095, |
| "learning_rate": 0.0002853413222027736, |
| "loss": 3.015, |
| "step": 78000 |
| }, |
| { |
| "epoch": 20.99052540913006, |
| "eval_accuracy": 0.39639381529947637, |
| "eval_loss": 3.2760121822357178, |
| "eval_runtime": 55.1938, |
| "eval_samples_per_second": 326.341, |
| "eval_steps_per_second": 20.401, |
| "step": 78000 |
| }, |
| { |
| "epoch": 21.00376830318691, |
| "grad_norm": 0.4257495403289795, |
| "learning_rate": 0.00028513935640231587, |
| "loss": 3.0018, |
| "step": 78050 |
| }, |
| { |
| "epoch": 21.017226528854437, |
| "grad_norm": 0.4258408844470978, |
| "learning_rate": 0.00028493739060185807, |
| "loss": 2.935, |
| "step": 78100 |
| }, |
| { |
| "epoch": 21.030684754521964, |
| "grad_norm": 0.4279120862483978, |
| "learning_rate": 0.00028473542480140026, |
| "loss": 2.9377, |
| "step": 78150 |
| }, |
| { |
| "epoch": 21.04414298018949, |
| "grad_norm": 0.40601927042007446, |
| "learning_rate": 0.0002845334590009425, |
| "loss": 2.9336, |
| "step": 78200 |
| }, |
| { |
| "epoch": 21.05760120585702, |
| "grad_norm": 0.46881160140037537, |
| "learning_rate": 0.0002843314932004847, |
| "loss": 2.948, |
| "step": 78250 |
| }, |
| { |
| "epoch": 21.07105943152455, |
| "grad_norm": 0.4635114371776581, |
| "learning_rate": 0.0002841295274000269, |
| "loss": 2.9619, |
| "step": 78300 |
| }, |
| { |
| "epoch": 21.084517657192077, |
| "grad_norm": 0.42312192916870117, |
| "learning_rate": 0.0002839275615995691, |
| "loss": 2.9504, |
| "step": 78350 |
| }, |
| { |
| "epoch": 21.097975882859604, |
| "grad_norm": 0.43240320682525635, |
| "learning_rate": 0.0002837255957991113, |
| "loss": 2.9561, |
| "step": 78400 |
| }, |
| { |
| "epoch": 21.11143410852713, |
| "grad_norm": 0.43588295578956604, |
| "learning_rate": 0.00028352362999865355, |
| "loss": 2.9655, |
| "step": 78450 |
| }, |
| { |
| "epoch": 21.12489233419466, |
| "grad_norm": 0.4394192397594452, |
| "learning_rate": 0.00028332166419819574, |
| "loss": 2.961, |
| "step": 78500 |
| }, |
| { |
| "epoch": 21.13835055986219, |
| "grad_norm": 0.4436136484146118, |
| "learning_rate": 0.000283119698397738, |
| "loss": 2.9606, |
| "step": 78550 |
| }, |
| { |
| "epoch": 21.151808785529717, |
| "grad_norm": 0.4162307679653168, |
| "learning_rate": 0.0002829177325972802, |
| "loss": 2.9571, |
| "step": 78600 |
| }, |
| { |
| "epoch": 21.165267011197244, |
| "grad_norm": 0.44429853558540344, |
| "learning_rate": 0.0002827157667968224, |
| "loss": 2.9653, |
| "step": 78650 |
| }, |
| { |
| "epoch": 21.17872523686477, |
| "grad_norm": 0.4423364996910095, |
| "learning_rate": 0.0002825138009963646, |
| "loss": 2.9711, |
| "step": 78700 |
| }, |
| { |
| "epoch": 21.1921834625323, |
| "grad_norm": 0.45903778076171875, |
| "learning_rate": 0.0002823118351959068, |
| "loss": 2.9725, |
| "step": 78750 |
| }, |
| { |
| "epoch": 21.205641688199826, |
| "grad_norm": 0.4161292016506195, |
| "learning_rate": 0.00028210986939544903, |
| "loss": 2.9722, |
| "step": 78800 |
| }, |
| { |
| "epoch": 21.219099913867357, |
| "grad_norm": 0.41161108016967773, |
| "learning_rate": 0.0002819079035949912, |
| "loss": 2.9769, |
| "step": 78850 |
| }, |
| { |
| "epoch": 21.232558139534884, |
| "grad_norm": 0.4620738625526428, |
| "learning_rate": 0.0002817059377945334, |
| "loss": 2.9643, |
| "step": 78900 |
| }, |
| { |
| "epoch": 21.24601636520241, |
| "grad_norm": 0.42669835686683655, |
| "learning_rate": 0.00028150397199407567, |
| "loss": 2.9732, |
| "step": 78950 |
| }, |
| { |
| "epoch": 21.25947459086994, |
| "grad_norm": 0.4427119195461273, |
| "learning_rate": 0.00028130200619361787, |
| "loss": 2.9675, |
| "step": 79000 |
| }, |
| { |
| "epoch": 21.25947459086994, |
| "eval_accuracy": 0.39546597115295606, |
| "eval_loss": 3.294128894805908, |
| "eval_runtime": 54.6202, |
| "eval_samples_per_second": 329.768, |
| "eval_steps_per_second": 20.615, |
| "step": 79000 |
| }, |
| { |
| "epoch": 21.272932816537466, |
| "grad_norm": 0.4521730840206146, |
| "learning_rate": 0.00028110004039316006, |
| "loss": 2.9685, |
| "step": 79050 |
| }, |
| { |
| "epoch": 21.286391042204997, |
| "grad_norm": 0.4480832517147064, |
| "learning_rate": 0.00028089807459270226, |
| "loss": 2.9697, |
| "step": 79100 |
| }, |
| { |
| "epoch": 21.299849267872524, |
| "grad_norm": 0.42608147859573364, |
| "learning_rate": 0.0002806961087922445, |
| "loss": 2.9664, |
| "step": 79150 |
| }, |
| { |
| "epoch": 21.31330749354005, |
| "grad_norm": 0.42942923307418823, |
| "learning_rate": 0.0002804941429917867, |
| "loss": 2.9785, |
| "step": 79200 |
| }, |
| { |
| "epoch": 21.32676571920758, |
| "grad_norm": 0.42538803815841675, |
| "learning_rate": 0.0002802921771913289, |
| "loss": 2.9762, |
| "step": 79250 |
| }, |
| { |
| "epoch": 21.340223944875106, |
| "grad_norm": 0.42696043848991394, |
| "learning_rate": 0.0002800902113908711, |
| "loss": 2.9846, |
| "step": 79300 |
| }, |
| { |
| "epoch": 21.353682170542637, |
| "grad_norm": 0.4404904544353485, |
| "learning_rate": 0.0002798882455904133, |
| "loss": 2.9768, |
| "step": 79350 |
| }, |
| { |
| "epoch": 21.367140396210164, |
| "grad_norm": 0.46387866139411926, |
| "learning_rate": 0.00027968627978995555, |
| "loss": 2.9766, |
| "step": 79400 |
| }, |
| { |
| "epoch": 21.38059862187769, |
| "grad_norm": 0.427202433347702, |
| "learning_rate": 0.00027948431398949774, |
| "loss": 2.9877, |
| "step": 79450 |
| }, |
| { |
| "epoch": 21.39405684754522, |
| "grad_norm": 0.41624459624290466, |
| "learning_rate": 0.00027928234818904, |
| "loss": 2.9882, |
| "step": 79500 |
| }, |
| { |
| "epoch": 21.407515073212746, |
| "grad_norm": 0.45006537437438965, |
| "learning_rate": 0.0002790803823885822, |
| "loss": 2.988, |
| "step": 79550 |
| }, |
| { |
| "epoch": 21.420973298880277, |
| "grad_norm": 0.42271533608436584, |
| "learning_rate": 0.0002788784165881244, |
| "loss": 2.9872, |
| "step": 79600 |
| }, |
| { |
| "epoch": 21.434431524547804, |
| "grad_norm": 0.4559018015861511, |
| "learning_rate": 0.0002786764507876666, |
| "loss": 2.9797, |
| "step": 79650 |
| }, |
| { |
| "epoch": 21.44788975021533, |
| "grad_norm": 0.42914602160453796, |
| "learning_rate": 0.0002784744849872088, |
| "loss": 2.9898, |
| "step": 79700 |
| }, |
| { |
| "epoch": 21.46134797588286, |
| "grad_norm": 0.40934568643569946, |
| "learning_rate": 0.000278272519186751, |
| "loss": 2.9896, |
| "step": 79750 |
| }, |
| { |
| "epoch": 21.474806201550386, |
| "grad_norm": 0.4386095106601715, |
| "learning_rate": 0.0002780705533862932, |
| "loss": 2.9904, |
| "step": 79800 |
| }, |
| { |
| "epoch": 21.488264427217917, |
| "grad_norm": 0.42801961302757263, |
| "learning_rate": 0.0002778685875858355, |
| "loss": 2.981, |
| "step": 79850 |
| }, |
| { |
| "epoch": 21.501722652885444, |
| "grad_norm": 0.42655736207962036, |
| "learning_rate": 0.00027766662178537767, |
| "loss": 2.9964, |
| "step": 79900 |
| }, |
| { |
| "epoch": 21.51518087855297, |
| "grad_norm": 0.43257346749305725, |
| "learning_rate": 0.00027746465598491987, |
| "loss": 2.9868, |
| "step": 79950 |
| }, |
| { |
| "epoch": 21.5286391042205, |
| "grad_norm": 0.4436582922935486, |
| "learning_rate": 0.00027726269018446206, |
| "loss": 2.9881, |
| "step": 80000 |
| }, |
| { |
| "epoch": 21.5286391042205, |
| "eval_accuracy": 0.3959727000873086, |
| "eval_loss": 3.2854392528533936, |
| "eval_runtime": 55.4249, |
| "eval_samples_per_second": 324.98, |
| "eval_steps_per_second": 20.316, |
| "step": 80000 |
| }, |
| { |
| "epoch": 21.542097329888026, |
| "grad_norm": 0.4455031752586365, |
| "learning_rate": 0.00027706072438400426, |
| "loss": 2.988, |
| "step": 80050 |
| }, |
| { |
| "epoch": 21.555555555555557, |
| "grad_norm": 0.43415355682373047, |
| "learning_rate": 0.0002768587585835465, |
| "loss": 3.0084, |
| "step": 80100 |
| }, |
| { |
| "epoch": 21.569013781223084, |
| "grad_norm": 0.47144296765327454, |
| "learning_rate": 0.0002766567927830887, |
| "loss": 2.9981, |
| "step": 80150 |
| }, |
| { |
| "epoch": 21.58247200689061, |
| "grad_norm": 0.46530064940452576, |
| "learning_rate": 0.0002764548269826309, |
| "loss": 3.0019, |
| "step": 80200 |
| }, |
| { |
| "epoch": 21.59593023255814, |
| "grad_norm": 0.4477296471595764, |
| "learning_rate": 0.0002762528611821731, |
| "loss": 2.9965, |
| "step": 80250 |
| }, |
| { |
| "epoch": 21.609388458225666, |
| "grad_norm": 0.4592190384864807, |
| "learning_rate": 0.00027605089538171535, |
| "loss": 3.009, |
| "step": 80300 |
| }, |
| { |
| "epoch": 21.622846683893197, |
| "grad_norm": 0.4123205542564392, |
| "learning_rate": 0.00027584892958125754, |
| "loss": 3.0118, |
| "step": 80350 |
| }, |
| { |
| "epoch": 21.636304909560724, |
| "grad_norm": 0.41009268164634705, |
| "learning_rate": 0.0002756469637807998, |
| "loss": 2.9985, |
| "step": 80400 |
| }, |
| { |
| "epoch": 21.649763135228252, |
| "grad_norm": 0.41830089688301086, |
| "learning_rate": 0.000275444997980342, |
| "loss": 2.9939, |
| "step": 80450 |
| }, |
| { |
| "epoch": 21.66322136089578, |
| "grad_norm": 0.4384111166000366, |
| "learning_rate": 0.0002752430321798842, |
| "loss": 3.0026, |
| "step": 80500 |
| }, |
| { |
| "epoch": 21.676679586563306, |
| "grad_norm": 0.41752365231513977, |
| "learning_rate": 0.0002750410663794264, |
| "loss": 3.0026, |
| "step": 80550 |
| }, |
| { |
| "epoch": 21.690137812230837, |
| "grad_norm": 0.4098077714443207, |
| "learning_rate": 0.0002748391005789686, |
| "loss": 3.0001, |
| "step": 80600 |
| }, |
| { |
| "epoch": 21.703596037898365, |
| "grad_norm": 0.4417395293712616, |
| "learning_rate": 0.00027463713477851083, |
| "loss": 3.0034, |
| "step": 80650 |
| }, |
| { |
| "epoch": 21.717054263565892, |
| "grad_norm": 0.42584118247032166, |
| "learning_rate": 0.000274435168978053, |
| "loss": 2.9965, |
| "step": 80700 |
| }, |
| { |
| "epoch": 21.73051248923342, |
| "grad_norm": 0.42295321822166443, |
| "learning_rate": 0.0002742332031775953, |
| "loss": 3.0049, |
| "step": 80750 |
| }, |
| { |
| "epoch": 21.743970714900946, |
| "grad_norm": 0.41715312004089355, |
| "learning_rate": 0.00027403123737713747, |
| "loss": 3.0042, |
| "step": 80800 |
| }, |
| { |
| "epoch": 21.757428940568474, |
| "grad_norm": 0.41474443674087524, |
| "learning_rate": 0.00027382927157667967, |
| "loss": 3.009, |
| "step": 80850 |
| }, |
| { |
| "epoch": 21.770887166236005, |
| "grad_norm": 0.45613738894462585, |
| "learning_rate": 0.00027362730577622186, |
| "loss": 3.0105, |
| "step": 80900 |
| }, |
| { |
| "epoch": 21.784345391903532, |
| "grad_norm": 0.4140360951423645, |
| "learning_rate": 0.00027342533997576406, |
| "loss": 3.012, |
| "step": 80950 |
| }, |
| { |
| "epoch": 21.79780361757106, |
| "grad_norm": 0.47605544328689575, |
| "learning_rate": 0.0002732233741753063, |
| "loss": 3.0037, |
| "step": 81000 |
| }, |
| { |
| "epoch": 21.79780361757106, |
| "eval_accuracy": 0.3962476852787422, |
| "eval_loss": 3.280226707458496, |
| "eval_runtime": 55.7877, |
| "eval_samples_per_second": 322.867, |
| "eval_steps_per_second": 20.184, |
| "step": 81000 |
| }, |
| { |
| "epoch": 21.811261843238587, |
| "grad_norm": 0.4371415674686432, |
| "learning_rate": 0.0002730214083748485, |
| "loss": 3.0165, |
| "step": 81050 |
| }, |
| { |
| "epoch": 21.824720068906114, |
| "grad_norm": 0.42411020398139954, |
| "learning_rate": 0.0002728194425743907, |
| "loss": 3.0095, |
| "step": 81100 |
| }, |
| { |
| "epoch": 21.838178294573645, |
| "grad_norm": 0.4219321310520172, |
| "learning_rate": 0.0002726174767739329, |
| "loss": 3.0101, |
| "step": 81150 |
| }, |
| { |
| "epoch": 21.851636520241172, |
| "grad_norm": 0.41034814715385437, |
| "learning_rate": 0.00027241551097347515, |
| "loss": 3.0089, |
| "step": 81200 |
| }, |
| { |
| "epoch": 21.8650947459087, |
| "grad_norm": 0.4187079668045044, |
| "learning_rate": 0.00027221354517301734, |
| "loss": 3.0148, |
| "step": 81250 |
| }, |
| { |
| "epoch": 21.878552971576227, |
| "grad_norm": 0.41652804613113403, |
| "learning_rate": 0.00027201157937255954, |
| "loss": 3.0148, |
| "step": 81300 |
| }, |
| { |
| "epoch": 21.892011197243754, |
| "grad_norm": 0.4195777475833893, |
| "learning_rate": 0.0002718096135721018, |
| "loss": 3.0215, |
| "step": 81350 |
| }, |
| { |
| "epoch": 21.905469422911285, |
| "grad_norm": 0.44653409719467163, |
| "learning_rate": 0.000271607647771644, |
| "loss": 3.0189, |
| "step": 81400 |
| }, |
| { |
| "epoch": 21.918927648578812, |
| "grad_norm": 0.422077476978302, |
| "learning_rate": 0.0002714056819711862, |
| "loss": 3.0073, |
| "step": 81450 |
| }, |
| { |
| "epoch": 21.93238587424634, |
| "grad_norm": 0.40733182430267334, |
| "learning_rate": 0.0002712037161707284, |
| "loss": 3.0223, |
| "step": 81500 |
| }, |
| { |
| "epoch": 21.945844099913867, |
| "grad_norm": 0.42711979150772095, |
| "learning_rate": 0.0002710017503702706, |
| "loss": 3.0135, |
| "step": 81550 |
| }, |
| { |
| "epoch": 21.959302325581394, |
| "grad_norm": 0.42944803833961487, |
| "learning_rate": 0.0002707997845698128, |
| "loss": 3.0209, |
| "step": 81600 |
| }, |
| { |
| "epoch": 21.972760551248925, |
| "grad_norm": 0.39599812030792236, |
| "learning_rate": 0.000270597818769355, |
| "loss": 3.0113, |
| "step": 81650 |
| }, |
| { |
| "epoch": 21.986218776916452, |
| "grad_norm": 0.4108112156391144, |
| "learning_rate": 0.00027039585296889727, |
| "loss": 3.0103, |
| "step": 81700 |
| }, |
| { |
| "epoch": 21.99967700258398, |
| "grad_norm": 0.4447796642780304, |
| "learning_rate": 0.00027019388716843947, |
| "loss": 3.0168, |
| "step": 81750 |
| }, |
| { |
| "epoch": 22.012919896640827, |
| "grad_norm": 0.43677303194999695, |
| "learning_rate": 0.00026999192136798166, |
| "loss": 2.9338, |
| "step": 81800 |
| }, |
| { |
| "epoch": 22.026378122308355, |
| "grad_norm": 0.4421975314617157, |
| "learning_rate": 0.00026978995556752386, |
| "loss": 2.932, |
| "step": 81850 |
| }, |
| { |
| "epoch": 22.039836347975882, |
| "grad_norm": 0.4097566604614258, |
| "learning_rate": 0.00026958798976706606, |
| "loss": 2.9407, |
| "step": 81900 |
| }, |
| { |
| "epoch": 22.05329457364341, |
| "grad_norm": 0.4221478998661041, |
| "learning_rate": 0.0002693860239666083, |
| "loss": 2.9391, |
| "step": 81950 |
| }, |
| { |
| "epoch": 22.06675279931094, |
| "grad_norm": 0.4292081296443939, |
| "learning_rate": 0.0002691840581661505, |
| "loss": 2.9562, |
| "step": 82000 |
| }, |
| { |
| "epoch": 22.06675279931094, |
| "eval_accuracy": 0.3957283533091442, |
| "eval_loss": 3.292602300643921, |
| "eval_runtime": 55.3547, |
| "eval_samples_per_second": 325.392, |
| "eval_steps_per_second": 20.342, |
| "step": 82000 |
| }, |
| { |
| "epoch": 22.080211024978468, |
| "grad_norm": 0.4228661060333252, |
| "learning_rate": 0.0002689820923656927, |
| "loss": 2.9327, |
| "step": 82050 |
| }, |
| { |
| "epoch": 22.093669250645995, |
| "grad_norm": 0.4465382993221283, |
| "learning_rate": 0.00026878012656523495, |
| "loss": 2.9365, |
| "step": 82100 |
| }, |
| { |
| "epoch": 22.107127476313522, |
| "grad_norm": 0.43030622601509094, |
| "learning_rate": 0.00026857816076477714, |
| "loss": 2.9365, |
| "step": 82150 |
| }, |
| { |
| "epoch": 22.12058570198105, |
| "grad_norm": 0.4425069987773895, |
| "learning_rate": 0.00026837619496431934, |
| "loss": 2.9483, |
| "step": 82200 |
| }, |
| { |
| "epoch": 22.13404392764858, |
| "grad_norm": 0.4402919113636017, |
| "learning_rate": 0.0002681742291638616, |
| "loss": 2.948, |
| "step": 82250 |
| }, |
| { |
| "epoch": 22.147502153316108, |
| "grad_norm": 0.44754478335380554, |
| "learning_rate": 0.0002679722633634038, |
| "loss": 2.9469, |
| "step": 82300 |
| }, |
| { |
| "epoch": 22.160960378983635, |
| "grad_norm": 0.4323514401912689, |
| "learning_rate": 0.000267770297562946, |
| "loss": 2.9433, |
| "step": 82350 |
| }, |
| { |
| "epoch": 22.174418604651162, |
| "grad_norm": 0.43964049220085144, |
| "learning_rate": 0.0002675683317624882, |
| "loss": 2.9485, |
| "step": 82400 |
| }, |
| { |
| "epoch": 22.18787683031869, |
| "grad_norm": 0.4523833990097046, |
| "learning_rate": 0.0002673663659620304, |
| "loss": 2.9487, |
| "step": 82450 |
| }, |
| { |
| "epoch": 22.20133505598622, |
| "grad_norm": 0.4370718002319336, |
| "learning_rate": 0.0002671644001615726, |
| "loss": 2.961, |
| "step": 82500 |
| }, |
| { |
| "epoch": 22.214793281653748, |
| "grad_norm": 0.44688382744789124, |
| "learning_rate": 0.0002669624343611148, |
| "loss": 2.9504, |
| "step": 82550 |
| }, |
| { |
| "epoch": 22.228251507321275, |
| "grad_norm": 0.44331830739974976, |
| "learning_rate": 0.00026676046856065707, |
| "loss": 2.9555, |
| "step": 82600 |
| }, |
| { |
| "epoch": 22.241709732988802, |
| "grad_norm": 0.42020565271377563, |
| "learning_rate": 0.00026655850276019927, |
| "loss": 2.9615, |
| "step": 82650 |
| }, |
| { |
| "epoch": 22.25516795865633, |
| "grad_norm": 0.4653908908367157, |
| "learning_rate": 0.00026635653695974146, |
| "loss": 2.9603, |
| "step": 82700 |
| }, |
| { |
| "epoch": 22.26862618432386, |
| "grad_norm": 0.41683560609817505, |
| "learning_rate": 0.00026615457115928366, |
| "loss": 2.966, |
| "step": 82750 |
| }, |
| { |
| "epoch": 22.282084409991388, |
| "grad_norm": 0.46766164898872375, |
| "learning_rate": 0.00026595260535882586, |
| "loss": 2.9694, |
| "step": 82800 |
| }, |
| { |
| "epoch": 22.295542635658915, |
| "grad_norm": 0.46885979175567627, |
| "learning_rate": 0.0002657506395583681, |
| "loss": 2.9826, |
| "step": 82850 |
| }, |
| { |
| "epoch": 22.309000861326442, |
| "grad_norm": 0.4445846676826477, |
| "learning_rate": 0.0002655486737579103, |
| "loss": 2.9655, |
| "step": 82900 |
| }, |
| { |
| "epoch": 22.32245908699397, |
| "grad_norm": 0.4270409047603607, |
| "learning_rate": 0.00026534670795745255, |
| "loss": 2.9681, |
| "step": 82950 |
| }, |
| { |
| "epoch": 22.3359173126615, |
| "grad_norm": 0.4194854497909546, |
| "learning_rate": 0.00026514474215699475, |
| "loss": 2.9683, |
| "step": 83000 |
| }, |
| { |
| "epoch": 22.3359173126615, |
| "eval_accuracy": 0.3955815714072766, |
| "eval_loss": 3.292879819869995, |
| "eval_runtime": 55.3417, |
| "eval_samples_per_second": 325.469, |
| "eval_steps_per_second": 20.346, |
| "step": 83000 |
| }, |
| { |
| "epoch": 22.349375538329028, |
| "grad_norm": 0.429385781288147, |
| "learning_rate": 0.00026494277635653694, |
| "loss": 2.9716, |
| "step": 83050 |
| }, |
| { |
| "epoch": 22.362833763996555, |
| "grad_norm": 0.4517979621887207, |
| "learning_rate": 0.00026474081055607914, |
| "loss": 2.9669, |
| "step": 83100 |
| }, |
| { |
| "epoch": 22.376291989664082, |
| "grad_norm": 0.44728177785873413, |
| "learning_rate": 0.00026453884475562134, |
| "loss": 2.9587, |
| "step": 83150 |
| }, |
| { |
| "epoch": 22.38975021533161, |
| "grad_norm": 0.4721812605857849, |
| "learning_rate": 0.0002643368789551636, |
| "loss": 2.986, |
| "step": 83200 |
| }, |
| { |
| "epoch": 22.403208440999137, |
| "grad_norm": 0.44930049777030945, |
| "learning_rate": 0.0002641349131547058, |
| "loss": 2.9718, |
| "step": 83250 |
| }, |
| { |
| "epoch": 22.416666666666668, |
| "grad_norm": 0.47097310423851013, |
| "learning_rate": 0.000263932947354248, |
| "loss": 2.9863, |
| "step": 83300 |
| }, |
| { |
| "epoch": 22.430124892334195, |
| "grad_norm": 0.46559232473373413, |
| "learning_rate": 0.0002637309815537902, |
| "loss": 2.971, |
| "step": 83350 |
| }, |
| { |
| "epoch": 22.443583118001722, |
| "grad_norm": 0.42774224281311035, |
| "learning_rate": 0.0002635290157533324, |
| "loss": 2.984, |
| "step": 83400 |
| }, |
| { |
| "epoch": 22.45704134366925, |
| "grad_norm": 0.4455351233482361, |
| "learning_rate": 0.0002633270499528746, |
| "loss": 2.9784, |
| "step": 83450 |
| }, |
| { |
| "epoch": 22.470499569336777, |
| "grad_norm": 0.42454469203948975, |
| "learning_rate": 0.0002631250841524168, |
| "loss": 2.9708, |
| "step": 83500 |
| }, |
| { |
| "epoch": 22.483957795004308, |
| "grad_norm": 0.4294556975364685, |
| "learning_rate": 0.00026292311835195907, |
| "loss": 2.9963, |
| "step": 83550 |
| }, |
| { |
| "epoch": 22.497416020671835, |
| "grad_norm": 0.4321839213371277, |
| "learning_rate": 0.00026272115255150126, |
| "loss": 2.995, |
| "step": 83600 |
| }, |
| { |
| "epoch": 22.510874246339363, |
| "grad_norm": 0.4495703876018524, |
| "learning_rate": 0.00026251918675104346, |
| "loss": 2.988, |
| "step": 83650 |
| }, |
| { |
| "epoch": 22.52433247200689, |
| "grad_norm": 0.4380459189414978, |
| "learning_rate": 0.00026231722095058566, |
| "loss": 2.9866, |
| "step": 83700 |
| }, |
| { |
| "epoch": 22.537790697674417, |
| "grad_norm": 0.4377855062484741, |
| "learning_rate": 0.00026211525515012785, |
| "loss": 2.9762, |
| "step": 83750 |
| }, |
| { |
| "epoch": 22.551248923341948, |
| "grad_norm": 0.4443385899066925, |
| "learning_rate": 0.0002619132893496701, |
| "loss": 2.9879, |
| "step": 83800 |
| }, |
| { |
| "epoch": 22.564707149009475, |
| "grad_norm": 0.4317345917224884, |
| "learning_rate": 0.0002617113235492123, |
| "loss": 2.9873, |
| "step": 83850 |
| }, |
| { |
| "epoch": 22.578165374677003, |
| "grad_norm": 0.41431862115859985, |
| "learning_rate": 0.00026150935774875455, |
| "loss": 2.9937, |
| "step": 83900 |
| }, |
| { |
| "epoch": 22.59162360034453, |
| "grad_norm": 0.4557620882987976, |
| "learning_rate": 0.00026130739194829675, |
| "loss": 2.9766, |
| "step": 83950 |
| }, |
| { |
| "epoch": 22.605081826012057, |
| "grad_norm": 0.45642057061195374, |
| "learning_rate": 0.00026110542614783894, |
| "loss": 2.9852, |
| "step": 84000 |
| }, |
| { |
| "epoch": 22.605081826012057, |
| "eval_accuracy": 0.3964245623596011, |
| "eval_loss": 3.2839629650115967, |
| "eval_runtime": 55.1163, |
| "eval_samples_per_second": 326.8, |
| "eval_steps_per_second": 20.43, |
| "step": 84000 |
| }, |
| { |
| "epoch": 22.618540051679588, |
| "grad_norm": 0.48557594418525696, |
| "learning_rate": 0.00026090346034738114, |
| "loss": 2.9942, |
| "step": 84050 |
| }, |
| { |
| "epoch": 22.631998277347115, |
| "grad_norm": 0.44271036982536316, |
| "learning_rate": 0.0002607014945469234, |
| "loss": 2.9952, |
| "step": 84100 |
| }, |
| { |
| "epoch": 22.645456503014643, |
| "grad_norm": 0.46364423632621765, |
| "learning_rate": 0.0002604995287464656, |
| "loss": 2.9961, |
| "step": 84150 |
| }, |
| { |
| "epoch": 22.65891472868217, |
| "grad_norm": 0.4304630160331726, |
| "learning_rate": 0.0002602975629460078, |
| "loss": 2.9918, |
| "step": 84200 |
| }, |
| { |
| "epoch": 22.672372954349697, |
| "grad_norm": 0.4598659873008728, |
| "learning_rate": 0.00026009559714555, |
| "loss": 2.9955, |
| "step": 84250 |
| }, |
| { |
| "epoch": 22.685831180017228, |
| "grad_norm": 0.43804359436035156, |
| "learning_rate": 0.0002598936313450922, |
| "loss": 3.002, |
| "step": 84300 |
| }, |
| { |
| "epoch": 22.699289405684755, |
| "grad_norm": 0.44611334800720215, |
| "learning_rate": 0.0002596916655446344, |
| "loss": 2.9896, |
| "step": 84350 |
| }, |
| { |
| "epoch": 22.712747631352283, |
| "grad_norm": 0.4401785433292389, |
| "learning_rate": 0.0002594896997441766, |
| "loss": 2.9971, |
| "step": 84400 |
| }, |
| { |
| "epoch": 22.72620585701981, |
| "grad_norm": 0.45873647928237915, |
| "learning_rate": 0.00025928773394371887, |
| "loss": 2.9911, |
| "step": 84450 |
| }, |
| { |
| "epoch": 22.739664082687337, |
| "grad_norm": 0.42473602294921875, |
| "learning_rate": 0.00025908576814326107, |
| "loss": 2.9927, |
| "step": 84500 |
| }, |
| { |
| "epoch": 22.753122308354868, |
| "grad_norm": 0.4496315121650696, |
| "learning_rate": 0.00025888380234280326, |
| "loss": 2.9939, |
| "step": 84550 |
| }, |
| { |
| "epoch": 22.766580534022395, |
| "grad_norm": 0.44188305735588074, |
| "learning_rate": 0.00025868183654234546, |
| "loss": 2.9925, |
| "step": 84600 |
| }, |
| { |
| "epoch": 22.780038759689923, |
| "grad_norm": 0.47845959663391113, |
| "learning_rate": 0.00025847987074188765, |
| "loss": 2.9961, |
| "step": 84650 |
| }, |
| { |
| "epoch": 22.79349698535745, |
| "grad_norm": 0.4484929144382477, |
| "learning_rate": 0.0002582779049414299, |
| "loss": 3.0014, |
| "step": 84700 |
| }, |
| { |
| "epoch": 22.806955211024977, |
| "grad_norm": 0.41167107224464417, |
| "learning_rate": 0.0002580759391409721, |
| "loss": 2.9951, |
| "step": 84750 |
| }, |
| { |
| "epoch": 22.820413436692505, |
| "grad_norm": 0.4409504532814026, |
| "learning_rate": 0.00025787397334051435, |
| "loss": 2.9982, |
| "step": 84800 |
| }, |
| { |
| "epoch": 22.833871662360036, |
| "grad_norm": 0.44134747982025146, |
| "learning_rate": 0.00025767200754005655, |
| "loss": 2.9994, |
| "step": 84850 |
| }, |
| { |
| "epoch": 22.847329888027563, |
| "grad_norm": 0.475885808467865, |
| "learning_rate": 0.00025747004173959874, |
| "loss": 3.0012, |
| "step": 84900 |
| }, |
| { |
| "epoch": 22.86078811369509, |
| "grad_norm": 0.4444062411785126, |
| "learning_rate": 0.00025726807593914094, |
| "loss": 2.9944, |
| "step": 84950 |
| }, |
| { |
| "epoch": 22.874246339362617, |
| "grad_norm": 0.434906929731369, |
| "learning_rate": 0.00025706611013868313, |
| "loss": 3.0085, |
| "step": 85000 |
| }, |
| { |
| "epoch": 22.874246339362617, |
| "eval_accuracy": 0.396510393375497, |
| "eval_loss": 3.278679132461548, |
| "eval_runtime": 55.0664, |
| "eval_samples_per_second": 327.096, |
| "eval_steps_per_second": 20.448, |
| "step": 85000 |
| }, |
| { |
| "epoch": 22.887704565030145, |
| "grad_norm": 0.4553506076335907, |
| "learning_rate": 0.0002568641443382254, |
| "loss": 3.0023, |
| "step": 85050 |
| }, |
| { |
| "epoch": 22.901162790697676, |
| "grad_norm": 0.4525589942932129, |
| "learning_rate": 0.0002566621785377676, |
| "loss": 3.0052, |
| "step": 85100 |
| }, |
| { |
| "epoch": 22.914621016365203, |
| "grad_norm": 0.44847944378852844, |
| "learning_rate": 0.0002564602127373098, |
| "loss": 3.0062, |
| "step": 85150 |
| }, |
| { |
| "epoch": 22.92807924203273, |
| "grad_norm": 0.45734959840774536, |
| "learning_rate": 0.00025625824693685203, |
| "loss": 3.0016, |
| "step": 85200 |
| }, |
| { |
| "epoch": 22.941537467700257, |
| "grad_norm": 0.43638479709625244, |
| "learning_rate": 0.0002560562811363942, |
| "loss": 3.0056, |
| "step": 85250 |
| }, |
| { |
| "epoch": 22.954995693367785, |
| "grad_norm": 0.4501156210899353, |
| "learning_rate": 0.0002558543153359364, |
| "loss": 3.0053, |
| "step": 85300 |
| }, |
| { |
| "epoch": 22.968453919035316, |
| "grad_norm": 0.45217031240463257, |
| "learning_rate": 0.0002556523495354786, |
| "loss": 3.003, |
| "step": 85350 |
| }, |
| { |
| "epoch": 22.981912144702843, |
| "grad_norm": 0.44591525197029114, |
| "learning_rate": 0.00025545038373502087, |
| "loss": 2.9979, |
| "step": 85400 |
| }, |
| { |
| "epoch": 22.99537037037037, |
| "grad_norm": 0.4603706896305084, |
| "learning_rate": 0.00025524841793456306, |
| "loss": 3.0025, |
| "step": 85450 |
| }, |
| { |
| "epoch": 23.00861326442722, |
| "grad_norm": 0.43974849581718445, |
| "learning_rate": 0.00025504645213410526, |
| "loss": 2.9376, |
| "step": 85500 |
| }, |
| { |
| "epoch": 23.022071490094746, |
| "grad_norm": 0.4222824275493622, |
| "learning_rate": 0.00025484448633364745, |
| "loss": 2.9166, |
| "step": 85550 |
| }, |
| { |
| "epoch": 23.035529715762273, |
| "grad_norm": 0.44281378388404846, |
| "learning_rate": 0.00025464252053318965, |
| "loss": 2.9168, |
| "step": 85600 |
| }, |
| { |
| "epoch": 23.0489879414298, |
| "grad_norm": 0.478463351726532, |
| "learning_rate": 0.0002544405547327319, |
| "loss": 2.9187, |
| "step": 85650 |
| }, |
| { |
| "epoch": 23.06244616709733, |
| "grad_norm": 0.43511971831321716, |
| "learning_rate": 0.0002542385889322741, |
| "loss": 2.9263, |
| "step": 85700 |
| }, |
| { |
| "epoch": 23.07590439276486, |
| "grad_norm": 0.492725670337677, |
| "learning_rate": 0.00025403662313181635, |
| "loss": 2.9329, |
| "step": 85750 |
| }, |
| { |
| "epoch": 23.089362618432386, |
| "grad_norm": 0.4485880732536316, |
| "learning_rate": 0.00025383465733135854, |
| "loss": 2.9346, |
| "step": 85800 |
| }, |
| { |
| "epoch": 23.102820844099913, |
| "grad_norm": 0.4755620062351227, |
| "learning_rate": 0.00025363269153090074, |
| "loss": 2.9237, |
| "step": 85850 |
| }, |
| { |
| "epoch": 23.11627906976744, |
| "grad_norm": 0.4183093011379242, |
| "learning_rate": 0.00025343072573044294, |
| "loss": 2.9378, |
| "step": 85900 |
| }, |
| { |
| "epoch": 23.12973729543497, |
| "grad_norm": 0.4465462565422058, |
| "learning_rate": 0.00025322875992998513, |
| "loss": 2.9452, |
| "step": 85950 |
| }, |
| { |
| "epoch": 23.1431955211025, |
| "grad_norm": 0.42842474579811096, |
| "learning_rate": 0.0002530267941295274, |
| "loss": 2.9407, |
| "step": 86000 |
| }, |
| { |
| "epoch": 23.1431955211025, |
| "eval_accuracy": 0.3958654656408665, |
| "eval_loss": 3.2916109561920166, |
| "eval_runtime": 54.925, |
| "eval_samples_per_second": 327.938, |
| "eval_steps_per_second": 20.501, |
| "step": 86000 |
| }, |
| { |
| "epoch": 23.156653746770026, |
| "grad_norm": 0.4357658624649048, |
| "learning_rate": 0.0002528248283290696, |
| "loss": 2.9473, |
| "step": 86050 |
| }, |
| { |
| "epoch": 23.170111972437553, |
| "grad_norm": 0.4642813503742218, |
| "learning_rate": 0.00025262286252861183, |
| "loss": 2.9555, |
| "step": 86100 |
| }, |
| { |
| "epoch": 23.18357019810508, |
| "grad_norm": 0.45447543263435364, |
| "learning_rate": 0.000252420896728154, |
| "loss": 2.9458, |
| "step": 86150 |
| }, |
| { |
| "epoch": 23.19702842377261, |
| "grad_norm": 0.44857949018478394, |
| "learning_rate": 0.0002522189309276962, |
| "loss": 2.9494, |
| "step": 86200 |
| }, |
| { |
| "epoch": 23.21048664944014, |
| "grad_norm": 0.4471588730812073, |
| "learning_rate": 0.0002520169651272384, |
| "loss": 2.946, |
| "step": 86250 |
| }, |
| { |
| "epoch": 23.223944875107666, |
| "grad_norm": 0.4542173147201538, |
| "learning_rate": 0.00025181499932678067, |
| "loss": 2.9522, |
| "step": 86300 |
| }, |
| { |
| "epoch": 23.237403100775193, |
| "grad_norm": 0.44934558868408203, |
| "learning_rate": 0.00025161303352632286, |
| "loss": 2.9551, |
| "step": 86350 |
| }, |
| { |
| "epoch": 23.25086132644272, |
| "grad_norm": 0.45528241991996765, |
| "learning_rate": 0.00025141106772586506, |
| "loss": 2.9546, |
| "step": 86400 |
| }, |
| { |
| "epoch": 23.26431955211025, |
| "grad_norm": 0.4349673092365265, |
| "learning_rate": 0.00025120910192540726, |
| "loss": 2.9571, |
| "step": 86450 |
| }, |
| { |
| "epoch": 23.27777777777778, |
| "grad_norm": 0.44581711292266846, |
| "learning_rate": 0.00025100713612494945, |
| "loss": 2.9511, |
| "step": 86500 |
| }, |
| { |
| "epoch": 23.291236003445306, |
| "grad_norm": 0.4143180251121521, |
| "learning_rate": 0.0002508051703244917, |
| "loss": 2.9606, |
| "step": 86550 |
| }, |
| { |
| "epoch": 23.304694229112833, |
| "grad_norm": 0.44750291109085083, |
| "learning_rate": 0.0002506032045240339, |
| "loss": 2.9643, |
| "step": 86600 |
| }, |
| { |
| "epoch": 23.31815245478036, |
| "grad_norm": 0.4560827612876892, |
| "learning_rate": 0.00025040123872357615, |
| "loss": 2.961, |
| "step": 86650 |
| }, |
| { |
| "epoch": 23.33161068044789, |
| "grad_norm": 0.45675498247146606, |
| "learning_rate": 0.00025019927292311834, |
| "loss": 2.967, |
| "step": 86700 |
| }, |
| { |
| "epoch": 23.34506890611542, |
| "grad_norm": 0.4347269833087921, |
| "learning_rate": 0.00024999730712266054, |
| "loss": 2.965, |
| "step": 86750 |
| }, |
| { |
| "epoch": 23.358527131782946, |
| "grad_norm": 0.4316846430301666, |
| "learning_rate": 0.00024979534132220274, |
| "loss": 2.9634, |
| "step": 86800 |
| }, |
| { |
| "epoch": 23.371985357450473, |
| "grad_norm": 0.47420305013656616, |
| "learning_rate": 0.00024959337552174493, |
| "loss": 2.9675, |
| "step": 86850 |
| }, |
| { |
| "epoch": 23.385443583118, |
| "grad_norm": 0.44756773114204407, |
| "learning_rate": 0.0002493914097212872, |
| "loss": 2.9659, |
| "step": 86900 |
| }, |
| { |
| "epoch": 23.39890180878553, |
| "grad_norm": 0.5035362243652344, |
| "learning_rate": 0.0002491894439208294, |
| "loss": 2.9752, |
| "step": 86950 |
| }, |
| { |
| "epoch": 23.41236003445306, |
| "grad_norm": 0.46639010310173035, |
| "learning_rate": 0.00024898747812037163, |
| "loss": 2.9756, |
| "step": 87000 |
| }, |
| { |
| "epoch": 23.41236003445306, |
| "eval_accuracy": 0.3962778891045891, |
| "eval_loss": 3.2877182960510254, |
| "eval_runtime": 54.721, |
| "eval_samples_per_second": 329.161, |
| "eval_steps_per_second": 20.577, |
| "step": 87000 |
| }, |
| { |
| "epoch": 23.425818260120586, |
| "grad_norm": 0.44842982292175293, |
| "learning_rate": 0.0002487855123199138, |
| "loss": 2.9664, |
| "step": 87050 |
| }, |
| { |
| "epoch": 23.439276485788113, |
| "grad_norm": 0.45062634348869324, |
| "learning_rate": 0.000248583546519456, |
| "loss": 2.9603, |
| "step": 87100 |
| }, |
| { |
| "epoch": 23.45273471145564, |
| "grad_norm": 0.4483391046524048, |
| "learning_rate": 0.0002483815807189982, |
| "loss": 2.976, |
| "step": 87150 |
| }, |
| { |
| "epoch": 23.466192937123168, |
| "grad_norm": 0.45215165615081787, |
| "learning_rate": 0.0002481796149185404, |
| "loss": 2.9683, |
| "step": 87200 |
| }, |
| { |
| "epoch": 23.4796511627907, |
| "grad_norm": 0.48873844742774963, |
| "learning_rate": 0.00024797764911808266, |
| "loss": 2.9729, |
| "step": 87250 |
| }, |
| { |
| "epoch": 23.493109388458226, |
| "grad_norm": 0.4677174985408783, |
| "learning_rate": 0.00024777568331762486, |
| "loss": 2.9737, |
| "step": 87300 |
| }, |
| { |
| "epoch": 23.506567614125753, |
| "grad_norm": 0.43766000866889954, |
| "learning_rate": 0.00024757371751716706, |
| "loss": 2.9754, |
| "step": 87350 |
| }, |
| { |
| "epoch": 23.52002583979328, |
| "grad_norm": 0.42998966574668884, |
| "learning_rate": 0.0002473717517167093, |
| "loss": 2.9692, |
| "step": 87400 |
| }, |
| { |
| "epoch": 23.533484065460808, |
| "grad_norm": 0.45019766688346863, |
| "learning_rate": 0.0002471697859162515, |
| "loss": 2.9847, |
| "step": 87450 |
| }, |
| { |
| "epoch": 23.54694229112834, |
| "grad_norm": 0.43693816661834717, |
| "learning_rate": 0.0002469678201157937, |
| "loss": 2.9708, |
| "step": 87500 |
| }, |
| { |
| "epoch": 23.560400516795866, |
| "grad_norm": 0.4665409326553345, |
| "learning_rate": 0.0002467658543153359, |
| "loss": 2.9764, |
| "step": 87550 |
| }, |
| { |
| "epoch": 23.573858742463393, |
| "grad_norm": 0.46596479415893555, |
| "learning_rate": 0.00024656388851487815, |
| "loss": 2.9756, |
| "step": 87600 |
| }, |
| { |
| "epoch": 23.58731696813092, |
| "grad_norm": 0.4340677857398987, |
| "learning_rate": 0.00024636192271442034, |
| "loss": 2.9808, |
| "step": 87650 |
| }, |
| { |
| "epoch": 23.600775193798448, |
| "grad_norm": 0.47798460721969604, |
| "learning_rate": 0.00024615995691396254, |
| "loss": 2.9849, |
| "step": 87700 |
| }, |
| { |
| "epoch": 23.61423341946598, |
| "grad_norm": 0.45297908782958984, |
| "learning_rate": 0.00024595799111350473, |
| "loss": 2.9802, |
| "step": 87750 |
| }, |
| { |
| "epoch": 23.627691645133506, |
| "grad_norm": 0.473803848028183, |
| "learning_rate": 0.00024575602531304693, |
| "loss": 2.9863, |
| "step": 87800 |
| }, |
| { |
| "epoch": 23.641149870801033, |
| "grad_norm": 0.4625190794467926, |
| "learning_rate": 0.0002455540595125892, |
| "loss": 2.9927, |
| "step": 87850 |
| }, |
| { |
| "epoch": 23.65460809646856, |
| "grad_norm": 0.4412365257740021, |
| "learning_rate": 0.00024535209371213143, |
| "loss": 2.9739, |
| "step": 87900 |
| }, |
| { |
| "epoch": 23.668066322136088, |
| "grad_norm": 0.4445769488811493, |
| "learning_rate": 0.0002451501279116736, |
| "loss": 2.9827, |
| "step": 87950 |
| }, |
| { |
| "epoch": 23.68152454780362, |
| "grad_norm": 0.4315958619117737, |
| "learning_rate": 0.0002449481621112158, |
| "loss": 2.981, |
| "step": 88000 |
| }, |
| { |
| "epoch": 23.68152454780362, |
| "eval_accuracy": 0.3964807327839279, |
| "eval_loss": 3.2832388877868652, |
| "eval_runtime": 55.0121, |
| "eval_samples_per_second": 327.419, |
| "eval_steps_per_second": 20.468, |
| "step": 88000 |
| }, |
| { |
| "epoch": 23.694982773471146, |
| "grad_norm": 0.4323495328426361, |
| "learning_rate": 0.000244746196310758, |
| "loss": 2.9931, |
| "step": 88050 |
| }, |
| { |
| "epoch": 23.708440999138674, |
| "grad_norm": 0.46069687604904175, |
| "learning_rate": 0.0002445442305103002, |
| "loss": 2.9773, |
| "step": 88100 |
| }, |
| { |
| "epoch": 23.7218992248062, |
| "grad_norm": 0.4385862946510315, |
| "learning_rate": 0.00024434226470984246, |
| "loss": 2.9801, |
| "step": 88150 |
| }, |
| { |
| "epoch": 23.735357450473728, |
| "grad_norm": 0.45180192589759827, |
| "learning_rate": 0.00024414029890938463, |
| "loss": 2.988, |
| "step": 88200 |
| }, |
| { |
| "epoch": 23.74881567614126, |
| "grad_norm": 0.45826536417007446, |
| "learning_rate": 0.00024393833310892686, |
| "loss": 2.9816, |
| "step": 88250 |
| }, |
| { |
| "epoch": 23.762273901808786, |
| "grad_norm": 0.45120131969451904, |
| "learning_rate": 0.0002437363673084691, |
| "loss": 2.9752, |
| "step": 88300 |
| }, |
| { |
| "epoch": 23.775732127476314, |
| "grad_norm": 0.4657435417175293, |
| "learning_rate": 0.0002435344015080113, |
| "loss": 2.9837, |
| "step": 88350 |
| }, |
| { |
| "epoch": 23.78919035314384, |
| "grad_norm": 0.4517814815044403, |
| "learning_rate": 0.0002433324357075535, |
| "loss": 2.9858, |
| "step": 88400 |
| }, |
| { |
| "epoch": 23.802648578811368, |
| "grad_norm": 0.4310310184955597, |
| "learning_rate": 0.00024313046990709572, |
| "loss": 2.986, |
| "step": 88450 |
| }, |
| { |
| "epoch": 23.8161068044789, |
| "grad_norm": 0.4929543733596802, |
| "learning_rate": 0.00024292850410663792, |
| "loss": 2.9891, |
| "step": 88500 |
| }, |
| { |
| "epoch": 23.829565030146426, |
| "grad_norm": 0.471609890460968, |
| "learning_rate": 0.00024272653830618014, |
| "loss": 2.9759, |
| "step": 88550 |
| }, |
| { |
| "epoch": 23.843023255813954, |
| "grad_norm": 0.48696190118789673, |
| "learning_rate": 0.00024252457250572234, |
| "loss": 2.9843, |
| "step": 88600 |
| }, |
| { |
| "epoch": 23.85648148148148, |
| "grad_norm": 0.4469008147716522, |
| "learning_rate": 0.00024232260670526453, |
| "loss": 2.99, |
| "step": 88650 |
| }, |
| { |
| "epoch": 23.86993970714901, |
| "grad_norm": 0.45953747630119324, |
| "learning_rate": 0.00024212064090480676, |
| "loss": 2.9742, |
| "step": 88700 |
| }, |
| { |
| "epoch": 23.88339793281654, |
| "grad_norm": 0.48521921038627625, |
| "learning_rate": 0.00024191867510434898, |
| "loss": 2.9907, |
| "step": 88750 |
| }, |
| { |
| "epoch": 23.896856158484066, |
| "grad_norm": 0.4295513927936554, |
| "learning_rate": 0.0002417167093038912, |
| "loss": 2.9931, |
| "step": 88800 |
| }, |
| { |
| "epoch": 23.910314384151594, |
| "grad_norm": 0.4355540871620178, |
| "learning_rate": 0.0002415147435034334, |
| "loss": 2.9859, |
| "step": 88850 |
| }, |
| { |
| "epoch": 23.92377260981912, |
| "grad_norm": 0.43405336141586304, |
| "learning_rate": 0.00024131277770297562, |
| "loss": 3.0063, |
| "step": 88900 |
| }, |
| { |
| "epoch": 23.93723083548665, |
| "grad_norm": 0.4761360287666321, |
| "learning_rate": 0.00024111081190251782, |
| "loss": 2.9799, |
| "step": 88950 |
| }, |
| { |
| "epoch": 23.95068906115418, |
| "grad_norm": 0.4812622666358948, |
| "learning_rate": 0.00024090884610206002, |
| "loss": 2.9913, |
| "step": 89000 |
| }, |
| { |
| "epoch": 23.95068906115418, |
| "eval_accuracy": 0.3971195762946468, |
| "eval_loss": 3.276291847229004, |
| "eval_runtime": 54.8659, |
| "eval_samples_per_second": 328.291, |
| "eval_steps_per_second": 20.523, |
| "step": 89000 |
| }, |
| { |
| "epoch": 23.964147286821706, |
| "grad_norm": 0.4610179662704468, |
| "learning_rate": 0.00024070688030160224, |
| "loss": 2.9939, |
| "step": 89050 |
| }, |
| { |
| "epoch": 23.977605512489234, |
| "grad_norm": 0.4359179735183716, |
| "learning_rate": 0.00024050491450114443, |
| "loss": 2.9952, |
| "step": 89100 |
| }, |
| { |
| "epoch": 23.99106373815676, |
| "grad_norm": 0.45922085642814636, |
| "learning_rate": 0.00024030294870068666, |
| "loss": 2.9857, |
| "step": 89150 |
| }, |
| { |
| "epoch": 24.00430663221361, |
| "grad_norm": 0.46470963954925537, |
| "learning_rate": 0.00024010098290022888, |
| "loss": 2.9692, |
| "step": 89200 |
| }, |
| { |
| "epoch": 24.017764857881136, |
| "grad_norm": 0.42660030722618103, |
| "learning_rate": 0.0002398990170997711, |
| "loss": 2.8936, |
| "step": 89250 |
| }, |
| { |
| "epoch": 24.031223083548664, |
| "grad_norm": 0.433124840259552, |
| "learning_rate": 0.0002396970512993133, |
| "loss": 2.9173, |
| "step": 89300 |
| }, |
| { |
| "epoch": 24.044681309216195, |
| "grad_norm": 0.4298465847969055, |
| "learning_rate": 0.00023949508549885552, |
| "loss": 2.9137, |
| "step": 89350 |
| }, |
| { |
| "epoch": 24.058139534883722, |
| "grad_norm": 0.4245285987854004, |
| "learning_rate": 0.00023929311969839772, |
| "loss": 2.9107, |
| "step": 89400 |
| }, |
| { |
| "epoch": 24.07159776055125, |
| "grad_norm": 0.465323269367218, |
| "learning_rate": 0.00023909115389793992, |
| "loss": 2.92, |
| "step": 89450 |
| }, |
| { |
| "epoch": 24.085055986218777, |
| "grad_norm": 0.47525930404663086, |
| "learning_rate": 0.00023888918809748214, |
| "loss": 2.9168, |
| "step": 89500 |
| }, |
| { |
| "epoch": 24.098514211886304, |
| "grad_norm": 0.46605080366134644, |
| "learning_rate": 0.00023868722229702433, |
| "loss": 2.9265, |
| "step": 89550 |
| }, |
| { |
| "epoch": 24.11197243755383, |
| "grad_norm": 0.4591079354286194, |
| "learning_rate": 0.00023848525649656656, |
| "loss": 2.9212, |
| "step": 89600 |
| }, |
| { |
| "epoch": 24.125430663221362, |
| "grad_norm": 0.4741298258304596, |
| "learning_rate": 0.00023828329069610878, |
| "loss": 2.9165, |
| "step": 89650 |
| }, |
| { |
| "epoch": 24.13888888888889, |
| "grad_norm": 0.46697476506233215, |
| "learning_rate": 0.000238081324895651, |
| "loss": 2.9309, |
| "step": 89700 |
| }, |
| { |
| "epoch": 24.152347114556417, |
| "grad_norm": 0.47774580121040344, |
| "learning_rate": 0.0002378793590951932, |
| "loss": 2.9415, |
| "step": 89750 |
| }, |
| { |
| "epoch": 24.165805340223944, |
| "grad_norm": 0.4766087532043457, |
| "learning_rate": 0.0002376773932947354, |
| "loss": 2.9361, |
| "step": 89800 |
| }, |
| { |
| "epoch": 24.17926356589147, |
| "grad_norm": 0.464669793844223, |
| "learning_rate": 0.00023747542749427762, |
| "loss": 2.9333, |
| "step": 89850 |
| }, |
| { |
| "epoch": 24.192721791559002, |
| "grad_norm": 0.43852221965789795, |
| "learning_rate": 0.00023727346169381982, |
| "loss": 2.9363, |
| "step": 89900 |
| }, |
| { |
| "epoch": 24.20618001722653, |
| "grad_norm": 0.4529156982898712, |
| "learning_rate": 0.00023707149589336204, |
| "loss": 2.9422, |
| "step": 89950 |
| }, |
| { |
| "epoch": 24.219638242894057, |
| "grad_norm": 0.5049745440483093, |
| "learning_rate": 0.00023686953009290424, |
| "loss": 2.958, |
| "step": 90000 |
| }, |
| { |
| "epoch": 24.219638242894057, |
| "eval_accuracy": 0.3960319126235912, |
| "eval_loss": 3.292781352996826, |
| "eval_runtime": 54.9708, |
| "eval_samples_per_second": 327.665, |
| "eval_steps_per_second": 20.484, |
| "step": 90000 |
| }, |
| { |
| "epoch": 24.233096468561584, |
| "grad_norm": 0.4704437255859375, |
| "learning_rate": 0.00023666756429244643, |
| "loss": 2.9132, |
| "step": 90050 |
| }, |
| { |
| "epoch": 24.24655469422911, |
| "grad_norm": 0.4529268145561218, |
| "learning_rate": 0.00023646559849198868, |
| "loss": 2.9176, |
| "step": 90100 |
| }, |
| { |
| "epoch": 24.260012919896642, |
| "grad_norm": 0.43529608845710754, |
| "learning_rate": 0.00023626363269153088, |
| "loss": 2.9243, |
| "step": 90150 |
| }, |
| { |
| "epoch": 24.27347114556417, |
| "grad_norm": 0.48170673847198486, |
| "learning_rate": 0.0002360616668910731, |
| "loss": 2.9182, |
| "step": 90200 |
| }, |
| { |
| "epoch": 24.286929371231697, |
| "grad_norm": 0.46854284405708313, |
| "learning_rate": 0.0002358597010906153, |
| "loss": 2.9155, |
| "step": 90250 |
| }, |
| { |
| "epoch": 24.300387596899224, |
| "grad_norm": 0.45262134075164795, |
| "learning_rate": 0.00023565773529015752, |
| "loss": 2.926, |
| "step": 90300 |
| }, |
| { |
| "epoch": 24.31384582256675, |
| "grad_norm": 0.4530410170555115, |
| "learning_rate": 0.00023545576948969972, |
| "loss": 2.929, |
| "step": 90350 |
| }, |
| { |
| "epoch": 24.327304048234282, |
| "grad_norm": 0.483509361743927, |
| "learning_rate": 0.00023525380368924194, |
| "loss": 2.9243, |
| "step": 90400 |
| }, |
| { |
| "epoch": 24.34076227390181, |
| "grad_norm": 0.4750220477581024, |
| "learning_rate": 0.00023505183788878414, |
| "loss": 2.9258, |
| "step": 90450 |
| }, |
| { |
| "epoch": 24.354220499569337, |
| "grad_norm": 0.4815353751182556, |
| "learning_rate": 0.00023484987208832633, |
| "loss": 2.9405, |
| "step": 90500 |
| }, |
| { |
| "epoch": 24.367678725236864, |
| "grad_norm": 0.49263864755630493, |
| "learning_rate": 0.00023464790628786858, |
| "loss": 2.9295, |
| "step": 90550 |
| }, |
| { |
| "epoch": 24.38113695090439, |
| "grad_norm": 0.4376738667488098, |
| "learning_rate": 0.00023444594048741078, |
| "loss": 2.9399, |
| "step": 90600 |
| }, |
| { |
| "epoch": 24.394595176571922, |
| "grad_norm": 0.47231411933898926, |
| "learning_rate": 0.000234243974686953, |
| "loss": 2.9306, |
| "step": 90650 |
| }, |
| { |
| "epoch": 24.40805340223945, |
| "grad_norm": 0.4553051292896271, |
| "learning_rate": 0.0002340420088864952, |
| "loss": 2.9463, |
| "step": 90700 |
| }, |
| { |
| "epoch": 24.421511627906977, |
| "grad_norm": 0.47880223393440247, |
| "learning_rate": 0.00023384004308603742, |
| "loss": 2.9336, |
| "step": 90750 |
| }, |
| { |
| "epoch": 24.434969853574504, |
| "grad_norm": 0.422132670879364, |
| "learning_rate": 0.00023363807728557962, |
| "loss": 2.9377, |
| "step": 90800 |
| }, |
| { |
| "epoch": 24.44842807924203, |
| "grad_norm": 0.475422739982605, |
| "learning_rate": 0.0002334361114851218, |
| "loss": 2.9358, |
| "step": 90850 |
| }, |
| { |
| "epoch": 24.461886304909562, |
| "grad_norm": 0.4530656933784485, |
| "learning_rate": 0.00023323414568466404, |
| "loss": 2.9447, |
| "step": 90900 |
| }, |
| { |
| "epoch": 24.47534453057709, |
| "grad_norm": 0.4522700607776642, |
| "learning_rate": 0.00023303217988420623, |
| "loss": 2.9453, |
| "step": 90950 |
| }, |
| { |
| "epoch": 24.488802756244617, |
| "grad_norm": 0.4927493929862976, |
| "learning_rate": 0.00023283021408374848, |
| "loss": 2.951, |
| "step": 91000 |
| }, |
| { |
| "epoch": 24.488802756244617, |
| "eval_accuracy": 0.3960585311032045, |
| "eval_loss": 3.29355788230896, |
| "eval_runtime": 147.0497, |
| "eval_samples_per_second": 122.489, |
| "eval_steps_per_second": 7.657, |
| "step": 91000 |
| }, |
| { |
| "epoch": 24.502260981912144, |
| "grad_norm": 0.447524756193161, |
| "learning_rate": 0.00023262824828329068, |
| "loss": 2.9552, |
| "step": 91050 |
| }, |
| { |
| "epoch": 24.51571920757967, |
| "grad_norm": 0.45930957794189453, |
| "learning_rate": 0.0002324262824828329, |
| "loss": 2.9467, |
| "step": 91100 |
| }, |
| { |
| "epoch": 24.529177433247202, |
| "grad_norm": 0.473864883184433, |
| "learning_rate": 0.0002322243166823751, |
| "loss": 2.9415, |
| "step": 91150 |
| }, |
| { |
| "epoch": 24.54263565891473, |
| "grad_norm": 0.4445241689682007, |
| "learning_rate": 0.0002320223508819173, |
| "loss": 2.94, |
| "step": 91200 |
| }, |
| { |
| "epoch": 24.556093884582257, |
| "grad_norm": 0.4805726110935211, |
| "learning_rate": 0.00023182038508145952, |
| "loss": 2.9394, |
| "step": 91250 |
| }, |
| { |
| "epoch": 24.569552110249784, |
| "grad_norm": 0.4495256543159485, |
| "learning_rate": 0.0002316184192810017, |
| "loss": 2.9555, |
| "step": 91300 |
| }, |
| { |
| "epoch": 24.58301033591731, |
| "grad_norm": 0.4489133954048157, |
| "learning_rate": 0.00023141645348054394, |
| "loss": 2.9541, |
| "step": 91350 |
| }, |
| { |
| "epoch": 24.596468561584842, |
| "grad_norm": 0.48831552267074585, |
| "learning_rate": 0.00023121448768008613, |
| "loss": 2.9554, |
| "step": 91400 |
| }, |
| { |
| "epoch": 24.60992678725237, |
| "grad_norm": 0.4598943591117859, |
| "learning_rate": 0.00023101252187962838, |
| "loss": 2.9543, |
| "step": 91450 |
| }, |
| { |
| "epoch": 24.623385012919897, |
| "grad_norm": 0.4772765040397644, |
| "learning_rate": 0.00023081055607917058, |
| "loss": 2.9584, |
| "step": 91500 |
| }, |
| { |
| "epoch": 24.636843238587424, |
| "grad_norm": 0.47404298186302185, |
| "learning_rate": 0.0002306085902787128, |
| "loss": 2.9563, |
| "step": 91550 |
| }, |
| { |
| "epoch": 24.65030146425495, |
| "grad_norm": 0.5125542283058167, |
| "learning_rate": 0.000230406624478255, |
| "loss": 2.964, |
| "step": 91600 |
| }, |
| { |
| "epoch": 24.66375968992248, |
| "grad_norm": 0.48980486392974854, |
| "learning_rate": 0.0002302046586777972, |
| "loss": 2.9626, |
| "step": 91650 |
| }, |
| { |
| "epoch": 24.67721791559001, |
| "grad_norm": 0.48496899008750916, |
| "learning_rate": 0.00023000269287733942, |
| "loss": 2.9436, |
| "step": 91700 |
| }, |
| { |
| "epoch": 24.690676141257537, |
| "grad_norm": 0.49066996574401855, |
| "learning_rate": 0.00022980072707688161, |
| "loss": 2.9684, |
| "step": 91750 |
| }, |
| { |
| "epoch": 24.704134366925064, |
| "grad_norm": 0.4482797086238861, |
| "learning_rate": 0.00022959876127642384, |
| "loss": 2.9512, |
| "step": 91800 |
| }, |
| { |
| "epoch": 24.71759259259259, |
| "grad_norm": 0.49523958563804626, |
| "learning_rate": 0.00022939679547596603, |
| "loss": 2.9561, |
| "step": 91850 |
| }, |
| { |
| "epoch": 24.73105081826012, |
| "grad_norm": 0.44394782185554504, |
| "learning_rate": 0.00022919482967550828, |
| "loss": 2.9688, |
| "step": 91900 |
| }, |
| { |
| "epoch": 24.74450904392765, |
| "grad_norm": 0.49176523089408875, |
| "learning_rate": 0.00022899286387505048, |
| "loss": 2.9611, |
| "step": 91950 |
| }, |
| { |
| "epoch": 24.757967269595177, |
| "grad_norm": 0.4634227156639099, |
| "learning_rate": 0.00022879089807459268, |
| "loss": 2.9644, |
| "step": 92000 |
| }, |
| { |
| "epoch": 24.757967269595177, |
| "eval_accuracy": 0.396619148877917, |
| "eval_loss": 3.2853758335113525, |
| "eval_runtime": 147.3435, |
| "eval_samples_per_second": 122.245, |
| "eval_steps_per_second": 7.642, |
| "step": 92000 |
| }, |
| { |
| "epoch": 24.771425495262704, |
| "grad_norm": 0.4450036287307739, |
| "learning_rate": 0.0002285889322741349, |
| "loss": 2.9646, |
| "step": 92050 |
| }, |
| { |
| "epoch": 24.78488372093023, |
| "grad_norm": 0.46700024604797363, |
| "learning_rate": 0.0002283869664736771, |
| "loss": 2.9706, |
| "step": 92100 |
| }, |
| { |
| "epoch": 24.79834194659776, |
| "grad_norm": 0.48077601194381714, |
| "learning_rate": 0.00022818500067321932, |
| "loss": 2.9661, |
| "step": 92150 |
| }, |
| { |
| "epoch": 24.81180017226529, |
| "grad_norm": 0.46260204911231995, |
| "learning_rate": 0.00022798303487276151, |
| "loss": 2.9705, |
| "step": 92200 |
| }, |
| { |
| "epoch": 24.825258397932817, |
| "grad_norm": 0.4556381106376648, |
| "learning_rate": 0.0002277810690723037, |
| "loss": 2.9745, |
| "step": 92250 |
| }, |
| { |
| "epoch": 24.838716623600344, |
| "grad_norm": 0.4934409558773041, |
| "learning_rate": 0.00022757910327184596, |
| "loss": 2.9711, |
| "step": 92300 |
| }, |
| { |
| "epoch": 24.852174849267872, |
| "grad_norm": 0.46511897444725037, |
| "learning_rate": 0.00022737713747138818, |
| "loss": 2.9626, |
| "step": 92350 |
| }, |
| { |
| "epoch": 24.8656330749354, |
| "grad_norm": 0.43899616599082947, |
| "learning_rate": 0.00022717517167093038, |
| "loss": 2.9746, |
| "step": 92400 |
| }, |
| { |
| "epoch": 24.87909130060293, |
| "grad_norm": 0.46751806139945984, |
| "learning_rate": 0.00022697320587047258, |
| "loss": 2.9637, |
| "step": 92450 |
| }, |
| { |
| "epoch": 24.892549526270457, |
| "grad_norm": 0.465297132730484, |
| "learning_rate": 0.0002267712400700148, |
| "loss": 2.9674, |
| "step": 92500 |
| }, |
| { |
| "epoch": 24.906007751937985, |
| "grad_norm": 0.44672736525535583, |
| "learning_rate": 0.000226569274269557, |
| "loss": 2.9694, |
| "step": 92550 |
| }, |
| { |
| "epoch": 24.919465977605512, |
| "grad_norm": 0.4716099798679352, |
| "learning_rate": 0.00022636730846909922, |
| "loss": 2.9788, |
| "step": 92600 |
| }, |
| { |
| "epoch": 24.93292420327304, |
| "grad_norm": 0.47384023666381836, |
| "learning_rate": 0.00022616534266864141, |
| "loss": 2.9707, |
| "step": 92650 |
| }, |
| { |
| "epoch": 24.94638242894057, |
| "grad_norm": 0.5042713284492493, |
| "learning_rate": 0.0002259633768681836, |
| "loss": 2.983, |
| "step": 92700 |
| }, |
| { |
| "epoch": 24.959840654608097, |
| "grad_norm": 0.4943690896034241, |
| "learning_rate": 0.00022576141106772586, |
| "loss": 2.9636, |
| "step": 92750 |
| }, |
| { |
| "epoch": 24.973298880275625, |
| "grad_norm": 0.4922303557395935, |
| "learning_rate": 0.00022555944526726806, |
| "loss": 2.9757, |
| "step": 92800 |
| }, |
| { |
| "epoch": 24.986757105943152, |
| "grad_norm": 0.46571773290634155, |
| "learning_rate": 0.00022535747946681028, |
| "loss": 2.9737, |
| "step": 92850 |
| }, |
| { |
| "epoch": 25.00026916451335, |
| "grad_norm": 1.1899535655975342, |
| "learning_rate": 0.00022515551366635248, |
| "loss": 3.0341, |
| "step": 92900 |
| }, |
| { |
| "epoch": 25.01372739018088, |
| "grad_norm": 0.4870604872703552, |
| "learning_rate": 0.0002249535478658947, |
| "loss": 2.9042, |
| "step": 92950 |
| }, |
| { |
| "epoch": 25.027185615848406, |
| "grad_norm": 0.469473272562027, |
| "learning_rate": 0.0002247515820654369, |
| "loss": 2.9012, |
| "step": 93000 |
| }, |
| { |
| "epoch": 25.027185615848406, |
| "eval_accuracy": 0.39639979087653243, |
| "eval_loss": 3.292593240737915, |
| "eval_runtime": 147.4879, |
| "eval_samples_per_second": 122.125, |
| "eval_steps_per_second": 7.635, |
| "step": 93000 |
| }, |
| { |
| "epoch": 25.040643841515934, |
| "grad_norm": 0.4822351336479187, |
| "learning_rate": 0.0002245496162649791, |
| "loss": 2.9127, |
| "step": 93050 |
| }, |
| { |
| "epoch": 25.05410206718346, |
| "grad_norm": 0.46038123965263367, |
| "learning_rate": 0.00022434765046452131, |
| "loss": 2.9251, |
| "step": 93100 |
| }, |
| { |
| "epoch": 25.06756029285099, |
| "grad_norm": 0.44845762848854065, |
| "learning_rate": 0.0002241456846640635, |
| "loss": 2.9082, |
| "step": 93150 |
| }, |
| { |
| "epoch": 25.08101851851852, |
| "grad_norm": 0.5279198288917542, |
| "learning_rate": 0.00022394371886360576, |
| "loss": 2.9228, |
| "step": 93200 |
| }, |
| { |
| "epoch": 25.094476744186046, |
| "grad_norm": 0.45746752619743347, |
| "learning_rate": 0.00022374175306314796, |
| "loss": 2.9189, |
| "step": 93250 |
| }, |
| { |
| "epoch": 25.107934969853574, |
| "grad_norm": 0.4499928653240204, |
| "learning_rate": 0.00022353978726269018, |
| "loss": 2.919, |
| "step": 93300 |
| }, |
| { |
| "epoch": 25.1213931955211, |
| "grad_norm": 0.4959993064403534, |
| "learning_rate": 0.00022333782146223238, |
| "loss": 2.9152, |
| "step": 93350 |
| }, |
| { |
| "epoch": 25.134851421188632, |
| "grad_norm": 0.47189101576805115, |
| "learning_rate": 0.0002231358556617746, |
| "loss": 2.9283, |
| "step": 93400 |
| }, |
| { |
| "epoch": 25.14830964685616, |
| "grad_norm": 0.47774845361709595, |
| "learning_rate": 0.0002229338898613168, |
| "loss": 2.9224, |
| "step": 93450 |
| }, |
| { |
| "epoch": 25.161767872523686, |
| "grad_norm": 0.45308101177215576, |
| "learning_rate": 0.000222731924060859, |
| "loss": 2.9252, |
| "step": 93500 |
| }, |
| { |
| "epoch": 25.175226098191214, |
| "grad_norm": 0.4624495208263397, |
| "learning_rate": 0.00022252995826040122, |
| "loss": 2.9312, |
| "step": 93550 |
| }, |
| { |
| "epoch": 25.18868432385874, |
| "grad_norm": 0.4419146180152893, |
| "learning_rate": 0.0002223279924599434, |
| "loss": 2.9285, |
| "step": 93600 |
| }, |
| { |
| "epoch": 25.202142549526272, |
| "grad_norm": 0.4739558696746826, |
| "learning_rate": 0.00022212602665948566, |
| "loss": 2.9303, |
| "step": 93650 |
| }, |
| { |
| "epoch": 25.2156007751938, |
| "grad_norm": 0.4595142900943756, |
| "learning_rate": 0.00022192406085902786, |
| "loss": 2.9369, |
| "step": 93700 |
| }, |
| { |
| "epoch": 25.229059000861326, |
| "grad_norm": 0.47502854466438293, |
| "learning_rate": 0.00022172209505857008, |
| "loss": 2.9372, |
| "step": 93750 |
| }, |
| { |
| "epoch": 25.242517226528854, |
| "grad_norm": 0.4656815826892853, |
| "learning_rate": 0.00022152012925811228, |
| "loss": 2.9324, |
| "step": 93800 |
| }, |
| { |
| "epoch": 25.25597545219638, |
| "grad_norm": 0.4996993839740753, |
| "learning_rate": 0.00022131816345765447, |
| "loss": 2.9346, |
| "step": 93850 |
| }, |
| { |
| "epoch": 25.269433677863912, |
| "grad_norm": 0.43119895458221436, |
| "learning_rate": 0.0002211161976571967, |
| "loss": 2.9464, |
| "step": 93900 |
| }, |
| { |
| "epoch": 25.28289190353144, |
| "grad_norm": 0.48845374584198, |
| "learning_rate": 0.0002209142318567389, |
| "loss": 2.9391, |
| "step": 93950 |
| }, |
| { |
| "epoch": 25.296350129198967, |
| "grad_norm": 0.44671630859375, |
| "learning_rate": 0.00022071226605628112, |
| "loss": 2.9246, |
| "step": 94000 |
| }, |
| { |
| "epoch": 25.296350129198967, |
| "eval_accuracy": 0.3961111161812977, |
| "eval_loss": 3.2946267127990723, |
| "eval_runtime": 146.1973, |
| "eval_samples_per_second": 123.203, |
| "eval_steps_per_second": 7.702, |
| "step": 94000 |
| }, |
| { |
| "epoch": 25.309808354866494, |
| "grad_norm": 0.48212531208992004, |
| "learning_rate": 0.0002205103002558233, |
| "loss": 2.942, |
| "step": 94050 |
| }, |
| { |
| "epoch": 25.32326658053402, |
| "grad_norm": 0.540875256061554, |
| "learning_rate": 0.00022030833445536556, |
| "loss": 2.9456, |
| "step": 94100 |
| }, |
| { |
| "epoch": 25.336724806201552, |
| "grad_norm": 0.4727246165275574, |
| "learning_rate": 0.00022010636865490776, |
| "loss": 2.9372, |
| "step": 94150 |
| }, |
| { |
| "epoch": 25.35018303186908, |
| "grad_norm": 0.48135116696357727, |
| "learning_rate": 0.00021990440285444998, |
| "loss": 2.9495, |
| "step": 94200 |
| }, |
| { |
| "epoch": 25.363641257536607, |
| "grad_norm": 0.48746976256370544, |
| "learning_rate": 0.00021970243705399218, |
| "loss": 2.944, |
| "step": 94250 |
| }, |
| { |
| "epoch": 25.377099483204134, |
| "grad_norm": 0.43600234389305115, |
| "learning_rate": 0.00021950047125353437, |
| "loss": 2.9406, |
| "step": 94300 |
| }, |
| { |
| "epoch": 25.39055770887166, |
| "grad_norm": 0.4867391288280487, |
| "learning_rate": 0.0002192985054530766, |
| "loss": 2.9455, |
| "step": 94350 |
| }, |
| { |
| "epoch": 25.404015934539192, |
| "grad_norm": 0.4582846760749817, |
| "learning_rate": 0.0002190965396526188, |
| "loss": 2.9339, |
| "step": 94400 |
| }, |
| { |
| "epoch": 25.41747416020672, |
| "grad_norm": 0.49392837285995483, |
| "learning_rate": 0.00021889457385216102, |
| "loss": 2.9495, |
| "step": 94450 |
| }, |
| { |
| "epoch": 25.430932385874247, |
| "grad_norm": 0.46900445222854614, |
| "learning_rate": 0.0002186926080517032, |
| "loss": 2.9567, |
| "step": 94500 |
| }, |
| { |
| "epoch": 25.444390611541774, |
| "grad_norm": 0.504092812538147, |
| "learning_rate": 0.00021849064225124546, |
| "loss": 2.9483, |
| "step": 94550 |
| }, |
| { |
| "epoch": 25.4578488372093, |
| "grad_norm": 0.45981499552726746, |
| "learning_rate": 0.00021828867645078766, |
| "loss": 2.9443, |
| "step": 94600 |
| }, |
| { |
| "epoch": 25.471307062876832, |
| "grad_norm": 0.47376635670661926, |
| "learning_rate": 0.00021808671065032985, |
| "loss": 2.9667, |
| "step": 94650 |
| }, |
| { |
| "epoch": 25.48476528854436, |
| "grad_norm": 0.4677162766456604, |
| "learning_rate": 0.00021788474484987208, |
| "loss": 2.9544, |
| "step": 94700 |
| }, |
| { |
| "epoch": 25.498223514211887, |
| "grad_norm": 0.4654765725135803, |
| "learning_rate": 0.00021768277904941427, |
| "loss": 2.9487, |
| "step": 94750 |
| }, |
| { |
| "epoch": 25.511681739879414, |
| "grad_norm": 0.49871331453323364, |
| "learning_rate": 0.0002174808132489565, |
| "loss": 2.9439, |
| "step": 94800 |
| }, |
| { |
| "epoch": 25.52513996554694, |
| "grad_norm": 0.511438250541687, |
| "learning_rate": 0.0002172788474484987, |
| "loss": 2.9495, |
| "step": 94850 |
| }, |
| { |
| "epoch": 25.53859819121447, |
| "grad_norm": 0.48844873905181885, |
| "learning_rate": 0.0002170768816480409, |
| "loss": 2.9612, |
| "step": 94900 |
| }, |
| { |
| "epoch": 25.552056416882, |
| "grad_norm": 0.4564681649208069, |
| "learning_rate": 0.0002168749158475831, |
| "loss": 2.9637, |
| "step": 94950 |
| }, |
| { |
| "epoch": 25.565514642549527, |
| "grad_norm": 0.4665428698062897, |
| "learning_rate": 0.00021667295004712534, |
| "loss": 2.9605, |
| "step": 95000 |
| }, |
| { |
| "epoch": 25.565514642549527, |
| "eval_accuracy": 0.3967212769221476, |
| "eval_loss": 3.2849857807159424, |
| "eval_runtime": 146.1173, |
| "eval_samples_per_second": 123.271, |
| "eval_steps_per_second": 7.706, |
| "step": 95000 |
| }, |
| { |
| "epoch": 25.578972868217054, |
| "grad_norm": 0.45568856596946716, |
| "learning_rate": 0.00021647098424666756, |
| "loss": 2.9594, |
| "step": 95050 |
| }, |
| { |
| "epoch": 25.59243109388458, |
| "grad_norm": 0.47148966789245605, |
| "learning_rate": 0.00021626901844620976, |
| "loss": 2.9654, |
| "step": 95100 |
| }, |
| { |
| "epoch": 25.60588931955211, |
| "grad_norm": 0.4540193974971771, |
| "learning_rate": 0.00021606705264575198, |
| "loss": 2.9655, |
| "step": 95150 |
| }, |
| { |
| "epoch": 25.61934754521964, |
| "grad_norm": 0.491960734128952, |
| "learning_rate": 0.00021586508684529417, |
| "loss": 2.9575, |
| "step": 95200 |
| }, |
| { |
| "epoch": 25.632805770887167, |
| "grad_norm": 0.4663715958595276, |
| "learning_rate": 0.0002156631210448364, |
| "loss": 2.9559, |
| "step": 95250 |
| }, |
| { |
| "epoch": 25.646263996554694, |
| "grad_norm": 0.45852020382881165, |
| "learning_rate": 0.0002154611552443786, |
| "loss": 2.9595, |
| "step": 95300 |
| }, |
| { |
| "epoch": 25.65972222222222, |
| "grad_norm": 0.48818692564964294, |
| "learning_rate": 0.0002152591894439208, |
| "loss": 2.9513, |
| "step": 95350 |
| }, |
| { |
| "epoch": 25.67318044788975, |
| "grad_norm": 0.4618067145347595, |
| "learning_rate": 0.000215057223643463, |
| "loss": 2.9658, |
| "step": 95400 |
| }, |
| { |
| "epoch": 25.68663867355728, |
| "grad_norm": 0.4699975848197937, |
| "learning_rate": 0.00021485525784300524, |
| "loss": 2.9694, |
| "step": 95450 |
| }, |
| { |
| "epoch": 25.700096899224807, |
| "grad_norm": 0.49483734369277954, |
| "learning_rate": 0.00021465329204254746, |
| "loss": 2.9666, |
| "step": 95500 |
| }, |
| { |
| "epoch": 25.713555124892334, |
| "grad_norm": 0.479103684425354, |
| "learning_rate": 0.00021445132624208966, |
| "loss": 2.9711, |
| "step": 95550 |
| }, |
| { |
| "epoch": 25.72701335055986, |
| "grad_norm": 0.5061235427856445, |
| "learning_rate": 0.00021424936044163188, |
| "loss": 2.9746, |
| "step": 95600 |
| }, |
| { |
| "epoch": 25.74047157622739, |
| "grad_norm": 0.5018367171287537, |
| "learning_rate": 0.00021404739464117407, |
| "loss": 2.9706, |
| "step": 95650 |
| }, |
| { |
| "epoch": 25.75392980189492, |
| "grad_norm": 0.47308751940727234, |
| "learning_rate": 0.00021384542884071627, |
| "loss": 2.961, |
| "step": 95700 |
| }, |
| { |
| "epoch": 25.767388027562447, |
| "grad_norm": 0.47990474104881287, |
| "learning_rate": 0.0002136434630402585, |
| "loss": 2.9735, |
| "step": 95750 |
| }, |
| { |
| "epoch": 25.780846253229974, |
| "grad_norm": 0.5104753375053406, |
| "learning_rate": 0.0002134414972398007, |
| "loss": 2.9615, |
| "step": 95800 |
| }, |
| { |
| "epoch": 25.7943044788975, |
| "grad_norm": 0.4795853793621063, |
| "learning_rate": 0.0002132395314393429, |
| "loss": 2.9692, |
| "step": 95850 |
| }, |
| { |
| "epoch": 25.80776270456503, |
| "grad_norm": 0.47165408730506897, |
| "learning_rate": 0.00021303756563888514, |
| "loss": 2.9595, |
| "step": 95900 |
| }, |
| { |
| "epoch": 25.82122093023256, |
| "grad_norm": 0.4764661192893982, |
| "learning_rate": 0.00021283559983842736, |
| "loss": 2.9581, |
| "step": 95950 |
| }, |
| { |
| "epoch": 25.834679155900087, |
| "grad_norm": 0.4767214357852936, |
| "learning_rate": 0.00021263363403796956, |
| "loss": 2.9767, |
| "step": 96000 |
| }, |
| { |
| "epoch": 25.834679155900087, |
| "eval_accuracy": 0.3970028895717706, |
| "eval_loss": 3.2806341648101807, |
| "eval_runtime": 146.2565, |
| "eval_samples_per_second": 123.154, |
| "eval_steps_per_second": 7.699, |
| "step": 96000 |
| }, |
| { |
| "epoch": 25.848137381567614, |
| "grad_norm": 0.48081299662590027, |
| "learning_rate": 0.00021243166823751175, |
| "loss": 2.9614, |
| "step": 96050 |
| }, |
| { |
| "epoch": 25.86159560723514, |
| "grad_norm": 0.4761458933353424, |
| "learning_rate": 0.00021222970243705398, |
| "loss": 2.9817, |
| "step": 96100 |
| }, |
| { |
| "epoch": 25.87505383290267, |
| "grad_norm": 0.4514337182044983, |
| "learning_rate": 0.00021202773663659617, |
| "loss": 2.9691, |
| "step": 96150 |
| }, |
| { |
| "epoch": 25.8885120585702, |
| "grad_norm": 0.47705453634262085, |
| "learning_rate": 0.0002118257708361384, |
| "loss": 2.972, |
| "step": 96200 |
| }, |
| { |
| "epoch": 25.901970284237727, |
| "grad_norm": 0.46351706981658936, |
| "learning_rate": 0.0002116238050356806, |
| "loss": 2.964, |
| "step": 96250 |
| }, |
| { |
| "epoch": 25.915428509905254, |
| "grad_norm": 0.4799213409423828, |
| "learning_rate": 0.00021142183923522281, |
| "loss": 2.9753, |
| "step": 96300 |
| }, |
| { |
| "epoch": 25.92888673557278, |
| "grad_norm": 0.46214932203292847, |
| "learning_rate": 0.00021121987343476504, |
| "loss": 2.9806, |
| "step": 96350 |
| }, |
| { |
| "epoch": 25.94234496124031, |
| "grad_norm": 0.4498823583126068, |
| "learning_rate": 0.00021101790763430726, |
| "loss": 2.978, |
| "step": 96400 |
| }, |
| { |
| "epoch": 25.955803186907836, |
| "grad_norm": 0.4700547456741333, |
| "learning_rate": 0.00021081594183384946, |
| "loss": 2.9633, |
| "step": 96450 |
| }, |
| { |
| "epoch": 25.969261412575367, |
| "grad_norm": 0.49166616797447205, |
| "learning_rate": 0.00021061397603339165, |
| "loss": 2.9794, |
| "step": 96500 |
| }, |
| { |
| "epoch": 25.982719638242894, |
| "grad_norm": 0.4853648841381073, |
| "learning_rate": 0.00021041201023293388, |
| "loss": 2.9736, |
| "step": 96550 |
| }, |
| { |
| "epoch": 25.99617786391042, |
| "grad_norm": 0.4653874933719635, |
| "learning_rate": 0.00021021004443247607, |
| "loss": 2.9679, |
| "step": 96600 |
| }, |
| { |
| "epoch": 26.00942075796727, |
| "grad_norm": 0.48257073760032654, |
| "learning_rate": 0.0002100080786320183, |
| "loss": 2.9124, |
| "step": 96650 |
| }, |
| { |
| "epoch": 26.022878983634797, |
| "grad_norm": 0.4681517779827118, |
| "learning_rate": 0.0002098061128315605, |
| "loss": 2.8909, |
| "step": 96700 |
| }, |
| { |
| "epoch": 26.036337209302324, |
| "grad_norm": 0.5120118260383606, |
| "learning_rate": 0.00020960414703110274, |
| "loss": 2.9027, |
| "step": 96750 |
| }, |
| { |
| "epoch": 26.049795434969855, |
| "grad_norm": 0.5014836192131042, |
| "learning_rate": 0.00020940218123064494, |
| "loss": 2.9032, |
| "step": 96800 |
| }, |
| { |
| "epoch": 26.063253660637383, |
| "grad_norm": 0.48905983567237854, |
| "learning_rate": 0.00020920021543018713, |
| "loss": 2.9024, |
| "step": 96850 |
| }, |
| { |
| "epoch": 26.07671188630491, |
| "grad_norm": 0.4677479565143585, |
| "learning_rate": 0.00020899824962972936, |
| "loss": 2.9186, |
| "step": 96900 |
| }, |
| { |
| "epoch": 26.090170111972437, |
| "grad_norm": 0.4844556450843811, |
| "learning_rate": 0.00020879628382927155, |
| "loss": 2.9093, |
| "step": 96950 |
| }, |
| { |
| "epoch": 26.103628337639964, |
| "grad_norm": 0.48902034759521484, |
| "learning_rate": 0.00020859431802881378, |
| "loss": 2.9106, |
| "step": 97000 |
| }, |
| { |
| "epoch": 26.103628337639964, |
| "eval_accuracy": 0.3965241915261537, |
| "eval_loss": 3.293030261993408, |
| "eval_runtime": 146.2368, |
| "eval_samples_per_second": 123.17, |
| "eval_steps_per_second": 7.7, |
| "step": 97000 |
| }, |
| { |
| "epoch": 26.117086563307492, |
| "grad_norm": 0.4999445080757141, |
| "learning_rate": 0.00020839235222835597, |
| "loss": 2.9076, |
| "step": 97050 |
| }, |
| { |
| "epoch": 26.130544788975023, |
| "grad_norm": 0.4936251938343048, |
| "learning_rate": 0.00020819038642789817, |
| "loss": 2.9138, |
| "step": 97100 |
| }, |
| { |
| "epoch": 26.14400301464255, |
| "grad_norm": 0.5119921565055847, |
| "learning_rate": 0.0002079884206274404, |
| "loss": 2.9163, |
| "step": 97150 |
| }, |
| { |
| "epoch": 26.157461240310077, |
| "grad_norm": 0.4628806412220001, |
| "learning_rate": 0.00020778645482698264, |
| "loss": 2.9275, |
| "step": 97200 |
| }, |
| { |
| "epoch": 26.170919465977605, |
| "grad_norm": 0.4793238341808319, |
| "learning_rate": 0.00020758448902652484, |
| "loss": 2.9143, |
| "step": 97250 |
| }, |
| { |
| "epoch": 26.184377691645132, |
| "grad_norm": 0.5028554201126099, |
| "learning_rate": 0.00020738252322606703, |
| "loss": 2.9154, |
| "step": 97300 |
| }, |
| { |
| "epoch": 26.197835917312663, |
| "grad_norm": 0.4699283838272095, |
| "learning_rate": 0.00020718055742560926, |
| "loss": 2.92, |
| "step": 97350 |
| }, |
| { |
| "epoch": 26.21129414298019, |
| "grad_norm": 0.4700480103492737, |
| "learning_rate": 0.00020697859162515145, |
| "loss": 2.9203, |
| "step": 97400 |
| }, |
| { |
| "epoch": 26.224752368647717, |
| "grad_norm": 0.47370514273643494, |
| "learning_rate": 0.00020677662582469368, |
| "loss": 2.9186, |
| "step": 97450 |
| }, |
| { |
| "epoch": 26.238210594315245, |
| "grad_norm": 0.4865691363811493, |
| "learning_rate": 0.00020657466002423587, |
| "loss": 2.9284, |
| "step": 97500 |
| }, |
| { |
| "epoch": 26.251668819982772, |
| "grad_norm": 0.5234982371330261, |
| "learning_rate": 0.00020637269422377807, |
| "loss": 2.9388, |
| "step": 97550 |
| }, |
| { |
| "epoch": 26.265127045650303, |
| "grad_norm": 0.4665259122848511, |
| "learning_rate": 0.0002061707284233203, |
| "loss": 2.9165, |
| "step": 97600 |
| }, |
| { |
| "epoch": 26.27858527131783, |
| "grad_norm": 0.4814673960208893, |
| "learning_rate": 0.00020596876262286251, |
| "loss": 2.9279, |
| "step": 97650 |
| }, |
| { |
| "epoch": 26.292043496985357, |
| "grad_norm": 0.49803024530410767, |
| "learning_rate": 0.00020576679682240474, |
| "loss": 2.9278, |
| "step": 97700 |
| }, |
| { |
| "epoch": 26.305501722652885, |
| "grad_norm": 0.5198856592178345, |
| "learning_rate": 0.00020556483102194693, |
| "loss": 2.9315, |
| "step": 97750 |
| }, |
| { |
| "epoch": 26.318959948320412, |
| "grad_norm": 0.454545259475708, |
| "learning_rate": 0.00020536286522148916, |
| "loss": 2.9344, |
| "step": 97800 |
| }, |
| { |
| "epoch": 26.332418173987943, |
| "grad_norm": 0.5368754267692566, |
| "learning_rate": 0.00020516089942103135, |
| "loss": 2.9265, |
| "step": 97850 |
| }, |
| { |
| "epoch": 26.34587639965547, |
| "grad_norm": 0.46915602684020996, |
| "learning_rate": 0.00020495893362057355, |
| "loss": 2.9348, |
| "step": 97900 |
| }, |
| { |
| "epoch": 26.359334625322997, |
| "grad_norm": 0.495172381401062, |
| "learning_rate": 0.00020475696782011577, |
| "loss": 2.9374, |
| "step": 97950 |
| }, |
| { |
| "epoch": 26.372792850990525, |
| "grad_norm": 0.4795806109905243, |
| "learning_rate": 0.00020455500201965797, |
| "loss": 2.9378, |
| "step": 98000 |
| }, |
| { |
| "epoch": 26.372792850990525, |
| "eval_accuracy": 0.39687186146395986, |
| "eval_loss": 3.2883505821228027, |
| "eval_runtime": 146.3636, |
| "eval_samples_per_second": 123.063, |
| "eval_steps_per_second": 7.693, |
| "step": 98000 |
| }, |
| { |
| "epoch": 26.386251076658052, |
| "grad_norm": 0.4649851322174072, |
| "learning_rate": 0.0002043530362192002, |
| "loss": 2.934, |
| "step": 98050 |
| }, |
| { |
| "epoch": 26.399709302325583, |
| "grad_norm": 0.46076539158821106, |
| "learning_rate": 0.00020415107041874242, |
| "loss": 2.9432, |
| "step": 98100 |
| }, |
| { |
| "epoch": 26.41316752799311, |
| "grad_norm": 0.4639580249786377, |
| "learning_rate": 0.00020394910461828464, |
| "loss": 2.938, |
| "step": 98150 |
| }, |
| { |
| "epoch": 26.426625753660637, |
| "grad_norm": 0.48218265175819397, |
| "learning_rate": 0.00020374713881782683, |
| "loss": 2.9363, |
| "step": 98200 |
| }, |
| { |
| "epoch": 26.440083979328165, |
| "grad_norm": 0.4805491268634796, |
| "learning_rate": 0.00020354517301736906, |
| "loss": 2.936, |
| "step": 98250 |
| }, |
| { |
| "epoch": 26.453542204995692, |
| "grad_norm": 0.4810453951358795, |
| "learning_rate": 0.00020334320721691125, |
| "loss": 2.9441, |
| "step": 98300 |
| }, |
| { |
| "epoch": 26.467000430663223, |
| "grad_norm": 0.4797106981277466, |
| "learning_rate": 0.00020314124141645345, |
| "loss": 2.9435, |
| "step": 98350 |
| }, |
| { |
| "epoch": 26.48045865633075, |
| "grad_norm": 0.48143908381462097, |
| "learning_rate": 0.00020293927561599567, |
| "loss": 2.9428, |
| "step": 98400 |
| }, |
| { |
| "epoch": 26.493916881998278, |
| "grad_norm": 0.4961640536785126, |
| "learning_rate": 0.00020273730981553787, |
| "loss": 2.9369, |
| "step": 98450 |
| }, |
| { |
| "epoch": 26.507375107665805, |
| "grad_norm": 0.49791309237480164, |
| "learning_rate": 0.0002025353440150801, |
| "loss": 2.9537, |
| "step": 98500 |
| }, |
| { |
| "epoch": 26.520833333333332, |
| "grad_norm": 0.52032071352005, |
| "learning_rate": 0.00020233337821462232, |
| "loss": 2.9455, |
| "step": 98550 |
| }, |
| { |
| "epoch": 26.534291559000863, |
| "grad_norm": 0.4943895637989044, |
| "learning_rate": 0.00020213141241416454, |
| "loss": 2.9452, |
| "step": 98600 |
| }, |
| { |
| "epoch": 26.54774978466839, |
| "grad_norm": 0.4840410649776459, |
| "learning_rate": 0.00020192944661370674, |
| "loss": 2.944, |
| "step": 98650 |
| }, |
| { |
| "epoch": 26.561208010335918, |
| "grad_norm": 0.488031268119812, |
| "learning_rate": 0.00020172748081324893, |
| "loss": 2.954, |
| "step": 98700 |
| }, |
| { |
| "epoch": 26.574666236003445, |
| "grad_norm": 0.49708092212677, |
| "learning_rate": 0.00020152551501279115, |
| "loss": 2.9446, |
| "step": 98750 |
| }, |
| { |
| "epoch": 26.588124461670972, |
| "grad_norm": 0.4940889775753021, |
| "learning_rate": 0.00020132354921233335, |
| "loss": 2.9518, |
| "step": 98800 |
| }, |
| { |
| "epoch": 26.6015826873385, |
| "grad_norm": 0.4973823130130768, |
| "learning_rate": 0.00020112158341187557, |
| "loss": 2.9486, |
| "step": 98850 |
| }, |
| { |
| "epoch": 26.61504091300603, |
| "grad_norm": 0.4541454315185547, |
| "learning_rate": 0.00020091961761141777, |
| "loss": 2.9475, |
| "step": 98900 |
| }, |
| { |
| "epoch": 26.628499138673558, |
| "grad_norm": 0.47265252470970154, |
| "learning_rate": 0.00020071765181095997, |
| "loss": 2.9577, |
| "step": 98950 |
| }, |
| { |
| "epoch": 26.641957364341085, |
| "grad_norm": 0.48501822352409363, |
| "learning_rate": 0.00020051568601050222, |
| "loss": 2.9573, |
| "step": 99000 |
| }, |
| { |
| "epoch": 26.641957364341085, |
| "eval_accuracy": 0.396756587150206, |
| "eval_loss": 3.2868714332580566, |
| "eval_runtime": 146.3487, |
| "eval_samples_per_second": 123.076, |
| "eval_steps_per_second": 7.694, |
| "step": 99000 |
| }, |
| { |
| "epoch": 26.655415590008612, |
| "grad_norm": 0.4774377644062042, |
| "learning_rate": 0.00020031372021004444, |
| "loss": 2.9507, |
| "step": 99050 |
| }, |
| { |
| "epoch": 26.66887381567614, |
| "grad_norm": 0.4970323443412781, |
| "learning_rate": 0.00020011175440958664, |
| "loss": 2.9539, |
| "step": 99100 |
| }, |
| { |
| "epoch": 26.68233204134367, |
| "grad_norm": 0.47971388697624207, |
| "learning_rate": 0.00019990978860912883, |
| "loss": 2.9507, |
| "step": 99150 |
| }, |
| { |
| "epoch": 26.695790267011198, |
| "grad_norm": 0.4856283664703369, |
| "learning_rate": 0.00019970782280867105, |
| "loss": 2.9576, |
| "step": 99200 |
| }, |
| { |
| "epoch": 26.709248492678725, |
| "grad_norm": 0.5186119675636292, |
| "learning_rate": 0.00019950585700821325, |
| "loss": 2.9318, |
| "step": 99250 |
| }, |
| { |
| "epoch": 26.722706718346252, |
| "grad_norm": 0.472085177898407, |
| "learning_rate": 0.00019930389120775547, |
| "loss": 2.9396, |
| "step": 99300 |
| }, |
| { |
| "epoch": 26.73616494401378, |
| "grad_norm": 0.4699694514274597, |
| "learning_rate": 0.00019910192540729767, |
| "loss": 2.9471, |
| "step": 99350 |
| }, |
| { |
| "epoch": 26.74962316968131, |
| "grad_norm": 0.4968441128730774, |
| "learning_rate": 0.00019889995960683987, |
| "loss": 2.9501, |
| "step": 99400 |
| }, |
| { |
| "epoch": 26.763081395348838, |
| "grad_norm": 0.4901743233203888, |
| "learning_rate": 0.00019869799380638212, |
| "loss": 2.9584, |
| "step": 99450 |
| }, |
| { |
| "epoch": 26.776539621016365, |
| "grad_norm": 0.4878545105457306, |
| "learning_rate": 0.0001984960280059243, |
| "loss": 2.9686, |
| "step": 99500 |
| }, |
| { |
| "epoch": 26.789997846683892, |
| "grad_norm": 0.48013490438461304, |
| "learning_rate": 0.00019829406220546654, |
| "loss": 2.9544, |
| "step": 99550 |
| }, |
| { |
| "epoch": 26.80345607235142, |
| "grad_norm": 0.5074095726013184, |
| "learning_rate": 0.00019809209640500873, |
| "loss": 2.9607, |
| "step": 99600 |
| }, |
| { |
| "epoch": 26.81691429801895, |
| "grad_norm": 0.4604112505912781, |
| "learning_rate": 0.00019789013060455096, |
| "loss": 2.9542, |
| "step": 99650 |
| }, |
| { |
| "epoch": 26.830372523686478, |
| "grad_norm": 0.45256295800209045, |
| "learning_rate": 0.00019768816480409315, |
| "loss": 2.9589, |
| "step": 99700 |
| }, |
| { |
| "epoch": 26.843830749354005, |
| "grad_norm": 0.479516863822937, |
| "learning_rate": 0.00019748619900363535, |
| "loss": 2.9495, |
| "step": 99750 |
| }, |
| { |
| "epoch": 26.857288975021532, |
| "grad_norm": 0.48909792304039, |
| "learning_rate": 0.00019728423320317757, |
| "loss": 2.9572, |
| "step": 99800 |
| }, |
| { |
| "epoch": 26.87074720068906, |
| "grad_norm": 0.49186989665031433, |
| "learning_rate": 0.00019708226740271977, |
| "loss": 2.9611, |
| "step": 99850 |
| }, |
| { |
| "epoch": 26.88420542635659, |
| "grad_norm": 0.45272672176361084, |
| "learning_rate": 0.00019688030160226202, |
| "loss": 2.9534, |
| "step": 99900 |
| }, |
| { |
| "epoch": 26.897663652024118, |
| "grad_norm": 0.48817697167396545, |
| "learning_rate": 0.0001966783358018042, |
| "loss": 2.9614, |
| "step": 99950 |
| }, |
| { |
| "epoch": 26.911121877691645, |
| "grad_norm": 0.5166176557540894, |
| "learning_rate": 0.00019647637000134644, |
| "loss": 2.9592, |
| "step": 100000 |
| }, |
| { |
| "epoch": 26.911121877691645, |
| "eval_accuracy": 0.39753297758006945, |
| "eval_loss": 3.279358386993408, |
| "eval_runtime": 146.0791, |
| "eval_samples_per_second": 123.303, |
| "eval_steps_per_second": 7.708, |
| "step": 100000 |
| }, |
| { |
| "epoch": 26.924580103359173, |
| "grad_norm": 0.4788917601108551, |
| "learning_rate": 0.00019627440420088863, |
| "loss": 2.9581, |
| "step": 100050 |
| }, |
| { |
| "epoch": 26.9380383290267, |
| "grad_norm": 0.4836042821407318, |
| "learning_rate": 0.00019607243840043086, |
| "loss": 2.9502, |
| "step": 100100 |
| }, |
| { |
| "epoch": 26.95149655469423, |
| "grad_norm": 0.47917941212654114, |
| "learning_rate": 0.00019587047259997305, |
| "loss": 2.9627, |
| "step": 100150 |
| }, |
| { |
| "epoch": 26.964954780361758, |
| "grad_norm": 0.4649355113506317, |
| "learning_rate": 0.00019566850679951525, |
| "loss": 2.9603, |
| "step": 100200 |
| }, |
| { |
| "epoch": 26.978413006029285, |
| "grad_norm": 0.5115824341773987, |
| "learning_rate": 0.00019546654099905747, |
| "loss": 2.956, |
| "step": 100250 |
| }, |
| { |
| "epoch": 26.991871231696813, |
| "grad_norm": 0.511033833026886, |
| "learning_rate": 0.00019526457519859967, |
| "loss": 2.9732, |
| "step": 100300 |
| }, |
| { |
| "epoch": 27.00511412575366, |
| "grad_norm": 0.4819963276386261, |
| "learning_rate": 0.00019506260939814192, |
| "loss": 2.9322, |
| "step": 100350 |
| }, |
| { |
| "epoch": 27.018572351421188, |
| "grad_norm": 0.5109541416168213, |
| "learning_rate": 0.0001948606435976841, |
| "loss": 2.8825, |
| "step": 100400 |
| }, |
| { |
| "epoch": 27.032030577088715, |
| "grad_norm": 0.5299190878868103, |
| "learning_rate": 0.00019465867779722634, |
| "loss": 2.9011, |
| "step": 100450 |
| }, |
| { |
| "epoch": 27.045488802756246, |
| "grad_norm": 0.4947880804538727, |
| "learning_rate": 0.00019445671199676853, |
| "loss": 2.8925, |
| "step": 100500 |
| }, |
| { |
| "epoch": 27.058947028423773, |
| "grad_norm": 0.4906946122646332, |
| "learning_rate": 0.00019425474619631073, |
| "loss": 2.9027, |
| "step": 100550 |
| }, |
| { |
| "epoch": 27.0724052540913, |
| "grad_norm": 0.5048971176147461, |
| "learning_rate": 0.00019405278039585295, |
| "loss": 2.9011, |
| "step": 100600 |
| }, |
| { |
| "epoch": 27.085863479758828, |
| "grad_norm": 0.47121816873550415, |
| "learning_rate": 0.00019385081459539515, |
| "loss": 2.8926, |
| "step": 100650 |
| }, |
| { |
| "epoch": 27.099321705426355, |
| "grad_norm": 0.5003146529197693, |
| "learning_rate": 0.00019364884879493737, |
| "loss": 2.9034, |
| "step": 100700 |
| }, |
| { |
| "epoch": 27.112779931093886, |
| "grad_norm": 0.491941899061203, |
| "learning_rate": 0.00019344688299447957, |
| "loss": 2.9028, |
| "step": 100750 |
| }, |
| { |
| "epoch": 27.126238156761413, |
| "grad_norm": 0.4929443895816803, |
| "learning_rate": 0.00019324491719402182, |
| "loss": 2.8981, |
| "step": 100800 |
| }, |
| { |
| "epoch": 27.13969638242894, |
| "grad_norm": 0.48886409401893616, |
| "learning_rate": 0.00019304295139356401, |
| "loss": 2.9095, |
| "step": 100850 |
| }, |
| { |
| "epoch": 27.153154608096468, |
| "grad_norm": 0.4845869541168213, |
| "learning_rate": 0.0001928409855931062, |
| "loss": 2.8995, |
| "step": 100900 |
| }, |
| { |
| "epoch": 27.166612833763995, |
| "grad_norm": 0.5020434260368347, |
| "learning_rate": 0.00019263901979264843, |
| "loss": 2.9163, |
| "step": 100950 |
| }, |
| { |
| "epoch": 27.180071059431526, |
| "grad_norm": 0.48844480514526367, |
| "learning_rate": 0.00019243705399219063, |
| "loss": 2.9014, |
| "step": 101000 |
| }, |
| { |
| "epoch": 27.180071059431526, |
| "eval_accuracy": 0.39657873224764706, |
| "eval_loss": 3.2880594730377197, |
| "eval_runtime": 146.9823, |
| "eval_samples_per_second": 122.545, |
| "eval_steps_per_second": 7.661, |
| "step": 101000 |
| }, |
| { |
| "epoch": 27.193529285099054, |
| "grad_norm": 0.4601687490940094, |
| "learning_rate": 0.00019223508819173285, |
| "loss": 2.911, |
| "step": 101050 |
| }, |
| { |
| "epoch": 27.20698751076658, |
| "grad_norm": 0.5111984610557556, |
| "learning_rate": 0.00019203312239127505, |
| "loss": 2.9103, |
| "step": 101100 |
| }, |
| { |
| "epoch": 27.220445736434108, |
| "grad_norm": 0.48042234778404236, |
| "learning_rate": 0.00019183115659081727, |
| "loss": 2.918, |
| "step": 101150 |
| }, |
| { |
| "epoch": 27.233903962101635, |
| "grad_norm": 0.48300600051879883, |
| "learning_rate": 0.00019162919079035947, |
| "loss": 2.9079, |
| "step": 101200 |
| }, |
| { |
| "epoch": 27.247362187769163, |
| "grad_norm": 0.48452457785606384, |
| "learning_rate": 0.00019142722498990172, |
| "loss": 2.9142, |
| "step": 101250 |
| }, |
| { |
| "epoch": 27.260820413436694, |
| "grad_norm": 0.49995142221450806, |
| "learning_rate": 0.00019122525918944391, |
| "loss": 2.9201, |
| "step": 101300 |
| }, |
| { |
| "epoch": 27.27427863910422, |
| "grad_norm": 0.5176795721054077, |
| "learning_rate": 0.0001910232933889861, |
| "loss": 2.9096, |
| "step": 101350 |
| }, |
| { |
| "epoch": 27.287736864771748, |
| "grad_norm": 0.5069646239280701, |
| "learning_rate": 0.00019082132758852833, |
| "loss": 2.8984, |
| "step": 101400 |
| }, |
| { |
| "epoch": 27.301195090439276, |
| "grad_norm": 0.46328434348106384, |
| "learning_rate": 0.00019061936178807053, |
| "loss": 2.9239, |
| "step": 101450 |
| }, |
| { |
| "epoch": 27.314653316106803, |
| "grad_norm": 0.521058201789856, |
| "learning_rate": 0.00019041739598761275, |
| "loss": 2.9206, |
| "step": 101500 |
| }, |
| { |
| "epoch": 27.328111541774334, |
| "grad_norm": 0.5169256925582886, |
| "learning_rate": 0.00019021543018715495, |
| "loss": 2.9243, |
| "step": 101550 |
| }, |
| { |
| "epoch": 27.34156976744186, |
| "grad_norm": 0.47673463821411133, |
| "learning_rate": 0.00019001346438669714, |
| "loss": 2.9226, |
| "step": 101600 |
| }, |
| { |
| "epoch": 27.35502799310939, |
| "grad_norm": 0.4859578013420105, |
| "learning_rate": 0.0001898114985862394, |
| "loss": 2.9334, |
| "step": 101650 |
| }, |
| { |
| "epoch": 27.368486218776916, |
| "grad_norm": 0.5149036049842834, |
| "learning_rate": 0.0001896095327857816, |
| "loss": 2.9226, |
| "step": 101700 |
| }, |
| { |
| "epoch": 27.381944444444443, |
| "grad_norm": 0.485603392124176, |
| "learning_rate": 0.00018940756698532381, |
| "loss": 2.9129, |
| "step": 101750 |
| }, |
| { |
| "epoch": 27.395402670111974, |
| "grad_norm": 0.4967415928840637, |
| "learning_rate": 0.000189205601184866, |
| "loss": 2.9217, |
| "step": 101800 |
| }, |
| { |
| "epoch": 27.4088608957795, |
| "grad_norm": 0.48943030834198, |
| "learning_rate": 0.00018900363538440823, |
| "loss": 2.9191, |
| "step": 101850 |
| }, |
| { |
| "epoch": 27.42231912144703, |
| "grad_norm": 0.5117086172103882, |
| "learning_rate": 0.00018880166958395043, |
| "loss": 2.9233, |
| "step": 101900 |
| }, |
| { |
| "epoch": 27.435777347114556, |
| "grad_norm": 0.4888044595718384, |
| "learning_rate": 0.00018859970378349263, |
| "loss": 2.9305, |
| "step": 101950 |
| }, |
| { |
| "epoch": 27.449235572782083, |
| "grad_norm": 0.48958805203437805, |
| "learning_rate": 0.00018839773798303485, |
| "loss": 2.9383, |
| "step": 102000 |
| }, |
| { |
| "epoch": 27.449235572782083, |
| "eval_accuracy": 0.397087090884833, |
| "eval_loss": 3.2870469093322754, |
| "eval_runtime": 146.0558, |
| "eval_samples_per_second": 123.323, |
| "eval_steps_per_second": 7.709, |
| "step": 102000 |
| }, |
| { |
| "epoch": 27.462693798449614, |
| "grad_norm": 0.49132028222084045, |
| "learning_rate": 0.00018819577218257705, |
| "loss": 2.934, |
| "step": 102050 |
| }, |
| { |
| "epoch": 27.47615202411714, |
| "grad_norm": 0.5282468795776367, |
| "learning_rate": 0.0001879938063821193, |
| "loss": 2.9369, |
| "step": 102100 |
| }, |
| { |
| "epoch": 27.48961024978467, |
| "grad_norm": 0.5024771094322205, |
| "learning_rate": 0.0001877918405816615, |
| "loss": 2.9185, |
| "step": 102150 |
| }, |
| { |
| "epoch": 27.503068475452196, |
| "grad_norm": 0.48891547322273254, |
| "learning_rate": 0.00018758987478120371, |
| "loss": 2.9419, |
| "step": 102200 |
| }, |
| { |
| "epoch": 27.516526701119723, |
| "grad_norm": 0.5016714334487915, |
| "learning_rate": 0.0001873879089807459, |
| "loss": 2.9221, |
| "step": 102250 |
| }, |
| { |
| "epoch": 27.529984926787254, |
| "grad_norm": 0.5051071047782898, |
| "learning_rate": 0.00018718594318028813, |
| "loss": 2.9334, |
| "step": 102300 |
| }, |
| { |
| "epoch": 27.54344315245478, |
| "grad_norm": 0.4839610457420349, |
| "learning_rate": 0.00018698397737983033, |
| "loss": 2.9334, |
| "step": 102350 |
| }, |
| { |
| "epoch": 27.55690137812231, |
| "grad_norm": 0.49538129568099976, |
| "learning_rate": 0.00018678201157937253, |
| "loss": 2.9433, |
| "step": 102400 |
| }, |
| { |
| "epoch": 27.570359603789836, |
| "grad_norm": 0.5163812637329102, |
| "learning_rate": 0.00018658004577891475, |
| "loss": 2.9399, |
| "step": 102450 |
| }, |
| { |
| "epoch": 27.583817829457363, |
| "grad_norm": 0.48955875635147095, |
| "learning_rate": 0.00018637807997845695, |
| "loss": 2.938, |
| "step": 102500 |
| }, |
| { |
| "epoch": 27.597276055124894, |
| "grad_norm": 0.48446550965309143, |
| "learning_rate": 0.0001861761141779992, |
| "loss": 2.9378, |
| "step": 102550 |
| }, |
| { |
| "epoch": 27.61073428079242, |
| "grad_norm": 0.48129525780677795, |
| "learning_rate": 0.0001859741483775414, |
| "loss": 2.9373, |
| "step": 102600 |
| }, |
| { |
| "epoch": 27.62419250645995, |
| "grad_norm": 0.4978967308998108, |
| "learning_rate": 0.00018577218257708362, |
| "loss": 2.9394, |
| "step": 102650 |
| }, |
| { |
| "epoch": 27.637650732127476, |
| "grad_norm": 0.5358911156654358, |
| "learning_rate": 0.0001855702167766258, |
| "loss": 2.9379, |
| "step": 102700 |
| }, |
| { |
| "epoch": 27.651108957795003, |
| "grad_norm": 0.48122960329055786, |
| "learning_rate": 0.000185368250976168, |
| "loss": 2.9336, |
| "step": 102750 |
| }, |
| { |
| "epoch": 27.664567183462534, |
| "grad_norm": 0.4696429967880249, |
| "learning_rate": 0.00018516628517571023, |
| "loss": 2.9367, |
| "step": 102800 |
| }, |
| { |
| "epoch": 27.67802540913006, |
| "grad_norm": 0.49047210812568665, |
| "learning_rate": 0.00018496431937525243, |
| "loss": 2.9348, |
| "step": 102850 |
| }, |
| { |
| "epoch": 27.69148363479759, |
| "grad_norm": 0.4979493319988251, |
| "learning_rate": 0.00018476235357479465, |
| "loss": 2.9339, |
| "step": 102900 |
| }, |
| { |
| "epoch": 27.704941860465116, |
| "grad_norm": 0.46877792477607727, |
| "learning_rate": 0.00018456038777433685, |
| "loss": 2.9504, |
| "step": 102950 |
| }, |
| { |
| "epoch": 27.718400086132643, |
| "grad_norm": 0.5015468597412109, |
| "learning_rate": 0.0001843584219738791, |
| "loss": 2.9516, |
| "step": 103000 |
| }, |
| { |
| "epoch": 27.718400086132643, |
| "eval_accuracy": 0.39719171780674156, |
| "eval_loss": 3.282999277114868, |
| "eval_runtime": 146.236, |
| "eval_samples_per_second": 123.171, |
| "eval_steps_per_second": 7.7, |
| "step": 103000 |
| }, |
| { |
| "epoch": 27.731858311800174, |
| "grad_norm": 0.4905368685722351, |
| "learning_rate": 0.0001841564561734213, |
| "loss": 2.9381, |
| "step": 103050 |
| }, |
| { |
| "epoch": 27.7453165374677, |
| "grad_norm": 0.4895152747631073, |
| "learning_rate": 0.00018395449037296352, |
| "loss": 2.9425, |
| "step": 103100 |
| }, |
| { |
| "epoch": 27.75877476313523, |
| "grad_norm": 0.4837099313735962, |
| "learning_rate": 0.0001837525245725057, |
| "loss": 2.9406, |
| "step": 103150 |
| }, |
| { |
| "epoch": 27.772232988802756, |
| "grad_norm": 0.48197370767593384, |
| "learning_rate": 0.0001835505587720479, |
| "loss": 2.9444, |
| "step": 103200 |
| }, |
| { |
| "epoch": 27.785691214470283, |
| "grad_norm": 0.5033994913101196, |
| "learning_rate": 0.00018334859297159013, |
| "loss": 2.9319, |
| "step": 103250 |
| }, |
| { |
| "epoch": 27.79914944013781, |
| "grad_norm": 0.5263758301734924, |
| "learning_rate": 0.00018314662717113233, |
| "loss": 2.9547, |
| "step": 103300 |
| }, |
| { |
| "epoch": 27.81260766580534, |
| "grad_norm": 0.47729551792144775, |
| "learning_rate": 0.00018294466137067455, |
| "loss": 2.9482, |
| "step": 103350 |
| }, |
| { |
| "epoch": 27.82606589147287, |
| "grad_norm": 0.545293927192688, |
| "learning_rate": 0.00018274269557021675, |
| "loss": 2.9654, |
| "step": 103400 |
| }, |
| { |
| "epoch": 27.839524117140396, |
| "grad_norm": 0.49454832077026367, |
| "learning_rate": 0.000182540729769759, |
| "loss": 2.9512, |
| "step": 103450 |
| }, |
| { |
| "epoch": 27.852982342807923, |
| "grad_norm": 0.4755057394504547, |
| "learning_rate": 0.0001823387639693012, |
| "loss": 2.9534, |
| "step": 103500 |
| }, |
| { |
| "epoch": 27.86644056847545, |
| "grad_norm": 0.49803298711776733, |
| "learning_rate": 0.0001821367981688434, |
| "loss": 2.9539, |
| "step": 103550 |
| }, |
| { |
| "epoch": 27.87989879414298, |
| "grad_norm": 0.5047042369842529, |
| "learning_rate": 0.0001819348323683856, |
| "loss": 2.9482, |
| "step": 103600 |
| }, |
| { |
| "epoch": 27.89335701981051, |
| "grad_norm": 0.4869590997695923, |
| "learning_rate": 0.0001817328665679278, |
| "loss": 2.9538, |
| "step": 103650 |
| }, |
| { |
| "epoch": 27.906815245478036, |
| "grad_norm": 0.498722642660141, |
| "learning_rate": 0.00018153090076747003, |
| "loss": 2.9507, |
| "step": 103700 |
| }, |
| { |
| "epoch": 27.920273471145563, |
| "grad_norm": 0.5139634013175964, |
| "learning_rate": 0.00018132893496701223, |
| "loss": 2.9621, |
| "step": 103750 |
| }, |
| { |
| "epoch": 27.93373169681309, |
| "grad_norm": 0.5020641088485718, |
| "learning_rate": 0.00018112696916655442, |
| "loss": 2.9514, |
| "step": 103800 |
| }, |
| { |
| "epoch": 27.94718992248062, |
| "grad_norm": 0.48636969923973083, |
| "learning_rate": 0.00018092500336609665, |
| "loss": 2.9596, |
| "step": 103850 |
| }, |
| { |
| "epoch": 27.96064814814815, |
| "grad_norm": 0.48579445481300354, |
| "learning_rate": 0.0001807230375656389, |
| "loss": 2.948, |
| "step": 103900 |
| }, |
| { |
| "epoch": 27.974106373815676, |
| "grad_norm": 0.4894184470176697, |
| "learning_rate": 0.0001805210717651811, |
| "loss": 2.9485, |
| "step": 103950 |
| }, |
| { |
| "epoch": 27.987564599483203, |
| "grad_norm": 0.4811153709888458, |
| "learning_rate": 0.0001803191059647233, |
| "loss": 2.9534, |
| "step": 104000 |
| }, |
| { |
| "epoch": 27.987564599483203, |
| "eval_accuracy": 0.39759164688207427, |
| "eval_loss": 3.276357889175415, |
| "eval_runtime": 147.4517, |
| "eval_samples_per_second": 122.155, |
| "eval_steps_per_second": 7.636, |
| "step": 104000 |
| }, |
| { |
| "epoch": 28.00080749354005, |
| "grad_norm": 0.5266789793968201, |
| "learning_rate": 0.0001801171401642655, |
| "loss": 2.9475, |
| "step": 104050 |
| }, |
| { |
| "epoch": 28.01426571920758, |
| "grad_norm": 0.49525949358940125, |
| "learning_rate": 0.0001799151743638077, |
| "loss": 2.87, |
| "step": 104100 |
| }, |
| { |
| "epoch": 28.027723944875106, |
| "grad_norm": 0.5010724663734436, |
| "learning_rate": 0.00017971320856334993, |
| "loss": 2.8858, |
| "step": 104150 |
| }, |
| { |
| "epoch": 28.041182170542637, |
| "grad_norm": 0.48668617010116577, |
| "learning_rate": 0.00017951124276289213, |
| "loss": 2.8767, |
| "step": 104200 |
| }, |
| { |
| "epoch": 28.054640396210164, |
| "grad_norm": 0.48478174209594727, |
| "learning_rate": 0.00017930927696243432, |
| "loss": 2.8777, |
| "step": 104250 |
| }, |
| { |
| "epoch": 28.06809862187769, |
| "grad_norm": 0.4890100359916687, |
| "learning_rate": 0.00017910731116197655, |
| "loss": 2.8855, |
| "step": 104300 |
| }, |
| { |
| "epoch": 28.08155684754522, |
| "grad_norm": 0.48516470193862915, |
| "learning_rate": 0.00017890534536151877, |
| "loss": 2.8959, |
| "step": 104350 |
| }, |
| { |
| "epoch": 28.095015073212746, |
| "grad_norm": 0.5097112655639648, |
| "learning_rate": 0.000178703379561061, |
| "loss": 2.8993, |
| "step": 104400 |
| }, |
| { |
| "epoch": 28.108473298880277, |
| "grad_norm": 0.49391499161720276, |
| "learning_rate": 0.0001785014137606032, |
| "loss": 2.8922, |
| "step": 104450 |
| }, |
| { |
| "epoch": 28.121931524547804, |
| "grad_norm": 0.5124300122261047, |
| "learning_rate": 0.0001782994479601454, |
| "loss": 2.9033, |
| "step": 104500 |
| }, |
| { |
| "epoch": 28.13538975021533, |
| "grad_norm": 0.5130902528762817, |
| "learning_rate": 0.0001780974821596876, |
| "loss": 2.895, |
| "step": 104550 |
| }, |
| { |
| "epoch": 28.14884797588286, |
| "grad_norm": 0.4990008473396301, |
| "learning_rate": 0.0001778955163592298, |
| "loss": 2.8929, |
| "step": 104600 |
| }, |
| { |
| "epoch": 28.162306201550386, |
| "grad_norm": 0.530022144317627, |
| "learning_rate": 0.00017769355055877203, |
| "loss": 2.8976, |
| "step": 104650 |
| }, |
| { |
| "epoch": 28.175764427217917, |
| "grad_norm": 0.45357492566108704, |
| "learning_rate": 0.00017749158475831422, |
| "loss": 2.8898, |
| "step": 104700 |
| }, |
| { |
| "epoch": 28.189222652885444, |
| "grad_norm": 0.5224539637565613, |
| "learning_rate": 0.00017728961895785645, |
| "loss": 2.8979, |
| "step": 104750 |
| }, |
| { |
| "epoch": 28.20268087855297, |
| "grad_norm": 0.5053698420524597, |
| "learning_rate": 0.00017708765315739867, |
| "loss": 2.9074, |
| "step": 104800 |
| }, |
| { |
| "epoch": 28.2161391042205, |
| "grad_norm": 0.49665287137031555, |
| "learning_rate": 0.0001768856873569409, |
| "loss": 2.8893, |
| "step": 104850 |
| }, |
| { |
| "epoch": 28.229597329888026, |
| "grad_norm": 0.5174131989479065, |
| "learning_rate": 0.0001766837215564831, |
| "loss": 2.9079, |
| "step": 104900 |
| }, |
| { |
| "epoch": 28.243055555555557, |
| "grad_norm": 0.482637882232666, |
| "learning_rate": 0.0001764817557560253, |
| "loss": 2.9022, |
| "step": 104950 |
| }, |
| { |
| "epoch": 28.256513781223084, |
| "grad_norm": 0.5195315480232239, |
| "learning_rate": 0.0001762797899555675, |
| "loss": 2.9109, |
| "step": 105000 |
| }, |
| { |
| "epoch": 28.256513781223084, |
| "eval_accuracy": 0.3970951307521448, |
| "eval_loss": 3.2897088527679443, |
| "eval_runtime": 147.4221, |
| "eval_samples_per_second": 122.18, |
| "eval_steps_per_second": 7.638, |
| "step": 105000 |
| }, |
| { |
| "epoch": 28.26997200689061, |
| "grad_norm": 0.5179916024208069, |
| "learning_rate": 0.0001760778241551097, |
| "loss": 2.9069, |
| "step": 105050 |
| }, |
| { |
| "epoch": 28.28343023255814, |
| "grad_norm": 0.4780939519405365, |
| "learning_rate": 0.00017587585835465193, |
| "loss": 2.9143, |
| "step": 105100 |
| }, |
| { |
| "epoch": 28.296888458225666, |
| "grad_norm": 0.5064705610275269, |
| "learning_rate": 0.00017567389255419412, |
| "loss": 2.8949, |
| "step": 105150 |
| }, |
| { |
| "epoch": 28.310346683893197, |
| "grad_norm": 0.493682861328125, |
| "learning_rate": 0.00017547192675373635, |
| "loss": 2.8993, |
| "step": 105200 |
| }, |
| { |
| "epoch": 28.323804909560724, |
| "grad_norm": 0.5155807137489319, |
| "learning_rate": 0.00017526996095327857, |
| "loss": 2.921, |
| "step": 105250 |
| }, |
| { |
| "epoch": 28.337263135228252, |
| "grad_norm": 0.5077420473098755, |
| "learning_rate": 0.0001750679951528208, |
| "loss": 2.9144, |
| "step": 105300 |
| }, |
| { |
| "epoch": 28.35072136089578, |
| "grad_norm": 0.4884844422340393, |
| "learning_rate": 0.000174866029352363, |
| "loss": 2.9225, |
| "step": 105350 |
| }, |
| { |
| "epoch": 28.364179586563306, |
| "grad_norm": 0.49359777569770813, |
| "learning_rate": 0.0001746640635519052, |
| "loss": 2.9095, |
| "step": 105400 |
| }, |
| { |
| "epoch": 28.377637812230837, |
| "grad_norm": 0.5174322724342346, |
| "learning_rate": 0.0001744620977514474, |
| "loss": 2.9265, |
| "step": 105450 |
| }, |
| { |
| "epoch": 28.391096037898365, |
| "grad_norm": 0.49454420804977417, |
| "learning_rate": 0.0001742601319509896, |
| "loss": 2.9277, |
| "step": 105500 |
| }, |
| { |
| "epoch": 28.404554263565892, |
| "grad_norm": 0.5095421075820923, |
| "learning_rate": 0.00017405816615053183, |
| "loss": 2.9204, |
| "step": 105550 |
| }, |
| { |
| "epoch": 28.41801248923342, |
| "grad_norm": 0.527397632598877, |
| "learning_rate": 0.00017385620035007403, |
| "loss": 2.9252, |
| "step": 105600 |
| }, |
| { |
| "epoch": 28.431470714900946, |
| "grad_norm": 0.4911178648471832, |
| "learning_rate": 0.00017365423454961622, |
| "loss": 2.9129, |
| "step": 105650 |
| }, |
| { |
| "epoch": 28.444928940568474, |
| "grad_norm": 0.5081452131271362, |
| "learning_rate": 0.00017345226874915847, |
| "loss": 2.922, |
| "step": 105700 |
| }, |
| { |
| "epoch": 28.458387166236005, |
| "grad_norm": 0.5088328123092651, |
| "learning_rate": 0.00017325030294870067, |
| "loss": 2.9274, |
| "step": 105750 |
| }, |
| { |
| "epoch": 28.471845391903532, |
| "grad_norm": 0.4922160804271698, |
| "learning_rate": 0.0001730483371482429, |
| "loss": 2.9178, |
| "step": 105800 |
| }, |
| { |
| "epoch": 28.48530361757106, |
| "grad_norm": 0.5186408162117004, |
| "learning_rate": 0.0001728463713477851, |
| "loss": 2.9271, |
| "step": 105850 |
| }, |
| { |
| "epoch": 28.498761843238587, |
| "grad_norm": 0.5223609209060669, |
| "learning_rate": 0.0001726444055473273, |
| "loss": 2.9204, |
| "step": 105900 |
| }, |
| { |
| "epoch": 28.512220068906114, |
| "grad_norm": 0.5327643752098083, |
| "learning_rate": 0.0001724424397468695, |
| "loss": 2.9196, |
| "step": 105950 |
| }, |
| { |
| "epoch": 28.525678294573645, |
| "grad_norm": 0.5054581761360168, |
| "learning_rate": 0.00017224047394641173, |
| "loss": 2.9167, |
| "step": 106000 |
| }, |
| { |
| "epoch": 28.525678294573645, |
| "eval_accuracy": 0.39750092675767795, |
| "eval_loss": 3.2850351333618164, |
| "eval_runtime": 146.736, |
| "eval_samples_per_second": 122.751, |
| "eval_steps_per_second": 7.674, |
| "step": 106000 |
| }, |
| { |
| "epoch": 28.539136520241172, |
| "grad_norm": 0.5137544870376587, |
| "learning_rate": 0.00017203850814595393, |
| "loss": 2.9188, |
| "step": 106050 |
| }, |
| { |
| "epoch": 28.5525947459087, |
| "grad_norm": 0.5124856233596802, |
| "learning_rate": 0.00017183654234549612, |
| "loss": 2.9283, |
| "step": 106100 |
| }, |
| { |
| "epoch": 28.566052971576227, |
| "grad_norm": 0.5131354928016663, |
| "learning_rate": 0.00017163457654503837, |
| "loss": 2.9297, |
| "step": 106150 |
| }, |
| { |
| "epoch": 28.579511197243754, |
| "grad_norm": 0.5123969912528992, |
| "learning_rate": 0.00017143261074458057, |
| "loss": 2.9228, |
| "step": 106200 |
| }, |
| { |
| "epoch": 28.592969422911285, |
| "grad_norm": 0.5282868146896362, |
| "learning_rate": 0.0001712306449441228, |
| "loss": 2.9372, |
| "step": 106250 |
| }, |
| { |
| "epoch": 28.606427648578812, |
| "grad_norm": 0.5144837498664856, |
| "learning_rate": 0.000171028679143665, |
| "loss": 2.9326, |
| "step": 106300 |
| }, |
| { |
| "epoch": 28.61988587424634, |
| "grad_norm": 0.5485707521438599, |
| "learning_rate": 0.0001708267133432072, |
| "loss": 2.9339, |
| "step": 106350 |
| }, |
| { |
| "epoch": 28.633344099913867, |
| "grad_norm": 0.5258968472480774, |
| "learning_rate": 0.0001706247475427494, |
| "loss": 2.9307, |
| "step": 106400 |
| }, |
| { |
| "epoch": 28.646802325581394, |
| "grad_norm": 0.49330660700798035, |
| "learning_rate": 0.0001704227817422916, |
| "loss": 2.9332, |
| "step": 106450 |
| }, |
| { |
| "epoch": 28.660260551248925, |
| "grad_norm": 0.4907183051109314, |
| "learning_rate": 0.00017022081594183383, |
| "loss": 2.9261, |
| "step": 106500 |
| }, |
| { |
| "epoch": 28.673718776916452, |
| "grad_norm": 0.496756374835968, |
| "learning_rate": 0.00017001885014137605, |
| "loss": 2.9219, |
| "step": 106550 |
| }, |
| { |
| "epoch": 28.68717700258398, |
| "grad_norm": 0.5075603723526001, |
| "learning_rate": 0.00016981688434091827, |
| "loss": 2.928, |
| "step": 106600 |
| }, |
| { |
| "epoch": 28.700635228251507, |
| "grad_norm": 0.4845956861972809, |
| "learning_rate": 0.00016961491854046047, |
| "loss": 2.9293, |
| "step": 106650 |
| }, |
| { |
| "epoch": 28.714093453919034, |
| "grad_norm": 0.5165157914161682, |
| "learning_rate": 0.0001694129527400027, |
| "loss": 2.9381, |
| "step": 106700 |
| }, |
| { |
| "epoch": 28.727551679586565, |
| "grad_norm": 0.539211094379425, |
| "learning_rate": 0.0001692109869395449, |
| "loss": 2.9374, |
| "step": 106750 |
| }, |
| { |
| "epoch": 28.741009905254092, |
| "grad_norm": 0.5208998322486877, |
| "learning_rate": 0.00016900902113908708, |
| "loss": 2.9233, |
| "step": 106800 |
| }, |
| { |
| "epoch": 28.75446813092162, |
| "grad_norm": 0.5117523670196533, |
| "learning_rate": 0.0001688070553386293, |
| "loss": 2.9272, |
| "step": 106850 |
| }, |
| { |
| "epoch": 28.767926356589147, |
| "grad_norm": 0.48024994134902954, |
| "learning_rate": 0.0001686050895381715, |
| "loss": 2.9297, |
| "step": 106900 |
| }, |
| { |
| "epoch": 28.781384582256674, |
| "grad_norm": 0.5294599533081055, |
| "learning_rate": 0.00016840312373771373, |
| "loss": 2.9464, |
| "step": 106950 |
| }, |
| { |
| "epoch": 28.794842807924205, |
| "grad_norm": 0.49581241607666016, |
| "learning_rate": 0.00016820115793725595, |
| "loss": 2.9433, |
| "step": 107000 |
| }, |
| { |
| "epoch": 28.794842807924205, |
| "eval_accuracy": 0.39789781372105487, |
| "eval_loss": 3.2776732444763184, |
| "eval_runtime": 146.8371, |
| "eval_samples_per_second": 122.667, |
| "eval_steps_per_second": 7.668, |
| "step": 107000 |
| }, |
| { |
| "epoch": 28.808301033591732, |
| "grad_norm": 0.4745948612689972, |
| "learning_rate": 0.00016799919213679817, |
| "loss": 2.9495, |
| "step": 107050 |
| }, |
| { |
| "epoch": 28.82175925925926, |
| "grad_norm": 0.5273579955101013, |
| "learning_rate": 0.00016779722633634037, |
| "loss": 2.9345, |
| "step": 107100 |
| }, |
| { |
| "epoch": 28.835217484926787, |
| "grad_norm": 0.5143641829490662, |
| "learning_rate": 0.0001675952605358826, |
| "loss": 2.9346, |
| "step": 107150 |
| }, |
| { |
| "epoch": 28.848675710594314, |
| "grad_norm": 0.4993511438369751, |
| "learning_rate": 0.0001673932947354248, |
| "loss": 2.9314, |
| "step": 107200 |
| }, |
| { |
| "epoch": 28.86213393626184, |
| "grad_norm": 0.5145397782325745, |
| "learning_rate": 0.00016719132893496698, |
| "loss": 2.943, |
| "step": 107250 |
| }, |
| { |
| "epoch": 28.875592161929372, |
| "grad_norm": 0.5847262740135193, |
| "learning_rate": 0.0001669893631345092, |
| "loss": 2.939, |
| "step": 107300 |
| }, |
| { |
| "epoch": 28.8890503875969, |
| "grad_norm": 0.5064164400100708, |
| "learning_rate": 0.0001667873973340514, |
| "loss": 2.9254, |
| "step": 107350 |
| }, |
| { |
| "epoch": 28.902508613264427, |
| "grad_norm": 0.5213847160339355, |
| "learning_rate": 0.00016658543153359363, |
| "loss": 2.9416, |
| "step": 107400 |
| }, |
| { |
| "epoch": 28.915966838931954, |
| "grad_norm": 0.5266076922416687, |
| "learning_rate": 0.00016638346573313585, |
| "loss": 2.9345, |
| "step": 107450 |
| }, |
| { |
| "epoch": 28.92942506459948, |
| "grad_norm": 0.5118159055709839, |
| "learning_rate": 0.00016618149993267807, |
| "loss": 2.9467, |
| "step": 107500 |
| }, |
| { |
| "epoch": 28.942883290267012, |
| "grad_norm": 0.5127436518669128, |
| "learning_rate": 0.00016597953413222027, |
| "loss": 2.9452, |
| "step": 107550 |
| }, |
| { |
| "epoch": 28.95634151593454, |
| "grad_norm": 0.5206359028816223, |
| "learning_rate": 0.00016577756833176247, |
| "loss": 2.9348, |
| "step": 107600 |
| }, |
| { |
| "epoch": 28.969799741602067, |
| "grad_norm": 0.5183207392692566, |
| "learning_rate": 0.0001655756025313047, |
| "loss": 2.9361, |
| "step": 107650 |
| }, |
| { |
| "epoch": 28.983257967269594, |
| "grad_norm": 0.489900141954422, |
| "learning_rate": 0.00016537363673084688, |
| "loss": 2.9478, |
| "step": 107700 |
| }, |
| { |
| "epoch": 28.99671619293712, |
| "grad_norm": 0.5312089323997498, |
| "learning_rate": 0.0001651716709303891, |
| "loss": 2.953, |
| "step": 107750 |
| }, |
| { |
| "epoch": 29.00995908699397, |
| "grad_norm": 0.48560231924057007, |
| "learning_rate": 0.0001649697051299313, |
| "loss": 2.8764, |
| "step": 107800 |
| }, |
| { |
| "epoch": 29.023417312661497, |
| "grad_norm": 0.5327926874160767, |
| "learning_rate": 0.0001647677393294735, |
| "loss": 2.8725, |
| "step": 107850 |
| }, |
| { |
| "epoch": 29.036875538329028, |
| "grad_norm": 0.48088765144348145, |
| "learning_rate": 0.00016456577352901575, |
| "loss": 2.8643, |
| "step": 107900 |
| }, |
| { |
| "epoch": 29.050333763996555, |
| "grad_norm": 0.5463573932647705, |
| "learning_rate": 0.00016436380772855797, |
| "loss": 2.8722, |
| "step": 107950 |
| }, |
| { |
| "epoch": 29.063791989664082, |
| "grad_norm": 0.5013686418533325, |
| "learning_rate": 0.00016416184192810017, |
| "loss": 2.8714, |
| "step": 108000 |
| }, |
| { |
| "epoch": 29.063791989664082, |
| "eval_accuracy": 0.39687685921931587, |
| "eval_loss": 3.2901217937469482, |
| "eval_runtime": 146.7973, |
| "eval_samples_per_second": 122.7, |
| "eval_steps_per_second": 7.67, |
| "step": 108000 |
| }, |
| { |
| "epoch": 29.07725021533161, |
| "grad_norm": 0.5061647891998291, |
| "learning_rate": 0.00016395987612764237, |
| "loss": 2.8791, |
| "step": 108050 |
| }, |
| { |
| "epoch": 29.090708440999137, |
| "grad_norm": 0.5251488089561462, |
| "learning_rate": 0.0001637579103271846, |
| "loss": 2.8695, |
| "step": 108100 |
| }, |
| { |
| "epoch": 29.104166666666668, |
| "grad_norm": 0.5041511058807373, |
| "learning_rate": 0.00016355594452672679, |
| "loss": 2.8829, |
| "step": 108150 |
| }, |
| { |
| "epoch": 29.117624892334195, |
| "grad_norm": 0.49148404598236084, |
| "learning_rate": 0.000163353978726269, |
| "loss": 2.8866, |
| "step": 108200 |
| }, |
| { |
| "epoch": 29.131083118001722, |
| "grad_norm": 0.5406279563903809, |
| "learning_rate": 0.0001631520129258112, |
| "loss": 2.8911, |
| "step": 108250 |
| }, |
| { |
| "epoch": 29.14454134366925, |
| "grad_norm": 0.5385282039642334, |
| "learning_rate": 0.0001629500471253534, |
| "loss": 2.8916, |
| "step": 108300 |
| }, |
| { |
| "epoch": 29.157999569336777, |
| "grad_norm": 0.5390298366546631, |
| "learning_rate": 0.00016274808132489565, |
| "loss": 2.8827, |
| "step": 108350 |
| }, |
| { |
| "epoch": 29.171457795004308, |
| "grad_norm": 0.5405099391937256, |
| "learning_rate": 0.00016254611552443785, |
| "loss": 2.8937, |
| "step": 108400 |
| }, |
| { |
| "epoch": 29.184916020671835, |
| "grad_norm": 0.5607936382293701, |
| "learning_rate": 0.00016234414972398007, |
| "loss": 2.8967, |
| "step": 108450 |
| }, |
| { |
| "epoch": 29.198374246339363, |
| "grad_norm": 0.5277544856071472, |
| "learning_rate": 0.00016214218392352227, |
| "loss": 2.913, |
| "step": 108500 |
| }, |
| { |
| "epoch": 29.21183247200689, |
| "grad_norm": 0.5103113055229187, |
| "learning_rate": 0.0001619402181230645, |
| "loss": 2.904, |
| "step": 108550 |
| }, |
| { |
| "epoch": 29.225290697674417, |
| "grad_norm": 0.5134518146514893, |
| "learning_rate": 0.00016173825232260669, |
| "loss": 2.8923, |
| "step": 108600 |
| }, |
| { |
| "epoch": 29.238748923341948, |
| "grad_norm": 0.5226130485534668, |
| "learning_rate": 0.00016153628652214888, |
| "loss": 2.8982, |
| "step": 108650 |
| }, |
| { |
| "epoch": 29.252207149009475, |
| "grad_norm": 0.5704584121704102, |
| "learning_rate": 0.0001613343207216911, |
| "loss": 2.8925, |
| "step": 108700 |
| }, |
| { |
| "epoch": 29.265665374677003, |
| "grad_norm": 0.4942656457424164, |
| "learning_rate": 0.0001611323549212333, |
| "loss": 2.8977, |
| "step": 108750 |
| }, |
| { |
| "epoch": 29.27912360034453, |
| "grad_norm": 0.5402054190635681, |
| "learning_rate": 0.00016093038912077555, |
| "loss": 2.9063, |
| "step": 108800 |
| }, |
| { |
| "epoch": 29.292581826012057, |
| "grad_norm": 0.5127511620521545, |
| "learning_rate": 0.00016072842332031775, |
| "loss": 2.9033, |
| "step": 108850 |
| }, |
| { |
| "epoch": 29.306040051679588, |
| "grad_norm": 0.529906690120697, |
| "learning_rate": 0.00016052645751985997, |
| "loss": 2.8883, |
| "step": 108900 |
| }, |
| { |
| "epoch": 29.319498277347115, |
| "grad_norm": 0.5000776648521423, |
| "learning_rate": 0.00016032449171940217, |
| "loss": 2.896, |
| "step": 108950 |
| }, |
| { |
| "epoch": 29.332956503014643, |
| "grad_norm": 0.5496556162834167, |
| "learning_rate": 0.0001601225259189444, |
| "loss": 2.897, |
| "step": 109000 |
| }, |
| { |
| "epoch": 29.332956503014643, |
| "eval_accuracy": 0.3971406537846263, |
| "eval_loss": 3.2881126403808594, |
| "eval_runtime": 146.7304, |
| "eval_samples_per_second": 122.756, |
| "eval_steps_per_second": 7.674, |
| "step": 109000 |
| }, |
| { |
| "epoch": 29.34641472868217, |
| "grad_norm": 0.4822516441345215, |
| "learning_rate": 0.00015992056011848659, |
| "loss": 2.916, |
| "step": 109050 |
| }, |
| { |
| "epoch": 29.359872954349697, |
| "grad_norm": 0.5103018283843994, |
| "learning_rate": 0.00015971859431802878, |
| "loss": 2.9114, |
| "step": 109100 |
| }, |
| { |
| "epoch": 29.373331180017228, |
| "grad_norm": 0.5238035917282104, |
| "learning_rate": 0.000159516628517571, |
| "loss": 2.906, |
| "step": 109150 |
| }, |
| { |
| "epoch": 29.386789405684755, |
| "grad_norm": 1.0078877210617065, |
| "learning_rate": 0.0001593146627171132, |
| "loss": 2.9091, |
| "step": 109200 |
| }, |
| { |
| "epoch": 29.400247631352283, |
| "grad_norm": 0.5114656090736389, |
| "learning_rate": 0.00015911269691665545, |
| "loss": 2.908, |
| "step": 109250 |
| }, |
| { |
| "epoch": 29.41370585701981, |
| "grad_norm": 0.4911574423313141, |
| "learning_rate": 0.00015891073111619765, |
| "loss": 2.9068, |
| "step": 109300 |
| }, |
| { |
| "epoch": 29.427164082687337, |
| "grad_norm": 0.5115836262702942, |
| "learning_rate": 0.00015870876531573987, |
| "loss": 2.9027, |
| "step": 109350 |
| }, |
| { |
| "epoch": 29.440622308354868, |
| "grad_norm": 0.538198709487915, |
| "learning_rate": 0.00015850679951528207, |
| "loss": 2.9203, |
| "step": 109400 |
| }, |
| { |
| "epoch": 29.454080534022395, |
| "grad_norm": 0.5085831880569458, |
| "learning_rate": 0.00015830483371482426, |
| "loss": 2.9104, |
| "step": 109450 |
| }, |
| { |
| "epoch": 29.467538759689923, |
| "grad_norm": 0.503837525844574, |
| "learning_rate": 0.00015810286791436649, |
| "loss": 2.9144, |
| "step": 109500 |
| }, |
| { |
| "epoch": 29.48099698535745, |
| "grad_norm": 0.5280073285102844, |
| "learning_rate": 0.00015790090211390868, |
| "loss": 2.9088, |
| "step": 109550 |
| }, |
| { |
| "epoch": 29.494455211024977, |
| "grad_norm": 0.5246075391769409, |
| "learning_rate": 0.0001576989363134509, |
| "loss": 2.9121, |
| "step": 109600 |
| }, |
| { |
| "epoch": 29.507913436692505, |
| "grad_norm": 0.5014354586601257, |
| "learning_rate": 0.0001574969705129931, |
| "loss": 2.911, |
| "step": 109650 |
| }, |
| { |
| "epoch": 29.521371662360036, |
| "grad_norm": 0.5218913555145264, |
| "learning_rate": 0.00015729500471253535, |
| "loss": 2.9129, |
| "step": 109700 |
| }, |
| { |
| "epoch": 29.534829888027563, |
| "grad_norm": 0.5031024217605591, |
| "learning_rate": 0.00015709303891207755, |
| "loss": 2.9214, |
| "step": 109750 |
| }, |
| { |
| "epoch": 29.54828811369509, |
| "grad_norm": 0.5340117812156677, |
| "learning_rate": 0.00015689107311161977, |
| "loss": 2.9132, |
| "step": 109800 |
| }, |
| { |
| "epoch": 29.561746339362617, |
| "grad_norm": 0.5272226929664612, |
| "learning_rate": 0.00015668910731116197, |
| "loss": 2.9138, |
| "step": 109850 |
| }, |
| { |
| "epoch": 29.575204565030145, |
| "grad_norm": 0.522051215171814, |
| "learning_rate": 0.00015648714151070416, |
| "loss": 2.9115, |
| "step": 109900 |
| }, |
| { |
| "epoch": 29.588662790697676, |
| "grad_norm": 0.5091174244880676, |
| "learning_rate": 0.0001562851757102464, |
| "loss": 2.909, |
| "step": 109950 |
| }, |
| { |
| "epoch": 29.602121016365203, |
| "grad_norm": 0.5172711610794067, |
| "learning_rate": 0.00015608320990978858, |
| "loss": 2.9142, |
| "step": 110000 |
| }, |
| { |
| "epoch": 29.602121016365203, |
| "eval_accuracy": 0.39755785770999374, |
| "eval_loss": 3.281320571899414, |
| "eval_runtime": 146.6744, |
| "eval_samples_per_second": 122.803, |
| "eval_steps_per_second": 7.677, |
| "step": 110000 |
| }, |
| { |
| "epoch": 29.602121016365203, |
| "step": 110000, |
| "total_flos": 2.29889824948224e+18, |
| "train_loss": 0.5332863204956054, |
| "train_runtime": 28956.9492, |
| "train_samples_per_second": 410.561, |
| "train_steps_per_second": 5.133 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 148640, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 40, |
| "save_steps": 10000, |
| "stateful_callbacks": { |
| "EarlyStoppingCallback": { |
| "args": { |
| "early_stopping_patience": 20, |
| "early_stopping_threshold": 0.0 |
| }, |
| "attributes": { |
| "early_stopping_patience_counter": 20 |
| } |
| }, |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.29889824948224e+18, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|