diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,18233 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1985.4716981132076, + "eval_steps": 500, + "global_step": 52000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.7547169811320755, + "grad_norm": 30.673773492701116, + "learning_rate": 3.2000000000000005e-05, + "loss": 9.2984, + "step": 20 + }, + { + "epoch": 1.509433962264151, + "grad_norm": 14.373295694588162, + "learning_rate": 6.400000000000001e-05, + "loss": 7.8844, + "step": 40 + }, + { + "epoch": 2.2641509433962264, + "grad_norm": 19.800138134391982, + "learning_rate": 7.99999818716091e-05, + "loss": 6.0394, + "step": 60 + }, + { + "epoch": 3.018867924528302, + "grad_norm": 2.662186456025038, + "learning_rate": 7.9999836844587e-05, + "loss": 4.373, + "step": 80 + }, + { + "epoch": 3.7735849056603774, + "grad_norm": 2.9944135324779984, + "learning_rate": 7.999954679110365e-05, + "loss": 3.93, + "step": 100 + }, + { + "epoch": 4.528301886792453, + "grad_norm": 6.144462097933044, + "learning_rate": 7.999911171228081e-05, + "loss": 3.8796, + "step": 120 + }, + { + "epoch": 5.283018867924528, + "grad_norm": 1.885170861907592, + "learning_rate": 7.999853160980113e-05, + "loss": 3.7437, + "step": 140 + }, + { + "epoch": 6.037735849056604, + "grad_norm": 1.794246585785847, + "learning_rate": 7.999780648590806e-05, + "loss": 3.6892, + "step": 160 + }, + { + "epoch": 6.7924528301886795, + "grad_norm": 12.766182271547313, + "learning_rate": 7.999693634340594e-05, + "loss": 3.6124, + "step": 180 + }, + { + "epoch": 7.547169811320755, + "grad_norm": 3.014805105995273, + "learning_rate": 7.999592118565996e-05, + "loss": 3.4224, + "step": 200 + }, + { + "epoch": 8.30188679245283, + "grad_norm": 1.6809726247876995, + "learning_rate": 7.999476101659613e-05, + "loss": 3.2811, + "step": 220 + }, + { + "epoch": 9.056603773584905, + "grad_norm": 1.7074366581291722, + "learning_rate": 7.999345584070125e-05, + "loss": 3.2012, + "step": 240 + }, + { + "epoch": 9.81132075471698, + "grad_norm": 1.6971425854714368, + "learning_rate": 7.999200566302298e-05, + "loss": 3.1281, + "step": 260 + }, + { + "epoch": 10.566037735849056, + "grad_norm": 2.2456748845663155, + "learning_rate": 7.999041048916969e-05, + "loss": 3.1133, + "step": 280 + }, + { + "epoch": 11.320754716981131, + "grad_norm": 2.301731796932263, + "learning_rate": 7.998867032531055e-05, + "loss": 3.0835, + "step": 300 + }, + { + "epoch": 12.075471698113208, + "grad_norm": 1.9076791695070954, + "learning_rate": 7.998678517817546e-05, + "loss": 3.0189, + "step": 320 + }, + { + "epoch": 12.830188679245284, + "grad_norm": 1.8142406149549974, + "learning_rate": 7.9984755055055e-05, + "loss": 3.0019, + "step": 340 + }, + { + "epoch": 13.584905660377359, + "grad_norm": 1.747017241682793, + "learning_rate": 7.998257996380048e-05, + "loss": 2.9866, + "step": 360 + }, + { + "epoch": 14.339622641509434, + "grad_norm": 1.430663205424301, + "learning_rate": 7.998025991282377e-05, + "loss": 3.0026, + "step": 380 + }, + { + "epoch": 15.09433962264151, + "grad_norm": 2.0885624099422566, + "learning_rate": 7.997779491109745e-05, + "loss": 2.946, + "step": 400 + }, + { + "epoch": 15.849056603773585, + "grad_norm": 1.858513295415329, + "learning_rate": 7.997518496815465e-05, + "loss": 2.9293, + "step": 420 + }, + { + "epoch": 16.60377358490566, + "grad_norm": 4.071990410305379, + "learning_rate": 7.9972430094089e-05, + "loss": 2.8812, + "step": 440 + }, + { + "epoch": 17.358490566037737, + "grad_norm": 1.7862953717408494, + "learning_rate": 7.996953029955468e-05, + "loss": 2.9285, + "step": 460 + }, + { + "epoch": 18.11320754716981, + "grad_norm": 1.6791350922351642, + "learning_rate": 7.996648559576633e-05, + "loss": 2.9062, + "step": 480 + }, + { + "epoch": 18.867924528301888, + "grad_norm": 2.14320095701687, + "learning_rate": 7.996329599449902e-05, + "loss": 2.8974, + "step": 500 + }, + { + "epoch": 19.62264150943396, + "grad_norm": 1.645724273362961, + "learning_rate": 7.995996150808815e-05, + "loss": 2.842, + "step": 520 + }, + { + "epoch": 20.37735849056604, + "grad_norm": 1.6833734047290758, + "learning_rate": 7.99564821494295e-05, + "loss": 2.8603, + "step": 540 + }, + { + "epoch": 21.132075471698112, + "grad_norm": 1.680341673033382, + "learning_rate": 7.995285793197909e-05, + "loss": 2.8275, + "step": 560 + }, + { + "epoch": 21.88679245283019, + "grad_norm": 1.5669198141799856, + "learning_rate": 7.994908886975317e-05, + "loss": 2.8495, + "step": 580 + }, + { + "epoch": 22.641509433962263, + "grad_norm": 1.572130529899407, + "learning_rate": 7.99451749773282e-05, + "loss": 2.7967, + "step": 600 + }, + { + "epoch": 23.39622641509434, + "grad_norm": 2.0314221097665413, + "learning_rate": 7.994111626984069e-05, + "loss": 2.8023, + "step": 620 + }, + { + "epoch": 24.150943396226417, + "grad_norm": 2.524880591484311, + "learning_rate": 7.993691276298728e-05, + "loss": 2.7928, + "step": 640 + }, + { + "epoch": 24.90566037735849, + "grad_norm": 3.1215250835282884, + "learning_rate": 7.993256447302454e-05, + "loss": 2.7998, + "step": 660 + }, + { + "epoch": 25.660377358490567, + "grad_norm": 1.2555800634223058, + "learning_rate": 7.9928071416769e-05, + "loss": 2.7862, + "step": 680 + }, + { + "epoch": 26.41509433962264, + "grad_norm": 1.5918196349327507, + "learning_rate": 7.992343361159705e-05, + "loss": 2.7825, + "step": 700 + }, + { + "epoch": 27.169811320754718, + "grad_norm": 1.4446288187541776, + "learning_rate": 7.991865107544492e-05, + "loss": 2.7566, + "step": 720 + }, + { + "epoch": 27.92452830188679, + "grad_norm": 1.5498155400201465, + "learning_rate": 7.991372382680851e-05, + "loss": 2.7341, + "step": 740 + }, + { + "epoch": 28.67924528301887, + "grad_norm": 1.9707678993278974, + "learning_rate": 7.99086518847434e-05, + "loss": 2.7315, + "step": 760 + }, + { + "epoch": 29.433962264150942, + "grad_norm": 1.663827192167774, + "learning_rate": 7.99034352688648e-05, + "loss": 2.733, + "step": 780 + }, + { + "epoch": 30.18867924528302, + "grad_norm": 1.9883280378959602, + "learning_rate": 7.989807399934738e-05, + "loss": 2.7323, + "step": 800 + }, + { + "epoch": 30.943396226415093, + "grad_norm": 1.535721464256961, + "learning_rate": 7.989256809692524e-05, + "loss": 2.7081, + "step": 820 + }, + { + "epoch": 31.69811320754717, + "grad_norm": 1.4966431407465013, + "learning_rate": 7.988691758289184e-05, + "loss": 2.694, + "step": 840 + }, + { + "epoch": 32.45283018867924, + "grad_norm": 1.9684497666022975, + "learning_rate": 7.988112247909996e-05, + "loss": 2.6947, + "step": 860 + }, + { + "epoch": 33.20754716981132, + "grad_norm": 1.2512316865338262, + "learning_rate": 7.987518280796148e-05, + "loss": 2.7216, + "step": 880 + }, + { + "epoch": 33.9622641509434, + "grad_norm": 1.2566119692549285, + "learning_rate": 7.986909859244743e-05, + "loss": 2.6675, + "step": 900 + }, + { + "epoch": 34.716981132075475, + "grad_norm": 1.8282134080395354, + "learning_rate": 7.986286985608782e-05, + "loss": 2.6712, + "step": 920 + }, + { + "epoch": 35.471698113207545, + "grad_norm": 1.3075929724419728, + "learning_rate": 7.985649662297164e-05, + "loss": 2.668, + "step": 940 + }, + { + "epoch": 36.22641509433962, + "grad_norm": 1.4411328906456615, + "learning_rate": 7.984997891774664e-05, + "loss": 2.6937, + "step": 960 + }, + { + "epoch": 36.9811320754717, + "grad_norm": 1.8430700250981429, + "learning_rate": 7.984331676561932e-05, + "loss": 2.6798, + "step": 980 + }, + { + "epoch": 37.735849056603776, + "grad_norm": 1.4511454692578831, + "learning_rate": 7.983651019235483e-05, + "loss": 2.6637, + "step": 1000 + }, + { + "epoch": 38.490566037735846, + "grad_norm": 1.502704294446435, + "learning_rate": 7.982955922427681e-05, + "loss": 2.6688, + "step": 1020 + }, + { + "epoch": 39.24528301886792, + "grad_norm": 1.4028762887194124, + "learning_rate": 7.982246388826741e-05, + "loss": 2.6086, + "step": 1040 + }, + { + "epoch": 40.0, + "grad_norm": 1.539721758237447, + "learning_rate": 7.981522421176697e-05, + "loss": 2.6084, + "step": 1060 + }, + { + "epoch": 40.75471698113208, + "grad_norm": 1.77576635302677, + "learning_rate": 7.980784022277421e-05, + "loss": 2.6216, + "step": 1080 + }, + { + "epoch": 41.509433962264154, + "grad_norm": 1.4874495954369062, + "learning_rate": 7.980031194984588e-05, + "loss": 2.6328, + "step": 1100 + }, + { + "epoch": 42.264150943396224, + "grad_norm": 1.8846242927156294, + "learning_rate": 7.979263942209669e-05, + "loss": 2.6427, + "step": 1120 + }, + { + "epoch": 43.0188679245283, + "grad_norm": 1.6981877996408483, + "learning_rate": 7.978482266919936e-05, + "loss": 2.6224, + "step": 1140 + }, + { + "epoch": 43.77358490566038, + "grad_norm": 1.3746555989630926, + "learning_rate": 7.977686172138426e-05, + "loss": 2.6011, + "step": 1160 + }, + { + "epoch": 44.528301886792455, + "grad_norm": 1.377657678804025, + "learning_rate": 7.97687566094395e-05, + "loss": 2.6086, + "step": 1180 + }, + { + "epoch": 45.283018867924525, + "grad_norm": 1.2094636718352942, + "learning_rate": 7.976050736471069e-05, + "loss": 2.582, + "step": 1200 + }, + { + "epoch": 46.0377358490566, + "grad_norm": 1.4433837187551148, + "learning_rate": 7.975211401910087e-05, + "loss": 2.6294, + "step": 1220 + }, + { + "epoch": 46.79245283018868, + "grad_norm": 1.5026382784404573, + "learning_rate": 7.97435766050704e-05, + "loss": 2.5993, + "step": 1240 + }, + { + "epoch": 47.54716981132076, + "grad_norm": 1.2094136471599368, + "learning_rate": 7.973489515563676e-05, + "loss": 2.6164, + "step": 1260 + }, + { + "epoch": 48.301886792452834, + "grad_norm": 1.394688364908413, + "learning_rate": 7.972606970437446e-05, + "loss": 2.6056, + "step": 1280 + }, + { + "epoch": 49.056603773584904, + "grad_norm": 1.271568801692499, + "learning_rate": 7.971710028541502e-05, + "loss": 2.5755, + "step": 1300 + }, + { + "epoch": 49.81132075471698, + "grad_norm": 1.4259670316825253, + "learning_rate": 7.970798693344663e-05, + "loss": 2.5759, + "step": 1320 + }, + { + "epoch": 50.56603773584906, + "grad_norm": 1.3071538390073274, + "learning_rate": 7.969872968371418e-05, + "loss": 2.6031, + "step": 1340 + }, + { + "epoch": 51.320754716981135, + "grad_norm": 1.2595773412735998, + "learning_rate": 7.968932857201907e-05, + "loss": 2.5711, + "step": 1360 + }, + { + "epoch": 52.075471698113205, + "grad_norm": 2.0429570729259714, + "learning_rate": 7.967978363471901e-05, + "loss": 2.5662, + "step": 1380 + }, + { + "epoch": 52.83018867924528, + "grad_norm": 1.6700659590709221, + "learning_rate": 7.967009490872805e-05, + "loss": 2.5618, + "step": 1400 + }, + { + "epoch": 53.58490566037736, + "grad_norm": 1.33856858087749, + "learning_rate": 7.966026243151624e-05, + "loss": 2.5351, + "step": 1420 + }, + { + "epoch": 54.339622641509436, + "grad_norm": 1.514257636366029, + "learning_rate": 7.965028624110956e-05, + "loss": 2.5686, + "step": 1440 + }, + { + "epoch": 55.094339622641506, + "grad_norm": 1.5655072381428023, + "learning_rate": 7.964016637608987e-05, + "loss": 2.5329, + "step": 1460 + }, + { + "epoch": 55.84905660377358, + "grad_norm": 1.3215439672221574, + "learning_rate": 7.96299028755946e-05, + "loss": 2.5701, + "step": 1480 + }, + { + "epoch": 56.60377358490566, + "grad_norm": 1.4294366995579832, + "learning_rate": 7.961949577931671e-05, + "loss": 2.5143, + "step": 1500 + }, + { + "epoch": 57.35849056603774, + "grad_norm": 1.1676370114885968, + "learning_rate": 7.960894512750449e-05, + "loss": 2.5653, + "step": 1520 + }, + { + "epoch": 58.113207547169814, + "grad_norm": 1.2026735621707902, + "learning_rate": 7.95982509609614e-05, + "loss": 2.5161, + "step": 1540 + }, + { + "epoch": 58.867924528301884, + "grad_norm": 1.2479764772455937, + "learning_rate": 7.958741332104596e-05, + "loss": 2.508, + "step": 1560 + }, + { + "epoch": 59.62264150943396, + "grad_norm": 1.1961953679380617, + "learning_rate": 7.957643224967155e-05, + "loss": 2.5009, + "step": 1580 + }, + { + "epoch": 60.37735849056604, + "grad_norm": 1.2497242021619674, + "learning_rate": 7.956530778930622e-05, + "loss": 2.5059, + "step": 1600 + }, + { + "epoch": 61.132075471698116, + "grad_norm": 1.3171737588939698, + "learning_rate": 7.955403998297261e-05, + "loss": 2.4988, + "step": 1620 + }, + { + "epoch": 61.886792452830186, + "grad_norm": 1.4834839050766762, + "learning_rate": 7.95426288742477e-05, + "loss": 2.4981, + "step": 1640 + }, + { + "epoch": 62.64150943396226, + "grad_norm": 1.5715059944491987, + "learning_rate": 7.953107450726267e-05, + "loss": 2.5151, + "step": 1660 + }, + { + "epoch": 63.39622641509434, + "grad_norm": 1.3272413313721245, + "learning_rate": 7.95193769267028e-05, + "loss": 2.4963, + "step": 1680 + }, + { + "epoch": 64.15094339622641, + "grad_norm": 1.0349543461606097, + "learning_rate": 7.950753617780715e-05, + "loss": 2.4829, + "step": 1700 + }, + { + "epoch": 64.90566037735849, + "grad_norm": 1.9240255477140202, + "learning_rate": 7.949555230636851e-05, + "loss": 2.4943, + "step": 1720 + }, + { + "epoch": 65.66037735849056, + "grad_norm": 1.3252737004710828, + "learning_rate": 7.948342535873318e-05, + "loss": 2.4642, + "step": 1740 + }, + { + "epoch": 66.41509433962264, + "grad_norm": 1.4539890356994254, + "learning_rate": 7.947115538180077e-05, + "loss": 2.4609, + "step": 1760 + }, + { + "epoch": 67.16981132075472, + "grad_norm": 1.4560503030341407, + "learning_rate": 7.945874242302408e-05, + "loss": 2.5209, + "step": 1780 + }, + { + "epoch": 67.9245283018868, + "grad_norm": 1.344513424004851, + "learning_rate": 7.944618653040883e-05, + "loss": 2.4993, + "step": 1800 + }, + { + "epoch": 68.67924528301887, + "grad_norm": 1.8832922124286062, + "learning_rate": 7.943348775251356e-05, + "loss": 2.4646, + "step": 1820 + }, + { + "epoch": 69.43396226415095, + "grad_norm": 1.2399208576659413, + "learning_rate": 7.942064613844938e-05, + "loss": 2.4849, + "step": 1840 + }, + { + "epoch": 70.18867924528301, + "grad_norm": 1.3243651575141913, + "learning_rate": 7.940766173787979e-05, + "loss": 2.4599, + "step": 1860 + }, + { + "epoch": 70.94339622641509, + "grad_norm": 1.556289335581103, + "learning_rate": 7.939453460102055e-05, + "loss": 2.4888, + "step": 1880 + }, + { + "epoch": 71.69811320754717, + "grad_norm": 1.2063108835158236, + "learning_rate": 7.93812647786394e-05, + "loss": 2.4403, + "step": 1900 + }, + { + "epoch": 72.45283018867924, + "grad_norm": 1.381601728211994, + "learning_rate": 7.936785232205587e-05, + "loss": 2.4616, + "step": 1920 + }, + { + "epoch": 73.20754716981132, + "grad_norm": 1.1962254019464496, + "learning_rate": 7.935429728314119e-05, + "loss": 2.4594, + "step": 1940 + }, + { + "epoch": 73.9622641509434, + "grad_norm": 1.5037943758052086, + "learning_rate": 7.934059971431796e-05, + "loss": 2.4767, + "step": 1960 + }, + { + "epoch": 74.71698113207547, + "grad_norm": 1.3520028552568202, + "learning_rate": 7.932675966856001e-05, + "loss": 2.4627, + "step": 1980 + }, + { + "epoch": 75.47169811320755, + "grad_norm": 1.1447719169505226, + "learning_rate": 7.931277719939217e-05, + "loss": 2.4434, + "step": 2000 + }, + { + "epoch": 76.22641509433963, + "grad_norm": 1.264362441844072, + "learning_rate": 7.92986523608901e-05, + "loss": 2.4532, + "step": 2020 + }, + { + "epoch": 76.98113207547169, + "grad_norm": 1.3287557477036405, + "learning_rate": 7.928438520768005e-05, + "loss": 2.4227, + "step": 2040 + }, + { + "epoch": 77.73584905660377, + "grad_norm": 1.3638216226843092, + "learning_rate": 7.926997579493864e-05, + "loss": 2.4124, + "step": 2060 + }, + { + "epoch": 78.49056603773585, + "grad_norm": 1.8271009676844974, + "learning_rate": 7.925542417839267e-05, + "loss": 2.4564, + "step": 2080 + }, + { + "epoch": 79.24528301886792, + "grad_norm": 3.54259081233157, + "learning_rate": 7.924073041431895e-05, + "loss": 2.4369, + "step": 2100 + }, + { + "epoch": 80.0, + "grad_norm": 1.4017324354742142, + "learning_rate": 7.922589455954394e-05, + "loss": 2.4464, + "step": 2120 + }, + { + "epoch": 80.75471698113208, + "grad_norm": 1.2911982587049995, + "learning_rate": 7.921091667144366e-05, + "loss": 2.4513, + "step": 2140 + }, + { + "epoch": 81.50943396226415, + "grad_norm": 1.6022339076405718, + "learning_rate": 7.919579680794347e-05, + "loss": 2.4203, + "step": 2160 + }, + { + "epoch": 82.26415094339623, + "grad_norm": 1.192345448314673, + "learning_rate": 7.918053502751772e-05, + "loss": 2.4254, + "step": 2180 + }, + { + "epoch": 83.01886792452831, + "grad_norm": 1.6383611170040047, + "learning_rate": 7.916513138918968e-05, + "loss": 2.4271, + "step": 2200 + }, + { + "epoch": 83.77358490566037, + "grad_norm": 1.2342470221196802, + "learning_rate": 7.91495859525312e-05, + "loss": 2.4079, + "step": 2220 + }, + { + "epoch": 84.52830188679245, + "grad_norm": 1.0846993450602334, + "learning_rate": 7.913389877766257e-05, + "loss": 2.4383, + "step": 2240 + }, + { + "epoch": 85.28301886792453, + "grad_norm": 1.6823249556637492, + "learning_rate": 7.911806992525215e-05, + "loss": 2.4146, + "step": 2260 + }, + { + "epoch": 86.0377358490566, + "grad_norm": 1.1641636008270617, + "learning_rate": 7.91020994565163e-05, + "loss": 2.4208, + "step": 2280 + }, + { + "epoch": 86.79245283018868, + "grad_norm": 1.2267450186018727, + "learning_rate": 7.9085987433219e-05, + "loss": 2.4123, + "step": 2300 + }, + { + "epoch": 87.54716981132076, + "grad_norm": 1.3570826999644423, + "learning_rate": 7.906973391767178e-05, + "loss": 2.3968, + "step": 2320 + }, + { + "epoch": 88.30188679245283, + "grad_norm": 1.4751948166402733, + "learning_rate": 7.905333897273327e-05, + "loss": 2.4266, + "step": 2340 + }, + { + "epoch": 89.05660377358491, + "grad_norm": 1.6442713319159463, + "learning_rate": 7.903680266180908e-05, + "loss": 2.4226, + "step": 2360 + }, + { + "epoch": 89.81132075471699, + "grad_norm": 1.3132724406404779, + "learning_rate": 7.90201250488516e-05, + "loss": 2.419, + "step": 2380 + }, + { + "epoch": 90.56603773584905, + "grad_norm": 1.4073019579145547, + "learning_rate": 7.900330619835963e-05, + "loss": 2.3689, + "step": 2400 + }, + { + "epoch": 91.32075471698113, + "grad_norm": 1.2366514839120522, + "learning_rate": 7.89863461753782e-05, + "loss": 2.4054, + "step": 2420 + }, + { + "epoch": 92.0754716981132, + "grad_norm": 1.2825349652701765, + "learning_rate": 7.896924504549836e-05, + "loss": 2.4019, + "step": 2440 + }, + { + "epoch": 92.83018867924528, + "grad_norm": 1.836162542809911, + "learning_rate": 7.895200287485676e-05, + "loss": 2.4177, + "step": 2460 + }, + { + "epoch": 93.58490566037736, + "grad_norm": 1.1862449779023223, + "learning_rate": 7.893461973013567e-05, + "loss": 2.417, + "step": 2480 + }, + { + "epoch": 94.33962264150944, + "grad_norm": 1.4267902121087415, + "learning_rate": 7.891709567856242e-05, + "loss": 2.3877, + "step": 2500 + }, + { + "epoch": 95.09433962264151, + "grad_norm": 1.2628527153576017, + "learning_rate": 7.889943078790934e-05, + "loss": 2.3893, + "step": 2520 + }, + { + "epoch": 95.84905660377359, + "grad_norm": 1.2789710243072507, + "learning_rate": 7.888162512649344e-05, + "loss": 2.3747, + "step": 2540 + }, + { + "epoch": 96.60377358490567, + "grad_norm": 1.2286761119774143, + "learning_rate": 7.886367876317615e-05, + "loss": 2.3835, + "step": 2560 + }, + { + "epoch": 97.35849056603773, + "grad_norm": 1.1142509789518844, + "learning_rate": 7.884559176736305e-05, + "loss": 2.3751, + "step": 2580 + }, + { + "epoch": 98.11320754716981, + "grad_norm": 1.4479112681435136, + "learning_rate": 7.882736420900357e-05, + "loss": 2.3885, + "step": 2600 + }, + { + "epoch": 98.86792452830188, + "grad_norm": 1.363147415477506, + "learning_rate": 7.880899615859078e-05, + "loss": 2.3738, + "step": 2620 + }, + { + "epoch": 99.62264150943396, + "grad_norm": 1.1387365076919822, + "learning_rate": 7.879048768716105e-05, + "loss": 2.3476, + "step": 2640 + }, + { + "epoch": 100.37735849056604, + "grad_norm": 1.1944352338174065, + "learning_rate": 7.87718388662939e-05, + "loss": 2.3729, + "step": 2660 + }, + { + "epoch": 101.13207547169812, + "grad_norm": 1.1017143695500988, + "learning_rate": 7.875304976811153e-05, + "loss": 2.3846, + "step": 2680 + }, + { + "epoch": 101.88679245283019, + "grad_norm": 1.250014546065029, + "learning_rate": 7.873412046527873e-05, + "loss": 2.3928, + "step": 2700 + }, + { + "epoch": 102.64150943396227, + "grad_norm": 1.4448571670529484, + "learning_rate": 7.871505103100243e-05, + "loss": 2.3464, + "step": 2720 + }, + { + "epoch": 103.39622641509433, + "grad_norm": 1.1242909760207218, + "learning_rate": 7.869584153903159e-05, + "loss": 2.3739, + "step": 2740 + }, + { + "epoch": 104.15094339622641, + "grad_norm": 2.2842982833142176, + "learning_rate": 7.86764920636568e-05, + "loss": 2.348, + "step": 2760 + }, + { + "epoch": 104.90566037735849, + "grad_norm": 1.377894286349549, + "learning_rate": 7.865700267970997e-05, + "loss": 2.3888, + "step": 2780 + }, + { + "epoch": 105.66037735849056, + "grad_norm": 1.889252338819464, + "learning_rate": 7.863737346256416e-05, + "loss": 2.339, + "step": 2800 + }, + { + "epoch": 106.41509433962264, + "grad_norm": 1.2007024366101338, + "learning_rate": 7.861760448813318e-05, + "loss": 2.3518, + "step": 2820 + }, + { + "epoch": 107.16981132075472, + "grad_norm": 1.3150471864332571, + "learning_rate": 7.859769583287136e-05, + "loss": 2.3755, + "step": 2840 + }, + { + "epoch": 107.9245283018868, + "grad_norm": 1.3488307619297817, + "learning_rate": 7.857764757377321e-05, + "loss": 2.3613, + "step": 2860 + }, + { + "epoch": 108.67924528301887, + "grad_norm": 1.1271224750447038, + "learning_rate": 7.855745978837316e-05, + "loss": 2.3434, + "step": 2880 + }, + { + "epoch": 109.43396226415095, + "grad_norm": 1.2792788627087681, + "learning_rate": 7.85371325547452e-05, + "loss": 2.3475, + "step": 2900 + }, + { + "epoch": 110.18867924528301, + "grad_norm": 1.1278269502097389, + "learning_rate": 7.851666595150267e-05, + "loss": 2.3561, + "step": 2920 + }, + { + "epoch": 110.94339622641509, + "grad_norm": 1.2221588824212564, + "learning_rate": 7.849606005779789e-05, + "loss": 2.345, + "step": 2940 + }, + { + "epoch": 111.69811320754717, + "grad_norm": 1.2272636691471697, + "learning_rate": 7.84753149533219e-05, + "loss": 2.3491, + "step": 2960 + }, + { + "epoch": 112.45283018867924, + "grad_norm": 1.4379769660358386, + "learning_rate": 7.845443071830403e-05, + "loss": 2.3703, + "step": 2980 + }, + { + "epoch": 113.20754716981132, + "grad_norm": 1.1938598523408401, + "learning_rate": 7.843340743351179e-05, + "loss": 2.3514, + "step": 3000 + }, + { + "epoch": 113.9622641509434, + "grad_norm": 1.1633264713108291, + "learning_rate": 7.841224518025038e-05, + "loss": 2.3396, + "step": 3020 + }, + { + "epoch": 114.71698113207547, + "grad_norm": 1.1889386134705129, + "learning_rate": 7.839094404036246e-05, + "loss": 2.3654, + "step": 3040 + }, + { + "epoch": 115.47169811320755, + "grad_norm": 1.2210304404269434, + "learning_rate": 7.836950409622788e-05, + "loss": 2.3827, + "step": 3060 + }, + { + "epoch": 116.22641509433963, + "grad_norm": 1.2063342612399106, + "learning_rate": 7.834792543076318e-05, + "loss": 2.3316, + "step": 3080 + }, + { + "epoch": 116.98113207547169, + "grad_norm": 1.1263568091149723, + "learning_rate": 7.832620812742149e-05, + "loss": 2.3483, + "step": 3100 + }, + { + "epoch": 117.73584905660377, + "grad_norm": 1.1259514670897872, + "learning_rate": 7.830435227019208e-05, + "loss": 2.3125, + "step": 3120 + }, + { + "epoch": 118.49056603773585, + "grad_norm": 1.4031978763279247, + "learning_rate": 7.828235794360003e-05, + "loss": 2.3509, + "step": 3140 + }, + { + "epoch": 119.24528301886792, + "grad_norm": 1.1004874238643756, + "learning_rate": 7.826022523270598e-05, + "loss": 2.2975, + "step": 3160 + }, + { + "epoch": 120.0, + "grad_norm": 1.1440875702771847, + "learning_rate": 7.823795422310573e-05, + "loss": 2.3048, + "step": 3180 + }, + { + "epoch": 120.75471698113208, + "grad_norm": 1.254578833443374, + "learning_rate": 7.821554500092995e-05, + "loss": 2.3253, + "step": 3200 + }, + { + "epoch": 121.50943396226415, + "grad_norm": 1.3020705320626609, + "learning_rate": 7.819299765284377e-05, + "loss": 2.32, + "step": 3220 + }, + { + "epoch": 122.26415094339623, + "grad_norm": 1.144219025307704, + "learning_rate": 7.817031226604663e-05, + "loss": 2.3338, + "step": 3240 + }, + { + "epoch": 123.01886792452831, + "grad_norm": 1.4392091454771268, + "learning_rate": 7.814748892827171e-05, + "loss": 2.3081, + "step": 3260 + }, + { + "epoch": 123.77358490566037, + "grad_norm": 1.57104334995189, + "learning_rate": 7.812452772778576e-05, + "loss": 2.3044, + "step": 3280 + }, + { + "epoch": 124.52830188679245, + "grad_norm": 1.2140782445458616, + "learning_rate": 7.810142875338864e-05, + "loss": 2.3162, + "step": 3300 + }, + { + "epoch": 125.28301886792453, + "grad_norm": 1.1430293000699974, + "learning_rate": 7.807819209441311e-05, + "loss": 2.3349, + "step": 3320 + }, + { + "epoch": 126.0377358490566, + "grad_norm": 1.3717461598648188, + "learning_rate": 7.805481784072435e-05, + "loss": 2.3048, + "step": 3340 + }, + { + "epoch": 126.79245283018868, + "grad_norm": 1.413324243222593, + "learning_rate": 7.803130608271972e-05, + "loss": 2.2987, + "step": 3360 + }, + { + "epoch": 127.54716981132076, + "grad_norm": 1.4726228615781376, + "learning_rate": 7.80076569113283e-05, + "loss": 2.3164, + "step": 3380 + }, + { + "epoch": 128.30188679245282, + "grad_norm": 1.4760960932985028, + "learning_rate": 7.798387041801066e-05, + "loss": 2.3314, + "step": 3400 + }, + { + "epoch": 129.0566037735849, + "grad_norm": 1.429175780411594, + "learning_rate": 7.795994669475842e-05, + "loss": 2.2752, + "step": 3420 + }, + { + "epoch": 129.81132075471697, + "grad_norm": 1.1413240245586067, + "learning_rate": 7.793588583409394e-05, + "loss": 2.333, + "step": 3440 + }, + { + "epoch": 130.56603773584905, + "grad_norm": 1.298767089765165, + "learning_rate": 7.791168792906992e-05, + "loss": 2.3227, + "step": 3460 + }, + { + "epoch": 131.32075471698113, + "grad_norm": 1.2359046339523858, + "learning_rate": 7.788735307326908e-05, + "loss": 2.3108, + "step": 3480 + }, + { + "epoch": 132.0754716981132, + "grad_norm": 1.1866095738297588, + "learning_rate": 7.786288136080376e-05, + "loss": 2.274, + "step": 3500 + }, + { + "epoch": 132.83018867924528, + "grad_norm": 1.1104279322428132, + "learning_rate": 7.78382728863156e-05, + "loss": 2.2888, + "step": 3520 + }, + { + "epoch": 133.58490566037736, + "grad_norm": 2.2680957450657537, + "learning_rate": 7.781352774497518e-05, + "loss": 2.2938, + "step": 3540 + }, + { + "epoch": 134.33962264150944, + "grad_norm": 1.6397138146409036, + "learning_rate": 7.778864603248155e-05, + "loss": 2.3068, + "step": 3560 + }, + { + "epoch": 135.0943396226415, + "grad_norm": 1.3552905083817133, + "learning_rate": 7.7763627845062e-05, + "loss": 2.3155, + "step": 3580 + }, + { + "epoch": 135.8490566037736, + "grad_norm": 1.1820209128101842, + "learning_rate": 7.773847327947157e-05, + "loss": 2.2937, + "step": 3600 + }, + { + "epoch": 136.60377358490567, + "grad_norm": 1.2309654111909685, + "learning_rate": 7.771318243299278e-05, + "loss": 2.2887, + "step": 3620 + }, + { + "epoch": 137.35849056603774, + "grad_norm": 1.3795089340342572, + "learning_rate": 7.768775540343515e-05, + "loss": 2.2961, + "step": 3640 + }, + { + "epoch": 138.11320754716982, + "grad_norm": 1.3662606951792517, + "learning_rate": 7.766219228913492e-05, + "loss": 2.288, + "step": 3660 + }, + { + "epoch": 138.8679245283019, + "grad_norm": 1.1081358756463113, + "learning_rate": 7.763649318895459e-05, + "loss": 2.3193, + "step": 3680 + }, + { + "epoch": 139.62264150943398, + "grad_norm": 1.3054965758516237, + "learning_rate": 7.761065820228258e-05, + "loss": 2.2904, + "step": 3700 + }, + { + "epoch": 140.37735849056602, + "grad_norm": 1.4052953203319152, + "learning_rate": 7.758468742903284e-05, + "loss": 2.2803, + "step": 3720 + }, + { + "epoch": 141.1320754716981, + "grad_norm": 1.310015833541638, + "learning_rate": 7.755858096964445e-05, + "loss": 2.2891, + "step": 3740 + }, + { + "epoch": 141.88679245283018, + "grad_norm": 1.0645192580358254, + "learning_rate": 7.753233892508125e-05, + "loss": 2.2982, + "step": 3760 + }, + { + "epoch": 142.64150943396226, + "grad_norm": 1.0898474528650213, + "learning_rate": 7.750596139683145e-05, + "loss": 2.2711, + "step": 3780 + }, + { + "epoch": 143.39622641509433, + "grad_norm": 1.2074165473918712, + "learning_rate": 7.747944848690719e-05, + "loss": 2.2592, + "step": 3800 + }, + { + "epoch": 144.1509433962264, + "grad_norm": 1.0959283058664937, + "learning_rate": 7.745280029784423e-05, + "loss": 2.2813, + "step": 3820 + }, + { + "epoch": 144.9056603773585, + "grad_norm": 1.2139556681199035, + "learning_rate": 7.742601693270148e-05, + "loss": 2.2564, + "step": 3840 + }, + { + "epoch": 145.66037735849056, + "grad_norm": 1.2374163658098694, + "learning_rate": 7.739909849506064e-05, + "loss": 2.2972, + "step": 3860 + }, + { + "epoch": 146.41509433962264, + "grad_norm": 1.212582172960113, + "learning_rate": 7.737204508902578e-05, + "loss": 2.2683, + "step": 3880 + }, + { + "epoch": 147.16981132075472, + "grad_norm": 1.064638273683967, + "learning_rate": 7.734485681922295e-05, + "loss": 2.2643, + "step": 3900 + }, + { + "epoch": 147.9245283018868, + "grad_norm": 1.5185500205423423, + "learning_rate": 7.731753379079976e-05, + "loss": 2.2825, + "step": 3920 + }, + { + "epoch": 148.67924528301887, + "grad_norm": 1.2571175031602655, + "learning_rate": 7.7290076109425e-05, + "loss": 2.2838, + "step": 3940 + }, + { + "epoch": 149.43396226415095, + "grad_norm": 1.1185096216789012, + "learning_rate": 7.726248388128821e-05, + "loss": 2.2713, + "step": 3960 + }, + { + "epoch": 150.18867924528303, + "grad_norm": 1.283741452573828, + "learning_rate": 7.723475721309926e-05, + "loss": 2.2578, + "step": 3980 + }, + { + "epoch": 150.9433962264151, + "grad_norm": 1.1735101055664479, + "learning_rate": 7.720689621208799e-05, + "loss": 2.2584, + "step": 4000 + }, + { + "epoch": 151.69811320754718, + "grad_norm": 1.1931741706657397, + "learning_rate": 7.717890098600371e-05, + "loss": 2.2439, + "step": 4020 + }, + { + "epoch": 152.45283018867926, + "grad_norm": 1.0510206287412838, + "learning_rate": 7.715077164311486e-05, + "loss": 2.2646, + "step": 4040 + }, + { + "epoch": 153.20754716981133, + "grad_norm": 1.2236996476850626, + "learning_rate": 7.712250829220856e-05, + "loss": 2.2518, + "step": 4060 + }, + { + "epoch": 153.96226415094338, + "grad_norm": 1.4295022161938338, + "learning_rate": 7.70941110425902e-05, + "loss": 2.2445, + "step": 4080 + }, + { + "epoch": 154.71698113207546, + "grad_norm": 1.2608108045607223, + "learning_rate": 7.706558000408294e-05, + "loss": 2.2504, + "step": 4100 + }, + { + "epoch": 155.47169811320754, + "grad_norm": 1.4378816608236173, + "learning_rate": 7.703691528702747e-05, + "loss": 2.2433, + "step": 4120 + }, + { + "epoch": 156.22641509433961, + "grad_norm": 1.3122607821127985, + "learning_rate": 7.700811700228138e-05, + "loss": 2.2593, + "step": 4140 + }, + { + "epoch": 156.9811320754717, + "grad_norm": 1.1677763203213758, + "learning_rate": 7.697918526121882e-05, + "loss": 2.2521, + "step": 4160 + }, + { + "epoch": 157.73584905660377, + "grad_norm": 1.1304212534843256, + "learning_rate": 7.695012017573013e-05, + "loss": 2.2743, + "step": 4180 + }, + { + "epoch": 158.49056603773585, + "grad_norm": 1.2157344056650818, + "learning_rate": 7.692092185822129e-05, + "loss": 2.2405, + "step": 4200 + }, + { + "epoch": 159.24528301886792, + "grad_norm": 1.2521062422528308, + "learning_rate": 7.689159042161356e-05, + "loss": 2.258, + "step": 4220 + }, + { + "epoch": 160.0, + "grad_norm": 1.417021221810849, + "learning_rate": 7.686212597934299e-05, + "loss": 2.2187, + "step": 4240 + }, + { + "epoch": 160.75471698113208, + "grad_norm": 1.0987738687082824, + "learning_rate": 7.68325286453601e-05, + "loss": 2.2155, + "step": 4260 + }, + { + "epoch": 161.50943396226415, + "grad_norm": 1.4771801969035276, + "learning_rate": 7.680279853412924e-05, + "loss": 2.27, + "step": 4280 + }, + { + "epoch": 162.26415094339623, + "grad_norm": 1.1956274528883593, + "learning_rate": 7.677293576062836e-05, + "loss": 2.2717, + "step": 4300 + }, + { + "epoch": 163.0188679245283, + "grad_norm": 1.1219859338242828, + "learning_rate": 7.674294044034839e-05, + "loss": 2.2487, + "step": 4320 + }, + { + "epoch": 163.77358490566039, + "grad_norm": 1.255744824066408, + "learning_rate": 7.671281268929293e-05, + "loss": 2.2366, + "step": 4340 + }, + { + "epoch": 164.52830188679246, + "grad_norm": 1.112451658029252, + "learning_rate": 7.668255262397772e-05, + "loss": 2.2377, + "step": 4360 + }, + { + "epoch": 165.28301886792454, + "grad_norm": 1.1131032086265853, + "learning_rate": 7.66521603614302e-05, + "loss": 2.2483, + "step": 4380 + }, + { + "epoch": 166.03773584905662, + "grad_norm": 1.2568117014241036, + "learning_rate": 7.662163601918907e-05, + "loss": 2.2637, + "step": 4400 + }, + { + "epoch": 166.79245283018867, + "grad_norm": 1.0763275712599132, + "learning_rate": 7.659097971530385e-05, + "loss": 2.2275, + "step": 4420 + }, + { + "epoch": 167.54716981132074, + "grad_norm": 1.0880356132513982, + "learning_rate": 7.656019156833438e-05, + "loss": 2.227, + "step": 4440 + }, + { + "epoch": 168.30188679245282, + "grad_norm": 1.0805504953865772, + "learning_rate": 7.652927169735042e-05, + "loss": 2.2205, + "step": 4460 + }, + { + "epoch": 169.0566037735849, + "grad_norm": 1.0979536600508317, + "learning_rate": 7.649822022193114e-05, + "loss": 2.2008, + "step": 4480 + }, + { + "epoch": 169.81132075471697, + "grad_norm": 1.0424485855679975, + "learning_rate": 7.646703726216467e-05, + "loss": 2.235, + "step": 4500 + }, + { + "epoch": 170.56603773584905, + "grad_norm": 1.1541609361962377, + "learning_rate": 7.643572293864766e-05, + "loss": 2.2297, + "step": 4520 + }, + { + "epoch": 171.32075471698113, + "grad_norm": 1.1630212513509717, + "learning_rate": 7.640427737248479e-05, + "loss": 2.2295, + "step": 4540 + }, + { + "epoch": 172.0754716981132, + "grad_norm": 1.5088805287099432, + "learning_rate": 7.637270068528828e-05, + "loss": 2.2445, + "step": 4560 + }, + { + "epoch": 172.83018867924528, + "grad_norm": 1.66773080303759, + "learning_rate": 7.634099299917748e-05, + "loss": 2.2336, + "step": 4580 + }, + { + "epoch": 173.58490566037736, + "grad_norm": 1.4239223646642891, + "learning_rate": 7.630915443677834e-05, + "loss": 2.2128, + "step": 4600 + }, + { + "epoch": 174.33962264150944, + "grad_norm": 1.2623270496447048, + "learning_rate": 7.627718512122297e-05, + "loss": 2.2253, + "step": 4620 + }, + { + "epoch": 175.0943396226415, + "grad_norm": 1.2406324767245749, + "learning_rate": 7.624508517614919e-05, + "loss": 2.2131, + "step": 4640 + }, + { + "epoch": 175.8490566037736, + "grad_norm": 1.3130455463591448, + "learning_rate": 7.621285472569993e-05, + "loss": 2.1944, + "step": 4660 + }, + { + "epoch": 176.60377358490567, + "grad_norm": 1.1413419622441512, + "learning_rate": 7.61804938945229e-05, + "loss": 2.2243, + "step": 4680 + }, + { + "epoch": 177.35849056603774, + "grad_norm": 1.2146654711035267, + "learning_rate": 7.614800280777005e-05, + "loss": 2.2172, + "step": 4700 + }, + { + "epoch": 178.11320754716982, + "grad_norm": 1.3634898063511693, + "learning_rate": 7.611538159109703e-05, + "loss": 2.205, + "step": 4720 + }, + { + "epoch": 178.8679245283019, + "grad_norm": 1.2311721419826, + "learning_rate": 7.608263037066277e-05, + "loss": 2.2252, + "step": 4740 + }, + { + "epoch": 179.62264150943398, + "grad_norm": 1.0908914570592438, + "learning_rate": 7.6049749273129e-05, + "loss": 2.2138, + "step": 4760 + }, + { + "epoch": 180.37735849056602, + "grad_norm": 1.1038829505990149, + "learning_rate": 7.601673842565972e-05, + "loss": 2.1939, + "step": 4780 + }, + { + "epoch": 181.1320754716981, + "grad_norm": 1.1236372724431538, + "learning_rate": 7.598359795592073e-05, + "loss": 2.2382, + "step": 4800 + }, + { + "epoch": 181.88679245283018, + "grad_norm": 1.3232451908070362, + "learning_rate": 7.59503279920791e-05, + "loss": 2.201, + "step": 4820 + }, + { + "epoch": 182.64150943396226, + "grad_norm": 1.3292125597941664, + "learning_rate": 7.591692866280274e-05, + "loss": 2.2058, + "step": 4840 + }, + { + "epoch": 183.39622641509433, + "grad_norm": 1.1970310296785942, + "learning_rate": 7.588340009725985e-05, + "loss": 2.206, + "step": 4860 + }, + { + "epoch": 184.1509433962264, + "grad_norm": 1.055682897860096, + "learning_rate": 7.584974242511845e-05, + "loss": 2.2148, + "step": 4880 + }, + { + "epoch": 184.9056603773585, + "grad_norm": 1.1655929048666676, + "learning_rate": 7.581595577654584e-05, + "loss": 2.2146, + "step": 4900 + }, + { + "epoch": 185.66037735849056, + "grad_norm": 1.2197862783964168, + "learning_rate": 7.578204028220814e-05, + "loss": 2.2023, + "step": 4920 + }, + { + "epoch": 186.41509433962264, + "grad_norm": 1.1536947546834515, + "learning_rate": 7.574799607326977e-05, + "loss": 2.2074, + "step": 4940 + }, + { + "epoch": 187.16981132075472, + "grad_norm": 1.1570044860516948, + "learning_rate": 7.571382328139293e-05, + "loss": 2.2057, + "step": 4960 + }, + { + "epoch": 187.9245283018868, + "grad_norm": 2.2251854969672165, + "learning_rate": 7.56795220387371e-05, + "loss": 2.1975, + "step": 4980 + }, + { + "epoch": 188.67924528301887, + "grad_norm": 1.16489093753128, + "learning_rate": 7.564509247795854e-05, + "loss": 2.1947, + "step": 5000 + }, + { + "epoch": 189.43396226415095, + "grad_norm": 1.1610456984999162, + "learning_rate": 7.561053473220977e-05, + "loss": 2.1861, + "step": 5020 + }, + { + "epoch": 190.18867924528303, + "grad_norm": 1.173342232590181, + "learning_rate": 7.557584893513902e-05, + "loss": 2.1997, + "step": 5040 + }, + { + "epoch": 190.9433962264151, + "grad_norm": 1.1873135989990635, + "learning_rate": 7.554103522088976e-05, + "loss": 2.1841, + "step": 5060 + }, + { + "epoch": 191.69811320754718, + "grad_norm": 1.4263143797188473, + "learning_rate": 7.550609372410018e-05, + "loss": 2.1823, + "step": 5080 + }, + { + "epoch": 192.45283018867926, + "grad_norm": 1.0849530063111787, + "learning_rate": 7.547102457990266e-05, + "loss": 2.1842, + "step": 5100 + }, + { + "epoch": 193.20754716981133, + "grad_norm": 1.1497288768060088, + "learning_rate": 7.54358279239232e-05, + "loss": 2.2258, + "step": 5120 + }, + { + "epoch": 193.96226415094338, + "grad_norm": 1.2795496420829302, + "learning_rate": 7.540050389228099e-05, + "loss": 2.192, + "step": 5140 + }, + { + "epoch": 194.71698113207546, + "grad_norm": 1.0700549445449614, + "learning_rate": 7.536505262158779e-05, + "loss": 2.1913, + "step": 5160 + }, + { + "epoch": 195.47169811320754, + "grad_norm": 1.3697359389801924, + "learning_rate": 7.532947424894744e-05, + "loss": 2.2044, + "step": 5180 + }, + { + "epoch": 196.22641509433961, + "grad_norm": 1.0721264053082575, + "learning_rate": 7.52937689119554e-05, + "loss": 2.1916, + "step": 5200 + }, + { + "epoch": 196.9811320754717, + "grad_norm": 1.2325173290768243, + "learning_rate": 7.525793674869805e-05, + "loss": 2.1738, + "step": 5220 + }, + { + "epoch": 197.73584905660377, + "grad_norm": 1.078471360885739, + "learning_rate": 7.522197789775235e-05, + "loss": 2.2043, + "step": 5240 + }, + { + "epoch": 198.49056603773585, + "grad_norm": 1.508079711738152, + "learning_rate": 7.518589249818516e-05, + "loss": 2.2159, + "step": 5260 + }, + { + "epoch": 199.24528301886792, + "grad_norm": 1.0511550659614401, + "learning_rate": 7.514968068955273e-05, + "loss": 2.168, + "step": 5280 + }, + { + "epoch": 200.0, + "grad_norm": 1.0585993296644824, + "learning_rate": 7.511334261190026e-05, + "loss": 2.1847, + "step": 5300 + }, + { + "epoch": 200.75471698113208, + "grad_norm": 1.19584254290663, + "learning_rate": 7.507687840576123e-05, + "loss": 2.1953, + "step": 5320 + }, + { + "epoch": 201.50943396226415, + "grad_norm": 1.360707266271236, + "learning_rate": 7.504028821215686e-05, + "loss": 2.1866, + "step": 5340 + }, + { + "epoch": 202.26415094339623, + "grad_norm": 1.0326833677791634, + "learning_rate": 7.500357217259573e-05, + "loss": 2.1889, + "step": 5360 + }, + { + "epoch": 203.0188679245283, + "grad_norm": 1.1966698046584427, + "learning_rate": 7.496673042907302e-05, + "loss": 2.204, + "step": 5380 + }, + { + "epoch": 203.77358490566039, + "grad_norm": 1.1792590946885393, + "learning_rate": 7.492976312407011e-05, + "loss": 2.1679, + "step": 5400 + }, + { + "epoch": 204.52830188679246, + "grad_norm": 1.0821551276306904, + "learning_rate": 7.489267040055393e-05, + "loss": 2.172, + "step": 5420 + }, + { + "epoch": 205.28301886792454, + "grad_norm": 1.1206896992927644, + "learning_rate": 7.48554524019765e-05, + "loss": 2.1558, + "step": 5440 + }, + { + "epoch": 206.03773584905662, + "grad_norm": 1.164481519584628, + "learning_rate": 7.481810927227427e-05, + "loss": 2.1707, + "step": 5460 + }, + { + "epoch": 206.79245283018867, + "grad_norm": 1.045173494578065, + "learning_rate": 7.47806411558677e-05, + "loss": 2.1454, + "step": 5480 + }, + { + "epoch": 207.54716981132074, + "grad_norm": 1.3037299893846073, + "learning_rate": 7.474304819766053e-05, + "loss": 2.1735, + "step": 5500 + }, + { + "epoch": 208.30188679245282, + "grad_norm": 1.1799164756908072, + "learning_rate": 7.470533054303937e-05, + "loss": 2.1678, + "step": 5520 + }, + { + "epoch": 209.0566037735849, + "grad_norm": 1.23204534029245, + "learning_rate": 7.46674883378731e-05, + "loss": 2.18, + "step": 5540 + }, + { + "epoch": 209.81132075471697, + "grad_norm": 1.1705040244332197, + "learning_rate": 7.462952172851219e-05, + "loss": 2.1638, + "step": 5560 + }, + { + "epoch": 210.56603773584905, + "grad_norm": 1.115647376955501, + "learning_rate": 7.459143086178838e-05, + "loss": 2.1517, + "step": 5580 + }, + { + "epoch": 211.32075471698113, + "grad_norm": 1.095644914375309, + "learning_rate": 7.455321588501378e-05, + "loss": 2.1624, + "step": 5600 + }, + { + "epoch": 212.0754716981132, + "grad_norm": 1.2461377018123299, + "learning_rate": 7.451487694598063e-05, + "loss": 2.1795, + "step": 5620 + }, + { + "epoch": 212.83018867924528, + "grad_norm": 1.0808714278402736, + "learning_rate": 7.447641419296051e-05, + "loss": 2.1857, + "step": 5640 + }, + { + "epoch": 213.58490566037736, + "grad_norm": 1.175783749152713, + "learning_rate": 7.443782777470388e-05, + "loss": 2.1489, + "step": 5660 + }, + { + "epoch": 214.33962264150944, + "grad_norm": 1.0323602107911023, + "learning_rate": 7.43991178404394e-05, + "loss": 2.1814, + "step": 5680 + }, + { + "epoch": 215.0943396226415, + "grad_norm": 1.4371901693782694, + "learning_rate": 7.436028453987343e-05, + "loss": 2.1607, + "step": 5700 + }, + { + "epoch": 215.8490566037736, + "grad_norm": 1.2749189929859621, + "learning_rate": 7.432132802318953e-05, + "loss": 2.1344, + "step": 5720 + }, + { + "epoch": 216.60377358490567, + "grad_norm": 1.7991005001893379, + "learning_rate": 7.428224844104763e-05, + "loss": 2.1705, + "step": 5740 + }, + { + "epoch": 217.35849056603774, + "grad_norm": 1.0763947355182082, + "learning_rate": 7.424304594458374e-05, + "loss": 2.1681, + "step": 5760 + }, + { + "epoch": 218.11320754716982, + "grad_norm": 1.147647175883896, + "learning_rate": 7.420372068540913e-05, + "loss": 2.1792, + "step": 5780 + }, + { + "epoch": 218.8679245283019, + "grad_norm": 1.232169418468151, + "learning_rate": 7.41642728156099e-05, + "loss": 2.1143, + "step": 5800 + }, + { + "epoch": 219.62264150943398, + "grad_norm": 1.3992234444810514, + "learning_rate": 7.41247024877463e-05, + "loss": 2.1612, + "step": 5820 + }, + { + "epoch": 220.37735849056602, + "grad_norm": 1.2478978185410232, + "learning_rate": 7.40850098548522e-05, + "loss": 2.1749, + "step": 5840 + }, + { + "epoch": 221.1320754716981, + "grad_norm": 1.0796153228438745, + "learning_rate": 7.404519507043443e-05, + "loss": 2.1345, + "step": 5860 + }, + { + "epoch": 221.88679245283018, + "grad_norm": 1.164330548160425, + "learning_rate": 7.40052582884723e-05, + "loss": 2.1573, + "step": 5880 + }, + { + "epoch": 222.64150943396226, + "grad_norm": 1.2041551436276394, + "learning_rate": 7.396519966341684e-05, + "loss": 2.162, + "step": 5900 + }, + { + "epoch": 223.39622641509433, + "grad_norm": 1.2780053810145304, + "learning_rate": 7.392501935019036e-05, + "loss": 2.1524, + "step": 5920 + }, + { + "epoch": 224.1509433962264, + "grad_norm": 1.069497717017709, + "learning_rate": 7.388471750418576e-05, + "loss": 2.1427, + "step": 5940 + }, + { + "epoch": 224.9056603773585, + "grad_norm": 1.1790523262171884, + "learning_rate": 7.384429428126599e-05, + "loss": 2.1693, + "step": 5960 + }, + { + "epoch": 225.66037735849056, + "grad_norm": 1.0727940077044007, + "learning_rate": 7.380374983776333e-05, + "loss": 2.1146, + "step": 5980 + }, + { + "epoch": 226.41509433962264, + "grad_norm": 1.0481198314836597, + "learning_rate": 7.376308433047898e-05, + "loss": 2.1563, + "step": 6000 + }, + { + "epoch": 227.16981132075472, + "grad_norm": 1.3874056107583248, + "learning_rate": 7.372229791668223e-05, + "loss": 2.1456, + "step": 6020 + }, + { + "epoch": 227.9245283018868, + "grad_norm": 1.3153838535909976, + "learning_rate": 7.368139075411003e-05, + "loss": 2.1575, + "step": 6040 + }, + { + "epoch": 228.67924528301887, + "grad_norm": 1.1788160013410025, + "learning_rate": 7.364036300096631e-05, + "loss": 2.1437, + "step": 6060 + }, + { + "epoch": 229.43396226415095, + "grad_norm": 1.1169312984810649, + "learning_rate": 7.359921481592136e-05, + "loss": 2.1568, + "step": 6080 + }, + { + "epoch": 230.18867924528303, + "grad_norm": 1.1435068374715258, + "learning_rate": 7.355794635811118e-05, + "loss": 2.1503, + "step": 6100 + }, + { + "epoch": 230.9433962264151, + "grad_norm": 1.7552469517638039, + "learning_rate": 7.3516557787137e-05, + "loss": 2.128, + "step": 6120 + }, + { + "epoch": 231.69811320754718, + "grad_norm": 1.0779669989000775, + "learning_rate": 7.347504926306452e-05, + "loss": 2.1485, + "step": 6140 + }, + { + "epoch": 232.45283018867926, + "grad_norm": 1.186788194688993, + "learning_rate": 7.343342094642333e-05, + "loss": 2.1576, + "step": 6160 + }, + { + "epoch": 233.20754716981133, + "grad_norm": 1.0594086679490557, + "learning_rate": 7.339167299820636e-05, + "loss": 2.1492, + "step": 6180 + }, + { + "epoch": 233.96226415094338, + "grad_norm": 1.1917321731840318, + "learning_rate": 7.334980557986916e-05, + "loss": 2.1482, + "step": 6200 + }, + { + "epoch": 234.71698113207546, + "grad_norm": 1.083198692826801, + "learning_rate": 7.330781885332932e-05, + "loss": 2.1461, + "step": 6220 + }, + { + "epoch": 235.47169811320754, + "grad_norm": 1.3139182121317998, + "learning_rate": 7.326571298096586e-05, + "loss": 2.156, + "step": 6240 + }, + { + "epoch": 236.22641509433961, + "grad_norm": 1.0854207170845476, + "learning_rate": 7.322348812561857e-05, + "loss": 2.1258, + "step": 6260 + }, + { + "epoch": 236.9811320754717, + "grad_norm": 1.5015227061373095, + "learning_rate": 7.318114445058739e-05, + "loss": 2.1439, + "step": 6280 + }, + { + "epoch": 237.73584905660377, + "grad_norm": 1.2347794021289429, + "learning_rate": 7.313868211963179e-05, + "loss": 2.1317, + "step": 6300 + }, + { + "epoch": 238.49056603773585, + "grad_norm": 1.442835177639965, + "learning_rate": 7.309610129697015e-05, + "loss": 2.113, + "step": 6320 + }, + { + "epoch": 239.24528301886792, + "grad_norm": 1.236255276661992, + "learning_rate": 7.305340214727905e-05, + "loss": 2.1378, + "step": 6340 + }, + { + "epoch": 240.0, + "grad_norm": 1.205183440308278, + "learning_rate": 7.301058483569271e-05, + "loss": 2.1336, + "step": 6360 + }, + { + "epoch": 240.75471698113208, + "grad_norm": 1.3246584618487252, + "learning_rate": 7.296764952780239e-05, + "loss": 2.1221, + "step": 6380 + }, + { + "epoch": 241.50943396226415, + "grad_norm": 1.0168670687272512, + "learning_rate": 7.292459638965558e-05, + "loss": 2.1188, + "step": 6400 + }, + { + "epoch": 242.26415094339623, + "grad_norm": 1.3467491151924502, + "learning_rate": 7.288142558775552e-05, + "loss": 2.101, + "step": 6420 + }, + { + "epoch": 243.0188679245283, + "grad_norm": 1.261074686560294, + "learning_rate": 7.283813728906054e-05, + "loss": 2.1411, + "step": 6440 + }, + { + "epoch": 243.77358490566039, + "grad_norm": 1.2485690805022434, + "learning_rate": 7.27947316609833e-05, + "loss": 2.1277, + "step": 6460 + }, + { + "epoch": 244.52830188679246, + "grad_norm": 1.3241322758759912, + "learning_rate": 7.275120887139026e-05, + "loss": 2.1363, + "step": 6480 + }, + { + "epoch": 245.28301886792454, + "grad_norm": 1.5599386219671891, + "learning_rate": 7.270756908860098e-05, + "loss": 2.1089, + "step": 6500 + }, + { + "epoch": 246.03773584905662, + "grad_norm": 1.2114819177389966, + "learning_rate": 7.266381248138751e-05, + "loss": 2.1089, + "step": 6520 + }, + { + "epoch": 246.79245283018867, + "grad_norm": 1.1306582742382014, + "learning_rate": 7.261993921897364e-05, + "loss": 2.1079, + "step": 6540 + }, + { + "epoch": 247.54716981132074, + "grad_norm": 1.2673326383282852, + "learning_rate": 7.257594947103438e-05, + "loss": 2.1266, + "step": 6560 + }, + { + "epoch": 248.30188679245282, + "grad_norm": 1.4019469970816203, + "learning_rate": 7.253184340769518e-05, + "loss": 2.1481, + "step": 6580 + }, + { + "epoch": 249.0566037735849, + "grad_norm": 1.0449709444069573, + "learning_rate": 7.248762119953135e-05, + "loss": 2.1158, + "step": 6600 + }, + { + "epoch": 249.81132075471697, + "grad_norm": 1.1593445705123036, + "learning_rate": 7.244328301756737e-05, + "loss": 2.13, + "step": 6620 + }, + { + "epoch": 250.56603773584905, + "grad_norm": 1.2635129121192081, + "learning_rate": 7.23988290332763e-05, + "loss": 2.1167, + "step": 6640 + }, + { + "epoch": 251.32075471698113, + "grad_norm": 1.0527620092255492, + "learning_rate": 7.235425941857891e-05, + "loss": 2.114, + "step": 6660 + }, + { + "epoch": 252.0754716981132, + "grad_norm": 1.3035661165388843, + "learning_rate": 7.230957434584331e-05, + "loss": 2.0928, + "step": 6680 + }, + { + "epoch": 252.83018867924528, + "grad_norm": 1.0136550616355096, + "learning_rate": 7.226477398788402e-05, + "loss": 2.0987, + "step": 6700 + }, + { + "epoch": 253.58490566037736, + "grad_norm": 1.258957796854538, + "learning_rate": 7.22198585179615e-05, + "loss": 2.1032, + "step": 6720 + }, + { + "epoch": 254.33962264150944, + "grad_norm": 1.2937771749668925, + "learning_rate": 7.21748281097813e-05, + "loss": 2.1003, + "step": 6740 + }, + { + "epoch": 255.0943396226415, + "grad_norm": 1.0533802729958242, + "learning_rate": 7.212968293749357e-05, + "loss": 2.1201, + "step": 6760 + }, + { + "epoch": 255.8490566037736, + "grad_norm": 1.0065996122655994, + "learning_rate": 7.208442317569225e-05, + "loss": 2.1119, + "step": 6780 + }, + { + "epoch": 256.60377358490564, + "grad_norm": 1.1726423865130644, + "learning_rate": 7.203904899941444e-05, + "loss": 2.0967, + "step": 6800 + }, + { + "epoch": 257.35849056603774, + "grad_norm": 1.4137580376820904, + "learning_rate": 7.199356058413975e-05, + "loss": 2.1297, + "step": 6820 + }, + { + "epoch": 258.1132075471698, + "grad_norm": 1.2534212871623691, + "learning_rate": 7.194795810578956e-05, + "loss": 2.1142, + "step": 6840 + }, + { + "epoch": 258.8679245283019, + "grad_norm": 1.1760816154209972, + "learning_rate": 7.190224174072643e-05, + "loss": 2.1524, + "step": 6860 + }, + { + "epoch": 259.62264150943395, + "grad_norm": 1.1576937144547554, + "learning_rate": 7.185641166575331e-05, + "loss": 2.0873, + "step": 6880 + }, + { + "epoch": 260.37735849056605, + "grad_norm": 1.18595129264392, + "learning_rate": 7.181046805811294e-05, + "loss": 2.1118, + "step": 6900 + }, + { + "epoch": 261.1320754716981, + "grad_norm": 1.1205604370668647, + "learning_rate": 7.176441109548715e-05, + "loss": 2.0986, + "step": 6920 + }, + { + "epoch": 261.8867924528302, + "grad_norm": 1.1884901674285933, + "learning_rate": 7.171824095599609e-05, + "loss": 2.1109, + "step": 6940 + }, + { + "epoch": 262.64150943396226, + "grad_norm": 1.123509221114028, + "learning_rate": 7.167195781819768e-05, + "loss": 2.1047, + "step": 6960 + }, + { + "epoch": 263.39622641509436, + "grad_norm": 1.1260075560640628, + "learning_rate": 7.162556186108684e-05, + "loss": 2.0972, + "step": 6980 + }, + { + "epoch": 264.1509433962264, + "grad_norm": 1.191570785218505, + "learning_rate": 7.157905326409477e-05, + "loss": 2.0938, + "step": 7000 + }, + { + "epoch": 264.9056603773585, + "grad_norm": 1.1009948219165815, + "learning_rate": 7.153243220708831e-05, + "loss": 2.1084, + "step": 7020 + }, + { + "epoch": 265.66037735849056, + "grad_norm": 1.0984593800759155, + "learning_rate": 7.148569887036923e-05, + "loss": 2.0989, + "step": 7040 + }, + { + "epoch": 266.41509433962267, + "grad_norm": 1.2379578619669414, + "learning_rate": 7.143885343467355e-05, + "loss": 2.1166, + "step": 7060 + }, + { + "epoch": 267.1698113207547, + "grad_norm": 1.1064036960932773, + "learning_rate": 7.139189608117077e-05, + "loss": 2.1104, + "step": 7080 + }, + { + "epoch": 267.92452830188677, + "grad_norm": 1.0772108392111555, + "learning_rate": 7.134482699146328e-05, + "loss": 2.0897, + "step": 7100 + }, + { + "epoch": 268.6792452830189, + "grad_norm": 1.1292302199915438, + "learning_rate": 7.129764634758554e-05, + "loss": 2.1157, + "step": 7120 + }, + { + "epoch": 269.4339622641509, + "grad_norm": 1.1278390668879588, + "learning_rate": 7.125035433200346e-05, + "loss": 2.0932, + "step": 7140 + }, + { + "epoch": 270.188679245283, + "grad_norm": 1.0414531139729244, + "learning_rate": 7.120295112761368e-05, + "loss": 2.1151, + "step": 7160 + }, + { + "epoch": 270.9433962264151, + "grad_norm": 1.1545750967690267, + "learning_rate": 7.115543691774282e-05, + "loss": 2.1131, + "step": 7180 + }, + { + "epoch": 271.6981132075472, + "grad_norm": 1.204421852849513, + "learning_rate": 7.110781188614684e-05, + "loss": 2.0802, + "step": 7200 + }, + { + "epoch": 272.45283018867923, + "grad_norm": 1.7831981359411682, + "learning_rate": 7.106007621701024e-05, + "loss": 2.0798, + "step": 7220 + }, + { + "epoch": 273.20754716981133, + "grad_norm": 1.3197751132016162, + "learning_rate": 7.101223009494545e-05, + "loss": 2.0992, + "step": 7240 + }, + { + "epoch": 273.9622641509434, + "grad_norm": 1.2400242729400996, + "learning_rate": 7.096427370499204e-05, + "loss": 2.0864, + "step": 7260 + }, + { + "epoch": 274.7169811320755, + "grad_norm": 1.0912978575620245, + "learning_rate": 7.091620723261605e-05, + "loss": 2.0923, + "step": 7280 + }, + { + "epoch": 275.47169811320754, + "grad_norm": 1.231133204650358, + "learning_rate": 7.086803086370918e-05, + "loss": 2.0795, + "step": 7300 + }, + { + "epoch": 276.22641509433964, + "grad_norm": 1.2282758399906704, + "learning_rate": 7.081974478458825e-05, + "loss": 2.0761, + "step": 7320 + }, + { + "epoch": 276.9811320754717, + "grad_norm": 1.3460196947110317, + "learning_rate": 7.077134918199428e-05, + "loss": 2.0752, + "step": 7340 + }, + { + "epoch": 277.7358490566038, + "grad_norm": 1.080960097565614, + "learning_rate": 7.072284424309193e-05, + "loss": 2.0889, + "step": 7360 + }, + { + "epoch": 278.49056603773585, + "grad_norm": 1.3111047591517453, + "learning_rate": 7.067423015546863e-05, + "loss": 2.0839, + "step": 7380 + }, + { + "epoch": 279.24528301886795, + "grad_norm": 1.5401314919739673, + "learning_rate": 7.0625507107134e-05, + "loss": 2.0927, + "step": 7400 + }, + { + "epoch": 280.0, + "grad_norm": 1.1041636245431063, + "learning_rate": 7.057667528651904e-05, + "loss": 2.0803, + "step": 7420 + }, + { + "epoch": 280.75471698113205, + "grad_norm": 1.4056508604045173, + "learning_rate": 7.052773488247539e-05, + "loss": 2.0668, + "step": 7440 + }, + { + "epoch": 281.50943396226415, + "grad_norm": 1.046923519873644, + "learning_rate": 7.047868608427462e-05, + "loss": 2.082, + "step": 7460 + }, + { + "epoch": 282.2641509433962, + "grad_norm": 1.3029328700653047, + "learning_rate": 7.042952908160754e-05, + "loss": 2.0556, + "step": 7480 + }, + { + "epoch": 283.0188679245283, + "grad_norm": 1.227982067650406, + "learning_rate": 7.03802640645834e-05, + "loss": 2.0478, + "step": 7500 + }, + { + "epoch": 283.77358490566036, + "grad_norm": 1.1015625311453152, + "learning_rate": 7.033089122372919e-05, + "loss": 2.0773, + "step": 7520 + }, + { + "epoch": 284.52830188679246, + "grad_norm": 1.316103623119528, + "learning_rate": 7.028141074998891e-05, + "loss": 2.0756, + "step": 7540 + }, + { + "epoch": 285.2830188679245, + "grad_norm": 1.255257016262856, + "learning_rate": 7.023182283472277e-05, + "loss": 2.0866, + "step": 7560 + }, + { + "epoch": 286.0377358490566, + "grad_norm": 1.030736862070767, + "learning_rate": 7.018212766970658e-05, + "loss": 2.0723, + "step": 7580 + }, + { + "epoch": 286.79245283018867, + "grad_norm": 1.293105711429154, + "learning_rate": 7.013232544713086e-05, + "loss": 2.0759, + "step": 7600 + }, + { + "epoch": 287.54716981132077, + "grad_norm": 1.538845550854816, + "learning_rate": 7.008241635960018e-05, + "loss": 2.0238, + "step": 7620 + }, + { + "epoch": 288.3018867924528, + "grad_norm": 1.1376778839977162, + "learning_rate": 7.003240060013241e-05, + "loss": 2.0895, + "step": 7640 + }, + { + "epoch": 289.0566037735849, + "grad_norm": 1.1524198390846205, + "learning_rate": 6.998227836215794e-05, + "loss": 2.0712, + "step": 7660 + }, + { + "epoch": 289.811320754717, + "grad_norm": 1.1269455096899952, + "learning_rate": 6.9932049839519e-05, + "loss": 2.0785, + "step": 7680 + }, + { + "epoch": 290.5660377358491, + "grad_norm": 1.1947929898571277, + "learning_rate": 6.98817152264688e-05, + "loss": 2.0789, + "step": 7700 + }, + { + "epoch": 291.3207547169811, + "grad_norm": 1.05216691350341, + "learning_rate": 6.983127471767088e-05, + "loss": 2.0721, + "step": 7720 + }, + { + "epoch": 292.07547169811323, + "grad_norm": 1.0659553585546824, + "learning_rate": 6.978072850819832e-05, + "loss": 2.0897, + "step": 7740 + }, + { + "epoch": 292.8301886792453, + "grad_norm": 1.4999207869643305, + "learning_rate": 6.9730076793533e-05, + "loss": 2.0875, + "step": 7760 + }, + { + "epoch": 293.58490566037733, + "grad_norm": 1.051015475094539, + "learning_rate": 6.967931976956479e-05, + "loss": 2.0572, + "step": 7780 + }, + { + "epoch": 294.33962264150944, + "grad_norm": 1.4062884543226315, + "learning_rate": 6.962845763259084e-05, + "loss": 2.0783, + "step": 7800 + }, + { + "epoch": 295.0943396226415, + "grad_norm": 1.3344933104485628, + "learning_rate": 6.957749057931486e-05, + "loss": 2.0491, + "step": 7820 + }, + { + "epoch": 295.8490566037736, + "grad_norm": 1.2853456909301206, + "learning_rate": 6.952641880684623e-05, + "loss": 2.0589, + "step": 7840 + }, + { + "epoch": 296.60377358490564, + "grad_norm": 0.9567533052896401, + "learning_rate": 6.947524251269942e-05, + "loss": 2.0638, + "step": 7860 + }, + { + "epoch": 297.35849056603774, + "grad_norm": 1.0295871659614384, + "learning_rate": 6.942396189479305e-05, + "loss": 2.0452, + "step": 7880 + }, + { + "epoch": 298.1132075471698, + "grad_norm": 1.0385361931014787, + "learning_rate": 6.937257715144922e-05, + "loss": 2.0693, + "step": 7900 + }, + { + "epoch": 298.8679245283019, + "grad_norm": 1.0809377854877955, + "learning_rate": 6.932108848139274e-05, + "loss": 2.0657, + "step": 7920 + }, + { + "epoch": 299.62264150943395, + "grad_norm": 1.2262962690135735, + "learning_rate": 6.926949608375031e-05, + "loss": 2.0333, + "step": 7940 + }, + { + "epoch": 300.37735849056605, + "grad_norm": 1.2494078282001366, + "learning_rate": 6.921780015804983e-05, + "loss": 2.0611, + "step": 7960 + }, + { + "epoch": 301.1320754716981, + "grad_norm": 1.2191785732688871, + "learning_rate": 6.916600090421955e-05, + "loss": 2.0414, + "step": 7980 + }, + { + "epoch": 301.8867924528302, + "grad_norm": 1.3922552836876412, + "learning_rate": 6.911409852258734e-05, + "loss": 2.0344, + "step": 8000 + }, + { + "epoch": 302.64150943396226, + "grad_norm": 1.001774801425353, + "learning_rate": 6.906209321387992e-05, + "loss": 2.0745, + "step": 8020 + }, + { + "epoch": 303.39622641509436, + "grad_norm": 1.1104337640037032, + "learning_rate": 6.900998517922203e-05, + "loss": 2.0593, + "step": 8040 + }, + { + "epoch": 304.1509433962264, + "grad_norm": 1.825335695980726, + "learning_rate": 6.895777462013575e-05, + "loss": 2.0459, + "step": 8060 + }, + { + "epoch": 304.9056603773585, + "grad_norm": 1.157162721000543, + "learning_rate": 6.89054617385396e-05, + "loss": 2.0565, + "step": 8080 + }, + { + "epoch": 305.66037735849056, + "grad_norm": 1.0813317731646406, + "learning_rate": 6.885304673674785e-05, + "loss": 2.0647, + "step": 8100 + }, + { + "epoch": 306.41509433962267, + "grad_norm": 1.1711257270339308, + "learning_rate": 6.880052981746973e-05, + "loss": 2.0779, + "step": 8120 + }, + { + "epoch": 307.1698113207547, + "grad_norm": 1.063366248076951, + "learning_rate": 6.874791118380859e-05, + "loss": 2.0299, + "step": 8140 + }, + { + "epoch": 307.92452830188677, + "grad_norm": 1.4839153895893722, + "learning_rate": 6.869519103926117e-05, + "loss": 2.0689, + "step": 8160 + }, + { + "epoch": 308.6792452830189, + "grad_norm": 1.0599730190677705, + "learning_rate": 6.864236958771677e-05, + "loss": 2.0559, + "step": 8180 + }, + { + "epoch": 309.4339622641509, + "grad_norm": 1.1000743640073944, + "learning_rate": 6.85894470334565e-05, + "loss": 2.0814, + "step": 8200 + }, + { + "epoch": 310.188679245283, + "grad_norm": 1.1134888630426287, + "learning_rate": 6.853642358115248e-05, + "loss": 2.0619, + "step": 8220 + }, + { + "epoch": 310.9433962264151, + "grad_norm": 1.4109893718513755, + "learning_rate": 6.848329943586703e-05, + "loss": 2.0478, + "step": 8240 + }, + { + "epoch": 311.6981132075472, + "grad_norm": 1.4005508549478216, + "learning_rate": 6.843007480305188e-05, + "loss": 2.0451, + "step": 8260 + }, + { + "epoch": 312.45283018867923, + "grad_norm": 1.1506459796822934, + "learning_rate": 6.83767498885474e-05, + "loss": 2.0496, + "step": 8280 + }, + { + "epoch": 313.20754716981133, + "grad_norm": 1.1846013546521996, + "learning_rate": 6.832332489858181e-05, + "loss": 2.0503, + "step": 8300 + }, + { + "epoch": 313.9622641509434, + "grad_norm": 1.083146150872066, + "learning_rate": 6.826980003977029e-05, + "loss": 2.0411, + "step": 8320 + }, + { + "epoch": 314.7169811320755, + "grad_norm": 1.1083923007981826, + "learning_rate": 6.821617551911432e-05, + "loss": 2.059, + "step": 8340 + }, + { + "epoch": 315.47169811320754, + "grad_norm": 1.7089827022606041, + "learning_rate": 6.816245154400081e-05, + "loss": 2.0316, + "step": 8360 + }, + { + "epoch": 316.22641509433964, + "grad_norm": 1.2036464145657677, + "learning_rate": 6.810862832220125e-05, + "loss": 2.0383, + "step": 8380 + }, + { + "epoch": 316.9811320754717, + "grad_norm": 1.0678492328292477, + "learning_rate": 6.8054706061871e-05, + "loss": 2.0357, + "step": 8400 + }, + { + "epoch": 317.7358490566038, + "grad_norm": 1.1510123829327024, + "learning_rate": 6.800068497154838e-05, + "loss": 2.0509, + "step": 8420 + }, + { + "epoch": 318.49056603773585, + "grad_norm": 1.1744519756591179, + "learning_rate": 6.794656526015402e-05, + "loss": 2.0362, + "step": 8440 + }, + { + "epoch": 319.24528301886795, + "grad_norm": 1.0951767070535987, + "learning_rate": 6.78923471369899e-05, + "loss": 2.0261, + "step": 8460 + }, + { + "epoch": 320.0, + "grad_norm": 1.40278574496307, + "learning_rate": 6.783803081173856e-05, + "loss": 2.0041, + "step": 8480 + }, + { + "epoch": 320.75471698113205, + "grad_norm": 1.2731462205629138, + "learning_rate": 6.778361649446238e-05, + "loss": 2.0455, + "step": 8500 + }, + { + "epoch": 321.50943396226415, + "grad_norm": 1.1686588861352702, + "learning_rate": 6.772910439560273e-05, + "loss": 2.0328, + "step": 8520 + }, + { + "epoch": 322.2641509433962, + "grad_norm": 1.0989551544372271, + "learning_rate": 6.767449472597907e-05, + "loss": 2.0495, + "step": 8540 + }, + { + "epoch": 323.0188679245283, + "grad_norm": 1.3624805761549945, + "learning_rate": 6.761978769678828e-05, + "loss": 2.0447, + "step": 8560 + }, + { + "epoch": 323.77358490566036, + "grad_norm": 1.1444357160826135, + "learning_rate": 6.75649835196037e-05, + "loss": 2.0663, + "step": 8580 + }, + { + "epoch": 324.52830188679246, + "grad_norm": 1.1946574491976927, + "learning_rate": 6.75100824063744e-05, + "loss": 2.0483, + "step": 8600 + }, + { + "epoch": 325.2830188679245, + "grad_norm": 1.0504702569050626, + "learning_rate": 6.745508456942438e-05, + "loss": 1.9978, + "step": 8620 + }, + { + "epoch": 326.0377358490566, + "grad_norm": 0.9767612100068984, + "learning_rate": 6.739999022145167e-05, + "loss": 2.0382, + "step": 8640 + }, + { + "epoch": 326.79245283018867, + "grad_norm": 1.0291078738332238, + "learning_rate": 6.734479957552753e-05, + "loss": 2.0298, + "step": 8660 + }, + { + "epoch": 327.54716981132077, + "grad_norm": 1.244368475618607, + "learning_rate": 6.72895128450957e-05, + "loss": 2.005, + "step": 8680 + }, + { + "epoch": 328.3018867924528, + "grad_norm": 1.1290954094741668, + "learning_rate": 6.723413024397144e-05, + "loss": 2.0569, + "step": 8700 + }, + { + "epoch": 329.0566037735849, + "grad_norm": 1.0915375487825718, + "learning_rate": 6.717865198634082e-05, + "loss": 2.0447, + "step": 8720 + }, + { + "epoch": 329.811320754717, + "grad_norm": 1.1768398401350053, + "learning_rate": 6.71230782867599e-05, + "loss": 2.0217, + "step": 8740 + }, + { + "epoch": 330.5660377358491, + "grad_norm": 1.104835402612007, + "learning_rate": 6.706740936015375e-05, + "loss": 2.0386, + "step": 8760 + }, + { + "epoch": 331.3207547169811, + "grad_norm": 1.1248015036534322, + "learning_rate": 6.70116454218158e-05, + "loss": 2.0103, + "step": 8780 + }, + { + "epoch": 332.07547169811323, + "grad_norm": 1.2169922349555569, + "learning_rate": 6.69557866874069e-05, + "loss": 2.0241, + "step": 8800 + }, + { + "epoch": 332.8301886792453, + "grad_norm": 1.214613807170357, + "learning_rate": 6.689983337295448e-05, + "loss": 2.0188, + "step": 8820 + }, + { + "epoch": 333.58490566037733, + "grad_norm": 1.790201434963867, + "learning_rate": 6.684378569485181e-05, + "loss": 1.9779, + "step": 8840 + }, + { + "epoch": 334.33962264150944, + "grad_norm": 1.1294537346241684, + "learning_rate": 6.678764386985706e-05, + "loss": 2.0288, + "step": 8860 + }, + { + "epoch": 335.0943396226415, + "grad_norm": 1.023220014865738, + "learning_rate": 6.673140811509254e-05, + "loss": 2.0388, + "step": 8880 + }, + { + "epoch": 335.8490566037736, + "grad_norm": 1.184810167823652, + "learning_rate": 6.667507864804373e-05, + "loss": 2.0527, + "step": 8900 + }, + { + "epoch": 336.60377358490564, + "grad_norm": 1.4108637345041166, + "learning_rate": 6.661865568655867e-05, + "loss": 2.0521, + "step": 8920 + }, + { + "epoch": 337.35849056603774, + "grad_norm": 1.1099661578909887, + "learning_rate": 6.656213944884687e-05, + "loss": 2.0142, + "step": 8940 + }, + { + "epoch": 338.1132075471698, + "grad_norm": 1.2550646623744501, + "learning_rate": 6.650553015347861e-05, + "loss": 2.0234, + "step": 8960 + }, + { + "epoch": 338.8679245283019, + "grad_norm": 1.3226209109800835, + "learning_rate": 6.64488280193841e-05, + "loss": 2.0026, + "step": 8980 + }, + { + "epoch": 339.62264150943395, + "grad_norm": 1.2466974723773605, + "learning_rate": 6.639203326585253e-05, + "loss": 2.0505, + "step": 9000 + }, + { + "epoch": 340.37735849056605, + "grad_norm": 1.2259925868881607, + "learning_rate": 6.633514611253129e-05, + "loss": 1.989, + "step": 9020 + }, + { + "epoch": 341.1320754716981, + "grad_norm": 1.2616555953485367, + "learning_rate": 6.627816677942518e-05, + "loss": 2.0172, + "step": 9040 + }, + { + "epoch": 341.8867924528302, + "grad_norm": 1.0660372107925478, + "learning_rate": 6.622109548689542e-05, + "loss": 2.0235, + "step": 9060 + }, + { + "epoch": 342.64150943396226, + "grad_norm": 1.2995047263783295, + "learning_rate": 6.616393245565893e-05, + "loss": 2.0116, + "step": 9080 + }, + { + "epoch": 343.39622641509436, + "grad_norm": 1.2947984731633606, + "learning_rate": 6.610667790678738e-05, + "loss": 2.0241, + "step": 9100 + }, + { + "epoch": 344.1509433962264, + "grad_norm": 1.008247863003288, + "learning_rate": 6.60493320617064e-05, + "loss": 1.984, + "step": 9120 + }, + { + "epoch": 344.9056603773585, + "grad_norm": 1.5922178618355085, + "learning_rate": 6.599189514219469e-05, + "loss": 1.989, + "step": 9140 + }, + { + "epoch": 345.66037735849056, + "grad_norm": 1.0785537649724395, + "learning_rate": 6.593436737038316e-05, + "loss": 2.0135, + "step": 9160 + }, + { + "epoch": 346.41509433962267, + "grad_norm": 1.1766322003509095, + "learning_rate": 6.58767489687541e-05, + "loss": 2.0021, + "step": 9180 + }, + { + "epoch": 347.1698113207547, + "grad_norm": 1.3777596137615202, + "learning_rate": 6.581904016014026e-05, + "loss": 1.9988, + "step": 9200 + }, + { + "epoch": 347.92452830188677, + "grad_norm": 1.5517987219865874, + "learning_rate": 6.57612411677241e-05, + "loss": 2.0309, + "step": 9220 + }, + { + "epoch": 348.6792452830189, + "grad_norm": 1.0560275221648643, + "learning_rate": 6.570335221503679e-05, + "loss": 1.9923, + "step": 9240 + }, + { + "epoch": 349.4339622641509, + "grad_norm": 1.3504991405267055, + "learning_rate": 6.564537352595744e-05, + "loss": 1.9739, + "step": 9260 + }, + { + "epoch": 350.188679245283, + "grad_norm": 1.153039781830911, + "learning_rate": 6.558730532471219e-05, + "loss": 1.9803, + "step": 9280 + }, + { + "epoch": 350.9433962264151, + "grad_norm": 0.9434571532030971, + "learning_rate": 6.55291478358734e-05, + "loss": 1.9677, + "step": 9300 + }, + { + "epoch": 351.6981132075472, + "grad_norm": 1.8784899020425583, + "learning_rate": 6.547090128435869e-05, + "loss": 1.9988, + "step": 9320 + }, + { + "epoch": 352.45283018867923, + "grad_norm": 1.449139419473746, + "learning_rate": 6.541256589543013e-05, + "loss": 1.9974, + "step": 9340 + }, + { + "epoch": 353.20754716981133, + "grad_norm": 1.3936924715065266, + "learning_rate": 6.53541418946934e-05, + "loss": 2.016, + "step": 9360 + }, + { + "epoch": 353.9622641509434, + "grad_norm": 1.6114753123601104, + "learning_rate": 6.529562950809679e-05, + "loss": 2.0021, + "step": 9380 + }, + { + "epoch": 354.7169811320755, + "grad_norm": 1.109612991369577, + "learning_rate": 6.523702896193052e-05, + "loss": 1.9928, + "step": 9400 + }, + { + "epoch": 355.47169811320754, + "grad_norm": 1.2128714849575388, + "learning_rate": 6.517834048282572e-05, + "loss": 1.9908, + "step": 9420 + }, + { + "epoch": 356.22641509433964, + "grad_norm": 1.6910853942561526, + "learning_rate": 6.511956429775353e-05, + "loss": 1.998, + "step": 9440 + }, + { + "epoch": 356.9811320754717, + "grad_norm": 1.1430018991497974, + "learning_rate": 6.506070063402434e-05, + "loss": 1.9726, + "step": 9460 + }, + { + "epoch": 357.7358490566038, + "grad_norm": 1.1413458296675405, + "learning_rate": 6.500174971928684e-05, + "loss": 1.9972, + "step": 9480 + }, + { + "epoch": 358.49056603773585, + "grad_norm": 1.1439825840759497, + "learning_rate": 6.494271178152717e-05, + "loss": 1.9965, + "step": 9500 + }, + { + "epoch": 359.24528301886795, + "grad_norm": 1.1294761259383999, + "learning_rate": 6.488358704906799e-05, + "loss": 1.9651, + "step": 9520 + }, + { + "epoch": 360.0, + "grad_norm": 1.0691230669285636, + "learning_rate": 6.482437575056767e-05, + "loss": 1.9559, + "step": 9540 + }, + { + "epoch": 360.75471698113205, + "grad_norm": 1.1830173398938235, + "learning_rate": 6.476507811501933e-05, + "loss": 2.0035, + "step": 9560 + }, + { + "epoch": 361.50943396226415, + "grad_norm": 1.0463609194131098, + "learning_rate": 6.470569437175001e-05, + "loss": 2.0062, + "step": 9580 + }, + { + "epoch": 362.2641509433962, + "grad_norm": 1.0991234719266971, + "learning_rate": 6.464622475041972e-05, + "loss": 1.9775, + "step": 9600 + }, + { + "epoch": 363.0188679245283, + "grad_norm": 1.198408129328553, + "learning_rate": 6.458666948102068e-05, + "loss": 1.9684, + "step": 9620 + }, + { + "epoch": 363.77358490566036, + "grad_norm": 1.2518491777745682, + "learning_rate": 6.452702879387625e-05, + "loss": 2.0052, + "step": 9640 + }, + { + "epoch": 364.52830188679246, + "grad_norm": 1.1372899117638453, + "learning_rate": 6.44673029196402e-05, + "loss": 2.005, + "step": 9660 + }, + { + "epoch": 365.2830188679245, + "grad_norm": 1.1412736472546972, + "learning_rate": 6.44074920892957e-05, + "loss": 1.9545, + "step": 9680 + }, + { + "epoch": 366.0377358490566, + "grad_norm": 1.295474241313066, + "learning_rate": 6.434759653415454e-05, + "loss": 1.9943, + "step": 9700 + }, + { + "epoch": 366.79245283018867, + "grad_norm": 1.6454682745260736, + "learning_rate": 6.42876164858561e-05, + "loss": 1.9831, + "step": 9720 + }, + { + "epoch": 367.54716981132077, + "grad_norm": 1.3152256044573012, + "learning_rate": 6.42275521763666e-05, + "loss": 1.9898, + "step": 9740 + }, + { + "epoch": 368.3018867924528, + "grad_norm": 1.1398547695205932, + "learning_rate": 6.416740383797806e-05, + "loss": 2.0018, + "step": 9760 + }, + { + "epoch": 369.0566037735849, + "grad_norm": 1.560025618616493, + "learning_rate": 6.410717170330754e-05, + "loss": 1.9774, + "step": 9780 + }, + { + "epoch": 369.811320754717, + "grad_norm": 1.1084036628854508, + "learning_rate": 6.404685600529614e-05, + "loss": 1.9898, + "step": 9800 + }, + { + "epoch": 370.5660377358491, + "grad_norm": 1.064043317797065, + "learning_rate": 6.398645697720813e-05, + "loss": 1.9683, + "step": 9820 + }, + { + "epoch": 371.3207547169811, + "grad_norm": 1.0561133254372814, + "learning_rate": 6.392597485263005e-05, + "loss": 1.9892, + "step": 9840 + }, + { + "epoch": 372.07547169811323, + "grad_norm": 1.8498750890966262, + "learning_rate": 6.386540986546981e-05, + "loss": 2.0028, + "step": 9860 + }, + { + "epoch": 372.8301886792453, + "grad_norm": 1.1839290834136853, + "learning_rate": 6.38047622499558e-05, + "loss": 2.0067, + "step": 9880 + }, + { + "epoch": 373.58490566037733, + "grad_norm": 1.0860526304824587, + "learning_rate": 6.374403224063593e-05, + "loss": 1.9771, + "step": 9900 + }, + { + "epoch": 374.33962264150944, + "grad_norm": 1.1746289875773082, + "learning_rate": 6.368322007237679e-05, + "loss": 1.9693, + "step": 9920 + }, + { + "epoch": 375.0943396226415, + "grad_norm": 1.3082714258157306, + "learning_rate": 6.36223259803627e-05, + "loss": 1.9737, + "step": 9940 + }, + { + "epoch": 375.8490566037736, + "grad_norm": 1.1647865395498773, + "learning_rate": 6.356135020009478e-05, + "loss": 1.9619, + "step": 9960 + }, + { + "epoch": 376.60377358490564, + "grad_norm": 0.947059541183795, + "learning_rate": 6.350029296739012e-05, + "loss": 1.975, + "step": 9980 + }, + { + "epoch": 377.35849056603774, + "grad_norm": 1.2248902808010191, + "learning_rate": 6.343915451838081e-05, + "loss": 1.9628, + "step": 10000 + }, + { + "epoch": 378.1132075471698, + "grad_norm": 1.097611241891744, + "learning_rate": 6.337793508951301e-05, + "loss": 1.9775, + "step": 10020 + }, + { + "epoch": 378.8679245283019, + "grad_norm": 1.2529669087878597, + "learning_rate": 6.331663491754607e-05, + "loss": 1.9468, + "step": 10040 + }, + { + "epoch": 379.62264150943395, + "grad_norm": 1.1767271144174725, + "learning_rate": 6.325525423955162e-05, + "loss": 1.9413, + "step": 10060 + }, + { + "epoch": 380.37735849056605, + "grad_norm": 1.282222785156654, + "learning_rate": 6.319379329291262e-05, + "loss": 1.9655, + "step": 10080 + }, + { + "epoch": 381.1320754716981, + "grad_norm": 0.9819686841799513, + "learning_rate": 6.313225231532246e-05, + "loss": 1.9537, + "step": 10100 + }, + { + "epoch": 381.8867924528302, + "grad_norm": 1.206003307363446, + "learning_rate": 6.307063154478407e-05, + "loss": 1.9387, + "step": 10120 + }, + { + "epoch": 382.64150943396226, + "grad_norm": 1.236739142400694, + "learning_rate": 6.300893121960891e-05, + "loss": 1.9478, + "step": 10140 + }, + { + "epoch": 383.39622641509436, + "grad_norm": 1.0017771325975895, + "learning_rate": 6.294715157841618e-05, + "loss": 1.9714, + "step": 10160 + }, + { + "epoch": 384.1509433962264, + "grad_norm": 1.0637999951499557, + "learning_rate": 6.28852928601318e-05, + "loss": 1.9905, + "step": 10180 + }, + { + "epoch": 384.9056603773585, + "grad_norm": 1.0944082795368726, + "learning_rate": 6.282335530398746e-05, + "loss": 1.9586, + "step": 10200 + }, + { + "epoch": 385.66037735849056, + "grad_norm": 1.0420947581782276, + "learning_rate": 6.276133914951982e-05, + "loss": 2.0008, + "step": 10220 + }, + { + "epoch": 386.41509433962267, + "grad_norm": 1.2531335945397626, + "learning_rate": 6.26992446365695e-05, + "loss": 1.9718, + "step": 10240 + }, + { + "epoch": 387.1698113207547, + "grad_norm": 1.0272789455614961, + "learning_rate": 6.26370720052801e-05, + "loss": 1.9741, + "step": 10260 + }, + { + "epoch": 387.92452830188677, + "grad_norm": 1.1543574176007045, + "learning_rate": 6.25748214960974e-05, + "loss": 1.9508, + "step": 10280 + }, + { + "epoch": 388.6792452830189, + "grad_norm": 1.123008926585049, + "learning_rate": 6.251249334976835e-05, + "loss": 1.9238, + "step": 10300 + }, + { + "epoch": 389.4339622641509, + "grad_norm": 1.1351605673087415, + "learning_rate": 6.245008780734015e-05, + "loss": 1.9379, + "step": 10320 + }, + { + "epoch": 390.188679245283, + "grad_norm": 1.139914151072252, + "learning_rate": 6.238760511015928e-05, + "loss": 1.9863, + "step": 10340 + }, + { + "epoch": 390.9433962264151, + "grad_norm": 1.3069434137417522, + "learning_rate": 6.232504549987069e-05, + "loss": 1.9569, + "step": 10360 + }, + { + "epoch": 391.6981132075472, + "grad_norm": 1.7598014364780348, + "learning_rate": 6.22624092184167e-05, + "loss": 1.9389, + "step": 10380 + }, + { + "epoch": 392.45283018867923, + "grad_norm": 1.0862334208555093, + "learning_rate": 6.21996965080362e-05, + "loss": 1.9744, + "step": 10400 + }, + { + "epoch": 393.20754716981133, + "grad_norm": 1.1400427784758083, + "learning_rate": 6.213690761126365e-05, + "loss": 1.9563, + "step": 10420 + }, + { + "epoch": 393.9622641509434, + "grad_norm": 1.171092319320692, + "learning_rate": 6.207404277092816e-05, + "loss": 1.9268, + "step": 10440 + }, + { + "epoch": 394.7169811320755, + "grad_norm": 1.2187674621534166, + "learning_rate": 6.201110223015247e-05, + "loss": 1.9141, + "step": 10460 + }, + { + "epoch": 395.47169811320754, + "grad_norm": 1.1182747577783947, + "learning_rate": 6.19480862323522e-05, + "loss": 1.9498, + "step": 10480 + }, + { + "epoch": 396.22641509433964, + "grad_norm": 1.2189637302318261, + "learning_rate": 6.188499502123471e-05, + "loss": 1.9563, + "step": 10500 + }, + { + "epoch": 396.9811320754717, + "grad_norm": 1.0928304287739772, + "learning_rate": 6.18218288407983e-05, + "loss": 1.976, + "step": 10520 + }, + { + "epoch": 397.7358490566038, + "grad_norm": 1.0366879822409767, + "learning_rate": 6.17585879353311e-05, + "loss": 1.9804, + "step": 10540 + }, + { + "epoch": 398.49056603773585, + "grad_norm": 1.153371078643115, + "learning_rate": 6.169527254941035e-05, + "loss": 1.987, + "step": 10560 + }, + { + "epoch": 399.24528301886795, + "grad_norm": 1.2467206603942558, + "learning_rate": 6.163188292790129e-05, + "loss": 1.958, + "step": 10580 + }, + { + "epoch": 400.0, + "grad_norm": 1.222097823579558, + "learning_rate": 6.156841931595623e-05, + "loss": 1.9838, + "step": 10600 + }, + { + "epoch": 400.75471698113205, + "grad_norm": 1.1722193895632427, + "learning_rate": 6.150488195901367e-05, + "loss": 1.9496, + "step": 10620 + }, + { + "epoch": 401.50943396226415, + "grad_norm": 1.7976530024431303, + "learning_rate": 6.144127110279726e-05, + "loss": 1.9869, + "step": 10640 + }, + { + "epoch": 402.2641509433962, + "grad_norm": 1.1341428853515279, + "learning_rate": 6.137758699331498e-05, + "loss": 1.9235, + "step": 10660 + }, + { + "epoch": 403.0188679245283, + "grad_norm": 1.107959509965475, + "learning_rate": 6.131382987685803e-05, + "loss": 1.927, + "step": 10680 + }, + { + "epoch": 403.77358490566036, + "grad_norm": 1.8525780404729881, + "learning_rate": 6.125000000000001e-05, + "loss": 1.9487, + "step": 10700 + }, + { + "epoch": 404.52830188679246, + "grad_norm": 1.1448557605131082, + "learning_rate": 6.118609760959587e-05, + "loss": 1.9547, + "step": 10720 + }, + { + "epoch": 405.2830188679245, + "grad_norm": 1.1150883211805585, + "learning_rate": 6.112212295278103e-05, + "loss": 1.9487, + "step": 10740 + }, + { + "epoch": 406.0377358490566, + "grad_norm": 1.149956522288425, + "learning_rate": 6.105807627697039e-05, + "loss": 1.9614, + "step": 10760 + }, + { + "epoch": 406.79245283018867, + "grad_norm": 1.1217970508214505, + "learning_rate": 6.099395782985736e-05, + "loss": 1.9555, + "step": 10780 + }, + { + "epoch": 407.54716981132077, + "grad_norm": 1.261444831314206, + "learning_rate": 6.0929767859412914e-05, + "loss": 1.9527, + "step": 10800 + }, + { + "epoch": 408.3018867924528, + "grad_norm": 1.2610523391632782, + "learning_rate": 6.086550661388466e-05, + "loss": 1.9321, + "step": 10820 + }, + { + "epoch": 409.0566037735849, + "grad_norm": 1.1090357115444625, + "learning_rate": 6.080117434179586e-05, + "loss": 1.9211, + "step": 10840 + }, + { + "epoch": 409.811320754717, + "grad_norm": 0.9790706595618122, + "learning_rate": 6.0736771291944384e-05, + "loss": 1.9203, + "step": 10860 + }, + { + "epoch": 410.5660377358491, + "grad_norm": 1.2166651155014474, + "learning_rate": 6.067229771340195e-05, + "loss": 1.9323, + "step": 10880 + }, + { + "epoch": 411.3207547169811, + "grad_norm": 1.433284831152631, + "learning_rate": 6.0607753855512944e-05, + "loss": 1.9623, + "step": 10900 + }, + { + "epoch": 412.07547169811323, + "grad_norm": 1.5219003618009672, + "learning_rate": 6.054313996789358e-05, + "loss": 1.9198, + "step": 10920 + }, + { + "epoch": 412.8301886792453, + "grad_norm": 1.1498060228780786, + "learning_rate": 6.047845630043091e-05, + "loss": 1.9487, + "step": 10940 + }, + { + "epoch": 413.58490566037733, + "grad_norm": 1.1202482322074203, + "learning_rate": 6.041370310328184e-05, + "loss": 1.9067, + "step": 10960 + }, + { + "epoch": 414.33962264150944, + "grad_norm": 1.2430430079147141, + "learning_rate": 6.0348880626872184e-05, + "loss": 1.9382, + "step": 10980 + }, + { + "epoch": 415.0943396226415, + "grad_norm": 1.0869359699622836, + "learning_rate": 6.028398912189569e-05, + "loss": 1.9611, + "step": 11000 + }, + { + "epoch": 415.8490566037736, + "grad_norm": 1.2526066034095944, + "learning_rate": 6.0219028839313045e-05, + "loss": 1.9644, + "step": 11020 + }, + { + "epoch": 416.60377358490564, + "grad_norm": 1.3228258582837578, + "learning_rate": 6.015400003035096e-05, + "loss": 1.9401, + "step": 11040 + }, + { + "epoch": 417.35849056603774, + "grad_norm": 1.9762624742364299, + "learning_rate": 6.008890294650111e-05, + "loss": 1.9465, + "step": 11060 + }, + { + "epoch": 418.1132075471698, + "grad_norm": 1.1424506198639062, + "learning_rate": 6.0023737839519284e-05, + "loss": 1.9439, + "step": 11080 + }, + { + "epoch": 418.8679245283019, + "grad_norm": 1.2107658482065982, + "learning_rate": 5.995850496142429e-05, + "loss": 1.9342, + "step": 11100 + }, + { + "epoch": 419.62264150943395, + "grad_norm": 1.1945042871517195, + "learning_rate": 5.989320456449705e-05, + "loss": 1.9463, + "step": 11120 + }, + { + "epoch": 420.37735849056605, + "grad_norm": 1.4521278706471037, + "learning_rate": 5.9827836901279616e-05, + "loss": 1.9097, + "step": 11140 + }, + { + "epoch": 421.1320754716981, + "grad_norm": 1.3941560968372226, + "learning_rate": 5.97624022245742e-05, + "loss": 1.9332, + "step": 11160 + }, + { + "epoch": 421.8867924528302, + "grad_norm": 1.39870585850457, + "learning_rate": 5.969690078744211e-05, + "loss": 1.9458, + "step": 11180 + }, + { + "epoch": 422.64150943396226, + "grad_norm": 1.0274165784628992, + "learning_rate": 5.963133284320292e-05, + "loss": 1.9365, + "step": 11200 + }, + { + "epoch": 423.39622641509436, + "grad_norm": 1.1694905737535597, + "learning_rate": 5.956569864543338e-05, + "loss": 1.8966, + "step": 11220 + }, + { + "epoch": 424.1509433962264, + "grad_norm": 2.28522960541773, + "learning_rate": 5.9499998447966484e-05, + "loss": 1.9332, + "step": 11240 + }, + { + "epoch": 424.9056603773585, + "grad_norm": 1.1252246816652476, + "learning_rate": 5.943423250489044e-05, + "loss": 1.9308, + "step": 11260 + }, + { + "epoch": 425.66037735849056, + "grad_norm": 1.132211561056973, + "learning_rate": 5.9368401070547756e-05, + "loss": 1.9221, + "step": 11280 + }, + { + "epoch": 426.41509433962267, + "grad_norm": 1.0848974124812198, + "learning_rate": 5.93025043995342e-05, + "loss": 1.9374, + "step": 11300 + }, + { + "epoch": 427.1698113207547, + "grad_norm": 1.0363649309093041, + "learning_rate": 5.9236542746697845e-05, + "loss": 1.9461, + "step": 11320 + }, + { + "epoch": 427.92452830188677, + "grad_norm": 1.1928905874896651, + "learning_rate": 5.9170516367138065e-05, + "loss": 1.9378, + "step": 11340 + }, + { + "epoch": 428.6792452830189, + "grad_norm": 1.1106230737067035, + "learning_rate": 5.910442551620457e-05, + "loss": 1.942, + "step": 11360 + }, + { + "epoch": 429.4339622641509, + "grad_norm": 1.1576265708604865, + "learning_rate": 5.903827044949638e-05, + "loss": 1.9471, + "step": 11380 + }, + { + "epoch": 430.188679245283, + "grad_norm": 1.1174079944741127, + "learning_rate": 5.897205142286091e-05, + "loss": 1.8922, + "step": 11400 + }, + { + "epoch": 430.9433962264151, + "grad_norm": 1.3085632343404145, + "learning_rate": 5.890576869239289e-05, + "loss": 1.9458, + "step": 11420 + }, + { + "epoch": 431.6981132075472, + "grad_norm": 1.2863747903344196, + "learning_rate": 5.883942251443342e-05, + "loss": 1.9099, + "step": 11440 + }, + { + "epoch": 432.45283018867923, + "grad_norm": 1.0551943593908828, + "learning_rate": 5.877301314556899e-05, + "loss": 1.9141, + "step": 11460 + }, + { + "epoch": 433.20754716981133, + "grad_norm": 1.086077987508793, + "learning_rate": 5.870654084263047e-05, + "loss": 1.96, + "step": 11480 + }, + { + "epoch": 433.9622641509434, + "grad_norm": 1.3656558008500363, + "learning_rate": 5.864000586269215e-05, + "loss": 1.904, + "step": 11500 + }, + { + "epoch": 434.7169811320755, + "grad_norm": 1.4439265227942644, + "learning_rate": 5.8573408463070655e-05, + "loss": 1.9273, + "step": 11520 + }, + { + "epoch": 435.47169811320754, + "grad_norm": 1.3611886653187657, + "learning_rate": 5.850674890132405e-05, + "loss": 1.9034, + "step": 11540 + }, + { + "epoch": 436.22641509433964, + "grad_norm": 1.3616986059829845, + "learning_rate": 5.844002743525081e-05, + "loss": 1.9143, + "step": 11560 + }, + { + "epoch": 436.9811320754717, + "grad_norm": 1.1127209994732485, + "learning_rate": 5.8373244322888796e-05, + "loss": 1.9467, + "step": 11580 + }, + { + "epoch": 437.7358490566038, + "grad_norm": 1.2452581872873123, + "learning_rate": 5.83063998225143e-05, + "loss": 1.946, + "step": 11600 + }, + { + "epoch": 438.49056603773585, + "grad_norm": 1.169219637417814, + "learning_rate": 5.823949419264102e-05, + "loss": 1.9057, + "step": 11620 + }, + { + "epoch": 439.24528301886795, + "grad_norm": 1.3149994286787028, + "learning_rate": 5.817252769201905e-05, + "loss": 1.8922, + "step": 11640 + }, + { + "epoch": 440.0, + "grad_norm": 1.3111574851574335, + "learning_rate": 5.81055005796339e-05, + "loss": 1.9222, + "step": 11660 + }, + { + "epoch": 440.75471698113205, + "grad_norm": 1.187457856172297, + "learning_rate": 5.803841311470551e-05, + "loss": 1.9188, + "step": 11680 + }, + { + "epoch": 441.50943396226415, + "grad_norm": 1.057264779435906, + "learning_rate": 5.7971265556687206e-05, + "loss": 1.9185, + "step": 11700 + }, + { + "epoch": 442.2641509433962, + "grad_norm": 1.1403855029477634, + "learning_rate": 5.790405816526473e-05, + "loss": 1.9328, + "step": 11720 + }, + { + "epoch": 443.0188679245283, + "grad_norm": 1.2270148994812622, + "learning_rate": 5.78367912003552e-05, + "loss": 1.8952, + "step": 11740 + }, + { + "epoch": 443.77358490566036, + "grad_norm": 1.0385291337101263, + "learning_rate": 5.776946492210618e-05, + "loss": 1.9042, + "step": 11760 + }, + { + "epoch": 444.52830188679246, + "grad_norm": 1.2996045459665522, + "learning_rate": 5.770207959089455e-05, + "loss": 1.9373, + "step": 11780 + }, + { + "epoch": 445.2830188679245, + "grad_norm": 1.1405006769622614, + "learning_rate": 5.763463546732563e-05, + "loss": 1.9035, + "step": 11800 + }, + { + "epoch": 446.0377358490566, + "grad_norm": 1.2182586988416257, + "learning_rate": 5.756713281223206e-05, + "loss": 1.936, + "step": 11820 + }, + { + "epoch": 446.79245283018867, + "grad_norm": 1.097561953783009, + "learning_rate": 5.74995718866729e-05, + "loss": 1.9057, + "step": 11840 + }, + { + "epoch": 447.54716981132077, + "grad_norm": 1.0690078287157365, + "learning_rate": 5.743195295193255e-05, + "loss": 1.9074, + "step": 11860 + }, + { + "epoch": 448.3018867924528, + "grad_norm": 1.1127790113128593, + "learning_rate": 5.736427626951971e-05, + "loss": 1.9269, + "step": 11880 + }, + { + "epoch": 449.0566037735849, + "grad_norm": 1.0521548682001445, + "learning_rate": 5.729654210116646e-05, + "loss": 1.897, + "step": 11900 + }, + { + "epoch": 449.811320754717, + "grad_norm": 1.0831322365716964, + "learning_rate": 5.7228750708827196e-05, + "loss": 1.9019, + "step": 11920 + }, + { + "epoch": 450.5660377358491, + "grad_norm": 1.113425539515294, + "learning_rate": 5.71609023546776e-05, + "loss": 1.8995, + "step": 11940 + }, + { + "epoch": 451.3207547169811, + "grad_norm": 1.1378527380008467, + "learning_rate": 5.709299730111367e-05, + "loss": 1.9112, + "step": 11960 + }, + { + "epoch": 452.07547169811323, + "grad_norm": 1.2308344759482057, + "learning_rate": 5.702503581075065e-05, + "loss": 1.8869, + "step": 11980 + }, + { + "epoch": 452.8301886792453, + "grad_norm": 1.3869181367868268, + "learning_rate": 5.6957018146422106e-05, + "loss": 1.9092, + "step": 12000 + }, + { + "epoch": 453.58490566037733, + "grad_norm": 1.1702979518774306, + "learning_rate": 5.688894457117877e-05, + "loss": 1.8944, + "step": 12020 + }, + { + "epoch": 454.33962264150944, + "grad_norm": 1.2974690219427283, + "learning_rate": 5.6820815348287674e-05, + "loss": 1.8794, + "step": 12040 + }, + { + "epoch": 455.0943396226415, + "grad_norm": 1.3757370848375583, + "learning_rate": 5.675263074123103e-05, + "loss": 1.9208, + "step": 12060 + }, + { + "epoch": 455.8490566037736, + "grad_norm": 1.3314963474728592, + "learning_rate": 5.668439101370524e-05, + "loss": 1.8823, + "step": 12080 + }, + { + "epoch": 456.60377358490564, + "grad_norm": 1.1525239716029143, + "learning_rate": 5.6616096429619885e-05, + "loss": 1.8778, + "step": 12100 + }, + { + "epoch": 457.35849056603774, + "grad_norm": 1.1391429331630094, + "learning_rate": 5.6547747253096713e-05, + "loss": 1.8973, + "step": 12120 + }, + { + "epoch": 458.1132075471698, + "grad_norm": 1.2813875070982645, + "learning_rate": 5.647934374846856e-05, + "loss": 1.9037, + "step": 12140 + }, + { + "epoch": 458.8679245283019, + "grad_norm": 1.130130379386682, + "learning_rate": 5.641088618027841e-05, + "loss": 1.8946, + "step": 12160 + }, + { + "epoch": 459.62264150943395, + "grad_norm": 1.189098976296786, + "learning_rate": 5.6342374813278305e-05, + "loss": 1.9122, + "step": 12180 + }, + { + "epoch": 460.37735849056605, + "grad_norm": 1.18982288351709, + "learning_rate": 5.627380991242839e-05, + "loss": 1.8893, + "step": 12200 + }, + { + "epoch": 461.1320754716981, + "grad_norm": 1.3440462024222728, + "learning_rate": 5.6205191742895787e-05, + "loss": 1.8879, + "step": 12220 + }, + { + "epoch": 461.8867924528302, + "grad_norm": 1.0998628162432096, + "learning_rate": 5.613652057005367e-05, + "loss": 1.8911, + "step": 12240 + }, + { + "epoch": 462.64150943396226, + "grad_norm": 1.0660994627393063, + "learning_rate": 5.6067796659480196e-05, + "loss": 1.9055, + "step": 12260 + }, + { + "epoch": 463.39622641509436, + "grad_norm": 1.7426680752228556, + "learning_rate": 5.599902027695745e-05, + "loss": 1.897, + "step": 12280 + }, + { + "epoch": 464.1509433962264, + "grad_norm": 1.388841332022157, + "learning_rate": 5.593019168847049e-05, + "loss": 1.8812, + "step": 12300 + }, + { + "epoch": 464.9056603773585, + "grad_norm": 1.2274558384609464, + "learning_rate": 5.586131116020621e-05, + "loss": 1.8496, + "step": 12320 + }, + { + "epoch": 465.66037735849056, + "grad_norm": 1.1945002690405846, + "learning_rate": 5.5792378958552456e-05, + "loss": 1.9146, + "step": 12340 + }, + { + "epoch": 466.41509433962267, + "grad_norm": 1.1629769495886029, + "learning_rate": 5.5723395350096866e-05, + "loss": 1.8734, + "step": 12360 + }, + { + "epoch": 467.1698113207547, + "grad_norm": 1.1703423211235366, + "learning_rate": 5.565436060162589e-05, + "loss": 1.8882, + "step": 12380 + }, + { + "epoch": 467.92452830188677, + "grad_norm": 1.3904930914694782, + "learning_rate": 5.5585274980123765e-05, + "loss": 1.8794, + "step": 12400 + }, + { + "epoch": 468.6792452830189, + "grad_norm": 1.1043102032574945, + "learning_rate": 5.551613875277148e-05, + "loss": 1.888, + "step": 12420 + }, + { + "epoch": 469.4339622641509, + "grad_norm": 1.019172984960956, + "learning_rate": 5.5446952186945716e-05, + "loss": 1.8887, + "step": 12440 + }, + { + "epoch": 470.188679245283, + "grad_norm": 1.2815784609995193, + "learning_rate": 5.537771555021785e-05, + "loss": 1.9026, + "step": 12460 + }, + { + "epoch": 470.9433962264151, + "grad_norm": 1.0452909524777938, + "learning_rate": 5.53084291103529e-05, + "loss": 1.8688, + "step": 12480 + }, + { + "epoch": 471.6981132075472, + "grad_norm": 1.2824929707840547, + "learning_rate": 5.5239093135308484e-05, + "loss": 1.8568, + "step": 12500 + }, + { + "epoch": 472.45283018867923, + "grad_norm": 1.0473918662270072, + "learning_rate": 5.516970789323382e-05, + "loss": 1.8962, + "step": 12520 + }, + { + "epoch": 473.20754716981133, + "grad_norm": 1.1551860073406197, + "learning_rate": 5.5100273652468596e-05, + "loss": 1.9053, + "step": 12540 + }, + { + "epoch": 473.9622641509434, + "grad_norm": 1.2315884678620779, + "learning_rate": 5.50307906815421e-05, + "loss": 1.8802, + "step": 12560 + }, + { + "epoch": 474.7169811320755, + "grad_norm": 1.2036057101238689, + "learning_rate": 5.496125924917195e-05, + "loss": 1.8848, + "step": 12580 + }, + { + "epoch": 475.47169811320754, + "grad_norm": 1.1443042979106692, + "learning_rate": 5.4891679624263313e-05, + "loss": 1.8993, + "step": 12600 + }, + { + "epoch": 476.22641509433964, + "grad_norm": 1.1112985130684456, + "learning_rate": 5.482205207590763e-05, + "loss": 1.8997, + "step": 12620 + }, + { + "epoch": 476.9811320754717, + "grad_norm": 1.1198907315048803, + "learning_rate": 5.475237687338175e-05, + "loss": 1.9204, + "step": 12640 + }, + { + "epoch": 477.7358490566038, + "grad_norm": 1.0505243476691362, + "learning_rate": 5.468265428614679e-05, + "loss": 1.8824, + "step": 12660 + }, + { + "epoch": 478.49056603773585, + "grad_norm": 1.1618158349057395, + "learning_rate": 5.461288458384711e-05, + "loss": 1.8675, + "step": 12680 + }, + { + "epoch": 479.24528301886795, + "grad_norm": 1.310696647632245, + "learning_rate": 5.454306803630931e-05, + "loss": 1.8617, + "step": 12700 + }, + { + "epoch": 480.0, + "grad_norm": 1.2853008412145361, + "learning_rate": 5.447320491354114e-05, + "loss": 1.8798, + "step": 12720 + }, + { + "epoch": 480.75471698113205, + "grad_norm": 1.2035604713641803, + "learning_rate": 5.440329548573049e-05, + "loss": 1.8505, + "step": 12740 + }, + { + "epoch": 481.50943396226415, + "grad_norm": 1.301768301418178, + "learning_rate": 5.433334002324431e-05, + "loss": 1.8849, + "step": 12760 + }, + { + "epoch": 482.2641509433962, + "grad_norm": 1.0741158531319273, + "learning_rate": 5.426333879662761e-05, + "loss": 1.8362, + "step": 12780 + }, + { + "epoch": 483.0188679245283, + "grad_norm": 1.2118720683926874, + "learning_rate": 5.419329207660237e-05, + "loss": 1.8811, + "step": 12800 + }, + { + "epoch": 483.77358490566036, + "grad_norm": 1.295829194970654, + "learning_rate": 5.412320013406651e-05, + "loss": 1.8473, + "step": 12820 + }, + { + "epoch": 484.52830188679246, + "grad_norm": 1.2658203604478202, + "learning_rate": 5.405306324009282e-05, + "loss": 1.8728, + "step": 12840 + }, + { + "epoch": 485.2830188679245, + "grad_norm": 1.2195390339098875, + "learning_rate": 5.3982881665928015e-05, + "loss": 1.8704, + "step": 12860 + }, + { + "epoch": 486.0377358490566, + "grad_norm": 1.067227068131729, + "learning_rate": 5.391265568299149e-05, + "loss": 1.8619, + "step": 12880 + }, + { + "epoch": 486.79245283018867, + "grad_norm": 1.3306442274846357, + "learning_rate": 5.384238556287451e-05, + "loss": 1.8638, + "step": 12900 + }, + { + "epoch": 487.54716981132077, + "grad_norm": 1.2531810114251472, + "learning_rate": 5.377207157733893e-05, + "loss": 1.8839, + "step": 12920 + }, + { + "epoch": 488.3018867924528, + "grad_norm": 1.0879029191456078, + "learning_rate": 5.370171399831631e-05, + "loss": 1.866, + "step": 12940 + }, + { + "epoch": 489.0566037735849, + "grad_norm": 1.1769881515511749, + "learning_rate": 5.363131309790678e-05, + "loss": 1.8253, + "step": 12960 + }, + { + "epoch": 489.811320754717, + "grad_norm": 1.3614975573612427, + "learning_rate": 5.356086914837802e-05, + "loss": 1.8487, + "step": 12980 + }, + { + "epoch": 490.5660377358491, + "grad_norm": 1.5342718531352588, + "learning_rate": 5.349038242216419e-05, + "loss": 1.847, + "step": 13000 + }, + { + "epoch": 491.3207547169811, + "grad_norm": 1.1571547119310825, + "learning_rate": 5.341985319186489e-05, + "loss": 1.8822, + "step": 13020 + }, + { + "epoch": 492.07547169811323, + "grad_norm": 1.1739881074209173, + "learning_rate": 5.33492817302441e-05, + "loss": 1.8531, + "step": 13040 + }, + { + "epoch": 492.8301886792453, + "grad_norm": 1.1934573145337144, + "learning_rate": 5.3278668310229125e-05, + "loss": 1.8986, + "step": 13060 + }, + { + "epoch": 493.58490566037733, + "grad_norm": 2.343948986647593, + "learning_rate": 5.320801320490955e-05, + "loss": 1.8935, + "step": 13080 + }, + { + "epoch": 494.33962264150944, + "grad_norm": 1.0764970314512263, + "learning_rate": 5.3137316687536136e-05, + "loss": 1.854, + "step": 13100 + }, + { + "epoch": 495.0943396226415, + "grad_norm": 1.0129335749841757, + "learning_rate": 5.3066579031519824e-05, + "loss": 1.874, + "step": 13120 + }, + { + "epoch": 495.8490566037736, + "grad_norm": 1.3577114498479963, + "learning_rate": 5.299580051043069e-05, + "loss": 1.8534, + "step": 13140 + }, + { + "epoch": 496.60377358490564, + "grad_norm": 1.1849380554040083, + "learning_rate": 5.292498139799678e-05, + "loss": 1.8705, + "step": 13160 + }, + { + "epoch": 497.35849056603774, + "grad_norm": 1.1290155132472126, + "learning_rate": 5.2854121968103185e-05, + "loss": 1.8659, + "step": 13180 + }, + { + "epoch": 498.1132075471698, + "grad_norm": 1.3627686926052522, + "learning_rate": 5.278322249479088e-05, + "loss": 1.8686, + "step": 13200 + }, + { + "epoch": 498.8679245283019, + "grad_norm": 1.147585604555274, + "learning_rate": 5.271228325225573e-05, + "loss": 1.8301, + "step": 13220 + }, + { + "epoch": 499.62264150943395, + "grad_norm": 1.124456551859716, + "learning_rate": 5.264130451484736e-05, + "loss": 1.846, + "step": 13240 + }, + { + "epoch": 500.37735849056605, + "grad_norm": 1.1328557419125425, + "learning_rate": 5.257028655706819e-05, + "loss": 1.8489, + "step": 13260 + }, + { + "epoch": 501.1320754716981, + "grad_norm": 1.3248366741093285, + "learning_rate": 5.249922965357231e-05, + "loss": 1.847, + "step": 13280 + }, + { + "epoch": 501.8867924528302, + "grad_norm": 1.2987470821979115, + "learning_rate": 5.24281340791644e-05, + "loss": 1.8598, + "step": 13300 + }, + { + "epoch": 502.64150943396226, + "grad_norm": 1.2954015666799925, + "learning_rate": 5.235700010879869e-05, + "loss": 1.8144, + "step": 13320 + }, + { + "epoch": 503.39622641509436, + "grad_norm": 1.14457084913068, + "learning_rate": 5.228582801757796e-05, + "loss": 1.8666, + "step": 13340 + }, + { + "epoch": 504.1509433962264, + "grad_norm": 1.1877213110343792, + "learning_rate": 5.221461808075237e-05, + "loss": 1.8637, + "step": 13360 + }, + { + "epoch": 504.9056603773585, + "grad_norm": 1.0687096720310838, + "learning_rate": 5.214337057371846e-05, + "loss": 1.8639, + "step": 13380 + }, + { + "epoch": 505.66037735849056, + "grad_norm": 1.1296264305151373, + "learning_rate": 5.207208577201805e-05, + "loss": 1.8508, + "step": 13400 + }, + { + "epoch": 506.41509433962267, + "grad_norm": 1.4245030753661052, + "learning_rate": 5.200076395133721e-05, + "loss": 1.8328, + "step": 13420 + }, + { + "epoch": 507.1698113207547, + "grad_norm": 1.3736962730435212, + "learning_rate": 5.1929405387505185e-05, + "loss": 1.8402, + "step": 13440 + }, + { + "epoch": 507.92452830188677, + "grad_norm": 1.3874806329791736, + "learning_rate": 5.185801035649329e-05, + "loss": 1.8392, + "step": 13460 + }, + { + "epoch": 508.6792452830189, + "grad_norm": 1.2993168124302985, + "learning_rate": 5.1786579134413916e-05, + "loss": 1.8357, + "step": 13480 + }, + { + "epoch": 509.4339622641509, + "grad_norm": 1.1615849238599296, + "learning_rate": 5.171511199751936e-05, + "loss": 1.8602, + "step": 13500 + }, + { + "epoch": 510.188679245283, + "grad_norm": 1.313961870036688, + "learning_rate": 5.164360922220089e-05, + "loss": 1.8276, + "step": 13520 + }, + { + "epoch": 510.9433962264151, + "grad_norm": 1.240911570140835, + "learning_rate": 5.157207108498754e-05, + "loss": 1.83, + "step": 13540 + }, + { + "epoch": 511.6981132075472, + "grad_norm": 1.2739381058558579, + "learning_rate": 5.1500497862545134e-05, + "loss": 1.864, + "step": 13560 + }, + { + "epoch": 512.4528301886793, + "grad_norm": 1.3641387795362538, + "learning_rate": 5.142888983167516e-05, + "loss": 1.9016, + "step": 13580 + }, + { + "epoch": 513.2075471698113, + "grad_norm": 1.233949050539118, + "learning_rate": 5.135724726931374e-05, + "loss": 1.8224, + "step": 13600 + }, + { + "epoch": 513.9622641509434, + "grad_norm": 1.2764553522178392, + "learning_rate": 5.128557045253056e-05, + "loss": 1.8489, + "step": 13620 + }, + { + "epoch": 514.7169811320755, + "grad_norm": 1.0847794881407822, + "learning_rate": 5.121385965852773e-05, + "loss": 1.8433, + "step": 13640 + }, + { + "epoch": 515.4716981132076, + "grad_norm": 1.211639546404476, + "learning_rate": 5.114211516463883e-05, + "loss": 1.8592, + "step": 13660 + }, + { + "epoch": 516.2264150943396, + "grad_norm": 1.6499382505803508, + "learning_rate": 5.1070337248327704e-05, + "loss": 1.8491, + "step": 13680 + }, + { + "epoch": 516.9811320754717, + "grad_norm": 1.1415154218905448, + "learning_rate": 5.0998526187187506e-05, + "loss": 1.8263, + "step": 13700 + }, + { + "epoch": 517.7358490566038, + "grad_norm": 1.2931592721596668, + "learning_rate": 5.092668225893955e-05, + "loss": 1.8341, + "step": 13720 + }, + { + "epoch": 518.4905660377359, + "grad_norm": 1.1289936456910783, + "learning_rate": 5.0854805741432266e-05, + "loss": 1.8256, + "step": 13740 + }, + { + "epoch": 519.2452830188679, + "grad_norm": 1.1568681178648177, + "learning_rate": 5.078289691264009e-05, + "loss": 1.8268, + "step": 13760 + }, + { + "epoch": 520.0, + "grad_norm": 1.2075151796344337, + "learning_rate": 5.071095605066247e-05, + "loss": 1.8342, + "step": 13780 + }, + { + "epoch": 520.7547169811321, + "grad_norm": 1.41061431054736, + "learning_rate": 5.063898343372271e-05, + "loss": 1.8569, + "step": 13800 + }, + { + "epoch": 521.5094339622641, + "grad_norm": 1.7141184097601845, + "learning_rate": 5.0566979340166915e-05, + "loss": 1.8447, + "step": 13820 + }, + { + "epoch": 522.2641509433962, + "grad_norm": 1.1912730571129804, + "learning_rate": 5.0494944048462946e-05, + "loss": 1.8632, + "step": 13840 + }, + { + "epoch": 523.0188679245283, + "grad_norm": 1.2784159482259496, + "learning_rate": 5.042287783719931e-05, + "loss": 1.8293, + "step": 13860 + }, + { + "epoch": 523.7735849056604, + "grad_norm": 1.1444265949319492, + "learning_rate": 5.0350780985084076e-05, + "loss": 1.8423, + "step": 13880 + }, + { + "epoch": 524.5283018867924, + "grad_norm": 1.1366776283872817, + "learning_rate": 5.027865377094383e-05, + "loss": 1.8284, + "step": 13900 + }, + { + "epoch": 525.2830188679245, + "grad_norm": 1.2870871198292675, + "learning_rate": 5.020649647372258e-05, + "loss": 1.8313, + "step": 13920 + }, + { + "epoch": 526.0377358490566, + "grad_norm": 1.8138208437079086, + "learning_rate": 5.013430937248066e-05, + "loss": 1.8382, + "step": 13940 + }, + { + "epoch": 526.7924528301887, + "grad_norm": 1.319578877513452, + "learning_rate": 5.00620927463937e-05, + "loss": 1.8343, + "step": 13960 + }, + { + "epoch": 527.5471698113207, + "grad_norm": 1.2254503656584737, + "learning_rate": 4.998984687475148e-05, + "loss": 1.8439, + "step": 13980 + }, + { + "epoch": 528.3018867924528, + "grad_norm": 1.0900572753736815, + "learning_rate": 4.9917572036956896e-05, + "loss": 1.8339, + "step": 14000 + }, + { + "epoch": 529.0566037735849, + "grad_norm": 1.3672093328811397, + "learning_rate": 4.984526851252489e-05, + "loss": 1.8269, + "step": 14020 + }, + { + "epoch": 529.811320754717, + "grad_norm": 1.0474703180578433, + "learning_rate": 4.97729365810813e-05, + "loss": 1.8278, + "step": 14040 + }, + { + "epoch": 530.566037735849, + "grad_norm": 1.302303693187968, + "learning_rate": 4.9700576522361875e-05, + "loss": 1.8406, + "step": 14060 + }, + { + "epoch": 531.3207547169811, + "grad_norm": 1.3351319231004828, + "learning_rate": 4.96281886162111e-05, + "loss": 1.805, + "step": 14080 + }, + { + "epoch": 532.0754716981132, + "grad_norm": 1.7423062973900807, + "learning_rate": 4.955577314258118e-05, + "loss": 1.8021, + "step": 14100 + }, + { + "epoch": 532.8301886792453, + "grad_norm": 1.0851524592672839, + "learning_rate": 4.9483330381530944e-05, + "loss": 1.8376, + "step": 14120 + }, + { + "epoch": 533.5849056603773, + "grad_norm": 1.110982412101906, + "learning_rate": 4.941086061322473e-05, + "loss": 1.8468, + "step": 14140 + }, + { + "epoch": 534.3396226415094, + "grad_norm": 1.1481042439437046, + "learning_rate": 4.933836411793133e-05, + "loss": 1.8131, + "step": 14160 + }, + { + "epoch": 535.0943396226415, + "grad_norm": 1.1504371756112235, + "learning_rate": 4.926584117602288e-05, + "loss": 1.8081, + "step": 14180 + }, + { + "epoch": 535.8490566037735, + "grad_norm": 1.1403864383961178, + "learning_rate": 4.919329206797387e-05, + "loss": 1.823, + "step": 14200 + }, + { + "epoch": 536.6037735849056, + "grad_norm": 1.2962348904995422, + "learning_rate": 4.912071707435988e-05, + "loss": 1.8187, + "step": 14220 + }, + { + "epoch": 537.3584905660377, + "grad_norm": 1.1885752096952027, + "learning_rate": 4.904811647585668e-05, + "loss": 1.8256, + "step": 14240 + }, + { + "epoch": 538.1132075471698, + "grad_norm": 1.064497747677543, + "learning_rate": 4.897549055323902e-05, + "loss": 1.8, + "step": 14260 + }, + { + "epoch": 538.8679245283018, + "grad_norm": 1.4469124816185257, + "learning_rate": 4.8902839587379614e-05, + "loss": 1.8365, + "step": 14280 + }, + { + "epoch": 539.622641509434, + "grad_norm": 1.0326597719869466, + "learning_rate": 4.8830163859248014e-05, + "loss": 1.812, + "step": 14300 + }, + { + "epoch": 540.377358490566, + "grad_norm": 1.261127094091647, + "learning_rate": 4.875746364990955e-05, + "loss": 1.7936, + "step": 14320 + }, + { + "epoch": 541.1320754716982, + "grad_norm": 1.6850662159573848, + "learning_rate": 4.8684739240524185e-05, + "loss": 1.8039, + "step": 14340 + }, + { + "epoch": 541.8867924528302, + "grad_norm": 1.1719859164333604, + "learning_rate": 4.861199091234556e-05, + "loss": 1.7995, + "step": 14360 + }, + { + "epoch": 542.6415094339623, + "grad_norm": 1.1168812884827573, + "learning_rate": 4.853921894671973e-05, + "loss": 1.804, + "step": 14380 + }, + { + "epoch": 543.3962264150944, + "grad_norm": 1.5041434010962127, + "learning_rate": 4.846642362508422e-05, + "loss": 1.8042, + "step": 14400 + }, + { + "epoch": 544.1509433962265, + "grad_norm": 1.2922119772360392, + "learning_rate": 4.8393605228966854e-05, + "loss": 1.8176, + "step": 14420 + }, + { + "epoch": 544.9056603773585, + "grad_norm": 1.316092813395267, + "learning_rate": 4.832076403998472e-05, + "loss": 1.8324, + "step": 14440 + }, + { + "epoch": 545.6603773584906, + "grad_norm": 1.148925533679318, + "learning_rate": 4.8247900339843045e-05, + "loss": 1.8249, + "step": 14460 + }, + { + "epoch": 546.4150943396227, + "grad_norm": 1.3351586320323485, + "learning_rate": 4.817501441033409e-05, + "loss": 1.8023, + "step": 14480 + }, + { + "epoch": 547.1698113207547, + "grad_norm": 1.4554583529380825, + "learning_rate": 4.810210653333613e-05, + "loss": 1.782, + "step": 14500 + }, + { + "epoch": 547.9245283018868, + "grad_norm": 1.2418737812043639, + "learning_rate": 4.802917699081225e-05, + "loss": 1.7981, + "step": 14520 + }, + { + "epoch": 548.6792452830189, + "grad_norm": 1.1837142285238051, + "learning_rate": 4.795622606480942e-05, + "loss": 1.7982, + "step": 14540 + }, + { + "epoch": 549.433962264151, + "grad_norm": 1.2674115880751322, + "learning_rate": 4.788325403745724e-05, + "loss": 1.8055, + "step": 14560 + }, + { + "epoch": 550.188679245283, + "grad_norm": 2.02523705877845, + "learning_rate": 4.7810261190966944e-05, + "loss": 1.7905, + "step": 14580 + }, + { + "epoch": 550.9433962264151, + "grad_norm": 1.3660297273644537, + "learning_rate": 4.773724780763023e-05, + "loss": 1.8267, + "step": 14600 + }, + { + "epoch": 551.6981132075472, + "grad_norm": 1.1728070148137189, + "learning_rate": 4.766421416981833e-05, + "loss": 1.7862, + "step": 14620 + }, + { + "epoch": 552.4528301886793, + "grad_norm": 1.148521109395332, + "learning_rate": 4.759116055998069e-05, + "loss": 1.7842, + "step": 14640 + }, + { + "epoch": 553.2075471698113, + "grad_norm": 1.2578627421373816, + "learning_rate": 4.7518087260644065e-05, + "loss": 1.8105, + "step": 14660 + }, + { + "epoch": 553.9622641509434, + "grad_norm": 1.2736902452272465, + "learning_rate": 4.744499455441133e-05, + "loss": 1.7931, + "step": 14680 + }, + { + "epoch": 554.7169811320755, + "grad_norm": 1.0794014181765008, + "learning_rate": 4.737188272396044e-05, + "loss": 1.8043, + "step": 14700 + }, + { + "epoch": 555.4716981132076, + "grad_norm": 1.3894129104855453, + "learning_rate": 4.729875205204327e-05, + "loss": 1.8301, + "step": 14720 + }, + { + "epoch": 556.2264150943396, + "grad_norm": 1.147340224849857, + "learning_rate": 4.722560282148459e-05, + "loss": 1.8178, + "step": 14740 + }, + { + "epoch": 556.9811320754717, + "grad_norm": 1.3948879461559769, + "learning_rate": 4.7152435315180975e-05, + "loss": 1.7648, + "step": 14760 + }, + { + "epoch": 557.7358490566038, + "grad_norm": 1.3694680696221502, + "learning_rate": 4.7079249816099584e-05, + "loss": 1.8104, + "step": 14780 + }, + { + "epoch": 558.4905660377359, + "grad_norm": 1.4147919843537753, + "learning_rate": 4.700604660727726e-05, + "loss": 1.7721, + "step": 14800 + }, + { + "epoch": 559.2452830188679, + "grad_norm": 1.2297666792262925, + "learning_rate": 4.6932825971819285e-05, + "loss": 1.7923, + "step": 14820 + }, + { + "epoch": 560.0, + "grad_norm": 1.1416590332464547, + "learning_rate": 4.6859588192898365e-05, + "loss": 1.7709, + "step": 14840 + }, + { + "epoch": 560.7547169811321, + "grad_norm": 1.2633394473980435, + "learning_rate": 4.6786333553753454e-05, + "loss": 1.8265, + "step": 14860 + }, + { + "epoch": 561.5094339622641, + "grad_norm": 1.73410063706433, + "learning_rate": 4.671306233768877e-05, + "loss": 1.7935, + "step": 14880 + }, + { + "epoch": 562.2641509433962, + "grad_norm": 1.909552398589606, + "learning_rate": 4.663977482807263e-05, + "loss": 1.7928, + "step": 14900 + }, + { + "epoch": 563.0188679245283, + "grad_norm": 2.3340344731557505, + "learning_rate": 4.656647130833632e-05, + "loss": 1.8083, + "step": 14920 + }, + { + "epoch": 563.7735849056604, + "grad_norm": 1.5856106264075287, + "learning_rate": 4.64931520619731e-05, + "loss": 1.8345, + "step": 14940 + }, + { + "epoch": 564.5283018867924, + "grad_norm": 1.4125116448786768, + "learning_rate": 4.6419817372537015e-05, + "loss": 1.7764, + "step": 14960 + }, + { + "epoch": 565.2830188679245, + "grad_norm": 1.1720058705654566, + "learning_rate": 4.634646752364185e-05, + "loss": 1.7917, + "step": 14980 + }, + { + "epoch": 566.0377358490566, + "grad_norm": 1.1615325214837866, + "learning_rate": 4.627310279896001e-05, + "loss": 1.7916, + "step": 15000 + }, + { + "epoch": 566.7924528301887, + "grad_norm": 1.3392839325444756, + "learning_rate": 4.619972348222143e-05, + "loss": 1.7803, + "step": 15020 + }, + { + "epoch": 567.5471698113207, + "grad_norm": 1.3665016393198224, + "learning_rate": 4.6126329857212486e-05, + "loss": 1.7822, + "step": 15040 + }, + { + "epoch": 568.3018867924528, + "grad_norm": 1.6085820618369988, + "learning_rate": 4.605292220777489e-05, + "loss": 1.7889, + "step": 15060 + }, + { + "epoch": 569.0566037735849, + "grad_norm": 1.4230286645037085, + "learning_rate": 4.5979500817804594e-05, + "loss": 1.805, + "step": 15080 + }, + { + "epoch": 569.811320754717, + "grad_norm": 1.1596468566263143, + "learning_rate": 4.590606597125065e-05, + "loss": 1.7892, + "step": 15100 + }, + { + "epoch": 570.566037735849, + "grad_norm": 1.4539519726534167, + "learning_rate": 4.583261795211423e-05, + "loss": 1.7831, + "step": 15120 + }, + { + "epoch": 571.3207547169811, + "grad_norm": 1.2521318013943803, + "learning_rate": 4.575915704444736e-05, + "loss": 1.8018, + "step": 15140 + }, + { + "epoch": 572.0754716981132, + "grad_norm": 1.1819685518944387, + "learning_rate": 4.5685683532352e-05, + "loss": 1.7866, + "step": 15160 + }, + { + "epoch": 572.8301886792453, + "grad_norm": 1.1876204585927221, + "learning_rate": 4.5612197699978766e-05, + "loss": 1.7833, + "step": 15180 + }, + { + "epoch": 573.5849056603773, + "grad_norm": 1.1556357684763976, + "learning_rate": 4.5538699831526006e-05, + "loss": 1.8024, + "step": 15200 + }, + { + "epoch": 574.3396226415094, + "grad_norm": 1.3326980140111142, + "learning_rate": 4.5465190211238544e-05, + "loss": 1.7829, + "step": 15220 + }, + { + "epoch": 575.0943396226415, + "grad_norm": 1.308268798679134, + "learning_rate": 4.539166912340671e-05, + "loss": 1.7766, + "step": 15240 + }, + { + "epoch": 575.8490566037735, + "grad_norm": 1.1564791044184874, + "learning_rate": 4.531813685236516e-05, + "loss": 1.8021, + "step": 15260 + }, + { + "epoch": 576.6037735849056, + "grad_norm": 1.4187942127459952, + "learning_rate": 4.524459368249179e-05, + "loss": 1.7523, + "step": 15280 + }, + { + "epoch": 577.3584905660377, + "grad_norm": 1.1994628151621998, + "learning_rate": 4.5171039898206644e-05, + "loss": 1.7845, + "step": 15300 + }, + { + "epoch": 578.1132075471698, + "grad_norm": 1.172216325696233, + "learning_rate": 4.509747578397086e-05, + "loss": 1.7591, + "step": 15320 + }, + { + "epoch": 578.8679245283018, + "grad_norm": 1.1667988074546227, + "learning_rate": 4.5023901624285465e-05, + "loss": 1.7955, + "step": 15340 + }, + { + "epoch": 579.622641509434, + "grad_norm": 1.31427332849911, + "learning_rate": 4.495031770369038e-05, + "loss": 1.7605, + "step": 15360 + }, + { + "epoch": 580.377358490566, + "grad_norm": 1.2050607756000014, + "learning_rate": 4.487672430676325e-05, + "loss": 1.7673, + "step": 15380 + }, + { + "epoch": 581.1320754716982, + "grad_norm": 1.2087614153318165, + "learning_rate": 4.480312171811838e-05, + "loss": 1.7876, + "step": 15400 + }, + { + "epoch": 581.8867924528302, + "grad_norm": 1.3000620466205515, + "learning_rate": 4.472951022240562e-05, + "loss": 1.7611, + "step": 15420 + }, + { + "epoch": 582.6415094339623, + "grad_norm": 1.7966112906689369, + "learning_rate": 4.4655890104309254e-05, + "loss": 1.7702, + "step": 15440 + }, + { + "epoch": 583.3962264150944, + "grad_norm": 1.198242649687164, + "learning_rate": 4.458226164854697e-05, + "loss": 1.7942, + "step": 15460 + }, + { + "epoch": 584.1509433962265, + "grad_norm": 1.6859720478683236, + "learning_rate": 4.450862513986861e-05, + "loss": 1.758, + "step": 15480 + }, + { + "epoch": 584.9056603773585, + "grad_norm": 1.1440767805984655, + "learning_rate": 4.443498086305525e-05, + "loss": 1.7647, + "step": 15500 + }, + { + "epoch": 585.6603773584906, + "grad_norm": 1.2426581026511485, + "learning_rate": 4.436132910291792e-05, + "loss": 1.7468, + "step": 15520 + }, + { + "epoch": 586.4150943396227, + "grad_norm": 1.5652742956982049, + "learning_rate": 4.4287670144296675e-05, + "loss": 1.7733, + "step": 15540 + }, + { + "epoch": 587.1698113207547, + "grad_norm": 1.150105537080449, + "learning_rate": 4.421400427205934e-05, + "loss": 1.7878, + "step": 15560 + }, + { + "epoch": 587.9245283018868, + "grad_norm": 1.297179193085273, + "learning_rate": 4.4140331771100516e-05, + "loss": 1.7558, + "step": 15580 + }, + { + "epoch": 588.6792452830189, + "grad_norm": 1.4354989531166704, + "learning_rate": 4.406665292634046e-05, + "loss": 1.7652, + "step": 15600 + }, + { + "epoch": 589.433962264151, + "grad_norm": 1.3544454831633896, + "learning_rate": 4.399296802272388e-05, + "loss": 1.7695, + "step": 15620 + }, + { + "epoch": 590.188679245283, + "grad_norm": 1.356710977975809, + "learning_rate": 4.3919277345219033e-05, + "loss": 1.7317, + "step": 15640 + }, + { + "epoch": 590.9433962264151, + "grad_norm": 1.3504644293745585, + "learning_rate": 4.3845581178816394e-05, + "loss": 1.7784, + "step": 15660 + }, + { + "epoch": 591.6981132075472, + "grad_norm": 1.2934057468915228, + "learning_rate": 4.377187980852775e-05, + "loss": 1.7655, + "step": 15680 + }, + { + "epoch": 592.4528301886793, + "grad_norm": 2.7284471186236976, + "learning_rate": 4.369817351938495e-05, + "loss": 1.7617, + "step": 15700 + }, + { + "epoch": 593.2075471698113, + "grad_norm": 1.4587946653999224, + "learning_rate": 4.3624462596438926e-05, + "loss": 1.7675, + "step": 15720 + }, + { + "epoch": 593.9622641509434, + "grad_norm": 1.824543804524391, + "learning_rate": 4.3550747324758475e-05, + "loss": 1.7835, + "step": 15740 + }, + { + "epoch": 594.7169811320755, + "grad_norm": 1.1558960324762337, + "learning_rate": 4.3477027989429267e-05, + "loss": 1.7848, + "step": 15760 + }, + { + "epoch": 595.4716981132076, + "grad_norm": 1.3618125278208344, + "learning_rate": 4.340330487555261e-05, + "loss": 1.7717, + "step": 15780 + }, + { + "epoch": 596.2264150943396, + "grad_norm": 1.8336334887122832, + "learning_rate": 4.332957826824451e-05, + "loss": 1.7753, + "step": 15800 + }, + { + "epoch": 596.9811320754717, + "grad_norm": 1.6035556059617442, + "learning_rate": 4.325584845263445e-05, + "loss": 1.7507, + "step": 15820 + }, + { + "epoch": 597.7358490566038, + "grad_norm": 1.1021262642715972, + "learning_rate": 4.318211571386428e-05, + "loss": 1.7683, + "step": 15840 + }, + { + "epoch": 598.4905660377359, + "grad_norm": 1.3112589542500708, + "learning_rate": 4.310838033708722e-05, + "loss": 1.753, + "step": 15860 + }, + { + "epoch": 599.2452830188679, + "grad_norm": 1.5257318148219035, + "learning_rate": 4.303464260746667e-05, + "loss": 1.7446, + "step": 15880 + }, + { + "epoch": 600.0, + "grad_norm": 1.2648959489882874, + "learning_rate": 4.296090281017511e-05, + "loss": 1.7513, + "step": 15900 + }, + { + "epoch": 600.7547169811321, + "grad_norm": 1.4414622706601208, + "learning_rate": 4.2887161230393034e-05, + "loss": 1.7421, + "step": 15920 + }, + { + "epoch": 601.5094339622641, + "grad_norm": 1.60216933395765, + "learning_rate": 4.281341815330784e-05, + "loss": 1.7335, + "step": 15940 + }, + { + "epoch": 602.2641509433962, + "grad_norm": 1.438261210769706, + "learning_rate": 4.273967386411267e-05, + "loss": 1.7676, + "step": 15960 + }, + { + "epoch": 603.0188679245283, + "grad_norm": 1.61121062509495, + "learning_rate": 4.26659286480054e-05, + "loss": 1.7767, + "step": 15980 + }, + { + "epoch": 603.7735849056604, + "grad_norm": 1.3368219249794455, + "learning_rate": 4.2592182790187495e-05, + "loss": 1.7615, + "step": 16000 + }, + { + "epoch": 604.5283018867924, + "grad_norm": 1.1964899050496502, + "learning_rate": 4.251843657586285e-05, + "loss": 1.7909, + "step": 16020 + }, + { + "epoch": 605.2830188679245, + "grad_norm": 1.1409598499641234, + "learning_rate": 4.244469029023682e-05, + "loss": 1.7806, + "step": 16040 + }, + { + "epoch": 606.0377358490566, + "grad_norm": 1.0775618341358217, + "learning_rate": 4.237094421851494e-05, + "loss": 1.7696, + "step": 16060 + }, + { + "epoch": 606.7924528301887, + "grad_norm": 1.201425866436519, + "learning_rate": 4.2297198645901986e-05, + "loss": 1.7424, + "step": 16080 + }, + { + "epoch": 607.5471698113207, + "grad_norm": 1.29163631265219, + "learning_rate": 4.222345385760079e-05, + "loss": 1.749, + "step": 16100 + }, + { + "epoch": 608.3018867924528, + "grad_norm": 1.4158324908813191, + "learning_rate": 4.214971013881114e-05, + "loss": 1.7594, + "step": 16120 + }, + { + "epoch": 609.0566037735849, + "grad_norm": 1.2390733211978042, + "learning_rate": 4.2075967774728675e-05, + "loss": 1.7707, + "step": 16140 + }, + { + "epoch": 609.811320754717, + "grad_norm": 1.0960663109570459, + "learning_rate": 4.200222705054385e-05, + "loss": 1.7633, + "step": 16160 + }, + { + "epoch": 610.566037735849, + "grad_norm": 1.167381366879647, + "learning_rate": 4.1928488251440704e-05, + "loss": 1.7735, + "step": 16180 + }, + { + "epoch": 611.3207547169811, + "grad_norm": 1.468960912277373, + "learning_rate": 4.185475166259588e-05, + "loss": 1.7222, + "step": 16200 + }, + { + "epoch": 612.0754716981132, + "grad_norm": 1.2572603668608606, + "learning_rate": 4.178101756917746e-05, + "loss": 1.7477, + "step": 16220 + }, + { + "epoch": 612.8301886792453, + "grad_norm": 1.2661070355556836, + "learning_rate": 4.170728625634387e-05, + "loss": 1.7437, + "step": 16240 + }, + { + "epoch": 613.5849056603773, + "grad_norm": 1.6793862205908143, + "learning_rate": 4.16335580092428e-05, + "loss": 1.7518, + "step": 16260 + }, + { + "epoch": 614.3396226415094, + "grad_norm": 1.3347192318840417, + "learning_rate": 4.155983311301006e-05, + "loss": 1.7275, + "step": 16280 + }, + { + "epoch": 615.0943396226415, + "grad_norm": 1.146186653201129, + "learning_rate": 4.148611185276852e-05, + "loss": 1.7505, + "step": 16300 + }, + { + "epoch": 615.8490566037735, + "grad_norm": 1.2853858560898548, + "learning_rate": 4.1412394513626976e-05, + "loss": 1.7345, + "step": 16320 + }, + { + "epoch": 616.6037735849056, + "grad_norm": 1.3689931241044506, + "learning_rate": 4.1338681380679055e-05, + "loss": 1.7372, + "step": 16340 + }, + { + "epoch": 617.3584905660377, + "grad_norm": 1.2520152979412003, + "learning_rate": 4.126497273900214e-05, + "loss": 1.7749, + "step": 16360 + }, + { + "epoch": 618.1132075471698, + "grad_norm": 4.4664529214017685, + "learning_rate": 4.119126887365623e-05, + "loss": 1.7291, + "step": 16380 + }, + { + "epoch": 618.8679245283018, + "grad_norm": 1.4871942552231863, + "learning_rate": 4.111757006968283e-05, + "loss": 1.729, + "step": 16400 + }, + { + "epoch": 619.622641509434, + "grad_norm": 1.7327021169643824, + "learning_rate": 4.104387661210391e-05, + "loss": 1.7906, + "step": 16420 + }, + { + "epoch": 620.377358490566, + "grad_norm": 1.3011566548368803, + "learning_rate": 4.0970188785920764e-05, + "loss": 1.7498, + "step": 16440 + }, + { + "epoch": 621.1320754716982, + "grad_norm": 1.503913785893422, + "learning_rate": 4.0896506876112856e-05, + "loss": 1.7333, + "step": 16460 + }, + { + "epoch": 621.8867924528302, + "grad_norm": 1.2396874135815048, + "learning_rate": 4.082283116763683e-05, + "loss": 1.7474, + "step": 16480 + }, + { + "epoch": 622.6415094339623, + "grad_norm": 1.3186465498196096, + "learning_rate": 4.07491619454253e-05, + "loss": 1.7641, + "step": 16500 + }, + { + "epoch": 623.3962264150944, + "grad_norm": 1.2224446651472063, + "learning_rate": 4.067549949438583e-05, + "loss": 1.7596, + "step": 16520 + }, + { + "epoch": 624.1509433962265, + "grad_norm": 1.299102298479128, + "learning_rate": 4.060184409939977e-05, + "loss": 1.7399, + "step": 16540 + }, + { + "epoch": 624.9056603773585, + "grad_norm": 1.2080109960062584, + "learning_rate": 4.052819604532121e-05, + "loss": 1.7545, + "step": 16560 + }, + { + "epoch": 625.6603773584906, + "grad_norm": 1.1330156099339754, + "learning_rate": 4.04545556169758e-05, + "loss": 1.7514, + "step": 16580 + }, + { + "epoch": 626.4150943396227, + "grad_norm": 1.877556318395021, + "learning_rate": 4.038092309915976e-05, + "loss": 1.7495, + "step": 16600 + }, + { + "epoch": 627.1698113207547, + "grad_norm": 1.3430468095941768, + "learning_rate": 4.0307298776638696e-05, + "loss": 1.7387, + "step": 16620 + }, + { + "epoch": 627.9245283018868, + "grad_norm": 1.3456306138048115, + "learning_rate": 4.023368293414651e-05, + "loss": 1.7586, + "step": 16640 + }, + { + "epoch": 628.6792452830189, + "grad_norm": 1.2925035537026515, + "learning_rate": 4.016007585638428e-05, + "loss": 1.7222, + "step": 16660 + }, + { + "epoch": 629.433962264151, + "grad_norm": 1.5060755357936446, + "learning_rate": 4.0086477828019247e-05, + "loss": 1.734, + "step": 16680 + }, + { + "epoch": 630.188679245283, + "grad_norm": 1.2358138916528858, + "learning_rate": 4.001288913368361e-05, + "loss": 1.7585, + "step": 16700 + }, + { + "epoch": 630.9433962264151, + "grad_norm": 1.0536359575721053, + "learning_rate": 3.9939310057973496e-05, + "loss": 1.699, + "step": 16720 + }, + { + "epoch": 631.6981132075472, + "grad_norm": 1.3396521000709494, + "learning_rate": 3.986574088544782e-05, + "loss": 1.745, + "step": 16740 + }, + { + "epoch": 632.4528301886793, + "grad_norm": 1.1966711285530698, + "learning_rate": 3.979218190062718e-05, + "loss": 1.7049, + "step": 16760 + }, + { + "epoch": 633.2075471698113, + "grad_norm": 1.3511753835544016, + "learning_rate": 3.971863338799283e-05, + "loss": 1.7319, + "step": 16780 + }, + { + "epoch": 633.9622641509434, + "grad_norm": 1.2759632464750865, + "learning_rate": 3.964509563198547e-05, + "loss": 1.7431, + "step": 16800 + }, + { + "epoch": 634.7169811320755, + "grad_norm": 1.5118791481962728, + "learning_rate": 3.957156891700422e-05, + "loss": 1.7549, + "step": 16820 + }, + { + "epoch": 635.4716981132076, + "grad_norm": 1.358305138616916, + "learning_rate": 3.949805352740549e-05, + "loss": 1.7146, + "step": 16840 + }, + { + "epoch": 636.2264150943396, + "grad_norm": 1.2468444876323985, + "learning_rate": 3.9424549747501916e-05, + "loss": 1.6839, + "step": 16860 + }, + { + "epoch": 636.9811320754717, + "grad_norm": 1.6815476229074826, + "learning_rate": 3.9351057861561194e-05, + "loss": 1.7381, + "step": 16880 + }, + { + "epoch": 637.7358490566038, + "grad_norm": 1.3183944234813532, + "learning_rate": 3.927757815380507e-05, + "loss": 1.737, + "step": 16900 + }, + { + "epoch": 638.4905660377359, + "grad_norm": 1.2664716657296504, + "learning_rate": 3.920411090840813e-05, + "loss": 1.7552, + "step": 16920 + }, + { + "epoch": 639.2452830188679, + "grad_norm": 1.4316626122660758, + "learning_rate": 3.9130656409496826e-05, + "loss": 1.7035, + "step": 16940 + }, + { + "epoch": 640.0, + "grad_norm": 1.624465349724497, + "learning_rate": 3.90572149411483e-05, + "loss": 1.7349, + "step": 16960 + }, + { + "epoch": 640.7547169811321, + "grad_norm": 1.3525138710560463, + "learning_rate": 3.8983786787389264e-05, + "loss": 1.7196, + "step": 16980 + }, + { + "epoch": 641.5094339622641, + "grad_norm": 1.1968289253916946, + "learning_rate": 3.891037223219497e-05, + "loss": 1.7288, + "step": 17000 + }, + { + "epoch": 642.2641509433962, + "grad_norm": 1.3150467173282183, + "learning_rate": 3.883697155948808e-05, + "loss": 1.7478, + "step": 17020 + }, + { + "epoch": 643.0188679245283, + "grad_norm": 1.3494514082635618, + "learning_rate": 3.876358505313754e-05, + "loss": 1.7208, + "step": 17040 + }, + { + "epoch": 643.7735849056604, + "grad_norm": 1.5328078930199742, + "learning_rate": 3.869021299695754e-05, + "loss": 1.747, + "step": 17060 + }, + { + "epoch": 644.5283018867924, + "grad_norm": 1.2945392233470665, + "learning_rate": 3.8616855674706354e-05, + "loss": 1.7225, + "step": 17080 + }, + { + "epoch": 645.2830188679245, + "grad_norm": 1.2582163265054458, + "learning_rate": 3.854351337008532e-05, + "loss": 1.7428, + "step": 17100 + }, + { + "epoch": 646.0377358490566, + "grad_norm": 1.1370724946903576, + "learning_rate": 3.847018636673765e-05, + "loss": 1.704, + "step": 17120 + }, + { + "epoch": 646.7924528301887, + "grad_norm": 1.2161474947016768, + "learning_rate": 3.839687494824741e-05, + "loss": 1.7129, + "step": 17140 + }, + { + "epoch": 647.5471698113207, + "grad_norm": 1.1033819366614397, + "learning_rate": 3.832357939813837e-05, + "loss": 1.695, + "step": 17160 + }, + { + "epoch": 648.3018867924528, + "grad_norm": 1.2993665260901381, + "learning_rate": 3.825029999987296e-05, + "loss": 1.7022, + "step": 17180 + }, + { + "epoch": 649.0566037735849, + "grad_norm": 1.2577276608492982, + "learning_rate": 3.8177037036851115e-05, + "loss": 1.7029, + "step": 17200 + }, + { + "epoch": 649.811320754717, + "grad_norm": 1.2459092691964395, + "learning_rate": 3.810379079240922e-05, + "loss": 1.7139, + "step": 17220 + }, + { + "epoch": 650.566037735849, + "grad_norm": 1.3152629296897698, + "learning_rate": 3.8030561549819015e-05, + "loss": 1.7088, + "step": 17240 + }, + { + "epoch": 651.3207547169811, + "grad_norm": 1.2367123181404969, + "learning_rate": 3.795734959228645e-05, + "loss": 1.6936, + "step": 17260 + }, + { + "epoch": 652.0754716981132, + "grad_norm": 1.1338754969305556, + "learning_rate": 3.7884155202950696e-05, + "loss": 1.7151, + "step": 17280 + }, + { + "epoch": 652.8301886792453, + "grad_norm": 1.2942728726977033, + "learning_rate": 3.781097866488291e-05, + "loss": 1.712, + "step": 17300 + }, + { + "epoch": 653.5849056603773, + "grad_norm": 1.39400718208209, + "learning_rate": 3.773782026108526e-05, + "loss": 1.7181, + "step": 17320 + }, + { + "epoch": 654.3396226415094, + "grad_norm": 1.3198924641523746, + "learning_rate": 3.766468027448973e-05, + "loss": 1.6913, + "step": 17340 + }, + { + "epoch": 655.0943396226415, + "grad_norm": 1.1991934682117795, + "learning_rate": 3.759155898795714e-05, + "loss": 1.7093, + "step": 17360 + }, + { + "epoch": 655.8490566037735, + "grad_norm": 1.239259370659102, + "learning_rate": 3.751845668427593e-05, + "loss": 1.7009, + "step": 17380 + }, + { + "epoch": 656.6037735849056, + "grad_norm": 1.2833857218204128, + "learning_rate": 3.7445373646161176e-05, + "loss": 1.7005, + "step": 17400 + }, + { + "epoch": 657.3584905660377, + "grad_norm": 1.454767822481044, + "learning_rate": 3.737231015625341e-05, + "loss": 1.6906, + "step": 17420 + }, + { + "epoch": 658.1132075471698, + "grad_norm": 1.4542141511941185, + "learning_rate": 3.729926649711759e-05, + "loss": 1.7058, + "step": 17440 + }, + { + "epoch": 658.8679245283018, + "grad_norm": 1.3091035418860133, + "learning_rate": 3.722624295124197e-05, + "loss": 1.6885, + "step": 17460 + }, + { + "epoch": 659.622641509434, + "grad_norm": 1.2943161972236163, + "learning_rate": 3.7153239801037014e-05, + "loss": 1.714, + "step": 17480 + }, + { + "epoch": 660.377358490566, + "grad_norm": 1.3377320776810098, + "learning_rate": 3.708025732883431e-05, + "loss": 1.684, + "step": 17500 + }, + { + "epoch": 661.1320754716982, + "grad_norm": 1.2629223675934866, + "learning_rate": 3.700729581688547e-05, + "loss": 1.699, + "step": 17520 + }, + { + "epoch": 661.8867924528302, + "grad_norm": 1.2451499003174673, + "learning_rate": 3.693435554736107e-05, + "loss": 1.6818, + "step": 17540 + }, + { + "epoch": 662.6415094339623, + "grad_norm": 1.5331175213775703, + "learning_rate": 3.6861436802349504e-05, + "loss": 1.7177, + "step": 17560 + }, + { + "epoch": 663.3962264150944, + "grad_norm": 1.4360430543768725, + "learning_rate": 3.6788539863855925e-05, + "loss": 1.7119, + "step": 17580 + }, + { + "epoch": 664.1509433962265, + "grad_norm": 1.3816247903457854, + "learning_rate": 3.671566501380116e-05, + "loss": 1.7148, + "step": 17600 + }, + { + "epoch": 664.9056603773585, + "grad_norm": 1.2778334686031196, + "learning_rate": 3.6642812534020636e-05, + "loss": 1.6935, + "step": 17620 + }, + { + "epoch": 665.6603773584906, + "grad_norm": 1.552301737650962, + "learning_rate": 3.656998270626322e-05, + "loss": 1.6917, + "step": 17640 + }, + { + "epoch": 666.4150943396227, + "grad_norm": 1.1626344688263202, + "learning_rate": 3.649717581219022e-05, + "loss": 1.6869, + "step": 17660 + }, + { + "epoch": 667.1698113207547, + "grad_norm": 1.2478591651994395, + "learning_rate": 3.642439213337418e-05, + "loss": 1.6964, + "step": 17680 + }, + { + "epoch": 667.9245283018868, + "grad_norm": 1.1665269494870496, + "learning_rate": 3.635163195129796e-05, + "loss": 1.706, + "step": 17700 + }, + { + "epoch": 668.6792452830189, + "grad_norm": 1.2417440240279074, + "learning_rate": 3.627889554735346e-05, + "loss": 1.6607, + "step": 17720 + }, + { + "epoch": 669.433962264151, + "grad_norm": 1.4243990985436537, + "learning_rate": 3.620618320284067e-05, + "loss": 1.6874, + "step": 17740 + }, + { + "epoch": 670.188679245283, + "grad_norm": 1.4914544739718891, + "learning_rate": 3.613349519896652e-05, + "loss": 1.6908, + "step": 17760 + }, + { + "epoch": 670.9433962264151, + "grad_norm": 1.3300772606283862, + "learning_rate": 3.606083181684381e-05, + "loss": 1.688, + "step": 17780 + }, + { + "epoch": 671.6981132075472, + "grad_norm": 1.2461357748180606, + "learning_rate": 3.5988193337490116e-05, + "loss": 1.6547, + "step": 17800 + }, + { + "epoch": 672.4528301886793, + "grad_norm": 1.370151145210619, + "learning_rate": 3.5915580041826694e-05, + "loss": 1.7193, + "step": 17820 + }, + { + "epoch": 673.2075471698113, + "grad_norm": 1.2763659906881193, + "learning_rate": 3.5842992210677416e-05, + "loss": 1.6808, + "step": 17840 + }, + { + "epoch": 673.9622641509434, + "grad_norm": 1.2944519984940064, + "learning_rate": 3.577043012476768e-05, + "loss": 1.7, + "step": 17860 + }, + { + "epoch": 674.7169811320755, + "grad_norm": 1.3186599824633134, + "learning_rate": 3.56978940647233e-05, + "loss": 1.6954, + "step": 17880 + }, + { + "epoch": 675.4716981132076, + "grad_norm": 1.252700498164797, + "learning_rate": 3.5625384311069444e-05, + "loss": 1.6686, + "step": 17900 + }, + { + "epoch": 676.2264150943396, + "grad_norm": 1.5231032873107, + "learning_rate": 3.555290114422955e-05, + "loss": 1.6747, + "step": 17920 + }, + { + "epoch": 676.9811320754717, + "grad_norm": 1.2910659178037445, + "learning_rate": 3.548044484452421e-05, + "loss": 1.6778, + "step": 17940 + }, + { + "epoch": 677.7358490566038, + "grad_norm": 1.398570166804289, + "learning_rate": 3.540801569217016e-05, + "loss": 1.6949, + "step": 17960 + }, + { + "epoch": 678.4905660377359, + "grad_norm": 1.4283155036503146, + "learning_rate": 3.53356139672791e-05, + "loss": 1.682, + "step": 17980 + }, + { + "epoch": 679.2452830188679, + "grad_norm": 1.3275162110816598, + "learning_rate": 3.526323994985669e-05, + "loss": 1.695, + "step": 18000 + }, + { + "epoch": 680.0, + "grad_norm": 1.2754138886413842, + "learning_rate": 3.519089391980139e-05, + "loss": 1.6977, + "step": 18020 + }, + { + "epoch": 680.7547169811321, + "grad_norm": 1.3077633836764546, + "learning_rate": 3.511857615690347e-05, + "loss": 1.6811, + "step": 18040 + }, + { + "epoch": 681.5094339622641, + "grad_norm": 1.3473268942249876, + "learning_rate": 3.504628694084385e-05, + "loss": 1.6984, + "step": 18060 + }, + { + "epoch": 682.2641509433962, + "grad_norm": 1.3350261204503644, + "learning_rate": 3.497402655119306e-05, + "loss": 1.6567, + "step": 18080 + }, + { + "epoch": 683.0188679245283, + "grad_norm": 1.243885167646148, + "learning_rate": 3.490179526741014e-05, + "loss": 1.6837, + "step": 18100 + }, + { + "epoch": 683.7735849056604, + "grad_norm": 1.4293023473168278, + "learning_rate": 3.48295933688416e-05, + "loss": 1.7039, + "step": 18120 + }, + { + "epoch": 684.5283018867924, + "grad_norm": 1.3686594771374196, + "learning_rate": 3.4757421134720236e-05, + "loss": 1.7067, + "step": 18140 + }, + { + "epoch": 685.2830188679245, + "grad_norm": 1.6243192735337049, + "learning_rate": 3.46852788441642e-05, + "loss": 1.6661, + "step": 18160 + }, + { + "epoch": 686.0377358490566, + "grad_norm": 1.2075045336020302, + "learning_rate": 3.461316677617577e-05, + "loss": 1.6779, + "step": 18180 + }, + { + "epoch": 686.7924528301887, + "grad_norm": 1.348462905709941, + "learning_rate": 3.4541085209640396e-05, + "loss": 1.6962, + "step": 18200 + }, + { + "epoch": 687.5471698113207, + "grad_norm": 1.370184561468331, + "learning_rate": 3.446903442332552e-05, + "loss": 1.6819, + "step": 18220 + }, + { + "epoch": 688.3018867924528, + "grad_norm": 2.4058560541467537, + "learning_rate": 3.439701469587961e-05, + "loss": 1.6562, + "step": 18240 + }, + { + "epoch": 689.0566037735849, + "grad_norm": 1.2548392090130422, + "learning_rate": 3.4325026305830914e-05, + "loss": 1.662, + "step": 18260 + }, + { + "epoch": 689.811320754717, + "grad_norm": 1.2311253301629015, + "learning_rate": 3.4253069531586616e-05, + "loss": 1.6629, + "step": 18280 + }, + { + "epoch": 690.566037735849, + "grad_norm": 1.9966791662877068, + "learning_rate": 3.418114465143153e-05, + "loss": 1.6592, + "step": 18300 + }, + { + "epoch": 691.3207547169811, + "grad_norm": 1.2370362395857986, + "learning_rate": 3.410925194352715e-05, + "loss": 1.6806, + "step": 18320 + }, + { + "epoch": 692.0754716981132, + "grad_norm": 1.463146145452869, + "learning_rate": 3.4037391685910566e-05, + "loss": 1.6937, + "step": 18340 + }, + { + "epoch": 692.8301886792453, + "grad_norm": 1.2590469253316379, + "learning_rate": 3.396556415649336e-05, + "loss": 1.6746, + "step": 18360 + }, + { + "epoch": 693.5849056603773, + "grad_norm": 1.3472170619382864, + "learning_rate": 3.389376963306052e-05, + "loss": 1.681, + "step": 18380 + }, + { + "epoch": 694.3396226415094, + "grad_norm": 1.4907805923383493, + "learning_rate": 3.382200839326942e-05, + "loss": 1.6822, + "step": 18400 + }, + { + "epoch": 695.0943396226415, + "grad_norm": 1.3754366409172392, + "learning_rate": 3.375028071464869e-05, + "loss": 1.6819, + "step": 18420 + }, + { + "epoch": 695.8490566037735, + "grad_norm": 1.2854564980336112, + "learning_rate": 3.3678586874597176e-05, + "loss": 1.6712, + "step": 18440 + }, + { + "epoch": 696.6037735849056, + "grad_norm": 1.4614311570416143, + "learning_rate": 3.3606927150382865e-05, + "loss": 1.649, + "step": 18460 + }, + { + "epoch": 697.3584905660377, + "grad_norm": 1.3139946901519874, + "learning_rate": 3.353530181914178e-05, + "loss": 1.7062, + "step": 18480 + }, + { + "epoch": 698.1132075471698, + "grad_norm": 1.4895975475886944, + "learning_rate": 3.3463711157876966e-05, + "loss": 1.6841, + "step": 18500 + }, + { + "epoch": 698.8679245283018, + "grad_norm": 1.2111074764483576, + "learning_rate": 3.339215544345735e-05, + "loss": 1.6799, + "step": 18520 + }, + { + "epoch": 699.622641509434, + "grad_norm": 1.254964544152517, + "learning_rate": 3.3320634952616736e-05, + "loss": 1.6554, + "step": 18540 + }, + { + "epoch": 700.377358490566, + "grad_norm": 1.4098934710763775, + "learning_rate": 3.3249149961952686e-05, + "loss": 1.6821, + "step": 18560 + }, + { + "epoch": 701.1320754716982, + "grad_norm": 1.449098110180846, + "learning_rate": 3.3177700747925484e-05, + "loss": 1.6775, + "step": 18580 + }, + { + "epoch": 701.8867924528302, + "grad_norm": 1.4166300599178772, + "learning_rate": 3.310628758685702e-05, + "loss": 1.6647, + "step": 18600 + }, + { + "epoch": 702.6415094339623, + "grad_norm": 1.3321739096846923, + "learning_rate": 3.30349107549298e-05, + "loss": 1.6606, + "step": 18620 + }, + { + "epoch": 703.3962264150944, + "grad_norm": 1.3195021828180338, + "learning_rate": 3.2963570528185814e-05, + "loss": 1.6414, + "step": 18640 + }, + { + "epoch": 704.1509433962265, + "grad_norm": 1.2954808039261523, + "learning_rate": 3.2892267182525456e-05, + "loss": 1.6691, + "step": 18660 + }, + { + "epoch": 704.9056603773585, + "grad_norm": 1.3215765511079391, + "learning_rate": 3.2821000993706524e-05, + "loss": 1.6774, + "step": 18680 + }, + { + "epoch": 705.6603773584906, + "grad_norm": 1.3256079186058618, + "learning_rate": 3.2749772237343104e-05, + "loss": 1.6675, + "step": 18700 + }, + { + "epoch": 706.4150943396227, + "grad_norm": 1.3105427183809564, + "learning_rate": 3.26785811889045e-05, + "loss": 1.669, + "step": 18720 + }, + { + "epoch": 707.1698113207547, + "grad_norm": 1.1406031822674032, + "learning_rate": 3.26074281237142e-05, + "loss": 1.6528, + "step": 18740 + }, + { + "epoch": 707.9245283018868, + "grad_norm": 1.1721675684528943, + "learning_rate": 3.253631331694882e-05, + "loss": 1.6243, + "step": 18760 + }, + { + "epoch": 708.6792452830189, + "grad_norm": 1.262858428237141, + "learning_rate": 3.2465237043636945e-05, + "loss": 1.6811, + "step": 18780 + }, + { + "epoch": 709.433962264151, + "grad_norm": 1.3398257997775693, + "learning_rate": 3.239419957865822e-05, + "loss": 1.6531, + "step": 18800 + }, + { + "epoch": 710.188679245283, + "grad_norm": 1.3245763474105379, + "learning_rate": 3.2323201196742164e-05, + "loss": 1.6796, + "step": 18820 + }, + { + "epoch": 710.9433962264151, + "grad_norm": 1.3633874472219405, + "learning_rate": 3.225224217246712e-05, + "loss": 1.6544, + "step": 18840 + }, + { + "epoch": 711.6981132075472, + "grad_norm": 1.7407734601052158, + "learning_rate": 3.218132278025927e-05, + "loss": 1.6765, + "step": 18860 + }, + { + "epoch": 712.4528301886793, + "grad_norm": 1.4569167040451834, + "learning_rate": 3.2110443294391486e-05, + "loss": 1.6411, + "step": 18880 + }, + { + "epoch": 713.2075471698113, + "grad_norm": 1.3711197707215454, + "learning_rate": 3.203960398898234e-05, + "loss": 1.6385, + "step": 18900 + }, + { + "epoch": 713.9622641509434, + "grad_norm": 1.2731560765553942, + "learning_rate": 3.196880513799497e-05, + "loss": 1.6605, + "step": 18920 + }, + { + "epoch": 714.7169811320755, + "grad_norm": 1.3127125434194904, + "learning_rate": 3.189804701523608e-05, + "loss": 1.6774, + "step": 18940 + }, + { + "epoch": 715.4716981132076, + "grad_norm": 1.3249230445075728, + "learning_rate": 3.1827329894354874e-05, + "loss": 1.6753, + "step": 18960 + }, + { + "epoch": 716.2264150943396, + "grad_norm": 1.4612490587732805, + "learning_rate": 3.1756654048842e-05, + "loss": 1.655, + "step": 18980 + }, + { + "epoch": 716.9811320754717, + "grad_norm": 1.278645383417836, + "learning_rate": 3.1686019752028424e-05, + "loss": 1.6692, + "step": 19000 + }, + { + "epoch": 717.7358490566038, + "grad_norm": 1.3408714115191198, + "learning_rate": 3.161542727708446e-05, + "loss": 1.6448, + "step": 19020 + }, + { + "epoch": 718.4905660377359, + "grad_norm": 1.8695203026536409, + "learning_rate": 3.154487689701869e-05, + "loss": 1.6786, + "step": 19040 + }, + { + "epoch": 719.2452830188679, + "grad_norm": 1.3167685135936378, + "learning_rate": 3.147436888467689e-05, + "loss": 1.6625, + "step": 19060 + }, + { + "epoch": 720.0, + "grad_norm": 1.2539486625475944, + "learning_rate": 3.140390351274096e-05, + "loss": 1.6533, + "step": 19080 + }, + { + "epoch": 720.7547169811321, + "grad_norm": 1.3710221082026877, + "learning_rate": 3.133348105372793e-05, + "loss": 1.677, + "step": 19100 + }, + { + "epoch": 721.5094339622641, + "grad_norm": 1.528521350034396, + "learning_rate": 3.126310177998883e-05, + "loss": 1.6593, + "step": 19120 + }, + { + "epoch": 722.2641509433962, + "grad_norm": 1.2092386328287839, + "learning_rate": 3.1192765963707726e-05, + "loss": 1.669, + "step": 19140 + }, + { + "epoch": 723.0188679245283, + "grad_norm": 1.605845379972632, + "learning_rate": 3.1122473876900574e-05, + "loss": 1.6372, + "step": 19160 + }, + { + "epoch": 723.7735849056604, + "grad_norm": 1.5877097735994508, + "learning_rate": 3.105222579141423e-05, + "loss": 1.6557, + "step": 19180 + }, + { + "epoch": 724.5283018867924, + "grad_norm": 1.2516228941598748, + "learning_rate": 3.098202197892538e-05, + "loss": 1.6513, + "step": 19200 + }, + { + "epoch": 725.2830188679245, + "grad_norm": 1.2391402579938813, + "learning_rate": 3.091186271093947e-05, + "loss": 1.6526, + "step": 19220 + }, + { + "epoch": 726.0377358490566, + "grad_norm": 1.2782890497326889, + "learning_rate": 3.084174825878972e-05, + "loss": 1.6591, + "step": 19240 + }, + { + "epoch": 726.7924528301887, + "grad_norm": 1.2506962493164657, + "learning_rate": 3.0771678893635963e-05, + "loss": 1.65, + "step": 19260 + }, + { + "epoch": 727.5471698113207, + "grad_norm": 1.768116692306316, + "learning_rate": 3.070165488646371e-05, + "loss": 1.6516, + "step": 19280 + }, + { + "epoch": 728.3018867924528, + "grad_norm": 1.559057461009202, + "learning_rate": 3.063167650808307e-05, + "loss": 1.6616, + "step": 19300 + }, + { + "epoch": 729.0566037735849, + "grad_norm": 1.2888728962143756, + "learning_rate": 3.0561744029127636e-05, + "loss": 1.6574, + "step": 19320 + }, + { + "epoch": 729.811320754717, + "grad_norm": 1.2688734788741953, + "learning_rate": 3.049185772005353e-05, + "loss": 1.618, + "step": 19340 + }, + { + "epoch": 730.566037735849, + "grad_norm": 1.155730285013269, + "learning_rate": 3.0422017851138287e-05, + "loss": 1.6515, + "step": 19360 + }, + { + "epoch": 731.3207547169811, + "grad_norm": 1.7451043683696195, + "learning_rate": 3.0352224692479883e-05, + "loss": 1.6371, + "step": 19380 + }, + { + "epoch": 732.0754716981132, + "grad_norm": 1.526187340694129, + "learning_rate": 3.0282478513995598e-05, + "loss": 1.6523, + "step": 19400 + }, + { + "epoch": 732.8301886792453, + "grad_norm": 1.4075608712323138, + "learning_rate": 3.0212779585421064e-05, + "loss": 1.6335, + "step": 19420 + }, + { + "epoch": 733.5849056603773, + "grad_norm": 1.345293550699471, + "learning_rate": 3.0143128176309125e-05, + "loss": 1.6505, + "step": 19440 + }, + { + "epoch": 734.3396226415094, + "grad_norm": 1.3467855791600631, + "learning_rate": 3.007352455602892e-05, + "loss": 1.6591, + "step": 19460 + }, + { + "epoch": 735.0943396226415, + "grad_norm": 1.3667404544607202, + "learning_rate": 3.000396899376472e-05, + "loss": 1.6244, + "step": 19480 + }, + { + "epoch": 735.8490566037735, + "grad_norm": 1.2844014927173513, + "learning_rate": 2.9934461758514944e-05, + "loss": 1.6154, + "step": 19500 + }, + { + "epoch": 736.6037735849056, + "grad_norm": 1.46598947181564, + "learning_rate": 2.986500311909114e-05, + "loss": 1.6443, + "step": 19520 + }, + { + "epoch": 737.3584905660377, + "grad_norm": 1.2682755408237392, + "learning_rate": 2.9795593344116856e-05, + "loss": 1.6492, + "step": 19540 + }, + { + "epoch": 738.1132075471698, + "grad_norm": 1.4017683975117536, + "learning_rate": 2.972623270202674e-05, + "loss": 1.6614, + "step": 19560 + }, + { + "epoch": 738.8679245283018, + "grad_norm": 1.5142927604100354, + "learning_rate": 2.9656921461065357e-05, + "loss": 1.6357, + "step": 19580 + }, + { + "epoch": 739.622641509434, + "grad_norm": 1.2492564466728204, + "learning_rate": 2.958765988928627e-05, + "loss": 1.6468, + "step": 19600 + }, + { + "epoch": 740.377358490566, + "grad_norm": 1.4008655564779207, + "learning_rate": 2.951844825455089e-05, + "loss": 1.64, + "step": 19620 + }, + { + "epoch": 741.1320754716982, + "grad_norm": 1.2731601803567079, + "learning_rate": 2.944928682452759e-05, + "loss": 1.6324, + "step": 19640 + }, + { + "epoch": 741.8867924528302, + "grad_norm": 1.5569572939387173, + "learning_rate": 2.9380175866690493e-05, + "loss": 1.6368, + "step": 19660 + }, + { + "epoch": 742.6415094339623, + "grad_norm": 1.3215892057968033, + "learning_rate": 2.9311115648318603e-05, + "loss": 1.5918, + "step": 19680 + }, + { + "epoch": 743.3962264150944, + "grad_norm": 1.301974969557669, + "learning_rate": 2.924210643649462e-05, + "loss": 1.625, + "step": 19700 + }, + { + "epoch": 744.1509433962265, + "grad_norm": 1.245601615853851, + "learning_rate": 2.917314849810405e-05, + "loss": 1.6436, + "step": 19720 + }, + { + "epoch": 744.9056603773585, + "grad_norm": 1.4218013050424188, + "learning_rate": 2.9104242099834047e-05, + "loss": 1.633, + "step": 19740 + }, + { + "epoch": 745.6603773584906, + "grad_norm": 1.584425316406802, + "learning_rate": 2.9035387508172488e-05, + "loss": 1.654, + "step": 19760 + }, + { + "epoch": 746.4150943396227, + "grad_norm": 1.237326080185327, + "learning_rate": 2.896658498940685e-05, + "loss": 1.6417, + "step": 19780 + }, + { + "epoch": 747.1698113207547, + "grad_norm": 1.336327325511772, + "learning_rate": 2.8897834809623266e-05, + "loss": 1.6278, + "step": 19800 + }, + { + "epoch": 747.9245283018868, + "grad_norm": 1.3731531069367304, + "learning_rate": 2.8829137234705436e-05, + "loss": 1.6339, + "step": 19820 + }, + { + "epoch": 748.6792452830189, + "grad_norm": 1.4396961322439583, + "learning_rate": 2.8760492530333595e-05, + "loss": 1.6132, + "step": 19840 + }, + { + "epoch": 749.433962264151, + "grad_norm": 1.4566587475130242, + "learning_rate": 2.869190096198354e-05, + "loss": 1.6236, + "step": 19860 + }, + { + "epoch": 750.188679245283, + "grad_norm": 1.3378675846922892, + "learning_rate": 2.8623362794925554e-05, + "loss": 1.6407, + "step": 19880 + }, + { + "epoch": 750.9433962264151, + "grad_norm": 1.1248642736382553, + "learning_rate": 2.85548782942234e-05, + "loss": 1.6328, + "step": 19900 + }, + { + "epoch": 751.6981132075472, + "grad_norm": 1.281060533625914, + "learning_rate": 2.8486447724733283e-05, + "loss": 1.6288, + "step": 19920 + }, + { + "epoch": 752.4528301886793, + "grad_norm": 1.2477580789710936, + "learning_rate": 2.841807135110286e-05, + "loss": 1.6129, + "step": 19940 + }, + { + "epoch": 753.2075471698113, + "grad_norm": 1.3050801379092132, + "learning_rate": 2.8349749437770146e-05, + "loss": 1.6259, + "step": 19960 + }, + { + "epoch": 753.9622641509434, + "grad_norm": 1.6556396088385372, + "learning_rate": 2.8281482248962588e-05, + "loss": 1.6264, + "step": 19980 + }, + { + "epoch": 754.7169811320755, + "grad_norm": 1.442836668716919, + "learning_rate": 2.8213270048695976e-05, + "loss": 1.6286, + "step": 20000 + }, + { + "epoch": 755.4716981132076, + "grad_norm": 1.3276233208619523, + "learning_rate": 2.814511310077342e-05, + "loss": 1.6485, + "step": 20020 + }, + { + "epoch": 756.2264150943396, + "grad_norm": 1.2751456415696178, + "learning_rate": 2.807701166878436e-05, + "loss": 1.622, + "step": 20040 + }, + { + "epoch": 756.9811320754717, + "grad_norm": 1.2003976158870355, + "learning_rate": 2.8008966016103532e-05, + "loss": 1.6002, + "step": 20060 + }, + { + "epoch": 757.7358490566038, + "grad_norm": 1.3873947326300384, + "learning_rate": 2.7940976405889962e-05, + "loss": 1.5892, + "step": 20080 + }, + { + "epoch": 758.4905660377359, + "grad_norm": 1.6648131685984493, + "learning_rate": 2.787304310108591e-05, + "loss": 1.6496, + "step": 20100 + }, + { + "epoch": 759.2452830188679, + "grad_norm": 1.4092462550250433, + "learning_rate": 2.780516636441591e-05, + "loss": 1.6222, + "step": 20120 + }, + { + "epoch": 760.0, + "grad_norm": 1.3221797397344044, + "learning_rate": 2.7737346458385732e-05, + "loss": 1.6276, + "step": 20140 + }, + { + "epoch": 760.7547169811321, + "grad_norm": 1.2328101453363856, + "learning_rate": 2.766958364528132e-05, + "loss": 1.6199, + "step": 20160 + }, + { + "epoch": 761.5094339622641, + "grad_norm": 1.198723191362267, + "learning_rate": 2.7601878187167865e-05, + "loss": 1.6028, + "step": 20180 + }, + { + "epoch": 762.2641509433962, + "grad_norm": 1.9424131363478752, + "learning_rate": 2.7534230345888686e-05, + "loss": 1.6155, + "step": 20200 + }, + { + "epoch": 763.0188679245283, + "grad_norm": 1.3568601924624037, + "learning_rate": 2.7466640383064343e-05, + "loss": 1.615, + "step": 20220 + }, + { + "epoch": 763.7735849056604, + "grad_norm": 1.6734295204532768, + "learning_rate": 2.7399108560091492e-05, + "loss": 1.6127, + "step": 20240 + }, + { + "epoch": 764.5283018867924, + "grad_norm": 1.3054727154474908, + "learning_rate": 2.7331635138141997e-05, + "loss": 1.6121, + "step": 20260 + }, + { + "epoch": 765.2830188679245, + "grad_norm": 1.4085434131191898, + "learning_rate": 2.7264220378161817e-05, + "loss": 1.5995, + "step": 20280 + }, + { + "epoch": 766.0377358490566, + "grad_norm": 1.2882798163186127, + "learning_rate": 2.719686454087006e-05, + "loss": 1.6209, + "step": 20300 + }, + { + "epoch": 766.7924528301887, + "grad_norm": 1.3843343328010425, + "learning_rate": 2.712956788675799e-05, + "loss": 1.6253, + "step": 20320 + }, + { + "epoch": 767.5471698113207, + "grad_norm": 1.2235858453276647, + "learning_rate": 2.7062330676087928e-05, + "loss": 1.5965, + "step": 20340 + }, + { + "epoch": 768.3018867924528, + "grad_norm": 3.572459256976869, + "learning_rate": 2.6995153168892342e-05, + "loss": 1.6146, + "step": 20360 + }, + { + "epoch": 769.0566037735849, + "grad_norm": 1.6994398915504043, + "learning_rate": 2.692803562497278e-05, + "loss": 1.6034, + "step": 20380 + }, + { + "epoch": 769.811320754717, + "grad_norm": 1.2122097844602269, + "learning_rate": 2.6860978303898913e-05, + "loss": 1.6133, + "step": 20400 + }, + { + "epoch": 770.566037735849, + "grad_norm": 1.5836773539567761, + "learning_rate": 2.6793981465007477e-05, + "loss": 1.6149, + "step": 20420 + }, + { + "epoch": 771.3207547169811, + "grad_norm": 1.9577284294586506, + "learning_rate": 2.6727045367401357e-05, + "loss": 1.6038, + "step": 20440 + }, + { + "epoch": 772.0754716981132, + "grad_norm": 1.453554282623515, + "learning_rate": 2.6660170269948445e-05, + "loss": 1.6425, + "step": 20460 + }, + { + "epoch": 772.8301886792453, + "grad_norm": 1.3031139346537821, + "learning_rate": 2.65933564312808e-05, + "loss": 1.5996, + "step": 20480 + }, + { + "epoch": 773.5849056603773, + "grad_norm": 1.2921513380534098, + "learning_rate": 2.6526604109793517e-05, + "loss": 1.6097, + "step": 20500 + }, + { + "epoch": 774.3396226415094, + "grad_norm": 1.2706024142950736, + "learning_rate": 2.6459913563643797e-05, + "loss": 1.6151, + "step": 20520 + }, + { + "epoch": 775.0943396226415, + "grad_norm": 1.3170228555500274, + "learning_rate": 2.6393285050749948e-05, + "loss": 1.6117, + "step": 20540 + }, + { + "epoch": 775.8490566037735, + "grad_norm": 1.4811497809397014, + "learning_rate": 2.6326718828790347e-05, + "loss": 1.6065, + "step": 20560 + }, + { + "epoch": 776.6037735849056, + "grad_norm": 1.3171774089155976, + "learning_rate": 2.6260215155202478e-05, + "loss": 1.5846, + "step": 20580 + }, + { + "epoch": 777.3584905660377, + "grad_norm": 1.4867958271178354, + "learning_rate": 2.6193774287181905e-05, + "loss": 1.6182, + "step": 20600 + }, + { + "epoch": 778.1132075471698, + "grad_norm": 1.485845846341643, + "learning_rate": 2.612739648168134e-05, + "loss": 1.618, + "step": 20620 + }, + { + "epoch": 778.8679245283018, + "grad_norm": 1.3411546152150449, + "learning_rate": 2.6061081995409594e-05, + "loss": 1.5979, + "step": 20640 + }, + { + "epoch": 779.622641509434, + "grad_norm": 1.352180099861608, + "learning_rate": 2.5994831084830585e-05, + "loss": 1.607, + "step": 20660 + }, + { + "epoch": 780.377358490566, + "grad_norm": 1.4317453454675355, + "learning_rate": 2.5928644006162356e-05, + "loss": 1.63, + "step": 20680 + }, + { + "epoch": 781.1320754716982, + "grad_norm": 1.3910753254665948, + "learning_rate": 2.5862521015376083e-05, + "loss": 1.6066, + "step": 20700 + }, + { + "epoch": 781.8867924528302, + "grad_norm": 1.3073180519851255, + "learning_rate": 2.579646236819513e-05, + "loss": 1.6064, + "step": 20720 + }, + { + "epoch": 782.6415094339623, + "grad_norm": 1.2595549167905473, + "learning_rate": 2.5730468320093977e-05, + "loss": 1.5911, + "step": 20740 + }, + { + "epoch": 783.3962264150944, + "grad_norm": 1.2678103789921547, + "learning_rate": 2.566453912629729e-05, + "loss": 1.5817, + "step": 20760 + }, + { + "epoch": 784.1509433962265, + "grad_norm": 1.5236492215060178, + "learning_rate": 2.5598675041778895e-05, + "loss": 1.6007, + "step": 20780 + }, + { + "epoch": 784.9056603773585, + "grad_norm": 1.4661421776894412, + "learning_rate": 2.553287632126086e-05, + "loss": 1.5504, + "step": 20800 + }, + { + "epoch": 785.6603773584906, + "grad_norm": 2.1927891520328635, + "learning_rate": 2.5467143219212452e-05, + "loss": 1.5841, + "step": 20820 + }, + { + "epoch": 786.4150943396227, + "grad_norm": 1.3795117819084444, + "learning_rate": 2.5401475989849135e-05, + "loss": 1.6066, + "step": 20840 + }, + { + "epoch": 787.1698113207547, + "grad_norm": 1.4556438165329462, + "learning_rate": 2.5335874887131648e-05, + "loss": 1.5968, + "step": 20860 + }, + { + "epoch": 787.9245283018868, + "grad_norm": 1.4073316916031215, + "learning_rate": 2.5270340164764954e-05, + "loss": 1.5903, + "step": 20880 + }, + { + "epoch": 788.6792452830189, + "grad_norm": 1.519045194155026, + "learning_rate": 2.5204872076197373e-05, + "loss": 1.6143, + "step": 20900 + }, + { + "epoch": 789.433962264151, + "grad_norm": 1.371854180935982, + "learning_rate": 2.513947087461945e-05, + "loss": 1.5956, + "step": 20920 + }, + { + "epoch": 790.188679245283, + "grad_norm": 1.3445443793198255, + "learning_rate": 2.5074136812963086e-05, + "loss": 1.6161, + "step": 20940 + }, + { + "epoch": 790.9433962264151, + "grad_norm": 1.3427364962397694, + "learning_rate": 2.5008870143900505e-05, + "loss": 1.5568, + "step": 20960 + }, + { + "epoch": 791.6981132075472, + "grad_norm": 1.2656549996025988, + "learning_rate": 2.4943671119843328e-05, + "loss": 1.5955, + "step": 20980 + }, + { + "epoch": 792.4528301886793, + "grad_norm": 1.4205258402430134, + "learning_rate": 2.4878539992941564e-05, + "loss": 1.5806, + "step": 21000 + }, + { + "epoch": 793.2075471698113, + "grad_norm": 1.6035321030423435, + "learning_rate": 2.4813477015082614e-05, + "loss": 1.6141, + "step": 21020 + }, + { + "epoch": 793.9622641509434, + "grad_norm": 1.411461969155631, + "learning_rate": 2.4748482437890327e-05, + "loss": 1.613, + "step": 21040 + }, + { + "epoch": 794.7169811320755, + "grad_norm": 1.5232357865305386, + "learning_rate": 2.4683556512724013e-05, + "loss": 1.5999, + "step": 21060 + }, + { + "epoch": 795.4716981132076, + "grad_norm": 1.423060839013135, + "learning_rate": 2.4618699490677522e-05, + "loss": 1.6014, + "step": 21080 + }, + { + "epoch": 796.2264150943396, + "grad_norm": 1.3310370009240546, + "learning_rate": 2.4553911622578173e-05, + "loss": 1.5633, + "step": 21100 + }, + { + "epoch": 796.9811320754717, + "grad_norm": 1.5449295211536895, + "learning_rate": 2.4489193158985862e-05, + "loss": 1.5948, + "step": 21120 + }, + { + "epoch": 797.7358490566038, + "grad_norm": 1.4953862144554202, + "learning_rate": 2.4424544350192054e-05, + "loss": 1.5576, + "step": 21140 + }, + { + "epoch": 798.4905660377359, + "grad_norm": 1.4322654299272977, + "learning_rate": 2.4359965446218893e-05, + "loss": 1.6043, + "step": 21160 + }, + { + "epoch": 799.2452830188679, + "grad_norm": 1.230403444648656, + "learning_rate": 2.4295456696818116e-05, + "loss": 1.5875, + "step": 21180 + }, + { + "epoch": 800.0, + "grad_norm": 1.2803680639521113, + "learning_rate": 2.423101835147014e-05, + "loss": 1.5929, + "step": 21200 + }, + { + "epoch": 800.7547169811321, + "grad_norm": 1.3170298641719804, + "learning_rate": 2.4166650659383118e-05, + "loss": 1.5807, + "step": 21220 + }, + { + "epoch": 801.5094339622641, + "grad_norm": 1.6269742919477346, + "learning_rate": 2.410235386949199e-05, + "loss": 1.6065, + "step": 21240 + }, + { + "epoch": 802.2641509433962, + "grad_norm": 1.5458337442207868, + "learning_rate": 2.4038128230457458e-05, + "loss": 1.5717, + "step": 21260 + }, + { + "epoch": 803.0188679245283, + "grad_norm": 2.161638931230412, + "learning_rate": 2.3973973990665043e-05, + "loss": 1.5762, + "step": 21280 + }, + { + "epoch": 803.7735849056604, + "grad_norm": 1.4046972399313973, + "learning_rate": 2.3909891398224146e-05, + "loss": 1.5661, + "step": 21300 + }, + { + "epoch": 804.5283018867924, + "grad_norm": 1.35050834441664, + "learning_rate": 2.3845880700967103e-05, + "loss": 1.5706, + "step": 21320 + }, + { + "epoch": 805.2830188679245, + "grad_norm": 1.5896148693041472, + "learning_rate": 2.3781942146448204e-05, + "loss": 1.5729, + "step": 21340 + }, + { + "epoch": 806.0377358490566, + "grad_norm": 1.5191801749378997, + "learning_rate": 2.3718075981942708e-05, + "loss": 1.5602, + "step": 21360 + }, + { + "epoch": 806.7924528301887, + "grad_norm": 1.1849456023631704, + "learning_rate": 2.3654282454445914e-05, + "loss": 1.5577, + "step": 21380 + }, + { + "epoch": 807.5471698113207, + "grad_norm": 1.6435607024595327, + "learning_rate": 2.3590561810672222e-05, + "loss": 1.5806, + "step": 21400 + }, + { + "epoch": 808.3018867924528, + "grad_norm": 2.2764964715817153, + "learning_rate": 2.3526914297054165e-05, + "loss": 1.5465, + "step": 21420 + }, + { + "epoch": 809.0566037735849, + "grad_norm": 1.143095634553467, + "learning_rate": 2.3463340159741438e-05, + "loss": 1.5608, + "step": 21440 + }, + { + "epoch": 809.811320754717, + "grad_norm": 1.786405965035776, + "learning_rate": 2.3399839644599966e-05, + "loss": 1.5685, + "step": 21460 + }, + { + "epoch": 810.566037735849, + "grad_norm": 1.7826688318895536, + "learning_rate": 2.3336412997210945e-05, + "loss": 1.5673, + "step": 21480 + }, + { + "epoch": 811.3207547169811, + "grad_norm": 1.9973557900265262, + "learning_rate": 2.3273060462869915e-05, + "loss": 1.58, + "step": 21500 + }, + { + "epoch": 812.0754716981132, + "grad_norm": 2.725826480276118, + "learning_rate": 2.320978228658578e-05, + "loss": 1.5798, + "step": 21520 + }, + { + "epoch": 812.8301886792453, + "grad_norm": 1.3785885365626125, + "learning_rate": 2.3146578713079873e-05, + "loss": 1.584, + "step": 21540 + }, + { + "epoch": 813.5849056603773, + "grad_norm": 1.3942585361528321, + "learning_rate": 2.308344998678499e-05, + "loss": 1.5801, + "step": 21560 + }, + { + "epoch": 814.3396226415094, + "grad_norm": 1.3778634108496939, + "learning_rate": 2.3020396351844476e-05, + "loss": 1.587, + "step": 21580 + }, + { + "epoch": 815.0943396226415, + "grad_norm": 1.3226870132637325, + "learning_rate": 2.2957418052111304e-05, + "loss": 1.5666, + "step": 21600 + }, + { + "epoch": 815.8490566037735, + "grad_norm": 1.4483636324260303, + "learning_rate": 2.2894515331147043e-05, + "loss": 1.5721, + "step": 21620 + }, + { + "epoch": 816.6037735849056, + "grad_norm": 1.2510733452498213, + "learning_rate": 2.2831688432220988e-05, + "loss": 1.5909, + "step": 21640 + }, + { + "epoch": 817.3584905660377, + "grad_norm": 1.4064726551185514, + "learning_rate": 2.2768937598309226e-05, + "loss": 1.5581, + "step": 21660 + }, + { + "epoch": 818.1132075471698, + "grad_norm": 1.3598025710319712, + "learning_rate": 2.2706263072093622e-05, + "loss": 1.5798, + "step": 21680 + }, + { + "epoch": 818.8679245283018, + "grad_norm": 1.4055798705480538, + "learning_rate": 2.2643665095960992e-05, + "loss": 1.5376, + "step": 21700 + }, + { + "epoch": 819.622641509434, + "grad_norm": 1.3886244820387288, + "learning_rate": 2.258114391200204e-05, + "loss": 1.588, + "step": 21720 + }, + { + "epoch": 820.377358490566, + "grad_norm": 1.672353853467523, + "learning_rate": 2.2518699762010527e-05, + "loss": 1.5771, + "step": 21740 + }, + { + "epoch": 821.1320754716982, + "grad_norm": 1.6122695109482281, + "learning_rate": 2.245633288748226e-05, + "loss": 1.5744, + "step": 21760 + }, + { + "epoch": 821.8867924528302, + "grad_norm": 1.2184243938930763, + "learning_rate": 2.239404352961424e-05, + "loss": 1.5579, + "step": 21780 + }, + { + "epoch": 822.6415094339623, + "grad_norm": 2.6739030707563383, + "learning_rate": 2.233183192930362e-05, + "loss": 1.5742, + "step": 21800 + }, + { + "epoch": 823.3962264150944, + "grad_norm": 1.513583565471533, + "learning_rate": 2.22696983271469e-05, + "loss": 1.5543, + "step": 21820 + }, + { + "epoch": 824.1509433962265, + "grad_norm": 1.5062076381870015, + "learning_rate": 2.2207642963438875e-05, + "loss": 1.5578, + "step": 21840 + }, + { + "epoch": 824.9056603773585, + "grad_norm": 1.4515191009181103, + "learning_rate": 2.2145666078171794e-05, + "loss": 1.5599, + "step": 21860 + }, + { + "epoch": 825.6603773584906, + "grad_norm": 1.7800885670540134, + "learning_rate": 2.2083767911034394e-05, + "loss": 1.5724, + "step": 21880 + }, + { + "epoch": 826.4150943396227, + "grad_norm": 1.655570469233021, + "learning_rate": 2.2021948701410956e-05, + "loss": 1.5722, + "step": 21900 + }, + { + "epoch": 827.1698113207547, + "grad_norm": 1.682338091450034, + "learning_rate": 2.1960208688380426e-05, + "loss": 1.5289, + "step": 21920 + }, + { + "epoch": 827.9245283018868, + "grad_norm": 1.3769805944636337, + "learning_rate": 2.189854811071546e-05, + "loss": 1.5523, + "step": 21940 + }, + { + "epoch": 828.6792452830189, + "grad_norm": 1.2988448014856364, + "learning_rate": 2.183696720688152e-05, + "loss": 1.5493, + "step": 21960 + }, + { + "epoch": 829.433962264151, + "grad_norm": 1.352528590030774, + "learning_rate": 2.1775466215035887e-05, + "loss": 1.5505, + "step": 21980 + }, + { + "epoch": 830.188679245283, + "grad_norm": 1.9587571716355492, + "learning_rate": 2.1714045373026878e-05, + "loss": 1.5611, + "step": 22000 + }, + { + "epoch": 830.9433962264151, + "grad_norm": 1.4092678213797292, + "learning_rate": 2.165270491839274e-05, + "loss": 1.5799, + "step": 22020 + }, + { + "epoch": 831.6981132075472, + "grad_norm": 1.2980309483736483, + "learning_rate": 2.159144508836092e-05, + "loss": 1.5409, + "step": 22040 + }, + { + "epoch": 832.4528301886793, + "grad_norm": 2.367411086569801, + "learning_rate": 2.1530266119847e-05, + "loss": 1.5565, + "step": 22060 + }, + { + "epoch": 833.2075471698113, + "grad_norm": 1.4677294354247894, + "learning_rate": 2.146916824945386e-05, + "loss": 1.567, + "step": 22080 + }, + { + "epoch": 833.9622641509434, + "grad_norm": 1.2034171336508228, + "learning_rate": 2.1408151713470727e-05, + "loss": 1.5324, + "step": 22100 + }, + { + "epoch": 834.7169811320755, + "grad_norm": 1.6112142759671855, + "learning_rate": 2.1347216747872316e-05, + "loss": 1.5728, + "step": 22120 + }, + { + "epoch": 835.4716981132076, + "grad_norm": 2.3612009460762025, + "learning_rate": 2.1286363588317815e-05, + "loss": 1.5777, + "step": 22140 + }, + { + "epoch": 836.2264150943396, + "grad_norm": 1.3794177780422423, + "learning_rate": 2.122559247015011e-05, + "loss": 1.5337, + "step": 22160 + }, + { + "epoch": 836.9811320754717, + "grad_norm": 1.4913217058342938, + "learning_rate": 2.116490362839475e-05, + "loss": 1.5712, + "step": 22180 + }, + { + "epoch": 837.7358490566038, + "grad_norm": 1.393269094002593, + "learning_rate": 2.1104297297759077e-05, + "loss": 1.56, + "step": 22200 + }, + { + "epoch": 838.4905660377359, + "grad_norm": 1.5277254368751014, + "learning_rate": 2.104377371263138e-05, + "loss": 1.564, + "step": 22220 + }, + { + "epoch": 839.2452830188679, + "grad_norm": 1.8220574387124733, + "learning_rate": 2.0983333107079923e-05, + "loss": 1.593, + "step": 22240 + }, + { + "epoch": 840.0, + "grad_norm": 1.4636327213867844, + "learning_rate": 2.0922975714852024e-05, + "loss": 1.5482, + "step": 22260 + }, + { + "epoch": 840.7547169811321, + "grad_norm": 1.374724993121681, + "learning_rate": 2.0862701769373194e-05, + "loss": 1.5386, + "step": 22280 + }, + { + "epoch": 841.5094339622641, + "grad_norm": 1.3056844963466483, + "learning_rate": 2.0802511503746282e-05, + "loss": 1.5499, + "step": 22300 + }, + { + "epoch": 842.2641509433962, + "grad_norm": 1.8941001751457995, + "learning_rate": 2.074240515075041e-05, + "loss": 1.5556, + "step": 22320 + }, + { + "epoch": 843.0188679245283, + "grad_norm": 1.5811456544096827, + "learning_rate": 2.0682382942840276e-05, + "loss": 1.5301, + "step": 22340 + }, + { + "epoch": 843.7735849056604, + "grad_norm": 1.6509929914813097, + "learning_rate": 2.062244511214511e-05, + "loss": 1.5114, + "step": 22360 + }, + { + "epoch": 844.5283018867924, + "grad_norm": 1.7262725135545645, + "learning_rate": 2.0562591890467795e-05, + "loss": 1.5771, + "step": 22380 + }, + { + "epoch": 845.2830188679245, + "grad_norm": 2.3494461416325176, + "learning_rate": 2.050282350928407e-05, + "loss": 1.5355, + "step": 22400 + }, + { + "epoch": 846.0377358490566, + "grad_norm": 1.5449531783263548, + "learning_rate": 2.0443140199741506e-05, + "loss": 1.5322, + "step": 22420 + }, + { + "epoch": 846.7924528301887, + "grad_norm": 1.6993440968380624, + "learning_rate": 2.0383542192658678e-05, + "loss": 1.5595, + "step": 22440 + }, + { + "epoch": 847.5471698113207, + "grad_norm": 1.4219970620295765, + "learning_rate": 2.0324029718524266e-05, + "loss": 1.544, + "step": 22460 + }, + { + "epoch": 848.3018867924528, + "grad_norm": 1.4581628071481192, + "learning_rate": 2.0264603007496174e-05, + "loss": 1.5504, + "step": 22480 + }, + { + "epoch": 849.0566037735849, + "grad_norm": 1.7218288706081564, + "learning_rate": 2.0205262289400635e-05, + "loss": 1.5329, + "step": 22500 + }, + { + "epoch": 849.811320754717, + "grad_norm": 1.557573117936356, + "learning_rate": 2.0146007793731277e-05, + "loss": 1.5413, + "step": 22520 + }, + { + "epoch": 850.566037735849, + "grad_norm": 1.556424340318002, + "learning_rate": 2.0086839749648294e-05, + "loss": 1.585, + "step": 22540 + }, + { + "epoch": 851.3207547169811, + "grad_norm": 1.5130697235799593, + "learning_rate": 2.002775838597753e-05, + "loss": 1.5365, + "step": 22560 + }, + { + "epoch": 852.0754716981132, + "grad_norm": 1.8393652727073544, + "learning_rate": 1.9968763931209628e-05, + "loss": 1.5459, + "step": 22580 + }, + { + "epoch": 852.8301886792453, + "grad_norm": 1.5587158507011118, + "learning_rate": 1.9909856613499096e-05, + "loss": 1.5429, + "step": 22600 + }, + { + "epoch": 853.5849056603773, + "grad_norm": 1.5786253886757977, + "learning_rate": 1.9851036660663427e-05, + "loss": 1.5293, + "step": 22620 + }, + { + "epoch": 854.3396226415094, + "grad_norm": 1.6955187366248636, + "learning_rate": 1.9792304300182305e-05, + "loss": 1.5488, + "step": 22640 + }, + { + "epoch": 855.0943396226415, + "grad_norm": 1.429545844614554, + "learning_rate": 1.9733659759196588e-05, + "loss": 1.533, + "step": 22660 + }, + { + "epoch": 855.8490566037735, + "grad_norm": 1.3624588099774164, + "learning_rate": 1.967510326450757e-05, + "loss": 1.5257, + "step": 22680 + }, + { + "epoch": 856.6037735849056, + "grad_norm": 1.4701659884745055, + "learning_rate": 1.9616635042575986e-05, + "loss": 1.5579, + "step": 22700 + }, + { + "epoch": 857.3584905660377, + "grad_norm": 1.258458227155755, + "learning_rate": 1.9558255319521186e-05, + "loss": 1.5174, + "step": 22720 + }, + { + "epoch": 858.1132075471698, + "grad_norm": 1.147380018733113, + "learning_rate": 1.9499964321120298e-05, + "loss": 1.5483, + "step": 22740 + }, + { + "epoch": 858.8679245283018, + "grad_norm": 1.427160544906616, + "learning_rate": 1.9441762272807296e-05, + "loss": 1.53, + "step": 22760 + }, + { + "epoch": 859.622641509434, + "grad_norm": 1.480555621655005, + "learning_rate": 1.9383649399672136e-05, + "loss": 1.5431, + "step": 22780 + }, + { + "epoch": 860.377358490566, + "grad_norm": 1.6140763796883943, + "learning_rate": 1.9325625926459906e-05, + "loss": 1.5372, + "step": 22800 + }, + { + "epoch": 861.1320754716982, + "grad_norm": 1.3659868727706357, + "learning_rate": 1.9267692077569966e-05, + "loss": 1.5693, + "step": 22820 + }, + { + "epoch": 861.8867924528302, + "grad_norm": 1.668704322839176, + "learning_rate": 1.9209848077055063e-05, + "loss": 1.5491, + "step": 22840 + }, + { + "epoch": 862.6415094339623, + "grad_norm": 1.6416845244091214, + "learning_rate": 1.915209414862045e-05, + "loss": 1.5449, + "step": 22860 + }, + { + "epoch": 863.3962264150944, + "grad_norm": 1.5619688603918687, + "learning_rate": 1.9094430515623036e-05, + "loss": 1.5109, + "step": 22880 + }, + { + "epoch": 864.1509433962265, + "grad_norm": 1.5251429637162535, + "learning_rate": 1.9036857401070517e-05, + "loss": 1.5358, + "step": 22900 + }, + { + "epoch": 864.9056603773585, + "grad_norm": 1.6195136008209567, + "learning_rate": 1.8979375027620553e-05, + "loss": 1.5167, + "step": 22920 + }, + { + "epoch": 865.6603773584906, + "grad_norm": 1.4453402703839808, + "learning_rate": 1.8921983617579843e-05, + "loss": 1.5345, + "step": 22940 + }, + { + "epoch": 866.4150943396227, + "grad_norm": 1.6142287693511135, + "learning_rate": 1.8864683392903296e-05, + "loss": 1.5427, + "step": 22960 + }, + { + "epoch": 867.1698113207547, + "grad_norm": 1.4589091367603184, + "learning_rate": 1.880747457519317e-05, + "loss": 1.4945, + "step": 22980 + }, + { + "epoch": 867.9245283018868, + "grad_norm": 1.485668957375296, + "learning_rate": 1.8750357385698233e-05, + "loss": 1.5278, + "step": 23000 + }, + { + "epoch": 868.6792452830189, + "grad_norm": 1.4865684774055008, + "learning_rate": 1.8693332045312905e-05, + "loss": 1.5178, + "step": 23020 + }, + { + "epoch": 869.433962264151, + "grad_norm": 1.6955473002125137, + "learning_rate": 1.8636398774576337e-05, + "loss": 1.5485, + "step": 23040 + }, + { + "epoch": 870.188679245283, + "grad_norm": 1.5715186186512253, + "learning_rate": 1.857955779367166e-05, + "loss": 1.5192, + "step": 23060 + }, + { + "epoch": 870.9433962264151, + "grad_norm": 1.5717069835325073, + "learning_rate": 1.8522809322425036e-05, + "loss": 1.5106, + "step": 23080 + }, + { + "epoch": 871.6981132075472, + "grad_norm": 1.3775027498551788, + "learning_rate": 1.8466153580304923e-05, + "loss": 1.5255, + "step": 23100 + }, + { + "epoch": 872.4528301886793, + "grad_norm": 1.7060704667189681, + "learning_rate": 1.8409590786421106e-05, + "loss": 1.5152, + "step": 23120 + }, + { + "epoch": 873.2075471698113, + "grad_norm": 1.3772746674273528, + "learning_rate": 1.8353121159523913e-05, + "loss": 1.4952, + "step": 23140 + }, + { + "epoch": 873.9622641509434, + "grad_norm": 1.6021480905291907, + "learning_rate": 1.8296744918003365e-05, + "loss": 1.5548, + "step": 23160 + }, + { + "epoch": 874.7169811320755, + "grad_norm": 1.6510954563611369, + "learning_rate": 1.8240462279888328e-05, + "loss": 1.5341, + "step": 23180 + }, + { + "epoch": 875.4716981132076, + "grad_norm": 1.5525128595509998, + "learning_rate": 1.8184273462845678e-05, + "loss": 1.5399, + "step": 23200 + }, + { + "epoch": 876.2264150943396, + "grad_norm": 1.3584051699815205, + "learning_rate": 1.812817868417943e-05, + "loss": 1.5245, + "step": 23220 + }, + { + "epoch": 876.9811320754717, + "grad_norm": 1.909931733744526, + "learning_rate": 1.8072178160829906e-05, + "loss": 1.5333, + "step": 23240 + }, + { + "epoch": 877.7358490566038, + "grad_norm": 1.7102569423853409, + "learning_rate": 1.8016272109372925e-05, + "loss": 1.5131, + "step": 23260 + }, + { + "epoch": 878.4905660377359, + "grad_norm": 2.4326218341752384, + "learning_rate": 1.7960460746018958e-05, + "loss": 1.4983, + "step": 23280 + }, + { + "epoch": 879.2452830188679, + "grad_norm": 1.6888708257619338, + "learning_rate": 1.790474428661225e-05, + "loss": 1.5268, + "step": 23300 + }, + { + "epoch": 880.0, + "grad_norm": 1.4793278776392822, + "learning_rate": 1.784912294663003e-05, + "loss": 1.5144, + "step": 23320 + }, + { + "epoch": 880.7547169811321, + "grad_norm": 1.3797110952325906, + "learning_rate": 1.7793596941181667e-05, + "loss": 1.5224, + "step": 23340 + }, + { + "epoch": 881.5094339622641, + "grad_norm": 1.5055338530715117, + "learning_rate": 1.7738166485007843e-05, + "loss": 1.5276, + "step": 23360 + }, + { + "epoch": 882.2641509433962, + "grad_norm": 1.3850071229139178, + "learning_rate": 1.768283179247969e-05, + "loss": 1.5216, + "step": 23380 + }, + { + "epoch": 883.0188679245283, + "grad_norm": 1.4681066166997387, + "learning_rate": 1.7627593077597997e-05, + "loss": 1.534, + "step": 23400 + }, + { + "epoch": 883.7735849056604, + "grad_norm": 1.5242995737679692, + "learning_rate": 1.7572450553992356e-05, + "loss": 1.4992, + "step": 23420 + }, + { + "epoch": 884.5283018867924, + "grad_norm": 1.642787390621851, + "learning_rate": 1.751740443492039e-05, + "loss": 1.5002, + "step": 23440 + }, + { + "epoch": 885.2830188679245, + "grad_norm": 1.490074296578881, + "learning_rate": 1.7462454933266846e-05, + "loss": 1.5211, + "step": 23460 + }, + { + "epoch": 886.0377358490566, + "grad_norm": 1.5694629977285655, + "learning_rate": 1.740760226154283e-05, + "loss": 1.5335, + "step": 23480 + }, + { + "epoch": 886.7924528301887, + "grad_norm": 1.6846894322403163, + "learning_rate": 1.7352846631884956e-05, + "loss": 1.4995, + "step": 23500 + }, + { + "epoch": 887.5471698113207, + "grad_norm": 1.4525398790667088, + "learning_rate": 1.7298188256054564e-05, + "loss": 1.4957, + "step": 23520 + }, + { + "epoch": 888.3018867924528, + "grad_norm": 1.635106498771857, + "learning_rate": 1.7243627345436874e-05, + "loss": 1.5271, + "step": 23540 + }, + { + "epoch": 889.0566037735849, + "grad_norm": 1.4587656230559394, + "learning_rate": 1.7189164111040147e-05, + "loss": 1.501, + "step": 23560 + }, + { + "epoch": 889.811320754717, + "grad_norm": 1.5410070982779924, + "learning_rate": 1.71347987634949e-05, + "loss": 1.4982, + "step": 23580 + }, + { + "epoch": 890.566037735849, + "grad_norm": 1.5645035336411055, + "learning_rate": 1.708053151305308e-05, + "loss": 1.5002, + "step": 23600 + }, + { + "epoch": 891.3207547169811, + "grad_norm": 1.3307742805961782, + "learning_rate": 1.702636256958728e-05, + "loss": 1.5184, + "step": 23620 + }, + { + "epoch": 892.0754716981132, + "grad_norm": 1.6962843737118656, + "learning_rate": 1.6972292142589877e-05, + "loss": 1.5107, + "step": 23640 + }, + { + "epoch": 892.8301886792453, + "grad_norm": 1.8950680189724871, + "learning_rate": 1.6918320441172233e-05, + "loss": 1.517, + "step": 23660 + }, + { + "epoch": 893.5849056603773, + "grad_norm": 1.7479434721374532, + "learning_rate": 1.686444767406395e-05, + "loss": 1.5051, + "step": 23680 + }, + { + "epoch": 894.3396226415094, + "grad_norm": 1.8611101959164753, + "learning_rate": 1.6810674049611953e-05, + "loss": 1.5063, + "step": 23700 + }, + { + "epoch": 895.0943396226415, + "grad_norm": 1.5841028344361991, + "learning_rate": 1.67569997757798e-05, + "loss": 1.481, + "step": 23720 + }, + { + "epoch": 895.8490566037735, + "grad_norm": 1.5025051335412982, + "learning_rate": 1.6703425060146778e-05, + "loss": 1.5253, + "step": 23740 + }, + { + "epoch": 896.6037735849056, + "grad_norm": 2.8439948944917757, + "learning_rate": 1.6649950109907165e-05, + "loss": 1.5216, + "step": 23760 + }, + { + "epoch": 897.3584905660377, + "grad_norm": 1.6268608502019901, + "learning_rate": 1.6596575131869387e-05, + "loss": 1.5334, + "step": 23780 + }, + { + "epoch": 898.1132075471698, + "grad_norm": 1.4759450457116179, + "learning_rate": 1.6543300332455273e-05, + "loss": 1.5007, + "step": 23800 + }, + { + "epoch": 898.8679245283018, + "grad_norm": 1.4818248018036755, + "learning_rate": 1.6490125917699203e-05, + "loss": 1.4973, + "step": 23820 + }, + { + "epoch": 899.622641509434, + "grad_norm": 1.548616527993675, + "learning_rate": 1.6437052093247303e-05, + "loss": 1.517, + "step": 23840 + }, + { + "epoch": 900.377358490566, + "grad_norm": 1.5445734121981956, + "learning_rate": 1.6384079064356744e-05, + "loss": 1.521, + "step": 23860 + }, + { + "epoch": 901.1320754716982, + "grad_norm": 1.5970555623190617, + "learning_rate": 1.6331207035894806e-05, + "loss": 1.5172, + "step": 23880 + }, + { + "epoch": 901.8867924528302, + "grad_norm": 1.389904429038452, + "learning_rate": 1.6278436212338226e-05, + "loss": 1.4987, + "step": 23900 + }, + { + "epoch": 902.6415094339623, + "grad_norm": 1.3455191149235926, + "learning_rate": 1.62257667977723e-05, + "loss": 1.5047, + "step": 23920 + }, + { + "epoch": 903.3962264150944, + "grad_norm": 1.4729168638466097, + "learning_rate": 1.6173198995890152e-05, + "loss": 1.5032, + "step": 23940 + }, + { + "epoch": 904.1509433962265, + "grad_norm": 1.5230989764955487, + "learning_rate": 1.612073300999191e-05, + "loss": 1.5244, + "step": 23960 + }, + { + "epoch": 904.9056603773585, + "grad_norm": 1.4504907356107584, + "learning_rate": 1.6068369042983987e-05, + "loss": 1.5072, + "step": 23980 + }, + { + "epoch": 905.6603773584906, + "grad_norm": 1.3570035581449431, + "learning_rate": 1.601610729737819e-05, + "loss": 1.5002, + "step": 24000 + }, + { + "epoch": 906.4150943396227, + "grad_norm": 1.408532335123701, + "learning_rate": 1.5963947975291056e-05, + "loss": 1.4974, + "step": 24020 + }, + { + "epoch": 907.1698113207547, + "grad_norm": 1.6703383627319723, + "learning_rate": 1.591189127844295e-05, + "loss": 1.5056, + "step": 24040 + }, + { + "epoch": 907.9245283018868, + "grad_norm": 1.4548307957349456, + "learning_rate": 1.5859937408157403e-05, + "loss": 1.4836, + "step": 24060 + }, + { + "epoch": 908.6792452830189, + "grad_norm": 1.622725332424491, + "learning_rate": 1.5808086565360235e-05, + "loss": 1.4652, + "step": 24080 + }, + { + "epoch": 909.433962264151, + "grad_norm": 1.9382762093036214, + "learning_rate": 1.575633895057883e-05, + "loss": 1.507, + "step": 24100 + }, + { + "epoch": 910.188679245283, + "grad_norm": 5.171486198720905, + "learning_rate": 1.5704694763941345e-05, + "loss": 1.4918, + "step": 24120 + }, + { + "epoch": 910.9433962264151, + "grad_norm": 1.318697524518072, + "learning_rate": 1.5653154205175963e-05, + "loss": 1.485, + "step": 24140 + }, + { + "epoch": 911.6981132075472, + "grad_norm": 1.640456368314345, + "learning_rate": 1.5601717473610066e-05, + "loss": 1.493, + "step": 24160 + }, + { + "epoch": 912.4528301886793, + "grad_norm": 1.7783411819352481, + "learning_rate": 1.555038476816951e-05, + "loss": 1.5233, + "step": 24180 + }, + { + "epoch": 913.2075471698113, + "grad_norm": 1.8560943552673308, + "learning_rate": 1.5499156287377857e-05, + "loss": 1.4845, + "step": 24200 + }, + { + "epoch": 913.9622641509434, + "grad_norm": 1.3922157561757162, + "learning_rate": 1.544803222935555e-05, + "loss": 1.513, + "step": 24220 + }, + { + "epoch": 914.7169811320755, + "grad_norm": 1.5964166307266414, + "learning_rate": 1.5397012791819248e-05, + "loss": 1.5029, + "step": 24240 + }, + { + "epoch": 915.4716981132076, + "grad_norm": 1.581271765982569, + "learning_rate": 1.5346098172080947e-05, + "loss": 1.5139, + "step": 24260 + }, + { + "epoch": 916.2264150943396, + "grad_norm": 1.3829789961056094, + "learning_rate": 1.5295288567047304e-05, + "loss": 1.4727, + "step": 24280 + }, + { + "epoch": 916.9811320754717, + "grad_norm": 1.595484488791353, + "learning_rate": 1.5244584173218816e-05, + "loss": 1.4764, + "step": 24300 + }, + { + "epoch": 917.7358490566038, + "grad_norm": 1.9817110984943331, + "learning_rate": 1.5193985186689126e-05, + "loss": 1.488, + "step": 24320 + }, + { + "epoch": 918.4905660377359, + "grad_norm": 1.5041365073617188, + "learning_rate": 1.5143491803144183e-05, + "loss": 1.4823, + "step": 24340 + }, + { + "epoch": 919.2452830188679, + "grad_norm": 1.623717820636255, + "learning_rate": 1.5093104217861574e-05, + "loss": 1.4711, + "step": 24360 + }, + { + "epoch": 920.0, + "grad_norm": 1.4153896302283269, + "learning_rate": 1.5042822625709687e-05, + "loss": 1.4729, + "step": 24380 + }, + { + "epoch": 920.7547169811321, + "grad_norm": 1.8914526627670851, + "learning_rate": 1.499264722114699e-05, + "loss": 1.4744, + "step": 24400 + }, + { + "epoch": 921.5094339622641, + "grad_norm": 1.3579367015171855, + "learning_rate": 1.494257819822132e-05, + "loss": 1.5068, + "step": 24420 + }, + { + "epoch": 922.2641509433962, + "grad_norm": 1.7241565511209502, + "learning_rate": 1.4892615750569062e-05, + "loss": 1.4629, + "step": 24440 + }, + { + "epoch": 923.0188679245283, + "grad_norm": 1.6169769566812962, + "learning_rate": 1.4842760071414446e-05, + "loss": 1.4987, + "step": 24460 + }, + { + "epoch": 923.7735849056604, + "grad_norm": 1.9954016377464863, + "learning_rate": 1.4793011353568764e-05, + "loss": 1.5263, + "step": 24480 + }, + { + "epoch": 924.5283018867924, + "grad_norm": 1.4779174235189176, + "learning_rate": 1.4743369789429686e-05, + "loss": 1.4769, + "step": 24500 + }, + { + "epoch": 925.2830188679245, + "grad_norm": 1.7019641943900714, + "learning_rate": 1.4693835570980468e-05, + "loss": 1.4749, + "step": 24520 + }, + { + "epoch": 926.0377358490566, + "grad_norm": 1.5323014302848716, + "learning_rate": 1.4644408889789189e-05, + "loss": 1.4984, + "step": 24540 + }, + { + "epoch": 926.7924528301887, + "grad_norm": 1.446942162217049, + "learning_rate": 1.4595089937008062e-05, + "loss": 1.4998, + "step": 24560 + }, + { + "epoch": 927.5471698113207, + "grad_norm": 1.3609927181175356, + "learning_rate": 1.4545878903372663e-05, + "loss": 1.4765, + "step": 24580 + }, + { + "epoch": 928.3018867924528, + "grad_norm": 1.4584582755904496, + "learning_rate": 1.4496775979201224e-05, + "loss": 1.4828, + "step": 24600 + }, + { + "epoch": 929.0566037735849, + "grad_norm": 1.4254389674669559, + "learning_rate": 1.444778135439385e-05, + "loss": 1.5041, + "step": 24620 + }, + { + "epoch": 929.811320754717, + "grad_norm": 1.5655038573484212, + "learning_rate": 1.4398895218431825e-05, + "loss": 1.4995, + "step": 24640 + }, + { + "epoch": 930.566037735849, + "grad_norm": 1.623569066402965, + "learning_rate": 1.4350117760376843e-05, + "loss": 1.4966, + "step": 24660 + }, + { + "epoch": 931.3207547169811, + "grad_norm": 1.594778698950599, + "learning_rate": 1.4301449168870325e-05, + "loss": 1.4899, + "step": 24680 + }, + { + "epoch": 932.0754716981132, + "grad_norm": 1.7627482209727463, + "learning_rate": 1.4252889632132667e-05, + "loss": 1.4784, + "step": 24700 + }, + { + "epoch": 932.8301886792453, + "grad_norm": 1.5595702425460922, + "learning_rate": 1.4204439337962486e-05, + "loss": 1.4962, + "step": 24720 + }, + { + "epoch": 933.5849056603773, + "grad_norm": 1.6175712268221147, + "learning_rate": 1.4156098473735903e-05, + "loss": 1.4858, + "step": 24740 + }, + { + "epoch": 934.3396226415094, + "grad_norm": 1.5528087670883148, + "learning_rate": 1.4107867226405882e-05, + "loss": 1.4959, + "step": 24760 + }, + { + "epoch": 935.0943396226415, + "grad_norm": 1.5105693139489524, + "learning_rate": 1.4059745782501403e-05, + "loss": 1.4694, + "step": 24780 + }, + { + "epoch": 935.8490566037735, + "grad_norm": 1.424625384350829, + "learning_rate": 1.4011734328126825e-05, + "loss": 1.4531, + "step": 24800 + }, + { + "epoch": 936.6037735849056, + "grad_norm": 1.921412092336305, + "learning_rate": 1.3963833048961103e-05, + "loss": 1.5003, + "step": 24820 + }, + { + "epoch": 937.3584905660377, + "grad_norm": 1.5289456190701718, + "learning_rate": 1.3916042130257145e-05, + "loss": 1.5177, + "step": 24840 + }, + { + "epoch": 938.1132075471698, + "grad_norm": 1.410017115369323, + "learning_rate": 1.3868361756841036e-05, + "loss": 1.4957, + "step": 24860 + }, + { + "epoch": 938.8679245283018, + "grad_norm": 1.3741594118478162, + "learning_rate": 1.3820792113111323e-05, + "loss": 1.4876, + "step": 24880 + }, + { + "epoch": 939.622641509434, + "grad_norm": 1.5111524219290895, + "learning_rate": 1.377333338303833e-05, + "loss": 1.4789, + "step": 24900 + }, + { + "epoch": 940.377358490566, + "grad_norm": 1.2690279082779223, + "learning_rate": 1.3725985750163418e-05, + "loss": 1.4851, + "step": 24920 + }, + { + "epoch": 941.1320754716982, + "grad_norm": 1.5760629816984877, + "learning_rate": 1.3678749397598337e-05, + "loss": 1.4993, + "step": 24940 + }, + { + "epoch": 941.8867924528302, + "grad_norm": 1.5719387109025893, + "learning_rate": 1.363162450802443e-05, + "loss": 1.4654, + "step": 24960 + }, + { + "epoch": 942.6415094339623, + "grad_norm": 1.51578687737706, + "learning_rate": 1.3584611263691974e-05, + "loss": 1.4985, + "step": 24980 + }, + { + "epoch": 943.3962264150944, + "grad_norm": 1.5864417766142165, + "learning_rate": 1.353770984641948e-05, + "loss": 1.4891, + "step": 25000 + }, + { + "epoch": 944.1509433962265, + "grad_norm": 1.5330683898736195, + "learning_rate": 1.3490920437592985e-05, + "loss": 1.4928, + "step": 25020 + }, + { + "epoch": 944.9056603773585, + "grad_norm": 1.8666313722767156, + "learning_rate": 1.344424321816535e-05, + "loss": 1.4558, + "step": 25040 + }, + { + "epoch": 945.6603773584906, + "grad_norm": 1.4103376741909914, + "learning_rate": 1.3397678368655534e-05, + "loss": 1.467, + "step": 25060 + }, + { + "epoch": 946.4150943396227, + "grad_norm": 1.6978974580611665, + "learning_rate": 1.3351226069147934e-05, + "loss": 1.4586, + "step": 25080 + }, + { + "epoch": 947.1698113207547, + "grad_norm": 1.3043741098462962, + "learning_rate": 1.3304886499291653e-05, + "loss": 1.4651, + "step": 25100 + }, + { + "epoch": 947.9245283018868, + "grad_norm": 1.5721530761043376, + "learning_rate": 1.3258659838299863e-05, + "loss": 1.4851, + "step": 25120 + }, + { + "epoch": 948.6792452830189, + "grad_norm": 2.445174125656233, + "learning_rate": 1.3212546264949038e-05, + "loss": 1.4861, + "step": 25140 + }, + { + "epoch": 949.433962264151, + "grad_norm": 3.0455557993861584, + "learning_rate": 1.3166545957578312e-05, + "loss": 1.4956, + "step": 25160 + }, + { + "epoch": 950.188679245283, + "grad_norm": 1.481231036001675, + "learning_rate": 1.3120659094088763e-05, + "loss": 1.4786, + "step": 25180 + }, + { + "epoch": 950.9433962264151, + "grad_norm": 1.6177001101633584, + "learning_rate": 1.3074885851942757e-05, + "loss": 1.4691, + "step": 25200 + }, + { + "epoch": 951.6981132075472, + "grad_norm": 1.7370265253795278, + "learning_rate": 1.3029226408163237e-05, + "loss": 1.456, + "step": 25220 + }, + { + "epoch": 952.4528301886793, + "grad_norm": 1.476098649785593, + "learning_rate": 1.2983680939333043e-05, + "loss": 1.457, + "step": 25240 + }, + { + "epoch": 953.2075471698113, + "grad_norm": 1.9700691780666086, + "learning_rate": 1.2938249621594219e-05, + "loss": 1.4916, + "step": 25260 + }, + { + "epoch": 953.9622641509434, + "grad_norm": 1.4124078828516038, + "learning_rate": 1.289293263064734e-05, + "loss": 1.4442, + "step": 25280 + }, + { + "epoch": 954.7169811320755, + "grad_norm": 1.609015057343637, + "learning_rate": 1.284773014175086e-05, + "loss": 1.4808, + "step": 25300 + }, + { + "epoch": 955.4716981132076, + "grad_norm": 1.545457288749583, + "learning_rate": 1.2802642329720385e-05, + "loss": 1.4388, + "step": 25320 + }, + { + "epoch": 956.2264150943396, + "grad_norm": 1.4137648487617847, + "learning_rate": 1.275766936892803e-05, + "loss": 1.4558, + "step": 25340 + }, + { + "epoch": 956.9811320754717, + "grad_norm": 1.7375121010804517, + "learning_rate": 1.2712811433301723e-05, + "loss": 1.4864, + "step": 25360 + }, + { + "epoch": 957.7358490566038, + "grad_norm": 2.170614678870875, + "learning_rate": 1.2668068696324572e-05, + "loss": 1.4668, + "step": 25380 + }, + { + "epoch": 958.4905660377359, + "grad_norm": 1.3921099231821001, + "learning_rate": 1.2623441331034153e-05, + "loss": 1.466, + "step": 25400 + }, + { + "epoch": 959.2452830188679, + "grad_norm": 1.763881906266782, + "learning_rate": 1.2578929510021851e-05, + "loss": 1.4556, + "step": 25420 + }, + { + "epoch": 960.0, + "grad_norm": 1.6251732366885816, + "learning_rate": 1.2534533405432192e-05, + "loss": 1.4831, + "step": 25440 + }, + { + "epoch": 960.7547169811321, + "grad_norm": 1.35568804382613, + "learning_rate": 1.2490253188962184e-05, + "loss": 1.4637, + "step": 25460 + }, + { + "epoch": 961.5094339622641, + "grad_norm": 1.5192686857357145, + "learning_rate": 1.2446089031860666e-05, + "loss": 1.5039, + "step": 25480 + }, + { + "epoch": 962.2641509433962, + "grad_norm": 1.645823339942095, + "learning_rate": 1.2402041104927622e-05, + "loss": 1.4643, + "step": 25500 + }, + { + "epoch": 963.0188679245283, + "grad_norm": 1.5266645922223165, + "learning_rate": 1.2358109578513502e-05, + "loss": 1.4609, + "step": 25520 + }, + { + "epoch": 963.7735849056604, + "grad_norm": 2.012096934939658, + "learning_rate": 1.2314294622518637e-05, + "loss": 1.4707, + "step": 25540 + }, + { + "epoch": 964.5283018867924, + "grad_norm": 1.6019652732905527, + "learning_rate": 1.227059640639251e-05, + "loss": 1.4624, + "step": 25560 + }, + { + "epoch": 965.2830188679245, + "grad_norm": 1.5459039987734797, + "learning_rate": 1.2227015099133119e-05, + "loss": 1.4462, + "step": 25580 + }, + { + "epoch": 966.0377358490566, + "grad_norm": 1.4581354369376407, + "learning_rate": 1.2183550869286346e-05, + "loss": 1.4602, + "step": 25600 + }, + { + "epoch": 966.7924528301887, + "grad_norm": 1.5627139982974774, + "learning_rate": 1.2140203884945257e-05, + "loss": 1.4558, + "step": 25620 + }, + { + "epoch": 967.5471698113207, + "grad_norm": 1.6163383081813927, + "learning_rate": 1.2096974313749544e-05, + "loss": 1.442, + "step": 25640 + }, + { + "epoch": 968.3018867924528, + "grad_norm": 1.4708485221948149, + "learning_rate": 1.2053862322884756e-05, + "loss": 1.4449, + "step": 25660 + }, + { + "epoch": 969.0566037735849, + "grad_norm": 1.457232110275896, + "learning_rate": 1.2010868079081735e-05, + "loss": 1.4714, + "step": 25680 + }, + { + "epoch": 969.811320754717, + "grad_norm": 2.130030633684405, + "learning_rate": 1.1967991748615972e-05, + "loss": 1.4672, + "step": 25700 + }, + { + "epoch": 970.566037735849, + "grad_norm": 1.6585416945015101, + "learning_rate": 1.1925233497306898e-05, + "loss": 1.4582, + "step": 25720 + }, + { + "epoch": 971.3207547169811, + "grad_norm": 1.696646559562477, + "learning_rate": 1.1882593490517333e-05, + "loss": 1.4616, + "step": 25740 + }, + { + "epoch": 972.0754716981132, + "grad_norm": 1.8347228047889477, + "learning_rate": 1.1840071893152767e-05, + "loss": 1.4412, + "step": 25760 + }, + { + "epoch": 972.8301886792453, + "grad_norm": 1.5105738469091443, + "learning_rate": 1.1797668869660753e-05, + "loss": 1.4476, + "step": 25780 + }, + { + "epoch": 973.5849056603773, + "grad_norm": 1.6402649798470197, + "learning_rate": 1.1755384584030287e-05, + "loss": 1.4458, + "step": 25800 + }, + { + "epoch": 974.3396226415094, + "grad_norm": 1.4580507747280478, + "learning_rate": 1.171321919979116e-05, + "loss": 1.4414, + "step": 25820 + }, + { + "epoch": 975.0943396226415, + "grad_norm": 1.8999226743757298, + "learning_rate": 1.1671172880013328e-05, + "loss": 1.4501, + "step": 25840 + }, + { + "epoch": 975.8490566037735, + "grad_norm": 1.3767670402035495, + "learning_rate": 1.1629245787306247e-05, + "loss": 1.4422, + "step": 25860 + }, + { + "epoch": 976.6037735849056, + "grad_norm": 1.3303378991562944, + "learning_rate": 1.158743808381832e-05, + "loss": 1.437, + "step": 25880 + }, + { + "epoch": 977.3584905660377, + "grad_norm": 1.5011235086965091, + "learning_rate": 1.1545749931236199e-05, + "loss": 1.4225, + "step": 25900 + }, + { + "epoch": 978.1132075471698, + "grad_norm": 1.7853875208460404, + "learning_rate": 1.1504181490784197e-05, + "loss": 1.4405, + "step": 25920 + }, + { + "epoch": 978.8679245283018, + "grad_norm": 1.4852022947554018, + "learning_rate": 1.1462732923223643e-05, + "loss": 1.4197, + "step": 25940 + }, + { + "epoch": 979.622641509434, + "grad_norm": 1.492057926353613, + "learning_rate": 1.1421404388852275e-05, + "loss": 1.4516, + "step": 25960 + }, + { + "epoch": 980.377358490566, + "grad_norm": 1.8767944270145316, + "learning_rate": 1.1380196047503614e-05, + "loss": 1.4613, + "step": 25980 + }, + { + "epoch": 981.1320754716982, + "grad_norm": 1.5723288438267475, + "learning_rate": 1.1339108058546365e-05, + "loss": 1.4636, + "step": 26000 + }, + { + "epoch": 981.8867924528302, + "grad_norm": 1.4572390965943247, + "learning_rate": 1.1298140580883752e-05, + "loss": 1.4291, + "step": 26020 + }, + { + "epoch": 982.6415094339623, + "grad_norm": 2.0340602707703566, + "learning_rate": 1.1257293772952971e-05, + "loss": 1.4342, + "step": 26040 + }, + { + "epoch": 983.3962264150944, + "grad_norm": 1.7563358001308935, + "learning_rate": 1.1216567792724513e-05, + "loss": 1.44, + "step": 26060 + }, + { + "epoch": 984.1509433962265, + "grad_norm": 1.7195863256249895, + "learning_rate": 1.1175962797701585e-05, + "loss": 1.473, + "step": 26080 + }, + { + "epoch": 984.9056603773585, + "grad_norm": 1.5325109929141458, + "learning_rate": 1.1135478944919515e-05, + "loss": 1.4537, + "step": 26100 + }, + { + "epoch": 985.6603773584906, + "grad_norm": 1.4246338183010563, + "learning_rate": 1.1095116390945116e-05, + "loss": 1.4576, + "step": 26120 + }, + { + "epoch": 986.4150943396227, + "grad_norm": 1.5264334254918077, + "learning_rate": 1.1054875291876081e-05, + "loss": 1.4355, + "step": 26140 + }, + { + "epoch": 987.1698113207547, + "grad_norm": 1.7871427472844674, + "learning_rate": 1.101475580334039e-05, + "loss": 1.4285, + "step": 26160 + }, + { + "epoch": 987.9245283018868, + "grad_norm": 1.628111810825388, + "learning_rate": 1.0974758080495742e-05, + "loss": 1.432, + "step": 26180 + }, + { + "epoch": 988.6792452830189, + "grad_norm": 1.6079918141380485, + "learning_rate": 1.0934882278028875e-05, + "loss": 1.473, + "step": 26200 + }, + { + "epoch": 989.433962264151, + "grad_norm": 1.9227955059143975, + "learning_rate": 1.0895128550155048e-05, + "loss": 1.4319, + "step": 26220 + }, + { + "epoch": 990.188679245283, + "grad_norm": 1.4777834491856459, + "learning_rate": 1.0855497050617383e-05, + "loss": 1.4715, + "step": 26240 + }, + { + "epoch": 990.9433962264151, + "grad_norm": 1.752347342407413, + "learning_rate": 1.0815987932686322e-05, + "loss": 1.4483, + "step": 26260 + }, + { + "epoch": 991.6981132075472, + "grad_norm": 1.7965242738400287, + "learning_rate": 1.0776601349158992e-05, + "loss": 1.445, + "step": 26280 + }, + { + "epoch": 992.4528301886793, + "grad_norm": 1.6880482866877031, + "learning_rate": 1.0737337452358643e-05, + "loss": 1.4289, + "step": 26300 + }, + { + "epoch": 993.2075471698113, + "grad_norm": 1.3587051959850933, + "learning_rate": 1.0698196394134027e-05, + "loss": 1.4248, + "step": 26320 + }, + { + "epoch": 993.9622641509434, + "grad_norm": 1.6893835419836905, + "learning_rate": 1.0659178325858868e-05, + "loss": 1.4593, + "step": 26340 + }, + { + "epoch": 994.7169811320755, + "grad_norm": 1.6372424305822535, + "learning_rate": 1.0620283398431196e-05, + "loss": 1.4248, + "step": 26360 + }, + { + "epoch": 995.4716981132076, + "grad_norm": 1.628959331603337, + "learning_rate": 1.0581511762272856e-05, + "loss": 1.459, + "step": 26380 + }, + { + "epoch": 996.2264150943396, + "grad_norm": 1.9899303146490552, + "learning_rate": 1.0542863567328837e-05, + "loss": 1.4608, + "step": 26400 + }, + { + "epoch": 996.9811320754717, + "grad_norm": 1.6980987241375505, + "learning_rate": 1.0504338963066745e-05, + "loss": 1.4489, + "step": 26420 + }, + { + "epoch": 997.7358490566038, + "grad_norm": 1.791483449843248, + "learning_rate": 1.0465938098476226e-05, + "loss": 1.4647, + "step": 26440 + }, + { + "epoch": 998.4905660377359, + "grad_norm": 1.3823874629634854, + "learning_rate": 1.0427661122068363e-05, + "loss": 1.431, + "step": 26460 + }, + { + "epoch": 999.2452830188679, + "grad_norm": 1.7547951381187532, + "learning_rate": 1.0389508181875114e-05, + "loss": 1.4374, + "step": 26480 + }, + { + "epoch": 1000.0, + "grad_norm": 1.6329317283212297, + "learning_rate": 1.035147942544874e-05, + "loss": 1.4436, + "step": 26500 + }, + { + "epoch": 1000.7547169811321, + "grad_norm": 1.482848334089, + "learning_rate": 1.0313574999861255e-05, + "loss": 1.4263, + "step": 26520 + }, + { + "epoch": 1001.5094339622641, + "grad_norm": 1.4085297987389735, + "learning_rate": 1.027579505170381e-05, + "loss": 1.4423, + "step": 26540 + }, + { + "epoch": 1002.2641509433962, + "grad_norm": 1.586157768854042, + "learning_rate": 1.0238139727086178e-05, + "loss": 1.4289, + "step": 26560 + }, + { + "epoch": 1003.0188679245283, + "grad_norm": 1.4910507620311724, + "learning_rate": 1.020060917163614e-05, + "loss": 1.4555, + "step": 26580 + }, + { + "epoch": 1003.7735849056604, + "grad_norm": 1.7298473240434828, + "learning_rate": 1.0163203530498955e-05, + "loss": 1.4176, + "step": 26600 + }, + { + "epoch": 1004.5283018867924, + "grad_norm": 1.9395741512745615, + "learning_rate": 1.0125922948336813e-05, + "loss": 1.4297, + "step": 26620 + }, + { + "epoch": 1005.2830188679245, + "grad_norm": 1.3752095871887702, + "learning_rate": 1.0088767569328215e-05, + "loss": 1.4224, + "step": 26640 + }, + { + "epoch": 1006.0377358490566, + "grad_norm": 1.6566420053219757, + "learning_rate": 1.0051737537167479e-05, + "loss": 1.4416, + "step": 26660 + }, + { + "epoch": 1006.7924528301887, + "grad_norm": 1.8401842062612699, + "learning_rate": 1.001483299506413e-05, + "loss": 1.4406, + "step": 26680 + }, + { + "epoch": 1007.5471698113207, + "grad_norm": 1.5895021822365676, + "learning_rate": 9.978054085742407e-06, + "loss": 1.4104, + "step": 26700 + }, + { + "epoch": 1008.3018867924528, + "grad_norm": 1.5495688189805843, + "learning_rate": 9.941400951440674e-06, + "loss": 1.4446, + "step": 26720 + }, + { + "epoch": 1009.0566037735849, + "grad_norm": 1.6376917222270109, + "learning_rate": 9.904873733910852e-06, + "loss": 1.4023, + "step": 26740 + }, + { + "epoch": 1009.811320754717, + "grad_norm": 1.7729521919831477, + "learning_rate": 9.868472574417906e-06, + "loss": 1.4409, + "step": 26760 + }, + { + "epoch": 1010.566037735849, + "grad_norm": 1.5909106157325896, + "learning_rate": 9.832197613739278e-06, + "loss": 1.4284, + "step": 26780 + }, + { + "epoch": 1011.3207547169811, + "grad_norm": 1.5416992698357255, + "learning_rate": 9.79604899216437e-06, + "loss": 1.4165, + "step": 26800 + }, + { + "epoch": 1012.0754716981132, + "grad_norm": 1.7245150906399498, + "learning_rate": 9.760026849493962e-06, + "loss": 1.4281, + "step": 26820 + }, + { + "epoch": 1012.8301886792453, + "grad_norm": 1.8518007110272525, + "learning_rate": 9.7241313250397e-06, + "loss": 1.4223, + "step": 26840 + }, + { + "epoch": 1013.5849056603773, + "grad_norm": 1.593106128312966, + "learning_rate": 9.688362557623527e-06, + "loss": 1.4377, + "step": 26860 + }, + { + "epoch": 1014.3396226415094, + "grad_norm": 1.6557177655883284, + "learning_rate": 9.6527206855772e-06, + "loss": 1.4394, + "step": 26880 + }, + { + "epoch": 1015.0943396226415, + "grad_norm": 1.5950355314495743, + "learning_rate": 9.617205846741719e-06, + "loss": 1.4506, + "step": 26900 + }, + { + "epoch": 1015.8490566037735, + "grad_norm": 1.7685274450403552, + "learning_rate": 9.58181817846677e-06, + "loss": 1.4484, + "step": 26920 + }, + { + "epoch": 1016.6037735849056, + "grad_norm": 1.4639040403309866, + "learning_rate": 9.54655781761023e-06, + "loss": 1.4043, + "step": 26940 + }, + { + "epoch": 1017.3584905660377, + "grad_norm": 1.6074583945207908, + "learning_rate": 9.511424900537656e-06, + "loss": 1.4197, + "step": 26960 + }, + { + "epoch": 1018.1132075471698, + "grad_norm": 1.5459146912367183, + "learning_rate": 9.476419563121698e-06, + "loss": 1.4232, + "step": 26980 + }, + { + "epoch": 1018.8679245283018, + "grad_norm": 1.6166722954994783, + "learning_rate": 9.441541940741613e-06, + "loss": 1.4407, + "step": 27000 + }, + { + "epoch": 1019.622641509434, + "grad_norm": 1.6533674302686083, + "learning_rate": 9.406792168282739e-06, + "loss": 1.4393, + "step": 27020 + }, + { + "epoch": 1020.377358490566, + "grad_norm": 2.1409264555789123, + "learning_rate": 9.37217038013597e-06, + "loss": 1.4507, + "step": 27040 + }, + { + "epoch": 1021.1320754716982, + "grad_norm": 1.9876202106584275, + "learning_rate": 9.337676710197243e-06, + "loss": 1.4486, + "step": 27060 + }, + { + "epoch": 1021.8867924528302, + "grad_norm": 1.6321392819191982, + "learning_rate": 9.303311291866996e-06, + "loss": 1.4337, + "step": 27080 + }, + { + "epoch": 1022.6415094339623, + "grad_norm": 1.5614664744291826, + "learning_rate": 9.269074258049671e-06, + "loss": 1.4245, + "step": 27100 + }, + { + "epoch": 1023.3962264150944, + "grad_norm": 1.775529049395487, + "learning_rate": 9.234965741153195e-06, + "loss": 1.4284, + "step": 27120 + }, + { + "epoch": 1024.1509433962265, + "grad_norm": 1.4430739083306536, + "learning_rate": 9.200985873088487e-06, + "loss": 1.4235, + "step": 27140 + }, + { + "epoch": 1024.9056603773586, + "grad_norm": 2.0811882500763255, + "learning_rate": 9.167134785268918e-06, + "loss": 1.402, + "step": 27160 + }, + { + "epoch": 1025.6603773584907, + "grad_norm": 1.5403915703954525, + "learning_rate": 9.133412608609811e-06, + "loss": 1.4302, + "step": 27180 + }, + { + "epoch": 1026.4150943396226, + "grad_norm": 1.9685065156678565, + "learning_rate": 9.099819473527936e-06, + "loss": 1.3969, + "step": 27200 + }, + { + "epoch": 1027.1698113207547, + "grad_norm": 1.5336587010545035, + "learning_rate": 9.066355509941036e-06, + "loss": 1.428, + "step": 27220 + }, + { + "epoch": 1027.9245283018868, + "grad_norm": 1.9045363331404057, + "learning_rate": 9.033020847267277e-06, + "loss": 1.4521, + "step": 27240 + }, + { + "epoch": 1028.6792452830189, + "grad_norm": 1.7010720746106325, + "learning_rate": 8.999815614424768e-06, + "loss": 1.4408, + "step": 27260 + }, + { + "epoch": 1029.433962264151, + "grad_norm": 1.6652770284797922, + "learning_rate": 8.966739939831065e-06, + "loss": 1.4275, + "step": 27280 + }, + { + "epoch": 1030.188679245283, + "grad_norm": 1.438920885601344, + "learning_rate": 8.933793951402666e-06, + "loss": 1.4363, + "step": 27300 + }, + { + "epoch": 1030.9433962264152, + "grad_norm": 1.523374273868093, + "learning_rate": 8.900977776554543e-06, + "loss": 1.4178, + "step": 27320 + }, + { + "epoch": 1031.698113207547, + "grad_norm": 1.9388166404138083, + "learning_rate": 8.868291542199601e-06, + "loss": 1.4339, + "step": 27340 + }, + { + "epoch": 1032.4528301886792, + "grad_norm": 1.910046684059762, + "learning_rate": 8.835735374748235e-06, + "loss": 1.407, + "step": 27360 + }, + { + "epoch": 1033.2075471698113, + "grad_norm": 1.5548634820286755, + "learning_rate": 8.803309400107802e-06, + "loss": 1.4183, + "step": 27380 + }, + { + "epoch": 1033.9622641509434, + "grad_norm": 1.5932417218331991, + "learning_rate": 8.771013743682171e-06, + "loss": 1.4447, + "step": 27400 + }, + { + "epoch": 1034.7169811320755, + "grad_norm": 1.4796581852592556, + "learning_rate": 8.738848530371221e-06, + "loss": 1.3946, + "step": 27420 + }, + { + "epoch": 1035.4716981132076, + "grad_norm": 1.6106803868616077, + "learning_rate": 8.706813884570337e-06, + "loss": 1.4152, + "step": 27440 + }, + { + "epoch": 1036.2264150943397, + "grad_norm": 1.5383725584269896, + "learning_rate": 8.674909930169968e-06, + "loss": 1.4344, + "step": 27460 + }, + { + "epoch": 1036.9811320754718, + "grad_norm": 1.6971458233324348, + "learning_rate": 8.643136790555101e-06, + "loss": 1.42, + "step": 27480 + }, + { + "epoch": 1037.7358490566037, + "grad_norm": 1.7975384013574476, + "learning_rate": 8.61149458860486e-06, + "loss": 1.4456, + "step": 27500 + }, + { + "epoch": 1038.4905660377358, + "grad_norm": 1.5540181334521903, + "learning_rate": 8.579983446691931e-06, + "loss": 1.3976, + "step": 27520 + }, + { + "epoch": 1039.245283018868, + "grad_norm": 1.7107813027346386, + "learning_rate": 8.548603486682165e-06, + "loss": 1.4119, + "step": 27540 + }, + { + "epoch": 1040.0, + "grad_norm": 1.7225563012589893, + "learning_rate": 8.517354829934086e-06, + "loss": 1.4347, + "step": 27560 + }, + { + "epoch": 1040.754716981132, + "grad_norm": 1.6396983385388997, + "learning_rate": 8.486237597298396e-06, + "loss": 1.4076, + "step": 27580 + }, + { + "epoch": 1041.5094339622642, + "grad_norm": 1.59607993020723, + "learning_rate": 8.455251909117562e-06, + "loss": 1.391, + "step": 27600 + }, + { + "epoch": 1042.2641509433963, + "grad_norm": 1.6787714792885464, + "learning_rate": 8.424397885225284e-06, + "loss": 1.4319, + "step": 27620 + }, + { + "epoch": 1043.0188679245282, + "grad_norm": 1.514103336557697, + "learning_rate": 8.39367564494608e-06, + "loss": 1.4282, + "step": 27640 + }, + { + "epoch": 1043.7735849056603, + "grad_norm": 1.6827281624065857, + "learning_rate": 8.3630853070948e-06, + "loss": 1.4268, + "step": 27660 + }, + { + "epoch": 1044.5283018867924, + "grad_norm": 1.5242384493420091, + "learning_rate": 8.332626989976201e-06, + "loss": 1.394, + "step": 27680 + }, + { + "epoch": 1045.2830188679245, + "grad_norm": 1.5477899241579378, + "learning_rate": 8.302300811384443e-06, + "loss": 1.4188, + "step": 27700 + }, + { + "epoch": 1046.0377358490566, + "grad_norm": 1.7533265453937938, + "learning_rate": 8.272106888602644e-06, + "loss": 1.4147, + "step": 27720 + }, + { + "epoch": 1046.7924528301887, + "grad_norm": 1.7810905836721207, + "learning_rate": 8.242045338402464e-06, + "loss": 1.4249, + "step": 27740 + }, + { + "epoch": 1047.5471698113208, + "grad_norm": 1.6994451629715164, + "learning_rate": 8.212116277043624e-06, + "loss": 1.4087, + "step": 27760 + }, + { + "epoch": 1048.301886792453, + "grad_norm": 1.5273771258038336, + "learning_rate": 8.18231982027344e-06, + "loss": 1.4105, + "step": 27780 + }, + { + "epoch": 1049.0566037735848, + "grad_norm": 1.7986470388936215, + "learning_rate": 8.15265608332641e-06, + "loss": 1.417, + "step": 27800 + }, + { + "epoch": 1049.811320754717, + "grad_norm": 3.7362962798847605, + "learning_rate": 8.123125180923732e-06, + "loss": 1.4428, + "step": 27820 + }, + { + "epoch": 1050.566037735849, + "grad_norm": 1.4871345729412693, + "learning_rate": 8.093727227272918e-06, + "loss": 1.3913, + "step": 27840 + }, + { + "epoch": 1051.3207547169811, + "grad_norm": 1.6862935331038202, + "learning_rate": 8.064462336067288e-06, + "loss": 1.4099, + "step": 27860 + }, + { + "epoch": 1052.0754716981132, + "grad_norm": 1.5729155867984972, + "learning_rate": 8.03533062048555e-06, + "loss": 1.3896, + "step": 27880 + }, + { + "epoch": 1052.8301886792453, + "grad_norm": 1.7312033654611378, + "learning_rate": 8.006332193191406e-06, + "loss": 1.4183, + "step": 27900 + }, + { + "epoch": 1053.5849056603774, + "grad_norm": 1.737310060702965, + "learning_rate": 7.977467166333041e-06, + "loss": 1.4098, + "step": 27920 + }, + { + "epoch": 1054.3396226415093, + "grad_norm": 1.787345801838152, + "learning_rate": 7.948735651542762e-06, + "loss": 1.4472, + "step": 27940 + }, + { + "epoch": 1055.0943396226414, + "grad_norm": 1.6643759736424013, + "learning_rate": 7.920137759936503e-06, + "loss": 1.4248, + "step": 27960 + }, + { + "epoch": 1055.8490566037735, + "grad_norm": 1.665184448890738, + "learning_rate": 7.891673602113444e-06, + "loss": 1.4184, + "step": 27980 + }, + { + "epoch": 1056.6037735849056, + "grad_norm": 1.4651905410431068, + "learning_rate": 7.863343288155553e-06, + "loss": 1.4117, + "step": 28000 + }, + { + "epoch": 1057.3584905660377, + "grad_norm": 1.761583496091816, + "learning_rate": 7.835146927627195e-06, + "loss": 1.4173, + "step": 28020 + }, + { + "epoch": 1058.1132075471698, + "grad_norm": 1.4468036902445778, + "learning_rate": 7.807084629574648e-06, + "loss": 1.3899, + "step": 28040 + }, + { + "epoch": 1058.867924528302, + "grad_norm": 1.9317915574764288, + "learning_rate": 7.779156502525752e-06, + "loss": 1.4283, + "step": 28060 + }, + { + "epoch": 1059.622641509434, + "grad_norm": 1.6586645034969292, + "learning_rate": 7.751362654489442e-06, + "loss": 1.3729, + "step": 28080 + }, + { + "epoch": 1060.377358490566, + "grad_norm": 1.54736903517111, + "learning_rate": 7.72370319295533e-06, + "loss": 1.4323, + "step": 28100 + }, + { + "epoch": 1061.132075471698, + "grad_norm": 1.7410908156190221, + "learning_rate": 7.696178224893333e-06, + "loss": 1.4446, + "step": 28120 + }, + { + "epoch": 1061.8867924528302, + "grad_norm": 1.5846972848377703, + "learning_rate": 7.668787856753206e-06, + "loss": 1.4069, + "step": 28140 + }, + { + "epoch": 1062.6415094339623, + "grad_norm": 2.0032825052950005, + "learning_rate": 7.641532194464159e-06, + "loss": 1.4091, + "step": 28160 + }, + { + "epoch": 1063.3962264150944, + "grad_norm": 1.5526416600245057, + "learning_rate": 7.6144113434344445e-06, + "loss": 1.3988, + "step": 28180 + }, + { + "epoch": 1064.1509433962265, + "grad_norm": 1.6399869572854062, + "learning_rate": 7.587425408550953e-06, + "loss": 1.4317, + "step": 28200 + }, + { + "epoch": 1064.9056603773586, + "grad_norm": 2.218545819761043, + "learning_rate": 7.560574494178785e-06, + "loss": 1.4166, + "step": 28220 + }, + { + "epoch": 1065.6603773584907, + "grad_norm": 1.610893838079929, + "learning_rate": 7.5338587041608855e-06, + "loss": 1.4034, + "step": 28240 + }, + { + "epoch": 1066.4150943396226, + "grad_norm": 1.901849515787354, + "learning_rate": 7.507278141817603e-06, + "loss": 1.4082, + "step": 28260 + }, + { + "epoch": 1067.1698113207547, + "grad_norm": 1.9915752693535391, + "learning_rate": 7.4808329099463165e-06, + "loss": 1.4202, + "step": 28280 + }, + { + "epoch": 1067.9245283018868, + "grad_norm": 2.337231756702343, + "learning_rate": 7.454523110821034e-06, + "loss": 1.4033, + "step": 28300 + }, + { + "epoch": 1068.6792452830189, + "grad_norm": 1.4499700621594815, + "learning_rate": 7.428348846191982e-06, + "loss": 1.4106, + "step": 28320 + }, + { + "epoch": 1069.433962264151, + "grad_norm": 1.7981102056016145, + "learning_rate": 7.402310217285226e-06, + "loss": 1.4061, + "step": 28340 + }, + { + "epoch": 1070.188679245283, + "grad_norm": 1.7129433355903898, + "learning_rate": 7.376407324802275e-06, + "loss": 1.4019, + "step": 28360 + }, + { + "epoch": 1070.9433962264152, + "grad_norm": 1.5382026111028457, + "learning_rate": 7.350640268919691e-06, + "loss": 1.4197, + "step": 28380 + }, + { + "epoch": 1071.698113207547, + "grad_norm": 1.7225324354326523, + "learning_rate": 7.325009149288721e-06, + "loss": 1.4061, + "step": 28400 + }, + { + "epoch": 1072.4528301886792, + "grad_norm": 1.9701222408661871, + "learning_rate": 7.299514065034864e-06, + "loss": 1.399, + "step": 28420 + }, + { + "epoch": 1073.2075471698113, + "grad_norm": 2.560013262107365, + "learning_rate": 7.2741551147575365e-06, + "loss": 1.4011, + "step": 28440 + }, + { + "epoch": 1073.9622641509434, + "grad_norm": 1.7468598350718882, + "learning_rate": 7.248932396529666e-06, + "loss": 1.3906, + "step": 28460 + }, + { + "epoch": 1074.7169811320755, + "grad_norm": 1.5217037013529344, + "learning_rate": 7.223846007897321e-06, + "loss": 1.3824, + "step": 28480 + }, + { + "epoch": 1075.4716981132076, + "grad_norm": 1.9246360758156291, + "learning_rate": 7.198896045879323e-06, + "loss": 1.401, + "step": 28500 + }, + { + "epoch": 1076.2264150943397, + "grad_norm": 1.6887933139540061, + "learning_rate": 7.174082606966883e-06, + "loss": 1.4025, + "step": 28520 + }, + { + "epoch": 1076.9811320754718, + "grad_norm": 1.6294766788073725, + "learning_rate": 7.149405787123236e-06, + "loss": 1.3986, + "step": 28540 + }, + { + "epoch": 1077.7358490566037, + "grad_norm": 1.5618807274404587, + "learning_rate": 7.124865681783234e-06, + "loss": 1.4005, + "step": 28560 + }, + { + "epoch": 1078.4905660377358, + "grad_norm": 1.6678211596916697, + "learning_rate": 7.100462385853021e-06, + "loss": 1.4071, + "step": 28580 + }, + { + "epoch": 1079.245283018868, + "grad_norm": 1.9223978868928677, + "learning_rate": 7.07619599370964e-06, + "loss": 1.4135, + "step": 28600 + }, + { + "epoch": 1080.0, + "grad_norm": 1.6632265815235145, + "learning_rate": 7.052066599200659e-06, + "loss": 1.3882, + "step": 28620 + }, + { + "epoch": 1080.754716981132, + "grad_norm": 1.6022030717394165, + "learning_rate": 7.028074295643851e-06, + "loss": 1.3972, + "step": 28640 + }, + { + "epoch": 1081.5094339622642, + "grad_norm": 1.4991746539828543, + "learning_rate": 7.004219175826785e-06, + "loss": 1.382, + "step": 28660 + }, + { + "epoch": 1082.2641509433963, + "grad_norm": 1.6838520383575963, + "learning_rate": 6.9805013320064956e-06, + "loss": 1.4146, + "step": 28680 + }, + { + "epoch": 1083.0188679245282, + "grad_norm": 1.8350778781710608, + "learning_rate": 6.9569208559091e-06, + "loss": 1.4138, + "step": 28700 + }, + { + "epoch": 1083.7735849056603, + "grad_norm": 1.5249940477637465, + "learning_rate": 6.9334778387294835e-06, + "loss": 1.403, + "step": 28720 + }, + { + "epoch": 1084.5283018867924, + "grad_norm": 1.4543697117371763, + "learning_rate": 6.910172371130925e-06, + "loss": 1.4115, + "step": 28740 + }, + { + "epoch": 1085.2830188679245, + "grad_norm": 1.8878771205671918, + "learning_rate": 6.8870045432447285e-06, + "loss": 1.3783, + "step": 28760 + }, + { + "epoch": 1086.0377358490566, + "grad_norm": 1.6650946199070653, + "learning_rate": 6.8639744446698945e-06, + "loss": 1.4065, + "step": 28780 + }, + { + "epoch": 1086.7924528301887, + "grad_norm": 1.9063799347508024, + "learning_rate": 6.84108216447278e-06, + "loss": 1.3896, + "step": 28800 + }, + { + "epoch": 1087.5471698113208, + "grad_norm": 1.7745103676453513, + "learning_rate": 6.818327791186747e-06, + "loss": 1.4068, + "step": 28820 + }, + { + "epoch": 1088.301886792453, + "grad_norm": 1.6208415487366228, + "learning_rate": 6.795711412811805e-06, + "loss": 1.3827, + "step": 28840 + }, + { + "epoch": 1089.0566037735848, + "grad_norm": 1.4568669649899233, + "learning_rate": 6.773233116814289e-06, + "loss": 1.3918, + "step": 28860 + }, + { + "epoch": 1089.811320754717, + "grad_norm": 1.861515176168054, + "learning_rate": 6.750892990126514e-06, + "loss": 1.3901, + "step": 28880 + }, + { + "epoch": 1090.566037735849, + "grad_norm": 1.7283660067362911, + "learning_rate": 6.728691119146446e-06, + "loss": 1.4157, + "step": 28900 + }, + { + "epoch": 1091.3207547169811, + "grad_norm": 1.679598340558233, + "learning_rate": 6.706627589737369e-06, + "loss": 1.3938, + "step": 28920 + }, + { + "epoch": 1092.0754716981132, + "grad_norm": 1.5691857730547452, + "learning_rate": 6.6847024872275215e-06, + "loss": 1.4176, + "step": 28940 + }, + { + "epoch": 1092.8301886792453, + "grad_norm": 1.5537251935711112, + "learning_rate": 6.66291589640982e-06, + "loss": 1.3967, + "step": 28960 + }, + { + "epoch": 1093.5849056603774, + "grad_norm": 1.8881979410475171, + "learning_rate": 6.641267901541472e-06, + "loss": 1.418, + "step": 28980 + }, + { + "epoch": 1094.3396226415093, + "grad_norm": 1.613241830342873, + "learning_rate": 6.619758586343714e-06, + "loss": 1.3901, + "step": 29000 + }, + { + "epoch": 1095.0943396226414, + "grad_norm": 1.5946632443607534, + "learning_rate": 6.598388034001433e-06, + "loss": 1.3634, + "step": 29020 + }, + { + "epoch": 1095.8490566037735, + "grad_norm": 1.8962995366661943, + "learning_rate": 6.577156327162867e-06, + "loss": 1.392, + "step": 29040 + }, + { + "epoch": 1096.6037735849056, + "grad_norm": 1.629681556076702, + "learning_rate": 6.55606354793928e-06, + "loss": 1.4078, + "step": 29060 + }, + { + "epoch": 1097.3584905660377, + "grad_norm": 1.6952819453222434, + "learning_rate": 6.535109777904677e-06, + "loss": 1.4017, + "step": 29080 + }, + { + "epoch": 1098.1132075471698, + "grad_norm": 2.5813616029432267, + "learning_rate": 6.514295098095432e-06, + "loss": 1.3986, + "step": 29100 + }, + { + "epoch": 1098.867924528302, + "grad_norm": 1.5192224713062508, + "learning_rate": 6.493619589010008e-06, + "loss": 1.3995, + "step": 29120 + }, + { + "epoch": 1099.622641509434, + "grad_norm": 1.5723195273483208, + "learning_rate": 6.4730833306086425e-06, + "loss": 1.3804, + "step": 29140 + }, + { + "epoch": 1100.377358490566, + "grad_norm": 1.6397338659549336, + "learning_rate": 6.452686402313042e-06, + "loss": 1.386, + "step": 29160 + }, + { + "epoch": 1101.132075471698, + "grad_norm": 1.5791257173150743, + "learning_rate": 6.43242888300607e-06, + "loss": 1.3847, + "step": 29180 + }, + { + "epoch": 1101.8867924528302, + "grad_norm": 1.5559887095506482, + "learning_rate": 6.412310851031428e-06, + "loss": 1.393, + "step": 29200 + }, + { + "epoch": 1102.6415094339623, + "grad_norm": 1.6663466000474887, + "learning_rate": 6.392332384193371e-06, + "loss": 1.3896, + "step": 29220 + }, + { + "epoch": 1103.3962264150944, + "grad_norm": 1.9956674599720932, + "learning_rate": 6.372493559756415e-06, + "loss": 1.378, + "step": 29240 + }, + { + "epoch": 1104.1509433962265, + "grad_norm": 1.787105155690102, + "learning_rate": 6.352794454445007e-06, + "loss": 1.3879, + "step": 29260 + }, + { + "epoch": 1104.9056603773586, + "grad_norm": 1.561482889041861, + "learning_rate": 6.333235144443262e-06, + "loss": 1.402, + "step": 29280 + }, + { + "epoch": 1105.6603773584907, + "grad_norm": 1.8736117457797759, + "learning_rate": 6.31381570539463e-06, + "loss": 1.3879, + "step": 29300 + }, + { + "epoch": 1106.4150943396226, + "grad_norm": 1.4692581652153442, + "learning_rate": 6.294536212401641e-06, + "loss": 1.3914, + "step": 29320 + }, + { + "epoch": 1107.1698113207547, + "grad_norm": 1.4908544439114542, + "learning_rate": 6.275396740025605e-06, + "loss": 1.4028, + "step": 29340 + }, + { + "epoch": 1107.9245283018868, + "grad_norm": 1.488666750171173, + "learning_rate": 6.256397362286306e-06, + "loss": 1.3799, + "step": 29360 + }, + { + "epoch": 1108.6792452830189, + "grad_norm": 1.517431762228245, + "learning_rate": 6.237538152661723e-06, + "loss": 1.3765, + "step": 29380 + }, + { + "epoch": 1109.433962264151, + "grad_norm": 2.2381909450089803, + "learning_rate": 6.218819184087767e-06, + "loss": 1.4079, + "step": 29400 + }, + { + "epoch": 1110.188679245283, + "grad_norm": 1.7858504458920295, + "learning_rate": 6.200240528957965e-06, + "loss": 1.3554, + "step": 29420 + }, + { + "epoch": 1110.9433962264152, + "grad_norm": 1.7350524849254911, + "learning_rate": 6.181802259123219e-06, + "loss": 1.3967, + "step": 29440 + }, + { + "epoch": 1111.698113207547, + "grad_norm": 1.702971597589678, + "learning_rate": 6.163504445891484e-06, + "loss": 1.3671, + "step": 29460 + }, + { + "epoch": 1112.4528301886792, + "grad_norm": 1.7712134929173684, + "learning_rate": 6.145347160027524e-06, + "loss": 1.3829, + "step": 29480 + }, + { + "epoch": 1113.2075471698113, + "grad_norm": 1.4073555395505457, + "learning_rate": 6.1273304717526284e-06, + "loss": 1.4108, + "step": 29500 + }, + { + "epoch": 1113.9622641509434, + "grad_norm": 1.6527537265171588, + "learning_rate": 6.10945445074435e-06, + "loss": 1.4068, + "step": 29520 + }, + { + "epoch": 1114.7169811320755, + "grad_norm": 1.6866987009556351, + "learning_rate": 6.091719166136209e-06, + "loss": 1.3793, + "step": 29540 + }, + { + "epoch": 1115.4716981132076, + "grad_norm": 1.7073159356044332, + "learning_rate": 6.074124686517448e-06, + "loss": 1.3826, + "step": 29560 + }, + { + "epoch": 1116.2264150943397, + "grad_norm": 1.8230785653176147, + "learning_rate": 6.056671079932781e-06, + "loss": 1.4153, + "step": 29580 + }, + { + "epoch": 1116.9811320754718, + "grad_norm": 1.6857598634250675, + "learning_rate": 6.0393584138820814e-06, + "loss": 1.3887, + "step": 29600 + }, + { + "epoch": 1117.7358490566037, + "grad_norm": 1.5568678463492682, + "learning_rate": 6.022186755320181e-06, + "loss": 1.3901, + "step": 29620 + }, + { + "epoch": 1118.4905660377358, + "grad_norm": 1.8571545157336313, + "learning_rate": 6.0051561706565545e-06, + "loss": 1.4013, + "step": 29640 + }, + { + "epoch": 1119.245283018868, + "grad_norm": 2.413996452708785, + "learning_rate": 5.988266725755103e-06, + "loss": 1.3613, + "step": 29660 + }, + { + "epoch": 1120.0, + "grad_norm": 1.687989711452293, + "learning_rate": 5.9715184859338745e-06, + "loss": 1.4031, + "step": 29680 + }, + { + "epoch": 1120.754716981132, + "grad_norm": 1.7351377623187432, + "learning_rate": 5.9549115159648416e-06, + "loss": 1.3949, + "step": 29700 + }, + { + "epoch": 1121.5094339622642, + "grad_norm": 1.6317556572084198, + "learning_rate": 5.9384458800736175e-06, + "loss": 1.3769, + "step": 29720 + }, + { + "epoch": 1122.2641509433963, + "grad_norm": 1.5268456230996348, + "learning_rate": 5.922121641939213e-06, + "loss": 1.3816, + "step": 29740 + }, + { + "epoch": 1123.0188679245282, + "grad_norm": 1.712558259908726, + "learning_rate": 5.905938864693819e-06, + "loss": 1.3798, + "step": 29760 + }, + { + "epoch": 1123.7735849056603, + "grad_norm": 2.381990895927805, + "learning_rate": 5.889897610922528e-06, + "loss": 1.3607, + "step": 29780 + }, + { + "epoch": 1124.5283018867924, + "grad_norm": 1.880675021280631, + "learning_rate": 5.873997942663118e-06, + "loss": 1.3886, + "step": 29800 + }, + { + "epoch": 1125.2830188679245, + "grad_norm": 1.7160648060328811, + "learning_rate": 5.858239921405781e-06, + "loss": 1.4049, + "step": 29820 + }, + { + "epoch": 1126.0377358490566, + "grad_norm": 2.0253315053102656, + "learning_rate": 5.842623608092928e-06, + "loss": 1.393, + "step": 29840 + }, + { + "epoch": 1126.7924528301887, + "grad_norm": 1.7870648066969081, + "learning_rate": 5.8271490631189085e-06, + "loss": 1.3654, + "step": 29860 + }, + { + "epoch": 1127.5471698113208, + "grad_norm": 2.0620223544323393, + "learning_rate": 5.811816346329819e-06, + "loss": 1.3776, + "step": 29880 + }, + { + "epoch": 1128.301886792453, + "grad_norm": 1.892915815700359, + "learning_rate": 5.796625517023236e-06, + "loss": 1.377, + "step": 29900 + }, + { + "epoch": 1129.0566037735848, + "grad_norm": 1.6134589423454577, + "learning_rate": 5.781576633948012e-06, + "loss": 1.3958, + "step": 29920 + }, + { + "epoch": 1129.811320754717, + "grad_norm": 1.8880173462636753, + "learning_rate": 5.766669755304027e-06, + "loss": 1.3707, + "step": 29940 + }, + { + "epoch": 1130.566037735849, + "grad_norm": 1.899687605902805, + "learning_rate": 5.75190493874199e-06, + "loss": 1.3648, + "step": 29960 + }, + { + "epoch": 1131.3207547169811, + "grad_norm": 2.0484945041635143, + "learning_rate": 5.737282241363189e-06, + "loss": 1.3689, + "step": 29980 + }, + { + "epoch": 1132.0754716981132, + "grad_norm": 1.676321084433534, + "learning_rate": 5.72280171971928e-06, + "loss": 1.4161, + "step": 30000 + }, + { + "epoch": 1132.8301886792453, + "grad_norm": 1.7718376566707665, + "learning_rate": 5.708463429812077e-06, + "loss": 1.3427, + "step": 30020 + }, + { + "epoch": 1133.5849056603774, + "grad_norm": 1.9751240318001524, + "learning_rate": 5.694267427093333e-06, + "loss": 1.3674, + "step": 30040 + }, + { + "epoch": 1134.3396226415093, + "grad_norm": 2.3259508666245754, + "learning_rate": 5.680213766464505e-06, + "loss": 1.3815, + "step": 30060 + }, + { + "epoch": 1135.0943396226414, + "grad_norm": 1.7499567507331477, + "learning_rate": 5.6663025022765734e-06, + "loss": 1.3898, + "step": 30080 + }, + { + "epoch": 1135.8490566037735, + "grad_norm": 1.7700410283415744, + "learning_rate": 5.652533688329809e-06, + "loss": 1.3801, + "step": 30100 + }, + { + "epoch": 1136.6037735849056, + "grad_norm": 2.4028113618062843, + "learning_rate": 5.638907377873572e-06, + "loss": 1.4025, + "step": 30120 + }, + { + "epoch": 1137.3584905660377, + "grad_norm": 1.7119758682153656, + "learning_rate": 5.625423623606109e-06, + "loss": 1.3933, + "step": 30140 + }, + { + "epoch": 1138.1132075471698, + "grad_norm": 1.6434771622606816, + "learning_rate": 5.612082477674341e-06, + "loss": 1.3723, + "step": 30160 + }, + { + "epoch": 1138.867924528302, + "grad_norm": 1.6260264586830788, + "learning_rate": 5.598883991673678e-06, + "loss": 1.4009, + "step": 30180 + }, + { + "epoch": 1139.622641509434, + "grad_norm": 3.7353731641696166, + "learning_rate": 5.58582821664779e-06, + "loss": 1.3621, + "step": 30200 + }, + { + "epoch": 1140.377358490566, + "grad_norm": 1.567966811159742, + "learning_rate": 5.572915203088453e-06, + "loss": 1.3679, + "step": 30220 + }, + { + "epoch": 1141.132075471698, + "grad_norm": 1.7536276327044822, + "learning_rate": 5.560145000935302e-06, + "loss": 1.3899, + "step": 30240 + }, + { + "epoch": 1141.8867924528302, + "grad_norm": 1.6246811713037859, + "learning_rate": 5.547517659575683e-06, + "loss": 1.3754, + "step": 30260 + }, + { + "epoch": 1142.6415094339623, + "grad_norm": 1.5935354859602073, + "learning_rate": 5.535033227844446e-06, + "loss": 1.3783, + "step": 30280 + }, + { + "epoch": 1143.3962264150944, + "grad_norm": 1.5837499746804282, + "learning_rate": 5.522691754023736e-06, + "loss": 1.3664, + "step": 30300 + }, + { + "epoch": 1144.1509433962265, + "grad_norm": 1.5561292753074283, + "learning_rate": 5.5104932858428386e-06, + "loss": 1.3934, + "step": 30320 + }, + { + "epoch": 1144.9056603773586, + "grad_norm": 1.5051486824601223, + "learning_rate": 5.498437870477979e-06, + "loss": 1.3569, + "step": 30340 + }, + { + "epoch": 1145.6603773584907, + "grad_norm": 1.5724530317281036, + "learning_rate": 5.48652555455214e-06, + "loss": 1.384, + "step": 30360 + }, + { + "epoch": 1146.4150943396226, + "grad_norm": 1.7499070562961392, + "learning_rate": 5.474756384134872e-06, + "loss": 1.3661, + "step": 30380 + }, + { + "epoch": 1147.1698113207547, + "grad_norm": 1.682172454392295, + "learning_rate": 5.46313040474215e-06, + "loss": 1.3668, + "step": 30400 + }, + { + "epoch": 1147.9245283018868, + "grad_norm": 1.6400451026874565, + "learning_rate": 5.4516476613361565e-06, + "loss": 1.3605, + "step": 30420 + }, + { + "epoch": 1148.6792452830189, + "grad_norm": 1.952384343786011, + "learning_rate": 5.440308198325125e-06, + "loss": 1.388, + "step": 30440 + }, + { + "epoch": 1149.433962264151, + "grad_norm": 2.052044266530817, + "learning_rate": 5.4291120595631796e-06, + "loss": 1.3699, + "step": 30460 + }, + { + "epoch": 1150.188679245283, + "grad_norm": 1.868354121694302, + "learning_rate": 5.4180592883501325e-06, + "loss": 1.4099, + "step": 30480 + }, + { + "epoch": 1150.9433962264152, + "grad_norm": 1.650613691746538, + "learning_rate": 5.40714992743136e-06, + "loss": 1.3788, + "step": 30500 + }, + { + "epoch": 1151.698113207547, + "grad_norm": 1.48074352750423, + "learning_rate": 5.3963840189976066e-06, + "loss": 1.3587, + "step": 30520 + }, + { + "epoch": 1152.4528301886792, + "grad_norm": 1.914894176993607, + "learning_rate": 5.385761604684826e-06, + "loss": 1.3622, + "step": 30540 + }, + { + "epoch": 1153.2075471698113, + "grad_norm": 1.736154691724524, + "learning_rate": 5.375282725574028e-06, + "loss": 1.3451, + "step": 30560 + }, + { + "epoch": 1153.9622641509434, + "grad_norm": 1.7175923216328703, + "learning_rate": 5.364947422191111e-06, + "loss": 1.385, + "step": 30580 + }, + { + "epoch": 1154.7169811320755, + "grad_norm": 1.8230347081955776, + "learning_rate": 5.3547557345067295e-06, + "loss": 1.3797, + "step": 30600 + }, + { + "epoch": 1155.4716981132076, + "grad_norm": 1.4897355923840079, + "learning_rate": 5.344707701936093e-06, + "loss": 1.3812, + "step": 30620 + }, + { + "epoch": 1156.2264150943397, + "grad_norm": 1.7795720356372806, + "learning_rate": 5.334803363338855e-06, + "loss": 1.3508, + "step": 30640 + }, + { + "epoch": 1156.9811320754718, + "grad_norm": 2.461699887903762, + "learning_rate": 5.325042757018952e-06, + "loss": 1.3904, + "step": 30660 + }, + { + "epoch": 1157.7358490566037, + "grad_norm": 1.7684288169829847, + "learning_rate": 5.315425920724443e-06, + "loss": 1.362, + "step": 30680 + }, + { + "epoch": 1158.4905660377358, + "grad_norm": 1.9326301215722892, + "learning_rate": 5.3059528916473754e-06, + "loss": 1.3764, + "step": 30700 + }, + { + "epoch": 1159.245283018868, + "grad_norm": 1.7547993585411785, + "learning_rate": 5.296623706423637e-06, + "loss": 1.3624, + "step": 30720 + }, + { + "epoch": 1160.0, + "grad_norm": 2.2647989876543897, + "learning_rate": 5.2874384011328235e-06, + "loss": 1.3804, + "step": 30740 + }, + { + "epoch": 1160.754716981132, + "grad_norm": 1.897412746168143, + "learning_rate": 5.278397011298081e-06, + "loss": 1.3882, + "step": 30760 + }, + { + "epoch": 1161.5094339622642, + "grad_norm": 1.5286725772277845, + "learning_rate": 5.269499571885985e-06, + "loss": 1.381, + "step": 30780 + }, + { + "epoch": 1162.2641509433963, + "grad_norm": 1.6848292059915215, + "learning_rate": 5.260746117306394e-06, + "loss": 1.361, + "step": 30800 + }, + { + "epoch": 1163.0188679245282, + "grad_norm": 1.4576957104143031, + "learning_rate": 5.25213668141232e-06, + "loss": 1.3773, + "step": 30820 + }, + { + "epoch": 1163.7735849056603, + "grad_norm": 1.6655981961615232, + "learning_rate": 5.243671297499806e-06, + "loss": 1.3403, + "step": 30840 + }, + { + "epoch": 1164.5283018867924, + "grad_norm": 6.016182274377044, + "learning_rate": 5.235349998307786e-06, + "loss": 1.3994, + "step": 30860 + }, + { + "epoch": 1165.2830188679245, + "grad_norm": 1.7659588641922745, + "learning_rate": 5.227172816017956e-06, + "loss": 1.3507, + "step": 30880 + }, + { + "epoch": 1166.0377358490566, + "grad_norm": 2.0037468459561962, + "learning_rate": 5.219139782254665e-06, + "loss": 1.3703, + "step": 30900 + }, + { + "epoch": 1166.7924528301887, + "grad_norm": 2.15024644673786, + "learning_rate": 5.211250928084786e-06, + "loss": 1.3473, + "step": 30920 + }, + { + "epoch": 1167.5471698113208, + "grad_norm": 2.5013172573697466, + "learning_rate": 5.203506284017583e-06, + "loss": 1.3814, + "step": 30940 + }, + { + "epoch": 1168.301886792453, + "grad_norm": 1.5816513523971083, + "learning_rate": 5.195905880004609e-06, + "loss": 1.3668, + "step": 30960 + }, + { + "epoch": 1169.0566037735848, + "grad_norm": 1.512996764161357, + "learning_rate": 5.188449745439581e-06, + "loss": 1.3581, + "step": 30980 + }, + { + "epoch": 1169.811320754717, + "grad_norm": 1.536263448282502, + "learning_rate": 5.181137909158276e-06, + "loss": 1.3277, + "step": 31000 + }, + { + "epoch": 1170.566037735849, + "grad_norm": 1.6755767673451942, + "learning_rate": 5.1739703994384105e-06, + "loss": 1.3923, + "step": 31020 + }, + { + "epoch": 1171.3207547169811, + "grad_norm": 1.7976047665675525, + "learning_rate": 5.166947243999532e-06, + "loss": 1.3671, + "step": 31040 + }, + { + "epoch": 1172.0754716981132, + "grad_norm": 1.5604607884699584, + "learning_rate": 5.1600684700029165e-06, + "loss": 1.3613, + "step": 31060 + }, + { + "epoch": 1172.8301886792453, + "grad_norm": 1.5133379987405895, + "learning_rate": 5.1533341040514576e-06, + "loss": 1.3696, + "step": 31080 + }, + { + "epoch": 1173.5849056603774, + "grad_norm": 1.8992042289915705, + "learning_rate": 5.146744172189571e-06, + "loss": 1.3464, + "step": 31100 + }, + { + "epoch": 1174.3396226415093, + "grad_norm": 1.8549085471784923, + "learning_rate": 5.140298699903085e-06, + "loss": 1.3478, + "step": 31120 + }, + { + "epoch": 1175.0943396226414, + "grad_norm": 1.6926406458235648, + "learning_rate": 5.133997712119152e-06, + "loss": 1.3526, + "step": 31140 + }, + { + "epoch": 1175.8490566037735, + "grad_norm": 1.9538672940442745, + "learning_rate": 5.127841233206144e-06, + "loss": 1.3686, + "step": 31160 + }, + { + "epoch": 1176.6037735849056, + "grad_norm": 1.850655603319905, + "learning_rate": 5.1218292869735606e-06, + "loss": 1.3906, + "step": 31180 + }, + { + "epoch": 1177.3584905660377, + "grad_norm": 1.7127479688627378, + "learning_rate": 5.115961896671935e-06, + "loss": 1.3703, + "step": 31200 + }, + { + "epoch": 1178.1132075471698, + "grad_norm": 1.556614260381109, + "learning_rate": 5.110239084992749e-06, + "loss": 1.3532, + "step": 31220 + }, + { + "epoch": 1178.867924528302, + "grad_norm": 2.001126139034296, + "learning_rate": 5.1046608740683435e-06, + "loss": 1.3929, + "step": 31240 + }, + { + "epoch": 1179.622641509434, + "grad_norm": 2.127747604876417, + "learning_rate": 5.09922728547183e-06, + "loss": 1.3657, + "step": 31260 + }, + { + "epoch": 1180.377358490566, + "grad_norm": 1.8364327564945553, + "learning_rate": 5.093938340217008e-06, + "loss": 1.3426, + "step": 31280 + }, + { + "epoch": 1181.132075471698, + "grad_norm": 1.9292610849222944, + "learning_rate": 5.088794058758295e-06, + "loss": 1.368, + "step": 31300 + }, + { + "epoch": 1181.8867924528302, + "grad_norm": 2.0114024877177505, + "learning_rate": 5.083794460990618e-06, + "loss": 1.39, + "step": 31320 + }, + { + "epoch": 1182.6415094339623, + "grad_norm": 1.5735214803674382, + "learning_rate": 5.078939566249372e-06, + "loss": 1.3632, + "step": 31340 + }, + { + "epoch": 1183.3962264150944, + "grad_norm": 1.8428642902345547, + "learning_rate": 5.074229393310324e-06, + "loss": 1.3757, + "step": 31360 + }, + { + "epoch": 1184.1509433962265, + "grad_norm": 1.697897177712772, + "learning_rate": 5.06966396038955e-06, + "loss": 1.354, + "step": 31380 + }, + { + "epoch": 1184.9056603773586, + "grad_norm": 1.807086734591878, + "learning_rate": 5.065243285143349e-06, + "loss": 1.3757, + "step": 31400 + }, + { + "epoch": 1185.6603773584907, + "grad_norm": 1.743179055242126, + "learning_rate": 5.0609673846681936e-06, + "loss": 1.3819, + "step": 31420 + }, + { + "epoch": 1186.4150943396226, + "grad_norm": 1.8735264452983302, + "learning_rate": 5.056836275500658e-06, + "loss": 1.3579, + "step": 31440 + }, + { + "epoch": 1187.1698113207547, + "grad_norm": 1.5862970321945447, + "learning_rate": 5.052849973617347e-06, + "loss": 1.3445, + "step": 31460 + }, + { + "epoch": 1187.9245283018868, + "grad_norm": 1.692517823714256, + "learning_rate": 5.049008494434844e-06, + "loss": 1.3694, + "step": 31480 + }, + { + "epoch": 1188.6792452830189, + "grad_norm": 1.6212477472255649, + "learning_rate": 5.045311852809638e-06, + "loss": 1.3929, + "step": 31500 + }, + { + "epoch": 1189.433962264151, + "grad_norm": 1.52306373987035, + "learning_rate": 5.041760063038081e-06, + "loss": 1.3579, + "step": 31520 + }, + { + "epoch": 1190.188679245283, + "grad_norm": 1.7830544839573095, + "learning_rate": 5.038353138856331e-06, + "loss": 1.348, + "step": 31540 + }, + { + "epoch": 1190.9433962264152, + "grad_norm": 1.7203728735463606, + "learning_rate": 5.035091093440292e-06, + "loss": 1.37, + "step": 31560 + }, + { + "epoch": 1191.698113207547, + "grad_norm": 1.9298089743408848, + "learning_rate": 5.0319739394055525e-06, + "loss": 1.3627, + "step": 31580 + }, + { + "epoch": 1192.4528301886792, + "grad_norm": 1.9488940650586162, + "learning_rate": 5.029001688807368e-06, + "loss": 1.3537, + "step": 31600 + }, + { + "epoch": 1193.2075471698113, + "grad_norm": 2.0609178957358667, + "learning_rate": 5.026174353140584e-06, + "loss": 1.3521, + "step": 31620 + }, + { + "epoch": 1193.9622641509434, + "grad_norm": 1.710559073613117, + "learning_rate": 5.0234919433396115e-06, + "loss": 1.3768, + "step": 31640 + }, + { + "epoch": 1194.7169811320755, + "grad_norm": 1.5082465689013147, + "learning_rate": 5.02095446977837e-06, + "loss": 1.3893, + "step": 31660 + }, + { + "epoch": 1195.4716981132076, + "grad_norm": 2.4105153089947526, + "learning_rate": 5.018561942270259e-06, + "loss": 1.3532, + "step": 31680 + }, + { + "epoch": 1196.2264150943397, + "grad_norm": 1.5148689250273666, + "learning_rate": 5.016314370068112e-06, + "loss": 1.3429, + "step": 31700 + }, + { + "epoch": 1196.9811320754718, + "grad_norm": 1.7305388649029056, + "learning_rate": 5.014211761864169e-06, + "loss": 1.3559, + "step": 31720 + }, + { + "epoch": 1197.7358490566037, + "grad_norm": 3.661229816284544, + "learning_rate": 5.012254125790028e-06, + "loss": 1.37, + "step": 31740 + }, + { + "epoch": 1198.4905660377358, + "grad_norm": 1.9493540072501139, + "learning_rate": 5.010441469416635e-06, + "loss": 1.3808, + "step": 31760 + }, + { + "epoch": 1199.245283018868, + "grad_norm": 1.6896444872077154, + "learning_rate": 5.008773799754234e-06, + "loss": 1.3631, + "step": 31780 + }, + { + "epoch": 1200.0, + "grad_norm": 1.884439542410789, + "learning_rate": 5.007251123252356e-06, + "loss": 1.3638, + "step": 31800 + }, + { + "epoch": 1200.754716981132, + "grad_norm": 1.98761366434412, + "learning_rate": 5.005873445799779e-06, + "loss": 1.35, + "step": 31820 + }, + { + "epoch": 1201.5094339622642, + "grad_norm": 1.8352779283455332, + "learning_rate": 5.004640772724519e-06, + "loss": 1.3369, + "step": 31840 + }, + { + "epoch": 1202.2641509433963, + "grad_norm": 1.712020294826759, + "learning_rate": 5.003553108793802e-06, + "loss": 1.3511, + "step": 31860 + }, + { + "epoch": 1203.0188679245282, + "grad_norm": 1.6743616923339946, + "learning_rate": 5.002610458214054e-06, + "loss": 1.3259, + "step": 31880 + }, + { + "epoch": 1203.7735849056603, + "grad_norm": 1.8393462234102256, + "learning_rate": 5.001812824630864e-06, + "loss": 1.3646, + "step": 31900 + }, + { + "epoch": 1204.5283018867924, + "grad_norm": 1.7631293985305598, + "learning_rate": 5.001160211128995e-06, + "loss": 1.3384, + "step": 31920 + }, + { + "epoch": 1205.2830188679245, + "grad_norm": 1.6536424071703635, + "learning_rate": 5.0006526202323554e-06, + "loss": 1.3605, + "step": 31940 + }, + { + "epoch": 1206.0377358490566, + "grad_norm": 1.5387931434470863, + "learning_rate": 5.000290053904e-06, + "loss": 1.3892, + "step": 31960 + }, + { + "epoch": 1206.7924528301887, + "grad_norm": 1.948827205429464, + "learning_rate": 5.0000725135461104e-06, + "loss": 1.3541, + "step": 31980 + }, + { + "epoch": 1207.5471698113208, + "grad_norm": 1.595259284912312, + "learning_rate": 5e-06, + "loss": 1.3478, + "step": 32000 + }, + { + "epoch": 1231.5094339622642, + "grad_norm": 1.9859843003442184, + "learning_rate": 3.1745653570607866e-05, + "loss": 1.4161, + "step": 32020 + }, + { + "epoch": 1232.2641509433963, + "grad_norm": 2.399291840461689, + "learning_rate": 3.170382168563073e-05, + "loss": 1.4292, + "step": 32040 + }, + { + "epoch": 1233.0188679245282, + "grad_norm": 2.2207067356830597, + "learning_rate": 3.166200444421923e-05, + "loss": 1.4248, + "step": 32060 + }, + { + "epoch": 1233.7735849056603, + "grad_norm": 1.7538222025729717, + "learning_rate": 3.1620201903092876e-05, + "loss": 1.4549, + "step": 32080 + }, + { + "epoch": 1234.5283018867924, + "grad_norm": 1.7296401624898199, + "learning_rate": 3.157841411895116e-05, + "loss": 1.4544, + "step": 32100 + }, + { + "epoch": 1235.2830188679245, + "grad_norm": 1.6657757057870137, + "learning_rate": 3.153664114847362e-05, + "loss": 1.4734, + "step": 32120 + }, + { + "epoch": 1236.0377358490566, + "grad_norm": 1.7240277610891936, + "learning_rate": 3.149488304831967e-05, + "loss": 1.451, + "step": 32140 + }, + { + "epoch": 1236.7924528301887, + "grad_norm": 1.6885797820089437, + "learning_rate": 3.145313987512854e-05, + "loss": 1.4366, + "step": 32160 + }, + { + "epoch": 1237.5471698113208, + "grad_norm": 1.4963776794399322, + "learning_rate": 3.141141168551928e-05, + "loss": 1.4652, + "step": 32180 + }, + { + "epoch": 1238.301886792453, + "grad_norm": 1.4609983523815115, + "learning_rate": 3.1369698536090554e-05, + "loss": 1.4648, + "step": 32200 + }, + { + "epoch": 1239.0566037735848, + "grad_norm": 1.9029419687473905, + "learning_rate": 3.132800048342065e-05, + "loss": 1.4664, + "step": 32220 + }, + { + "epoch": 1239.811320754717, + "grad_norm": 1.7932066669770592, + "learning_rate": 3.128631758406736e-05, + "loss": 1.4585, + "step": 32240 + }, + { + "epoch": 1240.566037735849, + "grad_norm": 1.6253328044167166, + "learning_rate": 3.1244649894567945e-05, + "loss": 1.4492, + "step": 32260 + }, + { + "epoch": 1241.3207547169811, + "grad_norm": 1.507775786714413, + "learning_rate": 3.120299747143905e-05, + "loss": 1.4934, + "step": 32280 + }, + { + "epoch": 1242.0754716981132, + "grad_norm": 1.7801850010709415, + "learning_rate": 3.1161360371176566e-05, + "loss": 1.4486, + "step": 32300 + }, + { + "epoch": 1242.8301886792453, + "grad_norm": 1.6106209389195743, + "learning_rate": 3.111973865025564e-05, + "loss": 1.4468, + "step": 32320 + }, + { + "epoch": 1243.5849056603774, + "grad_norm": 1.8027839874458171, + "learning_rate": 3.107813236513054e-05, + "loss": 1.477, + "step": 32340 + }, + { + "epoch": 1244.3396226415093, + "grad_norm": 1.883131295400716, + "learning_rate": 3.1036541572234594e-05, + "loss": 1.4555, + "step": 32360 + }, + { + "epoch": 1245.0943396226414, + "grad_norm": 1.591157945654413, + "learning_rate": 3.099496632798014e-05, + "loss": 1.4708, + "step": 32380 + }, + { + "epoch": 1245.8490566037735, + "grad_norm": 1.6694778342522842, + "learning_rate": 3.095340668875842e-05, + "loss": 1.4639, + "step": 32400 + }, + { + "epoch": 1246.6037735849056, + "grad_norm": 1.6841562206011031, + "learning_rate": 3.091186271093947e-05, + "loss": 1.5116, + "step": 32420 + }, + { + "epoch": 1247.3584905660377, + "grad_norm": 2.3369379900409943, + "learning_rate": 3.0870334450872156e-05, + "loss": 1.4754, + "step": 32440 + }, + { + "epoch": 1248.1132075471698, + "grad_norm": 1.720534890104194, + "learning_rate": 3.0828821964883944e-05, + "loss": 1.4941, + "step": 32460 + }, + { + "epoch": 1248.867924528302, + "grad_norm": 1.7549772489735695, + "learning_rate": 3.0787325309280966e-05, + "loss": 1.4799, + "step": 32480 + }, + { + "epoch": 1249.622641509434, + "grad_norm": 1.8182084066575632, + "learning_rate": 3.074584454034788e-05, + "loss": 1.4715, + "step": 32500 + }, + { + "epoch": 1250.377358490566, + "grad_norm": 1.5605662428278646, + "learning_rate": 3.0704379714347736e-05, + "loss": 1.4783, + "step": 32520 + }, + { + "epoch": 1251.132075471698, + "grad_norm": 1.569853865239183, + "learning_rate": 3.066293088752203e-05, + "loss": 1.4638, + "step": 32540 + }, + { + "epoch": 1251.8867924528302, + "grad_norm": 1.704579985134968, + "learning_rate": 3.062149811609051e-05, + "loss": 1.492, + "step": 32560 + }, + { + "epoch": 1252.6415094339623, + "grad_norm": 1.7794864973864697, + "learning_rate": 3.058008145625118e-05, + "loss": 1.4705, + "step": 32580 + }, + { + "epoch": 1253.3962264150944, + "grad_norm": 1.8222736973302784, + "learning_rate": 3.053868096418017e-05, + "loss": 1.4893, + "step": 32600 + }, + { + "epoch": 1254.1509433962265, + "grad_norm": 1.5789611538013155, + "learning_rate": 3.0497296696031678e-05, + "loss": 1.4665, + "step": 32620 + }, + { + "epoch": 1254.9056603773586, + "grad_norm": 1.657785958532039, + "learning_rate": 3.0455928707937924e-05, + "loss": 1.491, + "step": 32640 + }, + { + "epoch": 1255.6603773584907, + "grad_norm": 1.3254023383839637, + "learning_rate": 3.0414577056008995e-05, + "loss": 1.4823, + "step": 32660 + }, + { + "epoch": 1256.4150943396226, + "grad_norm": 1.5602437010509045, + "learning_rate": 3.0373241796332887e-05, + "loss": 1.4704, + "step": 32680 + }, + { + "epoch": 1257.1698113207547, + "grad_norm": 2.029474586920305, + "learning_rate": 3.0331922984975316e-05, + "loss": 1.4765, + "step": 32700 + }, + { + "epoch": 1257.9245283018868, + "grad_norm": 1.8553896972815955, + "learning_rate": 3.0290620677979688e-05, + "loss": 1.5096, + "step": 32720 + }, + { + "epoch": 1258.6792452830189, + "grad_norm": 1.4989759048156965, + "learning_rate": 3.0249334931367046e-05, + "loss": 1.5122, + "step": 32740 + }, + { + "epoch": 1259.433962264151, + "grad_norm": 1.6763111597334728, + "learning_rate": 3.0208065801135942e-05, + "loss": 1.4787, + "step": 32760 + }, + { + "epoch": 1260.188679245283, + "grad_norm": 1.469251133196546, + "learning_rate": 3.016681334326244e-05, + "loss": 1.4854, + "step": 32780 + }, + { + "epoch": 1260.9433962264152, + "grad_norm": 1.8501919367454238, + "learning_rate": 3.0125577613699926e-05, + "loss": 1.4929, + "step": 32800 + }, + { + "epoch": 1261.698113207547, + "grad_norm": 1.5790438820656068, + "learning_rate": 3.0084358668379155e-05, + "loss": 1.5055, + "step": 32820 + }, + { + "epoch": 1262.4528301886792, + "grad_norm": 1.5952733717783116, + "learning_rate": 3.004315656320806e-05, + "loss": 1.4907, + "step": 32840 + }, + { + "epoch": 1263.2075471698113, + "grad_norm": 1.6182930520428953, + "learning_rate": 3.0001971354071772e-05, + "loss": 1.4909, + "step": 32860 + }, + { + "epoch": 1263.9622641509434, + "grad_norm": 2.2886630268428663, + "learning_rate": 2.996080309683252e-05, + "loss": 1.4992, + "step": 32880 + }, + { + "epoch": 1264.7169811320755, + "grad_norm": 1.3793974197803296, + "learning_rate": 2.9919651847329483e-05, + "loss": 1.5061, + "step": 32900 + }, + { + "epoch": 1265.4716981132076, + "grad_norm": 1.39182833894468, + "learning_rate": 2.9878517661378828e-05, + "loss": 1.4591, + "step": 32920 + }, + { + "epoch": 1266.2264150943397, + "grad_norm": 1.6904437738848905, + "learning_rate": 2.9837400594773515e-05, + "loss": 1.5118, + "step": 32940 + }, + { + "epoch": 1266.9811320754718, + "grad_norm": 1.6447748796714898, + "learning_rate": 2.979630070328336e-05, + "loss": 1.4881, + "step": 32960 + }, + { + "epoch": 1267.7358490566037, + "grad_norm": 1.3512114550316146, + "learning_rate": 2.975521804265484e-05, + "loss": 1.4719, + "step": 32980 + }, + { + "epoch": 1268.4905660377358, + "grad_norm": 1.6317892668767962, + "learning_rate": 2.971415266861105e-05, + "loss": 1.5057, + "step": 33000 + }, + { + "epoch": 1269.245283018868, + "grad_norm": 1.6596450520295813, + "learning_rate": 2.967310463685166e-05, + "loss": 1.481, + "step": 33020 + }, + { + "epoch": 1270.0, + "grad_norm": 1.6548890468178368, + "learning_rate": 2.9632074003052808e-05, + "loss": 1.5136, + "step": 33040 + }, + { + "epoch": 1270.754716981132, + "grad_norm": 1.5074284840254797, + "learning_rate": 2.9591060822867042e-05, + "loss": 1.4971, + "step": 33060 + }, + { + "epoch": 1271.5094339622642, + "grad_norm": 1.5075074748556512, + "learning_rate": 2.9550065151923238e-05, + "loss": 1.4647, + "step": 33080 + }, + { + "epoch": 1272.2641509433963, + "grad_norm": 1.7144775848474376, + "learning_rate": 2.9509087045826505e-05, + "loss": 1.5145, + "step": 33100 + }, + { + "epoch": 1273.0188679245282, + "grad_norm": 1.5547570517351919, + "learning_rate": 2.946812656015815e-05, + "loss": 1.4806, + "step": 33120 + }, + { + "epoch": 1273.7735849056603, + "grad_norm": 1.91096744807036, + "learning_rate": 2.942718375047554e-05, + "loss": 1.4953, + "step": 33140 + }, + { + "epoch": 1274.5283018867924, + "grad_norm": 1.690681911094072, + "learning_rate": 2.9386258672312143e-05, + "loss": 1.5043, + "step": 33160 + }, + { + "epoch": 1275.2830188679245, + "grad_norm": 1.6094990513366627, + "learning_rate": 2.93453513811773e-05, + "loss": 1.4656, + "step": 33180 + }, + { + "epoch": 1276.0377358490566, + "grad_norm": 1.7166760221415358, + "learning_rate": 2.9304461932556262e-05, + "loss": 1.5049, + "step": 33200 + }, + { + "epoch": 1276.7924528301887, + "grad_norm": 1.4781436729661779, + "learning_rate": 2.9263590381910078e-05, + "loss": 1.4901, + "step": 33220 + }, + { + "epoch": 1277.5471698113208, + "grad_norm": 1.6055713664381628, + "learning_rate": 2.9222736784675506e-05, + "loss": 1.4744, + "step": 33240 + }, + { + "epoch": 1278.301886792453, + "grad_norm": 1.6185246350349134, + "learning_rate": 2.9181901196264983e-05, + "loss": 1.4809, + "step": 33260 + }, + { + "epoch": 1279.0566037735848, + "grad_norm": 1.876852753612874, + "learning_rate": 2.9141083672066472e-05, + "loss": 1.4737, + "step": 33280 + }, + { + "epoch": 1279.811320754717, + "grad_norm": 1.646333221814719, + "learning_rate": 2.910028426744349e-05, + "loss": 1.4807, + "step": 33300 + }, + { + "epoch": 1280.566037735849, + "grad_norm": 1.4950158846180641, + "learning_rate": 2.9059503037734925e-05, + "loss": 1.4871, + "step": 33320 + }, + { + "epoch": 1281.3207547169811, + "grad_norm": 2.5440304246025702, + "learning_rate": 2.9018740038255044e-05, + "loss": 1.4869, + "step": 33340 + }, + { + "epoch": 1282.0754716981132, + "grad_norm": 1.5221803613837093, + "learning_rate": 2.897799532429339e-05, + "loss": 1.4756, + "step": 33360 + }, + { + "epoch": 1282.8301886792453, + "grad_norm": 1.459833552438949, + "learning_rate": 2.8937268951114686e-05, + "loss": 1.4782, + "step": 33380 + }, + { + "epoch": 1283.5849056603774, + "grad_norm": 1.5193291412259906, + "learning_rate": 2.8896560973958796e-05, + "loss": 1.4925, + "step": 33400 + }, + { + "epoch": 1284.3396226415093, + "grad_norm": 1.457579254538571, + "learning_rate": 2.88558714480406e-05, + "loss": 1.4865, + "step": 33420 + }, + { + "epoch": 1285.0943396226414, + "grad_norm": 2.116390864572185, + "learning_rate": 2.8815200428549985e-05, + "loss": 1.4823, + "step": 33440 + }, + { + "epoch": 1285.8490566037735, + "grad_norm": 2.333973476529065, + "learning_rate": 2.8774547970651747e-05, + "loss": 1.4701, + "step": 33460 + }, + { + "epoch": 1286.6037735849056, + "grad_norm": 1.4347402180741313, + "learning_rate": 2.8733914129485457e-05, + "loss": 1.4964, + "step": 33480 + }, + { + "epoch": 1287.3584905660377, + "grad_norm": 1.5219049837257324, + "learning_rate": 2.8693298960165473e-05, + "loss": 1.4845, + "step": 33500 + }, + { + "epoch": 1288.1132075471698, + "grad_norm": 1.7295744561903763, + "learning_rate": 2.8652702517780815e-05, + "loss": 1.4729, + "step": 33520 + }, + { + "epoch": 1288.867924528302, + "grad_norm": 1.3491913340767474, + "learning_rate": 2.8612124857395097e-05, + "loss": 1.4734, + "step": 33540 + }, + { + "epoch": 1289.622641509434, + "grad_norm": 1.612399971127458, + "learning_rate": 2.8571566034046486e-05, + "loss": 1.4717, + "step": 33560 + }, + { + "epoch": 1290.377358490566, + "grad_norm": 1.523340229132746, + "learning_rate": 2.8531026102747552e-05, + "loss": 1.4784, + "step": 33580 + }, + { + "epoch": 1291.132075471698, + "grad_norm": 1.391650177787444, + "learning_rate": 2.849050511848529e-05, + "loss": 1.4968, + "step": 33600 + }, + { + "epoch": 1291.8867924528302, + "grad_norm": 1.631972432390494, + "learning_rate": 2.845000313622095e-05, + "loss": 1.4783, + "step": 33620 + }, + { + "epoch": 1292.6415094339623, + "grad_norm": 1.4676382942374402, + "learning_rate": 2.840952021089003e-05, + "loss": 1.4724, + "step": 33640 + }, + { + "epoch": 1293.3962264150944, + "grad_norm": 1.5025191965428788, + "learning_rate": 2.83690563974022e-05, + "loss": 1.4958, + "step": 33660 + }, + { + "epoch": 1294.1509433962265, + "grad_norm": 1.6379644083109945, + "learning_rate": 2.832861175064119e-05, + "loss": 1.4834, + "step": 33680 + }, + { + "epoch": 1294.9056603773586, + "grad_norm": 1.7312099049664693, + "learning_rate": 2.8288186325464705e-05, + "loss": 1.4941, + "step": 33700 + }, + { + "epoch": 1295.6603773584907, + "grad_norm": 1.5113721107585405, + "learning_rate": 2.8247780176704408e-05, + "loss": 1.4863, + "step": 33720 + }, + { + "epoch": 1296.4150943396226, + "grad_norm": 1.4187238404455875, + "learning_rate": 2.8207393359165837e-05, + "loss": 1.4635, + "step": 33740 + }, + { + "epoch": 1297.1698113207547, + "grad_norm": 1.5036198246572734, + "learning_rate": 2.8167025927628266e-05, + "loss": 1.4663, + "step": 33760 + }, + { + "epoch": 1297.9245283018868, + "grad_norm": 1.656299435435026, + "learning_rate": 2.8126677936844698e-05, + "loss": 1.4809, + "step": 33780 + }, + { + "epoch": 1298.6792452830189, + "grad_norm": 1.7227294745544, + "learning_rate": 2.808634944154176e-05, + "loss": 1.4518, + "step": 33800 + }, + { + "epoch": 1299.433962264151, + "grad_norm": 1.554440422068932, + "learning_rate": 2.8046040496419622e-05, + "loss": 1.4858, + "step": 33820 + }, + { + "epoch": 1300.188679245283, + "grad_norm": 1.5684395687858594, + "learning_rate": 2.8005751156151996e-05, + "loss": 1.4939, + "step": 33840 + }, + { + "epoch": 1300.9433962264152, + "grad_norm": 1.4791453327586883, + "learning_rate": 2.7965481475385922e-05, + "loss": 1.4981, + "step": 33860 + }, + { + "epoch": 1301.698113207547, + "grad_norm": 1.8682361890592045, + "learning_rate": 2.792523150874184e-05, + "loss": 1.485, + "step": 33880 + }, + { + "epoch": 1302.4528301886792, + "grad_norm": 1.4376784070576631, + "learning_rate": 2.7885001310813394e-05, + "loss": 1.4771, + "step": 33900 + }, + { + "epoch": 1303.2075471698113, + "grad_norm": 1.4919487782728726, + "learning_rate": 2.7844790936167448e-05, + "loss": 1.4818, + "step": 33920 + }, + { + "epoch": 1303.9622641509434, + "grad_norm": 1.5926644935407461, + "learning_rate": 2.7804600439344004e-05, + "loss": 1.481, + "step": 33940 + }, + { + "epoch": 1304.7169811320755, + "grad_norm": 2.129672326977145, + "learning_rate": 2.776442987485605e-05, + "loss": 1.4809, + "step": 33960 + }, + { + "epoch": 1305.4716981132076, + "grad_norm": 1.4661184798946012, + "learning_rate": 2.7724279297189564e-05, + "loss": 1.4734, + "step": 33980 + }, + { + "epoch": 1306.2264150943397, + "grad_norm": 1.6422416038082728, + "learning_rate": 2.7684148760803404e-05, + "loss": 1.4706, + "step": 34000 + }, + { + "epoch": 1306.9811320754718, + "grad_norm": 1.6788541325557527, + "learning_rate": 2.7644038320129247e-05, + "loss": 1.4734, + "step": 34020 + }, + { + "epoch": 1307.7358490566037, + "grad_norm": 1.5820996412366164, + "learning_rate": 2.7603948029571546e-05, + "loss": 1.4731, + "step": 34040 + }, + { + "epoch": 1308.4905660377358, + "grad_norm": 1.8093817496261688, + "learning_rate": 2.756387794350737e-05, + "loss": 1.4876, + "step": 34060 + }, + { + "epoch": 1309.245283018868, + "grad_norm": 1.4611414622430816, + "learning_rate": 2.7523828116286425e-05, + "loss": 1.4958, + "step": 34080 + }, + { + "epoch": 1310.0, + "grad_norm": 1.4982681857066789, + "learning_rate": 2.7483798602230905e-05, + "loss": 1.4713, + "step": 34100 + }, + { + "epoch": 1310.754716981132, + "grad_norm": 1.7049190400136933, + "learning_rate": 2.744378945563547e-05, + "loss": 1.4698, + "step": 34120 + }, + { + "epoch": 1311.5094339622642, + "grad_norm": 1.465072325468145, + "learning_rate": 2.7403800730767165e-05, + "loss": 1.4814, + "step": 34140 + }, + { + "epoch": 1312.2641509433963, + "grad_norm": 1.6806290813940998, + "learning_rate": 2.7363832481865326e-05, + "loss": 1.4623, + "step": 34160 + }, + { + "epoch": 1313.0188679245282, + "grad_norm": 1.422949129304357, + "learning_rate": 2.7323884763141494e-05, + "loss": 1.4798, + "step": 34180 + }, + { + "epoch": 1313.7735849056603, + "grad_norm": 1.5386955048633302, + "learning_rate": 2.728395762877941e-05, + "loss": 1.4588, + "step": 34200 + }, + { + "epoch": 1314.5283018867924, + "grad_norm": 1.5472770555424338, + "learning_rate": 2.7244051132934836e-05, + "loss": 1.451, + "step": 34220 + }, + { + "epoch": 1315.2830188679245, + "grad_norm": 1.435168914934391, + "learning_rate": 2.72041653297356e-05, + "loss": 1.4943, + "step": 34240 + }, + { + "epoch": 1316.0377358490566, + "grad_norm": 1.4183350034608622, + "learning_rate": 2.716430027328143e-05, + "loss": 1.4519, + "step": 34260 + }, + { + "epoch": 1316.7924528301887, + "grad_norm": 1.7134876611489063, + "learning_rate": 2.7124456017643914e-05, + "loss": 1.4658, + "step": 34280 + }, + { + "epoch": 1317.5471698113208, + "grad_norm": 1.4042660927164932, + "learning_rate": 2.7084632616866437e-05, + "loss": 1.4665, + "step": 34300 + }, + { + "epoch": 1318.301886792453, + "grad_norm": 1.7236176772036846, + "learning_rate": 2.7044830124964073e-05, + "loss": 1.4598, + "step": 34320 + }, + { + "epoch": 1319.0566037735848, + "grad_norm": 1.7345912564178498, + "learning_rate": 2.7005048595923597e-05, + "loss": 1.4941, + "step": 34340 + }, + { + "epoch": 1319.811320754717, + "grad_norm": 1.6553359599381614, + "learning_rate": 2.696528808370328e-05, + "loss": 1.448, + "step": 34360 + }, + { + "epoch": 1320.566037735849, + "grad_norm": 1.654924545197036, + "learning_rate": 2.6925548642232916e-05, + "loss": 1.453, + "step": 34380 + }, + { + "epoch": 1321.3207547169811, + "grad_norm": 1.966241914838029, + "learning_rate": 2.6885830325413732e-05, + "loss": 1.4791, + "step": 34400 + }, + { + "epoch": 1322.0754716981132, + "grad_norm": 1.613098173730771, + "learning_rate": 2.6846133187118266e-05, + "loss": 1.4456, + "step": 34420 + }, + { + "epoch": 1322.8301886792453, + "grad_norm": 1.694164161340185, + "learning_rate": 2.6806457281190392e-05, + "loss": 1.4697, + "step": 34440 + }, + { + "epoch": 1323.5849056603774, + "grad_norm": 1.7709910517494127, + "learning_rate": 2.6766802661445123e-05, + "loss": 1.4767, + "step": 34460 + }, + { + "epoch": 1324.3396226415093, + "grad_norm": 2.1757270130771547, + "learning_rate": 2.672716938166863e-05, + "loss": 1.5023, + "step": 34480 + }, + { + "epoch": 1325.0943396226414, + "grad_norm": 1.618966012864335, + "learning_rate": 2.66875574956181e-05, + "loss": 1.4459, + "step": 34500 + }, + { + "epoch": 1325.8490566037735, + "grad_norm": 1.6395370020860782, + "learning_rate": 2.6647967057021783e-05, + "loss": 1.4716, + "step": 34520 + }, + { + "epoch": 1326.6037735849056, + "grad_norm": 1.458865429611614, + "learning_rate": 2.6608398119578777e-05, + "loss": 1.4509, + "step": 34540 + }, + { + "epoch": 1327.3584905660377, + "grad_norm": 1.8785096977087146, + "learning_rate": 2.656885073695903e-05, + "loss": 1.4563, + "step": 34560 + }, + { + "epoch": 1328.1132075471698, + "grad_norm": 1.9390316222323336, + "learning_rate": 2.652932496280323e-05, + "loss": 1.4851, + "step": 34580 + }, + { + "epoch": 1328.867924528302, + "grad_norm": 5.310289949887802, + "learning_rate": 2.6489820850722802e-05, + "loss": 1.4768, + "step": 34600 + }, + { + "epoch": 1329.622641509434, + "grad_norm": 1.4684731158219795, + "learning_rate": 2.6450338454299786e-05, + "loss": 1.4516, + "step": 34620 + }, + { + "epoch": 1330.377358490566, + "grad_norm": 1.639577731583303, + "learning_rate": 2.641087782708672e-05, + "loss": 1.4654, + "step": 34640 + }, + { + "epoch": 1331.132075471698, + "grad_norm": 1.6849901015256106, + "learning_rate": 2.6371439022606665e-05, + "loss": 1.4615, + "step": 34660 + }, + { + "epoch": 1331.8867924528302, + "grad_norm": 1.619952725687253, + "learning_rate": 2.6332022094353024e-05, + "loss": 1.4461, + "step": 34680 + }, + { + "epoch": 1332.6415094339623, + "grad_norm": 1.5608967063706551, + "learning_rate": 2.6292627095789594e-05, + "loss": 1.4523, + "step": 34700 + }, + { + "epoch": 1333.3962264150944, + "grad_norm": 1.7568408459896505, + "learning_rate": 2.625325408035041e-05, + "loss": 1.4758, + "step": 34720 + }, + { + "epoch": 1334.1509433962265, + "grad_norm": 1.5186845485994895, + "learning_rate": 2.6213903101439668e-05, + "loss": 1.4527, + "step": 34740 + }, + { + "epoch": 1334.9056603773586, + "grad_norm": 1.9016010055715276, + "learning_rate": 2.6174574212431673e-05, + "loss": 1.4708, + "step": 34760 + }, + { + "epoch": 1335.6603773584907, + "grad_norm": 1.3914584691450766, + "learning_rate": 2.6135267466670776e-05, + "loss": 1.4519, + "step": 34780 + }, + { + "epoch": 1336.4150943396226, + "grad_norm": 1.7920706183325235, + "learning_rate": 2.6095982917471312e-05, + "loss": 1.4551, + "step": 34800 + }, + { + "epoch": 1337.1698113207547, + "grad_norm": 1.7415199040517522, + "learning_rate": 2.6056720618117508e-05, + "loss": 1.4618, + "step": 34820 + }, + { + "epoch": 1337.9245283018868, + "grad_norm": 2.0387577968023423, + "learning_rate": 2.6017480621863382e-05, + "loss": 1.4336, + "step": 34840 + }, + { + "epoch": 1338.6792452830189, + "grad_norm": 1.7452335041516622, + "learning_rate": 2.5978262981932716e-05, + "loss": 1.4845, + "step": 34860 + }, + { + "epoch": 1339.433962264151, + "grad_norm": 1.8221491527113842, + "learning_rate": 2.5939067751518968e-05, + "loss": 1.4509, + "step": 34880 + }, + { + "epoch": 1340.188679245283, + "grad_norm": 1.573534969706598, + "learning_rate": 2.58998949837852e-05, + "loss": 1.4597, + "step": 34900 + }, + { + "epoch": 1340.9433962264152, + "grad_norm": 1.7418894779202971, + "learning_rate": 2.5860744731864037e-05, + "loss": 1.4509, + "step": 34920 + }, + { + "epoch": 1341.698113207547, + "grad_norm": 2.3533748801857612, + "learning_rate": 2.5821617048857514e-05, + "loss": 1.4707, + "step": 34940 + }, + { + "epoch": 1342.4528301886792, + "grad_norm": 1.6384303594662744, + "learning_rate": 2.5782511987837087e-05, + "loss": 1.4483, + "step": 34960 + }, + { + "epoch": 1343.2075471698113, + "grad_norm": 1.7437935503570192, + "learning_rate": 2.5743429601843493e-05, + "loss": 1.4708, + "step": 34980 + }, + { + "epoch": 1343.9622641509434, + "grad_norm": 1.6299173329516294, + "learning_rate": 2.5704369943886763e-05, + "loss": 1.4487, + "step": 35000 + }, + { + "epoch": 1344.7169811320755, + "grad_norm": 1.5340708358576824, + "learning_rate": 2.5665333066946082e-05, + "loss": 1.4659, + "step": 35020 + }, + { + "epoch": 1345.4716981132076, + "grad_norm": 1.70280338168885, + "learning_rate": 2.5626319023969715e-05, + "loss": 1.4547, + "step": 35040 + }, + { + "epoch": 1346.2264150943397, + "grad_norm": 1.6585665666032239, + "learning_rate": 2.558732786787497e-05, + "loss": 1.4514, + "step": 35060 + }, + { + "epoch": 1346.9811320754718, + "grad_norm": 1.562613257380082, + "learning_rate": 2.5548359651548126e-05, + "loss": 1.4661, + "step": 35080 + }, + { + "epoch": 1347.7358490566037, + "grad_norm": 1.7392138600300024, + "learning_rate": 2.550941442784431e-05, + "loss": 1.4546, + "step": 35100 + }, + { + "epoch": 1348.4905660377358, + "grad_norm": 1.9111375288571992, + "learning_rate": 2.5470492249587522e-05, + "loss": 1.4478, + "step": 35120 + }, + { + "epoch": 1349.245283018868, + "grad_norm": 1.4950805686503206, + "learning_rate": 2.5431593169570446e-05, + "loss": 1.4535, + "step": 35140 + }, + { + "epoch": 1350.0, + "grad_norm": 2.553809298230812, + "learning_rate": 2.539271724055444e-05, + "loss": 1.464, + "step": 35160 + }, + { + "epoch": 1350.754716981132, + "grad_norm": 1.562798272416066, + "learning_rate": 2.5353864515269525e-05, + "loss": 1.4665, + "step": 35180 + }, + { + "epoch": 1351.5094339622642, + "grad_norm": 1.5956415565820565, + "learning_rate": 2.531503504641416e-05, + "loss": 1.4174, + "step": 35200 + }, + { + "epoch": 1352.2641509433963, + "grad_norm": 1.745867042261029, + "learning_rate": 2.5276228886655333e-05, + "loss": 1.4738, + "step": 35220 + }, + { + "epoch": 1353.0188679245282, + "grad_norm": 1.8454370598772634, + "learning_rate": 2.5237446088628384e-05, + "loss": 1.4407, + "step": 35240 + }, + { + "epoch": 1353.7735849056603, + "grad_norm": 1.70704191729437, + "learning_rate": 2.5198686704936945e-05, + "loss": 1.4617, + "step": 35260 + }, + { + "epoch": 1354.5283018867924, + "grad_norm": 1.410719238952515, + "learning_rate": 2.5159950788152942e-05, + "loss": 1.4397, + "step": 35280 + }, + { + "epoch": 1355.2830188679245, + "grad_norm": 1.811804083528806, + "learning_rate": 2.512123839081642e-05, + "loss": 1.443, + "step": 35300 + }, + { + "epoch": 1356.0377358490566, + "grad_norm": 1.659319824434148, + "learning_rate": 2.508254956543557e-05, + "loss": 1.4577, + "step": 35320 + }, + { + "epoch": 1356.7924528301887, + "grad_norm": 1.5084615900612242, + "learning_rate": 2.504388436448657e-05, + "loss": 1.4702, + "step": 35340 + }, + { + "epoch": 1357.5471698113208, + "grad_norm": 1.6272545133599885, + "learning_rate": 2.500524284041357e-05, + "loss": 1.4397, + "step": 35360 + }, + { + "epoch": 1358.301886792453, + "grad_norm": 1.470645864952112, + "learning_rate": 2.4966625045628615e-05, + "loss": 1.4435, + "step": 35380 + }, + { + "epoch": 1359.0566037735848, + "grad_norm": 1.455775463587072, + "learning_rate": 2.4928031032511544e-05, + "loss": 1.4554, + "step": 35400 + }, + { + "epoch": 1359.811320754717, + "grad_norm": 1.6787988136879601, + "learning_rate": 2.4889460853409974e-05, + "loss": 1.4692, + "step": 35420 + }, + { + "epoch": 1360.566037735849, + "grad_norm": 1.5640507822516196, + "learning_rate": 2.485091456063916e-05, + "loss": 1.4528, + "step": 35440 + }, + { + "epoch": 1361.3207547169811, + "grad_norm": 1.396621608357886, + "learning_rate": 2.4812392206481945e-05, + "loss": 1.4371, + "step": 35460 + }, + { + "epoch": 1362.0754716981132, + "grad_norm": 1.8537494645554213, + "learning_rate": 2.477389384318876e-05, + "loss": 1.4395, + "step": 35480 + }, + { + "epoch": 1362.8301886792453, + "grad_norm": 1.512928732642698, + "learning_rate": 2.4735419522977467e-05, + "loss": 1.4914, + "step": 35500 + }, + { + "epoch": 1363.5849056603774, + "grad_norm": 1.6340922613193214, + "learning_rate": 2.46969692980333e-05, + "loss": 1.4654, + "step": 35520 + }, + { + "epoch": 1364.3396226415093, + "grad_norm": 1.5378015561259157, + "learning_rate": 2.465854322050881e-05, + "loss": 1.4246, + "step": 35540 + }, + { + "epoch": 1365.0943396226414, + "grad_norm": 1.8471949838761705, + "learning_rate": 2.462014134252384e-05, + "loss": 1.4386, + "step": 35560 + }, + { + "epoch": 1365.8490566037735, + "grad_norm": 2.139477793232749, + "learning_rate": 2.4581763716165345e-05, + "loss": 1.4314, + "step": 35580 + }, + { + "epoch": 1366.6037735849056, + "grad_norm": 1.5366713623147805, + "learning_rate": 2.454341039348746e-05, + "loss": 1.4514, + "step": 35600 + }, + { + "epoch": 1367.3584905660377, + "grad_norm": 2.6632963736452018, + "learning_rate": 2.4505081426511286e-05, + "loss": 1.4244, + "step": 35620 + }, + { + "epoch": 1368.1132075471698, + "grad_norm": 1.7507517183403924, + "learning_rate": 2.4466776867224914e-05, + "loss": 1.4401, + "step": 35640 + }, + { + "epoch": 1368.867924528302, + "grad_norm": 1.7263277038654796, + "learning_rate": 2.4428496767583355e-05, + "loss": 1.4569, + "step": 35660 + }, + { + "epoch": 1369.622641509434, + "grad_norm": 1.664393561735168, + "learning_rate": 2.4390241179508404e-05, + "loss": 1.4387, + "step": 35680 + }, + { + "epoch": 1370.377358490566, + "grad_norm": 1.723479394345894, + "learning_rate": 2.435201015488865e-05, + "loss": 1.4411, + "step": 35700 + }, + { + "epoch": 1371.132075471698, + "grad_norm": 1.434976866992101, + "learning_rate": 2.4313803745579318e-05, + "loss": 1.4284, + "step": 35720 + }, + { + "epoch": 1371.8867924528302, + "grad_norm": 1.4785579710843697, + "learning_rate": 2.4275622003402272e-05, + "loss": 1.442, + "step": 35740 + }, + { + "epoch": 1372.6415094339623, + "grad_norm": 1.4377021405339876, + "learning_rate": 2.4237464980145938e-05, + "loss": 1.4585, + "step": 35760 + }, + { + "epoch": 1373.3962264150944, + "grad_norm": 1.4439423657468624, + "learning_rate": 2.4199332727565162e-05, + "loss": 1.4415, + "step": 35780 + }, + { + "epoch": 1374.1509433962265, + "grad_norm": 1.6286933767716432, + "learning_rate": 2.4161225297381257e-05, + "loss": 1.4191, + "step": 35800 + }, + { + "epoch": 1374.9056603773586, + "grad_norm": 1.8061947503706157, + "learning_rate": 2.412314274128181e-05, + "loss": 1.4328, + "step": 35820 + }, + { + "epoch": 1375.6603773584907, + "grad_norm": 1.4892866827277318, + "learning_rate": 2.408508511092069e-05, + "loss": 1.426, + "step": 35840 + }, + { + "epoch": 1376.4150943396226, + "grad_norm": 2.1944517889347206, + "learning_rate": 2.4047052457917976e-05, + "loss": 1.4383, + "step": 35860 + }, + { + "epoch": 1377.1698113207547, + "grad_norm": 1.657764612011157, + "learning_rate": 2.4009044833859837e-05, + "loss": 1.4335, + "step": 35880 + }, + { + "epoch": 1377.9245283018868, + "grad_norm": 1.6641457685651413, + "learning_rate": 2.397106229029853e-05, + "loss": 1.449, + "step": 35900 + }, + { + "epoch": 1378.6792452830189, + "grad_norm": 1.6180638342974163, + "learning_rate": 2.3933104878752255e-05, + "loss": 1.4531, + "step": 35920 + }, + { + "epoch": 1379.433962264151, + "grad_norm": 1.4294375910343768, + "learning_rate": 2.3895172650705135e-05, + "loss": 1.394, + "step": 35940 + }, + { + "epoch": 1380.188679245283, + "grad_norm": 1.8277501896092694, + "learning_rate": 2.3857265657607175e-05, + "loss": 1.3907, + "step": 35960 + }, + { + "epoch": 1380.9433962264152, + "grad_norm": 1.498142714401942, + "learning_rate": 2.381938395087408e-05, + "loss": 1.427, + "step": 35980 + }, + { + "epoch": 1381.698113207547, + "grad_norm": 1.6446695245077154, + "learning_rate": 2.3781527581887328e-05, + "loss": 1.4267, + "step": 36000 + }, + { + "epoch": 1382.4528301886792, + "grad_norm": 2.126047948088478, + "learning_rate": 2.3743696601993973e-05, + "loss": 1.4513, + "step": 36020 + }, + { + "epoch": 1383.2075471698113, + "grad_norm": 1.5906073184513956, + "learning_rate": 2.3705891062506686e-05, + "loss": 1.4468, + "step": 36040 + }, + { + "epoch": 1383.9622641509434, + "grad_norm": 1.6659051387541641, + "learning_rate": 2.366811101470359e-05, + "loss": 1.4397, + "step": 36060 + }, + { + "epoch": 1384.7169811320755, + "grad_norm": 1.7950603394090476, + "learning_rate": 2.363035650982822e-05, + "loss": 1.4314, + "step": 36080 + }, + { + "epoch": 1385.4716981132076, + "grad_norm": 1.7227503126171113, + "learning_rate": 2.359262759908953e-05, + "loss": 1.4305, + "step": 36100 + }, + { + "epoch": 1386.2264150943397, + "grad_norm": 1.5686879263532916, + "learning_rate": 2.355492433366169e-05, + "loss": 1.4606, + "step": 36120 + }, + { + "epoch": 1386.9811320754718, + "grad_norm": 1.6010165898998077, + "learning_rate": 2.3517246764684138e-05, + "loss": 1.441, + "step": 36140 + }, + { + "epoch": 1387.7358490566037, + "grad_norm": 3.491710911332113, + "learning_rate": 2.3479594943261428e-05, + "loss": 1.4341, + "step": 36160 + }, + { + "epoch": 1388.4905660377358, + "grad_norm": 1.6931483101249463, + "learning_rate": 2.3441968920463175e-05, + "loss": 1.4059, + "step": 36180 + }, + { + "epoch": 1389.245283018868, + "grad_norm": 1.5814288881168233, + "learning_rate": 2.340436874732406e-05, + "loss": 1.4494, + "step": 36200 + }, + { + "epoch": 1390.0, + "grad_norm": 1.7550476965929234, + "learning_rate": 2.3366794474843636e-05, + "loss": 1.4461, + "step": 36220 + }, + { + "epoch": 1390.754716981132, + "grad_norm": 1.6037325519139611, + "learning_rate": 2.332924615398638e-05, + "loss": 1.4324, + "step": 36240 + }, + { + "epoch": 1391.5094339622642, + "grad_norm": 1.5872440902961078, + "learning_rate": 2.3291723835681542e-05, + "loss": 1.4229, + "step": 36260 + }, + { + "epoch": 1392.2641509433963, + "grad_norm": 1.6075974238110624, + "learning_rate": 2.3254227570823088e-05, + "loss": 1.4319, + "step": 36280 + }, + { + "epoch": 1393.0188679245282, + "grad_norm": 1.664082496030561, + "learning_rate": 2.3216757410269688e-05, + "loss": 1.4133, + "step": 36300 + }, + { + "epoch": 1393.7735849056603, + "grad_norm": 1.868185444331913, + "learning_rate": 2.3179313404844556e-05, + "loss": 1.4303, + "step": 36320 + }, + { + "epoch": 1394.5283018867924, + "grad_norm": 1.5709216565532331, + "learning_rate": 2.314189560533549e-05, + "loss": 1.4136, + "step": 36340 + }, + { + "epoch": 1395.2830188679245, + "grad_norm": 1.6461901097795721, + "learning_rate": 2.3104504062494673e-05, + "loss": 1.4359, + "step": 36360 + }, + { + "epoch": 1396.0377358490566, + "grad_norm": 1.4737937485692245, + "learning_rate": 2.306713882703874e-05, + "loss": 1.4417, + "step": 36380 + }, + { + "epoch": 1396.7924528301887, + "grad_norm": 1.62600468664324, + "learning_rate": 2.3029799949648578e-05, + "loss": 1.4471, + "step": 36400 + }, + { + "epoch": 1397.5471698113208, + "grad_norm": 2.4473530264247914, + "learning_rate": 2.2992487480969405e-05, + "loss": 1.4239, + "step": 36420 + }, + { + "epoch": 1398.301886792453, + "grad_norm": 1.451788707298732, + "learning_rate": 2.295520147161054e-05, + "loss": 1.4213, + "step": 36440 + }, + { + "epoch": 1399.0566037735848, + "grad_norm": 1.6561495842890779, + "learning_rate": 2.2917941972145448e-05, + "loss": 1.4289, + "step": 36460 + }, + { + "epoch": 1399.811320754717, + "grad_norm": 1.7199804756862742, + "learning_rate": 2.288070903311165e-05, + "loss": 1.4089, + "step": 36480 + }, + { + "epoch": 1400.566037735849, + "grad_norm": 1.3767860468778748, + "learning_rate": 2.2843502705010602e-05, + "loss": 1.43, + "step": 36500 + }, + { + "epoch": 1401.3207547169811, + "grad_norm": 1.629044071752712, + "learning_rate": 2.2806323038307724e-05, + "loss": 1.4353, + "step": 36520 + }, + { + "epoch": 1402.0754716981132, + "grad_norm": 1.5402931594748135, + "learning_rate": 2.2769170083432224e-05, + "loss": 1.4002, + "step": 36540 + }, + { + "epoch": 1402.8301886792453, + "grad_norm": 1.6851610727649395, + "learning_rate": 2.273204389077707e-05, + "loss": 1.4303, + "step": 36560 + }, + { + "epoch": 1403.5849056603774, + "grad_norm": 1.6351932980143555, + "learning_rate": 2.2694944510698992e-05, + "loss": 1.4324, + "step": 36580 + }, + { + "epoch": 1404.3396226415093, + "grad_norm": 1.3360407707287731, + "learning_rate": 2.265787199351829e-05, + "loss": 1.4296, + "step": 36600 + }, + { + "epoch": 1405.0943396226414, + "grad_norm": 1.6229856547835415, + "learning_rate": 2.2620826389518878e-05, + "loss": 1.4132, + "step": 36620 + }, + { + "epoch": 1405.8490566037735, + "grad_norm": 1.5762261444691155, + "learning_rate": 2.258380774894813e-05, + "loss": 1.4189, + "step": 36640 + }, + { + "epoch": 1406.6037735849056, + "grad_norm": 1.6330786646124598, + "learning_rate": 2.254681612201684e-05, + "loss": 1.4229, + "step": 36660 + }, + { + "epoch": 1407.3584905660377, + "grad_norm": 1.6074464661210397, + "learning_rate": 2.2509851558899212e-05, + "loss": 1.4438, + "step": 36680 + }, + { + "epoch": 1408.1132075471698, + "grad_norm": 2.912277484153031, + "learning_rate": 2.2472914109732686e-05, + "loss": 1.4195, + "step": 36700 + }, + { + "epoch": 1408.867924528302, + "grad_norm": 1.6223740817719732, + "learning_rate": 2.2436003824617963e-05, + "loss": 1.4099, + "step": 36720 + }, + { + "epoch": 1409.622641509434, + "grad_norm": 1.766781857646511, + "learning_rate": 2.2399120753618896e-05, + "loss": 1.4168, + "step": 36740 + }, + { + "epoch": 1410.377358490566, + "grad_norm": 1.5296965456959557, + "learning_rate": 2.2362264946762392e-05, + "loss": 1.4118, + "step": 36760 + }, + { + "epoch": 1411.132075471698, + "grad_norm": 1.6610041335566879, + "learning_rate": 2.232543645403842e-05, + "loss": 1.4166, + "step": 36780 + }, + { + "epoch": 1411.8867924528302, + "grad_norm": 1.5205836616470723, + "learning_rate": 2.228863532539987e-05, + "loss": 1.4246, + "step": 36800 + }, + { + "epoch": 1412.6415094339623, + "grad_norm": 2.018497485986653, + "learning_rate": 2.2251861610762556e-05, + "loss": 1.4219, + "step": 36820 + }, + { + "epoch": 1413.3962264150944, + "grad_norm": 1.495393210690481, + "learning_rate": 2.221511536000505e-05, + "loss": 1.4201, + "step": 36840 + }, + { + "epoch": 1414.1509433962265, + "grad_norm": 1.5817177891641536, + "learning_rate": 2.2178396622968714e-05, + "loss": 1.4301, + "step": 36860 + }, + { + "epoch": 1414.9056603773586, + "grad_norm": 1.5602680564678848, + "learning_rate": 2.2141705449457588e-05, + "loss": 1.4246, + "step": 36880 + }, + { + "epoch": 1415.6603773584907, + "grad_norm": 1.5687723652001904, + "learning_rate": 2.2105041889238327e-05, + "loss": 1.4291, + "step": 36900 + }, + { + "epoch": 1416.4150943396226, + "grad_norm": 1.6516438298835592, + "learning_rate": 2.2068405992040127e-05, + "loss": 1.4186, + "step": 36920 + }, + { + "epoch": 1417.1698113207547, + "grad_norm": 1.6972963029742167, + "learning_rate": 2.2031797807554646e-05, + "loss": 1.4026, + "step": 36940 + }, + { + "epoch": 1417.9245283018868, + "grad_norm": 1.6936263753645908, + "learning_rate": 2.1995217385435962e-05, + "loss": 1.3882, + "step": 36960 + }, + { + "epoch": 1418.6792452830189, + "grad_norm": 1.697372534880421, + "learning_rate": 2.1958664775300517e-05, + "loss": 1.4228, + "step": 36980 + }, + { + "epoch": 1419.433962264151, + "grad_norm": 1.4972148012217616, + "learning_rate": 2.192214002672703e-05, + "loss": 1.3961, + "step": 37000 + }, + { + "epoch": 1420.188679245283, + "grad_norm": 1.6926137674291781, + "learning_rate": 2.1885643189256404e-05, + "loss": 1.4005, + "step": 37020 + }, + { + "epoch": 1420.9433962264152, + "grad_norm": 1.6182171449313734, + "learning_rate": 2.1849174312391693e-05, + "loss": 1.3939, + "step": 37040 + }, + { + "epoch": 1421.698113207547, + "grad_norm": 1.6235165658387523, + "learning_rate": 2.181273344559802e-05, + "loss": 1.414, + "step": 37060 + }, + { + "epoch": 1422.4528301886792, + "grad_norm": 1.7354641628437306, + "learning_rate": 2.1776320638302533e-05, + "loss": 1.4039, + "step": 37080 + }, + { + "epoch": 1423.2075471698113, + "grad_norm": 1.7598777416483105, + "learning_rate": 2.1739935939894332e-05, + "loss": 1.4319, + "step": 37100 + }, + { + "epoch": 1423.9622641509434, + "grad_norm": 1.6119817066147992, + "learning_rate": 2.170357939972436e-05, + "loss": 1.4083, + "step": 37120 + }, + { + "epoch": 1424.7169811320755, + "grad_norm": 1.5177195143064601, + "learning_rate": 2.1667251067105383e-05, + "loss": 1.4084, + "step": 37140 + }, + { + "epoch": 1425.4716981132076, + "grad_norm": 1.6531623474873094, + "learning_rate": 2.1630950991311884e-05, + "loss": 1.3961, + "step": 37160 + }, + { + "epoch": 1426.2264150943397, + "grad_norm": 1.9866189092494402, + "learning_rate": 2.159467922158006e-05, + "loss": 1.4205, + "step": 37180 + }, + { + "epoch": 1426.9811320754718, + "grad_norm": 1.6409536663163726, + "learning_rate": 2.15584358071077e-05, + "loss": 1.4065, + "step": 37200 + }, + { + "epoch": 1427.7358490566037, + "grad_norm": 1.5972136032609723, + "learning_rate": 2.1522220797054117e-05, + "loss": 1.3999, + "step": 37220 + }, + { + "epoch": 1428.4905660377358, + "grad_norm": 1.7176147072411343, + "learning_rate": 2.1486034240540095e-05, + "loss": 1.4077, + "step": 37240 + }, + { + "epoch": 1429.245283018868, + "grad_norm": 2.2258677114656655, + "learning_rate": 2.1449876186647868e-05, + "loss": 1.4174, + "step": 37260 + }, + { + "epoch": 1430.0, + "grad_norm": 1.702909141767608, + "learning_rate": 2.1413746684420938e-05, + "loss": 1.3745, + "step": 37280 + }, + { + "epoch": 1430.754716981132, + "grad_norm": 2.1998514759828915, + "learning_rate": 2.1377645782864164e-05, + "loss": 1.421, + "step": 37300 + }, + { + "epoch": 1431.5094339622642, + "grad_norm": 1.4634078104497494, + "learning_rate": 2.134157353094355e-05, + "loss": 1.4219, + "step": 37320 + }, + { + "epoch": 1432.2641509433963, + "grad_norm": 1.7232746233155163, + "learning_rate": 2.1305529977586244e-05, + "loss": 1.4236, + "step": 37340 + }, + { + "epoch": 1433.0188679245282, + "grad_norm": 1.7372503788909404, + "learning_rate": 2.1269515171680505e-05, + "loss": 1.391, + "step": 37360 + }, + { + "epoch": 1433.7735849056603, + "grad_norm": 1.8471672382610358, + "learning_rate": 2.1233529162075586e-05, + "loss": 1.4087, + "step": 37380 + }, + { + "epoch": 1434.5283018867924, + "grad_norm": 1.6217649320497987, + "learning_rate": 2.1197571997581665e-05, + "loss": 1.4239, + "step": 37400 + }, + { + "epoch": 1435.2830188679245, + "grad_norm": 1.5296478429731253, + "learning_rate": 2.1161643726969807e-05, + "loss": 1.3958, + "step": 37420 + }, + { + "epoch": 1436.0377358490566, + "grad_norm": 1.560888539193858, + "learning_rate": 2.1125744398971865e-05, + "loss": 1.3979, + "step": 37440 + }, + { + "epoch": 1436.7924528301887, + "grad_norm": 4.524184692042414, + "learning_rate": 2.1089874062280467e-05, + "loss": 1.4068, + "step": 37460 + }, + { + "epoch": 1437.5471698113208, + "grad_norm": 1.9907802095010148, + "learning_rate": 2.1054032765548943e-05, + "loss": 1.4128, + "step": 37480 + }, + { + "epoch": 1438.301886792453, + "grad_norm": 1.5158745007996666, + "learning_rate": 2.1018220557391152e-05, + "loss": 1.4206, + "step": 37500 + }, + { + "epoch": 1439.0566037735848, + "grad_norm": 1.922346737191028, + "learning_rate": 2.0982437486381567e-05, + "loss": 1.4155, + "step": 37520 + }, + { + "epoch": 1439.811320754717, + "grad_norm": 1.6671399833246607, + "learning_rate": 2.094668360105509e-05, + "loss": 1.4052, + "step": 37540 + }, + { + "epoch": 1440.566037735849, + "grad_norm": 1.6551308179910114, + "learning_rate": 2.0910958949907086e-05, + "loss": 1.3986, + "step": 37560 + }, + { + "epoch": 1441.3207547169811, + "grad_norm": 1.5091823329163863, + "learning_rate": 2.087526358139325e-05, + "loss": 1.3842, + "step": 37580 + }, + { + "epoch": 1442.0754716981132, + "grad_norm": 1.5775979624954766, + "learning_rate": 2.0839597543929547e-05, + "loss": 1.3695, + "step": 37600 + }, + { + "epoch": 1442.8301886792453, + "grad_norm": 1.65888589339979, + "learning_rate": 2.0803960885892166e-05, + "loss": 1.4212, + "step": 37620 + }, + { + "epoch": 1443.5849056603774, + "grad_norm": 1.7548961067858515, + "learning_rate": 2.0768353655617437e-05, + "loss": 1.4113, + "step": 37640 + }, + { + "epoch": 1444.3396226415093, + "grad_norm": 1.6568444527605615, + "learning_rate": 2.0732775901401787e-05, + "loss": 1.4097, + "step": 37660 + }, + { + "epoch": 1445.0943396226414, + "grad_norm": 1.569007851847122, + "learning_rate": 2.0697227671501686e-05, + "loss": 1.4025, + "step": 37680 + }, + { + "epoch": 1445.8490566037735, + "grad_norm": 2.054795832820314, + "learning_rate": 2.0661709014133507e-05, + "loss": 1.379, + "step": 37700 + }, + { + "epoch": 1446.6037735849056, + "grad_norm": 1.825419627550906, + "learning_rate": 2.0626219977473546e-05, + "loss": 1.4141, + "step": 37720 + }, + { + "epoch": 1447.3584905660377, + "grad_norm": 1.5477059924334846, + "learning_rate": 2.05907606096579e-05, + "loss": 1.3764, + "step": 37740 + }, + { + "epoch": 1448.1132075471698, + "grad_norm": 1.6756003225697567, + "learning_rate": 2.0555330958782456e-05, + "loss": 1.3943, + "step": 37760 + }, + { + "epoch": 1448.867924528302, + "grad_norm": 1.6181024178942431, + "learning_rate": 2.0519931072902775e-05, + "loss": 1.3828, + "step": 37780 + }, + { + "epoch": 1449.622641509434, + "grad_norm": 1.5075137093108786, + "learning_rate": 2.0484561000034048e-05, + "loss": 1.3993, + "step": 37800 + }, + { + "epoch": 1450.377358490566, + "grad_norm": 1.7611806649373956, + "learning_rate": 2.0449220788151017e-05, + "loss": 1.4025, + "step": 37820 + }, + { + "epoch": 1451.132075471698, + "grad_norm": 1.5899695714047575, + "learning_rate": 2.0413910485187918e-05, + "loss": 1.4011, + "step": 37840 + }, + { + "epoch": 1451.8867924528302, + "grad_norm": 1.9368764939203147, + "learning_rate": 2.0378630139038477e-05, + "loss": 1.3914, + "step": 37860 + }, + { + "epoch": 1452.6415094339623, + "grad_norm": 2.2298811573087938, + "learning_rate": 2.0343379797555718e-05, + "loss": 1.4096, + "step": 37880 + }, + { + "epoch": 1453.3962264150944, + "grad_norm": 1.8812446095800621, + "learning_rate": 2.0308159508552003e-05, + "loss": 1.3994, + "step": 37900 + }, + { + "epoch": 1454.1509433962265, + "grad_norm": 1.546640573436516, + "learning_rate": 2.0272969319798898e-05, + "loss": 1.3901, + "step": 37920 + }, + { + "epoch": 1454.9056603773586, + "grad_norm": 1.6385943250375863, + "learning_rate": 2.0237809279027187e-05, + "loss": 1.3954, + "step": 37940 + }, + { + "epoch": 1455.6603773584907, + "grad_norm": 1.5716563974399815, + "learning_rate": 2.0202679433926757e-05, + "loss": 1.3935, + "step": 37960 + }, + { + "epoch": 1456.4150943396226, + "grad_norm": 1.735984609092, + "learning_rate": 2.0167579832146505e-05, + "loss": 1.4118, + "step": 37980 + }, + { + "epoch": 1457.1698113207547, + "grad_norm": 1.7313373844094564, + "learning_rate": 2.013251052129433e-05, + "loss": 1.3767, + "step": 38000 + }, + { + "epoch": 1457.9245283018868, + "grad_norm": 1.7960534040522838, + "learning_rate": 2.0097471548937024e-05, + "loss": 1.3803, + "step": 38020 + }, + { + "epoch": 1458.6792452830189, + "grad_norm": 2.5283310401144434, + "learning_rate": 2.0062462962600258e-05, + "loss": 1.3763, + "step": 38040 + }, + { + "epoch": 1459.433962264151, + "grad_norm": 1.5401697781512245, + "learning_rate": 2.0027484809768506e-05, + "loss": 1.3768, + "step": 38060 + }, + { + "epoch": 1460.188679245283, + "grad_norm": 2.291171375246112, + "learning_rate": 1.9992537137884905e-05, + "loss": 1.389, + "step": 38080 + }, + { + "epoch": 1460.9433962264152, + "grad_norm": 1.6878149956470094, + "learning_rate": 1.9957619994351278e-05, + "loss": 1.3978, + "step": 38100 + }, + { + "epoch": 1461.698113207547, + "grad_norm": 1.9279881821004916, + "learning_rate": 1.9922733426528033e-05, + "loss": 1.3576, + "step": 38120 + }, + { + "epoch": 1462.4528301886792, + "grad_norm": 1.6593792348690906, + "learning_rate": 1.9887877481734122e-05, + "loss": 1.3827, + "step": 38140 + }, + { + "epoch": 1463.2075471698113, + "grad_norm": 1.6870370599265458, + "learning_rate": 1.9853052207246967e-05, + "loss": 1.3498, + "step": 38160 + }, + { + "epoch": 1463.9622641509434, + "grad_norm": 1.5201114526632646, + "learning_rate": 1.981825765030236e-05, + "loss": 1.3972, + "step": 38180 + }, + { + "epoch": 1464.7169811320755, + "grad_norm": 1.676738216954013, + "learning_rate": 1.9783493858094444e-05, + "loss": 1.3751, + "step": 38200 + }, + { + "epoch": 1465.4716981132076, + "grad_norm": 1.7174190166635537, + "learning_rate": 1.9748760877775622e-05, + "loss": 1.3723, + "step": 38220 + }, + { + "epoch": 1466.2264150943397, + "grad_norm": 1.6832142484740018, + "learning_rate": 1.9714058756456533e-05, + "loss": 1.383, + "step": 38240 + }, + { + "epoch": 1466.9811320754718, + "grad_norm": 1.6409050107329164, + "learning_rate": 1.9679387541205946e-05, + "loss": 1.3868, + "step": 38260 + }, + { + "epoch": 1467.7358490566037, + "grad_norm": 1.870348325922022, + "learning_rate": 1.96447472790507e-05, + "loss": 1.4093, + "step": 38280 + }, + { + "epoch": 1468.4905660377358, + "grad_norm": 1.622451000807429, + "learning_rate": 1.9610138016975643e-05, + "loss": 1.3908, + "step": 38300 + }, + { + "epoch": 1469.245283018868, + "grad_norm": 1.5580142525601877, + "learning_rate": 1.9575559801923602e-05, + "loss": 1.3519, + "step": 38320 + }, + { + "epoch": 1470.0, + "grad_norm": 5.308564996067893, + "learning_rate": 1.95410126807953e-05, + "loss": 1.3806, + "step": 38340 + }, + { + "epoch": 1470.754716981132, + "grad_norm": 1.697128624848967, + "learning_rate": 1.9506496700449247e-05, + "loss": 1.4021, + "step": 38360 + }, + { + "epoch": 1471.5094339622642, + "grad_norm": 1.6407506564774348, + "learning_rate": 1.9472011907701736e-05, + "loss": 1.3889, + "step": 38380 + }, + { + "epoch": 1472.2641509433963, + "grad_norm": 2.5227247039872567, + "learning_rate": 1.9437558349326745e-05, + "loss": 1.3656, + "step": 38400 + }, + { + "epoch": 1473.0188679245282, + "grad_norm": 2.0102914189329155, + "learning_rate": 1.9403136072055903e-05, + "loss": 1.3631, + "step": 38420 + }, + { + "epoch": 1473.7735849056603, + "grad_norm": 1.8549253610315775, + "learning_rate": 1.9368745122578427e-05, + "loss": 1.3835, + "step": 38440 + }, + { + "epoch": 1474.5283018867924, + "grad_norm": 1.7474837425802672, + "learning_rate": 1.9334385547541004e-05, + "loss": 1.3876, + "step": 38460 + }, + { + "epoch": 1475.2830188679245, + "grad_norm": 1.4221446206180013, + "learning_rate": 1.930005739354778e-05, + "loss": 1.3875, + "step": 38480 + }, + { + "epoch": 1476.0377358490566, + "grad_norm": 1.567704112230289, + "learning_rate": 1.926576070716028e-05, + "loss": 1.3787, + "step": 38500 + }, + { + "epoch": 1476.7924528301887, + "grad_norm": 1.8092743011121888, + "learning_rate": 1.9231495534897356e-05, + "loss": 1.3746, + "step": 38520 + }, + { + "epoch": 1477.5471698113208, + "grad_norm": 1.9871542434365639, + "learning_rate": 1.919726192323512e-05, + "loss": 1.4062, + "step": 38540 + }, + { + "epoch": 1478.301886792453, + "grad_norm": 2.100192891688555, + "learning_rate": 1.916305991860687e-05, + "loss": 1.372, + "step": 38560 + }, + { + "epoch": 1479.0566037735848, + "grad_norm": 1.689968827696773, + "learning_rate": 1.912888956740302e-05, + "loss": 1.3994, + "step": 38580 + }, + { + "epoch": 1479.811320754717, + "grad_norm": 1.59619952456533, + "learning_rate": 1.9094750915971053e-05, + "loss": 1.3547, + "step": 38600 + }, + { + "epoch": 1480.566037735849, + "grad_norm": 3.206605320072948, + "learning_rate": 1.9060644010615473e-05, + "loss": 1.4052, + "step": 38620 + }, + { + "epoch": 1481.3207547169811, + "grad_norm": 1.5795369303879008, + "learning_rate": 1.9026568897597735e-05, + "loss": 1.3921, + "step": 38640 + }, + { + "epoch": 1482.0754716981132, + "grad_norm": 2.1910690965934467, + "learning_rate": 1.8992525623136132e-05, + "loss": 1.3563, + "step": 38660 + }, + { + "epoch": 1482.8301886792453, + "grad_norm": 1.5353645456337577, + "learning_rate": 1.8958514233405793e-05, + "loss": 1.4077, + "step": 38680 + }, + { + "epoch": 1483.5849056603774, + "grad_norm": 1.7836996022414107, + "learning_rate": 1.8924534774538593e-05, + "loss": 1.3824, + "step": 38700 + }, + { + "epoch": 1484.3396226415093, + "grad_norm": 1.6136317181444138, + "learning_rate": 1.8890587292623113e-05, + "loss": 1.3511, + "step": 38720 + }, + { + "epoch": 1485.0943396226414, + "grad_norm": 1.8211866581007339, + "learning_rate": 1.8856671833704565e-05, + "loss": 1.3725, + "step": 38740 + }, + { + "epoch": 1485.8490566037735, + "grad_norm": 1.5979573815344084, + "learning_rate": 1.8822788443784704e-05, + "loss": 1.3571, + "step": 38760 + }, + { + "epoch": 1486.6037735849056, + "grad_norm": 1.6365464316772047, + "learning_rate": 1.878893716882177e-05, + "loss": 1.3588, + "step": 38780 + }, + { + "epoch": 1487.3584905660377, + "grad_norm": 2.811912712405292, + "learning_rate": 1.8755118054730514e-05, + "loss": 1.3823, + "step": 38800 + }, + { + "epoch": 1488.1132075471698, + "grad_norm": 1.684830593576563, + "learning_rate": 1.8721331147381986e-05, + "loss": 1.3604, + "step": 38820 + }, + { + "epoch": 1488.867924528302, + "grad_norm": 1.5101192259883982, + "learning_rate": 1.868757649260362e-05, + "loss": 1.3712, + "step": 38840 + }, + { + "epoch": 1489.622641509434, + "grad_norm": 1.8516527403548584, + "learning_rate": 1.8653854136179047e-05, + "loss": 1.3576, + "step": 38860 + }, + { + "epoch": 1490.377358490566, + "grad_norm": 1.5630443819078437, + "learning_rate": 1.8620164123848113e-05, + "loss": 1.3729, + "step": 38880 + }, + { + "epoch": 1491.132075471698, + "grad_norm": 1.9558078371048477, + "learning_rate": 1.8586506501306792e-05, + "loss": 1.3466, + "step": 38900 + }, + { + "epoch": 1491.8867924528302, + "grad_norm": 1.6581722869425195, + "learning_rate": 1.8552881314207158e-05, + "loss": 1.3547, + "step": 38920 + }, + { + "epoch": 1492.6415094339623, + "grad_norm": 1.9162311420660751, + "learning_rate": 1.8519288608157236e-05, + "loss": 1.3995, + "step": 38940 + }, + { + "epoch": 1493.3962264150944, + "grad_norm": 2.8463480242853874, + "learning_rate": 1.8485728428721025e-05, + "loss": 1.3609, + "step": 38960 + }, + { + "epoch": 1494.1509433962265, + "grad_norm": 1.7832047879021928, + "learning_rate": 1.845220082141838e-05, + "loss": 1.3966, + "step": 38980 + }, + { + "epoch": 1494.9056603773586, + "grad_norm": 1.628697490406908, + "learning_rate": 1.841870583172502e-05, + "loss": 1.3577, + "step": 39000 + }, + { + "epoch": 1495.6603773584907, + "grad_norm": 1.7499682485349517, + "learning_rate": 1.8385243505072403e-05, + "loss": 1.3634, + "step": 39020 + }, + { + "epoch": 1496.4150943396226, + "grad_norm": 2.369232208734949, + "learning_rate": 1.835181388684767e-05, + "loss": 1.3804, + "step": 39040 + }, + { + "epoch": 1497.1698113207547, + "grad_norm": 2.002186669217615, + "learning_rate": 1.8318417022393614e-05, + "loss": 1.3775, + "step": 39060 + }, + { + "epoch": 1497.9245283018868, + "grad_norm": 1.7745981101584183, + "learning_rate": 1.8285052957008572e-05, + "loss": 1.3678, + "step": 39080 + }, + { + "epoch": 1498.6792452830189, + "grad_norm": 2.3506034698380027, + "learning_rate": 1.825172173594644e-05, + "loss": 1.3819, + "step": 39100 + }, + { + "epoch": 1499.433962264151, + "grad_norm": 1.5587811175176152, + "learning_rate": 1.8218423404416543e-05, + "loss": 1.3623, + "step": 39120 + }, + { + "epoch": 1500.188679245283, + "grad_norm": 1.5407388891782507, + "learning_rate": 1.818515800758359e-05, + "loss": 1.3737, + "step": 39140 + }, + { + "epoch": 1500.9433962264152, + "grad_norm": 1.7105290658502008, + "learning_rate": 1.8151925590567624e-05, + "loss": 1.3416, + "step": 39160 + }, + { + "epoch": 1501.698113207547, + "grad_norm": 2.1160472894699973, + "learning_rate": 1.811872619844394e-05, + "loss": 1.3596, + "step": 39180 + }, + { + "epoch": 1502.4528301886792, + "grad_norm": 1.7134114803577327, + "learning_rate": 1.8085559876243068e-05, + "loss": 1.3486, + "step": 39200 + }, + { + "epoch": 1503.2075471698113, + "grad_norm": 1.5742520539626361, + "learning_rate": 1.805242666895068e-05, + "loss": 1.3737, + "step": 39220 + }, + { + "epoch": 1503.9622641509434, + "grad_norm": 1.6841300143409803, + "learning_rate": 1.8019326621507504e-05, + "loss": 1.3593, + "step": 39240 + }, + { + "epoch": 1504.7169811320755, + "grad_norm": 1.4678089134086005, + "learning_rate": 1.7986259778809304e-05, + "loss": 1.3332, + "step": 39260 + }, + { + "epoch": 1505.4716981132076, + "grad_norm": 1.5583137022685134, + "learning_rate": 1.7953226185706828e-05, + "loss": 1.3532, + "step": 39280 + }, + { + "epoch": 1506.2264150943397, + "grad_norm": 1.754522974870956, + "learning_rate": 1.7920225887005686e-05, + "loss": 1.3969, + "step": 39300 + }, + { + "epoch": 1506.9811320754718, + "grad_norm": 2.329959975485945, + "learning_rate": 1.788725892746638e-05, + "loss": 1.3693, + "step": 39320 + }, + { + "epoch": 1507.7358490566037, + "grad_norm": 1.9736365954487893, + "learning_rate": 1.7854325351804138e-05, + "loss": 1.3545, + "step": 39340 + }, + { + "epoch": 1508.4905660377358, + "grad_norm": 2.0873147005956274, + "learning_rate": 1.782142520468893e-05, + "loss": 1.357, + "step": 39360 + }, + { + "epoch": 1509.245283018868, + "grad_norm": 1.8565982391914584, + "learning_rate": 1.7788558530745406e-05, + "loss": 1.3574, + "step": 39380 + }, + { + "epoch": 1510.0, + "grad_norm": 1.3940404402455406, + "learning_rate": 1.7755725374552767e-05, + "loss": 1.3322, + "step": 39400 + }, + { + "epoch": 1510.754716981132, + "grad_norm": 1.6757375238937267, + "learning_rate": 1.772292578064481e-05, + "loss": 1.3562, + "step": 39420 + }, + { + "epoch": 1511.5094339622642, + "grad_norm": 2.021397727623104, + "learning_rate": 1.769015979350977e-05, + "loss": 1.3494, + "step": 39440 + }, + { + "epoch": 1512.2641509433963, + "grad_norm": 1.9073518931594837, + "learning_rate": 1.7657427457590277e-05, + "loss": 1.3469, + "step": 39460 + }, + { + "epoch": 1513.0188679245282, + "grad_norm": 1.5299281651503949, + "learning_rate": 1.7624728817283386e-05, + "loss": 1.3347, + "step": 39480 + }, + { + "epoch": 1513.7735849056603, + "grad_norm": 1.6171286973533487, + "learning_rate": 1.7592063916940385e-05, + "loss": 1.3781, + "step": 39500 + }, + { + "epoch": 1514.5283018867924, + "grad_norm": 2.3151167774892283, + "learning_rate": 1.7559432800866844e-05, + "loss": 1.3389, + "step": 39520 + }, + { + "epoch": 1515.2830188679245, + "grad_norm": 1.8404517143557557, + "learning_rate": 1.752683551332248e-05, + "loss": 1.3809, + "step": 39540 + }, + { + "epoch": 1516.0377358490566, + "grad_norm": 1.80165740372062, + "learning_rate": 1.749427209852112e-05, + "loss": 1.3647, + "step": 39560 + }, + { + "epoch": 1516.7924528301887, + "grad_norm": 1.5503005965319303, + "learning_rate": 1.7461742600630684e-05, + "loss": 1.3553, + "step": 39580 + }, + { + "epoch": 1517.5471698113208, + "grad_norm": 1.7389286642537964, + "learning_rate": 1.7429247063773047e-05, + "loss": 1.3566, + "step": 39600 + }, + { + "epoch": 1518.301886792453, + "grad_norm": 1.5514338805704833, + "learning_rate": 1.7396785532024062e-05, + "loss": 1.3771, + "step": 39620 + }, + { + "epoch": 1519.0566037735848, + "grad_norm": 1.738553891820026, + "learning_rate": 1.7364358049413427e-05, + "loss": 1.3608, + "step": 39640 + }, + { + "epoch": 1519.811320754717, + "grad_norm": 2.2590021667446476, + "learning_rate": 1.7331964659924647e-05, + "loss": 1.3594, + "step": 39660 + }, + { + "epoch": 1520.566037735849, + "grad_norm": 1.8008534873454645, + "learning_rate": 1.729960540749503e-05, + "loss": 1.3446, + "step": 39680 + }, + { + "epoch": 1521.3207547169811, + "grad_norm": 1.9823359457338208, + "learning_rate": 1.7267280336015543e-05, + "loss": 1.3604, + "step": 39700 + }, + { + "epoch": 1522.0754716981132, + "grad_norm": 1.6630862297023916, + "learning_rate": 1.723498948933081e-05, + "loss": 1.3831, + "step": 39720 + }, + { + "epoch": 1522.8301886792453, + "grad_norm": 1.9271729195919085, + "learning_rate": 1.720273291123901e-05, + "loss": 1.3571, + "step": 39740 + }, + { + "epoch": 1523.5849056603774, + "grad_norm": 1.6944904437475812, + "learning_rate": 1.7170510645491884e-05, + "loss": 1.3845, + "step": 39760 + }, + { + "epoch": 1524.3396226415093, + "grad_norm": 2.0111059446030164, + "learning_rate": 1.7138322735794582e-05, + "loss": 1.3464, + "step": 39780 + }, + { + "epoch": 1525.0943396226414, + "grad_norm": 1.6636863494806655, + "learning_rate": 1.7106169225805703e-05, + "loss": 1.3472, + "step": 39800 + }, + { + "epoch": 1525.8490566037735, + "grad_norm": 1.654778862655826, + "learning_rate": 1.7074050159137155e-05, + "loss": 1.3517, + "step": 39820 + }, + { + "epoch": 1526.6037735849056, + "grad_norm": 1.9255620043148591, + "learning_rate": 1.7041965579354115e-05, + "loss": 1.359, + "step": 39840 + }, + { + "epoch": 1527.3584905660377, + "grad_norm": 1.7612527068488755, + "learning_rate": 1.7009915529975046e-05, + "loss": 1.3535, + "step": 39860 + }, + { + "epoch": 1528.1132075471698, + "grad_norm": 1.6440495946289901, + "learning_rate": 1.69779000544715e-05, + "loss": 1.3275, + "step": 39880 + }, + { + "epoch": 1528.867924528302, + "grad_norm": 1.8436152132956103, + "learning_rate": 1.6945919196268195e-05, + "loss": 1.3269, + "step": 39900 + }, + { + "epoch": 1529.622641509434, + "grad_norm": 1.5249165119761414, + "learning_rate": 1.6913972998742855e-05, + "loss": 1.3528, + "step": 39920 + }, + { + "epoch": 1530.377358490566, + "grad_norm": 1.7116936271233165, + "learning_rate": 1.6882061505226197e-05, + "loss": 1.3351, + "step": 39940 + }, + { + "epoch": 1531.132075471698, + "grad_norm": 1.9127818443391411, + "learning_rate": 1.68501847590019e-05, + "loss": 1.3649, + "step": 39960 + }, + { + "epoch": 1531.8867924528302, + "grad_norm": 1.7968703663627887, + "learning_rate": 1.681834280330646e-05, + "loss": 1.3664, + "step": 39980 + }, + { + "epoch": 1532.6415094339623, + "grad_norm": 2.01086719476703, + "learning_rate": 1.6786535681329242e-05, + "loss": 1.3354, + "step": 40000 + }, + { + "epoch": 1533.3962264150944, + "grad_norm": 1.8971146877595166, + "learning_rate": 1.6754763436212318e-05, + "loss": 1.3459, + "step": 40020 + }, + { + "epoch": 1534.1509433962265, + "grad_norm": 1.5538558777058122, + "learning_rate": 1.6723026111050465e-05, + "loss": 1.348, + "step": 40040 + }, + { + "epoch": 1534.9056603773586, + "grad_norm": 1.6899684943437072, + "learning_rate": 1.6691323748891116e-05, + "loss": 1.3219, + "step": 40060 + }, + { + "epoch": 1535.6603773584907, + "grad_norm": 1.5696983044378243, + "learning_rate": 1.6659656392734248e-05, + "loss": 1.3523, + "step": 40080 + }, + { + "epoch": 1536.4150943396226, + "grad_norm": 2.008558955924781, + "learning_rate": 1.6628024085532394e-05, + "loss": 1.3507, + "step": 40100 + }, + { + "epoch": 1537.1698113207547, + "grad_norm": 1.6814099912171956, + "learning_rate": 1.6596426870190517e-05, + "loss": 1.3271, + "step": 40120 + }, + { + "epoch": 1537.9245283018868, + "grad_norm": 1.6392939792056798, + "learning_rate": 1.6564864789566017e-05, + "loss": 1.3628, + "step": 40140 + }, + { + "epoch": 1538.6792452830189, + "grad_norm": 1.6937327457602671, + "learning_rate": 1.6533337886468593e-05, + "loss": 1.3457, + "step": 40160 + }, + { + "epoch": 1539.433962264151, + "grad_norm": 2.0312461746808674, + "learning_rate": 1.650184620366025e-05, + "loss": 1.345, + "step": 40180 + }, + { + "epoch": 1540.188679245283, + "grad_norm": 1.6324403361347462, + "learning_rate": 1.647038978385525e-05, + "loss": 1.3614, + "step": 40200 + }, + { + "epoch": 1540.9433962264152, + "grad_norm": 1.8937794347785448, + "learning_rate": 1.643896866971998e-05, + "loss": 1.3485, + "step": 40220 + }, + { + "epoch": 1541.698113207547, + "grad_norm": 1.66458657626364, + "learning_rate": 1.6407582903872977e-05, + "loss": 1.3201, + "step": 40240 + }, + { + "epoch": 1542.4528301886792, + "grad_norm": 1.637256903291043, + "learning_rate": 1.637623252888481e-05, + "loss": 1.3287, + "step": 40260 + }, + { + "epoch": 1543.2075471698113, + "grad_norm": 1.771255607485422, + "learning_rate": 1.634491758727804e-05, + "loss": 1.3386, + "step": 40280 + }, + { + "epoch": 1543.9622641509434, + "grad_norm": 2.294826209947056, + "learning_rate": 1.6313638121527195e-05, + "loss": 1.3443, + "step": 40300 + }, + { + "epoch": 1544.7169811320755, + "grad_norm": 1.5369973618999444, + "learning_rate": 1.6282394174058652e-05, + "loss": 1.3199, + "step": 40320 + }, + { + "epoch": 1545.4716981132076, + "grad_norm": 1.7805574251016163, + "learning_rate": 1.6251185787250646e-05, + "loss": 1.3427, + "step": 40340 + }, + { + "epoch": 1546.2264150943397, + "grad_norm": 1.7055546669575088, + "learning_rate": 1.6220013003433163e-05, + "loss": 1.3595, + "step": 40360 + }, + { + "epoch": 1546.9811320754718, + "grad_norm": 1.6493151345521173, + "learning_rate": 1.618887586488787e-05, + "loss": 1.3417, + "step": 40380 + }, + { + "epoch": 1547.7358490566037, + "grad_norm": 1.7099299752279526, + "learning_rate": 1.6157774413848147e-05, + "loss": 1.3286, + "step": 40400 + }, + { + "epoch": 1548.4905660377358, + "grad_norm": 1.6461054638879455, + "learning_rate": 1.61267086924989e-05, + "loss": 1.3651, + "step": 40420 + }, + { + "epoch": 1549.245283018868, + "grad_norm": 2.239209937375333, + "learning_rate": 1.6095678742976643e-05, + "loss": 1.3402, + "step": 40440 + }, + { + "epoch": 1550.0, + "grad_norm": 1.9293560078530108, + "learning_rate": 1.6064684607369317e-05, + "loss": 1.3566, + "step": 40460 + }, + { + "epoch": 1550.754716981132, + "grad_norm": 1.7850904902946119, + "learning_rate": 1.603372632771629e-05, + "loss": 1.3522, + "step": 40480 + }, + { + "epoch": 1551.5094339622642, + "grad_norm": 1.8694667431709797, + "learning_rate": 1.6002803946008334e-05, + "loss": 1.3254, + "step": 40500 + }, + { + "epoch": 1552.2641509433963, + "grad_norm": 1.8970095488709016, + "learning_rate": 1.5971917504187483e-05, + "loss": 1.3456, + "step": 40520 + }, + { + "epoch": 1553.0188679245282, + "grad_norm": 1.473985828394077, + "learning_rate": 1.5941067044147068e-05, + "loss": 1.3425, + "step": 40540 + }, + { + "epoch": 1553.7735849056603, + "grad_norm": 2.4173810451888436, + "learning_rate": 1.591025260773159e-05, + "loss": 1.3616, + "step": 40560 + }, + { + "epoch": 1554.5283018867924, + "grad_norm": 4.978545098723231, + "learning_rate": 1.587947423673667e-05, + "loss": 1.3302, + "step": 40580 + }, + { + "epoch": 1555.2830188679245, + "grad_norm": 1.8210531663627934, + "learning_rate": 1.5848731972909058e-05, + "loss": 1.3208, + "step": 40600 + }, + { + "epoch": 1556.0377358490566, + "grad_norm": 1.7214332519076236, + "learning_rate": 1.5818025857946504e-05, + "loss": 1.3429, + "step": 40620 + }, + { + "epoch": 1556.7924528301887, + "grad_norm": 1.7715531344419837, + "learning_rate": 1.5787355933497722e-05, + "loss": 1.3236, + "step": 40640 + }, + { + "epoch": 1557.5471698113208, + "grad_norm": 1.6774415891925254, + "learning_rate": 1.5756722241162336e-05, + "loss": 1.3038, + "step": 40660 + }, + { + "epoch": 1558.301886792453, + "grad_norm": 1.7378768400910978, + "learning_rate": 1.5726124822490856e-05, + "loss": 1.3393, + "step": 40680 + }, + { + "epoch": 1559.0566037735848, + "grad_norm": 1.5050468895919773, + "learning_rate": 1.569556371898455e-05, + "loss": 1.3169, + "step": 40700 + }, + { + "epoch": 1559.811320754717, + "grad_norm": 1.7494465787076923, + "learning_rate": 1.5665038972095462e-05, + "loss": 1.3219, + "step": 40720 + }, + { + "epoch": 1560.566037735849, + "grad_norm": 1.942070137365104, + "learning_rate": 1.563455062322631e-05, + "loss": 1.3331, + "step": 40740 + }, + { + "epoch": 1561.3207547169811, + "grad_norm": 1.4760834184650184, + "learning_rate": 1.560409871373043e-05, + "loss": 1.3371, + "step": 40760 + }, + { + "epoch": 1562.0754716981132, + "grad_norm": 1.794169465456889, + "learning_rate": 1.5573683284911766e-05, + "loss": 1.361, + "step": 40780 + }, + { + "epoch": 1562.8301886792453, + "grad_norm": 1.5717564295595021, + "learning_rate": 1.5543304378024745e-05, + "loss": 1.3198, + "step": 40800 + }, + { + "epoch": 1563.5849056603774, + "grad_norm": 1.679663629392091, + "learning_rate": 1.5512962034274292e-05, + "loss": 1.3225, + "step": 40820 + }, + { + "epoch": 1564.3396226415093, + "grad_norm": 1.5963236435216681, + "learning_rate": 1.5482656294815706e-05, + "loss": 1.3475, + "step": 40840 + }, + { + "epoch": 1565.0943396226414, + "grad_norm": 1.6282234240786269, + "learning_rate": 1.5452387200754648e-05, + "loss": 1.341, + "step": 40860 + }, + { + "epoch": 1565.8490566037735, + "grad_norm": 1.8356401444891661, + "learning_rate": 1.542215479314709e-05, + "loss": 1.3093, + "step": 40880 + }, + { + "epoch": 1566.6037735849056, + "grad_norm": 1.802086287293627, + "learning_rate": 1.5391959112999222e-05, + "loss": 1.3234, + "step": 40900 + }, + { + "epoch": 1567.3584905660377, + "grad_norm": 1.596543322520551, + "learning_rate": 1.536180020126744e-05, + "loss": 1.3207, + "step": 40920 + }, + { + "epoch": 1568.1132075471698, + "grad_norm": 1.5925040590351016, + "learning_rate": 1.5331678098858253e-05, + "loss": 1.3434, + "step": 40940 + }, + { + "epoch": 1568.867924528302, + "grad_norm": 1.7805635796964523, + "learning_rate": 1.5301592846628236e-05, + "loss": 1.3189, + "step": 40960 + }, + { + "epoch": 1569.622641509434, + "grad_norm": 2.270882122194477, + "learning_rate": 1.5271544485384005e-05, + "loss": 1.3331, + "step": 40980 + }, + { + "epoch": 1570.377358490566, + "grad_norm": 1.5460374945916004, + "learning_rate": 1.524153305588211e-05, + "loss": 1.3307, + "step": 41000 + }, + { + "epoch": 1571.132075471698, + "grad_norm": 1.5240728933202146, + "learning_rate": 1.5211558598829046e-05, + "loss": 1.3261, + "step": 41020 + }, + { + "epoch": 1571.8867924528302, + "grad_norm": 1.6551356652204947, + "learning_rate": 1.518162115488113e-05, + "loss": 1.3444, + "step": 41040 + }, + { + "epoch": 1572.6415094339623, + "grad_norm": 2.3662389207443897, + "learning_rate": 1.5151720764644462e-05, + "loss": 1.3078, + "step": 41060 + }, + { + "epoch": 1573.3962264150944, + "grad_norm": 2.0201453815678336, + "learning_rate": 1.5121857468674923e-05, + "loss": 1.2931, + "step": 41080 + }, + { + "epoch": 1574.1509433962265, + "grad_norm": 2.1833616270471428, + "learning_rate": 1.509203130747807e-05, + "loss": 1.3113, + "step": 41100 + }, + { + "epoch": 1574.9056603773586, + "grad_norm": 1.6606152578025972, + "learning_rate": 1.506224232150908e-05, + "loss": 1.3488, + "step": 41120 + }, + { + "epoch": 1575.6603773584907, + "grad_norm": 2.2621888669728776, + "learning_rate": 1.5032490551172706e-05, + "loss": 1.293, + "step": 41140 + }, + { + "epoch": 1576.4150943396226, + "grad_norm": 1.8118753564672168, + "learning_rate": 1.5002776036823215e-05, + "loss": 1.3288, + "step": 41160 + }, + { + "epoch": 1577.1698113207547, + "grad_norm": 1.4675478833771125, + "learning_rate": 1.4973098818764368e-05, + "loss": 1.3181, + "step": 41180 + }, + { + "epoch": 1577.9245283018868, + "grad_norm": 1.5426718546178322, + "learning_rate": 1.4943458937249337e-05, + "loss": 1.3041, + "step": 41200 + }, + { + "epoch": 1578.6792452830189, + "grad_norm": 1.7915801444691424, + "learning_rate": 1.4913856432480624e-05, + "loss": 1.3244, + "step": 41220 + }, + { + "epoch": 1579.433962264151, + "grad_norm": 1.6284356095147676, + "learning_rate": 1.4884291344610055e-05, + "loss": 1.3623, + "step": 41240 + }, + { + "epoch": 1580.188679245283, + "grad_norm": 2.018808164067539, + "learning_rate": 1.4854763713738692e-05, + "loss": 1.3265, + "step": 41260 + }, + { + "epoch": 1580.9433962264152, + "grad_norm": 1.9982741446146173, + "learning_rate": 1.48252735799168e-05, + "loss": 1.3174, + "step": 41280 + }, + { + "epoch": 1581.698113207547, + "grad_norm": 1.6089408891188777, + "learning_rate": 1.4795820983143804e-05, + "loss": 1.3054, + "step": 41300 + }, + { + "epoch": 1582.4528301886792, + "grad_norm": 1.9322291254142352, + "learning_rate": 1.4766405963368183e-05, + "loss": 1.3288, + "step": 41320 + }, + { + "epoch": 1583.2075471698113, + "grad_norm": 2.1500428010731105, + "learning_rate": 1.4737028560487459e-05, + "loss": 1.3251, + "step": 41340 + }, + { + "epoch": 1583.9622641509434, + "grad_norm": 1.6754742169090076, + "learning_rate": 1.470768881434812e-05, + "loss": 1.3111, + "step": 41360 + }, + { + "epoch": 1584.7169811320755, + "grad_norm": 2.0456793462392864, + "learning_rate": 1.4678386764745604e-05, + "loss": 1.2852, + "step": 41380 + }, + { + "epoch": 1585.4716981132076, + "grad_norm": 1.6265186141557229, + "learning_rate": 1.4649122451424216e-05, + "loss": 1.3246, + "step": 41400 + }, + { + "epoch": 1586.2264150943397, + "grad_norm": 1.7034501168928484, + "learning_rate": 1.4619895914077052e-05, + "loss": 1.3061, + "step": 41420 + }, + { + "epoch": 1586.9811320754718, + "grad_norm": 2.5172683919046834, + "learning_rate": 1.459070719234599e-05, + "loss": 1.3287, + "step": 41440 + }, + { + "epoch": 1587.7358490566037, + "grad_norm": 1.681004851075849, + "learning_rate": 1.4561556325821593e-05, + "loss": 1.307, + "step": 41460 + }, + { + "epoch": 1588.4905660377358, + "grad_norm": 1.9545864689840218, + "learning_rate": 1.4532443354043108e-05, + "loss": 1.3015, + "step": 41480 + }, + { + "epoch": 1589.245283018868, + "grad_norm": 2.2238938165489186, + "learning_rate": 1.4503368316498385e-05, + "loss": 1.3323, + "step": 41500 + }, + { + "epoch": 1590.0, + "grad_norm": 2.0846781514374704, + "learning_rate": 1.4474331252623795e-05, + "loss": 1.3273, + "step": 41520 + }, + { + "epoch": 1590.754716981132, + "grad_norm": 1.677709576953602, + "learning_rate": 1.44453322018042e-05, + "loss": 1.3035, + "step": 41540 + }, + { + "epoch": 1591.5094339622642, + "grad_norm": 1.8283093943528037, + "learning_rate": 1.4416371203372931e-05, + "loss": 1.3261, + "step": 41560 + }, + { + "epoch": 1592.2641509433963, + "grad_norm": 1.5077647603013566, + "learning_rate": 1.4387448296611699e-05, + "loss": 1.3039, + "step": 41580 + }, + { + "epoch": 1593.0188679245282, + "grad_norm": 1.7410910545917078, + "learning_rate": 1.4358563520750539e-05, + "loss": 1.3073, + "step": 41600 + }, + { + "epoch": 1593.7735849056603, + "grad_norm": 1.7856680678107866, + "learning_rate": 1.4329716914967761e-05, + "loss": 1.3128, + "step": 41620 + }, + { + "epoch": 1594.5283018867924, + "grad_norm": 2.291591913470402, + "learning_rate": 1.4300908518389904e-05, + "loss": 1.3067, + "step": 41640 + }, + { + "epoch": 1595.2830188679245, + "grad_norm": 1.785529683540441, + "learning_rate": 1.42721383700917e-05, + "loss": 1.3338, + "step": 41660 + }, + { + "epoch": 1596.0377358490566, + "grad_norm": 1.5806566985978232, + "learning_rate": 1.4243406509096e-05, + "loss": 1.3212, + "step": 41680 + }, + { + "epoch": 1596.7924528301887, + "grad_norm": 1.6779162023356309, + "learning_rate": 1.4214712974373703e-05, + "loss": 1.304, + "step": 41700 + }, + { + "epoch": 1597.5471698113208, + "grad_norm": 1.7562533688231816, + "learning_rate": 1.418605780484373e-05, + "loss": 1.2875, + "step": 41720 + }, + { + "epoch": 1598.301886792453, + "grad_norm": 1.7567734296175508, + "learning_rate": 1.4157441039372966e-05, + "loss": 1.3145, + "step": 41740 + }, + { + "epoch": 1599.0566037735848, + "grad_norm": 1.722702892703264, + "learning_rate": 1.4128862716776218e-05, + "loss": 1.3019, + "step": 41760 + }, + { + "epoch": 1599.811320754717, + "grad_norm": 1.9956191020255551, + "learning_rate": 1.4100322875816148e-05, + "loss": 1.3114, + "step": 41780 + }, + { + "epoch": 1600.566037735849, + "grad_norm": 1.6531331340969986, + "learning_rate": 1.4071821555203213e-05, + "loss": 1.2966, + "step": 41800 + }, + { + "epoch": 1601.3207547169811, + "grad_norm": 2.0591826060061376, + "learning_rate": 1.4043358793595621e-05, + "loss": 1.3015, + "step": 41820 + }, + { + "epoch": 1602.0754716981132, + "grad_norm": 1.6984765409093496, + "learning_rate": 1.4014934629599273e-05, + "loss": 1.2995, + "step": 41840 + }, + { + "epoch": 1602.8301886792453, + "grad_norm": 2.2995147164644165, + "learning_rate": 1.3986549101767747e-05, + "loss": 1.3184, + "step": 41860 + }, + { + "epoch": 1603.5849056603774, + "grad_norm": 1.6563391450177631, + "learning_rate": 1.39582022486022e-05, + "loss": 1.3379, + "step": 41880 + }, + { + "epoch": 1604.3396226415093, + "grad_norm": 1.8150129139182571, + "learning_rate": 1.3929894108551327e-05, + "loss": 1.2831, + "step": 41900 + }, + { + "epoch": 1605.0943396226414, + "grad_norm": 1.6835080581169954, + "learning_rate": 1.390162472001131e-05, + "loss": 1.3275, + "step": 41920 + }, + { + "epoch": 1605.8490566037735, + "grad_norm": 1.6294691180875247, + "learning_rate": 1.3873394121325766e-05, + "loss": 1.2913, + "step": 41940 + }, + { + "epoch": 1606.6037735849056, + "grad_norm": 1.7180885898850626, + "learning_rate": 1.3845202350785745e-05, + "loss": 1.2965, + "step": 41960 + }, + { + "epoch": 1607.3584905660377, + "grad_norm": 1.5583008263797746, + "learning_rate": 1.3817049446629576e-05, + "loss": 1.2832, + "step": 41980 + }, + { + "epoch": 1608.1132075471698, + "grad_norm": 1.8079371040717394, + "learning_rate": 1.3788935447042895e-05, + "loss": 1.2954, + "step": 42000 + }, + { + "epoch": 1608.867924528302, + "grad_norm": 1.8337600854116936, + "learning_rate": 1.3760860390158554e-05, + "loss": 1.309, + "step": 42020 + }, + { + "epoch": 1609.622641509434, + "grad_norm": 1.6419903042017507, + "learning_rate": 1.3732824314056604e-05, + "loss": 1.3068, + "step": 42040 + }, + { + "epoch": 1610.377358490566, + "grad_norm": 1.948536117708095, + "learning_rate": 1.370482725676423e-05, + "loss": 1.3399, + "step": 42060 + }, + { + "epoch": 1611.132075471698, + "grad_norm": 1.9254279275726736, + "learning_rate": 1.3676869256255669e-05, + "loss": 1.3151, + "step": 42080 + }, + { + "epoch": 1611.8867924528302, + "grad_norm": 1.802137918813917, + "learning_rate": 1.3648950350452192e-05, + "loss": 1.2844, + "step": 42100 + }, + { + "epoch": 1612.6415094339623, + "grad_norm": 3.3181585562433358, + "learning_rate": 1.3621070577222036e-05, + "loss": 1.3125, + "step": 42120 + }, + { + "epoch": 1613.3962264150944, + "grad_norm": 1.6631495329844195, + "learning_rate": 1.3593229974380375e-05, + "loss": 1.2908, + "step": 42140 + }, + { + "epoch": 1614.1509433962265, + "grad_norm": 1.8984095064618975, + "learning_rate": 1.3565428579689256e-05, + "loss": 1.2937, + "step": 42160 + }, + { + "epoch": 1614.9056603773586, + "grad_norm": 1.8422792147059388, + "learning_rate": 1.3537666430857535e-05, + "loss": 1.284, + "step": 42180 + }, + { + "epoch": 1615.6603773584907, + "grad_norm": 1.6231527761557085, + "learning_rate": 1.3509943565540833e-05, + "loss": 1.297, + "step": 42200 + }, + { + "epoch": 1616.4150943396226, + "grad_norm": 1.9453735113539294, + "learning_rate": 1.3482260021341475e-05, + "loss": 1.2902, + "step": 42220 + }, + { + "epoch": 1617.1698113207547, + "grad_norm": 1.7641921684369601, + "learning_rate": 1.345461583580849e-05, + "loss": 1.282, + "step": 42240 + }, + { + "epoch": 1617.9245283018868, + "grad_norm": 2.0387885601326228, + "learning_rate": 1.3427011046437513e-05, + "loss": 1.2898, + "step": 42260 + }, + { + "epoch": 1618.6792452830189, + "grad_norm": 1.777342395003277, + "learning_rate": 1.3399445690670713e-05, + "loss": 1.3168, + "step": 42280 + }, + { + "epoch": 1619.433962264151, + "grad_norm": 1.6370450829023924, + "learning_rate": 1.33719198058968e-05, + "loss": 1.3075, + "step": 42300 + }, + { + "epoch": 1620.188679245283, + "grad_norm": 1.779662328060948, + "learning_rate": 1.334443342945093e-05, + "loss": 1.2919, + "step": 42320 + }, + { + "epoch": 1620.9433962264152, + "grad_norm": 1.7581747425039895, + "learning_rate": 1.3316986598614685e-05, + "loss": 1.3074, + "step": 42340 + }, + { + "epoch": 1621.698113207547, + "grad_norm": 1.8150800521846453, + "learning_rate": 1.3289579350616015e-05, + "loss": 1.2807, + "step": 42360 + }, + { + "epoch": 1622.4528301886792, + "grad_norm": 1.7354020247655273, + "learning_rate": 1.3262211722629166e-05, + "loss": 1.2826, + "step": 42380 + }, + { + "epoch": 1623.2075471698113, + "grad_norm": 1.468102174253192, + "learning_rate": 1.3234883751774644e-05, + "loss": 1.288, + "step": 42400 + }, + { + "epoch": 1623.9622641509434, + "grad_norm": 1.657454196093378, + "learning_rate": 1.3207595475119152e-05, + "loss": 1.3106, + "step": 42420 + }, + { + "epoch": 1624.7169811320755, + "grad_norm": 1.6557640300068772, + "learning_rate": 1.3180346929675611e-05, + "loss": 1.2731, + "step": 42440 + }, + { + "epoch": 1625.4716981132076, + "grad_norm": 1.7735353149818989, + "learning_rate": 1.3153138152402996e-05, + "loss": 1.2763, + "step": 42460 + }, + { + "epoch": 1626.2264150943397, + "grad_norm": 1.9382385391993158, + "learning_rate": 1.3125969180206349e-05, + "loss": 1.319, + "step": 42480 + }, + { + "epoch": 1626.9811320754718, + "grad_norm": 2.136894507958651, + "learning_rate": 1.3098840049936733e-05, + "loss": 1.2805, + "step": 42500 + }, + { + "epoch": 1627.7358490566037, + "grad_norm": 2.1483143587792366, + "learning_rate": 1.3071750798391171e-05, + "loss": 1.2853, + "step": 42520 + }, + { + "epoch": 1628.4905660377358, + "grad_norm": 1.710874022513826, + "learning_rate": 1.304470146231261e-05, + "loss": 1.2806, + "step": 42540 + }, + { + "epoch": 1629.245283018868, + "grad_norm": 1.8951457389499449, + "learning_rate": 1.3017692078389823e-05, + "loss": 1.2932, + "step": 42560 + }, + { + "epoch": 1630.0, + "grad_norm": 1.5984669511746095, + "learning_rate": 1.299072268325742e-05, + "loss": 1.2931, + "step": 42580 + }, + { + "epoch": 1630.754716981132, + "grad_norm": 1.8192427048671964, + "learning_rate": 1.2963793313495747e-05, + "loss": 1.2736, + "step": 42600 + }, + { + "epoch": 1631.5094339622642, + "grad_norm": 1.7556743408681688, + "learning_rate": 1.2936904005630886e-05, + "loss": 1.2844, + "step": 42620 + }, + { + "epoch": 1632.2641509433963, + "grad_norm": 1.9584621506348525, + "learning_rate": 1.2910054796134588e-05, + "loss": 1.2903, + "step": 42640 + }, + { + "epoch": 1633.0188679245282, + "grad_norm": 1.818910778704821, + "learning_rate": 1.2883245721424182e-05, + "loss": 1.2982, + "step": 42660 + }, + { + "epoch": 1633.7735849056603, + "grad_norm": 1.6564371207191282, + "learning_rate": 1.2856476817862578e-05, + "loss": 1.2719, + "step": 42680 + }, + { + "epoch": 1634.5283018867924, + "grad_norm": 2.176105338983291, + "learning_rate": 1.2829748121758186e-05, + "loss": 1.2703, + "step": 42700 + }, + { + "epoch": 1635.2830188679245, + "grad_norm": 2.053268530527867, + "learning_rate": 1.280305966936491e-05, + "loss": 1.2745, + "step": 42720 + }, + { + "epoch": 1636.0377358490566, + "grad_norm": 1.9471448193829715, + "learning_rate": 1.2776411496882053e-05, + "loss": 1.2924, + "step": 42740 + }, + { + "epoch": 1636.7924528301887, + "grad_norm": 1.7439666627316233, + "learning_rate": 1.2749803640454274e-05, + "loss": 1.2883, + "step": 42760 + }, + { + "epoch": 1637.5471698113208, + "grad_norm": 1.7616849064246298, + "learning_rate": 1.2723236136171557e-05, + "loss": 1.2901, + "step": 42780 + }, + { + "epoch": 1638.301886792453, + "grad_norm": 1.9785596426579124, + "learning_rate": 1.2696709020069137e-05, + "loss": 1.2806, + "step": 42800 + }, + { + "epoch": 1639.0566037735848, + "grad_norm": 1.8267869166287358, + "learning_rate": 1.2670222328127502e-05, + "loss": 1.2915, + "step": 42820 + }, + { + "epoch": 1639.811320754717, + "grad_norm": 1.8394119712662444, + "learning_rate": 1.2643776096272298e-05, + "loss": 1.2959, + "step": 42840 + }, + { + "epoch": 1640.566037735849, + "grad_norm": 1.5745567144671013, + "learning_rate": 1.2617370360374272e-05, + "loss": 1.2542, + "step": 42860 + }, + { + "epoch": 1641.3207547169811, + "grad_norm": 1.759927673486542, + "learning_rate": 1.2591005156249265e-05, + "loss": 1.2957, + "step": 42880 + }, + { + "epoch": 1642.0754716981132, + "grad_norm": 1.8212375700098509, + "learning_rate": 1.2564680519658124e-05, + "loss": 1.2911, + "step": 42900 + }, + { + "epoch": 1642.8301886792453, + "grad_norm": 1.7170892376270965, + "learning_rate": 1.2538396486306685e-05, + "loss": 1.2815, + "step": 42920 + }, + { + "epoch": 1643.5849056603774, + "grad_norm": 1.9707121868823774, + "learning_rate": 1.2512153091845724e-05, + "loss": 1.2817, + "step": 42940 + }, + { + "epoch": 1644.3396226415093, + "grad_norm": 1.6048036207691687, + "learning_rate": 1.2485950371870873e-05, + "loss": 1.2318, + "step": 42960 + }, + { + "epoch": 1645.0943396226414, + "grad_norm": 1.7616652416059821, + "learning_rate": 1.2459788361922582e-05, + "loss": 1.2482, + "step": 42980 + }, + { + "epoch": 1645.8490566037735, + "grad_norm": 1.971912319619838, + "learning_rate": 1.2433667097486137e-05, + "loss": 1.2732, + "step": 43000 + }, + { + "epoch": 1646.6037735849056, + "grad_norm": 1.8554872676424141, + "learning_rate": 1.2407586613991493e-05, + "loss": 1.2862, + "step": 43020 + }, + { + "epoch": 1647.3584905660377, + "grad_norm": 1.699680830714332, + "learning_rate": 1.2381546946813345e-05, + "loss": 1.2783, + "step": 43040 + }, + { + "epoch": 1648.1132075471698, + "grad_norm": 2.0246817924294174, + "learning_rate": 1.2355548131271e-05, + "loss": 1.2913, + "step": 43060 + }, + { + "epoch": 1648.867924528302, + "grad_norm": 1.8248553236680727, + "learning_rate": 1.2329590202628339e-05, + "loss": 1.2982, + "step": 43080 + }, + { + "epoch": 1649.622641509434, + "grad_norm": 1.5949265423747017, + "learning_rate": 1.2303673196093838e-05, + "loss": 1.2682, + "step": 43100 + }, + { + "epoch": 1650.377358490566, + "grad_norm": 1.8162411778047456, + "learning_rate": 1.2277797146820398e-05, + "loss": 1.2775, + "step": 43120 + }, + { + "epoch": 1651.132075471698, + "grad_norm": 1.8713447665462608, + "learning_rate": 1.225196208990544e-05, + "loss": 1.2816, + "step": 43140 + }, + { + "epoch": 1651.8867924528302, + "grad_norm": 1.7680462074180785, + "learning_rate": 1.2226168060390733e-05, + "loss": 1.2583, + "step": 43160 + }, + { + "epoch": 1652.6415094339623, + "grad_norm": 1.7924027918393708, + "learning_rate": 1.2200415093262394e-05, + "loss": 1.2631, + "step": 43180 + }, + { + "epoch": 1653.3962264150944, + "grad_norm": 1.8736863718504475, + "learning_rate": 1.2174703223450895e-05, + "loss": 1.2841, + "step": 43200 + }, + { + "epoch": 1654.1509433962265, + "grad_norm": 2.4263803178613257, + "learning_rate": 1.2149032485830917e-05, + "loss": 1.2549, + "step": 43220 + }, + { + "epoch": 1654.9056603773586, + "grad_norm": 2.0941904006879746, + "learning_rate": 1.212340291522137e-05, + "loss": 1.2723, + "step": 43240 + }, + { + "epoch": 1655.6603773584907, + "grad_norm": 1.8402209538224543, + "learning_rate": 1.2097814546385328e-05, + "loss": 1.2974, + "step": 43260 + }, + { + "epoch": 1656.4150943396226, + "grad_norm": 1.6993013644974577, + "learning_rate": 1.2072267414029963e-05, + "loss": 1.2513, + "step": 43280 + }, + { + "epoch": 1657.1698113207547, + "grad_norm": 1.7170522867791609, + "learning_rate": 1.2046761552806534e-05, + "loss": 1.2805, + "step": 43300 + }, + { + "epoch": 1657.9245283018868, + "grad_norm": 2.0350355441018007, + "learning_rate": 1.2021296997310335e-05, + "loss": 1.2705, + "step": 43320 + }, + { + "epoch": 1658.6792452830189, + "grad_norm": 2.4347287101096127, + "learning_rate": 1.1995873782080597e-05, + "loss": 1.3121, + "step": 43340 + }, + { + "epoch": 1659.433962264151, + "grad_norm": 2.4941517416955423, + "learning_rate": 1.1970491941600483e-05, + "loss": 1.283, + "step": 43360 + }, + { + "epoch": 1660.188679245283, + "grad_norm": 1.5936769777901518, + "learning_rate": 1.1945151510297077e-05, + "loss": 1.3007, + "step": 43380 + }, + { + "epoch": 1660.9433962264152, + "grad_norm": 1.7694343184984302, + "learning_rate": 1.191985252254125e-05, + "loss": 1.2624, + "step": 43400 + }, + { + "epoch": 1661.698113207547, + "grad_norm": 1.795958862054882, + "learning_rate": 1.1894595012647705e-05, + "loss": 1.2845, + "step": 43420 + }, + { + "epoch": 1662.4528301886792, + "grad_norm": 1.7885172471497286, + "learning_rate": 1.1869379014874838e-05, + "loss": 1.2917, + "step": 43440 + }, + { + "epoch": 1663.2075471698113, + "grad_norm": 2.120293693613178, + "learning_rate": 1.1844204563424761e-05, + "loss": 1.2772, + "step": 43460 + }, + { + "epoch": 1663.9622641509434, + "grad_norm": 1.9597227989228387, + "learning_rate": 1.1819071692443259e-05, + "loss": 1.2795, + "step": 43480 + }, + { + "epoch": 1664.7169811320755, + "grad_norm": 1.6400049564311938, + "learning_rate": 1.1793980436019665e-05, + "loss": 1.2698, + "step": 43500 + }, + { + "epoch": 1665.4716981132076, + "grad_norm": 1.8644080059944403, + "learning_rate": 1.1768930828186929e-05, + "loss": 1.2587, + "step": 43520 + }, + { + "epoch": 1666.2264150943397, + "grad_norm": 1.8282116731915254, + "learning_rate": 1.1743922902921463e-05, + "loss": 1.3132, + "step": 43540 + }, + { + "epoch": 1666.9811320754718, + "grad_norm": 1.7249443500233614, + "learning_rate": 1.1718956694143148e-05, + "loss": 1.2723, + "step": 43560 + }, + { + "epoch": 1667.7358490566037, + "grad_norm": 2.2697663843146665, + "learning_rate": 1.1694032235715316e-05, + "loss": 1.2568, + "step": 43580 + }, + { + "epoch": 1668.4905660377358, + "grad_norm": 1.530647627622187, + "learning_rate": 1.1669149561444626e-05, + "loss": 1.2717, + "step": 43600 + }, + { + "epoch": 1669.245283018868, + "grad_norm": 1.9077370898342472, + "learning_rate": 1.1644308705081098e-05, + "loss": 1.252, + "step": 43620 + }, + { + "epoch": 1670.0, + "grad_norm": 2.44039353912842, + "learning_rate": 1.1619509700318012e-05, + "loss": 1.2829, + "step": 43640 + }, + { + "epoch": 1670.754716981132, + "grad_norm": 1.7743292552737207, + "learning_rate": 1.159475258079188e-05, + "loss": 1.2831, + "step": 43660 + }, + { + "epoch": 1671.5094339622642, + "grad_norm": 1.747479187441525, + "learning_rate": 1.1570037380082422e-05, + "loss": 1.2933, + "step": 43680 + }, + { + "epoch": 1672.2641509433963, + "grad_norm": 1.6300209643052874, + "learning_rate": 1.154536413171247e-05, + "loss": 1.262, + "step": 43700 + }, + { + "epoch": 1673.0188679245282, + "grad_norm": 1.6492759638436003, + "learning_rate": 1.1520732869147992e-05, + "loss": 1.2733, + "step": 43720 + }, + { + "epoch": 1673.7735849056603, + "grad_norm": 1.8650633960066672, + "learning_rate": 1.149614362579798e-05, + "loss": 1.2536, + "step": 43740 + }, + { + "epoch": 1674.5283018867924, + "grad_norm": 2.1343348774015154, + "learning_rate": 1.1471596435014422e-05, + "loss": 1.2813, + "step": 43760 + }, + { + "epoch": 1675.2830188679245, + "grad_norm": 1.8893198944715273, + "learning_rate": 1.144709133009231e-05, + "loss": 1.2563, + "step": 43780 + }, + { + "epoch": 1676.0377358490566, + "grad_norm": 2.1120967291275416, + "learning_rate": 1.1422628344269509e-05, + "loss": 1.2821, + "step": 43800 + }, + { + "epoch": 1676.7924528301887, + "grad_norm": 1.9495522685303381, + "learning_rate": 1.1398207510726789e-05, + "loss": 1.2517, + "step": 43820 + }, + { + "epoch": 1677.5471698113208, + "grad_norm": 2.467904007991342, + "learning_rate": 1.1373828862587707e-05, + "loss": 1.2561, + "step": 43840 + }, + { + "epoch": 1678.301886792453, + "grad_norm": 2.0894802109018364, + "learning_rate": 1.1349492432918656e-05, + "loss": 1.2435, + "step": 43860 + }, + { + "epoch": 1679.0566037735848, + "grad_norm": 1.828171013483477, + "learning_rate": 1.1325198254728714e-05, + "loss": 1.2622, + "step": 43880 + }, + { + "epoch": 1679.811320754717, + "grad_norm": 1.801681773387057, + "learning_rate": 1.1300946360969663e-05, + "loss": 1.2558, + "step": 43900 + }, + { + "epoch": 1680.566037735849, + "grad_norm": 1.6735648229781173, + "learning_rate": 1.127673678453596e-05, + "loss": 1.2758, + "step": 43920 + }, + { + "epoch": 1681.3207547169811, + "grad_norm": 1.8439021982026453, + "learning_rate": 1.1252569558264623e-05, + "loss": 1.2711, + "step": 43940 + }, + { + "epoch": 1682.0754716981132, + "grad_norm": 2.1505206283933016, + "learning_rate": 1.1228444714935267e-05, + "loss": 1.2654, + "step": 43960 + }, + { + "epoch": 1682.8301886792453, + "grad_norm": 2.3535064036412763, + "learning_rate": 1.1204362287269989e-05, + "loss": 1.2864, + "step": 43980 + }, + { + "epoch": 1683.5849056603774, + "grad_norm": 1.9848669397447662, + "learning_rate": 1.1180322307933367e-05, + "loss": 1.2678, + "step": 44000 + }, + { + "epoch": 1684.3396226415093, + "grad_norm": 1.9913924876704123, + "learning_rate": 1.1156324809532414e-05, + "loss": 1.2676, + "step": 44020 + }, + { + "epoch": 1685.0943396226414, + "grad_norm": 1.7514208349630622, + "learning_rate": 1.1132369824616499e-05, + "loss": 1.2616, + "step": 44040 + }, + { + "epoch": 1685.8490566037735, + "grad_norm": 2.2320848012235888, + "learning_rate": 1.1108457385677357e-05, + "loss": 1.2342, + "step": 44060 + }, + { + "epoch": 1686.6037735849056, + "grad_norm": 1.5942465466549, + "learning_rate": 1.1084587525148977e-05, + "loss": 1.2645, + "step": 44080 + }, + { + "epoch": 1687.3584905660377, + "grad_norm": 1.7930621881455668, + "learning_rate": 1.1060760275407643e-05, + "loss": 1.2534, + "step": 44100 + }, + { + "epoch": 1688.1132075471698, + "grad_norm": 1.9526833117644506, + "learning_rate": 1.1036975668771807e-05, + "loss": 1.2609, + "step": 44120 + }, + { + "epoch": 1688.867924528302, + "grad_norm": 2.7437511360247084, + "learning_rate": 1.1013233737502087e-05, + "loss": 1.2343, + "step": 44140 + }, + { + "epoch": 1689.622641509434, + "grad_norm": 1.9223463912813794, + "learning_rate": 1.098953451380124e-05, + "loss": 1.2442, + "step": 44160 + }, + { + "epoch": 1690.377358490566, + "grad_norm": 1.8610095181805815, + "learning_rate": 1.0965878029814056e-05, + "loss": 1.2754, + "step": 44180 + }, + { + "epoch": 1691.132075471698, + "grad_norm": 1.8877853703015002, + "learning_rate": 1.0942264317627406e-05, + "loss": 1.2491, + "step": 44200 + }, + { + "epoch": 1691.8867924528302, + "grad_norm": 1.7714097265467896, + "learning_rate": 1.09186934092701e-05, + "loss": 1.2405, + "step": 44220 + }, + { + "epoch": 1692.6415094339623, + "grad_norm": 1.7637174019223203, + "learning_rate": 1.0895165336712904e-05, + "loss": 1.2829, + "step": 44240 + }, + { + "epoch": 1693.3962264150944, + "grad_norm": 1.7656304793121242, + "learning_rate": 1.087168013186851e-05, + "loss": 1.2702, + "step": 44260 + }, + { + "epoch": 1694.1509433962265, + "grad_norm": 1.7808808271916323, + "learning_rate": 1.0848237826591417e-05, + "loss": 1.2587, + "step": 44280 + }, + { + "epoch": 1694.9056603773586, + "grad_norm": 2.0266053428110538, + "learning_rate": 1.0824838452677987e-05, + "loss": 1.2926, + "step": 44300 + }, + { + "epoch": 1695.6603773584907, + "grad_norm": 1.8115058261411354, + "learning_rate": 1.0801482041866307e-05, + "loss": 1.2694, + "step": 44320 + }, + { + "epoch": 1696.4150943396226, + "grad_norm": 2.4945233871026526, + "learning_rate": 1.0778168625836231e-05, + "loss": 1.2699, + "step": 44340 + }, + { + "epoch": 1697.1698113207547, + "grad_norm": 1.6502310134668141, + "learning_rate": 1.0754898236209268e-05, + "loss": 1.2614, + "step": 44360 + }, + { + "epoch": 1697.9245283018868, + "grad_norm": 1.8453532789577662, + "learning_rate": 1.0731670904548564e-05, + "loss": 1.2823, + "step": 44380 + }, + { + "epoch": 1698.6792452830189, + "grad_norm": 1.860653653518071, + "learning_rate": 1.070848666235889e-05, + "loss": 1.2641, + "step": 44400 + }, + { + "epoch": 1699.433962264151, + "grad_norm": 2.5399359657927856, + "learning_rate": 1.0685345541086543e-05, + "loss": 1.2654, + "step": 44420 + }, + { + "epoch": 1700.188679245283, + "grad_norm": 2.0018966910355798, + "learning_rate": 1.0662247572119366e-05, + "loss": 1.2504, + "step": 44440 + }, + { + "epoch": 1700.9433962264152, + "grad_norm": 1.9371452956267547, + "learning_rate": 1.0639192786786632e-05, + "loss": 1.2599, + "step": 44460 + }, + { + "epoch": 1701.698113207547, + "grad_norm": 1.6521207456435931, + "learning_rate": 1.061618121635906e-05, + "loss": 1.2391, + "step": 44480 + }, + { + "epoch": 1702.4528301886792, + "grad_norm": 1.8656871033709692, + "learning_rate": 1.0593212892048769e-05, + "loss": 1.2724, + "step": 44500 + }, + { + "epoch": 1703.2075471698113, + "grad_norm": 1.8848695534792095, + "learning_rate": 1.0570287845009191e-05, + "loss": 1.2528, + "step": 44520 + }, + { + "epoch": 1703.9622641509434, + "grad_norm": 1.9187788004054305, + "learning_rate": 1.0547406106335084e-05, + "loss": 1.2518, + "step": 44540 + }, + { + "epoch": 1704.7169811320755, + "grad_norm": 1.6729690153958676, + "learning_rate": 1.0524567707062449e-05, + "loss": 1.2437, + "step": 44560 + }, + { + "epoch": 1705.4716981132076, + "grad_norm": 1.8430409315959264, + "learning_rate": 1.0501772678168493e-05, + "loss": 1.2467, + "step": 44580 + }, + { + "epoch": 1706.2264150943397, + "grad_norm": 1.8213698778380842, + "learning_rate": 1.0479021050571638e-05, + "loss": 1.261, + "step": 44600 + }, + { + "epoch": 1706.9811320754718, + "grad_norm": 1.7756575044684015, + "learning_rate": 1.0456312855131388e-05, + "loss": 1.2278, + "step": 44620 + }, + { + "epoch": 1707.7358490566037, + "grad_norm": 1.4850849315300283, + "learning_rate": 1.0433648122648373e-05, + "loss": 1.242, + "step": 44640 + }, + { + "epoch": 1708.4905660377358, + "grad_norm": 1.9352490992820244, + "learning_rate": 1.0411026883864254e-05, + "loss": 1.2507, + "step": 44660 + }, + { + "epoch": 1709.245283018868, + "grad_norm": 2.2842368933958634, + "learning_rate": 1.0388449169461693e-05, + "loss": 1.2614, + "step": 44680 + }, + { + "epoch": 1710.0, + "grad_norm": 1.716195015782983, + "learning_rate": 1.0365915010064342e-05, + "loss": 1.2467, + "step": 44700 + }, + { + "epoch": 1710.754716981132, + "grad_norm": 2.1393035099583524, + "learning_rate": 1.0343424436236746e-05, + "loss": 1.2697, + "step": 44720 + }, + { + "epoch": 1711.5094339622642, + "grad_norm": 2.0773856374828354, + "learning_rate": 1.0320977478484364e-05, + "loss": 1.2642, + "step": 44740 + }, + { + "epoch": 1712.2641509433963, + "grad_norm": 1.862877983575214, + "learning_rate": 1.0298574167253475e-05, + "loss": 1.2269, + "step": 44760 + }, + { + "epoch": 1713.0188679245282, + "grad_norm": 1.8342101414521328, + "learning_rate": 1.0276214532931146e-05, + "loss": 1.2535, + "step": 44780 + }, + { + "epoch": 1713.7735849056603, + "grad_norm": 1.9601396356391216, + "learning_rate": 1.0253898605845225e-05, + "loss": 1.2327, + "step": 44800 + }, + { + "epoch": 1714.5283018867924, + "grad_norm": 4.66393766300096, + "learning_rate": 1.0231626416264286e-05, + "loss": 1.2503, + "step": 44820 + }, + { + "epoch": 1715.2830188679245, + "grad_norm": 1.8180258292414466, + "learning_rate": 1.020939799439755e-05, + "loss": 1.2401, + "step": 44840 + }, + { + "epoch": 1716.0377358490566, + "grad_norm": 2.106671537780403, + "learning_rate": 1.0187213370394877e-05, + "loss": 1.2536, + "step": 44860 + }, + { + "epoch": 1716.7924528301887, + "grad_norm": 2.006353528787222, + "learning_rate": 1.016507257434674e-05, + "loss": 1.2669, + "step": 44880 + }, + { + "epoch": 1717.5471698113208, + "grad_norm": 1.9080849159374786, + "learning_rate": 1.0142975636284143e-05, + "loss": 1.2509, + "step": 44900 + }, + { + "epoch": 1718.301886792453, + "grad_norm": 1.8773707581872159, + "learning_rate": 1.0120922586178633e-05, + "loss": 1.2675, + "step": 44920 + }, + { + "epoch": 1719.0566037735848, + "grad_norm": 2.0076660138565647, + "learning_rate": 1.00989134539422e-05, + "loss": 1.2534, + "step": 44940 + }, + { + "epoch": 1719.811320754717, + "grad_norm": 2.33419651636862, + "learning_rate": 1.0076948269427267e-05, + "loss": 1.2397, + "step": 44960 + }, + { + "epoch": 1720.566037735849, + "grad_norm": 2.1404808355187552, + "learning_rate": 1.0055027062426677e-05, + "loss": 1.2533, + "step": 44980 + }, + { + "epoch": 1721.3207547169811, + "grad_norm": 1.8480400039657447, + "learning_rate": 1.003314986267358e-05, + "loss": 1.2493, + "step": 45000 + }, + { + "epoch": 1722.0754716981132, + "grad_norm": 1.8827968491873732, + "learning_rate": 1.0011316699841473e-05, + "loss": 1.2622, + "step": 45020 + }, + { + "epoch": 1722.8301886792453, + "grad_norm": 1.805703534242214, + "learning_rate": 9.989527603544106e-06, + "loss": 1.2363, + "step": 45040 + }, + { + "epoch": 1723.5849056603774, + "grad_norm": 1.808082360236483, + "learning_rate": 9.967782603335458e-06, + "loss": 1.2487, + "step": 45060 + }, + { + "epoch": 1724.3396226415093, + "grad_norm": 1.7375867158357146, + "learning_rate": 9.946081728709704e-06, + "loss": 1.2495, + "step": 45080 + }, + { + "epoch": 1725.0943396226414, + "grad_norm": 1.9612535567440743, + "learning_rate": 9.92442500910116e-06, + "loss": 1.229, + "step": 45100 + }, + { + "epoch": 1725.8490566037735, + "grad_norm": 1.7862147453258874, + "learning_rate": 9.902812473884265e-06, + "loss": 1.257, + "step": 45120 + }, + { + "epoch": 1726.6037735849056, + "grad_norm": 1.9883007002332853, + "learning_rate": 9.881244152373517e-06, + "loss": 1.2449, + "step": 45140 + }, + { + "epoch": 1727.3584905660377, + "grad_norm": 1.9502309547963228, + "learning_rate": 9.859720073823439e-06, + "loss": 1.224, + "step": 45160 + }, + { + "epoch": 1728.1132075471698, + "grad_norm": 1.896645829195727, + "learning_rate": 9.838240267428569e-06, + "loss": 1.2396, + "step": 45180 + }, + { + "epoch": 1728.867924528302, + "grad_norm": 1.629877819405046, + "learning_rate": 9.816804762323362e-06, + "loss": 1.2227, + "step": 45200 + }, + { + "epoch": 1729.622641509434, + "grad_norm": 1.929708983579025, + "learning_rate": 9.795413587582212e-06, + "loss": 1.2516, + "step": 45220 + }, + { + "epoch": 1730.377358490566, + "grad_norm": 2.0413627464070543, + "learning_rate": 9.77406677221937e-06, + "loss": 1.2514, + "step": 45240 + }, + { + "epoch": 1731.132075471698, + "grad_norm": 1.9912183520226578, + "learning_rate": 9.75276434518892e-06, + "loss": 1.2414, + "step": 45260 + }, + { + "epoch": 1731.8867924528302, + "grad_norm": 1.9880956960393557, + "learning_rate": 9.731506335384743e-06, + "loss": 1.2419, + "step": 45280 + }, + { + "epoch": 1732.6415094339623, + "grad_norm": 1.7890362722548563, + "learning_rate": 9.710292771640488e-06, + "loss": 1.2369, + "step": 45300 + }, + { + "epoch": 1733.3962264150944, + "grad_norm": 1.7651257661243038, + "learning_rate": 9.689123682729494e-06, + "loss": 1.2311, + "step": 45320 + }, + { + "epoch": 1734.1509433962265, + "grad_norm": 2.2714752518928596, + "learning_rate": 9.667999097364786e-06, + "loss": 1.2367, + "step": 45340 + }, + { + "epoch": 1734.9056603773586, + "grad_norm": 1.6713670044384341, + "learning_rate": 9.646919044199022e-06, + "loss": 1.2404, + "step": 45360 + }, + { + "epoch": 1735.6603773584907, + "grad_norm": 1.6792236680717407, + "learning_rate": 9.625883551824463e-06, + "loss": 1.2196, + "step": 45380 + }, + { + "epoch": 1736.4150943396226, + "grad_norm": 2.115412691262234, + "learning_rate": 9.604892648772943e-06, + "loss": 1.266, + "step": 45400 + }, + { + "epoch": 1737.1698113207547, + "grad_norm": 1.8051416403865777, + "learning_rate": 9.583946363515793e-06, + "loss": 1.2226, + "step": 45420 + }, + { + "epoch": 1737.9245283018868, + "grad_norm": 1.6009792370635079, + "learning_rate": 9.563044724463834e-06, + "loss": 1.2201, + "step": 45440 + }, + { + "epoch": 1738.6792452830189, + "grad_norm": 1.6768644470720357, + "learning_rate": 9.542187759967324e-06, + "loss": 1.2421, + "step": 45460 + }, + { + "epoch": 1739.433962264151, + "grad_norm": 1.8206836356027367, + "learning_rate": 9.521375498315946e-06, + "loss": 1.2329, + "step": 45480 + }, + { + "epoch": 1740.188679245283, + "grad_norm": 1.938456827512391, + "learning_rate": 9.500607967738736e-06, + "loss": 1.2325, + "step": 45500 + }, + { + "epoch": 1740.9433962264152, + "grad_norm": 2.2438430820956277, + "learning_rate": 9.47988519640406e-06, + "loss": 1.2354, + "step": 45520 + }, + { + "epoch": 1741.698113207547, + "grad_norm": 1.8583994537682718, + "learning_rate": 9.459207212419571e-06, + "loss": 1.235, + "step": 45540 + }, + { + "epoch": 1742.4528301886792, + "grad_norm": 1.917428400078255, + "learning_rate": 9.438574043832166e-06, + "loss": 1.224, + "step": 45560 + }, + { + "epoch": 1743.2075471698113, + "grad_norm": 1.9824261322614047, + "learning_rate": 9.417985718627978e-06, + "loss": 1.2129, + "step": 45580 + }, + { + "epoch": 1743.9622641509434, + "grad_norm": 1.612460364379856, + "learning_rate": 9.397442264732312e-06, + "loss": 1.2377, + "step": 45600 + }, + { + "epoch": 1744.7169811320755, + "grad_norm": 1.887246888937515, + "learning_rate": 9.376943710009596e-06, + "loss": 1.239, + "step": 45620 + }, + { + "epoch": 1745.4716981132076, + "grad_norm": 1.8662303877058588, + "learning_rate": 9.35649008226336e-06, + "loss": 1.2157, + "step": 45640 + }, + { + "epoch": 1746.2264150943397, + "grad_norm": 1.8071426126559238, + "learning_rate": 9.336081409236198e-06, + "loss": 1.2428, + "step": 45660 + }, + { + "epoch": 1746.9811320754718, + "grad_norm": 2.0833393122383828, + "learning_rate": 9.315717718609757e-06, + "loss": 1.2492, + "step": 45680 + }, + { + "epoch": 1747.7358490566037, + "grad_norm": 1.6715168273471837, + "learning_rate": 9.295399038004633e-06, + "loss": 1.2266, + "step": 45700 + }, + { + "epoch": 1748.4905660377358, + "grad_norm": 2.195357416639307, + "learning_rate": 9.275125394980386e-06, + "loss": 1.2253, + "step": 45720 + }, + { + "epoch": 1749.245283018868, + "grad_norm": 1.6874680664457093, + "learning_rate": 9.254896817035483e-06, + "loss": 1.2173, + "step": 45740 + }, + { + "epoch": 1750.0, + "grad_norm": 1.711392020263029, + "learning_rate": 9.234713331607285e-06, + "loss": 1.2454, + "step": 45760 + }, + { + "epoch": 1750.754716981132, + "grad_norm": 2.3706684674067713, + "learning_rate": 9.214574966071978e-06, + "loss": 1.2308, + "step": 45780 + }, + { + "epoch": 1751.5094339622642, + "grad_norm": 1.6753069249066932, + "learning_rate": 9.19448174774455e-06, + "loss": 1.2413, + "step": 45800 + }, + { + "epoch": 1752.2641509433963, + "grad_norm": 1.769095060241516, + "learning_rate": 9.174433703878748e-06, + "loss": 1.2319, + "step": 45820 + }, + { + "epoch": 1753.0188679245282, + "grad_norm": 2.3962310618426756, + "learning_rate": 9.154430861667043e-06, + "loss": 1.2352, + "step": 45840 + }, + { + "epoch": 1753.7735849056603, + "grad_norm": 1.9412455526945074, + "learning_rate": 9.134473248240613e-06, + "loss": 1.2102, + "step": 45860 + }, + { + "epoch": 1754.5283018867924, + "grad_norm": 1.8463733972517142, + "learning_rate": 9.114560890669284e-06, + "loss": 1.2332, + "step": 45880 + }, + { + "epoch": 1755.2830188679245, + "grad_norm": 2.04310291045162, + "learning_rate": 9.094693815961489e-06, + "loss": 1.214, + "step": 45900 + }, + { + "epoch": 1756.0377358490566, + "grad_norm": 1.6171015922091012, + "learning_rate": 9.074872051064247e-06, + "loss": 1.2699, + "step": 45920 + }, + { + "epoch": 1756.7924528301887, + "grad_norm": 1.9834386299608673, + "learning_rate": 9.05509562286311e-06, + "loss": 1.2278, + "step": 45940 + }, + { + "epoch": 1757.5471698113208, + "grad_norm": 1.821750331122801, + "learning_rate": 9.035364558182156e-06, + "loss": 1.2209, + "step": 45960 + }, + { + "epoch": 1758.301886792453, + "grad_norm": 2.014697149659363, + "learning_rate": 9.01567888378393e-06, + "loss": 1.2467, + "step": 45980 + }, + { + "epoch": 1759.0566037735848, + "grad_norm": 1.7691564521949696, + "learning_rate": 8.9960386263694e-06, + "loss": 1.2387, + "step": 46000 + }, + { + "epoch": 1759.811320754717, + "grad_norm": 2.3970514513874353, + "learning_rate": 8.976443812577933e-06, + "loss": 1.2356, + "step": 46020 + }, + { + "epoch": 1760.566037735849, + "grad_norm": 1.7866985162824316, + "learning_rate": 8.956894468987255e-06, + "loss": 1.2192, + "step": 46040 + }, + { + "epoch": 1761.3207547169811, + "grad_norm": 1.4793276251372218, + "learning_rate": 8.93739062211343e-06, + "loss": 1.2255, + "step": 46060 + }, + { + "epoch": 1762.0754716981132, + "grad_norm": 1.629080653433639, + "learning_rate": 8.917932298410821e-06, + "loss": 1.2293, + "step": 46080 + }, + { + "epoch": 1762.8301886792453, + "grad_norm": 1.9159436924110016, + "learning_rate": 8.898519524272015e-06, + "loss": 1.2401, + "step": 46100 + }, + { + "epoch": 1763.5849056603774, + "grad_norm": 2.067014892731833, + "learning_rate": 8.879152326027837e-06, + "loss": 1.2344, + "step": 46120 + }, + { + "epoch": 1764.3396226415093, + "grad_norm": 1.8696210113324339, + "learning_rate": 8.859830729947271e-06, + "loss": 1.2223, + "step": 46140 + }, + { + "epoch": 1765.0943396226414, + "grad_norm": 2.110486612271203, + "learning_rate": 8.840554762237504e-06, + "loss": 1.243, + "step": 46160 + }, + { + "epoch": 1765.8490566037735, + "grad_norm": 2.123761822878677, + "learning_rate": 8.821324449043775e-06, + "loss": 1.219, + "step": 46180 + }, + { + "epoch": 1766.6037735849056, + "grad_norm": 1.9704006034099235, + "learning_rate": 8.802139816449425e-06, + "loss": 1.2274, + "step": 46200 + }, + { + "epoch": 1767.3584905660377, + "grad_norm": 2.577400619765411, + "learning_rate": 8.783000890475817e-06, + "loss": 1.2215, + "step": 46220 + }, + { + "epoch": 1768.1132075471698, + "grad_norm": 1.9304613762583265, + "learning_rate": 8.763907697082349e-06, + "loss": 1.2278, + "step": 46240 + }, + { + "epoch": 1768.867924528302, + "grad_norm": 9.077022357816322, + "learning_rate": 8.744860262166374e-06, + "loss": 1.2376, + "step": 46260 + }, + { + "epoch": 1769.622641509434, + "grad_norm": 1.7013870498396941, + "learning_rate": 8.72585861156318e-06, + "loss": 1.2435, + "step": 46280 + }, + { + "epoch": 1770.377358490566, + "grad_norm": 2.2733345450497597, + "learning_rate": 8.706902771045942e-06, + "loss": 1.2491, + "step": 46300 + }, + { + "epoch": 1771.132075471698, + "grad_norm": 1.7197101765888114, + "learning_rate": 8.687992766325712e-06, + "loss": 1.2308, + "step": 46320 + }, + { + "epoch": 1771.8867924528302, + "grad_norm": 1.722161318565123, + "learning_rate": 8.669128623051374e-06, + "loss": 1.2153, + "step": 46340 + }, + { + "epoch": 1772.6415094339623, + "grad_norm": 1.768434935423491, + "learning_rate": 8.650310366809618e-06, + "loss": 1.231, + "step": 46360 + }, + { + "epoch": 1773.3962264150944, + "grad_norm": 1.754239611346281, + "learning_rate": 8.631538023124864e-06, + "loss": 1.2132, + "step": 46380 + }, + { + "epoch": 1774.1509433962265, + "grad_norm": 1.8552614353082573, + "learning_rate": 8.612811617459285e-06, + "loss": 1.2112, + "step": 46400 + }, + { + "epoch": 1774.9056603773586, + "grad_norm": 2.0773862798469467, + "learning_rate": 8.594131175212718e-06, + "loss": 1.2189, + "step": 46420 + }, + { + "epoch": 1775.6603773584907, + "grad_norm": 2.3280607387947905, + "learning_rate": 8.57549672172269e-06, + "loss": 1.238, + "step": 46440 + }, + { + "epoch": 1776.4150943396226, + "grad_norm": 2.1755774262596717, + "learning_rate": 8.556908282264332e-06, + "loss": 1.2024, + "step": 46460 + }, + { + "epoch": 1777.1698113207547, + "grad_norm": 1.7187738055157478, + "learning_rate": 8.538365882050364e-06, + "loss": 1.2234, + "step": 46480 + }, + { + "epoch": 1777.9245283018868, + "grad_norm": 3.1309450039543165, + "learning_rate": 8.51986954623106e-06, + "loss": 1.2081, + "step": 46500 + }, + { + "epoch": 1778.6792452830189, + "grad_norm": 2.1042177578345567, + "learning_rate": 8.501419299894205e-06, + "loss": 1.1976, + "step": 46520 + }, + { + "epoch": 1779.433962264151, + "grad_norm": 2.4039696113928586, + "learning_rate": 8.483015168065095e-06, + "loss": 1.2068, + "step": 46540 + }, + { + "epoch": 1780.188679245283, + "grad_norm": 2.0537571832378605, + "learning_rate": 8.464657175706461e-06, + "loss": 1.2143, + "step": 46560 + }, + { + "epoch": 1780.9433962264152, + "grad_norm": 1.9918815720142324, + "learning_rate": 8.44634534771845e-06, + "loss": 1.2019, + "step": 46580 + }, + { + "epoch": 1781.698113207547, + "grad_norm": 3.5070134161926214, + "learning_rate": 8.428079708938597e-06, + "loss": 1.2117, + "step": 46600 + }, + { + "epoch": 1782.4528301886792, + "grad_norm": 1.9332698868995186, + "learning_rate": 8.409860284141776e-06, + "loss": 1.2109, + "step": 46620 + }, + { + "epoch": 1783.2075471698113, + "grad_norm": 1.8649611050997916, + "learning_rate": 8.391687098040202e-06, + "loss": 1.2127, + "step": 46640 + }, + { + "epoch": 1783.9622641509434, + "grad_norm": 2.1126115309707276, + "learning_rate": 8.373560175283366e-06, + "loss": 1.2071, + "step": 46660 + }, + { + "epoch": 1784.7169811320755, + "grad_norm": 2.1198410570984145, + "learning_rate": 8.355479540457997e-06, + "loss": 1.2136, + "step": 46680 + }, + { + "epoch": 1785.4716981132076, + "grad_norm": 1.6900109710024558, + "learning_rate": 8.337445218088043e-06, + "loss": 1.2524, + "step": 46700 + }, + { + "epoch": 1786.2264150943397, + "grad_norm": 1.8630113220385771, + "learning_rate": 8.31945723263464e-06, + "loss": 1.2265, + "step": 46720 + }, + { + "epoch": 1786.9811320754718, + "grad_norm": 1.8874455281957463, + "learning_rate": 8.301515608496088e-06, + "loss": 1.2177, + "step": 46740 + }, + { + "epoch": 1787.7358490566037, + "grad_norm": 2.014600854617101, + "learning_rate": 8.283620370007777e-06, + "loss": 1.2181, + "step": 46760 + }, + { + "epoch": 1788.4905660377358, + "grad_norm": 2.0564703961686885, + "learning_rate": 8.2657715414422e-06, + "loss": 1.234, + "step": 46780 + }, + { + "epoch": 1789.245283018868, + "grad_norm": 1.7463019171504772, + "learning_rate": 8.247969147008883e-06, + "loss": 1.2357, + "step": 46800 + }, + { + "epoch": 1790.0, + "grad_norm": 2.0207773867855345, + "learning_rate": 8.230213210854395e-06, + "loss": 1.2148, + "step": 46820 + }, + { + "epoch": 1790.754716981132, + "grad_norm": 2.3337870810525168, + "learning_rate": 8.21250375706228e-06, + "loss": 1.237, + "step": 46840 + }, + { + "epoch": 1791.5094339622642, + "grad_norm": 2.1435617881979563, + "learning_rate": 8.194840809653027e-06, + "loss": 1.2374, + "step": 46860 + }, + { + "epoch": 1792.2641509433963, + "grad_norm": 1.9102469560838522, + "learning_rate": 8.177224392584056e-06, + "loss": 1.209, + "step": 46880 + }, + { + "epoch": 1793.0188679245282, + "grad_norm": 2.1795923550151737, + "learning_rate": 8.159654529749662e-06, + "loss": 1.2063, + "step": 46900 + }, + { + "epoch": 1793.7735849056603, + "grad_norm": 1.823175394536622, + "learning_rate": 8.142131244981005e-06, + "loss": 1.1934, + "step": 46920 + }, + { + "epoch": 1794.5283018867924, + "grad_norm": 1.8053211353930545, + "learning_rate": 8.12465456204608e-06, + "loss": 1.2198, + "step": 46940 + }, + { + "epoch": 1795.2830188679245, + "grad_norm": 2.2947577379489195, + "learning_rate": 8.107224504649651e-06, + "loss": 1.2309, + "step": 46960 + }, + { + "epoch": 1796.0377358490566, + "grad_norm": 1.8475992608945049, + "learning_rate": 8.089841096433251e-06, + "loss": 1.2087, + "step": 46980 + }, + { + "epoch": 1796.7924528301887, + "grad_norm": 1.8272879309025556, + "learning_rate": 8.072504360975127e-06, + "loss": 1.2136, + "step": 47000 + }, + { + "epoch": 1797.5471698113208, + "grad_norm": 1.8165782997861282, + "learning_rate": 8.055214321790241e-06, + "loss": 1.1889, + "step": 47020 + }, + { + "epoch": 1798.301886792453, + "grad_norm": 2.3340672269584726, + "learning_rate": 8.03797100233022e-06, + "loss": 1.221, + "step": 47040 + }, + { + "epoch": 1799.0566037735848, + "grad_norm": 2.092467100741215, + "learning_rate": 8.020774425983296e-06, + "loss": 1.2128, + "step": 47060 + }, + { + "epoch": 1799.811320754717, + "grad_norm": 2.3746119444632, + "learning_rate": 8.003624616074315e-06, + "loss": 1.2182, + "step": 47080 + }, + { + "epoch": 1800.566037735849, + "grad_norm": 1.8281656528438364, + "learning_rate": 7.9865215958647e-06, + "loss": 1.2263, + "step": 47100 + }, + { + "epoch": 1801.3207547169811, + "grad_norm": 1.7918154594625133, + "learning_rate": 7.969465388552383e-06, + "loss": 1.2213, + "step": 47120 + }, + { + "epoch": 1802.0754716981132, + "grad_norm": 1.6967922986825377, + "learning_rate": 7.95245601727184e-06, + "loss": 1.2138, + "step": 47140 + }, + { + "epoch": 1802.8301886792453, + "grad_norm": 2.1758444336626437, + "learning_rate": 7.935493505093988e-06, + "loss": 1.2148, + "step": 47160 + }, + { + "epoch": 1803.5849056603774, + "grad_norm": 2.065548344188712, + "learning_rate": 7.918577875026188e-06, + "loss": 1.225, + "step": 47180 + }, + { + "epoch": 1804.3396226415093, + "grad_norm": 2.285598146488397, + "learning_rate": 7.901709150012234e-06, + "loss": 1.2029, + "step": 47200 + }, + { + "epoch": 1805.0943396226414, + "grad_norm": 1.845588983749011, + "learning_rate": 7.884887352932272e-06, + "loss": 1.2197, + "step": 47220 + }, + { + "epoch": 1805.8490566037735, + "grad_norm": 2.1058361117020095, + "learning_rate": 7.868112506602826e-06, + "loss": 1.2153, + "step": 47240 + }, + { + "epoch": 1806.6037735849056, + "grad_norm": 2.465710936967516, + "learning_rate": 7.851384633776713e-06, + "loss": 1.228, + "step": 47260 + }, + { + "epoch": 1807.3584905660377, + "grad_norm": 2.5705709977723905, + "learning_rate": 7.834703757143039e-06, + "loss": 1.2098, + "step": 47280 + }, + { + "epoch": 1808.1132075471698, + "grad_norm": 2.2374731183447105, + "learning_rate": 7.818069899327187e-06, + "loss": 1.2129, + "step": 47300 + }, + { + "epoch": 1808.867924528302, + "grad_norm": 1.9504362821950096, + "learning_rate": 7.801483082890734e-06, + "loss": 1.1901, + "step": 47320 + }, + { + "epoch": 1809.622641509434, + "grad_norm": 2.3420693435077813, + "learning_rate": 7.784943330331486e-06, + "loss": 1.211, + "step": 47340 + }, + { + "epoch": 1810.377358490566, + "grad_norm": 2.031062915249881, + "learning_rate": 7.768450664083389e-06, + "loss": 1.2156, + "step": 47360 + }, + { + "epoch": 1811.132075471698, + "grad_norm": 1.6147973470014159, + "learning_rate": 7.752005106516516e-06, + "loss": 1.2246, + "step": 47380 + }, + { + "epoch": 1811.8867924528302, + "grad_norm": 3.287052917624636, + "learning_rate": 7.735606679937075e-06, + "loss": 1.2064, + "step": 47400 + }, + { + "epoch": 1812.6415094339623, + "grad_norm": 2.2761297016427178, + "learning_rate": 7.719255406587317e-06, + "loss": 1.212, + "step": 47420 + }, + { + "epoch": 1813.3962264150944, + "grad_norm": 1.7704387481824377, + "learning_rate": 7.702951308645558e-06, + "loss": 1.2085, + "step": 47440 + }, + { + "epoch": 1814.1509433962265, + "grad_norm": 1.8350242336586524, + "learning_rate": 7.68669440822611e-06, + "loss": 1.222, + "step": 47460 + }, + { + "epoch": 1814.9056603773586, + "grad_norm": 2.2387441733555202, + "learning_rate": 7.67048472737927e-06, + "loss": 1.2227, + "step": 47480 + }, + { + "epoch": 1815.6603773584907, + "grad_norm": 2.0331586599518863, + "learning_rate": 7.654322288091307e-06, + "loss": 1.2105, + "step": 47500 + }, + { + "epoch": 1816.4150943396226, + "grad_norm": 1.7302486664188137, + "learning_rate": 7.638207112284387e-06, + "loss": 1.2006, + "step": 47520 + }, + { + "epoch": 1817.1698113207547, + "grad_norm": 2.102196000276882, + "learning_rate": 7.622139221816588e-06, + "loss": 1.2129, + "step": 47540 + }, + { + "epoch": 1817.9245283018868, + "grad_norm": 2.367853919945459, + "learning_rate": 7.606118638481834e-06, + "loss": 1.2137, + "step": 47560 + }, + { + "epoch": 1818.6792452830189, + "grad_norm": 1.7313717857059043, + "learning_rate": 7.5901453840099084e-06, + "loss": 1.1895, + "step": 47580 + }, + { + "epoch": 1819.433962264151, + "grad_norm": 1.90549898399535, + "learning_rate": 7.574219480066374e-06, + "loss": 1.2056, + "step": 47600 + }, + { + "epoch": 1820.188679245283, + "grad_norm": 2.037261933639343, + "learning_rate": 7.55834094825259e-06, + "loss": 1.2174, + "step": 47620 + }, + { + "epoch": 1820.9433962264152, + "grad_norm": 2.106635441636325, + "learning_rate": 7.542509810105648e-06, + "loss": 1.1982, + "step": 47640 + }, + { + "epoch": 1821.698113207547, + "grad_norm": 2.092038104009338, + "learning_rate": 7.526726087098354e-06, + "loss": 1.2218, + "step": 47660 + }, + { + "epoch": 1822.4528301886792, + "grad_norm": 1.997516919579926, + "learning_rate": 7.51098980063922e-06, + "loss": 1.2219, + "step": 47680 + }, + { + "epoch": 1823.2075471698113, + "grad_norm": 1.8136562199600643, + "learning_rate": 7.49530097207239e-06, + "loss": 1.1796, + "step": 47700 + }, + { + "epoch": 1823.9622641509434, + "grad_norm": 2.5616204147227934, + "learning_rate": 7.47965962267767e-06, + "loss": 1.1939, + "step": 47720 + }, + { + "epoch": 1824.7169811320755, + "grad_norm": 2.1387597203680815, + "learning_rate": 7.464065773670437e-06, + "loss": 1.1602, + "step": 47740 + }, + { + "epoch": 1825.4716981132076, + "grad_norm": 1.7803739365612326, + "learning_rate": 7.448519446201648e-06, + "loss": 1.2392, + "step": 47760 + }, + { + "epoch": 1826.2264150943397, + "grad_norm": 1.8173426913493826, + "learning_rate": 7.433020661357822e-06, + "loss": 1.1921, + "step": 47780 + }, + { + "epoch": 1826.9811320754718, + "grad_norm": 1.7763793595591069, + "learning_rate": 7.417569440160968e-06, + "loss": 1.2139, + "step": 47800 + }, + { + "epoch": 1827.7358490566037, + "grad_norm": 2.151890225358, + "learning_rate": 7.402165803568603e-06, + "loss": 1.1918, + "step": 47820 + }, + { + "epoch": 1828.4905660377358, + "grad_norm": 2.241696527607786, + "learning_rate": 7.386809772473682e-06, + "loss": 1.199, + "step": 47840 + }, + { + "epoch": 1829.245283018868, + "grad_norm": 1.904140122730207, + "learning_rate": 7.371501367704594e-06, + "loss": 1.175, + "step": 47860 + }, + { + "epoch": 1830.0, + "grad_norm": 2.2057960272933035, + "learning_rate": 7.356240610025147e-06, + "loss": 1.2026, + "step": 47880 + }, + { + "epoch": 1830.754716981132, + "grad_norm": 1.992268245473379, + "learning_rate": 7.341027520134496e-06, + "loss": 1.2226, + "step": 47900 + }, + { + "epoch": 1831.5094339622642, + "grad_norm": 1.8558557959544568, + "learning_rate": 7.325862118667166e-06, + "loss": 1.1879, + "step": 47920 + }, + { + "epoch": 1832.2641509433963, + "grad_norm": 2.649917575243484, + "learning_rate": 7.3107444261929805e-06, + "loss": 1.2128, + "step": 47940 + }, + { + "epoch": 1833.0188679245282, + "grad_norm": 1.838198084617481, + "learning_rate": 7.295674463217053e-06, + "loss": 1.1932, + "step": 47960 + }, + { + "epoch": 1833.7735849056603, + "grad_norm": 1.6953634210974582, + "learning_rate": 7.280652250179774e-06, + "loss": 1.1964, + "step": 47980 + }, + { + "epoch": 1834.5283018867924, + "grad_norm": 1.6312994813012875, + "learning_rate": 7.26567780745675e-06, + "loss": 1.1941, + "step": 48000 + }, + { + "epoch": 1835.2830188679245, + "grad_norm": 2.0702401300059528, + "learning_rate": 7.250751155358808e-06, + "loss": 1.2005, + "step": 48020 + }, + { + "epoch": 1836.0377358490566, + "grad_norm": 2.2642885375841395, + "learning_rate": 7.2358723141319396e-06, + "loss": 1.1894, + "step": 48040 + }, + { + "epoch": 1836.7924528301887, + "grad_norm": 1.9527405946827057, + "learning_rate": 7.2210413039573e-06, + "loss": 1.182, + "step": 48060 + }, + { + "epoch": 1837.5471698113208, + "grad_norm": 2.3498816913200984, + "learning_rate": 7.206258144951163e-06, + "loss": 1.1913, + "step": 48080 + }, + { + "epoch": 1838.301886792453, + "grad_norm": 2.123796744980879, + "learning_rate": 7.1915228571648876e-06, + "loss": 1.2076, + "step": 48100 + }, + { + "epoch": 1839.0566037735848, + "grad_norm": 1.7252206439090503, + "learning_rate": 7.176835460584927e-06, + "loss": 1.1861, + "step": 48120 + }, + { + "epoch": 1839.811320754717, + "grad_norm": 1.8734365315429182, + "learning_rate": 7.162195975132747e-06, + "loss": 1.1826, + "step": 48140 + }, + { + "epoch": 1840.566037735849, + "grad_norm": 2.316186234026582, + "learning_rate": 7.147604420664858e-06, + "loss": 1.177, + "step": 48160 + }, + { + "epoch": 1841.3207547169811, + "grad_norm": 1.788911685930357, + "learning_rate": 7.133060816972735e-06, + "loss": 1.1844, + "step": 48180 + }, + { + "epoch": 1842.0754716981132, + "grad_norm": 1.9701957941688446, + "learning_rate": 7.118565183782816e-06, + "loss": 1.211, + "step": 48200 + }, + { + "epoch": 1842.8301886792453, + "grad_norm": 1.721205749217039, + "learning_rate": 7.104117540756494e-06, + "loss": 1.2045, + "step": 48220 + }, + { + "epoch": 1843.5849056603774, + "grad_norm": 1.73943762799037, + "learning_rate": 7.089717907490048e-06, + "loss": 1.2005, + "step": 48240 + }, + { + "epoch": 1844.3396226415093, + "grad_norm": 2.2562973087741587, + "learning_rate": 7.07536630351465e-06, + "loss": 1.21, + "step": 48260 + }, + { + "epoch": 1845.0943396226414, + "grad_norm": 2.10020840598067, + "learning_rate": 7.061062748296323e-06, + "loss": 1.191, + "step": 48280 + }, + { + "epoch": 1845.8490566037735, + "grad_norm": 1.8070339824697725, + "learning_rate": 7.0468072612359105e-06, + "loss": 1.193, + "step": 48300 + }, + { + "epoch": 1846.6037735849056, + "grad_norm": 1.997808139499102, + "learning_rate": 7.032599861669077e-06, + "loss": 1.2014, + "step": 48320 + }, + { + "epoch": 1847.3584905660377, + "grad_norm": 2.0691188983956277, + "learning_rate": 7.018440568866245e-06, + "loss": 1.1966, + "step": 48340 + }, + { + "epoch": 1848.1132075471698, + "grad_norm": 2.403979236635362, + "learning_rate": 7.004329402032594e-06, + "loss": 1.1782, + "step": 48360 + }, + { + "epoch": 1848.867924528302, + "grad_norm": 2.6004080171234385, + "learning_rate": 6.9902663803080305e-06, + "loss": 1.1804, + "step": 48380 + }, + { + "epoch": 1849.622641509434, + "grad_norm": 1.9209297506929766, + "learning_rate": 6.976251522767146e-06, + "loss": 1.1743, + "step": 48400 + }, + { + "epoch": 1850.377358490566, + "grad_norm": 1.870852128682101, + "learning_rate": 6.962284848419221e-06, + "loss": 1.1968, + "step": 48420 + }, + { + "epoch": 1851.132075471698, + "grad_norm": 2.335271083007723, + "learning_rate": 6.948366376208161e-06, + "loss": 1.1848, + "step": 48440 + }, + { + "epoch": 1851.8867924528302, + "grad_norm": 1.6640538101181919, + "learning_rate": 6.93449612501252e-06, + "loss": 1.201, + "step": 48460 + }, + { + "epoch": 1852.6415094339623, + "grad_norm": 1.8790103798312214, + "learning_rate": 6.920674113645418e-06, + "loss": 1.167, + "step": 48480 + }, + { + "epoch": 1853.3962264150944, + "grad_norm": 1.7578289543420753, + "learning_rate": 6.906900360854565e-06, + "loss": 1.2007, + "step": 48500 + }, + { + "epoch": 1854.1509433962265, + "grad_norm": 1.8615274702658844, + "learning_rate": 6.893174885322198e-06, + "loss": 1.2056, + "step": 48520 + }, + { + "epoch": 1854.9056603773586, + "grad_norm": 2.3236275376684143, + "learning_rate": 6.879497705665089e-06, + "loss": 1.1716, + "step": 48540 + }, + { + "epoch": 1855.6603773584907, + "grad_norm": 2.1963583105378213, + "learning_rate": 6.865868840434493e-06, + "loss": 1.1769, + "step": 48560 + }, + { + "epoch": 1856.4150943396226, + "grad_norm": 2.129799150022101, + "learning_rate": 6.852288308116133e-06, + "loss": 1.1861, + "step": 48580 + }, + { + "epoch": 1857.1698113207547, + "grad_norm": 1.978798574769679, + "learning_rate": 6.8387561271301765e-06, + "loss": 1.1971, + "step": 48600 + }, + { + "epoch": 1857.9245283018868, + "grad_norm": 2.2936585531474596, + "learning_rate": 6.8252723158312055e-06, + "loss": 1.1911, + "step": 48620 + }, + { + "epoch": 1858.6792452830189, + "grad_norm": 1.860277938482895, + "learning_rate": 6.81183689250821e-06, + "loss": 1.1566, + "step": 48640 + }, + { + "epoch": 1859.433962264151, + "grad_norm": 2.017669140863562, + "learning_rate": 6.79844987538453e-06, + "loss": 1.1728, + "step": 48660 + }, + { + "epoch": 1860.188679245283, + "grad_norm": 2.098145579199566, + "learning_rate": 6.785111282617849e-06, + "loss": 1.1934, + "step": 48680 + }, + { + "epoch": 1860.9433962264152, + "grad_norm": 1.835823989245946, + "learning_rate": 6.771821132300191e-06, + "loss": 1.1621, + "step": 48700 + }, + { + "epoch": 1861.698113207547, + "grad_norm": 1.9612631399268534, + "learning_rate": 6.7585794424578464e-06, + "loss": 1.1911, + "step": 48720 + }, + { + "epoch": 1862.4528301886792, + "grad_norm": 2.0755400141270464, + "learning_rate": 6.745386231051399e-06, + "loss": 1.1804, + "step": 48740 + }, + { + "epoch": 1863.2075471698113, + "grad_norm": 2.2075989618364984, + "learning_rate": 6.732241515975663e-06, + "loss": 1.1933, + "step": 48760 + }, + { + "epoch": 1863.9622641509434, + "grad_norm": 1.9214092744343696, + "learning_rate": 6.719145315059678e-06, + "loss": 1.1913, + "step": 48780 + }, + { + "epoch": 1864.7169811320755, + "grad_norm": 2.3557809190891703, + "learning_rate": 6.7060976460666846e-06, + "loss": 1.1905, + "step": 48800 + }, + { + "epoch": 1865.4716981132076, + "grad_norm": 2.114305919520162, + "learning_rate": 6.693098526694083e-06, + "loss": 1.2047, + "step": 48820 + }, + { + "epoch": 1866.2264150943397, + "grad_norm": 1.8242775313878226, + "learning_rate": 6.680147974573452e-06, + "loss": 1.1933, + "step": 48840 + }, + { + "epoch": 1866.9811320754718, + "grad_norm": 2.1056639763813956, + "learning_rate": 6.66724600727046e-06, + "loss": 1.1808, + "step": 48860 + }, + { + "epoch": 1867.7358490566037, + "grad_norm": 1.7165725449830957, + "learning_rate": 6.654392642284892e-06, + "loss": 1.1782, + "step": 48880 + }, + { + "epoch": 1868.4905660377358, + "grad_norm": 1.7341902387718784, + "learning_rate": 6.6415878970506175e-06, + "loss": 1.179, + "step": 48900 + }, + { + "epoch": 1869.245283018868, + "grad_norm": 1.7667425869444906, + "learning_rate": 6.6288317889355535e-06, + "loss": 1.1754, + "step": 48920 + }, + { + "epoch": 1870.0, + "grad_norm": 2.2994252135110655, + "learning_rate": 6.616124335241648e-06, + "loss": 1.1992, + "step": 48940 + }, + { + "epoch": 1870.754716981132, + "grad_norm": 1.7046658246235185, + "learning_rate": 6.603465553204852e-06, + "loss": 1.1811, + "step": 48960 + }, + { + "epoch": 1871.5094339622642, + "grad_norm": 2.1898673540015428, + "learning_rate": 6.5908554599951e-06, + "loss": 1.178, + "step": 48980 + }, + { + "epoch": 1872.2641509433963, + "grad_norm": 1.9804369829197095, + "learning_rate": 6.578294072716292e-06, + "loss": 1.1989, + "step": 49000 + }, + { + "epoch": 1873.0188679245282, + "grad_norm": 2.065726596455928, + "learning_rate": 6.565781408406267e-06, + "loss": 1.1931, + "step": 49020 + }, + { + "epoch": 1873.7735849056603, + "grad_norm": 2.2467788121970123, + "learning_rate": 6.553317484036772e-06, + "loss": 1.2074, + "step": 49040 + }, + { + "epoch": 1874.5283018867924, + "grad_norm": 1.991691603079823, + "learning_rate": 6.5409023165134424e-06, + "loss": 1.1983, + "step": 49060 + }, + { + "epoch": 1875.2830188679245, + "grad_norm": 2.106299625577455, + "learning_rate": 6.528535922675781e-06, + "loss": 1.1956, + "step": 49080 + }, + { + "epoch": 1876.0377358490566, + "grad_norm": 2.037693251120139, + "learning_rate": 6.516218319297147e-06, + "loss": 1.185, + "step": 49100 + }, + { + "epoch": 1876.7924528301887, + "grad_norm": 2.3718612692091763, + "learning_rate": 6.503949523084718e-06, + "loss": 1.1859, + "step": 49120 + }, + { + "epoch": 1877.5471698113208, + "grad_norm": 1.9858435056818156, + "learning_rate": 6.491729550679461e-06, + "loss": 1.2076, + "step": 49140 + }, + { + "epoch": 1878.301886792453, + "grad_norm": 2.147814028235424, + "learning_rate": 6.479558418656134e-06, + "loss": 1.1682, + "step": 49160 + }, + { + "epoch": 1879.0566037735848, + "grad_norm": 1.7628164718106505, + "learning_rate": 6.467436143523228e-06, + "loss": 1.1791, + "step": 49180 + }, + { + "epoch": 1879.811320754717, + "grad_norm": 1.9837896355936764, + "learning_rate": 6.455362741722995e-06, + "loss": 1.1977, + "step": 49200 + }, + { + "epoch": 1880.566037735849, + "grad_norm": 2.0613808893064327, + "learning_rate": 6.44333822963138e-06, + "loss": 1.1738, + "step": 49220 + }, + { + "epoch": 1881.3207547169811, + "grad_norm": 1.6736931908615154, + "learning_rate": 6.431362623558018e-06, + "loss": 1.1774, + "step": 49240 + }, + { + "epoch": 1882.0754716981132, + "grad_norm": 1.691911714014794, + "learning_rate": 6.4194359397462055e-06, + "loss": 1.1666, + "step": 49260 + }, + { + "epoch": 1882.8301886792453, + "grad_norm": 2.1345911027894138, + "learning_rate": 6.4075581943728944e-06, + "loss": 1.1973, + "step": 49280 + }, + { + "epoch": 1883.5849056603774, + "grad_norm": 1.9512349129787812, + "learning_rate": 6.395729403548645e-06, + "loss": 1.1672, + "step": 49300 + }, + { + "epoch": 1884.3396226415093, + "grad_norm": 2.6451924153676125, + "learning_rate": 6.383949583317629e-06, + "loss": 1.1695, + "step": 49320 + }, + { + "epoch": 1885.0943396226414, + "grad_norm": 1.783294063259621, + "learning_rate": 6.372218749657584e-06, + "loss": 1.1648, + "step": 49340 + }, + { + "epoch": 1885.8490566037735, + "grad_norm": 2.3799777683561967, + "learning_rate": 6.360536918479806e-06, + "loss": 1.1776, + "step": 49360 + }, + { + "epoch": 1886.6037735849056, + "grad_norm": 2.0124780882138347, + "learning_rate": 6.348904105629139e-06, + "loss": 1.1884, + "step": 49380 + }, + { + "epoch": 1887.3584905660377, + "grad_norm": 1.8426672524927896, + "learning_rate": 6.3373203268839345e-06, + "loss": 1.1842, + "step": 49400 + }, + { + "epoch": 1888.1132075471698, + "grad_norm": 2.0963675882931274, + "learning_rate": 6.325785597956021e-06, + "loss": 1.1807, + "step": 49420 + }, + { + "epoch": 1888.867924528302, + "grad_norm": 1.9137633109249375, + "learning_rate": 6.314299934490717e-06, + "loss": 1.1932, + "step": 49440 + }, + { + "epoch": 1889.622641509434, + "grad_norm": 2.2927620018796033, + "learning_rate": 6.3028633520667744e-06, + "loss": 1.186, + "step": 49460 + }, + { + "epoch": 1890.377358490566, + "grad_norm": 2.7433533031518182, + "learning_rate": 6.291475866196384e-06, + "loss": 1.1363, + "step": 49480 + }, + { + "epoch": 1891.132075471698, + "grad_norm": 1.563437023715403, + "learning_rate": 6.280137492325147e-06, + "loss": 1.2093, + "step": 49500 + }, + { + "epoch": 1891.8867924528302, + "grad_norm": 2.793822111662886, + "learning_rate": 6.2688482458320434e-06, + "loss": 1.1751, + "step": 49520 + }, + { + "epoch": 1892.6415094339623, + "grad_norm": 1.8418670948197584, + "learning_rate": 6.25760814202941e-06, + "loss": 1.1658, + "step": 49540 + }, + { + "epoch": 1893.3962264150944, + "grad_norm": 1.8332744098429328, + "learning_rate": 6.246417196162944e-06, + "loss": 1.1654, + "step": 49560 + }, + { + "epoch": 1894.1509433962265, + "grad_norm": 1.7817661421186255, + "learning_rate": 6.235275423411659e-06, + "loss": 1.1764, + "step": 49580 + }, + { + "epoch": 1894.9056603773586, + "grad_norm": 1.9495189221186473, + "learning_rate": 6.224182838887876e-06, + "loss": 1.1529, + "step": 49600 + }, + { + "epoch": 1895.6603773584907, + "grad_norm": 2.4039058315851447, + "learning_rate": 6.213139457637196e-06, + "loss": 1.1747, + "step": 49620 + }, + { + "epoch": 1896.4150943396226, + "grad_norm": 1.875771973172552, + "learning_rate": 6.202145294638478e-06, + "loss": 1.1821, + "step": 49640 + }, + { + "epoch": 1897.1698113207547, + "grad_norm": 2.1999372490425393, + "learning_rate": 6.191200364803824e-06, + "loss": 1.1813, + "step": 49660 + }, + { + "epoch": 1897.9245283018868, + "grad_norm": 11.988202176475387, + "learning_rate": 6.180304682978568e-06, + "loss": 1.1569, + "step": 49680 + }, + { + "epoch": 1898.6792452830189, + "grad_norm": 1.8662829336756046, + "learning_rate": 6.169458263941242e-06, + "loss": 1.1816, + "step": 49700 + }, + { + "epoch": 1899.433962264151, + "grad_norm": 2.3098966440534294, + "learning_rate": 6.158661122403553e-06, + "loss": 1.1581, + "step": 49720 + }, + { + "epoch": 1900.188679245283, + "grad_norm": 2.332658522584547, + "learning_rate": 6.1479132730103704e-06, + "loss": 1.1946, + "step": 49740 + }, + { + "epoch": 1900.9433962264152, + "grad_norm": 1.7105735490477962, + "learning_rate": 6.137214730339707e-06, + "loss": 1.1868, + "step": 49760 + }, + { + "epoch": 1901.698113207547, + "grad_norm": 2.243808666742797, + "learning_rate": 6.126565508902698e-06, + "loss": 1.1599, + "step": 49780 + }, + { + "epoch": 1902.4528301886792, + "grad_norm": 1.8783312097697262, + "learning_rate": 6.115965623143589e-06, + "loss": 1.1621, + "step": 49800 + }, + { + "epoch": 1903.2075471698113, + "grad_norm": 1.7166955372139616, + "learning_rate": 6.105415087439699e-06, + "loss": 1.1862, + "step": 49820 + }, + { + "epoch": 1903.9622641509434, + "grad_norm": 2.0340234917391524, + "learning_rate": 6.094913916101413e-06, + "loss": 1.1561, + "step": 49840 + }, + { + "epoch": 1904.7169811320755, + "grad_norm": 1.8220266868042787, + "learning_rate": 6.084462123372144e-06, + "loss": 1.1749, + "step": 49860 + }, + { + "epoch": 1905.4716981132076, + "grad_norm": 2.3373227334868973, + "learning_rate": 6.07405972342837e-06, + "loss": 1.2081, + "step": 49880 + }, + { + "epoch": 1906.2264150943397, + "grad_norm": 2.31770817514565, + "learning_rate": 6.063706730379534e-06, + "loss": 1.1705, + "step": 49900 + }, + { + "epoch": 1906.9811320754718, + "grad_norm": 1.749701769225866, + "learning_rate": 6.053403158268086e-06, + "loss": 1.1732, + "step": 49920 + }, + { + "epoch": 1907.7358490566037, + "grad_norm": 2.0136702275524736, + "learning_rate": 6.043149021069432e-06, + "loss": 1.1789, + "step": 49940 + }, + { + "epoch": 1908.4905660377358, + "grad_norm": 1.8991267563990468, + "learning_rate": 6.032944332691932e-06, + "loss": 1.1691, + "step": 49960 + }, + { + "epoch": 1909.245283018868, + "grad_norm": 2.178284333271757, + "learning_rate": 6.02278910697688e-06, + "loss": 1.1698, + "step": 49980 + }, + { + "epoch": 1910.0, + "grad_norm": 2.270879262505861, + "learning_rate": 6.012683357698476e-06, + "loss": 1.1424, + "step": 50000 + }, + { + "epoch": 1910.754716981132, + "grad_norm": 1.6800724855002753, + "learning_rate": 6.0026270985638094e-06, + "loss": 1.1405, + "step": 50020 + }, + { + "epoch": 1911.5094339622642, + "grad_norm": 1.9982510142589247, + "learning_rate": 5.9926203432128405e-06, + "loss": 1.1811, + "step": 50040 + }, + { + "epoch": 1912.2641509433963, + "grad_norm": 2.072677046394058, + "learning_rate": 5.98266310521839e-06, + "loss": 1.1832, + "step": 50060 + }, + { + "epoch": 1913.0188679245282, + "grad_norm": 1.8518177038658126, + "learning_rate": 5.972755398086119e-06, + "loss": 1.1768, + "step": 50080 + }, + { + "epoch": 1913.7735849056603, + "grad_norm": 1.7233232781661019, + "learning_rate": 5.9628972352545016e-06, + "loss": 1.1916, + "step": 50100 + }, + { + "epoch": 1914.5283018867924, + "grad_norm": 2.2438247684764776, + "learning_rate": 5.953088630094804e-06, + "loss": 1.1965, + "step": 50120 + }, + { + "epoch": 1915.2830188679245, + "grad_norm": 2.354329582753457, + "learning_rate": 5.943329595911085e-06, + "loss": 1.1657, + "step": 50140 + }, + { + "epoch": 1916.0377358490566, + "grad_norm": 2.0821470705714558, + "learning_rate": 5.933620145940163e-06, + "loss": 1.1733, + "step": 50160 + }, + { + "epoch": 1916.7924528301887, + "grad_norm": 2.3851614247004513, + "learning_rate": 5.92396029335161e-06, + "loss": 1.1973, + "step": 50180 + }, + { + "epoch": 1917.5471698113208, + "grad_norm": 2.165021041548156, + "learning_rate": 5.91435005124771e-06, + "loss": 1.1605, + "step": 50200 + }, + { + "epoch": 1918.301886792453, + "grad_norm": 1.8316074131304803, + "learning_rate": 5.904789432663471e-06, + "loss": 1.175, + "step": 50220 + }, + { + "epoch": 1919.0566037735848, + "grad_norm": 2.2444762984325517, + "learning_rate": 5.8952784505665775e-06, + "loss": 1.1546, + "step": 50240 + }, + { + "epoch": 1919.811320754717, + "grad_norm": 1.990381226210719, + "learning_rate": 5.885817117857409e-06, + "loss": 1.1734, + "step": 50260 + }, + { + "epoch": 1920.566037735849, + "grad_norm": 2.1012346230151935, + "learning_rate": 5.876405447368989e-06, + "loss": 1.1726, + "step": 50280 + }, + { + "epoch": 1921.3207547169811, + "grad_norm": 2.036740240000707, + "learning_rate": 5.867043451866989e-06, + "loss": 1.1858, + "step": 50300 + }, + { + "epoch": 1922.0754716981132, + "grad_norm": 1.9828813541843844, + "learning_rate": 5.85773114404969e-06, + "loss": 1.1523, + "step": 50320 + }, + { + "epoch": 1922.8301886792453, + "grad_norm": 2.1278126328460196, + "learning_rate": 5.848468536547991e-06, + "loss": 1.1886, + "step": 50340 + }, + { + "epoch": 1923.5849056603774, + "grad_norm": 2.315788726027488, + "learning_rate": 5.8392556419253755e-06, + "loss": 1.1686, + "step": 50360 + }, + { + "epoch": 1924.3396226415093, + "grad_norm": 2.3735624423680117, + "learning_rate": 5.830092472677899e-06, + "loss": 1.1584, + "step": 50380 + }, + { + "epoch": 1925.0943396226414, + "grad_norm": 2.3872016424634093, + "learning_rate": 5.820979041234169e-06, + "loss": 1.1859, + "step": 50400 + }, + { + "epoch": 1925.8490566037735, + "grad_norm": 2.0316120352053115, + "learning_rate": 5.811915359955322e-06, + "loss": 1.1578, + "step": 50420 + }, + { + "epoch": 1926.6037735849056, + "grad_norm": 1.9955741026809004, + "learning_rate": 5.8029014411350336e-06, + "loss": 1.1699, + "step": 50440 + }, + { + "epoch": 1927.3584905660377, + "grad_norm": 2.671513853147586, + "learning_rate": 5.793937296999476e-06, + "loss": 1.1613, + "step": 50460 + }, + { + "epoch": 1928.1132075471698, + "grad_norm": 2.085910842457962, + "learning_rate": 5.785022939707302e-06, + "loss": 1.1919, + "step": 50480 + }, + { + "epoch": 1928.867924528302, + "grad_norm": 2.15354309947986, + "learning_rate": 5.77615838134964e-06, + "loss": 1.1766, + "step": 50500 + }, + { + "epoch": 1929.622641509434, + "grad_norm": 2.5693650339132463, + "learning_rate": 5.76734363395007e-06, + "loss": 1.175, + "step": 50520 + }, + { + "epoch": 1930.377358490566, + "grad_norm": 3.2479427163076533, + "learning_rate": 5.7585787094646196e-06, + "loss": 1.1703, + "step": 50540 + }, + { + "epoch": 1931.132075471698, + "grad_norm": 2.1161416369904695, + "learning_rate": 5.749863619781723e-06, + "loss": 1.1657, + "step": 50560 + }, + { + "epoch": 1931.8867924528302, + "grad_norm": 1.7714808950845444, + "learning_rate": 5.7411983767222415e-06, + "loss": 1.1717, + "step": 50580 + }, + { + "epoch": 1932.6415094339623, + "grad_norm": 2.535706381586084, + "learning_rate": 5.732582992039398e-06, + "loss": 1.1553, + "step": 50600 + }, + { + "epoch": 1933.3962264150944, + "grad_norm": 2.1037009756450527, + "learning_rate": 5.724017477418814e-06, + "loss": 1.1771, + "step": 50620 + }, + { + "epoch": 1934.1509433962265, + "grad_norm": 1.849746935628885, + "learning_rate": 5.7155018444784526e-06, + "loss": 1.1422, + "step": 50640 + }, + { + "epoch": 1934.9056603773586, + "grad_norm": 1.9210522037566025, + "learning_rate": 5.707036104768635e-06, + "loss": 1.1756, + "step": 50660 + }, + { + "epoch": 1935.6603773584907, + "grad_norm": 1.819206038769788, + "learning_rate": 5.698620269771997e-06, + "loss": 1.1916, + "step": 50680 + }, + { + "epoch": 1936.4150943396226, + "grad_norm": 2.5377205844625417, + "learning_rate": 5.690254350903488e-06, + "loss": 1.1619, + "step": 50700 + }, + { + "epoch": 1937.1698113207547, + "grad_norm": 2.2007521096063902, + "learning_rate": 5.681938359510347e-06, + "loss": 1.1846, + "step": 50720 + }, + { + "epoch": 1937.9245283018868, + "grad_norm": 2.192606880082283, + "learning_rate": 5.673672306872103e-06, + "loss": 1.1699, + "step": 50740 + }, + { + "epoch": 1938.6792452830189, + "grad_norm": 2.1766069540448436, + "learning_rate": 5.665456204200552e-06, + "loss": 1.1871, + "step": 50760 + }, + { + "epoch": 1939.433962264151, + "grad_norm": 1.9751649291014899, + "learning_rate": 5.657290062639727e-06, + "loss": 1.1474, + "step": 50780 + }, + { + "epoch": 1940.188679245283, + "grad_norm": 1.890603847246591, + "learning_rate": 5.6491738932659e-06, + "loss": 1.1559, + "step": 50800 + }, + { + "epoch": 1940.9433962264152, + "grad_norm": 1.975301210235016, + "learning_rate": 5.641107707087573e-06, + "loss": 1.1521, + "step": 50820 + }, + { + "epoch": 1941.698113207547, + "grad_norm": 1.8441779800267277, + "learning_rate": 5.6330915150454375e-06, + "loss": 1.145, + "step": 50840 + }, + { + "epoch": 1942.4528301886792, + "grad_norm": 2.4451642203033064, + "learning_rate": 5.625125328012387e-06, + "loss": 1.1791, + "step": 50860 + }, + { + "epoch": 1943.2075471698113, + "grad_norm": 2.127622782788785, + "learning_rate": 5.617209156793476e-06, + "loss": 1.1471, + "step": 50880 + }, + { + "epoch": 1943.9622641509434, + "grad_norm": 2.0779587981444427, + "learning_rate": 5.609343012125934e-06, + "loss": 1.1537, + "step": 50900 + }, + { + "epoch": 1944.7169811320755, + "grad_norm": 2.1654459900473872, + "learning_rate": 5.601526904679125e-06, + "loss": 1.1609, + "step": 50920 + }, + { + "epoch": 1945.4716981132076, + "grad_norm": 1.8696254811746238, + "learning_rate": 5.593760845054552e-06, + "loss": 1.1523, + "step": 50940 + }, + { + "epoch": 1946.2264150943397, + "grad_norm": 1.7856722997496786, + "learning_rate": 5.586044843785832e-06, + "loss": 1.2012, + "step": 50960 + }, + { + "epoch": 1946.9811320754718, + "grad_norm": 1.9401895010628936, + "learning_rate": 5.578378911338684e-06, + "loss": 1.1384, + "step": 50980 + }, + { + "epoch": 1947.7358490566037, + "grad_norm": 2.1643993698581077, + "learning_rate": 5.570763058110911e-06, + "loss": 1.1645, + "step": 51000 + }, + { + "epoch": 1948.4905660377358, + "grad_norm": 2.180473463448981, + "learning_rate": 5.563197294432395e-06, + "loss": 1.1382, + "step": 51020 + }, + { + "epoch": 1949.245283018868, + "grad_norm": 1.9596617879790443, + "learning_rate": 5.555681630565088e-06, + "loss": 1.1539, + "step": 51040 + }, + { + "epoch": 1950.0, + "grad_norm": 1.8975845056567062, + "learning_rate": 5.548216076702974e-06, + "loss": 1.144, + "step": 51060 + }, + { + "epoch": 1950.754716981132, + "grad_norm": 2.188450696803476, + "learning_rate": 5.540800642972071e-06, + "loss": 1.1532, + "step": 51080 + }, + { + "epoch": 1951.5094339622642, + "grad_norm": 1.8285975742024299, + "learning_rate": 5.533435339430416e-06, + "loss": 1.1949, + "step": 51100 + }, + { + "epoch": 1952.2641509433963, + "grad_norm": 1.812418268110745, + "learning_rate": 5.526120176068055e-06, + "loss": 1.1613, + "step": 51120 + }, + { + "epoch": 1953.0188679245282, + "grad_norm": 1.9694834187837782, + "learning_rate": 5.518855162807036e-06, + "loss": 1.1749, + "step": 51140 + }, + { + "epoch": 1953.7735849056603, + "grad_norm": 2.014411336027095, + "learning_rate": 5.511640309501359e-06, + "loss": 1.1364, + "step": 51160 + }, + { + "epoch": 1954.5283018867924, + "grad_norm": 1.663001146626253, + "learning_rate": 5.504475625937011e-06, + "loss": 1.1469, + "step": 51180 + }, + { + "epoch": 1955.2830188679245, + "grad_norm": 3.7966652139269756, + "learning_rate": 5.497361121831918e-06, + "loss": 1.1634, + "step": 51200 + }, + { + "epoch": 1956.0377358490566, + "grad_norm": 1.6321676665862368, + "learning_rate": 5.490296806835955e-06, + "loss": 1.1747, + "step": 51220 + }, + { + "epoch": 1956.7924528301887, + "grad_norm": 1.8693551118320602, + "learning_rate": 5.483282690530914e-06, + "loss": 1.1513, + "step": 51240 + }, + { + "epoch": 1957.5471698113208, + "grad_norm": 1.7254591597688926, + "learning_rate": 5.476318782430499e-06, + "loss": 1.1384, + "step": 51260 + }, + { + "epoch": 1958.301886792453, + "grad_norm": 2.1059814541036284, + "learning_rate": 5.469405091980319e-06, + "loss": 1.145, + "step": 51280 + }, + { + "epoch": 1959.0566037735848, + "grad_norm": 2.4150150564267956, + "learning_rate": 5.462541628557862e-06, + "loss": 1.1727, + "step": 51300 + }, + { + "epoch": 1959.811320754717, + "grad_norm": 2.072986751089322, + "learning_rate": 5.4557284014725005e-06, + "loss": 1.1632, + "step": 51320 + }, + { + "epoch": 1960.566037735849, + "grad_norm": 1.7011080715428424, + "learning_rate": 5.448965419965458e-06, + "loss": 1.1719, + "step": 51340 + }, + { + "epoch": 1961.3207547169811, + "grad_norm": 2.050321684694806, + "learning_rate": 5.442252693209813e-06, + "loss": 1.1523, + "step": 51360 + }, + { + "epoch": 1962.0754716981132, + "grad_norm": 2.3154046947609603, + "learning_rate": 5.4355902303104744e-06, + "loss": 1.1365, + "step": 51380 + }, + { + "epoch": 1962.8301886792453, + "grad_norm": 2.292815295745735, + "learning_rate": 5.4289780403041805e-06, + "loss": 1.1595, + "step": 51400 + }, + { + "epoch": 1963.5849056603774, + "grad_norm": 2.1444563447901253, + "learning_rate": 5.422416132159477e-06, + "loss": 1.1609, + "step": 51420 + }, + { + "epoch": 1964.3396226415093, + "grad_norm": 1.8223405112306774, + "learning_rate": 5.415904514776712e-06, + "loss": 1.128, + "step": 51440 + }, + { + "epoch": 1965.0943396226414, + "grad_norm": 1.9601698616488796, + "learning_rate": 5.40944319698802e-06, + "loss": 1.1785, + "step": 51460 + }, + { + "epoch": 1965.8490566037735, + "grad_norm": 2.570580210466246, + "learning_rate": 5.403032187557308e-06, + "loss": 1.147, + "step": 51480 + }, + { + "epoch": 1966.6037735849056, + "grad_norm": 2.2935471508100553, + "learning_rate": 5.396671495180257e-06, + "loss": 1.1777, + "step": 51500 + }, + { + "epoch": 1967.3584905660377, + "grad_norm": 1.987678318177208, + "learning_rate": 5.390361128484278e-06, + "loss": 1.1283, + "step": 51520 + }, + { + "epoch": 1968.1132075471698, + "grad_norm": 1.9732671384472393, + "learning_rate": 5.38410109602855e-06, + "loss": 1.1631, + "step": 51540 + }, + { + "epoch": 1968.867924528302, + "grad_norm": 2.031999390800106, + "learning_rate": 5.37789140630396e-06, + "loss": 1.1498, + "step": 51560 + }, + { + "epoch": 1969.622641509434, + "grad_norm": 3.612362619019224, + "learning_rate": 5.3717320677331165e-06, + "loss": 1.1449, + "step": 51580 + }, + { + "epoch": 1970.377358490566, + "grad_norm": 2.7552728484649216, + "learning_rate": 5.365623088670337e-06, + "loss": 1.1221, + "step": 51600 + }, + { + "epoch": 1971.132075471698, + "grad_norm": 2.095240315052155, + "learning_rate": 5.359564477401625e-06, + "loss": 1.1635, + "step": 51620 + }, + { + "epoch": 1971.8867924528302, + "grad_norm": 2.051375314186468, + "learning_rate": 5.353556242144684e-06, + "loss": 1.1768, + "step": 51640 + }, + { + "epoch": 1972.6415094339623, + "grad_norm": 1.7681282740339075, + "learning_rate": 5.3475983910488705e-06, + "loss": 1.1524, + "step": 51660 + }, + { + "epoch": 1973.3962264150944, + "grad_norm": 1.9784394422194775, + "learning_rate": 5.34169093219521e-06, + "loss": 1.1694, + "step": 51680 + }, + { + "epoch": 1974.1509433962265, + "grad_norm": 2.1190098778323887, + "learning_rate": 5.3358338735963825e-06, + "loss": 1.1546, + "step": 51700 + }, + { + "epoch": 1974.9056603773586, + "grad_norm": 1.6461495183245571, + "learning_rate": 5.3300272231966895e-06, + "loss": 1.1597, + "step": 51720 + }, + { + "epoch": 1975.6603773584907, + "grad_norm": 2.287937258261333, + "learning_rate": 5.3242709888720875e-06, + "loss": 1.1565, + "step": 51740 + }, + { + "epoch": 1976.4150943396226, + "grad_norm": 1.971738312330891, + "learning_rate": 5.318565178430121e-06, + "loss": 1.1646, + "step": 51760 + }, + { + "epoch": 1977.1698113207547, + "grad_norm": 1.733242389596805, + "learning_rate": 5.312909799609962e-06, + "loss": 1.1507, + "step": 51780 + }, + { + "epoch": 1977.9245283018868, + "grad_norm": 2.2381830913006486, + "learning_rate": 5.307304860082375e-06, + "loss": 1.161, + "step": 51800 + }, + { + "epoch": 1978.6792452830189, + "grad_norm": 1.7639744175828544, + "learning_rate": 5.3017503674497e-06, + "loss": 1.1639, + "step": 51820 + }, + { + "epoch": 1979.433962264151, + "grad_norm": 2.142096606558369, + "learning_rate": 5.296246329245867e-06, + "loss": 1.145, + "step": 51840 + }, + { + "epoch": 1980.188679245283, + "grad_norm": 2.1364940736571905, + "learning_rate": 5.29079275293636e-06, + "loss": 1.1445, + "step": 51860 + }, + { + "epoch": 1980.9433962264152, + "grad_norm": 2.0408860512130063, + "learning_rate": 5.285389645918224e-06, + "loss": 1.1684, + "step": 51880 + }, + { + "epoch": 1981.698113207547, + "grad_norm": 2.1484279394512984, + "learning_rate": 5.280037015520047e-06, + "loss": 1.1427, + "step": 51900 + }, + { + "epoch": 1982.4528301886792, + "grad_norm": 1.8875817727112376, + "learning_rate": 5.27473486900196e-06, + "loss": 1.127, + "step": 51920 + }, + { + "epoch": 1983.2075471698113, + "grad_norm": 1.9694696435513541, + "learning_rate": 5.269483213555604e-06, + "loss": 1.1631, + "step": 51940 + }, + { + "epoch": 1983.9622641509434, + "grad_norm": 1.8852852930999937, + "learning_rate": 5.264282056304144e-06, + "loss": 1.1476, + "step": 51960 + }, + { + "epoch": 1984.7169811320755, + "grad_norm": 2.0442189239889488, + "learning_rate": 5.259131404302259e-06, + "loss": 1.1772, + "step": 51980 + }, + { + "epoch": 1985.4716981132076, + "grad_norm": 2.115147564108749, + "learning_rate": 5.254031264536109e-06, + "loss": 1.1451, + "step": 52000 + } + ], + "logging_steps": 20, + "max_steps": 54000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2077, + "save_steps": 4000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8771778183168000.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}