{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1985.4716981132076, "eval_steps": 500, "global_step": 52000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.7547169811320755, "grad_norm": 30.673773492701116, "learning_rate": 3.2000000000000005e-05, "loss": 9.2984, "step": 20 }, { "epoch": 1.509433962264151, "grad_norm": 14.373295694588162, "learning_rate": 6.400000000000001e-05, "loss": 7.8844, "step": 40 }, { "epoch": 2.2641509433962264, "grad_norm": 19.800138134391982, "learning_rate": 7.99999818716091e-05, "loss": 6.0394, "step": 60 }, { "epoch": 3.018867924528302, "grad_norm": 2.662186456025038, "learning_rate": 7.9999836844587e-05, "loss": 4.373, "step": 80 }, { "epoch": 3.7735849056603774, "grad_norm": 2.9944135324779984, "learning_rate": 7.999954679110365e-05, "loss": 3.93, "step": 100 }, { "epoch": 4.528301886792453, "grad_norm": 6.144462097933044, "learning_rate": 7.999911171228081e-05, "loss": 3.8796, "step": 120 }, { "epoch": 5.283018867924528, "grad_norm": 1.885170861907592, "learning_rate": 7.999853160980113e-05, "loss": 3.7437, "step": 140 }, { "epoch": 6.037735849056604, "grad_norm": 1.794246585785847, "learning_rate": 7.999780648590806e-05, "loss": 3.6892, "step": 160 }, { "epoch": 6.7924528301886795, "grad_norm": 12.766182271547313, "learning_rate": 7.999693634340594e-05, "loss": 3.6124, "step": 180 }, { "epoch": 7.547169811320755, "grad_norm": 3.014805105995273, "learning_rate": 7.999592118565996e-05, "loss": 3.4224, "step": 200 }, { "epoch": 8.30188679245283, "grad_norm": 1.6809726247876995, "learning_rate": 7.999476101659613e-05, "loss": 3.2811, "step": 220 }, { "epoch": 9.056603773584905, "grad_norm": 1.7074366581291722, "learning_rate": 7.999345584070125e-05, "loss": 3.2012, "step": 240 }, { "epoch": 9.81132075471698, "grad_norm": 1.6971425854714368, "learning_rate": 7.999200566302298e-05, "loss": 3.1281, "step": 260 }, { "epoch": 10.566037735849056, "grad_norm": 2.2456748845663155, "learning_rate": 7.999041048916969e-05, "loss": 3.1133, "step": 280 }, { "epoch": 11.320754716981131, "grad_norm": 2.301731796932263, "learning_rate": 7.998867032531055e-05, "loss": 3.0835, "step": 300 }, { "epoch": 12.075471698113208, "grad_norm": 1.9076791695070954, "learning_rate": 7.998678517817546e-05, "loss": 3.0189, "step": 320 }, { "epoch": 12.830188679245284, "grad_norm": 1.8142406149549974, "learning_rate": 7.9984755055055e-05, "loss": 3.0019, "step": 340 }, { "epoch": 13.584905660377359, "grad_norm": 1.747017241682793, "learning_rate": 7.998257996380048e-05, "loss": 2.9866, "step": 360 }, { "epoch": 14.339622641509434, "grad_norm": 1.430663205424301, "learning_rate": 7.998025991282377e-05, "loss": 3.0026, "step": 380 }, { "epoch": 15.09433962264151, "grad_norm": 2.0885624099422566, "learning_rate": 7.997779491109745e-05, "loss": 2.946, "step": 400 }, { "epoch": 15.849056603773585, "grad_norm": 1.858513295415329, "learning_rate": 7.997518496815465e-05, "loss": 2.9293, "step": 420 }, { "epoch": 16.60377358490566, "grad_norm": 4.071990410305379, "learning_rate": 7.9972430094089e-05, "loss": 2.8812, "step": 440 }, { "epoch": 17.358490566037737, "grad_norm": 1.7862953717408494, "learning_rate": 7.996953029955468e-05, "loss": 2.9285, "step": 460 }, { "epoch": 18.11320754716981, "grad_norm": 1.6791350922351642, "learning_rate": 7.996648559576633e-05, "loss": 2.9062, "step": 480 }, { "epoch": 18.867924528301888, "grad_norm": 2.14320095701687, "learning_rate": 7.996329599449902e-05, "loss": 2.8974, "step": 500 }, { "epoch": 19.62264150943396, "grad_norm": 1.645724273362961, "learning_rate": 7.995996150808815e-05, "loss": 2.842, "step": 520 }, { "epoch": 20.37735849056604, "grad_norm": 1.6833734047290758, "learning_rate": 7.99564821494295e-05, "loss": 2.8603, "step": 540 }, { "epoch": 21.132075471698112, "grad_norm": 1.680341673033382, "learning_rate": 7.995285793197909e-05, "loss": 2.8275, "step": 560 }, { "epoch": 21.88679245283019, "grad_norm": 1.5669198141799856, "learning_rate": 7.994908886975317e-05, "loss": 2.8495, "step": 580 }, { "epoch": 22.641509433962263, "grad_norm": 1.572130529899407, "learning_rate": 7.99451749773282e-05, "loss": 2.7967, "step": 600 }, { "epoch": 23.39622641509434, "grad_norm": 2.0314221097665413, "learning_rate": 7.994111626984069e-05, "loss": 2.8023, "step": 620 }, { "epoch": 24.150943396226417, "grad_norm": 2.524880591484311, "learning_rate": 7.993691276298728e-05, "loss": 2.7928, "step": 640 }, { "epoch": 24.90566037735849, "grad_norm": 3.1215250835282884, "learning_rate": 7.993256447302454e-05, "loss": 2.7998, "step": 660 }, { "epoch": 25.660377358490567, "grad_norm": 1.2555800634223058, "learning_rate": 7.9928071416769e-05, "loss": 2.7862, "step": 680 }, { "epoch": 26.41509433962264, "grad_norm": 1.5918196349327507, "learning_rate": 7.992343361159705e-05, "loss": 2.7825, "step": 700 }, { "epoch": 27.169811320754718, "grad_norm": 1.4446288187541776, "learning_rate": 7.991865107544492e-05, "loss": 2.7566, "step": 720 }, { "epoch": 27.92452830188679, "grad_norm": 1.5498155400201465, "learning_rate": 7.991372382680851e-05, "loss": 2.7341, "step": 740 }, { "epoch": 28.67924528301887, "grad_norm": 1.9707678993278974, "learning_rate": 7.99086518847434e-05, "loss": 2.7315, "step": 760 }, { "epoch": 29.433962264150942, "grad_norm": 1.663827192167774, "learning_rate": 7.99034352688648e-05, "loss": 2.733, "step": 780 }, { "epoch": 30.18867924528302, "grad_norm": 1.9883280378959602, "learning_rate": 7.989807399934738e-05, "loss": 2.7323, "step": 800 }, { "epoch": 30.943396226415093, "grad_norm": 1.535721464256961, "learning_rate": 7.989256809692524e-05, "loss": 2.7081, "step": 820 }, { "epoch": 31.69811320754717, "grad_norm": 1.4966431407465013, "learning_rate": 7.988691758289184e-05, "loss": 2.694, "step": 840 }, { "epoch": 32.45283018867924, "grad_norm": 1.9684497666022975, "learning_rate": 7.988112247909996e-05, "loss": 2.6947, "step": 860 }, { "epoch": 33.20754716981132, "grad_norm": 1.2512316865338262, "learning_rate": 7.987518280796148e-05, "loss": 2.7216, "step": 880 }, { "epoch": 33.9622641509434, "grad_norm": 1.2566119692549285, "learning_rate": 7.986909859244743e-05, "loss": 2.6675, "step": 900 }, { "epoch": 34.716981132075475, "grad_norm": 1.8282134080395354, "learning_rate": 7.986286985608782e-05, "loss": 2.6712, "step": 920 }, { "epoch": 35.471698113207545, "grad_norm": 1.3075929724419728, "learning_rate": 7.985649662297164e-05, "loss": 2.668, "step": 940 }, { "epoch": 36.22641509433962, "grad_norm": 1.4411328906456615, "learning_rate": 7.984997891774664e-05, "loss": 2.6937, "step": 960 }, { "epoch": 36.9811320754717, "grad_norm": 1.8430700250981429, "learning_rate": 7.984331676561932e-05, "loss": 2.6798, "step": 980 }, { "epoch": 37.735849056603776, "grad_norm": 1.4511454692578831, "learning_rate": 7.983651019235483e-05, "loss": 2.6637, "step": 1000 }, { "epoch": 38.490566037735846, "grad_norm": 1.502704294446435, "learning_rate": 7.982955922427681e-05, "loss": 2.6688, "step": 1020 }, { "epoch": 39.24528301886792, "grad_norm": 1.4028762887194124, "learning_rate": 7.982246388826741e-05, "loss": 2.6086, "step": 1040 }, { "epoch": 40.0, "grad_norm": 1.539721758237447, "learning_rate": 7.981522421176697e-05, "loss": 2.6084, "step": 1060 }, { "epoch": 40.75471698113208, "grad_norm": 1.77576635302677, "learning_rate": 7.980784022277421e-05, "loss": 2.6216, "step": 1080 }, { "epoch": 41.509433962264154, "grad_norm": 1.4874495954369062, "learning_rate": 7.980031194984588e-05, "loss": 2.6328, "step": 1100 }, { "epoch": 42.264150943396224, "grad_norm": 1.8846242927156294, "learning_rate": 7.979263942209669e-05, "loss": 2.6427, "step": 1120 }, { "epoch": 43.0188679245283, "grad_norm": 1.6981877996408483, "learning_rate": 7.978482266919936e-05, "loss": 2.6224, "step": 1140 }, { "epoch": 43.77358490566038, "grad_norm": 1.3746555989630926, "learning_rate": 7.977686172138426e-05, "loss": 2.6011, "step": 1160 }, { "epoch": 44.528301886792455, "grad_norm": 1.377657678804025, "learning_rate": 7.97687566094395e-05, "loss": 2.6086, "step": 1180 }, { "epoch": 45.283018867924525, "grad_norm": 1.2094636718352942, "learning_rate": 7.976050736471069e-05, "loss": 2.582, "step": 1200 }, { "epoch": 46.0377358490566, "grad_norm": 1.4433837187551148, "learning_rate": 7.975211401910087e-05, "loss": 2.6294, "step": 1220 }, { "epoch": 46.79245283018868, "grad_norm": 1.5026382784404573, "learning_rate": 7.97435766050704e-05, "loss": 2.5993, "step": 1240 }, { "epoch": 47.54716981132076, "grad_norm": 1.2094136471599368, "learning_rate": 7.973489515563676e-05, "loss": 2.6164, "step": 1260 }, { "epoch": 48.301886792452834, "grad_norm": 1.394688364908413, "learning_rate": 7.972606970437446e-05, "loss": 2.6056, "step": 1280 }, { "epoch": 49.056603773584904, "grad_norm": 1.271568801692499, "learning_rate": 7.971710028541502e-05, "loss": 2.5755, "step": 1300 }, { "epoch": 49.81132075471698, "grad_norm": 1.4259670316825253, "learning_rate": 7.970798693344663e-05, "loss": 2.5759, "step": 1320 }, { "epoch": 50.56603773584906, "grad_norm": 1.3071538390073274, "learning_rate": 7.969872968371418e-05, "loss": 2.6031, "step": 1340 }, { "epoch": 51.320754716981135, "grad_norm": 1.2595773412735998, "learning_rate": 7.968932857201907e-05, "loss": 2.5711, "step": 1360 }, { "epoch": 52.075471698113205, "grad_norm": 2.0429570729259714, "learning_rate": 7.967978363471901e-05, "loss": 2.5662, "step": 1380 }, { "epoch": 52.83018867924528, "grad_norm": 1.6700659590709221, "learning_rate": 7.967009490872805e-05, "loss": 2.5618, "step": 1400 }, { "epoch": 53.58490566037736, "grad_norm": 1.33856858087749, "learning_rate": 7.966026243151624e-05, "loss": 2.5351, "step": 1420 }, { "epoch": 54.339622641509436, "grad_norm": 1.514257636366029, "learning_rate": 7.965028624110956e-05, "loss": 2.5686, "step": 1440 }, { "epoch": 55.094339622641506, "grad_norm": 1.5655072381428023, "learning_rate": 7.964016637608987e-05, "loss": 2.5329, "step": 1460 }, { "epoch": 55.84905660377358, "grad_norm": 1.3215439672221574, "learning_rate": 7.96299028755946e-05, "loss": 2.5701, "step": 1480 }, { "epoch": 56.60377358490566, "grad_norm": 1.4294366995579832, "learning_rate": 7.961949577931671e-05, "loss": 2.5143, "step": 1500 }, { "epoch": 57.35849056603774, "grad_norm": 1.1676370114885968, "learning_rate": 7.960894512750449e-05, "loss": 2.5653, "step": 1520 }, { "epoch": 58.113207547169814, "grad_norm": 1.2026735621707902, "learning_rate": 7.95982509609614e-05, "loss": 2.5161, "step": 1540 }, { "epoch": 58.867924528301884, "grad_norm": 1.2479764772455937, "learning_rate": 7.958741332104596e-05, "loss": 2.508, "step": 1560 }, { "epoch": 59.62264150943396, "grad_norm": 1.1961953679380617, "learning_rate": 7.957643224967155e-05, "loss": 2.5009, "step": 1580 }, { "epoch": 60.37735849056604, "grad_norm": 1.2497242021619674, "learning_rate": 7.956530778930622e-05, "loss": 2.5059, "step": 1600 }, { "epoch": 61.132075471698116, "grad_norm": 1.3171737588939698, "learning_rate": 7.955403998297261e-05, "loss": 2.4988, "step": 1620 }, { "epoch": 61.886792452830186, "grad_norm": 1.4834839050766762, "learning_rate": 7.95426288742477e-05, "loss": 2.4981, "step": 1640 }, { "epoch": 62.64150943396226, "grad_norm": 1.5715059944491987, "learning_rate": 7.953107450726267e-05, "loss": 2.5151, "step": 1660 }, { "epoch": 63.39622641509434, "grad_norm": 1.3272413313721245, "learning_rate": 7.95193769267028e-05, "loss": 2.4963, "step": 1680 }, { "epoch": 64.15094339622641, "grad_norm": 1.0349543461606097, "learning_rate": 7.950753617780715e-05, "loss": 2.4829, "step": 1700 }, { "epoch": 64.90566037735849, "grad_norm": 1.9240255477140202, "learning_rate": 7.949555230636851e-05, "loss": 2.4943, "step": 1720 }, { "epoch": 65.66037735849056, "grad_norm": 1.3252737004710828, "learning_rate": 7.948342535873318e-05, "loss": 2.4642, "step": 1740 }, { "epoch": 66.41509433962264, "grad_norm": 1.4539890356994254, "learning_rate": 7.947115538180077e-05, "loss": 2.4609, "step": 1760 }, { "epoch": 67.16981132075472, "grad_norm": 1.4560503030341407, "learning_rate": 7.945874242302408e-05, "loss": 2.5209, "step": 1780 }, { "epoch": 67.9245283018868, "grad_norm": 1.344513424004851, "learning_rate": 7.944618653040883e-05, "loss": 2.4993, "step": 1800 }, { "epoch": 68.67924528301887, "grad_norm": 1.8832922124286062, "learning_rate": 7.943348775251356e-05, "loss": 2.4646, "step": 1820 }, { "epoch": 69.43396226415095, "grad_norm": 1.2399208576659413, "learning_rate": 7.942064613844938e-05, "loss": 2.4849, "step": 1840 }, { "epoch": 70.18867924528301, "grad_norm": 1.3243651575141913, "learning_rate": 7.940766173787979e-05, "loss": 2.4599, "step": 1860 }, { "epoch": 70.94339622641509, "grad_norm": 1.556289335581103, "learning_rate": 7.939453460102055e-05, "loss": 2.4888, "step": 1880 }, { "epoch": 71.69811320754717, "grad_norm": 1.2063108835158236, "learning_rate": 7.93812647786394e-05, "loss": 2.4403, "step": 1900 }, { "epoch": 72.45283018867924, "grad_norm": 1.381601728211994, "learning_rate": 7.936785232205587e-05, "loss": 2.4616, "step": 1920 }, { "epoch": 73.20754716981132, "grad_norm": 1.1962254019464496, "learning_rate": 7.935429728314119e-05, "loss": 2.4594, "step": 1940 }, { "epoch": 73.9622641509434, "grad_norm": 1.5037943758052086, "learning_rate": 7.934059971431796e-05, "loss": 2.4767, "step": 1960 }, { "epoch": 74.71698113207547, "grad_norm": 1.3520028552568202, "learning_rate": 7.932675966856001e-05, "loss": 2.4627, "step": 1980 }, { "epoch": 75.47169811320755, "grad_norm": 1.1447719169505226, "learning_rate": 7.931277719939217e-05, "loss": 2.4434, "step": 2000 }, { "epoch": 76.22641509433963, "grad_norm": 1.264362441844072, "learning_rate": 7.92986523608901e-05, "loss": 2.4532, "step": 2020 }, { "epoch": 76.98113207547169, "grad_norm": 1.3287557477036405, "learning_rate": 7.928438520768005e-05, "loss": 2.4227, "step": 2040 }, { "epoch": 77.73584905660377, "grad_norm": 1.3638216226843092, "learning_rate": 7.926997579493864e-05, "loss": 2.4124, "step": 2060 }, { "epoch": 78.49056603773585, "grad_norm": 1.8271009676844974, "learning_rate": 7.925542417839267e-05, "loss": 2.4564, "step": 2080 }, { "epoch": 79.24528301886792, "grad_norm": 3.54259081233157, "learning_rate": 7.924073041431895e-05, "loss": 2.4369, "step": 2100 }, { "epoch": 80.0, "grad_norm": 1.4017324354742142, "learning_rate": 7.922589455954394e-05, "loss": 2.4464, "step": 2120 }, { "epoch": 80.75471698113208, "grad_norm": 1.2911982587049995, "learning_rate": 7.921091667144366e-05, "loss": 2.4513, "step": 2140 }, { "epoch": 81.50943396226415, "grad_norm": 1.6022339076405718, "learning_rate": 7.919579680794347e-05, "loss": 2.4203, "step": 2160 }, { "epoch": 82.26415094339623, "grad_norm": 1.192345448314673, "learning_rate": 7.918053502751772e-05, "loss": 2.4254, "step": 2180 }, { "epoch": 83.01886792452831, "grad_norm": 1.6383611170040047, "learning_rate": 7.916513138918968e-05, "loss": 2.4271, "step": 2200 }, { "epoch": 83.77358490566037, "grad_norm": 1.2342470221196802, "learning_rate": 7.91495859525312e-05, "loss": 2.4079, "step": 2220 }, { "epoch": 84.52830188679245, "grad_norm": 1.0846993450602334, "learning_rate": 7.913389877766257e-05, "loss": 2.4383, "step": 2240 }, { "epoch": 85.28301886792453, "grad_norm": 1.6823249556637492, "learning_rate": 7.911806992525215e-05, "loss": 2.4146, "step": 2260 }, { "epoch": 86.0377358490566, "grad_norm": 1.1641636008270617, "learning_rate": 7.91020994565163e-05, "loss": 2.4208, "step": 2280 }, { "epoch": 86.79245283018868, "grad_norm": 1.2267450186018727, "learning_rate": 7.9085987433219e-05, "loss": 2.4123, "step": 2300 }, { "epoch": 87.54716981132076, "grad_norm": 1.3570826999644423, "learning_rate": 7.906973391767178e-05, "loss": 2.3968, "step": 2320 }, { "epoch": 88.30188679245283, "grad_norm": 1.4751948166402733, "learning_rate": 7.905333897273327e-05, "loss": 2.4266, "step": 2340 }, { "epoch": 89.05660377358491, "grad_norm": 1.6442713319159463, "learning_rate": 7.903680266180908e-05, "loss": 2.4226, "step": 2360 }, { "epoch": 89.81132075471699, "grad_norm": 1.3132724406404779, "learning_rate": 7.90201250488516e-05, "loss": 2.419, "step": 2380 }, { "epoch": 90.56603773584905, "grad_norm": 1.4073019579145547, "learning_rate": 7.900330619835963e-05, "loss": 2.3689, "step": 2400 }, { "epoch": 91.32075471698113, "grad_norm": 1.2366514839120522, "learning_rate": 7.89863461753782e-05, "loss": 2.4054, "step": 2420 }, { "epoch": 92.0754716981132, "grad_norm": 1.2825349652701765, "learning_rate": 7.896924504549836e-05, "loss": 2.4019, "step": 2440 }, { "epoch": 92.83018867924528, "grad_norm": 1.836162542809911, "learning_rate": 7.895200287485676e-05, "loss": 2.4177, "step": 2460 }, { "epoch": 93.58490566037736, "grad_norm": 1.1862449779023223, "learning_rate": 7.893461973013567e-05, "loss": 2.417, "step": 2480 }, { "epoch": 94.33962264150944, "grad_norm": 1.4267902121087415, "learning_rate": 7.891709567856242e-05, "loss": 2.3877, "step": 2500 }, { "epoch": 95.09433962264151, "grad_norm": 1.2628527153576017, "learning_rate": 7.889943078790934e-05, "loss": 2.3893, "step": 2520 }, { "epoch": 95.84905660377359, "grad_norm": 1.2789710243072507, "learning_rate": 7.888162512649344e-05, "loss": 2.3747, "step": 2540 }, { "epoch": 96.60377358490567, "grad_norm": 1.2286761119774143, "learning_rate": 7.886367876317615e-05, "loss": 2.3835, "step": 2560 }, { "epoch": 97.35849056603773, "grad_norm": 1.1142509789518844, "learning_rate": 7.884559176736305e-05, "loss": 2.3751, "step": 2580 }, { "epoch": 98.11320754716981, "grad_norm": 1.4479112681435136, "learning_rate": 7.882736420900357e-05, "loss": 2.3885, "step": 2600 }, { "epoch": 98.86792452830188, "grad_norm": 1.363147415477506, "learning_rate": 7.880899615859078e-05, "loss": 2.3738, "step": 2620 }, { "epoch": 99.62264150943396, "grad_norm": 1.1387365076919822, "learning_rate": 7.879048768716105e-05, "loss": 2.3476, "step": 2640 }, { "epoch": 100.37735849056604, "grad_norm": 1.1944352338174065, "learning_rate": 7.87718388662939e-05, "loss": 2.3729, "step": 2660 }, { "epoch": 101.13207547169812, "grad_norm": 1.1017143695500988, "learning_rate": 7.875304976811153e-05, "loss": 2.3846, "step": 2680 }, { "epoch": 101.88679245283019, "grad_norm": 1.250014546065029, "learning_rate": 7.873412046527873e-05, "loss": 2.3928, "step": 2700 }, { "epoch": 102.64150943396227, "grad_norm": 1.4448571670529484, "learning_rate": 7.871505103100243e-05, "loss": 2.3464, "step": 2720 }, { "epoch": 103.39622641509433, "grad_norm": 1.1242909760207218, "learning_rate": 7.869584153903159e-05, "loss": 2.3739, "step": 2740 }, { "epoch": 104.15094339622641, "grad_norm": 2.2842982833142176, "learning_rate": 7.86764920636568e-05, "loss": 2.348, "step": 2760 }, { "epoch": 104.90566037735849, "grad_norm": 1.377894286349549, "learning_rate": 7.865700267970997e-05, "loss": 2.3888, "step": 2780 }, { "epoch": 105.66037735849056, "grad_norm": 1.889252338819464, "learning_rate": 7.863737346256416e-05, "loss": 2.339, "step": 2800 }, { "epoch": 106.41509433962264, "grad_norm": 1.2007024366101338, "learning_rate": 7.861760448813318e-05, "loss": 2.3518, "step": 2820 }, { "epoch": 107.16981132075472, "grad_norm": 1.3150471864332571, "learning_rate": 7.859769583287136e-05, "loss": 2.3755, "step": 2840 }, { "epoch": 107.9245283018868, "grad_norm": 1.3488307619297817, "learning_rate": 7.857764757377321e-05, "loss": 2.3613, "step": 2860 }, { "epoch": 108.67924528301887, "grad_norm": 1.1271224750447038, "learning_rate": 7.855745978837316e-05, "loss": 2.3434, "step": 2880 }, { "epoch": 109.43396226415095, "grad_norm": 1.2792788627087681, "learning_rate": 7.85371325547452e-05, "loss": 2.3475, "step": 2900 }, { "epoch": 110.18867924528301, "grad_norm": 1.1278269502097389, "learning_rate": 7.851666595150267e-05, "loss": 2.3561, "step": 2920 }, { "epoch": 110.94339622641509, "grad_norm": 1.2221588824212564, "learning_rate": 7.849606005779789e-05, "loss": 2.345, "step": 2940 }, { "epoch": 111.69811320754717, "grad_norm": 1.2272636691471697, "learning_rate": 7.84753149533219e-05, "loss": 2.3491, "step": 2960 }, { "epoch": 112.45283018867924, "grad_norm": 1.4379769660358386, "learning_rate": 7.845443071830403e-05, "loss": 2.3703, "step": 2980 }, { "epoch": 113.20754716981132, "grad_norm": 1.1938598523408401, "learning_rate": 7.843340743351179e-05, "loss": 2.3514, "step": 3000 }, { "epoch": 113.9622641509434, "grad_norm": 1.1633264713108291, "learning_rate": 7.841224518025038e-05, "loss": 2.3396, "step": 3020 }, { "epoch": 114.71698113207547, "grad_norm": 1.1889386134705129, "learning_rate": 7.839094404036246e-05, "loss": 2.3654, "step": 3040 }, { "epoch": 115.47169811320755, "grad_norm": 1.2210304404269434, "learning_rate": 7.836950409622788e-05, "loss": 2.3827, "step": 3060 }, { "epoch": 116.22641509433963, "grad_norm": 1.2063342612399106, "learning_rate": 7.834792543076318e-05, "loss": 2.3316, "step": 3080 }, { "epoch": 116.98113207547169, "grad_norm": 1.1263568091149723, "learning_rate": 7.832620812742149e-05, "loss": 2.3483, "step": 3100 }, { "epoch": 117.73584905660377, "grad_norm": 1.1259514670897872, "learning_rate": 7.830435227019208e-05, "loss": 2.3125, "step": 3120 }, { "epoch": 118.49056603773585, "grad_norm": 1.4031978763279247, "learning_rate": 7.828235794360003e-05, "loss": 2.3509, "step": 3140 }, { "epoch": 119.24528301886792, "grad_norm": 1.1004874238643756, "learning_rate": 7.826022523270598e-05, "loss": 2.2975, "step": 3160 }, { "epoch": 120.0, "grad_norm": 1.1440875702771847, "learning_rate": 7.823795422310573e-05, "loss": 2.3048, "step": 3180 }, { "epoch": 120.75471698113208, "grad_norm": 1.254578833443374, "learning_rate": 7.821554500092995e-05, "loss": 2.3253, "step": 3200 }, { "epoch": 121.50943396226415, "grad_norm": 1.3020705320626609, "learning_rate": 7.819299765284377e-05, "loss": 2.32, "step": 3220 }, { "epoch": 122.26415094339623, "grad_norm": 1.144219025307704, "learning_rate": 7.817031226604663e-05, "loss": 2.3338, "step": 3240 }, { "epoch": 123.01886792452831, "grad_norm": 1.4392091454771268, "learning_rate": 7.814748892827171e-05, "loss": 2.3081, "step": 3260 }, { "epoch": 123.77358490566037, "grad_norm": 1.57104334995189, "learning_rate": 7.812452772778576e-05, "loss": 2.3044, "step": 3280 }, { "epoch": 124.52830188679245, "grad_norm": 1.2140782445458616, "learning_rate": 7.810142875338864e-05, "loss": 2.3162, "step": 3300 }, { "epoch": 125.28301886792453, "grad_norm": 1.1430293000699974, "learning_rate": 7.807819209441311e-05, "loss": 2.3349, "step": 3320 }, { "epoch": 126.0377358490566, "grad_norm": 1.3717461598648188, "learning_rate": 7.805481784072435e-05, "loss": 2.3048, "step": 3340 }, { "epoch": 126.79245283018868, "grad_norm": 1.413324243222593, "learning_rate": 7.803130608271972e-05, "loss": 2.2987, "step": 3360 }, { "epoch": 127.54716981132076, "grad_norm": 1.4726228615781376, "learning_rate": 7.80076569113283e-05, "loss": 2.3164, "step": 3380 }, { "epoch": 128.30188679245282, "grad_norm": 1.4760960932985028, "learning_rate": 7.798387041801066e-05, "loss": 2.3314, "step": 3400 }, { "epoch": 129.0566037735849, "grad_norm": 1.429175780411594, "learning_rate": 7.795994669475842e-05, "loss": 2.2752, "step": 3420 }, { "epoch": 129.81132075471697, "grad_norm": 1.1413240245586067, "learning_rate": 7.793588583409394e-05, "loss": 2.333, "step": 3440 }, { "epoch": 130.56603773584905, "grad_norm": 1.298767089765165, "learning_rate": 7.791168792906992e-05, "loss": 2.3227, "step": 3460 }, { "epoch": 131.32075471698113, "grad_norm": 1.2359046339523858, "learning_rate": 7.788735307326908e-05, "loss": 2.3108, "step": 3480 }, { "epoch": 132.0754716981132, "grad_norm": 1.1866095738297588, "learning_rate": 7.786288136080376e-05, "loss": 2.274, "step": 3500 }, { "epoch": 132.83018867924528, "grad_norm": 1.1104279322428132, "learning_rate": 7.78382728863156e-05, "loss": 2.2888, "step": 3520 }, { "epoch": 133.58490566037736, "grad_norm": 2.2680957450657537, "learning_rate": 7.781352774497518e-05, "loss": 2.2938, "step": 3540 }, { "epoch": 134.33962264150944, "grad_norm": 1.6397138146409036, "learning_rate": 7.778864603248155e-05, "loss": 2.3068, "step": 3560 }, { "epoch": 135.0943396226415, "grad_norm": 1.3552905083817133, "learning_rate": 7.7763627845062e-05, "loss": 2.3155, "step": 3580 }, { "epoch": 135.8490566037736, "grad_norm": 1.1820209128101842, "learning_rate": 7.773847327947157e-05, "loss": 2.2937, "step": 3600 }, { "epoch": 136.60377358490567, "grad_norm": 1.2309654111909685, "learning_rate": 7.771318243299278e-05, "loss": 2.2887, "step": 3620 }, { "epoch": 137.35849056603774, "grad_norm": 1.3795089340342572, "learning_rate": 7.768775540343515e-05, "loss": 2.2961, "step": 3640 }, { "epoch": 138.11320754716982, "grad_norm": 1.3662606951792517, "learning_rate": 7.766219228913492e-05, "loss": 2.288, "step": 3660 }, { "epoch": 138.8679245283019, "grad_norm": 1.1081358756463113, "learning_rate": 7.763649318895459e-05, "loss": 2.3193, "step": 3680 }, { "epoch": 139.62264150943398, "grad_norm": 1.3054965758516237, "learning_rate": 7.761065820228258e-05, "loss": 2.2904, "step": 3700 }, { "epoch": 140.37735849056602, "grad_norm": 1.4052953203319152, "learning_rate": 7.758468742903284e-05, "loss": 2.2803, "step": 3720 }, { "epoch": 141.1320754716981, "grad_norm": 1.310015833541638, "learning_rate": 7.755858096964445e-05, "loss": 2.2891, "step": 3740 }, { "epoch": 141.88679245283018, "grad_norm": 1.0645192580358254, "learning_rate": 7.753233892508125e-05, "loss": 2.2982, "step": 3760 }, { "epoch": 142.64150943396226, "grad_norm": 1.0898474528650213, "learning_rate": 7.750596139683145e-05, "loss": 2.2711, "step": 3780 }, { "epoch": 143.39622641509433, "grad_norm": 1.2074165473918712, "learning_rate": 7.747944848690719e-05, "loss": 2.2592, "step": 3800 }, { "epoch": 144.1509433962264, "grad_norm": 1.0959283058664937, "learning_rate": 7.745280029784423e-05, "loss": 2.2813, "step": 3820 }, { "epoch": 144.9056603773585, "grad_norm": 1.2139556681199035, "learning_rate": 7.742601693270148e-05, "loss": 2.2564, "step": 3840 }, { "epoch": 145.66037735849056, "grad_norm": 1.2374163658098694, "learning_rate": 7.739909849506064e-05, "loss": 2.2972, "step": 3860 }, { "epoch": 146.41509433962264, "grad_norm": 1.212582172960113, "learning_rate": 7.737204508902578e-05, "loss": 2.2683, "step": 3880 }, { "epoch": 147.16981132075472, "grad_norm": 1.064638273683967, "learning_rate": 7.734485681922295e-05, "loss": 2.2643, "step": 3900 }, { "epoch": 147.9245283018868, "grad_norm": 1.5185500205423423, "learning_rate": 7.731753379079976e-05, "loss": 2.2825, "step": 3920 }, { "epoch": 148.67924528301887, "grad_norm": 1.2571175031602655, "learning_rate": 7.7290076109425e-05, "loss": 2.2838, "step": 3940 }, { "epoch": 149.43396226415095, "grad_norm": 1.1185096216789012, "learning_rate": 7.726248388128821e-05, "loss": 2.2713, "step": 3960 }, { "epoch": 150.18867924528303, "grad_norm": 1.283741452573828, "learning_rate": 7.723475721309926e-05, "loss": 2.2578, "step": 3980 }, { "epoch": 150.9433962264151, "grad_norm": 1.1735101055664479, "learning_rate": 7.720689621208799e-05, "loss": 2.2584, "step": 4000 }, { "epoch": 151.69811320754718, "grad_norm": 1.1931741706657397, "learning_rate": 7.717890098600371e-05, "loss": 2.2439, "step": 4020 }, { "epoch": 152.45283018867926, "grad_norm": 1.0510206287412838, "learning_rate": 7.715077164311486e-05, "loss": 2.2646, "step": 4040 }, { "epoch": 153.20754716981133, "grad_norm": 1.2236996476850626, "learning_rate": 7.712250829220856e-05, "loss": 2.2518, "step": 4060 }, { "epoch": 153.96226415094338, "grad_norm": 1.4295022161938338, "learning_rate": 7.70941110425902e-05, "loss": 2.2445, "step": 4080 }, { "epoch": 154.71698113207546, "grad_norm": 1.2608108045607223, "learning_rate": 7.706558000408294e-05, "loss": 2.2504, "step": 4100 }, { "epoch": 155.47169811320754, "grad_norm": 1.4378816608236173, "learning_rate": 7.703691528702747e-05, "loss": 2.2433, "step": 4120 }, { "epoch": 156.22641509433961, "grad_norm": 1.3122607821127985, "learning_rate": 7.700811700228138e-05, "loss": 2.2593, "step": 4140 }, { "epoch": 156.9811320754717, "grad_norm": 1.1677763203213758, "learning_rate": 7.697918526121882e-05, "loss": 2.2521, "step": 4160 }, { "epoch": 157.73584905660377, "grad_norm": 1.1304212534843256, "learning_rate": 7.695012017573013e-05, "loss": 2.2743, "step": 4180 }, { "epoch": 158.49056603773585, "grad_norm": 1.2157344056650818, "learning_rate": 7.692092185822129e-05, "loss": 2.2405, "step": 4200 }, { "epoch": 159.24528301886792, "grad_norm": 1.2521062422528308, "learning_rate": 7.689159042161356e-05, "loss": 2.258, "step": 4220 }, { "epoch": 160.0, "grad_norm": 1.417021221810849, "learning_rate": 7.686212597934299e-05, "loss": 2.2187, "step": 4240 }, { "epoch": 160.75471698113208, "grad_norm": 1.0987738687082824, "learning_rate": 7.68325286453601e-05, "loss": 2.2155, "step": 4260 }, { "epoch": 161.50943396226415, "grad_norm": 1.4771801969035276, "learning_rate": 7.680279853412924e-05, "loss": 2.27, "step": 4280 }, { "epoch": 162.26415094339623, "grad_norm": 1.1956274528883593, "learning_rate": 7.677293576062836e-05, "loss": 2.2717, "step": 4300 }, { "epoch": 163.0188679245283, "grad_norm": 1.1219859338242828, "learning_rate": 7.674294044034839e-05, "loss": 2.2487, "step": 4320 }, { "epoch": 163.77358490566039, "grad_norm": 1.255744824066408, "learning_rate": 7.671281268929293e-05, "loss": 2.2366, "step": 4340 }, { "epoch": 164.52830188679246, "grad_norm": 1.112451658029252, "learning_rate": 7.668255262397772e-05, "loss": 2.2377, "step": 4360 }, { "epoch": 165.28301886792454, "grad_norm": 1.1131032086265853, "learning_rate": 7.66521603614302e-05, "loss": 2.2483, "step": 4380 }, { "epoch": 166.03773584905662, "grad_norm": 1.2568117014241036, "learning_rate": 7.662163601918907e-05, "loss": 2.2637, "step": 4400 }, { "epoch": 166.79245283018867, "grad_norm": 1.0763275712599132, "learning_rate": 7.659097971530385e-05, "loss": 2.2275, "step": 4420 }, { "epoch": 167.54716981132074, "grad_norm": 1.0880356132513982, "learning_rate": 7.656019156833438e-05, "loss": 2.227, "step": 4440 }, { "epoch": 168.30188679245282, "grad_norm": 1.0805504953865772, "learning_rate": 7.652927169735042e-05, "loss": 2.2205, "step": 4460 }, { "epoch": 169.0566037735849, "grad_norm": 1.0979536600508317, "learning_rate": 7.649822022193114e-05, "loss": 2.2008, "step": 4480 }, { "epoch": 169.81132075471697, "grad_norm": 1.0424485855679975, "learning_rate": 7.646703726216467e-05, "loss": 2.235, "step": 4500 }, { "epoch": 170.56603773584905, "grad_norm": 1.1541609361962377, "learning_rate": 7.643572293864766e-05, "loss": 2.2297, "step": 4520 }, { "epoch": 171.32075471698113, "grad_norm": 1.1630212513509717, "learning_rate": 7.640427737248479e-05, "loss": 2.2295, "step": 4540 }, { "epoch": 172.0754716981132, "grad_norm": 1.5088805287099432, "learning_rate": 7.637270068528828e-05, "loss": 2.2445, "step": 4560 }, { "epoch": 172.83018867924528, "grad_norm": 1.66773080303759, "learning_rate": 7.634099299917748e-05, "loss": 2.2336, "step": 4580 }, { "epoch": 173.58490566037736, "grad_norm": 1.4239223646642891, "learning_rate": 7.630915443677834e-05, "loss": 2.2128, "step": 4600 }, { "epoch": 174.33962264150944, "grad_norm": 1.2623270496447048, "learning_rate": 7.627718512122297e-05, "loss": 2.2253, "step": 4620 }, { "epoch": 175.0943396226415, "grad_norm": 1.2406324767245749, "learning_rate": 7.624508517614919e-05, "loss": 2.2131, "step": 4640 }, { "epoch": 175.8490566037736, "grad_norm": 1.3130455463591448, "learning_rate": 7.621285472569993e-05, "loss": 2.1944, "step": 4660 }, { "epoch": 176.60377358490567, "grad_norm": 1.1413419622441512, "learning_rate": 7.61804938945229e-05, "loss": 2.2243, "step": 4680 }, { "epoch": 177.35849056603774, "grad_norm": 1.2146654711035267, "learning_rate": 7.614800280777005e-05, "loss": 2.2172, "step": 4700 }, { "epoch": 178.11320754716982, "grad_norm": 1.3634898063511693, "learning_rate": 7.611538159109703e-05, "loss": 2.205, "step": 4720 }, { "epoch": 178.8679245283019, "grad_norm": 1.2311721419826, "learning_rate": 7.608263037066277e-05, "loss": 2.2252, "step": 4740 }, { "epoch": 179.62264150943398, "grad_norm": 1.0908914570592438, "learning_rate": 7.6049749273129e-05, "loss": 2.2138, "step": 4760 }, { "epoch": 180.37735849056602, "grad_norm": 1.1038829505990149, "learning_rate": 7.601673842565972e-05, "loss": 2.1939, "step": 4780 }, { "epoch": 181.1320754716981, "grad_norm": 1.1236372724431538, "learning_rate": 7.598359795592073e-05, "loss": 2.2382, "step": 4800 }, { "epoch": 181.88679245283018, "grad_norm": 1.3232451908070362, "learning_rate": 7.59503279920791e-05, "loss": 2.201, "step": 4820 }, { "epoch": 182.64150943396226, "grad_norm": 1.3292125597941664, "learning_rate": 7.591692866280274e-05, "loss": 2.2058, "step": 4840 }, { "epoch": 183.39622641509433, "grad_norm": 1.1970310296785942, "learning_rate": 7.588340009725985e-05, "loss": 2.206, "step": 4860 }, { "epoch": 184.1509433962264, "grad_norm": 1.055682897860096, "learning_rate": 7.584974242511845e-05, "loss": 2.2148, "step": 4880 }, { "epoch": 184.9056603773585, "grad_norm": 1.1655929048666676, "learning_rate": 7.581595577654584e-05, "loss": 2.2146, "step": 4900 }, { "epoch": 185.66037735849056, "grad_norm": 1.2197862783964168, "learning_rate": 7.578204028220814e-05, "loss": 2.2023, "step": 4920 }, { "epoch": 186.41509433962264, "grad_norm": 1.1536947546834515, "learning_rate": 7.574799607326977e-05, "loss": 2.2074, "step": 4940 }, { "epoch": 187.16981132075472, "grad_norm": 1.1570044860516948, "learning_rate": 7.571382328139293e-05, "loss": 2.2057, "step": 4960 }, { "epoch": 187.9245283018868, "grad_norm": 2.2251854969672165, "learning_rate": 7.56795220387371e-05, "loss": 2.1975, "step": 4980 }, { "epoch": 188.67924528301887, "grad_norm": 1.16489093753128, "learning_rate": 7.564509247795854e-05, "loss": 2.1947, "step": 5000 }, { "epoch": 189.43396226415095, "grad_norm": 1.1610456984999162, "learning_rate": 7.561053473220977e-05, "loss": 2.1861, "step": 5020 }, { "epoch": 190.18867924528303, "grad_norm": 1.173342232590181, "learning_rate": 7.557584893513902e-05, "loss": 2.1997, "step": 5040 }, { "epoch": 190.9433962264151, "grad_norm": 1.1873135989990635, "learning_rate": 7.554103522088976e-05, "loss": 2.1841, "step": 5060 }, { "epoch": 191.69811320754718, "grad_norm": 1.4263143797188473, "learning_rate": 7.550609372410018e-05, "loss": 2.1823, "step": 5080 }, { "epoch": 192.45283018867926, "grad_norm": 1.0849530063111787, "learning_rate": 7.547102457990266e-05, "loss": 2.1842, "step": 5100 }, { "epoch": 193.20754716981133, "grad_norm": 1.1497288768060088, "learning_rate": 7.54358279239232e-05, "loss": 2.2258, "step": 5120 }, { "epoch": 193.96226415094338, "grad_norm": 1.2795496420829302, "learning_rate": 7.540050389228099e-05, "loss": 2.192, "step": 5140 }, { "epoch": 194.71698113207546, "grad_norm": 1.0700549445449614, "learning_rate": 7.536505262158779e-05, "loss": 2.1913, "step": 5160 }, { "epoch": 195.47169811320754, "grad_norm": 1.3697359389801924, "learning_rate": 7.532947424894744e-05, "loss": 2.2044, "step": 5180 }, { "epoch": 196.22641509433961, "grad_norm": 1.0721264053082575, "learning_rate": 7.52937689119554e-05, "loss": 2.1916, "step": 5200 }, { "epoch": 196.9811320754717, "grad_norm": 1.2325173290768243, "learning_rate": 7.525793674869805e-05, "loss": 2.1738, "step": 5220 }, { "epoch": 197.73584905660377, "grad_norm": 1.078471360885739, "learning_rate": 7.522197789775235e-05, "loss": 2.2043, "step": 5240 }, { "epoch": 198.49056603773585, "grad_norm": 1.508079711738152, "learning_rate": 7.518589249818516e-05, "loss": 2.2159, "step": 5260 }, { "epoch": 199.24528301886792, "grad_norm": 1.0511550659614401, "learning_rate": 7.514968068955273e-05, "loss": 2.168, "step": 5280 }, { "epoch": 200.0, "grad_norm": 1.0585993296644824, "learning_rate": 7.511334261190026e-05, "loss": 2.1847, "step": 5300 }, { "epoch": 200.75471698113208, "grad_norm": 1.19584254290663, "learning_rate": 7.507687840576123e-05, "loss": 2.1953, "step": 5320 }, { "epoch": 201.50943396226415, "grad_norm": 1.360707266271236, "learning_rate": 7.504028821215686e-05, "loss": 2.1866, "step": 5340 }, { "epoch": 202.26415094339623, "grad_norm": 1.0326833677791634, "learning_rate": 7.500357217259573e-05, "loss": 2.1889, "step": 5360 }, { "epoch": 203.0188679245283, "grad_norm": 1.1966698046584427, "learning_rate": 7.496673042907302e-05, "loss": 2.204, "step": 5380 }, { "epoch": 203.77358490566039, "grad_norm": 1.1792590946885393, "learning_rate": 7.492976312407011e-05, "loss": 2.1679, "step": 5400 }, { "epoch": 204.52830188679246, "grad_norm": 1.0821551276306904, "learning_rate": 7.489267040055393e-05, "loss": 2.172, "step": 5420 }, { "epoch": 205.28301886792454, "grad_norm": 1.1206896992927644, "learning_rate": 7.48554524019765e-05, "loss": 2.1558, "step": 5440 }, { "epoch": 206.03773584905662, "grad_norm": 1.164481519584628, "learning_rate": 7.481810927227427e-05, "loss": 2.1707, "step": 5460 }, { "epoch": 206.79245283018867, "grad_norm": 1.045173494578065, "learning_rate": 7.47806411558677e-05, "loss": 2.1454, "step": 5480 }, { "epoch": 207.54716981132074, "grad_norm": 1.3037299893846073, "learning_rate": 7.474304819766053e-05, "loss": 2.1735, "step": 5500 }, { "epoch": 208.30188679245282, "grad_norm": 1.1799164756908072, "learning_rate": 7.470533054303937e-05, "loss": 2.1678, "step": 5520 }, { "epoch": 209.0566037735849, "grad_norm": 1.23204534029245, "learning_rate": 7.46674883378731e-05, "loss": 2.18, "step": 5540 }, { "epoch": 209.81132075471697, "grad_norm": 1.1705040244332197, "learning_rate": 7.462952172851219e-05, "loss": 2.1638, "step": 5560 }, { "epoch": 210.56603773584905, "grad_norm": 1.115647376955501, "learning_rate": 7.459143086178838e-05, "loss": 2.1517, "step": 5580 }, { "epoch": 211.32075471698113, "grad_norm": 1.095644914375309, "learning_rate": 7.455321588501378e-05, "loss": 2.1624, "step": 5600 }, { "epoch": 212.0754716981132, "grad_norm": 1.2461377018123299, "learning_rate": 7.451487694598063e-05, "loss": 2.1795, "step": 5620 }, { "epoch": 212.83018867924528, "grad_norm": 1.0808714278402736, "learning_rate": 7.447641419296051e-05, "loss": 2.1857, "step": 5640 }, { "epoch": 213.58490566037736, "grad_norm": 1.175783749152713, "learning_rate": 7.443782777470388e-05, "loss": 2.1489, "step": 5660 }, { "epoch": 214.33962264150944, "grad_norm": 1.0323602107911023, "learning_rate": 7.43991178404394e-05, "loss": 2.1814, "step": 5680 }, { "epoch": 215.0943396226415, "grad_norm": 1.4371901693782694, "learning_rate": 7.436028453987343e-05, "loss": 2.1607, "step": 5700 }, { "epoch": 215.8490566037736, "grad_norm": 1.2749189929859621, "learning_rate": 7.432132802318953e-05, "loss": 2.1344, "step": 5720 }, { "epoch": 216.60377358490567, "grad_norm": 1.7991005001893379, "learning_rate": 7.428224844104763e-05, "loss": 2.1705, "step": 5740 }, { "epoch": 217.35849056603774, "grad_norm": 1.0763947355182082, "learning_rate": 7.424304594458374e-05, "loss": 2.1681, "step": 5760 }, { "epoch": 218.11320754716982, "grad_norm": 1.147647175883896, "learning_rate": 7.420372068540913e-05, "loss": 2.1792, "step": 5780 }, { "epoch": 218.8679245283019, "grad_norm": 1.232169418468151, "learning_rate": 7.41642728156099e-05, "loss": 2.1143, "step": 5800 }, { "epoch": 219.62264150943398, "grad_norm": 1.3992234444810514, "learning_rate": 7.41247024877463e-05, "loss": 2.1612, "step": 5820 }, { "epoch": 220.37735849056602, "grad_norm": 1.2478978185410232, "learning_rate": 7.40850098548522e-05, "loss": 2.1749, "step": 5840 }, { "epoch": 221.1320754716981, "grad_norm": 1.0796153228438745, "learning_rate": 7.404519507043443e-05, "loss": 2.1345, "step": 5860 }, { "epoch": 221.88679245283018, "grad_norm": 1.164330548160425, "learning_rate": 7.40052582884723e-05, "loss": 2.1573, "step": 5880 }, { "epoch": 222.64150943396226, "grad_norm": 1.2041551436276394, "learning_rate": 7.396519966341684e-05, "loss": 2.162, "step": 5900 }, { "epoch": 223.39622641509433, "grad_norm": 1.2780053810145304, "learning_rate": 7.392501935019036e-05, "loss": 2.1524, "step": 5920 }, { "epoch": 224.1509433962264, "grad_norm": 1.069497717017709, "learning_rate": 7.388471750418576e-05, "loss": 2.1427, "step": 5940 }, { "epoch": 224.9056603773585, "grad_norm": 1.1790523262171884, "learning_rate": 7.384429428126599e-05, "loss": 2.1693, "step": 5960 }, { "epoch": 225.66037735849056, "grad_norm": 1.0727940077044007, "learning_rate": 7.380374983776333e-05, "loss": 2.1146, "step": 5980 }, { "epoch": 226.41509433962264, "grad_norm": 1.0481198314836597, "learning_rate": 7.376308433047898e-05, "loss": 2.1563, "step": 6000 }, { "epoch": 227.16981132075472, "grad_norm": 1.3874056107583248, "learning_rate": 7.372229791668223e-05, "loss": 2.1456, "step": 6020 }, { "epoch": 227.9245283018868, "grad_norm": 1.3153838535909976, "learning_rate": 7.368139075411003e-05, "loss": 2.1575, "step": 6040 }, { "epoch": 228.67924528301887, "grad_norm": 1.1788160013410025, "learning_rate": 7.364036300096631e-05, "loss": 2.1437, "step": 6060 }, { "epoch": 229.43396226415095, "grad_norm": 1.1169312984810649, "learning_rate": 7.359921481592136e-05, "loss": 2.1568, "step": 6080 }, { "epoch": 230.18867924528303, "grad_norm": 1.1435068374715258, "learning_rate": 7.355794635811118e-05, "loss": 2.1503, "step": 6100 }, { "epoch": 230.9433962264151, "grad_norm": 1.7552469517638039, "learning_rate": 7.3516557787137e-05, "loss": 2.128, "step": 6120 }, { "epoch": 231.69811320754718, "grad_norm": 1.0779669989000775, "learning_rate": 7.347504926306452e-05, "loss": 2.1485, "step": 6140 }, { "epoch": 232.45283018867926, "grad_norm": 1.186788194688993, "learning_rate": 7.343342094642333e-05, "loss": 2.1576, "step": 6160 }, { "epoch": 233.20754716981133, "grad_norm": 1.0594086679490557, "learning_rate": 7.339167299820636e-05, "loss": 2.1492, "step": 6180 }, { "epoch": 233.96226415094338, "grad_norm": 1.1917321731840318, "learning_rate": 7.334980557986916e-05, "loss": 2.1482, "step": 6200 }, { "epoch": 234.71698113207546, "grad_norm": 1.083198692826801, "learning_rate": 7.330781885332932e-05, "loss": 2.1461, "step": 6220 }, { "epoch": 235.47169811320754, "grad_norm": 1.3139182121317998, "learning_rate": 7.326571298096586e-05, "loss": 2.156, "step": 6240 }, { "epoch": 236.22641509433961, "grad_norm": 1.0854207170845476, "learning_rate": 7.322348812561857e-05, "loss": 2.1258, "step": 6260 }, { "epoch": 236.9811320754717, "grad_norm": 1.5015227061373095, "learning_rate": 7.318114445058739e-05, "loss": 2.1439, "step": 6280 }, { "epoch": 237.73584905660377, "grad_norm": 1.2347794021289429, "learning_rate": 7.313868211963179e-05, "loss": 2.1317, "step": 6300 }, { "epoch": 238.49056603773585, "grad_norm": 1.442835177639965, "learning_rate": 7.309610129697015e-05, "loss": 2.113, "step": 6320 }, { "epoch": 239.24528301886792, "grad_norm": 1.236255276661992, "learning_rate": 7.305340214727905e-05, "loss": 2.1378, "step": 6340 }, { "epoch": 240.0, "grad_norm": 1.205183440308278, "learning_rate": 7.301058483569271e-05, "loss": 2.1336, "step": 6360 }, { "epoch": 240.75471698113208, "grad_norm": 1.3246584618487252, "learning_rate": 7.296764952780239e-05, "loss": 2.1221, "step": 6380 }, { "epoch": 241.50943396226415, "grad_norm": 1.0168670687272512, "learning_rate": 7.292459638965558e-05, "loss": 2.1188, "step": 6400 }, { "epoch": 242.26415094339623, "grad_norm": 1.3467491151924502, "learning_rate": 7.288142558775552e-05, "loss": 2.101, "step": 6420 }, { "epoch": 243.0188679245283, "grad_norm": 1.261074686560294, "learning_rate": 7.283813728906054e-05, "loss": 2.1411, "step": 6440 }, { "epoch": 243.77358490566039, "grad_norm": 1.2485690805022434, "learning_rate": 7.27947316609833e-05, "loss": 2.1277, "step": 6460 }, { "epoch": 244.52830188679246, "grad_norm": 1.3241322758759912, "learning_rate": 7.275120887139026e-05, "loss": 2.1363, "step": 6480 }, { "epoch": 245.28301886792454, "grad_norm": 1.5599386219671891, "learning_rate": 7.270756908860098e-05, "loss": 2.1089, "step": 6500 }, { "epoch": 246.03773584905662, "grad_norm": 1.2114819177389966, "learning_rate": 7.266381248138751e-05, "loss": 2.1089, "step": 6520 }, { "epoch": 246.79245283018867, "grad_norm": 1.1306582742382014, "learning_rate": 7.261993921897364e-05, "loss": 2.1079, "step": 6540 }, { "epoch": 247.54716981132074, "grad_norm": 1.2673326383282852, "learning_rate": 7.257594947103438e-05, "loss": 2.1266, "step": 6560 }, { "epoch": 248.30188679245282, "grad_norm": 1.4019469970816203, "learning_rate": 7.253184340769518e-05, "loss": 2.1481, "step": 6580 }, { "epoch": 249.0566037735849, "grad_norm": 1.0449709444069573, "learning_rate": 7.248762119953135e-05, "loss": 2.1158, "step": 6600 }, { "epoch": 249.81132075471697, "grad_norm": 1.1593445705123036, "learning_rate": 7.244328301756737e-05, "loss": 2.13, "step": 6620 }, { "epoch": 250.56603773584905, "grad_norm": 1.2635129121192081, "learning_rate": 7.23988290332763e-05, "loss": 2.1167, "step": 6640 }, { "epoch": 251.32075471698113, "grad_norm": 1.0527620092255492, "learning_rate": 7.235425941857891e-05, "loss": 2.114, "step": 6660 }, { "epoch": 252.0754716981132, "grad_norm": 1.3035661165388843, "learning_rate": 7.230957434584331e-05, "loss": 2.0928, "step": 6680 }, { "epoch": 252.83018867924528, "grad_norm": 1.0136550616355096, "learning_rate": 7.226477398788402e-05, "loss": 2.0987, "step": 6700 }, { "epoch": 253.58490566037736, "grad_norm": 1.258957796854538, "learning_rate": 7.22198585179615e-05, "loss": 2.1032, "step": 6720 }, { "epoch": 254.33962264150944, "grad_norm": 1.2937771749668925, "learning_rate": 7.21748281097813e-05, "loss": 2.1003, "step": 6740 }, { "epoch": 255.0943396226415, "grad_norm": 1.0533802729958242, "learning_rate": 7.212968293749357e-05, "loss": 2.1201, "step": 6760 }, { "epoch": 255.8490566037736, "grad_norm": 1.0065996122655994, "learning_rate": 7.208442317569225e-05, "loss": 2.1119, "step": 6780 }, { "epoch": 256.60377358490564, "grad_norm": 1.1726423865130644, "learning_rate": 7.203904899941444e-05, "loss": 2.0967, "step": 6800 }, { "epoch": 257.35849056603774, "grad_norm": 1.4137580376820904, "learning_rate": 7.199356058413975e-05, "loss": 2.1297, "step": 6820 }, { "epoch": 258.1132075471698, "grad_norm": 1.2534212871623691, "learning_rate": 7.194795810578956e-05, "loss": 2.1142, "step": 6840 }, { "epoch": 258.8679245283019, "grad_norm": 1.1760816154209972, "learning_rate": 7.190224174072643e-05, "loss": 2.1524, "step": 6860 }, { "epoch": 259.62264150943395, "grad_norm": 1.1576937144547554, "learning_rate": 7.185641166575331e-05, "loss": 2.0873, "step": 6880 }, { "epoch": 260.37735849056605, "grad_norm": 1.18595129264392, "learning_rate": 7.181046805811294e-05, "loss": 2.1118, "step": 6900 }, { "epoch": 261.1320754716981, "grad_norm": 1.1205604370668647, "learning_rate": 7.176441109548715e-05, "loss": 2.0986, "step": 6920 }, { "epoch": 261.8867924528302, "grad_norm": 1.1884901674285933, "learning_rate": 7.171824095599609e-05, "loss": 2.1109, "step": 6940 }, { "epoch": 262.64150943396226, "grad_norm": 1.123509221114028, "learning_rate": 7.167195781819768e-05, "loss": 2.1047, "step": 6960 }, { "epoch": 263.39622641509436, "grad_norm": 1.1260075560640628, "learning_rate": 7.162556186108684e-05, "loss": 2.0972, "step": 6980 }, { "epoch": 264.1509433962264, "grad_norm": 1.191570785218505, "learning_rate": 7.157905326409477e-05, "loss": 2.0938, "step": 7000 }, { "epoch": 264.9056603773585, "grad_norm": 1.1009948219165815, "learning_rate": 7.153243220708831e-05, "loss": 2.1084, "step": 7020 }, { "epoch": 265.66037735849056, "grad_norm": 1.0984593800759155, "learning_rate": 7.148569887036923e-05, "loss": 2.0989, "step": 7040 }, { "epoch": 266.41509433962267, "grad_norm": 1.2379578619669414, "learning_rate": 7.143885343467355e-05, "loss": 2.1166, "step": 7060 }, { "epoch": 267.1698113207547, "grad_norm": 1.1064036960932773, "learning_rate": 7.139189608117077e-05, "loss": 2.1104, "step": 7080 }, { "epoch": 267.92452830188677, "grad_norm": 1.0772108392111555, "learning_rate": 7.134482699146328e-05, "loss": 2.0897, "step": 7100 }, { "epoch": 268.6792452830189, "grad_norm": 1.1292302199915438, "learning_rate": 7.129764634758554e-05, "loss": 2.1157, "step": 7120 }, { "epoch": 269.4339622641509, "grad_norm": 1.1278390668879588, "learning_rate": 7.125035433200346e-05, "loss": 2.0932, "step": 7140 }, { "epoch": 270.188679245283, "grad_norm": 1.0414531139729244, "learning_rate": 7.120295112761368e-05, "loss": 2.1151, "step": 7160 }, { "epoch": 270.9433962264151, "grad_norm": 1.1545750967690267, "learning_rate": 7.115543691774282e-05, "loss": 2.1131, "step": 7180 }, { "epoch": 271.6981132075472, "grad_norm": 1.204421852849513, "learning_rate": 7.110781188614684e-05, "loss": 2.0802, "step": 7200 }, { "epoch": 272.45283018867923, "grad_norm": 1.7831981359411682, "learning_rate": 7.106007621701024e-05, "loss": 2.0798, "step": 7220 }, { "epoch": 273.20754716981133, "grad_norm": 1.3197751132016162, "learning_rate": 7.101223009494545e-05, "loss": 2.0992, "step": 7240 }, { "epoch": 273.9622641509434, "grad_norm": 1.2400242729400996, "learning_rate": 7.096427370499204e-05, "loss": 2.0864, "step": 7260 }, { "epoch": 274.7169811320755, "grad_norm": 1.0912978575620245, "learning_rate": 7.091620723261605e-05, "loss": 2.0923, "step": 7280 }, { "epoch": 275.47169811320754, "grad_norm": 1.231133204650358, "learning_rate": 7.086803086370918e-05, "loss": 2.0795, "step": 7300 }, { "epoch": 276.22641509433964, "grad_norm": 1.2282758399906704, "learning_rate": 7.081974478458825e-05, "loss": 2.0761, "step": 7320 }, { "epoch": 276.9811320754717, "grad_norm": 1.3460196947110317, "learning_rate": 7.077134918199428e-05, "loss": 2.0752, "step": 7340 }, { "epoch": 277.7358490566038, "grad_norm": 1.080960097565614, "learning_rate": 7.072284424309193e-05, "loss": 2.0889, "step": 7360 }, { "epoch": 278.49056603773585, "grad_norm": 1.3111047591517453, "learning_rate": 7.067423015546863e-05, "loss": 2.0839, "step": 7380 }, { "epoch": 279.24528301886795, "grad_norm": 1.5401314919739673, "learning_rate": 7.0625507107134e-05, "loss": 2.0927, "step": 7400 }, { "epoch": 280.0, "grad_norm": 1.1041636245431063, "learning_rate": 7.057667528651904e-05, "loss": 2.0803, "step": 7420 }, { "epoch": 280.75471698113205, "grad_norm": 1.4056508604045173, "learning_rate": 7.052773488247539e-05, "loss": 2.0668, "step": 7440 }, { "epoch": 281.50943396226415, "grad_norm": 1.046923519873644, "learning_rate": 7.047868608427462e-05, "loss": 2.082, "step": 7460 }, { "epoch": 282.2641509433962, "grad_norm": 1.3029328700653047, "learning_rate": 7.042952908160754e-05, "loss": 2.0556, "step": 7480 }, { "epoch": 283.0188679245283, "grad_norm": 1.227982067650406, "learning_rate": 7.03802640645834e-05, "loss": 2.0478, "step": 7500 }, { "epoch": 283.77358490566036, "grad_norm": 1.1015625311453152, "learning_rate": 7.033089122372919e-05, "loss": 2.0773, "step": 7520 }, { "epoch": 284.52830188679246, "grad_norm": 1.316103623119528, "learning_rate": 7.028141074998891e-05, "loss": 2.0756, "step": 7540 }, { "epoch": 285.2830188679245, "grad_norm": 1.255257016262856, "learning_rate": 7.023182283472277e-05, "loss": 2.0866, "step": 7560 }, { "epoch": 286.0377358490566, "grad_norm": 1.030736862070767, "learning_rate": 7.018212766970658e-05, "loss": 2.0723, "step": 7580 }, { "epoch": 286.79245283018867, "grad_norm": 1.293105711429154, "learning_rate": 7.013232544713086e-05, "loss": 2.0759, "step": 7600 }, { "epoch": 287.54716981132077, "grad_norm": 1.538845550854816, "learning_rate": 7.008241635960018e-05, "loss": 2.0238, "step": 7620 }, { "epoch": 288.3018867924528, "grad_norm": 1.1376778839977162, "learning_rate": 7.003240060013241e-05, "loss": 2.0895, "step": 7640 }, { "epoch": 289.0566037735849, "grad_norm": 1.1524198390846205, "learning_rate": 6.998227836215794e-05, "loss": 2.0712, "step": 7660 }, { "epoch": 289.811320754717, "grad_norm": 1.1269455096899952, "learning_rate": 6.9932049839519e-05, "loss": 2.0785, "step": 7680 }, { "epoch": 290.5660377358491, "grad_norm": 1.1947929898571277, "learning_rate": 6.98817152264688e-05, "loss": 2.0789, "step": 7700 }, { "epoch": 291.3207547169811, "grad_norm": 1.05216691350341, "learning_rate": 6.983127471767088e-05, "loss": 2.0721, "step": 7720 }, { "epoch": 292.07547169811323, "grad_norm": 1.0659553585546824, "learning_rate": 6.978072850819832e-05, "loss": 2.0897, "step": 7740 }, { "epoch": 292.8301886792453, "grad_norm": 1.4999207869643305, "learning_rate": 6.9730076793533e-05, "loss": 2.0875, "step": 7760 }, { "epoch": 293.58490566037733, "grad_norm": 1.051015475094539, "learning_rate": 6.967931976956479e-05, "loss": 2.0572, "step": 7780 }, { "epoch": 294.33962264150944, "grad_norm": 1.4062884543226315, "learning_rate": 6.962845763259084e-05, "loss": 2.0783, "step": 7800 }, { "epoch": 295.0943396226415, "grad_norm": 1.3344933104485628, "learning_rate": 6.957749057931486e-05, "loss": 2.0491, "step": 7820 }, { "epoch": 295.8490566037736, "grad_norm": 1.2853456909301206, "learning_rate": 6.952641880684623e-05, "loss": 2.0589, "step": 7840 }, { "epoch": 296.60377358490564, "grad_norm": 0.9567533052896401, "learning_rate": 6.947524251269942e-05, "loss": 2.0638, "step": 7860 }, { "epoch": 297.35849056603774, "grad_norm": 1.0295871659614384, "learning_rate": 6.942396189479305e-05, "loss": 2.0452, "step": 7880 }, { "epoch": 298.1132075471698, "grad_norm": 1.0385361931014787, "learning_rate": 6.937257715144922e-05, "loss": 2.0693, "step": 7900 }, { "epoch": 298.8679245283019, "grad_norm": 1.0809377854877955, "learning_rate": 6.932108848139274e-05, "loss": 2.0657, "step": 7920 }, { "epoch": 299.62264150943395, "grad_norm": 1.2262962690135735, "learning_rate": 6.926949608375031e-05, "loss": 2.0333, "step": 7940 }, { "epoch": 300.37735849056605, "grad_norm": 1.2494078282001366, "learning_rate": 6.921780015804983e-05, "loss": 2.0611, "step": 7960 }, { "epoch": 301.1320754716981, "grad_norm": 1.2191785732688871, "learning_rate": 6.916600090421955e-05, "loss": 2.0414, "step": 7980 }, { "epoch": 301.8867924528302, "grad_norm": 1.3922552836876412, "learning_rate": 6.911409852258734e-05, "loss": 2.0344, "step": 8000 }, { "epoch": 302.64150943396226, "grad_norm": 1.001774801425353, "learning_rate": 6.906209321387992e-05, "loss": 2.0745, "step": 8020 }, { "epoch": 303.39622641509436, "grad_norm": 1.1104337640037032, "learning_rate": 6.900998517922203e-05, "loss": 2.0593, "step": 8040 }, { "epoch": 304.1509433962264, "grad_norm": 1.825335695980726, "learning_rate": 6.895777462013575e-05, "loss": 2.0459, "step": 8060 }, { "epoch": 304.9056603773585, "grad_norm": 1.157162721000543, "learning_rate": 6.89054617385396e-05, "loss": 2.0565, "step": 8080 }, { "epoch": 305.66037735849056, "grad_norm": 1.0813317731646406, "learning_rate": 6.885304673674785e-05, "loss": 2.0647, "step": 8100 }, { "epoch": 306.41509433962267, "grad_norm": 1.1711257270339308, "learning_rate": 6.880052981746973e-05, "loss": 2.0779, "step": 8120 }, { "epoch": 307.1698113207547, "grad_norm": 1.063366248076951, "learning_rate": 6.874791118380859e-05, "loss": 2.0299, "step": 8140 }, { "epoch": 307.92452830188677, "grad_norm": 1.4839153895893722, "learning_rate": 6.869519103926117e-05, "loss": 2.0689, "step": 8160 }, { "epoch": 308.6792452830189, "grad_norm": 1.0599730190677705, "learning_rate": 6.864236958771677e-05, "loss": 2.0559, "step": 8180 }, { "epoch": 309.4339622641509, "grad_norm": 1.1000743640073944, "learning_rate": 6.85894470334565e-05, "loss": 2.0814, "step": 8200 }, { "epoch": 310.188679245283, "grad_norm": 1.1134888630426287, "learning_rate": 6.853642358115248e-05, "loss": 2.0619, "step": 8220 }, { "epoch": 310.9433962264151, "grad_norm": 1.4109893718513755, "learning_rate": 6.848329943586703e-05, "loss": 2.0478, "step": 8240 }, { "epoch": 311.6981132075472, "grad_norm": 1.4005508549478216, "learning_rate": 6.843007480305188e-05, "loss": 2.0451, "step": 8260 }, { "epoch": 312.45283018867923, "grad_norm": 1.1506459796822934, "learning_rate": 6.83767498885474e-05, "loss": 2.0496, "step": 8280 }, { "epoch": 313.20754716981133, "grad_norm": 1.1846013546521996, "learning_rate": 6.832332489858181e-05, "loss": 2.0503, "step": 8300 }, { "epoch": 313.9622641509434, "grad_norm": 1.083146150872066, "learning_rate": 6.826980003977029e-05, "loss": 2.0411, "step": 8320 }, { "epoch": 314.7169811320755, "grad_norm": 1.1083923007981826, "learning_rate": 6.821617551911432e-05, "loss": 2.059, "step": 8340 }, { "epoch": 315.47169811320754, "grad_norm": 1.7089827022606041, "learning_rate": 6.816245154400081e-05, "loss": 2.0316, "step": 8360 }, { "epoch": 316.22641509433964, "grad_norm": 1.2036464145657677, "learning_rate": 6.810862832220125e-05, "loss": 2.0383, "step": 8380 }, { "epoch": 316.9811320754717, "grad_norm": 1.0678492328292477, "learning_rate": 6.8054706061871e-05, "loss": 2.0357, "step": 8400 }, { "epoch": 317.7358490566038, "grad_norm": 1.1510123829327024, "learning_rate": 6.800068497154838e-05, "loss": 2.0509, "step": 8420 }, { "epoch": 318.49056603773585, "grad_norm": 1.1744519756591179, "learning_rate": 6.794656526015402e-05, "loss": 2.0362, "step": 8440 }, { "epoch": 319.24528301886795, "grad_norm": 1.0951767070535987, "learning_rate": 6.78923471369899e-05, "loss": 2.0261, "step": 8460 }, { "epoch": 320.0, "grad_norm": 1.40278574496307, "learning_rate": 6.783803081173856e-05, "loss": 2.0041, "step": 8480 }, { "epoch": 320.75471698113205, "grad_norm": 1.2731462205629138, "learning_rate": 6.778361649446238e-05, "loss": 2.0455, "step": 8500 }, { "epoch": 321.50943396226415, "grad_norm": 1.1686588861352702, "learning_rate": 6.772910439560273e-05, "loss": 2.0328, "step": 8520 }, { "epoch": 322.2641509433962, "grad_norm": 1.0989551544372271, "learning_rate": 6.767449472597907e-05, "loss": 2.0495, "step": 8540 }, { "epoch": 323.0188679245283, "grad_norm": 1.3624805761549945, "learning_rate": 6.761978769678828e-05, "loss": 2.0447, "step": 8560 }, { "epoch": 323.77358490566036, "grad_norm": 1.1444357160826135, "learning_rate": 6.75649835196037e-05, "loss": 2.0663, "step": 8580 }, { "epoch": 324.52830188679246, "grad_norm": 1.1946574491976927, "learning_rate": 6.75100824063744e-05, "loss": 2.0483, "step": 8600 }, { "epoch": 325.2830188679245, "grad_norm": 1.0504702569050626, "learning_rate": 6.745508456942438e-05, "loss": 1.9978, "step": 8620 }, { "epoch": 326.0377358490566, "grad_norm": 0.9767612100068984, "learning_rate": 6.739999022145167e-05, "loss": 2.0382, "step": 8640 }, { "epoch": 326.79245283018867, "grad_norm": 1.0291078738332238, "learning_rate": 6.734479957552753e-05, "loss": 2.0298, "step": 8660 }, { "epoch": 327.54716981132077, "grad_norm": 1.244368475618607, "learning_rate": 6.72895128450957e-05, "loss": 2.005, "step": 8680 }, { "epoch": 328.3018867924528, "grad_norm": 1.1290954094741668, "learning_rate": 6.723413024397144e-05, "loss": 2.0569, "step": 8700 }, { "epoch": 329.0566037735849, "grad_norm": 1.0915375487825718, "learning_rate": 6.717865198634082e-05, "loss": 2.0447, "step": 8720 }, { "epoch": 329.811320754717, "grad_norm": 1.1768398401350053, "learning_rate": 6.71230782867599e-05, "loss": 2.0217, "step": 8740 }, { "epoch": 330.5660377358491, "grad_norm": 1.104835402612007, "learning_rate": 6.706740936015375e-05, "loss": 2.0386, "step": 8760 }, { "epoch": 331.3207547169811, "grad_norm": 1.1248015036534322, "learning_rate": 6.70116454218158e-05, "loss": 2.0103, "step": 8780 }, { "epoch": 332.07547169811323, "grad_norm": 1.2169922349555569, "learning_rate": 6.69557866874069e-05, "loss": 2.0241, "step": 8800 }, { "epoch": 332.8301886792453, "grad_norm": 1.214613807170357, "learning_rate": 6.689983337295448e-05, "loss": 2.0188, "step": 8820 }, { "epoch": 333.58490566037733, "grad_norm": 1.790201434963867, "learning_rate": 6.684378569485181e-05, "loss": 1.9779, "step": 8840 }, { "epoch": 334.33962264150944, "grad_norm": 1.1294537346241684, "learning_rate": 6.678764386985706e-05, "loss": 2.0288, "step": 8860 }, { "epoch": 335.0943396226415, "grad_norm": 1.023220014865738, "learning_rate": 6.673140811509254e-05, "loss": 2.0388, "step": 8880 }, { "epoch": 335.8490566037736, "grad_norm": 1.184810167823652, "learning_rate": 6.667507864804373e-05, "loss": 2.0527, "step": 8900 }, { "epoch": 336.60377358490564, "grad_norm": 1.4108637345041166, "learning_rate": 6.661865568655867e-05, "loss": 2.0521, "step": 8920 }, { "epoch": 337.35849056603774, "grad_norm": 1.1099661578909887, "learning_rate": 6.656213944884687e-05, "loss": 2.0142, "step": 8940 }, { "epoch": 338.1132075471698, "grad_norm": 1.2550646623744501, "learning_rate": 6.650553015347861e-05, "loss": 2.0234, "step": 8960 }, { "epoch": 338.8679245283019, "grad_norm": 1.3226209109800835, "learning_rate": 6.64488280193841e-05, "loss": 2.0026, "step": 8980 }, { "epoch": 339.62264150943395, "grad_norm": 1.2466974723773605, "learning_rate": 6.639203326585253e-05, "loss": 2.0505, "step": 9000 }, { "epoch": 340.37735849056605, "grad_norm": 1.2259925868881607, "learning_rate": 6.633514611253129e-05, "loss": 1.989, "step": 9020 }, { "epoch": 341.1320754716981, "grad_norm": 1.2616555953485367, "learning_rate": 6.627816677942518e-05, "loss": 2.0172, "step": 9040 }, { "epoch": 341.8867924528302, "grad_norm": 1.0660372107925478, "learning_rate": 6.622109548689542e-05, "loss": 2.0235, "step": 9060 }, { "epoch": 342.64150943396226, "grad_norm": 1.2995047263783295, "learning_rate": 6.616393245565893e-05, "loss": 2.0116, "step": 9080 }, { "epoch": 343.39622641509436, "grad_norm": 1.2947984731633606, "learning_rate": 6.610667790678738e-05, "loss": 2.0241, "step": 9100 }, { "epoch": 344.1509433962264, "grad_norm": 1.008247863003288, "learning_rate": 6.60493320617064e-05, "loss": 1.984, "step": 9120 }, { "epoch": 344.9056603773585, "grad_norm": 1.5922178618355085, "learning_rate": 6.599189514219469e-05, "loss": 1.989, "step": 9140 }, { "epoch": 345.66037735849056, "grad_norm": 1.0785537649724395, "learning_rate": 6.593436737038316e-05, "loss": 2.0135, "step": 9160 }, { "epoch": 346.41509433962267, "grad_norm": 1.1766322003509095, "learning_rate": 6.58767489687541e-05, "loss": 2.0021, "step": 9180 }, { "epoch": 347.1698113207547, "grad_norm": 1.3777596137615202, "learning_rate": 6.581904016014026e-05, "loss": 1.9988, "step": 9200 }, { "epoch": 347.92452830188677, "grad_norm": 1.5517987219865874, "learning_rate": 6.57612411677241e-05, "loss": 2.0309, "step": 9220 }, { "epoch": 348.6792452830189, "grad_norm": 1.0560275221648643, "learning_rate": 6.570335221503679e-05, "loss": 1.9923, "step": 9240 }, { "epoch": 349.4339622641509, "grad_norm": 1.3504991405267055, "learning_rate": 6.564537352595744e-05, "loss": 1.9739, "step": 9260 }, { "epoch": 350.188679245283, "grad_norm": 1.153039781830911, "learning_rate": 6.558730532471219e-05, "loss": 1.9803, "step": 9280 }, { "epoch": 350.9433962264151, "grad_norm": 0.9434571532030971, "learning_rate": 6.55291478358734e-05, "loss": 1.9677, "step": 9300 }, { "epoch": 351.6981132075472, "grad_norm": 1.8784899020425583, "learning_rate": 6.547090128435869e-05, "loss": 1.9988, "step": 9320 }, { "epoch": 352.45283018867923, "grad_norm": 1.449139419473746, "learning_rate": 6.541256589543013e-05, "loss": 1.9974, "step": 9340 }, { "epoch": 353.20754716981133, "grad_norm": 1.3936924715065266, "learning_rate": 6.53541418946934e-05, "loss": 2.016, "step": 9360 }, { "epoch": 353.9622641509434, "grad_norm": 1.6114753123601104, "learning_rate": 6.529562950809679e-05, "loss": 2.0021, "step": 9380 }, { "epoch": 354.7169811320755, "grad_norm": 1.109612991369577, "learning_rate": 6.523702896193052e-05, "loss": 1.9928, "step": 9400 }, { "epoch": 355.47169811320754, "grad_norm": 1.2128714849575388, "learning_rate": 6.517834048282572e-05, "loss": 1.9908, "step": 9420 }, { "epoch": 356.22641509433964, "grad_norm": 1.6910853942561526, "learning_rate": 6.511956429775353e-05, "loss": 1.998, "step": 9440 }, { "epoch": 356.9811320754717, "grad_norm": 1.1430018991497974, "learning_rate": 6.506070063402434e-05, "loss": 1.9726, "step": 9460 }, { "epoch": 357.7358490566038, "grad_norm": 1.1413458296675405, "learning_rate": 6.500174971928684e-05, "loss": 1.9972, "step": 9480 }, { "epoch": 358.49056603773585, "grad_norm": 1.1439825840759497, "learning_rate": 6.494271178152717e-05, "loss": 1.9965, "step": 9500 }, { "epoch": 359.24528301886795, "grad_norm": 1.1294761259383999, "learning_rate": 6.488358704906799e-05, "loss": 1.9651, "step": 9520 }, { "epoch": 360.0, "grad_norm": 1.0691230669285636, "learning_rate": 6.482437575056767e-05, "loss": 1.9559, "step": 9540 }, { "epoch": 360.75471698113205, "grad_norm": 1.1830173398938235, "learning_rate": 6.476507811501933e-05, "loss": 2.0035, "step": 9560 }, { "epoch": 361.50943396226415, "grad_norm": 1.0463609194131098, "learning_rate": 6.470569437175001e-05, "loss": 2.0062, "step": 9580 }, { "epoch": 362.2641509433962, "grad_norm": 1.0991234719266971, "learning_rate": 6.464622475041972e-05, "loss": 1.9775, "step": 9600 }, { "epoch": 363.0188679245283, "grad_norm": 1.198408129328553, "learning_rate": 6.458666948102068e-05, "loss": 1.9684, "step": 9620 }, { "epoch": 363.77358490566036, "grad_norm": 1.2518491777745682, "learning_rate": 6.452702879387625e-05, "loss": 2.0052, "step": 9640 }, { "epoch": 364.52830188679246, "grad_norm": 1.1372899117638453, "learning_rate": 6.44673029196402e-05, "loss": 2.005, "step": 9660 }, { "epoch": 365.2830188679245, "grad_norm": 1.1412736472546972, "learning_rate": 6.44074920892957e-05, "loss": 1.9545, "step": 9680 }, { "epoch": 366.0377358490566, "grad_norm": 1.295474241313066, "learning_rate": 6.434759653415454e-05, "loss": 1.9943, "step": 9700 }, { "epoch": 366.79245283018867, "grad_norm": 1.6454682745260736, "learning_rate": 6.42876164858561e-05, "loss": 1.9831, "step": 9720 }, { "epoch": 367.54716981132077, "grad_norm": 1.3152256044573012, "learning_rate": 6.42275521763666e-05, "loss": 1.9898, "step": 9740 }, { "epoch": 368.3018867924528, "grad_norm": 1.1398547695205932, "learning_rate": 6.416740383797806e-05, "loss": 2.0018, "step": 9760 }, { "epoch": 369.0566037735849, "grad_norm": 1.560025618616493, "learning_rate": 6.410717170330754e-05, "loss": 1.9774, "step": 9780 }, { "epoch": 369.811320754717, "grad_norm": 1.1084036628854508, "learning_rate": 6.404685600529614e-05, "loss": 1.9898, "step": 9800 }, { "epoch": 370.5660377358491, "grad_norm": 1.064043317797065, "learning_rate": 6.398645697720813e-05, "loss": 1.9683, "step": 9820 }, { "epoch": 371.3207547169811, "grad_norm": 1.0561133254372814, "learning_rate": 6.392597485263005e-05, "loss": 1.9892, "step": 9840 }, { "epoch": 372.07547169811323, "grad_norm": 1.8498750890966262, "learning_rate": 6.386540986546981e-05, "loss": 2.0028, "step": 9860 }, { "epoch": 372.8301886792453, "grad_norm": 1.1839290834136853, "learning_rate": 6.38047622499558e-05, "loss": 2.0067, "step": 9880 }, { "epoch": 373.58490566037733, "grad_norm": 1.0860526304824587, "learning_rate": 6.374403224063593e-05, "loss": 1.9771, "step": 9900 }, { "epoch": 374.33962264150944, "grad_norm": 1.1746289875773082, "learning_rate": 6.368322007237679e-05, "loss": 1.9693, "step": 9920 }, { "epoch": 375.0943396226415, "grad_norm": 1.3082714258157306, "learning_rate": 6.36223259803627e-05, "loss": 1.9737, "step": 9940 }, { "epoch": 375.8490566037736, "grad_norm": 1.1647865395498773, "learning_rate": 6.356135020009478e-05, "loss": 1.9619, "step": 9960 }, { "epoch": 376.60377358490564, "grad_norm": 0.947059541183795, "learning_rate": 6.350029296739012e-05, "loss": 1.975, "step": 9980 }, { "epoch": 377.35849056603774, "grad_norm": 1.2248902808010191, "learning_rate": 6.343915451838081e-05, "loss": 1.9628, "step": 10000 }, { "epoch": 378.1132075471698, "grad_norm": 1.097611241891744, "learning_rate": 6.337793508951301e-05, "loss": 1.9775, "step": 10020 }, { "epoch": 378.8679245283019, "grad_norm": 1.2529669087878597, "learning_rate": 6.331663491754607e-05, "loss": 1.9468, "step": 10040 }, { "epoch": 379.62264150943395, "grad_norm": 1.1767271144174725, "learning_rate": 6.325525423955162e-05, "loss": 1.9413, "step": 10060 }, { "epoch": 380.37735849056605, "grad_norm": 1.282222785156654, "learning_rate": 6.319379329291262e-05, "loss": 1.9655, "step": 10080 }, { "epoch": 381.1320754716981, "grad_norm": 0.9819686841799513, "learning_rate": 6.313225231532246e-05, "loss": 1.9537, "step": 10100 }, { "epoch": 381.8867924528302, "grad_norm": 1.206003307363446, "learning_rate": 6.307063154478407e-05, "loss": 1.9387, "step": 10120 }, { "epoch": 382.64150943396226, "grad_norm": 1.236739142400694, "learning_rate": 6.300893121960891e-05, "loss": 1.9478, "step": 10140 }, { "epoch": 383.39622641509436, "grad_norm": 1.0017771325975895, "learning_rate": 6.294715157841618e-05, "loss": 1.9714, "step": 10160 }, { "epoch": 384.1509433962264, "grad_norm": 1.0637999951499557, "learning_rate": 6.28852928601318e-05, "loss": 1.9905, "step": 10180 }, { "epoch": 384.9056603773585, "grad_norm": 1.0944082795368726, "learning_rate": 6.282335530398746e-05, "loss": 1.9586, "step": 10200 }, { "epoch": 385.66037735849056, "grad_norm": 1.0420947581782276, "learning_rate": 6.276133914951982e-05, "loss": 2.0008, "step": 10220 }, { "epoch": 386.41509433962267, "grad_norm": 1.2531335945397626, "learning_rate": 6.26992446365695e-05, "loss": 1.9718, "step": 10240 }, { "epoch": 387.1698113207547, "grad_norm": 1.0272789455614961, "learning_rate": 6.26370720052801e-05, "loss": 1.9741, "step": 10260 }, { "epoch": 387.92452830188677, "grad_norm": 1.1543574176007045, "learning_rate": 6.25748214960974e-05, "loss": 1.9508, "step": 10280 }, { "epoch": 388.6792452830189, "grad_norm": 1.123008926585049, "learning_rate": 6.251249334976835e-05, "loss": 1.9238, "step": 10300 }, { "epoch": 389.4339622641509, "grad_norm": 1.1351605673087415, "learning_rate": 6.245008780734015e-05, "loss": 1.9379, "step": 10320 }, { "epoch": 390.188679245283, "grad_norm": 1.139914151072252, "learning_rate": 6.238760511015928e-05, "loss": 1.9863, "step": 10340 }, { "epoch": 390.9433962264151, "grad_norm": 1.3069434137417522, "learning_rate": 6.232504549987069e-05, "loss": 1.9569, "step": 10360 }, { "epoch": 391.6981132075472, "grad_norm": 1.7598014364780348, "learning_rate": 6.22624092184167e-05, "loss": 1.9389, "step": 10380 }, { "epoch": 392.45283018867923, "grad_norm": 1.0862334208555093, "learning_rate": 6.21996965080362e-05, "loss": 1.9744, "step": 10400 }, { "epoch": 393.20754716981133, "grad_norm": 1.1400427784758083, "learning_rate": 6.213690761126365e-05, "loss": 1.9563, "step": 10420 }, { "epoch": 393.9622641509434, "grad_norm": 1.171092319320692, "learning_rate": 6.207404277092816e-05, "loss": 1.9268, "step": 10440 }, { "epoch": 394.7169811320755, "grad_norm": 1.2187674621534166, "learning_rate": 6.201110223015247e-05, "loss": 1.9141, "step": 10460 }, { "epoch": 395.47169811320754, "grad_norm": 1.1182747577783947, "learning_rate": 6.19480862323522e-05, "loss": 1.9498, "step": 10480 }, { "epoch": 396.22641509433964, "grad_norm": 1.2189637302318261, "learning_rate": 6.188499502123471e-05, "loss": 1.9563, "step": 10500 }, { "epoch": 396.9811320754717, "grad_norm": 1.0928304287739772, "learning_rate": 6.18218288407983e-05, "loss": 1.976, "step": 10520 }, { "epoch": 397.7358490566038, "grad_norm": 1.0366879822409767, "learning_rate": 6.17585879353311e-05, "loss": 1.9804, "step": 10540 }, { "epoch": 398.49056603773585, "grad_norm": 1.153371078643115, "learning_rate": 6.169527254941035e-05, "loss": 1.987, "step": 10560 }, { "epoch": 399.24528301886795, "grad_norm": 1.2467206603942558, "learning_rate": 6.163188292790129e-05, "loss": 1.958, "step": 10580 }, { "epoch": 400.0, "grad_norm": 1.222097823579558, "learning_rate": 6.156841931595623e-05, "loss": 1.9838, "step": 10600 }, { "epoch": 400.75471698113205, "grad_norm": 1.1722193895632427, "learning_rate": 6.150488195901367e-05, "loss": 1.9496, "step": 10620 }, { "epoch": 401.50943396226415, "grad_norm": 1.7976530024431303, "learning_rate": 6.144127110279726e-05, "loss": 1.9869, "step": 10640 }, { "epoch": 402.2641509433962, "grad_norm": 1.1341428853515279, "learning_rate": 6.137758699331498e-05, "loss": 1.9235, "step": 10660 }, { "epoch": 403.0188679245283, "grad_norm": 1.107959509965475, "learning_rate": 6.131382987685803e-05, "loss": 1.927, "step": 10680 }, { "epoch": 403.77358490566036, "grad_norm": 1.8525780404729881, "learning_rate": 6.125000000000001e-05, "loss": 1.9487, "step": 10700 }, { "epoch": 404.52830188679246, "grad_norm": 1.1448557605131082, "learning_rate": 6.118609760959587e-05, "loss": 1.9547, "step": 10720 }, { "epoch": 405.2830188679245, "grad_norm": 1.1150883211805585, "learning_rate": 6.112212295278103e-05, "loss": 1.9487, "step": 10740 }, { "epoch": 406.0377358490566, "grad_norm": 1.149956522288425, "learning_rate": 6.105807627697039e-05, "loss": 1.9614, "step": 10760 }, { "epoch": 406.79245283018867, "grad_norm": 1.1217970508214505, "learning_rate": 6.099395782985736e-05, "loss": 1.9555, "step": 10780 }, { "epoch": 407.54716981132077, "grad_norm": 1.261444831314206, "learning_rate": 6.0929767859412914e-05, "loss": 1.9527, "step": 10800 }, { "epoch": 408.3018867924528, "grad_norm": 1.2610523391632782, "learning_rate": 6.086550661388466e-05, "loss": 1.9321, "step": 10820 }, { "epoch": 409.0566037735849, "grad_norm": 1.1090357115444625, "learning_rate": 6.080117434179586e-05, "loss": 1.9211, "step": 10840 }, { "epoch": 409.811320754717, "grad_norm": 0.9790706595618122, "learning_rate": 6.0736771291944384e-05, "loss": 1.9203, "step": 10860 }, { "epoch": 410.5660377358491, "grad_norm": 1.2166651155014474, "learning_rate": 6.067229771340195e-05, "loss": 1.9323, "step": 10880 }, { "epoch": 411.3207547169811, "grad_norm": 1.433284831152631, "learning_rate": 6.0607753855512944e-05, "loss": 1.9623, "step": 10900 }, { "epoch": 412.07547169811323, "grad_norm": 1.5219003618009672, "learning_rate": 6.054313996789358e-05, "loss": 1.9198, "step": 10920 }, { "epoch": 412.8301886792453, "grad_norm": 1.1498060228780786, "learning_rate": 6.047845630043091e-05, "loss": 1.9487, "step": 10940 }, { "epoch": 413.58490566037733, "grad_norm": 1.1202482322074203, "learning_rate": 6.041370310328184e-05, "loss": 1.9067, "step": 10960 }, { "epoch": 414.33962264150944, "grad_norm": 1.2430430079147141, "learning_rate": 6.0348880626872184e-05, "loss": 1.9382, "step": 10980 }, { "epoch": 415.0943396226415, "grad_norm": 1.0869359699622836, "learning_rate": 6.028398912189569e-05, "loss": 1.9611, "step": 11000 }, { "epoch": 415.8490566037736, "grad_norm": 1.2526066034095944, "learning_rate": 6.0219028839313045e-05, "loss": 1.9644, "step": 11020 }, { "epoch": 416.60377358490564, "grad_norm": 1.3228258582837578, "learning_rate": 6.015400003035096e-05, "loss": 1.9401, "step": 11040 }, { "epoch": 417.35849056603774, "grad_norm": 1.9762624742364299, "learning_rate": 6.008890294650111e-05, "loss": 1.9465, "step": 11060 }, { "epoch": 418.1132075471698, "grad_norm": 1.1424506198639062, "learning_rate": 6.0023737839519284e-05, "loss": 1.9439, "step": 11080 }, { "epoch": 418.8679245283019, "grad_norm": 1.2107658482065982, "learning_rate": 5.995850496142429e-05, "loss": 1.9342, "step": 11100 }, { "epoch": 419.62264150943395, "grad_norm": 1.1945042871517195, "learning_rate": 5.989320456449705e-05, "loss": 1.9463, "step": 11120 }, { "epoch": 420.37735849056605, "grad_norm": 1.4521278706471037, "learning_rate": 5.9827836901279616e-05, "loss": 1.9097, "step": 11140 }, { "epoch": 421.1320754716981, "grad_norm": 1.3941560968372226, "learning_rate": 5.97624022245742e-05, "loss": 1.9332, "step": 11160 }, { "epoch": 421.8867924528302, "grad_norm": 1.39870585850457, "learning_rate": 5.969690078744211e-05, "loss": 1.9458, "step": 11180 }, { "epoch": 422.64150943396226, "grad_norm": 1.0274165784628992, "learning_rate": 5.963133284320292e-05, "loss": 1.9365, "step": 11200 }, { "epoch": 423.39622641509436, "grad_norm": 1.1694905737535597, "learning_rate": 5.956569864543338e-05, "loss": 1.8966, "step": 11220 }, { "epoch": 424.1509433962264, "grad_norm": 2.28522960541773, "learning_rate": 5.9499998447966484e-05, "loss": 1.9332, "step": 11240 }, { "epoch": 424.9056603773585, "grad_norm": 1.1252246816652476, "learning_rate": 5.943423250489044e-05, "loss": 1.9308, "step": 11260 }, { "epoch": 425.66037735849056, "grad_norm": 1.132211561056973, "learning_rate": 5.9368401070547756e-05, "loss": 1.9221, "step": 11280 }, { "epoch": 426.41509433962267, "grad_norm": 1.0848974124812198, "learning_rate": 5.93025043995342e-05, "loss": 1.9374, "step": 11300 }, { "epoch": 427.1698113207547, "grad_norm": 1.0363649309093041, "learning_rate": 5.9236542746697845e-05, "loss": 1.9461, "step": 11320 }, { "epoch": 427.92452830188677, "grad_norm": 1.1928905874896651, "learning_rate": 5.9170516367138065e-05, "loss": 1.9378, "step": 11340 }, { "epoch": 428.6792452830189, "grad_norm": 1.1106230737067035, "learning_rate": 5.910442551620457e-05, "loss": 1.942, "step": 11360 }, { "epoch": 429.4339622641509, "grad_norm": 1.1576265708604865, "learning_rate": 5.903827044949638e-05, "loss": 1.9471, "step": 11380 }, { "epoch": 430.188679245283, "grad_norm": 1.1174079944741127, "learning_rate": 5.897205142286091e-05, "loss": 1.8922, "step": 11400 }, { "epoch": 430.9433962264151, "grad_norm": 1.3085632343404145, "learning_rate": 5.890576869239289e-05, "loss": 1.9458, "step": 11420 }, { "epoch": 431.6981132075472, "grad_norm": 1.2863747903344196, "learning_rate": 5.883942251443342e-05, "loss": 1.9099, "step": 11440 }, { "epoch": 432.45283018867923, "grad_norm": 1.0551943593908828, "learning_rate": 5.877301314556899e-05, "loss": 1.9141, "step": 11460 }, { "epoch": 433.20754716981133, "grad_norm": 1.086077987508793, "learning_rate": 5.870654084263047e-05, "loss": 1.96, "step": 11480 }, { "epoch": 433.9622641509434, "grad_norm": 1.3656558008500363, "learning_rate": 5.864000586269215e-05, "loss": 1.904, "step": 11500 }, { "epoch": 434.7169811320755, "grad_norm": 1.4439265227942644, "learning_rate": 5.8573408463070655e-05, "loss": 1.9273, "step": 11520 }, { "epoch": 435.47169811320754, "grad_norm": 1.3611886653187657, "learning_rate": 5.850674890132405e-05, "loss": 1.9034, "step": 11540 }, { "epoch": 436.22641509433964, "grad_norm": 1.3616986059829845, "learning_rate": 5.844002743525081e-05, "loss": 1.9143, "step": 11560 }, { "epoch": 436.9811320754717, "grad_norm": 1.1127209994732485, "learning_rate": 5.8373244322888796e-05, "loss": 1.9467, "step": 11580 }, { "epoch": 437.7358490566038, "grad_norm": 1.2452581872873123, "learning_rate": 5.83063998225143e-05, "loss": 1.946, "step": 11600 }, { "epoch": 438.49056603773585, "grad_norm": 1.169219637417814, "learning_rate": 5.823949419264102e-05, "loss": 1.9057, "step": 11620 }, { "epoch": 439.24528301886795, "grad_norm": 1.3149994286787028, "learning_rate": 5.817252769201905e-05, "loss": 1.8922, "step": 11640 }, { "epoch": 440.0, "grad_norm": 1.3111574851574335, "learning_rate": 5.81055005796339e-05, "loss": 1.9222, "step": 11660 }, { "epoch": 440.75471698113205, "grad_norm": 1.187457856172297, "learning_rate": 5.803841311470551e-05, "loss": 1.9188, "step": 11680 }, { "epoch": 441.50943396226415, "grad_norm": 1.057264779435906, "learning_rate": 5.7971265556687206e-05, "loss": 1.9185, "step": 11700 }, { "epoch": 442.2641509433962, "grad_norm": 1.1403855029477634, "learning_rate": 5.790405816526473e-05, "loss": 1.9328, "step": 11720 }, { "epoch": 443.0188679245283, "grad_norm": 1.2270148994812622, "learning_rate": 5.78367912003552e-05, "loss": 1.8952, "step": 11740 }, { "epoch": 443.77358490566036, "grad_norm": 1.0385291337101263, "learning_rate": 5.776946492210618e-05, "loss": 1.9042, "step": 11760 }, { "epoch": 444.52830188679246, "grad_norm": 1.2996045459665522, "learning_rate": 5.770207959089455e-05, "loss": 1.9373, "step": 11780 }, { "epoch": 445.2830188679245, "grad_norm": 1.1405006769622614, "learning_rate": 5.763463546732563e-05, "loss": 1.9035, "step": 11800 }, { "epoch": 446.0377358490566, "grad_norm": 1.2182586988416257, "learning_rate": 5.756713281223206e-05, "loss": 1.936, "step": 11820 }, { "epoch": 446.79245283018867, "grad_norm": 1.097561953783009, "learning_rate": 5.74995718866729e-05, "loss": 1.9057, "step": 11840 }, { "epoch": 447.54716981132077, "grad_norm": 1.0690078287157365, "learning_rate": 5.743195295193255e-05, "loss": 1.9074, "step": 11860 }, { "epoch": 448.3018867924528, "grad_norm": 1.1127790113128593, "learning_rate": 5.736427626951971e-05, "loss": 1.9269, "step": 11880 }, { "epoch": 449.0566037735849, "grad_norm": 1.0521548682001445, "learning_rate": 5.729654210116646e-05, "loss": 1.897, "step": 11900 }, { "epoch": 449.811320754717, "grad_norm": 1.0831322365716964, "learning_rate": 5.7228750708827196e-05, "loss": 1.9019, "step": 11920 }, { "epoch": 450.5660377358491, "grad_norm": 1.113425539515294, "learning_rate": 5.71609023546776e-05, "loss": 1.8995, "step": 11940 }, { "epoch": 451.3207547169811, "grad_norm": 1.1378527380008467, "learning_rate": 5.709299730111367e-05, "loss": 1.9112, "step": 11960 }, { "epoch": 452.07547169811323, "grad_norm": 1.2308344759482057, "learning_rate": 5.702503581075065e-05, "loss": 1.8869, "step": 11980 }, { "epoch": 452.8301886792453, "grad_norm": 1.3869181367868268, "learning_rate": 5.6957018146422106e-05, "loss": 1.9092, "step": 12000 }, { "epoch": 453.58490566037733, "grad_norm": 1.1702979518774306, "learning_rate": 5.688894457117877e-05, "loss": 1.8944, "step": 12020 }, { "epoch": 454.33962264150944, "grad_norm": 1.2974690219427283, "learning_rate": 5.6820815348287674e-05, "loss": 1.8794, "step": 12040 }, { "epoch": 455.0943396226415, "grad_norm": 1.3757370848375583, "learning_rate": 5.675263074123103e-05, "loss": 1.9208, "step": 12060 }, { "epoch": 455.8490566037736, "grad_norm": 1.3314963474728592, "learning_rate": 5.668439101370524e-05, "loss": 1.8823, "step": 12080 }, { "epoch": 456.60377358490564, "grad_norm": 1.1525239716029143, "learning_rate": 5.6616096429619885e-05, "loss": 1.8778, "step": 12100 }, { "epoch": 457.35849056603774, "grad_norm": 1.1391429331630094, "learning_rate": 5.6547747253096713e-05, "loss": 1.8973, "step": 12120 }, { "epoch": 458.1132075471698, "grad_norm": 1.2813875070982645, "learning_rate": 5.647934374846856e-05, "loss": 1.9037, "step": 12140 }, { "epoch": 458.8679245283019, "grad_norm": 1.130130379386682, "learning_rate": 5.641088618027841e-05, "loss": 1.8946, "step": 12160 }, { "epoch": 459.62264150943395, "grad_norm": 1.189098976296786, "learning_rate": 5.6342374813278305e-05, "loss": 1.9122, "step": 12180 }, { "epoch": 460.37735849056605, "grad_norm": 1.18982288351709, "learning_rate": 5.627380991242839e-05, "loss": 1.8893, "step": 12200 }, { "epoch": 461.1320754716981, "grad_norm": 1.3440462024222728, "learning_rate": 5.6205191742895787e-05, "loss": 1.8879, "step": 12220 }, { "epoch": 461.8867924528302, "grad_norm": 1.0998628162432096, "learning_rate": 5.613652057005367e-05, "loss": 1.8911, "step": 12240 }, { "epoch": 462.64150943396226, "grad_norm": 1.0660994627393063, "learning_rate": 5.6067796659480196e-05, "loss": 1.9055, "step": 12260 }, { "epoch": 463.39622641509436, "grad_norm": 1.7426680752228556, "learning_rate": 5.599902027695745e-05, "loss": 1.897, "step": 12280 }, { "epoch": 464.1509433962264, "grad_norm": 1.388841332022157, "learning_rate": 5.593019168847049e-05, "loss": 1.8812, "step": 12300 }, { "epoch": 464.9056603773585, "grad_norm": 1.2274558384609464, "learning_rate": 5.586131116020621e-05, "loss": 1.8496, "step": 12320 }, { "epoch": 465.66037735849056, "grad_norm": 1.1945002690405846, "learning_rate": 5.5792378958552456e-05, "loss": 1.9146, "step": 12340 }, { "epoch": 466.41509433962267, "grad_norm": 1.1629769495886029, "learning_rate": 5.5723395350096866e-05, "loss": 1.8734, "step": 12360 }, { "epoch": 467.1698113207547, "grad_norm": 1.1703423211235366, "learning_rate": 5.565436060162589e-05, "loss": 1.8882, "step": 12380 }, { "epoch": 467.92452830188677, "grad_norm": 1.3904930914694782, "learning_rate": 5.5585274980123765e-05, "loss": 1.8794, "step": 12400 }, { "epoch": 468.6792452830189, "grad_norm": 1.1043102032574945, "learning_rate": 5.551613875277148e-05, "loss": 1.888, "step": 12420 }, { "epoch": 469.4339622641509, "grad_norm": 1.019172984960956, "learning_rate": 5.5446952186945716e-05, "loss": 1.8887, "step": 12440 }, { "epoch": 470.188679245283, "grad_norm": 1.2815784609995193, "learning_rate": 5.537771555021785e-05, "loss": 1.9026, "step": 12460 }, { "epoch": 470.9433962264151, "grad_norm": 1.0452909524777938, "learning_rate": 5.53084291103529e-05, "loss": 1.8688, "step": 12480 }, { "epoch": 471.6981132075472, "grad_norm": 1.2824929707840547, "learning_rate": 5.5239093135308484e-05, "loss": 1.8568, "step": 12500 }, { "epoch": 472.45283018867923, "grad_norm": 1.0473918662270072, "learning_rate": 5.516970789323382e-05, "loss": 1.8962, "step": 12520 }, { "epoch": 473.20754716981133, "grad_norm": 1.1551860073406197, "learning_rate": 5.5100273652468596e-05, "loss": 1.9053, "step": 12540 }, { "epoch": 473.9622641509434, "grad_norm": 1.2315884678620779, "learning_rate": 5.50307906815421e-05, "loss": 1.8802, "step": 12560 }, { "epoch": 474.7169811320755, "grad_norm": 1.2036057101238689, "learning_rate": 5.496125924917195e-05, "loss": 1.8848, "step": 12580 }, { "epoch": 475.47169811320754, "grad_norm": 1.1443042979106692, "learning_rate": 5.4891679624263313e-05, "loss": 1.8993, "step": 12600 }, { "epoch": 476.22641509433964, "grad_norm": 1.1112985130684456, "learning_rate": 5.482205207590763e-05, "loss": 1.8997, "step": 12620 }, { "epoch": 476.9811320754717, "grad_norm": 1.1198907315048803, "learning_rate": 5.475237687338175e-05, "loss": 1.9204, "step": 12640 }, { "epoch": 477.7358490566038, "grad_norm": 1.0505243476691362, "learning_rate": 5.468265428614679e-05, "loss": 1.8824, "step": 12660 }, { "epoch": 478.49056603773585, "grad_norm": 1.1618158349057395, "learning_rate": 5.461288458384711e-05, "loss": 1.8675, "step": 12680 }, { "epoch": 479.24528301886795, "grad_norm": 1.310696647632245, "learning_rate": 5.454306803630931e-05, "loss": 1.8617, "step": 12700 }, { "epoch": 480.0, "grad_norm": 1.2853008412145361, "learning_rate": 5.447320491354114e-05, "loss": 1.8798, "step": 12720 }, { "epoch": 480.75471698113205, "grad_norm": 1.2035604713641803, "learning_rate": 5.440329548573049e-05, "loss": 1.8505, "step": 12740 }, { "epoch": 481.50943396226415, "grad_norm": 1.301768301418178, "learning_rate": 5.433334002324431e-05, "loss": 1.8849, "step": 12760 }, { "epoch": 482.2641509433962, "grad_norm": 1.0741158531319273, "learning_rate": 5.426333879662761e-05, "loss": 1.8362, "step": 12780 }, { "epoch": 483.0188679245283, "grad_norm": 1.2118720683926874, "learning_rate": 5.419329207660237e-05, "loss": 1.8811, "step": 12800 }, { "epoch": 483.77358490566036, "grad_norm": 1.295829194970654, "learning_rate": 5.412320013406651e-05, "loss": 1.8473, "step": 12820 }, { "epoch": 484.52830188679246, "grad_norm": 1.2658203604478202, "learning_rate": 5.405306324009282e-05, "loss": 1.8728, "step": 12840 }, { "epoch": 485.2830188679245, "grad_norm": 1.2195390339098875, "learning_rate": 5.3982881665928015e-05, "loss": 1.8704, "step": 12860 }, { "epoch": 486.0377358490566, "grad_norm": 1.067227068131729, "learning_rate": 5.391265568299149e-05, "loss": 1.8619, "step": 12880 }, { "epoch": 486.79245283018867, "grad_norm": 1.3306442274846357, "learning_rate": 5.384238556287451e-05, "loss": 1.8638, "step": 12900 }, { "epoch": 487.54716981132077, "grad_norm": 1.2531810114251472, "learning_rate": 5.377207157733893e-05, "loss": 1.8839, "step": 12920 }, { "epoch": 488.3018867924528, "grad_norm": 1.0879029191456078, "learning_rate": 5.370171399831631e-05, "loss": 1.866, "step": 12940 }, { "epoch": 489.0566037735849, "grad_norm": 1.1769881515511749, "learning_rate": 5.363131309790678e-05, "loss": 1.8253, "step": 12960 }, { "epoch": 489.811320754717, "grad_norm": 1.3614975573612427, "learning_rate": 5.356086914837802e-05, "loss": 1.8487, "step": 12980 }, { "epoch": 490.5660377358491, "grad_norm": 1.5342718531352588, "learning_rate": 5.349038242216419e-05, "loss": 1.847, "step": 13000 }, { "epoch": 491.3207547169811, "grad_norm": 1.1571547119310825, "learning_rate": 5.341985319186489e-05, "loss": 1.8822, "step": 13020 }, { "epoch": 492.07547169811323, "grad_norm": 1.1739881074209173, "learning_rate": 5.33492817302441e-05, "loss": 1.8531, "step": 13040 }, { "epoch": 492.8301886792453, "grad_norm": 1.1934573145337144, "learning_rate": 5.3278668310229125e-05, "loss": 1.8986, "step": 13060 }, { "epoch": 493.58490566037733, "grad_norm": 2.343948986647593, "learning_rate": 5.320801320490955e-05, "loss": 1.8935, "step": 13080 }, { "epoch": 494.33962264150944, "grad_norm": 1.0764970314512263, "learning_rate": 5.3137316687536136e-05, "loss": 1.854, "step": 13100 }, { "epoch": 495.0943396226415, "grad_norm": 1.0129335749841757, "learning_rate": 5.3066579031519824e-05, "loss": 1.874, "step": 13120 }, { "epoch": 495.8490566037736, "grad_norm": 1.3577114498479963, "learning_rate": 5.299580051043069e-05, "loss": 1.8534, "step": 13140 }, { "epoch": 496.60377358490564, "grad_norm": 1.1849380554040083, "learning_rate": 5.292498139799678e-05, "loss": 1.8705, "step": 13160 }, { "epoch": 497.35849056603774, "grad_norm": 1.1290155132472126, "learning_rate": 5.2854121968103185e-05, "loss": 1.8659, "step": 13180 }, { "epoch": 498.1132075471698, "grad_norm": 1.3627686926052522, "learning_rate": 5.278322249479088e-05, "loss": 1.8686, "step": 13200 }, { "epoch": 498.8679245283019, "grad_norm": 1.147585604555274, "learning_rate": 5.271228325225573e-05, "loss": 1.8301, "step": 13220 }, { "epoch": 499.62264150943395, "grad_norm": 1.124456551859716, "learning_rate": 5.264130451484736e-05, "loss": 1.846, "step": 13240 }, { "epoch": 500.37735849056605, "grad_norm": 1.1328557419125425, "learning_rate": 5.257028655706819e-05, "loss": 1.8489, "step": 13260 }, { "epoch": 501.1320754716981, "grad_norm": 1.3248366741093285, "learning_rate": 5.249922965357231e-05, "loss": 1.847, "step": 13280 }, { "epoch": 501.8867924528302, "grad_norm": 1.2987470821979115, "learning_rate": 5.24281340791644e-05, "loss": 1.8598, "step": 13300 }, { "epoch": 502.64150943396226, "grad_norm": 1.2954015666799925, "learning_rate": 5.235700010879869e-05, "loss": 1.8144, "step": 13320 }, { "epoch": 503.39622641509436, "grad_norm": 1.14457084913068, "learning_rate": 5.228582801757796e-05, "loss": 1.8666, "step": 13340 }, { "epoch": 504.1509433962264, "grad_norm": 1.1877213110343792, "learning_rate": 5.221461808075237e-05, "loss": 1.8637, "step": 13360 }, { "epoch": 504.9056603773585, "grad_norm": 1.0687096720310838, "learning_rate": 5.214337057371846e-05, "loss": 1.8639, "step": 13380 }, { "epoch": 505.66037735849056, "grad_norm": 1.1296264305151373, "learning_rate": 5.207208577201805e-05, "loss": 1.8508, "step": 13400 }, { "epoch": 506.41509433962267, "grad_norm": 1.4245030753661052, "learning_rate": 5.200076395133721e-05, "loss": 1.8328, "step": 13420 }, { "epoch": 507.1698113207547, "grad_norm": 1.3736962730435212, "learning_rate": 5.1929405387505185e-05, "loss": 1.8402, "step": 13440 }, { "epoch": 507.92452830188677, "grad_norm": 1.3874806329791736, "learning_rate": 5.185801035649329e-05, "loss": 1.8392, "step": 13460 }, { "epoch": 508.6792452830189, "grad_norm": 1.2993168124302985, "learning_rate": 5.1786579134413916e-05, "loss": 1.8357, "step": 13480 }, { "epoch": 509.4339622641509, "grad_norm": 1.1615849238599296, "learning_rate": 5.171511199751936e-05, "loss": 1.8602, "step": 13500 }, { "epoch": 510.188679245283, "grad_norm": 1.313961870036688, "learning_rate": 5.164360922220089e-05, "loss": 1.8276, "step": 13520 }, { "epoch": 510.9433962264151, "grad_norm": 1.240911570140835, "learning_rate": 5.157207108498754e-05, "loss": 1.83, "step": 13540 }, { "epoch": 511.6981132075472, "grad_norm": 1.2739381058558579, "learning_rate": 5.1500497862545134e-05, "loss": 1.864, "step": 13560 }, { "epoch": 512.4528301886793, "grad_norm": 1.3641387795362538, "learning_rate": 5.142888983167516e-05, "loss": 1.9016, "step": 13580 }, { "epoch": 513.2075471698113, "grad_norm": 1.233949050539118, "learning_rate": 5.135724726931374e-05, "loss": 1.8224, "step": 13600 }, { "epoch": 513.9622641509434, "grad_norm": 1.2764553522178392, "learning_rate": 5.128557045253056e-05, "loss": 1.8489, "step": 13620 }, { "epoch": 514.7169811320755, "grad_norm": 1.0847794881407822, "learning_rate": 5.121385965852773e-05, "loss": 1.8433, "step": 13640 }, { "epoch": 515.4716981132076, "grad_norm": 1.211639546404476, "learning_rate": 5.114211516463883e-05, "loss": 1.8592, "step": 13660 }, { "epoch": 516.2264150943396, "grad_norm": 1.6499382505803508, "learning_rate": 5.1070337248327704e-05, "loss": 1.8491, "step": 13680 }, { "epoch": 516.9811320754717, "grad_norm": 1.1415154218905448, "learning_rate": 5.0998526187187506e-05, "loss": 1.8263, "step": 13700 }, { "epoch": 517.7358490566038, "grad_norm": 1.2931592721596668, "learning_rate": 5.092668225893955e-05, "loss": 1.8341, "step": 13720 }, { "epoch": 518.4905660377359, "grad_norm": 1.1289936456910783, "learning_rate": 5.0854805741432266e-05, "loss": 1.8256, "step": 13740 }, { "epoch": 519.2452830188679, "grad_norm": 1.1568681178648177, "learning_rate": 5.078289691264009e-05, "loss": 1.8268, "step": 13760 }, { "epoch": 520.0, "grad_norm": 1.2075151796344337, "learning_rate": 5.071095605066247e-05, "loss": 1.8342, "step": 13780 }, { "epoch": 520.7547169811321, "grad_norm": 1.41061431054736, "learning_rate": 5.063898343372271e-05, "loss": 1.8569, "step": 13800 }, { "epoch": 521.5094339622641, "grad_norm": 1.7141184097601845, "learning_rate": 5.0566979340166915e-05, "loss": 1.8447, "step": 13820 }, { "epoch": 522.2641509433962, "grad_norm": 1.1912730571129804, "learning_rate": 5.0494944048462946e-05, "loss": 1.8632, "step": 13840 }, { "epoch": 523.0188679245283, "grad_norm": 1.2784159482259496, "learning_rate": 5.042287783719931e-05, "loss": 1.8293, "step": 13860 }, { "epoch": 523.7735849056604, "grad_norm": 1.1444265949319492, "learning_rate": 5.0350780985084076e-05, "loss": 1.8423, "step": 13880 }, { "epoch": 524.5283018867924, "grad_norm": 1.1366776283872817, "learning_rate": 5.027865377094383e-05, "loss": 1.8284, "step": 13900 }, { "epoch": 525.2830188679245, "grad_norm": 1.2870871198292675, "learning_rate": 5.020649647372258e-05, "loss": 1.8313, "step": 13920 }, { "epoch": 526.0377358490566, "grad_norm": 1.8138208437079086, "learning_rate": 5.013430937248066e-05, "loss": 1.8382, "step": 13940 }, { "epoch": 526.7924528301887, "grad_norm": 1.319578877513452, "learning_rate": 5.00620927463937e-05, "loss": 1.8343, "step": 13960 }, { "epoch": 527.5471698113207, "grad_norm": 1.2254503656584737, "learning_rate": 4.998984687475148e-05, "loss": 1.8439, "step": 13980 }, { "epoch": 528.3018867924528, "grad_norm": 1.0900572753736815, "learning_rate": 4.9917572036956896e-05, "loss": 1.8339, "step": 14000 }, { "epoch": 529.0566037735849, "grad_norm": 1.3672093328811397, "learning_rate": 4.984526851252489e-05, "loss": 1.8269, "step": 14020 }, { "epoch": 529.811320754717, "grad_norm": 1.0474703180578433, "learning_rate": 4.97729365810813e-05, "loss": 1.8278, "step": 14040 }, { "epoch": 530.566037735849, "grad_norm": 1.302303693187968, "learning_rate": 4.9700576522361875e-05, "loss": 1.8406, "step": 14060 }, { "epoch": 531.3207547169811, "grad_norm": 1.3351319231004828, "learning_rate": 4.96281886162111e-05, "loss": 1.805, "step": 14080 }, { "epoch": 532.0754716981132, "grad_norm": 1.7423062973900807, "learning_rate": 4.955577314258118e-05, "loss": 1.8021, "step": 14100 }, { "epoch": 532.8301886792453, "grad_norm": 1.0851524592672839, "learning_rate": 4.9483330381530944e-05, "loss": 1.8376, "step": 14120 }, { "epoch": 533.5849056603773, "grad_norm": 1.110982412101906, "learning_rate": 4.941086061322473e-05, "loss": 1.8468, "step": 14140 }, { "epoch": 534.3396226415094, "grad_norm": 1.1481042439437046, "learning_rate": 4.933836411793133e-05, "loss": 1.8131, "step": 14160 }, { "epoch": 535.0943396226415, "grad_norm": 1.1504371756112235, "learning_rate": 4.926584117602288e-05, "loss": 1.8081, "step": 14180 }, { "epoch": 535.8490566037735, "grad_norm": 1.1403864383961178, "learning_rate": 4.919329206797387e-05, "loss": 1.823, "step": 14200 }, { "epoch": 536.6037735849056, "grad_norm": 1.2962348904995422, "learning_rate": 4.912071707435988e-05, "loss": 1.8187, "step": 14220 }, { "epoch": 537.3584905660377, "grad_norm": 1.1885752096952027, "learning_rate": 4.904811647585668e-05, "loss": 1.8256, "step": 14240 }, { "epoch": 538.1132075471698, "grad_norm": 1.064497747677543, "learning_rate": 4.897549055323902e-05, "loss": 1.8, "step": 14260 }, { "epoch": 538.8679245283018, "grad_norm": 1.4469124816185257, "learning_rate": 4.8902839587379614e-05, "loss": 1.8365, "step": 14280 }, { "epoch": 539.622641509434, "grad_norm": 1.0326597719869466, "learning_rate": 4.8830163859248014e-05, "loss": 1.812, "step": 14300 }, { "epoch": 540.377358490566, "grad_norm": 1.261127094091647, "learning_rate": 4.875746364990955e-05, "loss": 1.7936, "step": 14320 }, { "epoch": 541.1320754716982, "grad_norm": 1.6850662159573848, "learning_rate": 4.8684739240524185e-05, "loss": 1.8039, "step": 14340 }, { "epoch": 541.8867924528302, "grad_norm": 1.1719859164333604, "learning_rate": 4.861199091234556e-05, "loss": 1.7995, "step": 14360 }, { "epoch": 542.6415094339623, "grad_norm": 1.1168812884827573, "learning_rate": 4.853921894671973e-05, "loss": 1.804, "step": 14380 }, { "epoch": 543.3962264150944, "grad_norm": 1.5041434010962127, "learning_rate": 4.846642362508422e-05, "loss": 1.8042, "step": 14400 }, { "epoch": 544.1509433962265, "grad_norm": 1.2922119772360392, "learning_rate": 4.8393605228966854e-05, "loss": 1.8176, "step": 14420 }, { "epoch": 544.9056603773585, "grad_norm": 1.316092813395267, "learning_rate": 4.832076403998472e-05, "loss": 1.8324, "step": 14440 }, { "epoch": 545.6603773584906, "grad_norm": 1.148925533679318, "learning_rate": 4.8247900339843045e-05, "loss": 1.8249, "step": 14460 }, { "epoch": 546.4150943396227, "grad_norm": 1.3351586320323485, "learning_rate": 4.817501441033409e-05, "loss": 1.8023, "step": 14480 }, { "epoch": 547.1698113207547, "grad_norm": 1.4554583529380825, "learning_rate": 4.810210653333613e-05, "loss": 1.782, "step": 14500 }, { "epoch": 547.9245283018868, "grad_norm": 1.2418737812043639, "learning_rate": 4.802917699081225e-05, "loss": 1.7981, "step": 14520 }, { "epoch": 548.6792452830189, "grad_norm": 1.1837142285238051, "learning_rate": 4.795622606480942e-05, "loss": 1.7982, "step": 14540 }, { "epoch": 549.433962264151, "grad_norm": 1.2674115880751322, "learning_rate": 4.788325403745724e-05, "loss": 1.8055, "step": 14560 }, { "epoch": 550.188679245283, "grad_norm": 2.02523705877845, "learning_rate": 4.7810261190966944e-05, "loss": 1.7905, "step": 14580 }, { "epoch": 550.9433962264151, "grad_norm": 1.3660297273644537, "learning_rate": 4.773724780763023e-05, "loss": 1.8267, "step": 14600 }, { "epoch": 551.6981132075472, "grad_norm": 1.1728070148137189, "learning_rate": 4.766421416981833e-05, "loss": 1.7862, "step": 14620 }, { "epoch": 552.4528301886793, "grad_norm": 1.148521109395332, "learning_rate": 4.759116055998069e-05, "loss": 1.7842, "step": 14640 }, { "epoch": 553.2075471698113, "grad_norm": 1.2578627421373816, "learning_rate": 4.7518087260644065e-05, "loss": 1.8105, "step": 14660 }, { "epoch": 553.9622641509434, "grad_norm": 1.2736902452272465, "learning_rate": 4.744499455441133e-05, "loss": 1.7931, "step": 14680 }, { "epoch": 554.7169811320755, "grad_norm": 1.0794014181765008, "learning_rate": 4.737188272396044e-05, "loss": 1.8043, "step": 14700 }, { "epoch": 555.4716981132076, "grad_norm": 1.3894129104855453, "learning_rate": 4.729875205204327e-05, "loss": 1.8301, "step": 14720 }, { "epoch": 556.2264150943396, "grad_norm": 1.147340224849857, "learning_rate": 4.722560282148459e-05, "loss": 1.8178, "step": 14740 }, { "epoch": 556.9811320754717, "grad_norm": 1.3948879461559769, "learning_rate": 4.7152435315180975e-05, "loss": 1.7648, "step": 14760 }, { "epoch": 557.7358490566038, "grad_norm": 1.3694680696221502, "learning_rate": 4.7079249816099584e-05, "loss": 1.8104, "step": 14780 }, { "epoch": 558.4905660377359, "grad_norm": 1.4147919843537753, "learning_rate": 4.700604660727726e-05, "loss": 1.7721, "step": 14800 }, { "epoch": 559.2452830188679, "grad_norm": 1.2297666792262925, "learning_rate": 4.6932825971819285e-05, "loss": 1.7923, "step": 14820 }, { "epoch": 560.0, "grad_norm": 1.1416590332464547, "learning_rate": 4.6859588192898365e-05, "loss": 1.7709, "step": 14840 }, { "epoch": 560.7547169811321, "grad_norm": 1.2633394473980435, "learning_rate": 4.6786333553753454e-05, "loss": 1.8265, "step": 14860 }, { "epoch": 561.5094339622641, "grad_norm": 1.73410063706433, "learning_rate": 4.671306233768877e-05, "loss": 1.7935, "step": 14880 }, { "epoch": 562.2641509433962, "grad_norm": 1.909552398589606, "learning_rate": 4.663977482807263e-05, "loss": 1.7928, "step": 14900 }, { "epoch": 563.0188679245283, "grad_norm": 2.3340344731557505, "learning_rate": 4.656647130833632e-05, "loss": 1.8083, "step": 14920 }, { "epoch": 563.7735849056604, "grad_norm": 1.5856106264075287, "learning_rate": 4.64931520619731e-05, "loss": 1.8345, "step": 14940 }, { "epoch": 564.5283018867924, "grad_norm": 1.4125116448786768, "learning_rate": 4.6419817372537015e-05, "loss": 1.7764, "step": 14960 }, { "epoch": 565.2830188679245, "grad_norm": 1.1720058705654566, "learning_rate": 4.634646752364185e-05, "loss": 1.7917, "step": 14980 }, { "epoch": 566.0377358490566, "grad_norm": 1.1615325214837866, "learning_rate": 4.627310279896001e-05, "loss": 1.7916, "step": 15000 }, { "epoch": 566.7924528301887, "grad_norm": 1.3392839325444756, "learning_rate": 4.619972348222143e-05, "loss": 1.7803, "step": 15020 }, { "epoch": 567.5471698113207, "grad_norm": 1.3665016393198224, "learning_rate": 4.6126329857212486e-05, "loss": 1.7822, "step": 15040 }, { "epoch": 568.3018867924528, "grad_norm": 1.6085820618369988, "learning_rate": 4.605292220777489e-05, "loss": 1.7889, "step": 15060 }, { "epoch": 569.0566037735849, "grad_norm": 1.4230286645037085, "learning_rate": 4.5979500817804594e-05, "loss": 1.805, "step": 15080 }, { "epoch": 569.811320754717, "grad_norm": 1.1596468566263143, "learning_rate": 4.590606597125065e-05, "loss": 1.7892, "step": 15100 }, { "epoch": 570.566037735849, "grad_norm": 1.4539519726534167, "learning_rate": 4.583261795211423e-05, "loss": 1.7831, "step": 15120 }, { "epoch": 571.3207547169811, "grad_norm": 1.2521318013943803, "learning_rate": 4.575915704444736e-05, "loss": 1.8018, "step": 15140 }, { "epoch": 572.0754716981132, "grad_norm": 1.1819685518944387, "learning_rate": 4.5685683532352e-05, "loss": 1.7866, "step": 15160 }, { "epoch": 572.8301886792453, "grad_norm": 1.1876204585927221, "learning_rate": 4.5612197699978766e-05, "loss": 1.7833, "step": 15180 }, { "epoch": 573.5849056603773, "grad_norm": 1.1556357684763976, "learning_rate": 4.5538699831526006e-05, "loss": 1.8024, "step": 15200 }, { "epoch": 574.3396226415094, "grad_norm": 1.3326980140111142, "learning_rate": 4.5465190211238544e-05, "loss": 1.7829, "step": 15220 }, { "epoch": 575.0943396226415, "grad_norm": 1.308268798679134, "learning_rate": 4.539166912340671e-05, "loss": 1.7766, "step": 15240 }, { "epoch": 575.8490566037735, "grad_norm": 1.1564791044184874, "learning_rate": 4.531813685236516e-05, "loss": 1.8021, "step": 15260 }, { "epoch": 576.6037735849056, "grad_norm": 1.4187942127459952, "learning_rate": 4.524459368249179e-05, "loss": 1.7523, "step": 15280 }, { "epoch": 577.3584905660377, "grad_norm": 1.1994628151621998, "learning_rate": 4.5171039898206644e-05, "loss": 1.7845, "step": 15300 }, { "epoch": 578.1132075471698, "grad_norm": 1.172216325696233, "learning_rate": 4.509747578397086e-05, "loss": 1.7591, "step": 15320 }, { "epoch": 578.8679245283018, "grad_norm": 1.1667988074546227, "learning_rate": 4.5023901624285465e-05, "loss": 1.7955, "step": 15340 }, { "epoch": 579.622641509434, "grad_norm": 1.31427332849911, "learning_rate": 4.495031770369038e-05, "loss": 1.7605, "step": 15360 }, { "epoch": 580.377358490566, "grad_norm": 1.2050607756000014, "learning_rate": 4.487672430676325e-05, "loss": 1.7673, "step": 15380 }, { "epoch": 581.1320754716982, "grad_norm": 1.2087614153318165, "learning_rate": 4.480312171811838e-05, "loss": 1.7876, "step": 15400 }, { "epoch": 581.8867924528302, "grad_norm": 1.3000620466205515, "learning_rate": 4.472951022240562e-05, "loss": 1.7611, "step": 15420 }, { "epoch": 582.6415094339623, "grad_norm": 1.7966112906689369, "learning_rate": 4.4655890104309254e-05, "loss": 1.7702, "step": 15440 }, { "epoch": 583.3962264150944, "grad_norm": 1.198242649687164, "learning_rate": 4.458226164854697e-05, "loss": 1.7942, "step": 15460 }, { "epoch": 584.1509433962265, "grad_norm": 1.6859720478683236, "learning_rate": 4.450862513986861e-05, "loss": 1.758, "step": 15480 }, { "epoch": 584.9056603773585, "grad_norm": 1.1440767805984655, "learning_rate": 4.443498086305525e-05, "loss": 1.7647, "step": 15500 }, { "epoch": 585.6603773584906, "grad_norm": 1.2426581026511485, "learning_rate": 4.436132910291792e-05, "loss": 1.7468, "step": 15520 }, { "epoch": 586.4150943396227, "grad_norm": 1.5652742956982049, "learning_rate": 4.4287670144296675e-05, "loss": 1.7733, "step": 15540 }, { "epoch": 587.1698113207547, "grad_norm": 1.150105537080449, "learning_rate": 4.421400427205934e-05, "loss": 1.7878, "step": 15560 }, { "epoch": 587.9245283018868, "grad_norm": 1.297179193085273, "learning_rate": 4.4140331771100516e-05, "loss": 1.7558, "step": 15580 }, { "epoch": 588.6792452830189, "grad_norm": 1.4354989531166704, "learning_rate": 4.406665292634046e-05, "loss": 1.7652, "step": 15600 }, { "epoch": 589.433962264151, "grad_norm": 1.3544454831633896, "learning_rate": 4.399296802272388e-05, "loss": 1.7695, "step": 15620 }, { "epoch": 590.188679245283, "grad_norm": 1.356710977975809, "learning_rate": 4.3919277345219033e-05, "loss": 1.7317, "step": 15640 }, { "epoch": 590.9433962264151, "grad_norm": 1.3504644293745585, "learning_rate": 4.3845581178816394e-05, "loss": 1.7784, "step": 15660 }, { "epoch": 591.6981132075472, "grad_norm": 1.2934057468915228, "learning_rate": 4.377187980852775e-05, "loss": 1.7655, "step": 15680 }, { "epoch": 592.4528301886793, "grad_norm": 2.7284471186236976, "learning_rate": 4.369817351938495e-05, "loss": 1.7617, "step": 15700 }, { "epoch": 593.2075471698113, "grad_norm": 1.4587946653999224, "learning_rate": 4.3624462596438926e-05, "loss": 1.7675, "step": 15720 }, { "epoch": 593.9622641509434, "grad_norm": 1.824543804524391, "learning_rate": 4.3550747324758475e-05, "loss": 1.7835, "step": 15740 }, { "epoch": 594.7169811320755, "grad_norm": 1.1558960324762337, "learning_rate": 4.3477027989429267e-05, "loss": 1.7848, "step": 15760 }, { "epoch": 595.4716981132076, "grad_norm": 1.3618125278208344, "learning_rate": 4.340330487555261e-05, "loss": 1.7717, "step": 15780 }, { "epoch": 596.2264150943396, "grad_norm": 1.8336334887122832, "learning_rate": 4.332957826824451e-05, "loss": 1.7753, "step": 15800 }, { "epoch": 596.9811320754717, "grad_norm": 1.6035556059617442, "learning_rate": 4.325584845263445e-05, "loss": 1.7507, "step": 15820 }, { "epoch": 597.7358490566038, "grad_norm": 1.1021262642715972, "learning_rate": 4.318211571386428e-05, "loss": 1.7683, "step": 15840 }, { "epoch": 598.4905660377359, "grad_norm": 1.3112589542500708, "learning_rate": 4.310838033708722e-05, "loss": 1.753, "step": 15860 }, { "epoch": 599.2452830188679, "grad_norm": 1.5257318148219035, "learning_rate": 4.303464260746667e-05, "loss": 1.7446, "step": 15880 }, { "epoch": 600.0, "grad_norm": 1.2648959489882874, "learning_rate": 4.296090281017511e-05, "loss": 1.7513, "step": 15900 }, { "epoch": 600.7547169811321, "grad_norm": 1.4414622706601208, "learning_rate": 4.2887161230393034e-05, "loss": 1.7421, "step": 15920 }, { "epoch": 601.5094339622641, "grad_norm": 1.60216933395765, "learning_rate": 4.281341815330784e-05, "loss": 1.7335, "step": 15940 }, { "epoch": 602.2641509433962, "grad_norm": 1.438261210769706, "learning_rate": 4.273967386411267e-05, "loss": 1.7676, "step": 15960 }, { "epoch": 603.0188679245283, "grad_norm": 1.61121062509495, "learning_rate": 4.26659286480054e-05, "loss": 1.7767, "step": 15980 }, { "epoch": 603.7735849056604, "grad_norm": 1.3368219249794455, "learning_rate": 4.2592182790187495e-05, "loss": 1.7615, "step": 16000 }, { "epoch": 604.5283018867924, "grad_norm": 1.1964899050496502, "learning_rate": 4.251843657586285e-05, "loss": 1.7909, "step": 16020 }, { "epoch": 605.2830188679245, "grad_norm": 1.1409598499641234, "learning_rate": 4.244469029023682e-05, "loss": 1.7806, "step": 16040 }, { "epoch": 606.0377358490566, "grad_norm": 1.0775618341358217, "learning_rate": 4.237094421851494e-05, "loss": 1.7696, "step": 16060 }, { "epoch": 606.7924528301887, "grad_norm": 1.201425866436519, "learning_rate": 4.2297198645901986e-05, "loss": 1.7424, "step": 16080 }, { "epoch": 607.5471698113207, "grad_norm": 1.29163631265219, "learning_rate": 4.222345385760079e-05, "loss": 1.749, "step": 16100 }, { "epoch": 608.3018867924528, "grad_norm": 1.4158324908813191, "learning_rate": 4.214971013881114e-05, "loss": 1.7594, "step": 16120 }, { "epoch": 609.0566037735849, "grad_norm": 1.2390733211978042, "learning_rate": 4.2075967774728675e-05, "loss": 1.7707, "step": 16140 }, { "epoch": 609.811320754717, "grad_norm": 1.0960663109570459, "learning_rate": 4.200222705054385e-05, "loss": 1.7633, "step": 16160 }, { "epoch": 610.566037735849, "grad_norm": 1.167381366879647, "learning_rate": 4.1928488251440704e-05, "loss": 1.7735, "step": 16180 }, { "epoch": 611.3207547169811, "grad_norm": 1.468960912277373, "learning_rate": 4.185475166259588e-05, "loss": 1.7222, "step": 16200 }, { "epoch": 612.0754716981132, "grad_norm": 1.2572603668608606, "learning_rate": 4.178101756917746e-05, "loss": 1.7477, "step": 16220 }, { "epoch": 612.8301886792453, "grad_norm": 1.2661070355556836, "learning_rate": 4.170728625634387e-05, "loss": 1.7437, "step": 16240 }, { "epoch": 613.5849056603773, "grad_norm": 1.6793862205908143, "learning_rate": 4.16335580092428e-05, "loss": 1.7518, "step": 16260 }, { "epoch": 614.3396226415094, "grad_norm": 1.3347192318840417, "learning_rate": 4.155983311301006e-05, "loss": 1.7275, "step": 16280 }, { "epoch": 615.0943396226415, "grad_norm": 1.146186653201129, "learning_rate": 4.148611185276852e-05, "loss": 1.7505, "step": 16300 }, { "epoch": 615.8490566037735, "grad_norm": 1.2853858560898548, "learning_rate": 4.1412394513626976e-05, "loss": 1.7345, "step": 16320 }, { "epoch": 616.6037735849056, "grad_norm": 1.3689931241044506, "learning_rate": 4.1338681380679055e-05, "loss": 1.7372, "step": 16340 }, { "epoch": 617.3584905660377, "grad_norm": 1.2520152979412003, "learning_rate": 4.126497273900214e-05, "loss": 1.7749, "step": 16360 }, { "epoch": 618.1132075471698, "grad_norm": 4.4664529214017685, "learning_rate": 4.119126887365623e-05, "loss": 1.7291, "step": 16380 }, { "epoch": 618.8679245283018, "grad_norm": 1.4871942552231863, "learning_rate": 4.111757006968283e-05, "loss": 1.729, "step": 16400 }, { "epoch": 619.622641509434, "grad_norm": 1.7327021169643824, "learning_rate": 4.104387661210391e-05, "loss": 1.7906, "step": 16420 }, { "epoch": 620.377358490566, "grad_norm": 1.3011566548368803, "learning_rate": 4.0970188785920764e-05, "loss": 1.7498, "step": 16440 }, { "epoch": 621.1320754716982, "grad_norm": 1.503913785893422, "learning_rate": 4.0896506876112856e-05, "loss": 1.7333, "step": 16460 }, { "epoch": 621.8867924528302, "grad_norm": 1.2396874135815048, "learning_rate": 4.082283116763683e-05, "loss": 1.7474, "step": 16480 }, { "epoch": 622.6415094339623, "grad_norm": 1.3186465498196096, "learning_rate": 4.07491619454253e-05, "loss": 1.7641, "step": 16500 }, { "epoch": 623.3962264150944, "grad_norm": 1.2224446651472063, "learning_rate": 4.067549949438583e-05, "loss": 1.7596, "step": 16520 }, { "epoch": 624.1509433962265, "grad_norm": 1.299102298479128, "learning_rate": 4.060184409939977e-05, "loss": 1.7399, "step": 16540 }, { "epoch": 624.9056603773585, "grad_norm": 1.2080109960062584, "learning_rate": 4.052819604532121e-05, "loss": 1.7545, "step": 16560 }, { "epoch": 625.6603773584906, "grad_norm": 1.1330156099339754, "learning_rate": 4.04545556169758e-05, "loss": 1.7514, "step": 16580 }, { "epoch": 626.4150943396227, "grad_norm": 1.877556318395021, "learning_rate": 4.038092309915976e-05, "loss": 1.7495, "step": 16600 }, { "epoch": 627.1698113207547, "grad_norm": 1.3430468095941768, "learning_rate": 4.0307298776638696e-05, "loss": 1.7387, "step": 16620 }, { "epoch": 627.9245283018868, "grad_norm": 1.3456306138048115, "learning_rate": 4.023368293414651e-05, "loss": 1.7586, "step": 16640 }, { "epoch": 628.6792452830189, "grad_norm": 1.2925035537026515, "learning_rate": 4.016007585638428e-05, "loss": 1.7222, "step": 16660 }, { "epoch": 629.433962264151, "grad_norm": 1.5060755357936446, "learning_rate": 4.0086477828019247e-05, "loss": 1.734, "step": 16680 }, { "epoch": 630.188679245283, "grad_norm": 1.2358138916528858, "learning_rate": 4.001288913368361e-05, "loss": 1.7585, "step": 16700 }, { "epoch": 630.9433962264151, "grad_norm": 1.0536359575721053, "learning_rate": 3.9939310057973496e-05, "loss": 1.699, "step": 16720 }, { "epoch": 631.6981132075472, "grad_norm": 1.3396521000709494, "learning_rate": 3.986574088544782e-05, "loss": 1.745, "step": 16740 }, { "epoch": 632.4528301886793, "grad_norm": 1.1966711285530698, "learning_rate": 3.979218190062718e-05, "loss": 1.7049, "step": 16760 }, { "epoch": 633.2075471698113, "grad_norm": 1.3511753835544016, "learning_rate": 3.971863338799283e-05, "loss": 1.7319, "step": 16780 }, { "epoch": 633.9622641509434, "grad_norm": 1.2759632464750865, "learning_rate": 3.964509563198547e-05, "loss": 1.7431, "step": 16800 }, { "epoch": 634.7169811320755, "grad_norm": 1.5118791481962728, "learning_rate": 3.957156891700422e-05, "loss": 1.7549, "step": 16820 }, { "epoch": 635.4716981132076, "grad_norm": 1.358305138616916, "learning_rate": 3.949805352740549e-05, "loss": 1.7146, "step": 16840 }, { "epoch": 636.2264150943396, "grad_norm": 1.2468444876323985, "learning_rate": 3.9424549747501916e-05, "loss": 1.6839, "step": 16860 }, { "epoch": 636.9811320754717, "grad_norm": 1.6815476229074826, "learning_rate": 3.9351057861561194e-05, "loss": 1.7381, "step": 16880 }, { "epoch": 637.7358490566038, "grad_norm": 1.3183944234813532, "learning_rate": 3.927757815380507e-05, "loss": 1.737, "step": 16900 }, { "epoch": 638.4905660377359, "grad_norm": 1.2664716657296504, "learning_rate": 3.920411090840813e-05, "loss": 1.7552, "step": 16920 }, { "epoch": 639.2452830188679, "grad_norm": 1.4316626122660758, "learning_rate": 3.9130656409496826e-05, "loss": 1.7035, "step": 16940 }, { "epoch": 640.0, "grad_norm": 1.624465349724497, "learning_rate": 3.90572149411483e-05, "loss": 1.7349, "step": 16960 }, { "epoch": 640.7547169811321, "grad_norm": 1.3525138710560463, "learning_rate": 3.8983786787389264e-05, "loss": 1.7196, "step": 16980 }, { "epoch": 641.5094339622641, "grad_norm": 1.1968289253916946, "learning_rate": 3.891037223219497e-05, "loss": 1.7288, "step": 17000 }, { "epoch": 642.2641509433962, "grad_norm": 1.3150467173282183, "learning_rate": 3.883697155948808e-05, "loss": 1.7478, "step": 17020 }, { "epoch": 643.0188679245283, "grad_norm": 1.3494514082635618, "learning_rate": 3.876358505313754e-05, "loss": 1.7208, "step": 17040 }, { "epoch": 643.7735849056604, "grad_norm": 1.5328078930199742, "learning_rate": 3.869021299695754e-05, "loss": 1.747, "step": 17060 }, { "epoch": 644.5283018867924, "grad_norm": 1.2945392233470665, "learning_rate": 3.8616855674706354e-05, "loss": 1.7225, "step": 17080 }, { "epoch": 645.2830188679245, "grad_norm": 1.2582163265054458, "learning_rate": 3.854351337008532e-05, "loss": 1.7428, "step": 17100 }, { "epoch": 646.0377358490566, "grad_norm": 1.1370724946903576, "learning_rate": 3.847018636673765e-05, "loss": 1.704, "step": 17120 }, { "epoch": 646.7924528301887, "grad_norm": 1.2161474947016768, "learning_rate": 3.839687494824741e-05, "loss": 1.7129, "step": 17140 }, { "epoch": 647.5471698113207, "grad_norm": 1.1033819366614397, "learning_rate": 3.832357939813837e-05, "loss": 1.695, "step": 17160 }, { "epoch": 648.3018867924528, "grad_norm": 1.2993665260901381, "learning_rate": 3.825029999987296e-05, "loss": 1.7022, "step": 17180 }, { "epoch": 649.0566037735849, "grad_norm": 1.2577276608492982, "learning_rate": 3.8177037036851115e-05, "loss": 1.7029, "step": 17200 }, { "epoch": 649.811320754717, "grad_norm": 1.2459092691964395, "learning_rate": 3.810379079240922e-05, "loss": 1.7139, "step": 17220 }, { "epoch": 650.566037735849, "grad_norm": 1.3152629296897698, "learning_rate": 3.8030561549819015e-05, "loss": 1.7088, "step": 17240 }, { "epoch": 651.3207547169811, "grad_norm": 1.2367123181404969, "learning_rate": 3.795734959228645e-05, "loss": 1.6936, "step": 17260 }, { "epoch": 652.0754716981132, "grad_norm": 1.1338754969305556, "learning_rate": 3.7884155202950696e-05, "loss": 1.7151, "step": 17280 }, { "epoch": 652.8301886792453, "grad_norm": 1.2942728726977033, "learning_rate": 3.781097866488291e-05, "loss": 1.712, "step": 17300 }, { "epoch": 653.5849056603773, "grad_norm": 1.39400718208209, "learning_rate": 3.773782026108526e-05, "loss": 1.7181, "step": 17320 }, { "epoch": 654.3396226415094, "grad_norm": 1.3198924641523746, "learning_rate": 3.766468027448973e-05, "loss": 1.6913, "step": 17340 }, { "epoch": 655.0943396226415, "grad_norm": 1.1991934682117795, "learning_rate": 3.759155898795714e-05, "loss": 1.7093, "step": 17360 }, { "epoch": 655.8490566037735, "grad_norm": 1.239259370659102, "learning_rate": 3.751845668427593e-05, "loss": 1.7009, "step": 17380 }, { "epoch": 656.6037735849056, "grad_norm": 1.2833857218204128, "learning_rate": 3.7445373646161176e-05, "loss": 1.7005, "step": 17400 }, { "epoch": 657.3584905660377, "grad_norm": 1.454767822481044, "learning_rate": 3.737231015625341e-05, "loss": 1.6906, "step": 17420 }, { "epoch": 658.1132075471698, "grad_norm": 1.4542141511941185, "learning_rate": 3.729926649711759e-05, "loss": 1.7058, "step": 17440 }, { "epoch": 658.8679245283018, "grad_norm": 1.3091035418860133, "learning_rate": 3.722624295124197e-05, "loss": 1.6885, "step": 17460 }, { "epoch": 659.622641509434, "grad_norm": 1.2943161972236163, "learning_rate": 3.7153239801037014e-05, "loss": 1.714, "step": 17480 }, { "epoch": 660.377358490566, "grad_norm": 1.3377320776810098, "learning_rate": 3.708025732883431e-05, "loss": 1.684, "step": 17500 }, { "epoch": 661.1320754716982, "grad_norm": 1.2629223675934866, "learning_rate": 3.700729581688547e-05, "loss": 1.699, "step": 17520 }, { "epoch": 661.8867924528302, "grad_norm": 1.2451499003174673, "learning_rate": 3.693435554736107e-05, "loss": 1.6818, "step": 17540 }, { "epoch": 662.6415094339623, "grad_norm": 1.5331175213775703, "learning_rate": 3.6861436802349504e-05, "loss": 1.7177, "step": 17560 }, { "epoch": 663.3962264150944, "grad_norm": 1.4360430543768725, "learning_rate": 3.6788539863855925e-05, "loss": 1.7119, "step": 17580 }, { "epoch": 664.1509433962265, "grad_norm": 1.3816247903457854, "learning_rate": 3.671566501380116e-05, "loss": 1.7148, "step": 17600 }, { "epoch": 664.9056603773585, "grad_norm": 1.2778334686031196, "learning_rate": 3.6642812534020636e-05, "loss": 1.6935, "step": 17620 }, { "epoch": 665.6603773584906, "grad_norm": 1.552301737650962, "learning_rate": 3.656998270626322e-05, "loss": 1.6917, "step": 17640 }, { "epoch": 666.4150943396227, "grad_norm": 1.1626344688263202, "learning_rate": 3.649717581219022e-05, "loss": 1.6869, "step": 17660 }, { "epoch": 667.1698113207547, "grad_norm": 1.2478591651994395, "learning_rate": 3.642439213337418e-05, "loss": 1.6964, "step": 17680 }, { "epoch": 667.9245283018868, "grad_norm": 1.1665269494870496, "learning_rate": 3.635163195129796e-05, "loss": 1.706, "step": 17700 }, { "epoch": 668.6792452830189, "grad_norm": 1.2417440240279074, "learning_rate": 3.627889554735346e-05, "loss": 1.6607, "step": 17720 }, { "epoch": 669.433962264151, "grad_norm": 1.4243990985436537, "learning_rate": 3.620618320284067e-05, "loss": 1.6874, "step": 17740 }, { "epoch": 670.188679245283, "grad_norm": 1.4914544739718891, "learning_rate": 3.613349519896652e-05, "loss": 1.6908, "step": 17760 }, { "epoch": 670.9433962264151, "grad_norm": 1.3300772606283862, "learning_rate": 3.606083181684381e-05, "loss": 1.688, "step": 17780 }, { "epoch": 671.6981132075472, "grad_norm": 1.2461357748180606, "learning_rate": 3.5988193337490116e-05, "loss": 1.6547, "step": 17800 }, { "epoch": 672.4528301886793, "grad_norm": 1.370151145210619, "learning_rate": 3.5915580041826694e-05, "loss": 1.7193, "step": 17820 }, { "epoch": 673.2075471698113, "grad_norm": 1.2763659906881193, "learning_rate": 3.5842992210677416e-05, "loss": 1.6808, "step": 17840 }, { "epoch": 673.9622641509434, "grad_norm": 1.2944519984940064, "learning_rate": 3.577043012476768e-05, "loss": 1.7, "step": 17860 }, { "epoch": 674.7169811320755, "grad_norm": 1.3186599824633134, "learning_rate": 3.56978940647233e-05, "loss": 1.6954, "step": 17880 }, { "epoch": 675.4716981132076, "grad_norm": 1.252700498164797, "learning_rate": 3.5625384311069444e-05, "loss": 1.6686, "step": 17900 }, { "epoch": 676.2264150943396, "grad_norm": 1.5231032873107, "learning_rate": 3.555290114422955e-05, "loss": 1.6747, "step": 17920 }, { "epoch": 676.9811320754717, "grad_norm": 1.2910659178037445, "learning_rate": 3.548044484452421e-05, "loss": 1.6778, "step": 17940 }, { "epoch": 677.7358490566038, "grad_norm": 1.398570166804289, "learning_rate": 3.540801569217016e-05, "loss": 1.6949, "step": 17960 }, { "epoch": 678.4905660377359, "grad_norm": 1.4283155036503146, "learning_rate": 3.53356139672791e-05, "loss": 1.682, "step": 17980 }, { "epoch": 679.2452830188679, "grad_norm": 1.3275162110816598, "learning_rate": 3.526323994985669e-05, "loss": 1.695, "step": 18000 }, { "epoch": 680.0, "grad_norm": 1.2754138886413842, "learning_rate": 3.519089391980139e-05, "loss": 1.6977, "step": 18020 }, { "epoch": 680.7547169811321, "grad_norm": 1.3077633836764546, "learning_rate": 3.511857615690347e-05, "loss": 1.6811, "step": 18040 }, { "epoch": 681.5094339622641, "grad_norm": 1.3473268942249876, "learning_rate": 3.504628694084385e-05, "loss": 1.6984, "step": 18060 }, { "epoch": 682.2641509433962, "grad_norm": 1.3350261204503644, "learning_rate": 3.497402655119306e-05, "loss": 1.6567, "step": 18080 }, { "epoch": 683.0188679245283, "grad_norm": 1.243885167646148, "learning_rate": 3.490179526741014e-05, "loss": 1.6837, "step": 18100 }, { "epoch": 683.7735849056604, "grad_norm": 1.4293023473168278, "learning_rate": 3.48295933688416e-05, "loss": 1.7039, "step": 18120 }, { "epoch": 684.5283018867924, "grad_norm": 1.3686594771374196, "learning_rate": 3.4757421134720236e-05, "loss": 1.7067, "step": 18140 }, { "epoch": 685.2830188679245, "grad_norm": 1.6243192735337049, "learning_rate": 3.46852788441642e-05, "loss": 1.6661, "step": 18160 }, { "epoch": 686.0377358490566, "grad_norm": 1.2075045336020302, "learning_rate": 3.461316677617577e-05, "loss": 1.6779, "step": 18180 }, { "epoch": 686.7924528301887, "grad_norm": 1.348462905709941, "learning_rate": 3.4541085209640396e-05, "loss": 1.6962, "step": 18200 }, { "epoch": 687.5471698113207, "grad_norm": 1.370184561468331, "learning_rate": 3.446903442332552e-05, "loss": 1.6819, "step": 18220 }, { "epoch": 688.3018867924528, "grad_norm": 2.4058560541467537, "learning_rate": 3.439701469587961e-05, "loss": 1.6562, "step": 18240 }, { "epoch": 689.0566037735849, "grad_norm": 1.2548392090130422, "learning_rate": 3.4325026305830914e-05, "loss": 1.662, "step": 18260 }, { "epoch": 689.811320754717, "grad_norm": 1.2311253301629015, "learning_rate": 3.4253069531586616e-05, "loss": 1.6629, "step": 18280 }, { "epoch": 690.566037735849, "grad_norm": 1.9966791662877068, "learning_rate": 3.418114465143153e-05, "loss": 1.6592, "step": 18300 }, { "epoch": 691.3207547169811, "grad_norm": 1.2370362395857986, "learning_rate": 3.410925194352715e-05, "loss": 1.6806, "step": 18320 }, { "epoch": 692.0754716981132, "grad_norm": 1.463146145452869, "learning_rate": 3.4037391685910566e-05, "loss": 1.6937, "step": 18340 }, { "epoch": 692.8301886792453, "grad_norm": 1.2590469253316379, "learning_rate": 3.396556415649336e-05, "loss": 1.6746, "step": 18360 }, { "epoch": 693.5849056603773, "grad_norm": 1.3472170619382864, "learning_rate": 3.389376963306052e-05, "loss": 1.681, "step": 18380 }, { "epoch": 694.3396226415094, "grad_norm": 1.4907805923383493, "learning_rate": 3.382200839326942e-05, "loss": 1.6822, "step": 18400 }, { "epoch": 695.0943396226415, "grad_norm": 1.3754366409172392, "learning_rate": 3.375028071464869e-05, "loss": 1.6819, "step": 18420 }, { "epoch": 695.8490566037735, "grad_norm": 1.2854564980336112, "learning_rate": 3.3678586874597176e-05, "loss": 1.6712, "step": 18440 }, { "epoch": 696.6037735849056, "grad_norm": 1.4614311570416143, "learning_rate": 3.3606927150382865e-05, "loss": 1.649, "step": 18460 }, { "epoch": 697.3584905660377, "grad_norm": 1.3139946901519874, "learning_rate": 3.353530181914178e-05, "loss": 1.7062, "step": 18480 }, { "epoch": 698.1132075471698, "grad_norm": 1.4895975475886944, "learning_rate": 3.3463711157876966e-05, "loss": 1.6841, "step": 18500 }, { "epoch": 698.8679245283018, "grad_norm": 1.2111074764483576, "learning_rate": 3.339215544345735e-05, "loss": 1.6799, "step": 18520 }, { "epoch": 699.622641509434, "grad_norm": 1.254964544152517, "learning_rate": 3.3320634952616736e-05, "loss": 1.6554, "step": 18540 }, { "epoch": 700.377358490566, "grad_norm": 1.4098934710763775, "learning_rate": 3.3249149961952686e-05, "loss": 1.6821, "step": 18560 }, { "epoch": 701.1320754716982, "grad_norm": 1.449098110180846, "learning_rate": 3.3177700747925484e-05, "loss": 1.6775, "step": 18580 }, { "epoch": 701.8867924528302, "grad_norm": 1.4166300599178772, "learning_rate": 3.310628758685702e-05, "loss": 1.6647, "step": 18600 }, { "epoch": 702.6415094339623, "grad_norm": 1.3321739096846923, "learning_rate": 3.30349107549298e-05, "loss": 1.6606, "step": 18620 }, { "epoch": 703.3962264150944, "grad_norm": 1.3195021828180338, "learning_rate": 3.2963570528185814e-05, "loss": 1.6414, "step": 18640 }, { "epoch": 704.1509433962265, "grad_norm": 1.2954808039261523, "learning_rate": 3.2892267182525456e-05, "loss": 1.6691, "step": 18660 }, { "epoch": 704.9056603773585, "grad_norm": 1.3215765511079391, "learning_rate": 3.2821000993706524e-05, "loss": 1.6774, "step": 18680 }, { "epoch": 705.6603773584906, "grad_norm": 1.3256079186058618, "learning_rate": 3.2749772237343104e-05, "loss": 1.6675, "step": 18700 }, { "epoch": 706.4150943396227, "grad_norm": 1.3105427183809564, "learning_rate": 3.26785811889045e-05, "loss": 1.669, "step": 18720 }, { "epoch": 707.1698113207547, "grad_norm": 1.1406031822674032, "learning_rate": 3.26074281237142e-05, "loss": 1.6528, "step": 18740 }, { "epoch": 707.9245283018868, "grad_norm": 1.1721675684528943, "learning_rate": 3.253631331694882e-05, "loss": 1.6243, "step": 18760 }, { "epoch": 708.6792452830189, "grad_norm": 1.262858428237141, "learning_rate": 3.2465237043636945e-05, "loss": 1.6811, "step": 18780 }, { "epoch": 709.433962264151, "grad_norm": 1.3398257997775693, "learning_rate": 3.239419957865822e-05, "loss": 1.6531, "step": 18800 }, { "epoch": 710.188679245283, "grad_norm": 1.3245763474105379, "learning_rate": 3.2323201196742164e-05, "loss": 1.6796, "step": 18820 }, { "epoch": 710.9433962264151, "grad_norm": 1.3633874472219405, "learning_rate": 3.225224217246712e-05, "loss": 1.6544, "step": 18840 }, { "epoch": 711.6981132075472, "grad_norm": 1.7407734601052158, "learning_rate": 3.218132278025927e-05, "loss": 1.6765, "step": 18860 }, { "epoch": 712.4528301886793, "grad_norm": 1.4569167040451834, "learning_rate": 3.2110443294391486e-05, "loss": 1.6411, "step": 18880 }, { "epoch": 713.2075471698113, "grad_norm": 1.3711197707215454, "learning_rate": 3.203960398898234e-05, "loss": 1.6385, "step": 18900 }, { "epoch": 713.9622641509434, "grad_norm": 1.2731560765553942, "learning_rate": 3.196880513799497e-05, "loss": 1.6605, "step": 18920 }, { "epoch": 714.7169811320755, "grad_norm": 1.3127125434194904, "learning_rate": 3.189804701523608e-05, "loss": 1.6774, "step": 18940 }, { "epoch": 715.4716981132076, "grad_norm": 1.3249230445075728, "learning_rate": 3.1827329894354874e-05, "loss": 1.6753, "step": 18960 }, { "epoch": 716.2264150943396, "grad_norm": 1.4612490587732805, "learning_rate": 3.1756654048842e-05, "loss": 1.655, "step": 18980 }, { "epoch": 716.9811320754717, "grad_norm": 1.278645383417836, "learning_rate": 3.1686019752028424e-05, "loss": 1.6692, "step": 19000 }, { "epoch": 717.7358490566038, "grad_norm": 1.3408714115191198, "learning_rate": 3.161542727708446e-05, "loss": 1.6448, "step": 19020 }, { "epoch": 718.4905660377359, "grad_norm": 1.8695203026536409, "learning_rate": 3.154487689701869e-05, "loss": 1.6786, "step": 19040 }, { "epoch": 719.2452830188679, "grad_norm": 1.3167685135936378, "learning_rate": 3.147436888467689e-05, "loss": 1.6625, "step": 19060 }, { "epoch": 720.0, "grad_norm": 1.2539486625475944, "learning_rate": 3.140390351274096e-05, "loss": 1.6533, "step": 19080 }, { "epoch": 720.7547169811321, "grad_norm": 1.3710221082026877, "learning_rate": 3.133348105372793e-05, "loss": 1.677, "step": 19100 }, { "epoch": 721.5094339622641, "grad_norm": 1.528521350034396, "learning_rate": 3.126310177998883e-05, "loss": 1.6593, "step": 19120 }, { "epoch": 722.2641509433962, "grad_norm": 1.2092386328287839, "learning_rate": 3.1192765963707726e-05, "loss": 1.669, "step": 19140 }, { "epoch": 723.0188679245283, "grad_norm": 1.605845379972632, "learning_rate": 3.1122473876900574e-05, "loss": 1.6372, "step": 19160 }, { "epoch": 723.7735849056604, "grad_norm": 1.5877097735994508, "learning_rate": 3.105222579141423e-05, "loss": 1.6557, "step": 19180 }, { "epoch": 724.5283018867924, "grad_norm": 1.2516228941598748, "learning_rate": 3.098202197892538e-05, "loss": 1.6513, "step": 19200 }, { "epoch": 725.2830188679245, "grad_norm": 1.2391402579938813, "learning_rate": 3.091186271093947e-05, "loss": 1.6526, "step": 19220 }, { "epoch": 726.0377358490566, "grad_norm": 1.2782890497326889, "learning_rate": 3.084174825878972e-05, "loss": 1.6591, "step": 19240 }, { "epoch": 726.7924528301887, "grad_norm": 1.2506962493164657, "learning_rate": 3.0771678893635963e-05, "loss": 1.65, "step": 19260 }, { "epoch": 727.5471698113207, "grad_norm": 1.768116692306316, "learning_rate": 3.070165488646371e-05, "loss": 1.6516, "step": 19280 }, { "epoch": 728.3018867924528, "grad_norm": 1.559057461009202, "learning_rate": 3.063167650808307e-05, "loss": 1.6616, "step": 19300 }, { "epoch": 729.0566037735849, "grad_norm": 1.2888728962143756, "learning_rate": 3.0561744029127636e-05, "loss": 1.6574, "step": 19320 }, { "epoch": 729.811320754717, "grad_norm": 1.2688734788741953, "learning_rate": 3.049185772005353e-05, "loss": 1.618, "step": 19340 }, { "epoch": 730.566037735849, "grad_norm": 1.155730285013269, "learning_rate": 3.0422017851138287e-05, "loss": 1.6515, "step": 19360 }, { "epoch": 731.3207547169811, "grad_norm": 1.7451043683696195, "learning_rate": 3.0352224692479883e-05, "loss": 1.6371, "step": 19380 }, { "epoch": 732.0754716981132, "grad_norm": 1.526187340694129, "learning_rate": 3.0282478513995598e-05, "loss": 1.6523, "step": 19400 }, { "epoch": 732.8301886792453, "grad_norm": 1.4075608712323138, "learning_rate": 3.0212779585421064e-05, "loss": 1.6335, "step": 19420 }, { "epoch": 733.5849056603773, "grad_norm": 1.345293550699471, "learning_rate": 3.0143128176309125e-05, "loss": 1.6505, "step": 19440 }, { "epoch": 734.3396226415094, "grad_norm": 1.3467855791600631, "learning_rate": 3.007352455602892e-05, "loss": 1.6591, "step": 19460 }, { "epoch": 735.0943396226415, "grad_norm": 1.3667404544607202, "learning_rate": 3.000396899376472e-05, "loss": 1.6244, "step": 19480 }, { "epoch": 735.8490566037735, "grad_norm": 1.2844014927173513, "learning_rate": 2.9934461758514944e-05, "loss": 1.6154, "step": 19500 }, { "epoch": 736.6037735849056, "grad_norm": 1.46598947181564, "learning_rate": 2.986500311909114e-05, "loss": 1.6443, "step": 19520 }, { "epoch": 737.3584905660377, "grad_norm": 1.2682755408237392, "learning_rate": 2.9795593344116856e-05, "loss": 1.6492, "step": 19540 }, { "epoch": 738.1132075471698, "grad_norm": 1.4017683975117536, "learning_rate": 2.972623270202674e-05, "loss": 1.6614, "step": 19560 }, { "epoch": 738.8679245283018, "grad_norm": 1.5142927604100354, "learning_rate": 2.9656921461065357e-05, "loss": 1.6357, "step": 19580 }, { "epoch": 739.622641509434, "grad_norm": 1.2492564466728204, "learning_rate": 2.958765988928627e-05, "loss": 1.6468, "step": 19600 }, { "epoch": 740.377358490566, "grad_norm": 1.4008655564779207, "learning_rate": 2.951844825455089e-05, "loss": 1.64, "step": 19620 }, { "epoch": 741.1320754716982, "grad_norm": 1.2731601803567079, "learning_rate": 2.944928682452759e-05, "loss": 1.6324, "step": 19640 }, { "epoch": 741.8867924528302, "grad_norm": 1.5569572939387173, "learning_rate": 2.9380175866690493e-05, "loss": 1.6368, "step": 19660 }, { "epoch": 742.6415094339623, "grad_norm": 1.3215892057968033, "learning_rate": 2.9311115648318603e-05, "loss": 1.5918, "step": 19680 }, { "epoch": 743.3962264150944, "grad_norm": 1.301974969557669, "learning_rate": 2.924210643649462e-05, "loss": 1.625, "step": 19700 }, { "epoch": 744.1509433962265, "grad_norm": 1.245601615853851, "learning_rate": 2.917314849810405e-05, "loss": 1.6436, "step": 19720 }, { "epoch": 744.9056603773585, "grad_norm": 1.4218013050424188, "learning_rate": 2.9104242099834047e-05, "loss": 1.633, "step": 19740 }, { "epoch": 745.6603773584906, "grad_norm": 1.584425316406802, "learning_rate": 2.9035387508172488e-05, "loss": 1.654, "step": 19760 }, { "epoch": 746.4150943396227, "grad_norm": 1.237326080185327, "learning_rate": 2.896658498940685e-05, "loss": 1.6417, "step": 19780 }, { "epoch": 747.1698113207547, "grad_norm": 1.336327325511772, "learning_rate": 2.8897834809623266e-05, "loss": 1.6278, "step": 19800 }, { "epoch": 747.9245283018868, "grad_norm": 1.3731531069367304, "learning_rate": 2.8829137234705436e-05, "loss": 1.6339, "step": 19820 }, { "epoch": 748.6792452830189, "grad_norm": 1.4396961322439583, "learning_rate": 2.8760492530333595e-05, "loss": 1.6132, "step": 19840 }, { "epoch": 749.433962264151, "grad_norm": 1.4566587475130242, "learning_rate": 2.869190096198354e-05, "loss": 1.6236, "step": 19860 }, { "epoch": 750.188679245283, "grad_norm": 1.3378675846922892, "learning_rate": 2.8623362794925554e-05, "loss": 1.6407, "step": 19880 }, { "epoch": 750.9433962264151, "grad_norm": 1.1248642736382553, "learning_rate": 2.85548782942234e-05, "loss": 1.6328, "step": 19900 }, { "epoch": 751.6981132075472, "grad_norm": 1.281060533625914, "learning_rate": 2.8486447724733283e-05, "loss": 1.6288, "step": 19920 }, { "epoch": 752.4528301886793, "grad_norm": 1.2477580789710936, "learning_rate": 2.841807135110286e-05, "loss": 1.6129, "step": 19940 }, { "epoch": 753.2075471698113, "grad_norm": 1.3050801379092132, "learning_rate": 2.8349749437770146e-05, "loss": 1.6259, "step": 19960 }, { "epoch": 753.9622641509434, "grad_norm": 1.6556396088385372, "learning_rate": 2.8281482248962588e-05, "loss": 1.6264, "step": 19980 }, { "epoch": 754.7169811320755, "grad_norm": 1.442836668716919, "learning_rate": 2.8213270048695976e-05, "loss": 1.6286, "step": 20000 }, { "epoch": 755.4716981132076, "grad_norm": 1.3276233208619523, "learning_rate": 2.814511310077342e-05, "loss": 1.6485, "step": 20020 }, { "epoch": 756.2264150943396, "grad_norm": 1.2751456415696178, "learning_rate": 2.807701166878436e-05, "loss": 1.622, "step": 20040 }, { "epoch": 756.9811320754717, "grad_norm": 1.2003976158870355, "learning_rate": 2.8008966016103532e-05, "loss": 1.6002, "step": 20060 }, { "epoch": 757.7358490566038, "grad_norm": 1.3873947326300384, "learning_rate": 2.7940976405889962e-05, "loss": 1.5892, "step": 20080 }, { "epoch": 758.4905660377359, "grad_norm": 1.6648131685984493, "learning_rate": 2.787304310108591e-05, "loss": 1.6496, "step": 20100 }, { "epoch": 759.2452830188679, "grad_norm": 1.4092462550250433, "learning_rate": 2.780516636441591e-05, "loss": 1.6222, "step": 20120 }, { "epoch": 760.0, "grad_norm": 1.3221797397344044, "learning_rate": 2.7737346458385732e-05, "loss": 1.6276, "step": 20140 }, { "epoch": 760.7547169811321, "grad_norm": 1.2328101453363856, "learning_rate": 2.766958364528132e-05, "loss": 1.6199, "step": 20160 }, { "epoch": 761.5094339622641, "grad_norm": 1.198723191362267, "learning_rate": 2.7601878187167865e-05, "loss": 1.6028, "step": 20180 }, { "epoch": 762.2641509433962, "grad_norm": 1.9424131363478752, "learning_rate": 2.7534230345888686e-05, "loss": 1.6155, "step": 20200 }, { "epoch": 763.0188679245283, "grad_norm": 1.3568601924624037, "learning_rate": 2.7466640383064343e-05, "loss": 1.615, "step": 20220 }, { "epoch": 763.7735849056604, "grad_norm": 1.6734295204532768, "learning_rate": 2.7399108560091492e-05, "loss": 1.6127, "step": 20240 }, { "epoch": 764.5283018867924, "grad_norm": 1.3054727154474908, "learning_rate": 2.7331635138141997e-05, "loss": 1.6121, "step": 20260 }, { "epoch": 765.2830188679245, "grad_norm": 1.4085434131191898, "learning_rate": 2.7264220378161817e-05, "loss": 1.5995, "step": 20280 }, { "epoch": 766.0377358490566, "grad_norm": 1.2882798163186127, "learning_rate": 2.719686454087006e-05, "loss": 1.6209, "step": 20300 }, { "epoch": 766.7924528301887, "grad_norm": 1.3843343328010425, "learning_rate": 2.712956788675799e-05, "loss": 1.6253, "step": 20320 }, { "epoch": 767.5471698113207, "grad_norm": 1.2235858453276647, "learning_rate": 2.7062330676087928e-05, "loss": 1.5965, "step": 20340 }, { "epoch": 768.3018867924528, "grad_norm": 3.572459256976869, "learning_rate": 2.6995153168892342e-05, "loss": 1.6146, "step": 20360 }, { "epoch": 769.0566037735849, "grad_norm": 1.6994398915504043, "learning_rate": 2.692803562497278e-05, "loss": 1.6034, "step": 20380 }, { "epoch": 769.811320754717, "grad_norm": 1.2122097844602269, "learning_rate": 2.6860978303898913e-05, "loss": 1.6133, "step": 20400 }, { "epoch": 770.566037735849, "grad_norm": 1.5836773539567761, "learning_rate": 2.6793981465007477e-05, "loss": 1.6149, "step": 20420 }, { "epoch": 771.3207547169811, "grad_norm": 1.9577284294586506, "learning_rate": 2.6727045367401357e-05, "loss": 1.6038, "step": 20440 }, { "epoch": 772.0754716981132, "grad_norm": 1.453554282623515, "learning_rate": 2.6660170269948445e-05, "loss": 1.6425, "step": 20460 }, { "epoch": 772.8301886792453, "grad_norm": 1.3031139346537821, "learning_rate": 2.65933564312808e-05, "loss": 1.5996, "step": 20480 }, { "epoch": 773.5849056603773, "grad_norm": 1.2921513380534098, "learning_rate": 2.6526604109793517e-05, "loss": 1.6097, "step": 20500 }, { "epoch": 774.3396226415094, "grad_norm": 1.2706024142950736, "learning_rate": 2.6459913563643797e-05, "loss": 1.6151, "step": 20520 }, { "epoch": 775.0943396226415, "grad_norm": 1.3170228555500274, "learning_rate": 2.6393285050749948e-05, "loss": 1.6117, "step": 20540 }, { "epoch": 775.8490566037735, "grad_norm": 1.4811497809397014, "learning_rate": 2.6326718828790347e-05, "loss": 1.6065, "step": 20560 }, { "epoch": 776.6037735849056, "grad_norm": 1.3171774089155976, "learning_rate": 2.6260215155202478e-05, "loss": 1.5846, "step": 20580 }, { "epoch": 777.3584905660377, "grad_norm": 1.4867958271178354, "learning_rate": 2.6193774287181905e-05, "loss": 1.6182, "step": 20600 }, { "epoch": 778.1132075471698, "grad_norm": 1.485845846341643, "learning_rate": 2.612739648168134e-05, "loss": 1.618, "step": 20620 }, { "epoch": 778.8679245283018, "grad_norm": 1.3411546152150449, "learning_rate": 2.6061081995409594e-05, "loss": 1.5979, "step": 20640 }, { "epoch": 779.622641509434, "grad_norm": 1.352180099861608, "learning_rate": 2.5994831084830585e-05, "loss": 1.607, "step": 20660 }, { "epoch": 780.377358490566, "grad_norm": 1.4317453454675355, "learning_rate": 2.5928644006162356e-05, "loss": 1.63, "step": 20680 }, { "epoch": 781.1320754716982, "grad_norm": 1.3910753254665948, "learning_rate": 2.5862521015376083e-05, "loss": 1.6066, "step": 20700 }, { "epoch": 781.8867924528302, "grad_norm": 1.3073180519851255, "learning_rate": 2.579646236819513e-05, "loss": 1.6064, "step": 20720 }, { "epoch": 782.6415094339623, "grad_norm": 1.2595549167905473, "learning_rate": 2.5730468320093977e-05, "loss": 1.5911, "step": 20740 }, { "epoch": 783.3962264150944, "grad_norm": 1.2678103789921547, "learning_rate": 2.566453912629729e-05, "loss": 1.5817, "step": 20760 }, { "epoch": 784.1509433962265, "grad_norm": 1.5236492215060178, "learning_rate": 2.5598675041778895e-05, "loss": 1.6007, "step": 20780 }, { "epoch": 784.9056603773585, "grad_norm": 1.4661421776894412, "learning_rate": 2.553287632126086e-05, "loss": 1.5504, "step": 20800 }, { "epoch": 785.6603773584906, "grad_norm": 2.1927891520328635, "learning_rate": 2.5467143219212452e-05, "loss": 1.5841, "step": 20820 }, { "epoch": 786.4150943396227, "grad_norm": 1.3795117819084444, "learning_rate": 2.5401475989849135e-05, "loss": 1.6066, "step": 20840 }, { "epoch": 787.1698113207547, "grad_norm": 1.4556438165329462, "learning_rate": 2.5335874887131648e-05, "loss": 1.5968, "step": 20860 }, { "epoch": 787.9245283018868, "grad_norm": 1.4073316916031215, "learning_rate": 2.5270340164764954e-05, "loss": 1.5903, "step": 20880 }, { "epoch": 788.6792452830189, "grad_norm": 1.519045194155026, "learning_rate": 2.5204872076197373e-05, "loss": 1.6143, "step": 20900 }, { "epoch": 789.433962264151, "grad_norm": 1.371854180935982, "learning_rate": 2.513947087461945e-05, "loss": 1.5956, "step": 20920 }, { "epoch": 790.188679245283, "grad_norm": 1.3445443793198255, "learning_rate": 2.5074136812963086e-05, "loss": 1.6161, "step": 20940 }, { "epoch": 790.9433962264151, "grad_norm": 1.3427364962397694, "learning_rate": 2.5008870143900505e-05, "loss": 1.5568, "step": 20960 }, { "epoch": 791.6981132075472, "grad_norm": 1.2656549996025988, "learning_rate": 2.4943671119843328e-05, "loss": 1.5955, "step": 20980 }, { "epoch": 792.4528301886793, "grad_norm": 1.4205258402430134, "learning_rate": 2.4878539992941564e-05, "loss": 1.5806, "step": 21000 }, { "epoch": 793.2075471698113, "grad_norm": 1.6035321030423435, "learning_rate": 2.4813477015082614e-05, "loss": 1.6141, "step": 21020 }, { "epoch": 793.9622641509434, "grad_norm": 1.411461969155631, "learning_rate": 2.4748482437890327e-05, "loss": 1.613, "step": 21040 }, { "epoch": 794.7169811320755, "grad_norm": 1.5232357865305386, "learning_rate": 2.4683556512724013e-05, "loss": 1.5999, "step": 21060 }, { "epoch": 795.4716981132076, "grad_norm": 1.423060839013135, "learning_rate": 2.4618699490677522e-05, "loss": 1.6014, "step": 21080 }, { "epoch": 796.2264150943396, "grad_norm": 1.3310370009240546, "learning_rate": 2.4553911622578173e-05, "loss": 1.5633, "step": 21100 }, { "epoch": 796.9811320754717, "grad_norm": 1.5449295211536895, "learning_rate": 2.4489193158985862e-05, "loss": 1.5948, "step": 21120 }, { "epoch": 797.7358490566038, "grad_norm": 1.4953862144554202, "learning_rate": 2.4424544350192054e-05, "loss": 1.5576, "step": 21140 }, { "epoch": 798.4905660377359, "grad_norm": 1.4322654299272977, "learning_rate": 2.4359965446218893e-05, "loss": 1.6043, "step": 21160 }, { "epoch": 799.2452830188679, "grad_norm": 1.230403444648656, "learning_rate": 2.4295456696818116e-05, "loss": 1.5875, "step": 21180 }, { "epoch": 800.0, "grad_norm": 1.2803680639521113, "learning_rate": 2.423101835147014e-05, "loss": 1.5929, "step": 21200 }, { "epoch": 800.7547169811321, "grad_norm": 1.3170298641719804, "learning_rate": 2.4166650659383118e-05, "loss": 1.5807, "step": 21220 }, { "epoch": 801.5094339622641, "grad_norm": 1.6269742919477346, "learning_rate": 2.410235386949199e-05, "loss": 1.6065, "step": 21240 }, { "epoch": 802.2641509433962, "grad_norm": 1.5458337442207868, "learning_rate": 2.4038128230457458e-05, "loss": 1.5717, "step": 21260 }, { "epoch": 803.0188679245283, "grad_norm": 2.161638931230412, "learning_rate": 2.3973973990665043e-05, "loss": 1.5762, "step": 21280 }, { "epoch": 803.7735849056604, "grad_norm": 1.4046972399313973, "learning_rate": 2.3909891398224146e-05, "loss": 1.5661, "step": 21300 }, { "epoch": 804.5283018867924, "grad_norm": 1.35050834441664, "learning_rate": 2.3845880700967103e-05, "loss": 1.5706, "step": 21320 }, { "epoch": 805.2830188679245, "grad_norm": 1.5896148693041472, "learning_rate": 2.3781942146448204e-05, "loss": 1.5729, "step": 21340 }, { "epoch": 806.0377358490566, "grad_norm": 1.5191801749378997, "learning_rate": 2.3718075981942708e-05, "loss": 1.5602, "step": 21360 }, { "epoch": 806.7924528301887, "grad_norm": 1.1849456023631704, "learning_rate": 2.3654282454445914e-05, "loss": 1.5577, "step": 21380 }, { "epoch": 807.5471698113207, "grad_norm": 1.6435607024595327, "learning_rate": 2.3590561810672222e-05, "loss": 1.5806, "step": 21400 }, { "epoch": 808.3018867924528, "grad_norm": 2.2764964715817153, "learning_rate": 2.3526914297054165e-05, "loss": 1.5465, "step": 21420 }, { "epoch": 809.0566037735849, "grad_norm": 1.143095634553467, "learning_rate": 2.3463340159741438e-05, "loss": 1.5608, "step": 21440 }, { "epoch": 809.811320754717, "grad_norm": 1.786405965035776, "learning_rate": 2.3399839644599966e-05, "loss": 1.5685, "step": 21460 }, { "epoch": 810.566037735849, "grad_norm": 1.7826688318895536, "learning_rate": 2.3336412997210945e-05, "loss": 1.5673, "step": 21480 }, { "epoch": 811.3207547169811, "grad_norm": 1.9973557900265262, "learning_rate": 2.3273060462869915e-05, "loss": 1.58, "step": 21500 }, { "epoch": 812.0754716981132, "grad_norm": 2.725826480276118, "learning_rate": 2.320978228658578e-05, "loss": 1.5798, "step": 21520 }, { "epoch": 812.8301886792453, "grad_norm": 1.3785885365626125, "learning_rate": 2.3146578713079873e-05, "loss": 1.584, "step": 21540 }, { "epoch": 813.5849056603773, "grad_norm": 1.3942585361528321, "learning_rate": 2.308344998678499e-05, "loss": 1.5801, "step": 21560 }, { "epoch": 814.3396226415094, "grad_norm": 1.3778634108496939, "learning_rate": 2.3020396351844476e-05, "loss": 1.587, "step": 21580 }, { "epoch": 815.0943396226415, "grad_norm": 1.3226870132637325, "learning_rate": 2.2957418052111304e-05, "loss": 1.5666, "step": 21600 }, { "epoch": 815.8490566037735, "grad_norm": 1.4483636324260303, "learning_rate": 2.2894515331147043e-05, "loss": 1.5721, "step": 21620 }, { "epoch": 816.6037735849056, "grad_norm": 1.2510733452498213, "learning_rate": 2.2831688432220988e-05, "loss": 1.5909, "step": 21640 }, { "epoch": 817.3584905660377, "grad_norm": 1.4064726551185514, "learning_rate": 2.2768937598309226e-05, "loss": 1.5581, "step": 21660 }, { "epoch": 818.1132075471698, "grad_norm": 1.3598025710319712, "learning_rate": 2.2706263072093622e-05, "loss": 1.5798, "step": 21680 }, { "epoch": 818.8679245283018, "grad_norm": 1.4055798705480538, "learning_rate": 2.2643665095960992e-05, "loss": 1.5376, "step": 21700 }, { "epoch": 819.622641509434, "grad_norm": 1.3886244820387288, "learning_rate": 2.258114391200204e-05, "loss": 1.588, "step": 21720 }, { "epoch": 820.377358490566, "grad_norm": 1.672353853467523, "learning_rate": 2.2518699762010527e-05, "loss": 1.5771, "step": 21740 }, { "epoch": 821.1320754716982, "grad_norm": 1.6122695109482281, "learning_rate": 2.245633288748226e-05, "loss": 1.5744, "step": 21760 }, { "epoch": 821.8867924528302, "grad_norm": 1.2184243938930763, "learning_rate": 2.239404352961424e-05, "loss": 1.5579, "step": 21780 }, { "epoch": 822.6415094339623, "grad_norm": 2.6739030707563383, "learning_rate": 2.233183192930362e-05, "loss": 1.5742, "step": 21800 }, { "epoch": 823.3962264150944, "grad_norm": 1.513583565471533, "learning_rate": 2.22696983271469e-05, "loss": 1.5543, "step": 21820 }, { "epoch": 824.1509433962265, "grad_norm": 1.5062076381870015, "learning_rate": 2.2207642963438875e-05, "loss": 1.5578, "step": 21840 }, { "epoch": 824.9056603773585, "grad_norm": 1.4515191009181103, "learning_rate": 2.2145666078171794e-05, "loss": 1.5599, "step": 21860 }, { "epoch": 825.6603773584906, "grad_norm": 1.7800885670540134, "learning_rate": 2.2083767911034394e-05, "loss": 1.5724, "step": 21880 }, { "epoch": 826.4150943396227, "grad_norm": 1.655570469233021, "learning_rate": 2.2021948701410956e-05, "loss": 1.5722, "step": 21900 }, { "epoch": 827.1698113207547, "grad_norm": 1.682338091450034, "learning_rate": 2.1960208688380426e-05, "loss": 1.5289, "step": 21920 }, { "epoch": 827.9245283018868, "grad_norm": 1.3769805944636337, "learning_rate": 2.189854811071546e-05, "loss": 1.5523, "step": 21940 }, { "epoch": 828.6792452830189, "grad_norm": 1.2988448014856364, "learning_rate": 2.183696720688152e-05, "loss": 1.5493, "step": 21960 }, { "epoch": 829.433962264151, "grad_norm": 1.352528590030774, "learning_rate": 2.1775466215035887e-05, "loss": 1.5505, "step": 21980 }, { "epoch": 830.188679245283, "grad_norm": 1.9587571716355492, "learning_rate": 2.1714045373026878e-05, "loss": 1.5611, "step": 22000 }, { "epoch": 830.9433962264151, "grad_norm": 1.4092678213797292, "learning_rate": 2.165270491839274e-05, "loss": 1.5799, "step": 22020 }, { "epoch": 831.6981132075472, "grad_norm": 1.2980309483736483, "learning_rate": 2.159144508836092e-05, "loss": 1.5409, "step": 22040 }, { "epoch": 832.4528301886793, "grad_norm": 2.367411086569801, "learning_rate": 2.1530266119847e-05, "loss": 1.5565, "step": 22060 }, { "epoch": 833.2075471698113, "grad_norm": 1.4677294354247894, "learning_rate": 2.146916824945386e-05, "loss": 1.567, "step": 22080 }, { "epoch": 833.9622641509434, "grad_norm": 1.2034171336508228, "learning_rate": 2.1408151713470727e-05, "loss": 1.5324, "step": 22100 }, { "epoch": 834.7169811320755, "grad_norm": 1.6112142759671855, "learning_rate": 2.1347216747872316e-05, "loss": 1.5728, "step": 22120 }, { "epoch": 835.4716981132076, "grad_norm": 2.3612009460762025, "learning_rate": 2.1286363588317815e-05, "loss": 1.5777, "step": 22140 }, { "epoch": 836.2264150943396, "grad_norm": 1.3794177780422423, "learning_rate": 2.122559247015011e-05, "loss": 1.5337, "step": 22160 }, { "epoch": 836.9811320754717, "grad_norm": 1.4913217058342938, "learning_rate": 2.116490362839475e-05, "loss": 1.5712, "step": 22180 }, { "epoch": 837.7358490566038, "grad_norm": 1.393269094002593, "learning_rate": 2.1104297297759077e-05, "loss": 1.56, "step": 22200 }, { "epoch": 838.4905660377359, "grad_norm": 1.5277254368751014, "learning_rate": 2.104377371263138e-05, "loss": 1.564, "step": 22220 }, { "epoch": 839.2452830188679, "grad_norm": 1.8220574387124733, "learning_rate": 2.0983333107079923e-05, "loss": 1.593, "step": 22240 }, { "epoch": 840.0, "grad_norm": 1.4636327213867844, "learning_rate": 2.0922975714852024e-05, "loss": 1.5482, "step": 22260 }, { "epoch": 840.7547169811321, "grad_norm": 1.374724993121681, "learning_rate": 2.0862701769373194e-05, "loss": 1.5386, "step": 22280 }, { "epoch": 841.5094339622641, "grad_norm": 1.3056844963466483, "learning_rate": 2.0802511503746282e-05, "loss": 1.5499, "step": 22300 }, { "epoch": 842.2641509433962, "grad_norm": 1.8941001751457995, "learning_rate": 2.074240515075041e-05, "loss": 1.5556, "step": 22320 }, { "epoch": 843.0188679245283, "grad_norm": 1.5811456544096827, "learning_rate": 2.0682382942840276e-05, "loss": 1.5301, "step": 22340 }, { "epoch": 843.7735849056604, "grad_norm": 1.6509929914813097, "learning_rate": 2.062244511214511e-05, "loss": 1.5114, "step": 22360 }, { "epoch": 844.5283018867924, "grad_norm": 1.7262725135545645, "learning_rate": 2.0562591890467795e-05, "loss": 1.5771, "step": 22380 }, { "epoch": 845.2830188679245, "grad_norm": 2.3494461416325176, "learning_rate": 2.050282350928407e-05, "loss": 1.5355, "step": 22400 }, { "epoch": 846.0377358490566, "grad_norm": 1.5449531783263548, "learning_rate": 2.0443140199741506e-05, "loss": 1.5322, "step": 22420 }, { "epoch": 846.7924528301887, "grad_norm": 1.6993440968380624, "learning_rate": 2.0383542192658678e-05, "loss": 1.5595, "step": 22440 }, { "epoch": 847.5471698113207, "grad_norm": 1.4219970620295765, "learning_rate": 2.0324029718524266e-05, "loss": 1.544, "step": 22460 }, { "epoch": 848.3018867924528, "grad_norm": 1.4581628071481192, "learning_rate": 2.0264603007496174e-05, "loss": 1.5504, "step": 22480 }, { "epoch": 849.0566037735849, "grad_norm": 1.7218288706081564, "learning_rate": 2.0205262289400635e-05, "loss": 1.5329, "step": 22500 }, { "epoch": 849.811320754717, "grad_norm": 1.557573117936356, "learning_rate": 2.0146007793731277e-05, "loss": 1.5413, "step": 22520 }, { "epoch": 850.566037735849, "grad_norm": 1.556424340318002, "learning_rate": 2.0086839749648294e-05, "loss": 1.585, "step": 22540 }, { "epoch": 851.3207547169811, "grad_norm": 1.5130697235799593, "learning_rate": 2.002775838597753e-05, "loss": 1.5365, "step": 22560 }, { "epoch": 852.0754716981132, "grad_norm": 1.8393652727073544, "learning_rate": 1.9968763931209628e-05, "loss": 1.5459, "step": 22580 }, { "epoch": 852.8301886792453, "grad_norm": 1.5587158507011118, "learning_rate": 1.9909856613499096e-05, "loss": 1.5429, "step": 22600 }, { "epoch": 853.5849056603773, "grad_norm": 1.5786253886757977, "learning_rate": 1.9851036660663427e-05, "loss": 1.5293, "step": 22620 }, { "epoch": 854.3396226415094, "grad_norm": 1.6955187366248636, "learning_rate": 1.9792304300182305e-05, "loss": 1.5488, "step": 22640 }, { "epoch": 855.0943396226415, "grad_norm": 1.429545844614554, "learning_rate": 1.9733659759196588e-05, "loss": 1.533, "step": 22660 }, { "epoch": 855.8490566037735, "grad_norm": 1.3624588099774164, "learning_rate": 1.967510326450757e-05, "loss": 1.5257, "step": 22680 }, { "epoch": 856.6037735849056, "grad_norm": 1.4701659884745055, "learning_rate": 1.9616635042575986e-05, "loss": 1.5579, "step": 22700 }, { "epoch": 857.3584905660377, "grad_norm": 1.258458227155755, "learning_rate": 1.9558255319521186e-05, "loss": 1.5174, "step": 22720 }, { "epoch": 858.1132075471698, "grad_norm": 1.147380018733113, "learning_rate": 1.9499964321120298e-05, "loss": 1.5483, "step": 22740 }, { "epoch": 858.8679245283018, "grad_norm": 1.427160544906616, "learning_rate": 1.9441762272807296e-05, "loss": 1.53, "step": 22760 }, { "epoch": 859.622641509434, "grad_norm": 1.480555621655005, "learning_rate": 1.9383649399672136e-05, "loss": 1.5431, "step": 22780 }, { "epoch": 860.377358490566, "grad_norm": 1.6140763796883943, "learning_rate": 1.9325625926459906e-05, "loss": 1.5372, "step": 22800 }, { "epoch": 861.1320754716982, "grad_norm": 1.3659868727706357, "learning_rate": 1.9267692077569966e-05, "loss": 1.5693, "step": 22820 }, { "epoch": 861.8867924528302, "grad_norm": 1.668704322839176, "learning_rate": 1.9209848077055063e-05, "loss": 1.5491, "step": 22840 }, { "epoch": 862.6415094339623, "grad_norm": 1.6416845244091214, "learning_rate": 1.915209414862045e-05, "loss": 1.5449, "step": 22860 }, { "epoch": 863.3962264150944, "grad_norm": 1.5619688603918687, "learning_rate": 1.9094430515623036e-05, "loss": 1.5109, "step": 22880 }, { "epoch": 864.1509433962265, "grad_norm": 1.5251429637162535, "learning_rate": 1.9036857401070517e-05, "loss": 1.5358, "step": 22900 }, { "epoch": 864.9056603773585, "grad_norm": 1.6195136008209567, "learning_rate": 1.8979375027620553e-05, "loss": 1.5167, "step": 22920 }, { "epoch": 865.6603773584906, "grad_norm": 1.4453402703839808, "learning_rate": 1.8921983617579843e-05, "loss": 1.5345, "step": 22940 }, { "epoch": 866.4150943396227, "grad_norm": 1.6142287693511135, "learning_rate": 1.8864683392903296e-05, "loss": 1.5427, "step": 22960 }, { "epoch": 867.1698113207547, "grad_norm": 1.4589091367603184, "learning_rate": 1.880747457519317e-05, "loss": 1.4945, "step": 22980 }, { "epoch": 867.9245283018868, "grad_norm": 1.485668957375296, "learning_rate": 1.8750357385698233e-05, "loss": 1.5278, "step": 23000 }, { "epoch": 868.6792452830189, "grad_norm": 1.4865684774055008, "learning_rate": 1.8693332045312905e-05, "loss": 1.5178, "step": 23020 }, { "epoch": 869.433962264151, "grad_norm": 1.6955473002125137, "learning_rate": 1.8636398774576337e-05, "loss": 1.5485, "step": 23040 }, { "epoch": 870.188679245283, "grad_norm": 1.5715186186512253, "learning_rate": 1.857955779367166e-05, "loss": 1.5192, "step": 23060 }, { "epoch": 870.9433962264151, "grad_norm": 1.5717069835325073, "learning_rate": 1.8522809322425036e-05, "loss": 1.5106, "step": 23080 }, { "epoch": 871.6981132075472, "grad_norm": 1.3775027498551788, "learning_rate": 1.8466153580304923e-05, "loss": 1.5255, "step": 23100 }, { "epoch": 872.4528301886793, "grad_norm": 1.7060704667189681, "learning_rate": 1.8409590786421106e-05, "loss": 1.5152, "step": 23120 }, { "epoch": 873.2075471698113, "grad_norm": 1.3772746674273528, "learning_rate": 1.8353121159523913e-05, "loss": 1.4952, "step": 23140 }, { "epoch": 873.9622641509434, "grad_norm": 1.6021480905291907, "learning_rate": 1.8296744918003365e-05, "loss": 1.5548, "step": 23160 }, { "epoch": 874.7169811320755, "grad_norm": 1.6510954563611369, "learning_rate": 1.8240462279888328e-05, "loss": 1.5341, "step": 23180 }, { "epoch": 875.4716981132076, "grad_norm": 1.5525128595509998, "learning_rate": 1.8184273462845678e-05, "loss": 1.5399, "step": 23200 }, { "epoch": 876.2264150943396, "grad_norm": 1.3584051699815205, "learning_rate": 1.812817868417943e-05, "loss": 1.5245, "step": 23220 }, { "epoch": 876.9811320754717, "grad_norm": 1.909931733744526, "learning_rate": 1.8072178160829906e-05, "loss": 1.5333, "step": 23240 }, { "epoch": 877.7358490566038, "grad_norm": 1.7102569423853409, "learning_rate": 1.8016272109372925e-05, "loss": 1.5131, "step": 23260 }, { "epoch": 878.4905660377359, "grad_norm": 2.4326218341752384, "learning_rate": 1.7960460746018958e-05, "loss": 1.4983, "step": 23280 }, { "epoch": 879.2452830188679, "grad_norm": 1.6888708257619338, "learning_rate": 1.790474428661225e-05, "loss": 1.5268, "step": 23300 }, { "epoch": 880.0, "grad_norm": 1.4793278776392822, "learning_rate": 1.784912294663003e-05, "loss": 1.5144, "step": 23320 }, { "epoch": 880.7547169811321, "grad_norm": 1.3797110952325906, "learning_rate": 1.7793596941181667e-05, "loss": 1.5224, "step": 23340 }, { "epoch": 881.5094339622641, "grad_norm": 1.5055338530715117, "learning_rate": 1.7738166485007843e-05, "loss": 1.5276, "step": 23360 }, { "epoch": 882.2641509433962, "grad_norm": 1.3850071229139178, "learning_rate": 1.768283179247969e-05, "loss": 1.5216, "step": 23380 }, { "epoch": 883.0188679245283, "grad_norm": 1.4681066166997387, "learning_rate": 1.7627593077597997e-05, "loss": 1.534, "step": 23400 }, { "epoch": 883.7735849056604, "grad_norm": 1.5242995737679692, "learning_rate": 1.7572450553992356e-05, "loss": 1.4992, "step": 23420 }, { "epoch": 884.5283018867924, "grad_norm": 1.642787390621851, "learning_rate": 1.751740443492039e-05, "loss": 1.5002, "step": 23440 }, { "epoch": 885.2830188679245, "grad_norm": 1.490074296578881, "learning_rate": 1.7462454933266846e-05, "loss": 1.5211, "step": 23460 }, { "epoch": 886.0377358490566, "grad_norm": 1.5694629977285655, "learning_rate": 1.740760226154283e-05, "loss": 1.5335, "step": 23480 }, { "epoch": 886.7924528301887, "grad_norm": 1.6846894322403163, "learning_rate": 1.7352846631884956e-05, "loss": 1.4995, "step": 23500 }, { "epoch": 887.5471698113207, "grad_norm": 1.4525398790667088, "learning_rate": 1.7298188256054564e-05, "loss": 1.4957, "step": 23520 }, { "epoch": 888.3018867924528, "grad_norm": 1.635106498771857, "learning_rate": 1.7243627345436874e-05, "loss": 1.5271, "step": 23540 }, { "epoch": 889.0566037735849, "grad_norm": 1.4587656230559394, "learning_rate": 1.7189164111040147e-05, "loss": 1.501, "step": 23560 }, { "epoch": 889.811320754717, "grad_norm": 1.5410070982779924, "learning_rate": 1.71347987634949e-05, "loss": 1.4982, "step": 23580 }, { "epoch": 890.566037735849, "grad_norm": 1.5645035336411055, "learning_rate": 1.708053151305308e-05, "loss": 1.5002, "step": 23600 }, { "epoch": 891.3207547169811, "grad_norm": 1.3307742805961782, "learning_rate": 1.702636256958728e-05, "loss": 1.5184, "step": 23620 }, { "epoch": 892.0754716981132, "grad_norm": 1.6962843737118656, "learning_rate": 1.6972292142589877e-05, "loss": 1.5107, "step": 23640 }, { "epoch": 892.8301886792453, "grad_norm": 1.8950680189724871, "learning_rate": 1.6918320441172233e-05, "loss": 1.517, "step": 23660 }, { "epoch": 893.5849056603773, "grad_norm": 1.7479434721374532, "learning_rate": 1.686444767406395e-05, "loss": 1.5051, "step": 23680 }, { "epoch": 894.3396226415094, "grad_norm": 1.8611101959164753, "learning_rate": 1.6810674049611953e-05, "loss": 1.5063, "step": 23700 }, { "epoch": 895.0943396226415, "grad_norm": 1.5841028344361991, "learning_rate": 1.67569997757798e-05, "loss": 1.481, "step": 23720 }, { "epoch": 895.8490566037735, "grad_norm": 1.5025051335412982, "learning_rate": 1.6703425060146778e-05, "loss": 1.5253, "step": 23740 }, { "epoch": 896.6037735849056, "grad_norm": 2.8439948944917757, "learning_rate": 1.6649950109907165e-05, "loss": 1.5216, "step": 23760 }, { "epoch": 897.3584905660377, "grad_norm": 1.6268608502019901, "learning_rate": 1.6596575131869387e-05, "loss": 1.5334, "step": 23780 }, { "epoch": 898.1132075471698, "grad_norm": 1.4759450457116179, "learning_rate": 1.6543300332455273e-05, "loss": 1.5007, "step": 23800 }, { "epoch": 898.8679245283018, "grad_norm": 1.4818248018036755, "learning_rate": 1.6490125917699203e-05, "loss": 1.4973, "step": 23820 }, { "epoch": 899.622641509434, "grad_norm": 1.548616527993675, "learning_rate": 1.6437052093247303e-05, "loss": 1.517, "step": 23840 }, { "epoch": 900.377358490566, "grad_norm": 1.5445734121981956, "learning_rate": 1.6384079064356744e-05, "loss": 1.521, "step": 23860 }, { "epoch": 901.1320754716982, "grad_norm": 1.5970555623190617, "learning_rate": 1.6331207035894806e-05, "loss": 1.5172, "step": 23880 }, { "epoch": 901.8867924528302, "grad_norm": 1.389904429038452, "learning_rate": 1.6278436212338226e-05, "loss": 1.4987, "step": 23900 }, { "epoch": 902.6415094339623, "grad_norm": 1.3455191149235926, "learning_rate": 1.62257667977723e-05, "loss": 1.5047, "step": 23920 }, { "epoch": 903.3962264150944, "grad_norm": 1.4729168638466097, "learning_rate": 1.6173198995890152e-05, "loss": 1.5032, "step": 23940 }, { "epoch": 904.1509433962265, "grad_norm": 1.5230989764955487, "learning_rate": 1.612073300999191e-05, "loss": 1.5244, "step": 23960 }, { "epoch": 904.9056603773585, "grad_norm": 1.4504907356107584, "learning_rate": 1.6068369042983987e-05, "loss": 1.5072, "step": 23980 }, { "epoch": 905.6603773584906, "grad_norm": 1.3570035581449431, "learning_rate": 1.601610729737819e-05, "loss": 1.5002, "step": 24000 }, { "epoch": 906.4150943396227, "grad_norm": 1.408532335123701, "learning_rate": 1.5963947975291056e-05, "loss": 1.4974, "step": 24020 }, { "epoch": 907.1698113207547, "grad_norm": 1.6703383627319723, "learning_rate": 1.591189127844295e-05, "loss": 1.5056, "step": 24040 }, { "epoch": 907.9245283018868, "grad_norm": 1.4548307957349456, "learning_rate": 1.5859937408157403e-05, "loss": 1.4836, "step": 24060 }, { "epoch": 908.6792452830189, "grad_norm": 1.622725332424491, "learning_rate": 1.5808086565360235e-05, "loss": 1.4652, "step": 24080 }, { "epoch": 909.433962264151, "grad_norm": 1.9382762093036214, "learning_rate": 1.575633895057883e-05, "loss": 1.507, "step": 24100 }, { "epoch": 910.188679245283, "grad_norm": 5.171486198720905, "learning_rate": 1.5704694763941345e-05, "loss": 1.4918, "step": 24120 }, { "epoch": 910.9433962264151, "grad_norm": 1.318697524518072, "learning_rate": 1.5653154205175963e-05, "loss": 1.485, "step": 24140 }, { "epoch": 911.6981132075472, "grad_norm": 1.640456368314345, "learning_rate": 1.5601717473610066e-05, "loss": 1.493, "step": 24160 }, { "epoch": 912.4528301886793, "grad_norm": 1.7783411819352481, "learning_rate": 1.555038476816951e-05, "loss": 1.5233, "step": 24180 }, { "epoch": 913.2075471698113, "grad_norm": 1.8560943552673308, "learning_rate": 1.5499156287377857e-05, "loss": 1.4845, "step": 24200 }, { "epoch": 913.9622641509434, "grad_norm": 1.3922157561757162, "learning_rate": 1.544803222935555e-05, "loss": 1.513, "step": 24220 }, { "epoch": 914.7169811320755, "grad_norm": 1.5964166307266414, "learning_rate": 1.5397012791819248e-05, "loss": 1.5029, "step": 24240 }, { "epoch": 915.4716981132076, "grad_norm": 1.581271765982569, "learning_rate": 1.5346098172080947e-05, "loss": 1.5139, "step": 24260 }, { "epoch": 916.2264150943396, "grad_norm": 1.3829789961056094, "learning_rate": 1.5295288567047304e-05, "loss": 1.4727, "step": 24280 }, { "epoch": 916.9811320754717, "grad_norm": 1.595484488791353, "learning_rate": 1.5244584173218816e-05, "loss": 1.4764, "step": 24300 }, { "epoch": 917.7358490566038, "grad_norm": 1.9817110984943331, "learning_rate": 1.5193985186689126e-05, "loss": 1.488, "step": 24320 }, { "epoch": 918.4905660377359, "grad_norm": 1.5041365073617188, "learning_rate": 1.5143491803144183e-05, "loss": 1.4823, "step": 24340 }, { "epoch": 919.2452830188679, "grad_norm": 1.623717820636255, "learning_rate": 1.5093104217861574e-05, "loss": 1.4711, "step": 24360 }, { "epoch": 920.0, "grad_norm": 1.4153896302283269, "learning_rate": 1.5042822625709687e-05, "loss": 1.4729, "step": 24380 }, { "epoch": 920.7547169811321, "grad_norm": 1.8914526627670851, "learning_rate": 1.499264722114699e-05, "loss": 1.4744, "step": 24400 }, { "epoch": 921.5094339622641, "grad_norm": 1.3579367015171855, "learning_rate": 1.494257819822132e-05, "loss": 1.5068, "step": 24420 }, { "epoch": 922.2641509433962, "grad_norm": 1.7241565511209502, "learning_rate": 1.4892615750569062e-05, "loss": 1.4629, "step": 24440 }, { "epoch": 923.0188679245283, "grad_norm": 1.6169769566812962, "learning_rate": 1.4842760071414446e-05, "loss": 1.4987, "step": 24460 }, { "epoch": 923.7735849056604, "grad_norm": 1.9954016377464863, "learning_rate": 1.4793011353568764e-05, "loss": 1.5263, "step": 24480 }, { "epoch": 924.5283018867924, "grad_norm": 1.4779174235189176, "learning_rate": 1.4743369789429686e-05, "loss": 1.4769, "step": 24500 }, { "epoch": 925.2830188679245, "grad_norm": 1.7019641943900714, "learning_rate": 1.4693835570980468e-05, "loss": 1.4749, "step": 24520 }, { "epoch": 926.0377358490566, "grad_norm": 1.5323014302848716, "learning_rate": 1.4644408889789189e-05, "loss": 1.4984, "step": 24540 }, { "epoch": 926.7924528301887, "grad_norm": 1.446942162217049, "learning_rate": 1.4595089937008062e-05, "loss": 1.4998, "step": 24560 }, { "epoch": 927.5471698113207, "grad_norm": 1.3609927181175356, "learning_rate": 1.4545878903372663e-05, "loss": 1.4765, "step": 24580 }, { "epoch": 928.3018867924528, "grad_norm": 1.4584582755904496, "learning_rate": 1.4496775979201224e-05, "loss": 1.4828, "step": 24600 }, { "epoch": 929.0566037735849, "grad_norm": 1.4254389674669559, "learning_rate": 1.444778135439385e-05, "loss": 1.5041, "step": 24620 }, { "epoch": 929.811320754717, "grad_norm": 1.5655038573484212, "learning_rate": 1.4398895218431825e-05, "loss": 1.4995, "step": 24640 }, { "epoch": 930.566037735849, "grad_norm": 1.623569066402965, "learning_rate": 1.4350117760376843e-05, "loss": 1.4966, "step": 24660 }, { "epoch": 931.3207547169811, "grad_norm": 1.594778698950599, "learning_rate": 1.4301449168870325e-05, "loss": 1.4899, "step": 24680 }, { "epoch": 932.0754716981132, "grad_norm": 1.7627482209727463, "learning_rate": 1.4252889632132667e-05, "loss": 1.4784, "step": 24700 }, { "epoch": 932.8301886792453, "grad_norm": 1.5595702425460922, "learning_rate": 1.4204439337962486e-05, "loss": 1.4962, "step": 24720 }, { "epoch": 933.5849056603773, "grad_norm": 1.6175712268221147, "learning_rate": 1.4156098473735903e-05, "loss": 1.4858, "step": 24740 }, { "epoch": 934.3396226415094, "grad_norm": 1.5528087670883148, "learning_rate": 1.4107867226405882e-05, "loss": 1.4959, "step": 24760 }, { "epoch": 935.0943396226415, "grad_norm": 1.5105693139489524, "learning_rate": 1.4059745782501403e-05, "loss": 1.4694, "step": 24780 }, { "epoch": 935.8490566037735, "grad_norm": 1.424625384350829, "learning_rate": 1.4011734328126825e-05, "loss": 1.4531, "step": 24800 }, { "epoch": 936.6037735849056, "grad_norm": 1.921412092336305, "learning_rate": 1.3963833048961103e-05, "loss": 1.5003, "step": 24820 }, { "epoch": 937.3584905660377, "grad_norm": 1.5289456190701718, "learning_rate": 1.3916042130257145e-05, "loss": 1.5177, "step": 24840 }, { "epoch": 938.1132075471698, "grad_norm": 1.410017115369323, "learning_rate": 1.3868361756841036e-05, "loss": 1.4957, "step": 24860 }, { "epoch": 938.8679245283018, "grad_norm": 1.3741594118478162, "learning_rate": 1.3820792113111323e-05, "loss": 1.4876, "step": 24880 }, { "epoch": 939.622641509434, "grad_norm": 1.5111524219290895, "learning_rate": 1.377333338303833e-05, "loss": 1.4789, "step": 24900 }, { "epoch": 940.377358490566, "grad_norm": 1.2690279082779223, "learning_rate": 1.3725985750163418e-05, "loss": 1.4851, "step": 24920 }, { "epoch": 941.1320754716982, "grad_norm": 1.5760629816984877, "learning_rate": 1.3678749397598337e-05, "loss": 1.4993, "step": 24940 }, { "epoch": 941.8867924528302, "grad_norm": 1.5719387109025893, "learning_rate": 1.363162450802443e-05, "loss": 1.4654, "step": 24960 }, { "epoch": 942.6415094339623, "grad_norm": 1.51578687737706, "learning_rate": 1.3584611263691974e-05, "loss": 1.4985, "step": 24980 }, { "epoch": 943.3962264150944, "grad_norm": 1.5864417766142165, "learning_rate": 1.353770984641948e-05, "loss": 1.4891, "step": 25000 }, { "epoch": 944.1509433962265, "grad_norm": 1.5330683898736195, "learning_rate": 1.3490920437592985e-05, "loss": 1.4928, "step": 25020 }, { "epoch": 944.9056603773585, "grad_norm": 1.8666313722767156, "learning_rate": 1.344424321816535e-05, "loss": 1.4558, "step": 25040 }, { "epoch": 945.6603773584906, "grad_norm": 1.4103376741909914, "learning_rate": 1.3397678368655534e-05, "loss": 1.467, "step": 25060 }, { "epoch": 946.4150943396227, "grad_norm": 1.6978974580611665, "learning_rate": 1.3351226069147934e-05, "loss": 1.4586, "step": 25080 }, { "epoch": 947.1698113207547, "grad_norm": 1.3043741098462962, "learning_rate": 1.3304886499291653e-05, "loss": 1.4651, "step": 25100 }, { "epoch": 947.9245283018868, "grad_norm": 1.5721530761043376, "learning_rate": 1.3258659838299863e-05, "loss": 1.4851, "step": 25120 }, { "epoch": 948.6792452830189, "grad_norm": 2.445174125656233, "learning_rate": 1.3212546264949038e-05, "loss": 1.4861, "step": 25140 }, { "epoch": 949.433962264151, "grad_norm": 3.0455557993861584, "learning_rate": 1.3166545957578312e-05, "loss": 1.4956, "step": 25160 }, { "epoch": 950.188679245283, "grad_norm": 1.481231036001675, "learning_rate": 1.3120659094088763e-05, "loss": 1.4786, "step": 25180 }, { "epoch": 950.9433962264151, "grad_norm": 1.6177001101633584, "learning_rate": 1.3074885851942757e-05, "loss": 1.4691, "step": 25200 }, { "epoch": 951.6981132075472, "grad_norm": 1.7370265253795278, "learning_rate": 1.3029226408163237e-05, "loss": 1.456, "step": 25220 }, { "epoch": 952.4528301886793, "grad_norm": 1.476098649785593, "learning_rate": 1.2983680939333043e-05, "loss": 1.457, "step": 25240 }, { "epoch": 953.2075471698113, "grad_norm": 1.9700691780666086, "learning_rate": 1.2938249621594219e-05, "loss": 1.4916, "step": 25260 }, { "epoch": 953.9622641509434, "grad_norm": 1.4124078828516038, "learning_rate": 1.289293263064734e-05, "loss": 1.4442, "step": 25280 }, { "epoch": 954.7169811320755, "grad_norm": 1.609015057343637, "learning_rate": 1.284773014175086e-05, "loss": 1.4808, "step": 25300 }, { "epoch": 955.4716981132076, "grad_norm": 1.545457288749583, "learning_rate": 1.2802642329720385e-05, "loss": 1.4388, "step": 25320 }, { "epoch": 956.2264150943396, "grad_norm": 1.4137648487617847, "learning_rate": 1.275766936892803e-05, "loss": 1.4558, "step": 25340 }, { "epoch": 956.9811320754717, "grad_norm": 1.7375121010804517, "learning_rate": 1.2712811433301723e-05, "loss": 1.4864, "step": 25360 }, { "epoch": 957.7358490566038, "grad_norm": 2.170614678870875, "learning_rate": 1.2668068696324572e-05, "loss": 1.4668, "step": 25380 }, { "epoch": 958.4905660377359, "grad_norm": 1.3921099231821001, "learning_rate": 1.2623441331034153e-05, "loss": 1.466, "step": 25400 }, { "epoch": 959.2452830188679, "grad_norm": 1.763881906266782, "learning_rate": 1.2578929510021851e-05, "loss": 1.4556, "step": 25420 }, { "epoch": 960.0, "grad_norm": 1.6251732366885816, "learning_rate": 1.2534533405432192e-05, "loss": 1.4831, "step": 25440 }, { "epoch": 960.7547169811321, "grad_norm": 1.35568804382613, "learning_rate": 1.2490253188962184e-05, "loss": 1.4637, "step": 25460 }, { "epoch": 961.5094339622641, "grad_norm": 1.5192686857357145, "learning_rate": 1.2446089031860666e-05, "loss": 1.5039, "step": 25480 }, { "epoch": 962.2641509433962, "grad_norm": 1.645823339942095, "learning_rate": 1.2402041104927622e-05, "loss": 1.4643, "step": 25500 }, { "epoch": 963.0188679245283, "grad_norm": 1.5266645922223165, "learning_rate": 1.2358109578513502e-05, "loss": 1.4609, "step": 25520 }, { "epoch": 963.7735849056604, "grad_norm": 2.012096934939658, "learning_rate": 1.2314294622518637e-05, "loss": 1.4707, "step": 25540 }, { "epoch": 964.5283018867924, "grad_norm": 1.6019652732905527, "learning_rate": 1.227059640639251e-05, "loss": 1.4624, "step": 25560 }, { "epoch": 965.2830188679245, "grad_norm": 1.5459039987734797, "learning_rate": 1.2227015099133119e-05, "loss": 1.4462, "step": 25580 }, { "epoch": 966.0377358490566, "grad_norm": 1.4581354369376407, "learning_rate": 1.2183550869286346e-05, "loss": 1.4602, "step": 25600 }, { "epoch": 966.7924528301887, "grad_norm": 1.5627139982974774, "learning_rate": 1.2140203884945257e-05, "loss": 1.4558, "step": 25620 }, { "epoch": 967.5471698113207, "grad_norm": 1.6163383081813927, "learning_rate": 1.2096974313749544e-05, "loss": 1.442, "step": 25640 }, { "epoch": 968.3018867924528, "grad_norm": 1.4708485221948149, "learning_rate": 1.2053862322884756e-05, "loss": 1.4449, "step": 25660 }, { "epoch": 969.0566037735849, "grad_norm": 1.457232110275896, "learning_rate": 1.2010868079081735e-05, "loss": 1.4714, "step": 25680 }, { "epoch": 969.811320754717, "grad_norm": 2.130030633684405, "learning_rate": 1.1967991748615972e-05, "loss": 1.4672, "step": 25700 }, { "epoch": 970.566037735849, "grad_norm": 1.6585416945015101, "learning_rate": 1.1925233497306898e-05, "loss": 1.4582, "step": 25720 }, { "epoch": 971.3207547169811, "grad_norm": 1.696646559562477, "learning_rate": 1.1882593490517333e-05, "loss": 1.4616, "step": 25740 }, { "epoch": 972.0754716981132, "grad_norm": 1.8347228047889477, "learning_rate": 1.1840071893152767e-05, "loss": 1.4412, "step": 25760 }, { "epoch": 972.8301886792453, "grad_norm": 1.5105738469091443, "learning_rate": 1.1797668869660753e-05, "loss": 1.4476, "step": 25780 }, { "epoch": 973.5849056603773, "grad_norm": 1.6402649798470197, "learning_rate": 1.1755384584030287e-05, "loss": 1.4458, "step": 25800 }, { "epoch": 974.3396226415094, "grad_norm": 1.4580507747280478, "learning_rate": 1.171321919979116e-05, "loss": 1.4414, "step": 25820 }, { "epoch": 975.0943396226415, "grad_norm": 1.8999226743757298, "learning_rate": 1.1671172880013328e-05, "loss": 1.4501, "step": 25840 }, { "epoch": 975.8490566037735, "grad_norm": 1.3767670402035495, "learning_rate": 1.1629245787306247e-05, "loss": 1.4422, "step": 25860 }, { "epoch": 976.6037735849056, "grad_norm": 1.3303378991562944, "learning_rate": 1.158743808381832e-05, "loss": 1.437, "step": 25880 }, { "epoch": 977.3584905660377, "grad_norm": 1.5011235086965091, "learning_rate": 1.1545749931236199e-05, "loss": 1.4225, "step": 25900 }, { "epoch": 978.1132075471698, "grad_norm": 1.7853875208460404, "learning_rate": 1.1504181490784197e-05, "loss": 1.4405, "step": 25920 }, { "epoch": 978.8679245283018, "grad_norm": 1.4852022947554018, "learning_rate": 1.1462732923223643e-05, "loss": 1.4197, "step": 25940 }, { "epoch": 979.622641509434, "grad_norm": 1.492057926353613, "learning_rate": 1.1421404388852275e-05, "loss": 1.4516, "step": 25960 }, { "epoch": 980.377358490566, "grad_norm": 1.8767944270145316, "learning_rate": 1.1380196047503614e-05, "loss": 1.4613, "step": 25980 }, { "epoch": 981.1320754716982, "grad_norm": 1.5723288438267475, "learning_rate": 1.1339108058546365e-05, "loss": 1.4636, "step": 26000 }, { "epoch": 981.8867924528302, "grad_norm": 1.4572390965943247, "learning_rate": 1.1298140580883752e-05, "loss": 1.4291, "step": 26020 }, { "epoch": 982.6415094339623, "grad_norm": 2.0340602707703566, "learning_rate": 1.1257293772952971e-05, "loss": 1.4342, "step": 26040 }, { "epoch": 983.3962264150944, "grad_norm": 1.7563358001308935, "learning_rate": 1.1216567792724513e-05, "loss": 1.44, "step": 26060 }, { "epoch": 984.1509433962265, "grad_norm": 1.7195863256249895, "learning_rate": 1.1175962797701585e-05, "loss": 1.473, "step": 26080 }, { "epoch": 984.9056603773585, "grad_norm": 1.5325109929141458, "learning_rate": 1.1135478944919515e-05, "loss": 1.4537, "step": 26100 }, { "epoch": 985.6603773584906, "grad_norm": 1.4246338183010563, "learning_rate": 1.1095116390945116e-05, "loss": 1.4576, "step": 26120 }, { "epoch": 986.4150943396227, "grad_norm": 1.5264334254918077, "learning_rate": 1.1054875291876081e-05, "loss": 1.4355, "step": 26140 }, { "epoch": 987.1698113207547, "grad_norm": 1.7871427472844674, "learning_rate": 1.101475580334039e-05, "loss": 1.4285, "step": 26160 }, { "epoch": 987.9245283018868, "grad_norm": 1.628111810825388, "learning_rate": 1.0974758080495742e-05, "loss": 1.432, "step": 26180 }, { "epoch": 988.6792452830189, "grad_norm": 1.6079918141380485, "learning_rate": 1.0934882278028875e-05, "loss": 1.473, "step": 26200 }, { "epoch": 989.433962264151, "grad_norm": 1.9227955059143975, "learning_rate": 1.0895128550155048e-05, "loss": 1.4319, "step": 26220 }, { "epoch": 990.188679245283, "grad_norm": 1.4777834491856459, "learning_rate": 1.0855497050617383e-05, "loss": 1.4715, "step": 26240 }, { "epoch": 990.9433962264151, "grad_norm": 1.752347342407413, "learning_rate": 1.0815987932686322e-05, "loss": 1.4483, "step": 26260 }, { "epoch": 991.6981132075472, "grad_norm": 1.7965242738400287, "learning_rate": 1.0776601349158992e-05, "loss": 1.445, "step": 26280 }, { "epoch": 992.4528301886793, "grad_norm": 1.6880482866877031, "learning_rate": 1.0737337452358643e-05, "loss": 1.4289, "step": 26300 }, { "epoch": 993.2075471698113, "grad_norm": 1.3587051959850933, "learning_rate": 1.0698196394134027e-05, "loss": 1.4248, "step": 26320 }, { "epoch": 993.9622641509434, "grad_norm": 1.6893835419836905, "learning_rate": 1.0659178325858868e-05, "loss": 1.4593, "step": 26340 }, { "epoch": 994.7169811320755, "grad_norm": 1.6372424305822535, "learning_rate": 1.0620283398431196e-05, "loss": 1.4248, "step": 26360 }, { "epoch": 995.4716981132076, "grad_norm": 1.628959331603337, "learning_rate": 1.0581511762272856e-05, "loss": 1.459, "step": 26380 }, { "epoch": 996.2264150943396, "grad_norm": 1.9899303146490552, "learning_rate": 1.0542863567328837e-05, "loss": 1.4608, "step": 26400 }, { "epoch": 996.9811320754717, "grad_norm": 1.6980987241375505, "learning_rate": 1.0504338963066745e-05, "loss": 1.4489, "step": 26420 }, { "epoch": 997.7358490566038, "grad_norm": 1.791483449843248, "learning_rate": 1.0465938098476226e-05, "loss": 1.4647, "step": 26440 }, { "epoch": 998.4905660377359, "grad_norm": 1.3823874629634854, "learning_rate": 1.0427661122068363e-05, "loss": 1.431, "step": 26460 }, { "epoch": 999.2452830188679, "grad_norm": 1.7547951381187532, "learning_rate": 1.0389508181875114e-05, "loss": 1.4374, "step": 26480 }, { "epoch": 1000.0, "grad_norm": 1.6329317283212297, "learning_rate": 1.035147942544874e-05, "loss": 1.4436, "step": 26500 }, { "epoch": 1000.7547169811321, "grad_norm": 1.482848334089, "learning_rate": 1.0313574999861255e-05, "loss": 1.4263, "step": 26520 }, { "epoch": 1001.5094339622641, "grad_norm": 1.4085297987389735, "learning_rate": 1.027579505170381e-05, "loss": 1.4423, "step": 26540 }, { "epoch": 1002.2641509433962, "grad_norm": 1.586157768854042, "learning_rate": 1.0238139727086178e-05, "loss": 1.4289, "step": 26560 }, { "epoch": 1003.0188679245283, "grad_norm": 1.4910507620311724, "learning_rate": 1.020060917163614e-05, "loss": 1.4555, "step": 26580 }, { "epoch": 1003.7735849056604, "grad_norm": 1.7298473240434828, "learning_rate": 1.0163203530498955e-05, "loss": 1.4176, "step": 26600 }, { "epoch": 1004.5283018867924, "grad_norm": 1.9395741512745615, "learning_rate": 1.0125922948336813e-05, "loss": 1.4297, "step": 26620 }, { "epoch": 1005.2830188679245, "grad_norm": 1.3752095871887702, "learning_rate": 1.0088767569328215e-05, "loss": 1.4224, "step": 26640 }, { "epoch": 1006.0377358490566, "grad_norm": 1.6566420053219757, "learning_rate": 1.0051737537167479e-05, "loss": 1.4416, "step": 26660 }, { "epoch": 1006.7924528301887, "grad_norm": 1.8401842062612699, "learning_rate": 1.001483299506413e-05, "loss": 1.4406, "step": 26680 }, { "epoch": 1007.5471698113207, "grad_norm": 1.5895021822365676, "learning_rate": 9.978054085742407e-06, "loss": 1.4104, "step": 26700 }, { "epoch": 1008.3018867924528, "grad_norm": 1.5495688189805843, "learning_rate": 9.941400951440674e-06, "loss": 1.4446, "step": 26720 }, { "epoch": 1009.0566037735849, "grad_norm": 1.6376917222270109, "learning_rate": 9.904873733910852e-06, "loss": 1.4023, "step": 26740 }, { "epoch": 1009.811320754717, "grad_norm": 1.7729521919831477, "learning_rate": 9.868472574417906e-06, "loss": 1.4409, "step": 26760 }, { "epoch": 1010.566037735849, "grad_norm": 1.5909106157325896, "learning_rate": 9.832197613739278e-06, "loss": 1.4284, "step": 26780 }, { "epoch": 1011.3207547169811, "grad_norm": 1.5416992698357255, "learning_rate": 9.79604899216437e-06, "loss": 1.4165, "step": 26800 }, { "epoch": 1012.0754716981132, "grad_norm": 1.7245150906399498, "learning_rate": 9.760026849493962e-06, "loss": 1.4281, "step": 26820 }, { "epoch": 1012.8301886792453, "grad_norm": 1.8518007110272525, "learning_rate": 9.7241313250397e-06, "loss": 1.4223, "step": 26840 }, { "epoch": 1013.5849056603773, "grad_norm": 1.593106128312966, "learning_rate": 9.688362557623527e-06, "loss": 1.4377, "step": 26860 }, { "epoch": 1014.3396226415094, "grad_norm": 1.6557177655883284, "learning_rate": 9.6527206855772e-06, "loss": 1.4394, "step": 26880 }, { "epoch": 1015.0943396226415, "grad_norm": 1.5950355314495743, "learning_rate": 9.617205846741719e-06, "loss": 1.4506, "step": 26900 }, { "epoch": 1015.8490566037735, "grad_norm": 1.7685274450403552, "learning_rate": 9.58181817846677e-06, "loss": 1.4484, "step": 26920 }, { "epoch": 1016.6037735849056, "grad_norm": 1.4639040403309866, "learning_rate": 9.54655781761023e-06, "loss": 1.4043, "step": 26940 }, { "epoch": 1017.3584905660377, "grad_norm": 1.6074583945207908, "learning_rate": 9.511424900537656e-06, "loss": 1.4197, "step": 26960 }, { "epoch": 1018.1132075471698, "grad_norm": 1.5459146912367183, "learning_rate": 9.476419563121698e-06, "loss": 1.4232, "step": 26980 }, { "epoch": 1018.8679245283018, "grad_norm": 1.6166722954994783, "learning_rate": 9.441541940741613e-06, "loss": 1.4407, "step": 27000 }, { "epoch": 1019.622641509434, "grad_norm": 1.6533674302686083, "learning_rate": 9.406792168282739e-06, "loss": 1.4393, "step": 27020 }, { "epoch": 1020.377358490566, "grad_norm": 2.1409264555789123, "learning_rate": 9.37217038013597e-06, "loss": 1.4507, "step": 27040 }, { "epoch": 1021.1320754716982, "grad_norm": 1.9876202106584275, "learning_rate": 9.337676710197243e-06, "loss": 1.4486, "step": 27060 }, { "epoch": 1021.8867924528302, "grad_norm": 1.6321392819191982, "learning_rate": 9.303311291866996e-06, "loss": 1.4337, "step": 27080 }, { "epoch": 1022.6415094339623, "grad_norm": 1.5614664744291826, "learning_rate": 9.269074258049671e-06, "loss": 1.4245, "step": 27100 }, { "epoch": 1023.3962264150944, "grad_norm": 1.775529049395487, "learning_rate": 9.234965741153195e-06, "loss": 1.4284, "step": 27120 }, { "epoch": 1024.1509433962265, "grad_norm": 1.4430739083306536, "learning_rate": 9.200985873088487e-06, "loss": 1.4235, "step": 27140 }, { "epoch": 1024.9056603773586, "grad_norm": 2.0811882500763255, "learning_rate": 9.167134785268918e-06, "loss": 1.402, "step": 27160 }, { "epoch": 1025.6603773584907, "grad_norm": 1.5403915703954525, "learning_rate": 9.133412608609811e-06, "loss": 1.4302, "step": 27180 }, { "epoch": 1026.4150943396226, "grad_norm": 1.9685065156678565, "learning_rate": 9.099819473527936e-06, "loss": 1.3969, "step": 27200 }, { "epoch": 1027.1698113207547, "grad_norm": 1.5336587010545035, "learning_rate": 9.066355509941036e-06, "loss": 1.428, "step": 27220 }, { "epoch": 1027.9245283018868, "grad_norm": 1.9045363331404057, "learning_rate": 9.033020847267277e-06, "loss": 1.4521, "step": 27240 }, { "epoch": 1028.6792452830189, "grad_norm": 1.7010720746106325, "learning_rate": 8.999815614424768e-06, "loss": 1.4408, "step": 27260 }, { "epoch": 1029.433962264151, "grad_norm": 1.6652770284797922, "learning_rate": 8.966739939831065e-06, "loss": 1.4275, "step": 27280 }, { "epoch": 1030.188679245283, "grad_norm": 1.438920885601344, "learning_rate": 8.933793951402666e-06, "loss": 1.4363, "step": 27300 }, { "epoch": 1030.9433962264152, "grad_norm": 1.523374273868093, "learning_rate": 8.900977776554543e-06, "loss": 1.4178, "step": 27320 }, { "epoch": 1031.698113207547, "grad_norm": 1.9388166404138083, "learning_rate": 8.868291542199601e-06, "loss": 1.4339, "step": 27340 }, { "epoch": 1032.4528301886792, "grad_norm": 1.910046684059762, "learning_rate": 8.835735374748235e-06, "loss": 1.407, "step": 27360 }, { "epoch": 1033.2075471698113, "grad_norm": 1.5548634820286755, "learning_rate": 8.803309400107802e-06, "loss": 1.4183, "step": 27380 }, { "epoch": 1033.9622641509434, "grad_norm": 1.5932417218331991, "learning_rate": 8.771013743682171e-06, "loss": 1.4447, "step": 27400 }, { "epoch": 1034.7169811320755, "grad_norm": 1.4796581852592556, "learning_rate": 8.738848530371221e-06, "loss": 1.3946, "step": 27420 }, { "epoch": 1035.4716981132076, "grad_norm": 1.6106803868616077, "learning_rate": 8.706813884570337e-06, "loss": 1.4152, "step": 27440 }, { "epoch": 1036.2264150943397, "grad_norm": 1.5383725584269896, "learning_rate": 8.674909930169968e-06, "loss": 1.4344, "step": 27460 }, { "epoch": 1036.9811320754718, "grad_norm": 1.6971458233324348, "learning_rate": 8.643136790555101e-06, "loss": 1.42, "step": 27480 }, { "epoch": 1037.7358490566037, "grad_norm": 1.7975384013574476, "learning_rate": 8.61149458860486e-06, "loss": 1.4456, "step": 27500 }, { "epoch": 1038.4905660377358, "grad_norm": 1.5540181334521903, "learning_rate": 8.579983446691931e-06, "loss": 1.3976, "step": 27520 }, { "epoch": 1039.245283018868, "grad_norm": 1.7107813027346386, "learning_rate": 8.548603486682165e-06, "loss": 1.4119, "step": 27540 }, { "epoch": 1040.0, "grad_norm": 1.7225563012589893, "learning_rate": 8.517354829934086e-06, "loss": 1.4347, "step": 27560 }, { "epoch": 1040.754716981132, "grad_norm": 1.6396983385388997, "learning_rate": 8.486237597298396e-06, "loss": 1.4076, "step": 27580 }, { "epoch": 1041.5094339622642, "grad_norm": 1.59607993020723, "learning_rate": 8.455251909117562e-06, "loss": 1.391, "step": 27600 }, { "epoch": 1042.2641509433963, "grad_norm": 1.6787714792885464, "learning_rate": 8.424397885225284e-06, "loss": 1.4319, "step": 27620 }, { "epoch": 1043.0188679245282, "grad_norm": 1.514103336557697, "learning_rate": 8.39367564494608e-06, "loss": 1.4282, "step": 27640 }, { "epoch": 1043.7735849056603, "grad_norm": 1.6827281624065857, "learning_rate": 8.3630853070948e-06, "loss": 1.4268, "step": 27660 }, { "epoch": 1044.5283018867924, "grad_norm": 1.5242384493420091, "learning_rate": 8.332626989976201e-06, "loss": 1.394, "step": 27680 }, { "epoch": 1045.2830188679245, "grad_norm": 1.5477899241579378, "learning_rate": 8.302300811384443e-06, "loss": 1.4188, "step": 27700 }, { "epoch": 1046.0377358490566, "grad_norm": 1.7533265453937938, "learning_rate": 8.272106888602644e-06, "loss": 1.4147, "step": 27720 }, { "epoch": 1046.7924528301887, "grad_norm": 1.7810905836721207, "learning_rate": 8.242045338402464e-06, "loss": 1.4249, "step": 27740 }, { "epoch": 1047.5471698113208, "grad_norm": 1.6994451629715164, "learning_rate": 8.212116277043624e-06, "loss": 1.4087, "step": 27760 }, { "epoch": 1048.301886792453, "grad_norm": 1.5273771258038336, "learning_rate": 8.18231982027344e-06, "loss": 1.4105, "step": 27780 }, { "epoch": 1049.0566037735848, "grad_norm": 1.7986470388936215, "learning_rate": 8.15265608332641e-06, "loss": 1.417, "step": 27800 }, { "epoch": 1049.811320754717, "grad_norm": 3.7362962798847605, "learning_rate": 8.123125180923732e-06, "loss": 1.4428, "step": 27820 }, { "epoch": 1050.566037735849, "grad_norm": 1.4871345729412693, "learning_rate": 8.093727227272918e-06, "loss": 1.3913, "step": 27840 }, { "epoch": 1051.3207547169811, "grad_norm": 1.6862935331038202, "learning_rate": 8.064462336067288e-06, "loss": 1.4099, "step": 27860 }, { "epoch": 1052.0754716981132, "grad_norm": 1.5729155867984972, "learning_rate": 8.03533062048555e-06, "loss": 1.3896, "step": 27880 }, { "epoch": 1052.8301886792453, "grad_norm": 1.7312033654611378, "learning_rate": 8.006332193191406e-06, "loss": 1.4183, "step": 27900 }, { "epoch": 1053.5849056603774, "grad_norm": 1.737310060702965, "learning_rate": 7.977467166333041e-06, "loss": 1.4098, "step": 27920 }, { "epoch": 1054.3396226415093, "grad_norm": 1.787345801838152, "learning_rate": 7.948735651542762e-06, "loss": 1.4472, "step": 27940 }, { "epoch": 1055.0943396226414, "grad_norm": 1.6643759736424013, "learning_rate": 7.920137759936503e-06, "loss": 1.4248, "step": 27960 }, { "epoch": 1055.8490566037735, "grad_norm": 1.665184448890738, "learning_rate": 7.891673602113444e-06, "loss": 1.4184, "step": 27980 }, { "epoch": 1056.6037735849056, "grad_norm": 1.4651905410431068, "learning_rate": 7.863343288155553e-06, "loss": 1.4117, "step": 28000 }, { "epoch": 1057.3584905660377, "grad_norm": 1.761583496091816, "learning_rate": 7.835146927627195e-06, "loss": 1.4173, "step": 28020 }, { "epoch": 1058.1132075471698, "grad_norm": 1.4468036902445778, "learning_rate": 7.807084629574648e-06, "loss": 1.3899, "step": 28040 }, { "epoch": 1058.867924528302, "grad_norm": 1.9317915574764288, "learning_rate": 7.779156502525752e-06, "loss": 1.4283, "step": 28060 }, { "epoch": 1059.622641509434, "grad_norm": 1.6586645034969292, "learning_rate": 7.751362654489442e-06, "loss": 1.3729, "step": 28080 }, { "epoch": 1060.377358490566, "grad_norm": 1.54736903517111, "learning_rate": 7.72370319295533e-06, "loss": 1.4323, "step": 28100 }, { "epoch": 1061.132075471698, "grad_norm": 1.7410908156190221, "learning_rate": 7.696178224893333e-06, "loss": 1.4446, "step": 28120 }, { "epoch": 1061.8867924528302, "grad_norm": 1.5846972848377703, "learning_rate": 7.668787856753206e-06, "loss": 1.4069, "step": 28140 }, { "epoch": 1062.6415094339623, "grad_norm": 2.0032825052950005, "learning_rate": 7.641532194464159e-06, "loss": 1.4091, "step": 28160 }, { "epoch": 1063.3962264150944, "grad_norm": 1.5526416600245057, "learning_rate": 7.6144113434344445e-06, "loss": 1.3988, "step": 28180 }, { "epoch": 1064.1509433962265, "grad_norm": 1.6399869572854062, "learning_rate": 7.587425408550953e-06, "loss": 1.4317, "step": 28200 }, { "epoch": 1064.9056603773586, "grad_norm": 2.218545819761043, "learning_rate": 7.560574494178785e-06, "loss": 1.4166, "step": 28220 }, { "epoch": 1065.6603773584907, "grad_norm": 1.610893838079929, "learning_rate": 7.5338587041608855e-06, "loss": 1.4034, "step": 28240 }, { "epoch": 1066.4150943396226, "grad_norm": 1.901849515787354, "learning_rate": 7.507278141817603e-06, "loss": 1.4082, "step": 28260 }, { "epoch": 1067.1698113207547, "grad_norm": 1.9915752693535391, "learning_rate": 7.4808329099463165e-06, "loss": 1.4202, "step": 28280 }, { "epoch": 1067.9245283018868, "grad_norm": 2.337231756702343, "learning_rate": 7.454523110821034e-06, "loss": 1.4033, "step": 28300 }, { "epoch": 1068.6792452830189, "grad_norm": 1.4499700621594815, "learning_rate": 7.428348846191982e-06, "loss": 1.4106, "step": 28320 }, { "epoch": 1069.433962264151, "grad_norm": 1.7981102056016145, "learning_rate": 7.402310217285226e-06, "loss": 1.4061, "step": 28340 }, { "epoch": 1070.188679245283, "grad_norm": 1.7129433355903898, "learning_rate": 7.376407324802275e-06, "loss": 1.4019, "step": 28360 }, { "epoch": 1070.9433962264152, "grad_norm": 1.5382026111028457, "learning_rate": 7.350640268919691e-06, "loss": 1.4197, "step": 28380 }, { "epoch": 1071.698113207547, "grad_norm": 1.7225324354326523, "learning_rate": 7.325009149288721e-06, "loss": 1.4061, "step": 28400 }, { "epoch": 1072.4528301886792, "grad_norm": 1.9701222408661871, "learning_rate": 7.299514065034864e-06, "loss": 1.399, "step": 28420 }, { "epoch": 1073.2075471698113, "grad_norm": 2.560013262107365, "learning_rate": 7.2741551147575365e-06, "loss": 1.4011, "step": 28440 }, { "epoch": 1073.9622641509434, "grad_norm": 1.7468598350718882, "learning_rate": 7.248932396529666e-06, "loss": 1.3906, "step": 28460 }, { "epoch": 1074.7169811320755, "grad_norm": 1.5217037013529344, "learning_rate": 7.223846007897321e-06, "loss": 1.3824, "step": 28480 }, { "epoch": 1075.4716981132076, "grad_norm": 1.9246360758156291, "learning_rate": 7.198896045879323e-06, "loss": 1.401, "step": 28500 }, { "epoch": 1076.2264150943397, "grad_norm": 1.6887933139540061, "learning_rate": 7.174082606966883e-06, "loss": 1.4025, "step": 28520 }, { "epoch": 1076.9811320754718, "grad_norm": 1.6294766788073725, "learning_rate": 7.149405787123236e-06, "loss": 1.3986, "step": 28540 }, { "epoch": 1077.7358490566037, "grad_norm": 1.5618807274404587, "learning_rate": 7.124865681783234e-06, "loss": 1.4005, "step": 28560 }, { "epoch": 1078.4905660377358, "grad_norm": 1.6678211596916697, "learning_rate": 7.100462385853021e-06, "loss": 1.4071, "step": 28580 }, { "epoch": 1079.245283018868, "grad_norm": 1.9223978868928677, "learning_rate": 7.07619599370964e-06, "loss": 1.4135, "step": 28600 }, { "epoch": 1080.0, "grad_norm": 1.6632265815235145, "learning_rate": 7.052066599200659e-06, "loss": 1.3882, "step": 28620 }, { "epoch": 1080.754716981132, "grad_norm": 1.6022030717394165, "learning_rate": 7.028074295643851e-06, "loss": 1.3972, "step": 28640 }, { "epoch": 1081.5094339622642, "grad_norm": 1.4991746539828543, "learning_rate": 7.004219175826785e-06, "loss": 1.382, "step": 28660 }, { "epoch": 1082.2641509433963, "grad_norm": 1.6838520383575963, "learning_rate": 6.9805013320064956e-06, "loss": 1.4146, "step": 28680 }, { "epoch": 1083.0188679245282, "grad_norm": 1.8350778781710608, "learning_rate": 6.9569208559091e-06, "loss": 1.4138, "step": 28700 }, { "epoch": 1083.7735849056603, "grad_norm": 1.5249940477637465, "learning_rate": 6.9334778387294835e-06, "loss": 1.403, "step": 28720 }, { "epoch": 1084.5283018867924, "grad_norm": 1.4543697117371763, "learning_rate": 6.910172371130925e-06, "loss": 1.4115, "step": 28740 }, { "epoch": 1085.2830188679245, "grad_norm": 1.8878771205671918, "learning_rate": 6.8870045432447285e-06, "loss": 1.3783, "step": 28760 }, { "epoch": 1086.0377358490566, "grad_norm": 1.6650946199070653, "learning_rate": 6.8639744446698945e-06, "loss": 1.4065, "step": 28780 }, { "epoch": 1086.7924528301887, "grad_norm": 1.9063799347508024, "learning_rate": 6.84108216447278e-06, "loss": 1.3896, "step": 28800 }, { "epoch": 1087.5471698113208, "grad_norm": 1.7745103676453513, "learning_rate": 6.818327791186747e-06, "loss": 1.4068, "step": 28820 }, { "epoch": 1088.301886792453, "grad_norm": 1.6208415487366228, "learning_rate": 6.795711412811805e-06, "loss": 1.3827, "step": 28840 }, { "epoch": 1089.0566037735848, "grad_norm": 1.4568669649899233, "learning_rate": 6.773233116814289e-06, "loss": 1.3918, "step": 28860 }, { "epoch": 1089.811320754717, "grad_norm": 1.861515176168054, "learning_rate": 6.750892990126514e-06, "loss": 1.3901, "step": 28880 }, { "epoch": 1090.566037735849, "grad_norm": 1.7283660067362911, "learning_rate": 6.728691119146446e-06, "loss": 1.4157, "step": 28900 }, { "epoch": 1091.3207547169811, "grad_norm": 1.679598340558233, "learning_rate": 6.706627589737369e-06, "loss": 1.3938, "step": 28920 }, { "epoch": 1092.0754716981132, "grad_norm": 1.5691857730547452, "learning_rate": 6.6847024872275215e-06, "loss": 1.4176, "step": 28940 }, { "epoch": 1092.8301886792453, "grad_norm": 1.5537251935711112, "learning_rate": 6.66291589640982e-06, "loss": 1.3967, "step": 28960 }, { "epoch": 1093.5849056603774, "grad_norm": 1.8881979410475171, "learning_rate": 6.641267901541472e-06, "loss": 1.418, "step": 28980 }, { "epoch": 1094.3396226415093, "grad_norm": 1.613241830342873, "learning_rate": 6.619758586343714e-06, "loss": 1.3901, "step": 29000 }, { "epoch": 1095.0943396226414, "grad_norm": 1.5946632443607534, "learning_rate": 6.598388034001433e-06, "loss": 1.3634, "step": 29020 }, { "epoch": 1095.8490566037735, "grad_norm": 1.8962995366661943, "learning_rate": 6.577156327162867e-06, "loss": 1.392, "step": 29040 }, { "epoch": 1096.6037735849056, "grad_norm": 1.629681556076702, "learning_rate": 6.55606354793928e-06, "loss": 1.4078, "step": 29060 }, { "epoch": 1097.3584905660377, "grad_norm": 1.6952819453222434, "learning_rate": 6.535109777904677e-06, "loss": 1.4017, "step": 29080 }, { "epoch": 1098.1132075471698, "grad_norm": 2.5813616029432267, "learning_rate": 6.514295098095432e-06, "loss": 1.3986, "step": 29100 }, { "epoch": 1098.867924528302, "grad_norm": 1.5192224713062508, "learning_rate": 6.493619589010008e-06, "loss": 1.3995, "step": 29120 }, { "epoch": 1099.622641509434, "grad_norm": 1.5723195273483208, "learning_rate": 6.4730833306086425e-06, "loss": 1.3804, "step": 29140 }, { "epoch": 1100.377358490566, "grad_norm": 1.6397338659549336, "learning_rate": 6.452686402313042e-06, "loss": 1.386, "step": 29160 }, { "epoch": 1101.132075471698, "grad_norm": 1.5791257173150743, "learning_rate": 6.43242888300607e-06, "loss": 1.3847, "step": 29180 }, { "epoch": 1101.8867924528302, "grad_norm": 1.5559887095506482, "learning_rate": 6.412310851031428e-06, "loss": 1.393, "step": 29200 }, { "epoch": 1102.6415094339623, "grad_norm": 1.6663466000474887, "learning_rate": 6.392332384193371e-06, "loss": 1.3896, "step": 29220 }, { "epoch": 1103.3962264150944, "grad_norm": 1.9956674599720932, "learning_rate": 6.372493559756415e-06, "loss": 1.378, "step": 29240 }, { "epoch": 1104.1509433962265, "grad_norm": 1.787105155690102, "learning_rate": 6.352794454445007e-06, "loss": 1.3879, "step": 29260 }, { "epoch": 1104.9056603773586, "grad_norm": 1.561482889041861, "learning_rate": 6.333235144443262e-06, "loss": 1.402, "step": 29280 }, { "epoch": 1105.6603773584907, "grad_norm": 1.8736117457797759, "learning_rate": 6.31381570539463e-06, "loss": 1.3879, "step": 29300 }, { "epoch": 1106.4150943396226, "grad_norm": 1.4692581652153442, "learning_rate": 6.294536212401641e-06, "loss": 1.3914, "step": 29320 }, { "epoch": 1107.1698113207547, "grad_norm": 1.4908544439114542, "learning_rate": 6.275396740025605e-06, "loss": 1.4028, "step": 29340 }, { "epoch": 1107.9245283018868, "grad_norm": 1.488666750171173, "learning_rate": 6.256397362286306e-06, "loss": 1.3799, "step": 29360 }, { "epoch": 1108.6792452830189, "grad_norm": 1.517431762228245, "learning_rate": 6.237538152661723e-06, "loss": 1.3765, "step": 29380 }, { "epoch": 1109.433962264151, "grad_norm": 2.2381909450089803, "learning_rate": 6.218819184087767e-06, "loss": 1.4079, "step": 29400 }, { "epoch": 1110.188679245283, "grad_norm": 1.7858504458920295, "learning_rate": 6.200240528957965e-06, "loss": 1.3554, "step": 29420 }, { "epoch": 1110.9433962264152, "grad_norm": 1.7350524849254911, "learning_rate": 6.181802259123219e-06, "loss": 1.3967, "step": 29440 }, { "epoch": 1111.698113207547, "grad_norm": 1.702971597589678, "learning_rate": 6.163504445891484e-06, "loss": 1.3671, "step": 29460 }, { "epoch": 1112.4528301886792, "grad_norm": 1.7712134929173684, "learning_rate": 6.145347160027524e-06, "loss": 1.3829, "step": 29480 }, { "epoch": 1113.2075471698113, "grad_norm": 1.4073555395505457, "learning_rate": 6.1273304717526284e-06, "loss": 1.4108, "step": 29500 }, { "epoch": 1113.9622641509434, "grad_norm": 1.6527537265171588, "learning_rate": 6.10945445074435e-06, "loss": 1.4068, "step": 29520 }, { "epoch": 1114.7169811320755, "grad_norm": 1.6866987009556351, "learning_rate": 6.091719166136209e-06, "loss": 1.3793, "step": 29540 }, { "epoch": 1115.4716981132076, "grad_norm": 1.7073159356044332, "learning_rate": 6.074124686517448e-06, "loss": 1.3826, "step": 29560 }, { "epoch": 1116.2264150943397, "grad_norm": 1.8230785653176147, "learning_rate": 6.056671079932781e-06, "loss": 1.4153, "step": 29580 }, { "epoch": 1116.9811320754718, "grad_norm": 1.6857598634250675, "learning_rate": 6.0393584138820814e-06, "loss": 1.3887, "step": 29600 }, { "epoch": 1117.7358490566037, "grad_norm": 1.5568678463492682, "learning_rate": 6.022186755320181e-06, "loss": 1.3901, "step": 29620 }, { "epoch": 1118.4905660377358, "grad_norm": 1.8571545157336313, "learning_rate": 6.0051561706565545e-06, "loss": 1.4013, "step": 29640 }, { "epoch": 1119.245283018868, "grad_norm": 2.413996452708785, "learning_rate": 5.988266725755103e-06, "loss": 1.3613, "step": 29660 }, { "epoch": 1120.0, "grad_norm": 1.687989711452293, "learning_rate": 5.9715184859338745e-06, "loss": 1.4031, "step": 29680 }, { "epoch": 1120.754716981132, "grad_norm": 1.7351377623187432, "learning_rate": 5.9549115159648416e-06, "loss": 1.3949, "step": 29700 }, { "epoch": 1121.5094339622642, "grad_norm": 1.6317556572084198, "learning_rate": 5.9384458800736175e-06, "loss": 1.3769, "step": 29720 }, { "epoch": 1122.2641509433963, "grad_norm": 1.5268456230996348, "learning_rate": 5.922121641939213e-06, "loss": 1.3816, "step": 29740 }, { "epoch": 1123.0188679245282, "grad_norm": 1.712558259908726, "learning_rate": 5.905938864693819e-06, "loss": 1.3798, "step": 29760 }, { "epoch": 1123.7735849056603, "grad_norm": 2.381990895927805, "learning_rate": 5.889897610922528e-06, "loss": 1.3607, "step": 29780 }, { "epoch": 1124.5283018867924, "grad_norm": 1.880675021280631, "learning_rate": 5.873997942663118e-06, "loss": 1.3886, "step": 29800 }, { "epoch": 1125.2830188679245, "grad_norm": 1.7160648060328811, "learning_rate": 5.858239921405781e-06, "loss": 1.4049, "step": 29820 }, { "epoch": 1126.0377358490566, "grad_norm": 2.0253315053102656, "learning_rate": 5.842623608092928e-06, "loss": 1.393, "step": 29840 }, { "epoch": 1126.7924528301887, "grad_norm": 1.7870648066969081, "learning_rate": 5.8271490631189085e-06, "loss": 1.3654, "step": 29860 }, { "epoch": 1127.5471698113208, "grad_norm": 2.0620223544323393, "learning_rate": 5.811816346329819e-06, "loss": 1.3776, "step": 29880 }, { "epoch": 1128.301886792453, "grad_norm": 1.892915815700359, "learning_rate": 5.796625517023236e-06, "loss": 1.377, "step": 29900 }, { "epoch": 1129.0566037735848, "grad_norm": 1.6134589423454577, "learning_rate": 5.781576633948012e-06, "loss": 1.3958, "step": 29920 }, { "epoch": 1129.811320754717, "grad_norm": 1.8880173462636753, "learning_rate": 5.766669755304027e-06, "loss": 1.3707, "step": 29940 }, { "epoch": 1130.566037735849, "grad_norm": 1.899687605902805, "learning_rate": 5.75190493874199e-06, "loss": 1.3648, "step": 29960 }, { "epoch": 1131.3207547169811, "grad_norm": 2.0484945041635143, "learning_rate": 5.737282241363189e-06, "loss": 1.3689, "step": 29980 }, { "epoch": 1132.0754716981132, "grad_norm": 1.676321084433534, "learning_rate": 5.72280171971928e-06, "loss": 1.4161, "step": 30000 }, { "epoch": 1132.8301886792453, "grad_norm": 1.7718376566707665, "learning_rate": 5.708463429812077e-06, "loss": 1.3427, "step": 30020 }, { "epoch": 1133.5849056603774, "grad_norm": 1.9751240318001524, "learning_rate": 5.694267427093333e-06, "loss": 1.3674, "step": 30040 }, { "epoch": 1134.3396226415093, "grad_norm": 2.3259508666245754, "learning_rate": 5.680213766464505e-06, "loss": 1.3815, "step": 30060 }, { "epoch": 1135.0943396226414, "grad_norm": 1.7499567507331477, "learning_rate": 5.6663025022765734e-06, "loss": 1.3898, "step": 30080 }, { "epoch": 1135.8490566037735, "grad_norm": 1.7700410283415744, "learning_rate": 5.652533688329809e-06, "loss": 1.3801, "step": 30100 }, { "epoch": 1136.6037735849056, "grad_norm": 2.4028113618062843, "learning_rate": 5.638907377873572e-06, "loss": 1.4025, "step": 30120 }, { "epoch": 1137.3584905660377, "grad_norm": 1.7119758682153656, "learning_rate": 5.625423623606109e-06, "loss": 1.3933, "step": 30140 }, { "epoch": 1138.1132075471698, "grad_norm": 1.6434771622606816, "learning_rate": 5.612082477674341e-06, "loss": 1.3723, "step": 30160 }, { "epoch": 1138.867924528302, "grad_norm": 1.6260264586830788, "learning_rate": 5.598883991673678e-06, "loss": 1.4009, "step": 30180 }, { "epoch": 1139.622641509434, "grad_norm": 3.7353731641696166, "learning_rate": 5.58582821664779e-06, "loss": 1.3621, "step": 30200 }, { "epoch": 1140.377358490566, "grad_norm": 1.567966811159742, "learning_rate": 5.572915203088453e-06, "loss": 1.3679, "step": 30220 }, { "epoch": 1141.132075471698, "grad_norm": 1.7536276327044822, "learning_rate": 5.560145000935302e-06, "loss": 1.3899, "step": 30240 }, { "epoch": 1141.8867924528302, "grad_norm": 1.6246811713037859, "learning_rate": 5.547517659575683e-06, "loss": 1.3754, "step": 30260 }, { "epoch": 1142.6415094339623, "grad_norm": 1.5935354859602073, "learning_rate": 5.535033227844446e-06, "loss": 1.3783, "step": 30280 }, { "epoch": 1143.3962264150944, "grad_norm": 1.5837499746804282, "learning_rate": 5.522691754023736e-06, "loss": 1.3664, "step": 30300 }, { "epoch": 1144.1509433962265, "grad_norm": 1.5561292753074283, "learning_rate": 5.5104932858428386e-06, "loss": 1.3934, "step": 30320 }, { "epoch": 1144.9056603773586, "grad_norm": 1.5051486824601223, "learning_rate": 5.498437870477979e-06, "loss": 1.3569, "step": 30340 }, { "epoch": 1145.6603773584907, "grad_norm": 1.5724530317281036, "learning_rate": 5.48652555455214e-06, "loss": 1.384, "step": 30360 }, { "epoch": 1146.4150943396226, "grad_norm": 1.7499070562961392, "learning_rate": 5.474756384134872e-06, "loss": 1.3661, "step": 30380 }, { "epoch": 1147.1698113207547, "grad_norm": 1.682172454392295, "learning_rate": 5.46313040474215e-06, "loss": 1.3668, "step": 30400 }, { "epoch": 1147.9245283018868, "grad_norm": 1.6400451026874565, "learning_rate": 5.4516476613361565e-06, "loss": 1.3605, "step": 30420 }, { "epoch": 1148.6792452830189, "grad_norm": 1.952384343786011, "learning_rate": 5.440308198325125e-06, "loss": 1.388, "step": 30440 }, { "epoch": 1149.433962264151, "grad_norm": 2.052044266530817, "learning_rate": 5.4291120595631796e-06, "loss": 1.3699, "step": 30460 }, { "epoch": 1150.188679245283, "grad_norm": 1.868354121694302, "learning_rate": 5.4180592883501325e-06, "loss": 1.4099, "step": 30480 }, { "epoch": 1150.9433962264152, "grad_norm": 1.650613691746538, "learning_rate": 5.40714992743136e-06, "loss": 1.3788, "step": 30500 }, { "epoch": 1151.698113207547, "grad_norm": 1.48074352750423, "learning_rate": 5.3963840189976066e-06, "loss": 1.3587, "step": 30520 }, { "epoch": 1152.4528301886792, "grad_norm": 1.914894176993607, "learning_rate": 5.385761604684826e-06, "loss": 1.3622, "step": 30540 }, { "epoch": 1153.2075471698113, "grad_norm": 1.736154691724524, "learning_rate": 5.375282725574028e-06, "loss": 1.3451, "step": 30560 }, { "epoch": 1153.9622641509434, "grad_norm": 1.7175923216328703, "learning_rate": 5.364947422191111e-06, "loss": 1.385, "step": 30580 }, { "epoch": 1154.7169811320755, "grad_norm": 1.8230347081955776, "learning_rate": 5.3547557345067295e-06, "loss": 1.3797, "step": 30600 }, { "epoch": 1155.4716981132076, "grad_norm": 1.4897355923840079, "learning_rate": 5.344707701936093e-06, "loss": 1.3812, "step": 30620 }, { "epoch": 1156.2264150943397, "grad_norm": 1.7795720356372806, "learning_rate": 5.334803363338855e-06, "loss": 1.3508, "step": 30640 }, { "epoch": 1156.9811320754718, "grad_norm": 2.461699887903762, "learning_rate": 5.325042757018952e-06, "loss": 1.3904, "step": 30660 }, { "epoch": 1157.7358490566037, "grad_norm": 1.7684288169829847, "learning_rate": 5.315425920724443e-06, "loss": 1.362, "step": 30680 }, { "epoch": 1158.4905660377358, "grad_norm": 1.9326301215722892, "learning_rate": 5.3059528916473754e-06, "loss": 1.3764, "step": 30700 }, { "epoch": 1159.245283018868, "grad_norm": 1.7547993585411785, "learning_rate": 5.296623706423637e-06, "loss": 1.3624, "step": 30720 }, { "epoch": 1160.0, "grad_norm": 2.2647989876543897, "learning_rate": 5.2874384011328235e-06, "loss": 1.3804, "step": 30740 }, { "epoch": 1160.754716981132, "grad_norm": 1.897412746168143, "learning_rate": 5.278397011298081e-06, "loss": 1.3882, "step": 30760 }, { "epoch": 1161.5094339622642, "grad_norm": 1.5286725772277845, "learning_rate": 5.269499571885985e-06, "loss": 1.381, "step": 30780 }, { "epoch": 1162.2641509433963, "grad_norm": 1.6848292059915215, "learning_rate": 5.260746117306394e-06, "loss": 1.361, "step": 30800 }, { "epoch": 1163.0188679245282, "grad_norm": 1.4576957104143031, "learning_rate": 5.25213668141232e-06, "loss": 1.3773, "step": 30820 }, { "epoch": 1163.7735849056603, "grad_norm": 1.6655981961615232, "learning_rate": 5.243671297499806e-06, "loss": 1.3403, "step": 30840 }, { "epoch": 1164.5283018867924, "grad_norm": 6.016182274377044, "learning_rate": 5.235349998307786e-06, "loss": 1.3994, "step": 30860 }, { "epoch": 1165.2830188679245, "grad_norm": 1.7659588641922745, "learning_rate": 5.227172816017956e-06, "loss": 1.3507, "step": 30880 }, { "epoch": 1166.0377358490566, "grad_norm": 2.0037468459561962, "learning_rate": 5.219139782254665e-06, "loss": 1.3703, "step": 30900 }, { "epoch": 1166.7924528301887, "grad_norm": 2.15024644673786, "learning_rate": 5.211250928084786e-06, "loss": 1.3473, "step": 30920 }, { "epoch": 1167.5471698113208, "grad_norm": 2.5013172573697466, "learning_rate": 5.203506284017583e-06, "loss": 1.3814, "step": 30940 }, { "epoch": 1168.301886792453, "grad_norm": 1.5816513523971083, "learning_rate": 5.195905880004609e-06, "loss": 1.3668, "step": 30960 }, { "epoch": 1169.0566037735848, "grad_norm": 1.512996764161357, "learning_rate": 5.188449745439581e-06, "loss": 1.3581, "step": 30980 }, { "epoch": 1169.811320754717, "grad_norm": 1.536263448282502, "learning_rate": 5.181137909158276e-06, "loss": 1.3277, "step": 31000 }, { "epoch": 1170.566037735849, "grad_norm": 1.6755767673451942, "learning_rate": 5.1739703994384105e-06, "loss": 1.3923, "step": 31020 }, { "epoch": 1171.3207547169811, "grad_norm": 1.7976047665675525, "learning_rate": 5.166947243999532e-06, "loss": 1.3671, "step": 31040 }, { "epoch": 1172.0754716981132, "grad_norm": 1.5604607884699584, "learning_rate": 5.1600684700029165e-06, "loss": 1.3613, "step": 31060 }, { "epoch": 1172.8301886792453, "grad_norm": 1.5133379987405895, "learning_rate": 5.1533341040514576e-06, "loss": 1.3696, "step": 31080 }, { "epoch": 1173.5849056603774, "grad_norm": 1.8992042289915705, "learning_rate": 5.146744172189571e-06, "loss": 1.3464, "step": 31100 }, { "epoch": 1174.3396226415093, "grad_norm": 1.8549085471784923, "learning_rate": 5.140298699903085e-06, "loss": 1.3478, "step": 31120 }, { "epoch": 1175.0943396226414, "grad_norm": 1.6926406458235648, "learning_rate": 5.133997712119152e-06, "loss": 1.3526, "step": 31140 }, { "epoch": 1175.8490566037735, "grad_norm": 1.9538672940442745, "learning_rate": 5.127841233206144e-06, "loss": 1.3686, "step": 31160 }, { "epoch": 1176.6037735849056, "grad_norm": 1.850655603319905, "learning_rate": 5.1218292869735606e-06, "loss": 1.3906, "step": 31180 }, { "epoch": 1177.3584905660377, "grad_norm": 1.7127479688627378, "learning_rate": 5.115961896671935e-06, "loss": 1.3703, "step": 31200 }, { "epoch": 1178.1132075471698, "grad_norm": 1.556614260381109, "learning_rate": 5.110239084992749e-06, "loss": 1.3532, "step": 31220 }, { "epoch": 1178.867924528302, "grad_norm": 2.001126139034296, "learning_rate": 5.1046608740683435e-06, "loss": 1.3929, "step": 31240 }, { "epoch": 1179.622641509434, "grad_norm": 2.127747604876417, "learning_rate": 5.09922728547183e-06, "loss": 1.3657, "step": 31260 }, { "epoch": 1180.377358490566, "grad_norm": 1.8364327564945553, "learning_rate": 5.093938340217008e-06, "loss": 1.3426, "step": 31280 }, { "epoch": 1181.132075471698, "grad_norm": 1.9292610849222944, "learning_rate": 5.088794058758295e-06, "loss": 1.368, "step": 31300 }, { "epoch": 1181.8867924528302, "grad_norm": 2.0114024877177505, "learning_rate": 5.083794460990618e-06, "loss": 1.39, "step": 31320 }, { "epoch": 1182.6415094339623, "grad_norm": 1.5735214803674382, "learning_rate": 5.078939566249372e-06, "loss": 1.3632, "step": 31340 }, { "epoch": 1183.3962264150944, "grad_norm": 1.8428642902345547, "learning_rate": 5.074229393310324e-06, "loss": 1.3757, "step": 31360 }, { "epoch": 1184.1509433962265, "grad_norm": 1.697897177712772, "learning_rate": 5.06966396038955e-06, "loss": 1.354, "step": 31380 }, { "epoch": 1184.9056603773586, "grad_norm": 1.807086734591878, "learning_rate": 5.065243285143349e-06, "loss": 1.3757, "step": 31400 }, { "epoch": 1185.6603773584907, "grad_norm": 1.743179055242126, "learning_rate": 5.0609673846681936e-06, "loss": 1.3819, "step": 31420 }, { "epoch": 1186.4150943396226, "grad_norm": 1.8735264452983302, "learning_rate": 5.056836275500658e-06, "loss": 1.3579, "step": 31440 }, { "epoch": 1187.1698113207547, "grad_norm": 1.5862970321945447, "learning_rate": 5.052849973617347e-06, "loss": 1.3445, "step": 31460 }, { "epoch": 1187.9245283018868, "grad_norm": 1.692517823714256, "learning_rate": 5.049008494434844e-06, "loss": 1.3694, "step": 31480 }, { "epoch": 1188.6792452830189, "grad_norm": 1.6212477472255649, "learning_rate": 5.045311852809638e-06, "loss": 1.3929, "step": 31500 }, { "epoch": 1189.433962264151, "grad_norm": 1.52306373987035, "learning_rate": 5.041760063038081e-06, "loss": 1.3579, "step": 31520 }, { "epoch": 1190.188679245283, "grad_norm": 1.7830544839573095, "learning_rate": 5.038353138856331e-06, "loss": 1.348, "step": 31540 }, { "epoch": 1190.9433962264152, "grad_norm": 1.7203728735463606, "learning_rate": 5.035091093440292e-06, "loss": 1.37, "step": 31560 }, { "epoch": 1191.698113207547, "grad_norm": 1.9298089743408848, "learning_rate": 5.0319739394055525e-06, "loss": 1.3627, "step": 31580 }, { "epoch": 1192.4528301886792, "grad_norm": 1.9488940650586162, "learning_rate": 5.029001688807368e-06, "loss": 1.3537, "step": 31600 }, { "epoch": 1193.2075471698113, "grad_norm": 2.0609178957358667, "learning_rate": 5.026174353140584e-06, "loss": 1.3521, "step": 31620 }, { "epoch": 1193.9622641509434, "grad_norm": 1.710559073613117, "learning_rate": 5.0234919433396115e-06, "loss": 1.3768, "step": 31640 }, { "epoch": 1194.7169811320755, "grad_norm": 1.5082465689013147, "learning_rate": 5.02095446977837e-06, "loss": 1.3893, "step": 31660 }, { "epoch": 1195.4716981132076, "grad_norm": 2.4105153089947526, "learning_rate": 5.018561942270259e-06, "loss": 1.3532, "step": 31680 }, { "epoch": 1196.2264150943397, "grad_norm": 1.5148689250273666, "learning_rate": 5.016314370068112e-06, "loss": 1.3429, "step": 31700 }, { "epoch": 1196.9811320754718, "grad_norm": 1.7305388649029056, "learning_rate": 5.014211761864169e-06, "loss": 1.3559, "step": 31720 }, { "epoch": 1197.7358490566037, "grad_norm": 3.661229816284544, "learning_rate": 5.012254125790028e-06, "loss": 1.37, "step": 31740 }, { "epoch": 1198.4905660377358, "grad_norm": 1.9493540072501139, "learning_rate": 5.010441469416635e-06, "loss": 1.3808, "step": 31760 }, { "epoch": 1199.245283018868, "grad_norm": 1.6896444872077154, "learning_rate": 5.008773799754234e-06, "loss": 1.3631, "step": 31780 }, { "epoch": 1200.0, "grad_norm": 1.884439542410789, "learning_rate": 5.007251123252356e-06, "loss": 1.3638, "step": 31800 }, { "epoch": 1200.754716981132, "grad_norm": 1.98761366434412, "learning_rate": 5.005873445799779e-06, "loss": 1.35, "step": 31820 }, { "epoch": 1201.5094339622642, "grad_norm": 1.8352779283455332, "learning_rate": 5.004640772724519e-06, "loss": 1.3369, "step": 31840 }, { "epoch": 1202.2641509433963, "grad_norm": 1.712020294826759, "learning_rate": 5.003553108793802e-06, "loss": 1.3511, "step": 31860 }, { "epoch": 1203.0188679245282, "grad_norm": 1.6743616923339946, "learning_rate": 5.002610458214054e-06, "loss": 1.3259, "step": 31880 }, { "epoch": 1203.7735849056603, "grad_norm": 1.8393462234102256, "learning_rate": 5.001812824630864e-06, "loss": 1.3646, "step": 31900 }, { "epoch": 1204.5283018867924, "grad_norm": 1.7631293985305598, "learning_rate": 5.001160211128995e-06, "loss": 1.3384, "step": 31920 }, { "epoch": 1205.2830188679245, "grad_norm": 1.6536424071703635, "learning_rate": 5.0006526202323554e-06, "loss": 1.3605, "step": 31940 }, { "epoch": 1206.0377358490566, "grad_norm": 1.5387931434470863, "learning_rate": 5.000290053904e-06, "loss": 1.3892, "step": 31960 }, { "epoch": 1206.7924528301887, "grad_norm": 1.948827205429464, "learning_rate": 5.0000725135461104e-06, "loss": 1.3541, "step": 31980 }, { "epoch": 1207.5471698113208, "grad_norm": 1.595259284912312, "learning_rate": 5e-06, "loss": 1.3478, "step": 32000 }, { "epoch": 1231.5094339622642, "grad_norm": 1.9859843003442184, "learning_rate": 3.1745653570607866e-05, "loss": 1.4161, "step": 32020 }, { "epoch": 1232.2641509433963, "grad_norm": 2.399291840461689, "learning_rate": 3.170382168563073e-05, "loss": 1.4292, "step": 32040 }, { "epoch": 1233.0188679245282, "grad_norm": 2.2207067356830597, "learning_rate": 3.166200444421923e-05, "loss": 1.4248, "step": 32060 }, { "epoch": 1233.7735849056603, "grad_norm": 1.7538222025729717, "learning_rate": 3.1620201903092876e-05, "loss": 1.4549, "step": 32080 }, { "epoch": 1234.5283018867924, "grad_norm": 1.7296401624898199, "learning_rate": 3.157841411895116e-05, "loss": 1.4544, "step": 32100 }, { "epoch": 1235.2830188679245, "grad_norm": 1.6657757057870137, "learning_rate": 3.153664114847362e-05, "loss": 1.4734, "step": 32120 }, { "epoch": 1236.0377358490566, "grad_norm": 1.7240277610891936, "learning_rate": 3.149488304831967e-05, "loss": 1.451, "step": 32140 }, { "epoch": 1236.7924528301887, "grad_norm": 1.6885797820089437, "learning_rate": 3.145313987512854e-05, "loss": 1.4366, "step": 32160 }, { "epoch": 1237.5471698113208, "grad_norm": 1.4963776794399322, "learning_rate": 3.141141168551928e-05, "loss": 1.4652, "step": 32180 }, { "epoch": 1238.301886792453, "grad_norm": 1.4609983523815115, "learning_rate": 3.1369698536090554e-05, "loss": 1.4648, "step": 32200 }, { "epoch": 1239.0566037735848, "grad_norm": 1.9029419687473905, "learning_rate": 3.132800048342065e-05, "loss": 1.4664, "step": 32220 }, { "epoch": 1239.811320754717, "grad_norm": 1.7932066669770592, "learning_rate": 3.128631758406736e-05, "loss": 1.4585, "step": 32240 }, { "epoch": 1240.566037735849, "grad_norm": 1.6253328044167166, "learning_rate": 3.1244649894567945e-05, "loss": 1.4492, "step": 32260 }, { "epoch": 1241.3207547169811, "grad_norm": 1.507775786714413, "learning_rate": 3.120299747143905e-05, "loss": 1.4934, "step": 32280 }, { "epoch": 1242.0754716981132, "grad_norm": 1.7801850010709415, "learning_rate": 3.1161360371176566e-05, "loss": 1.4486, "step": 32300 }, { "epoch": 1242.8301886792453, "grad_norm": 1.6106209389195743, "learning_rate": 3.111973865025564e-05, "loss": 1.4468, "step": 32320 }, { "epoch": 1243.5849056603774, "grad_norm": 1.8027839874458171, "learning_rate": 3.107813236513054e-05, "loss": 1.477, "step": 32340 }, { "epoch": 1244.3396226415093, "grad_norm": 1.883131295400716, "learning_rate": 3.1036541572234594e-05, "loss": 1.4555, "step": 32360 }, { "epoch": 1245.0943396226414, "grad_norm": 1.591157945654413, "learning_rate": 3.099496632798014e-05, "loss": 1.4708, "step": 32380 }, { "epoch": 1245.8490566037735, "grad_norm": 1.6694778342522842, "learning_rate": 3.095340668875842e-05, "loss": 1.4639, "step": 32400 }, { "epoch": 1246.6037735849056, "grad_norm": 1.6841562206011031, "learning_rate": 3.091186271093947e-05, "loss": 1.5116, "step": 32420 }, { "epoch": 1247.3584905660377, "grad_norm": 2.3369379900409943, "learning_rate": 3.0870334450872156e-05, "loss": 1.4754, "step": 32440 }, { "epoch": 1248.1132075471698, "grad_norm": 1.720534890104194, "learning_rate": 3.0828821964883944e-05, "loss": 1.4941, "step": 32460 }, { "epoch": 1248.867924528302, "grad_norm": 1.7549772489735695, "learning_rate": 3.0787325309280966e-05, "loss": 1.4799, "step": 32480 }, { "epoch": 1249.622641509434, "grad_norm": 1.8182084066575632, "learning_rate": 3.074584454034788e-05, "loss": 1.4715, "step": 32500 }, { "epoch": 1250.377358490566, "grad_norm": 1.5605662428278646, "learning_rate": 3.0704379714347736e-05, "loss": 1.4783, "step": 32520 }, { "epoch": 1251.132075471698, "grad_norm": 1.569853865239183, "learning_rate": 3.066293088752203e-05, "loss": 1.4638, "step": 32540 }, { "epoch": 1251.8867924528302, "grad_norm": 1.704579985134968, "learning_rate": 3.062149811609051e-05, "loss": 1.492, "step": 32560 }, { "epoch": 1252.6415094339623, "grad_norm": 1.7794864973864697, "learning_rate": 3.058008145625118e-05, "loss": 1.4705, "step": 32580 }, { "epoch": 1253.3962264150944, "grad_norm": 1.8222736973302784, "learning_rate": 3.053868096418017e-05, "loss": 1.4893, "step": 32600 }, { "epoch": 1254.1509433962265, "grad_norm": 1.5789611538013155, "learning_rate": 3.0497296696031678e-05, "loss": 1.4665, "step": 32620 }, { "epoch": 1254.9056603773586, "grad_norm": 1.657785958532039, "learning_rate": 3.0455928707937924e-05, "loss": 1.491, "step": 32640 }, { "epoch": 1255.6603773584907, "grad_norm": 1.3254023383839637, "learning_rate": 3.0414577056008995e-05, "loss": 1.4823, "step": 32660 }, { "epoch": 1256.4150943396226, "grad_norm": 1.5602437010509045, "learning_rate": 3.0373241796332887e-05, "loss": 1.4704, "step": 32680 }, { "epoch": 1257.1698113207547, "grad_norm": 2.029474586920305, "learning_rate": 3.0331922984975316e-05, "loss": 1.4765, "step": 32700 }, { "epoch": 1257.9245283018868, "grad_norm": 1.8553896972815955, "learning_rate": 3.0290620677979688e-05, "loss": 1.5096, "step": 32720 }, { "epoch": 1258.6792452830189, "grad_norm": 1.4989759048156965, "learning_rate": 3.0249334931367046e-05, "loss": 1.5122, "step": 32740 }, { "epoch": 1259.433962264151, "grad_norm": 1.6763111597334728, "learning_rate": 3.0208065801135942e-05, "loss": 1.4787, "step": 32760 }, { "epoch": 1260.188679245283, "grad_norm": 1.469251133196546, "learning_rate": 3.016681334326244e-05, "loss": 1.4854, "step": 32780 }, { "epoch": 1260.9433962264152, "grad_norm": 1.8501919367454238, "learning_rate": 3.0125577613699926e-05, "loss": 1.4929, "step": 32800 }, { "epoch": 1261.698113207547, "grad_norm": 1.5790438820656068, "learning_rate": 3.0084358668379155e-05, "loss": 1.5055, "step": 32820 }, { "epoch": 1262.4528301886792, "grad_norm": 1.5952733717783116, "learning_rate": 3.004315656320806e-05, "loss": 1.4907, "step": 32840 }, { "epoch": 1263.2075471698113, "grad_norm": 1.6182930520428953, "learning_rate": 3.0001971354071772e-05, "loss": 1.4909, "step": 32860 }, { "epoch": 1263.9622641509434, "grad_norm": 2.2886630268428663, "learning_rate": 2.996080309683252e-05, "loss": 1.4992, "step": 32880 }, { "epoch": 1264.7169811320755, "grad_norm": 1.3793974197803296, "learning_rate": 2.9919651847329483e-05, "loss": 1.5061, "step": 32900 }, { "epoch": 1265.4716981132076, "grad_norm": 1.39182833894468, "learning_rate": 2.9878517661378828e-05, "loss": 1.4591, "step": 32920 }, { "epoch": 1266.2264150943397, "grad_norm": 1.6904437738848905, "learning_rate": 2.9837400594773515e-05, "loss": 1.5118, "step": 32940 }, { "epoch": 1266.9811320754718, "grad_norm": 1.6447748796714898, "learning_rate": 2.979630070328336e-05, "loss": 1.4881, "step": 32960 }, { "epoch": 1267.7358490566037, "grad_norm": 1.3512114550316146, "learning_rate": 2.975521804265484e-05, "loss": 1.4719, "step": 32980 }, { "epoch": 1268.4905660377358, "grad_norm": 1.6317892668767962, "learning_rate": 2.971415266861105e-05, "loss": 1.5057, "step": 33000 }, { "epoch": 1269.245283018868, "grad_norm": 1.6596450520295813, "learning_rate": 2.967310463685166e-05, "loss": 1.481, "step": 33020 }, { "epoch": 1270.0, "grad_norm": 1.6548890468178368, "learning_rate": 2.9632074003052808e-05, "loss": 1.5136, "step": 33040 }, { "epoch": 1270.754716981132, "grad_norm": 1.5074284840254797, "learning_rate": 2.9591060822867042e-05, "loss": 1.4971, "step": 33060 }, { "epoch": 1271.5094339622642, "grad_norm": 1.5075074748556512, "learning_rate": 2.9550065151923238e-05, "loss": 1.4647, "step": 33080 }, { "epoch": 1272.2641509433963, "grad_norm": 1.7144775848474376, "learning_rate": 2.9509087045826505e-05, "loss": 1.5145, "step": 33100 }, { "epoch": 1273.0188679245282, "grad_norm": 1.5547570517351919, "learning_rate": 2.946812656015815e-05, "loss": 1.4806, "step": 33120 }, { "epoch": 1273.7735849056603, "grad_norm": 1.91096744807036, "learning_rate": 2.942718375047554e-05, "loss": 1.4953, "step": 33140 }, { "epoch": 1274.5283018867924, "grad_norm": 1.690681911094072, "learning_rate": 2.9386258672312143e-05, "loss": 1.5043, "step": 33160 }, { "epoch": 1275.2830188679245, "grad_norm": 1.6094990513366627, "learning_rate": 2.93453513811773e-05, "loss": 1.4656, "step": 33180 }, { "epoch": 1276.0377358490566, "grad_norm": 1.7166760221415358, "learning_rate": 2.9304461932556262e-05, "loss": 1.5049, "step": 33200 }, { "epoch": 1276.7924528301887, "grad_norm": 1.4781436729661779, "learning_rate": 2.9263590381910078e-05, "loss": 1.4901, "step": 33220 }, { "epoch": 1277.5471698113208, "grad_norm": 1.6055713664381628, "learning_rate": 2.9222736784675506e-05, "loss": 1.4744, "step": 33240 }, { "epoch": 1278.301886792453, "grad_norm": 1.6185246350349134, "learning_rate": 2.9181901196264983e-05, "loss": 1.4809, "step": 33260 }, { "epoch": 1279.0566037735848, "grad_norm": 1.876852753612874, "learning_rate": 2.9141083672066472e-05, "loss": 1.4737, "step": 33280 }, { "epoch": 1279.811320754717, "grad_norm": 1.646333221814719, "learning_rate": 2.910028426744349e-05, "loss": 1.4807, "step": 33300 }, { "epoch": 1280.566037735849, "grad_norm": 1.4950158846180641, "learning_rate": 2.9059503037734925e-05, "loss": 1.4871, "step": 33320 }, { "epoch": 1281.3207547169811, "grad_norm": 2.5440304246025702, "learning_rate": 2.9018740038255044e-05, "loss": 1.4869, "step": 33340 }, { "epoch": 1282.0754716981132, "grad_norm": 1.5221803613837093, "learning_rate": 2.897799532429339e-05, "loss": 1.4756, "step": 33360 }, { "epoch": 1282.8301886792453, "grad_norm": 1.459833552438949, "learning_rate": 2.8937268951114686e-05, "loss": 1.4782, "step": 33380 }, { "epoch": 1283.5849056603774, "grad_norm": 1.5193291412259906, "learning_rate": 2.8896560973958796e-05, "loss": 1.4925, "step": 33400 }, { "epoch": 1284.3396226415093, "grad_norm": 1.457579254538571, "learning_rate": 2.88558714480406e-05, "loss": 1.4865, "step": 33420 }, { "epoch": 1285.0943396226414, "grad_norm": 2.116390864572185, "learning_rate": 2.8815200428549985e-05, "loss": 1.4823, "step": 33440 }, { "epoch": 1285.8490566037735, "grad_norm": 2.333973476529065, "learning_rate": 2.8774547970651747e-05, "loss": 1.4701, "step": 33460 }, { "epoch": 1286.6037735849056, "grad_norm": 1.4347402180741313, "learning_rate": 2.8733914129485457e-05, "loss": 1.4964, "step": 33480 }, { "epoch": 1287.3584905660377, "grad_norm": 1.5219049837257324, "learning_rate": 2.8693298960165473e-05, "loss": 1.4845, "step": 33500 }, { "epoch": 1288.1132075471698, "grad_norm": 1.7295744561903763, "learning_rate": 2.8652702517780815e-05, "loss": 1.4729, "step": 33520 }, { "epoch": 1288.867924528302, "grad_norm": 1.3491913340767474, "learning_rate": 2.8612124857395097e-05, "loss": 1.4734, "step": 33540 }, { "epoch": 1289.622641509434, "grad_norm": 1.612399971127458, "learning_rate": 2.8571566034046486e-05, "loss": 1.4717, "step": 33560 }, { "epoch": 1290.377358490566, "grad_norm": 1.523340229132746, "learning_rate": 2.8531026102747552e-05, "loss": 1.4784, "step": 33580 }, { "epoch": 1291.132075471698, "grad_norm": 1.391650177787444, "learning_rate": 2.849050511848529e-05, "loss": 1.4968, "step": 33600 }, { "epoch": 1291.8867924528302, "grad_norm": 1.631972432390494, "learning_rate": 2.845000313622095e-05, "loss": 1.4783, "step": 33620 }, { "epoch": 1292.6415094339623, "grad_norm": 1.4676382942374402, "learning_rate": 2.840952021089003e-05, "loss": 1.4724, "step": 33640 }, { "epoch": 1293.3962264150944, "grad_norm": 1.5025191965428788, "learning_rate": 2.83690563974022e-05, "loss": 1.4958, "step": 33660 }, { "epoch": 1294.1509433962265, "grad_norm": 1.6379644083109945, "learning_rate": 2.832861175064119e-05, "loss": 1.4834, "step": 33680 }, { "epoch": 1294.9056603773586, "grad_norm": 1.7312099049664693, "learning_rate": 2.8288186325464705e-05, "loss": 1.4941, "step": 33700 }, { "epoch": 1295.6603773584907, "grad_norm": 1.5113721107585405, "learning_rate": 2.8247780176704408e-05, "loss": 1.4863, "step": 33720 }, { "epoch": 1296.4150943396226, "grad_norm": 1.4187238404455875, "learning_rate": 2.8207393359165837e-05, "loss": 1.4635, "step": 33740 }, { "epoch": 1297.1698113207547, "grad_norm": 1.5036198246572734, "learning_rate": 2.8167025927628266e-05, "loss": 1.4663, "step": 33760 }, { "epoch": 1297.9245283018868, "grad_norm": 1.656299435435026, "learning_rate": 2.8126677936844698e-05, "loss": 1.4809, "step": 33780 }, { "epoch": 1298.6792452830189, "grad_norm": 1.7227294745544, "learning_rate": 2.808634944154176e-05, "loss": 1.4518, "step": 33800 }, { "epoch": 1299.433962264151, "grad_norm": 1.554440422068932, "learning_rate": 2.8046040496419622e-05, "loss": 1.4858, "step": 33820 }, { "epoch": 1300.188679245283, "grad_norm": 1.5684395687858594, "learning_rate": 2.8005751156151996e-05, "loss": 1.4939, "step": 33840 }, { "epoch": 1300.9433962264152, "grad_norm": 1.4791453327586883, "learning_rate": 2.7965481475385922e-05, "loss": 1.4981, "step": 33860 }, { "epoch": 1301.698113207547, "grad_norm": 1.8682361890592045, "learning_rate": 2.792523150874184e-05, "loss": 1.485, "step": 33880 }, { "epoch": 1302.4528301886792, "grad_norm": 1.4376784070576631, "learning_rate": 2.7885001310813394e-05, "loss": 1.4771, "step": 33900 }, { "epoch": 1303.2075471698113, "grad_norm": 1.4919487782728726, "learning_rate": 2.7844790936167448e-05, "loss": 1.4818, "step": 33920 }, { "epoch": 1303.9622641509434, "grad_norm": 1.5926644935407461, "learning_rate": 2.7804600439344004e-05, "loss": 1.481, "step": 33940 }, { "epoch": 1304.7169811320755, "grad_norm": 2.129672326977145, "learning_rate": 2.776442987485605e-05, "loss": 1.4809, "step": 33960 }, { "epoch": 1305.4716981132076, "grad_norm": 1.4661184798946012, "learning_rate": 2.7724279297189564e-05, "loss": 1.4734, "step": 33980 }, { "epoch": 1306.2264150943397, "grad_norm": 1.6422416038082728, "learning_rate": 2.7684148760803404e-05, "loss": 1.4706, "step": 34000 }, { "epoch": 1306.9811320754718, "grad_norm": 1.6788541325557527, "learning_rate": 2.7644038320129247e-05, "loss": 1.4734, "step": 34020 }, { "epoch": 1307.7358490566037, "grad_norm": 1.5820996412366164, "learning_rate": 2.7603948029571546e-05, "loss": 1.4731, "step": 34040 }, { "epoch": 1308.4905660377358, "grad_norm": 1.8093817496261688, "learning_rate": 2.756387794350737e-05, "loss": 1.4876, "step": 34060 }, { "epoch": 1309.245283018868, "grad_norm": 1.4611414622430816, "learning_rate": 2.7523828116286425e-05, "loss": 1.4958, "step": 34080 }, { "epoch": 1310.0, "grad_norm": 1.4982681857066789, "learning_rate": 2.7483798602230905e-05, "loss": 1.4713, "step": 34100 }, { "epoch": 1310.754716981132, "grad_norm": 1.7049190400136933, "learning_rate": 2.744378945563547e-05, "loss": 1.4698, "step": 34120 }, { "epoch": 1311.5094339622642, "grad_norm": 1.465072325468145, "learning_rate": 2.7403800730767165e-05, "loss": 1.4814, "step": 34140 }, { "epoch": 1312.2641509433963, "grad_norm": 1.6806290813940998, "learning_rate": 2.7363832481865326e-05, "loss": 1.4623, "step": 34160 }, { "epoch": 1313.0188679245282, "grad_norm": 1.422949129304357, "learning_rate": 2.7323884763141494e-05, "loss": 1.4798, "step": 34180 }, { "epoch": 1313.7735849056603, "grad_norm": 1.5386955048633302, "learning_rate": 2.728395762877941e-05, "loss": 1.4588, "step": 34200 }, { "epoch": 1314.5283018867924, "grad_norm": 1.5472770555424338, "learning_rate": 2.7244051132934836e-05, "loss": 1.451, "step": 34220 }, { "epoch": 1315.2830188679245, "grad_norm": 1.435168914934391, "learning_rate": 2.72041653297356e-05, "loss": 1.4943, "step": 34240 }, { "epoch": 1316.0377358490566, "grad_norm": 1.4183350034608622, "learning_rate": 2.716430027328143e-05, "loss": 1.4519, "step": 34260 }, { "epoch": 1316.7924528301887, "grad_norm": 1.7134876611489063, "learning_rate": 2.7124456017643914e-05, "loss": 1.4658, "step": 34280 }, { "epoch": 1317.5471698113208, "grad_norm": 1.4042660927164932, "learning_rate": 2.7084632616866437e-05, "loss": 1.4665, "step": 34300 }, { "epoch": 1318.301886792453, "grad_norm": 1.7236176772036846, "learning_rate": 2.7044830124964073e-05, "loss": 1.4598, "step": 34320 }, { "epoch": 1319.0566037735848, "grad_norm": 1.7345912564178498, "learning_rate": 2.7005048595923597e-05, "loss": 1.4941, "step": 34340 }, { "epoch": 1319.811320754717, "grad_norm": 1.6553359599381614, "learning_rate": 2.696528808370328e-05, "loss": 1.448, "step": 34360 }, { "epoch": 1320.566037735849, "grad_norm": 1.654924545197036, "learning_rate": 2.6925548642232916e-05, "loss": 1.453, "step": 34380 }, { "epoch": 1321.3207547169811, "grad_norm": 1.966241914838029, "learning_rate": 2.6885830325413732e-05, "loss": 1.4791, "step": 34400 }, { "epoch": 1322.0754716981132, "grad_norm": 1.613098173730771, "learning_rate": 2.6846133187118266e-05, "loss": 1.4456, "step": 34420 }, { "epoch": 1322.8301886792453, "grad_norm": 1.694164161340185, "learning_rate": 2.6806457281190392e-05, "loss": 1.4697, "step": 34440 }, { "epoch": 1323.5849056603774, "grad_norm": 1.7709910517494127, "learning_rate": 2.6766802661445123e-05, "loss": 1.4767, "step": 34460 }, { "epoch": 1324.3396226415093, "grad_norm": 2.1757270130771547, "learning_rate": 2.672716938166863e-05, "loss": 1.5023, "step": 34480 }, { "epoch": 1325.0943396226414, "grad_norm": 1.618966012864335, "learning_rate": 2.66875574956181e-05, "loss": 1.4459, "step": 34500 }, { "epoch": 1325.8490566037735, "grad_norm": 1.6395370020860782, "learning_rate": 2.6647967057021783e-05, "loss": 1.4716, "step": 34520 }, { "epoch": 1326.6037735849056, "grad_norm": 1.458865429611614, "learning_rate": 2.6608398119578777e-05, "loss": 1.4509, "step": 34540 }, { "epoch": 1327.3584905660377, "grad_norm": 1.8785096977087146, "learning_rate": 2.656885073695903e-05, "loss": 1.4563, "step": 34560 }, { "epoch": 1328.1132075471698, "grad_norm": 1.9390316222323336, "learning_rate": 2.652932496280323e-05, "loss": 1.4851, "step": 34580 }, { "epoch": 1328.867924528302, "grad_norm": 5.310289949887802, "learning_rate": 2.6489820850722802e-05, "loss": 1.4768, "step": 34600 }, { "epoch": 1329.622641509434, "grad_norm": 1.4684731158219795, "learning_rate": 2.6450338454299786e-05, "loss": 1.4516, "step": 34620 }, { "epoch": 1330.377358490566, "grad_norm": 1.639577731583303, "learning_rate": 2.641087782708672e-05, "loss": 1.4654, "step": 34640 }, { "epoch": 1331.132075471698, "grad_norm": 1.6849901015256106, "learning_rate": 2.6371439022606665e-05, "loss": 1.4615, "step": 34660 }, { "epoch": 1331.8867924528302, "grad_norm": 1.619952725687253, "learning_rate": 2.6332022094353024e-05, "loss": 1.4461, "step": 34680 }, { "epoch": 1332.6415094339623, "grad_norm": 1.5608967063706551, "learning_rate": 2.6292627095789594e-05, "loss": 1.4523, "step": 34700 }, { "epoch": 1333.3962264150944, "grad_norm": 1.7568408459896505, "learning_rate": 2.625325408035041e-05, "loss": 1.4758, "step": 34720 }, { "epoch": 1334.1509433962265, "grad_norm": 1.5186845485994895, "learning_rate": 2.6213903101439668e-05, "loss": 1.4527, "step": 34740 }, { "epoch": 1334.9056603773586, "grad_norm": 1.9016010055715276, "learning_rate": 2.6174574212431673e-05, "loss": 1.4708, "step": 34760 }, { "epoch": 1335.6603773584907, "grad_norm": 1.3914584691450766, "learning_rate": 2.6135267466670776e-05, "loss": 1.4519, "step": 34780 }, { "epoch": 1336.4150943396226, "grad_norm": 1.7920706183325235, "learning_rate": 2.6095982917471312e-05, "loss": 1.4551, "step": 34800 }, { "epoch": 1337.1698113207547, "grad_norm": 1.7415199040517522, "learning_rate": 2.6056720618117508e-05, "loss": 1.4618, "step": 34820 }, { "epoch": 1337.9245283018868, "grad_norm": 2.0387577968023423, "learning_rate": 2.6017480621863382e-05, "loss": 1.4336, "step": 34840 }, { "epoch": 1338.6792452830189, "grad_norm": 1.7452335041516622, "learning_rate": 2.5978262981932716e-05, "loss": 1.4845, "step": 34860 }, { "epoch": 1339.433962264151, "grad_norm": 1.8221491527113842, "learning_rate": 2.5939067751518968e-05, "loss": 1.4509, "step": 34880 }, { "epoch": 1340.188679245283, "grad_norm": 1.573534969706598, "learning_rate": 2.58998949837852e-05, "loss": 1.4597, "step": 34900 }, { "epoch": 1340.9433962264152, "grad_norm": 1.7418894779202971, "learning_rate": 2.5860744731864037e-05, "loss": 1.4509, "step": 34920 }, { "epoch": 1341.698113207547, "grad_norm": 2.3533748801857612, "learning_rate": 2.5821617048857514e-05, "loss": 1.4707, "step": 34940 }, { "epoch": 1342.4528301886792, "grad_norm": 1.6384303594662744, "learning_rate": 2.5782511987837087e-05, "loss": 1.4483, "step": 34960 }, { "epoch": 1343.2075471698113, "grad_norm": 1.7437935503570192, "learning_rate": 2.5743429601843493e-05, "loss": 1.4708, "step": 34980 }, { "epoch": 1343.9622641509434, "grad_norm": 1.6299173329516294, "learning_rate": 2.5704369943886763e-05, "loss": 1.4487, "step": 35000 }, { "epoch": 1344.7169811320755, "grad_norm": 1.5340708358576824, "learning_rate": 2.5665333066946082e-05, "loss": 1.4659, "step": 35020 }, { "epoch": 1345.4716981132076, "grad_norm": 1.70280338168885, "learning_rate": 2.5626319023969715e-05, "loss": 1.4547, "step": 35040 }, { "epoch": 1346.2264150943397, "grad_norm": 1.6585665666032239, "learning_rate": 2.558732786787497e-05, "loss": 1.4514, "step": 35060 }, { "epoch": 1346.9811320754718, "grad_norm": 1.562613257380082, "learning_rate": 2.5548359651548126e-05, "loss": 1.4661, "step": 35080 }, { "epoch": 1347.7358490566037, "grad_norm": 1.7392138600300024, "learning_rate": 2.550941442784431e-05, "loss": 1.4546, "step": 35100 }, { "epoch": 1348.4905660377358, "grad_norm": 1.9111375288571992, "learning_rate": 2.5470492249587522e-05, "loss": 1.4478, "step": 35120 }, { "epoch": 1349.245283018868, "grad_norm": 1.4950805686503206, "learning_rate": 2.5431593169570446e-05, "loss": 1.4535, "step": 35140 }, { "epoch": 1350.0, "grad_norm": 2.553809298230812, "learning_rate": 2.539271724055444e-05, "loss": 1.464, "step": 35160 }, { "epoch": 1350.754716981132, "grad_norm": 1.562798272416066, "learning_rate": 2.5353864515269525e-05, "loss": 1.4665, "step": 35180 }, { "epoch": 1351.5094339622642, "grad_norm": 1.5956415565820565, "learning_rate": 2.531503504641416e-05, "loss": 1.4174, "step": 35200 }, { "epoch": 1352.2641509433963, "grad_norm": 1.745867042261029, "learning_rate": 2.5276228886655333e-05, "loss": 1.4738, "step": 35220 }, { "epoch": 1353.0188679245282, "grad_norm": 1.8454370598772634, "learning_rate": 2.5237446088628384e-05, "loss": 1.4407, "step": 35240 }, { "epoch": 1353.7735849056603, "grad_norm": 1.70704191729437, "learning_rate": 2.5198686704936945e-05, "loss": 1.4617, "step": 35260 }, { "epoch": 1354.5283018867924, "grad_norm": 1.410719238952515, "learning_rate": 2.5159950788152942e-05, "loss": 1.4397, "step": 35280 }, { "epoch": 1355.2830188679245, "grad_norm": 1.811804083528806, "learning_rate": 2.512123839081642e-05, "loss": 1.443, "step": 35300 }, { "epoch": 1356.0377358490566, "grad_norm": 1.659319824434148, "learning_rate": 2.508254956543557e-05, "loss": 1.4577, "step": 35320 }, { "epoch": 1356.7924528301887, "grad_norm": 1.5084615900612242, "learning_rate": 2.504388436448657e-05, "loss": 1.4702, "step": 35340 }, { "epoch": 1357.5471698113208, "grad_norm": 1.6272545133599885, "learning_rate": 2.500524284041357e-05, "loss": 1.4397, "step": 35360 }, { "epoch": 1358.301886792453, "grad_norm": 1.470645864952112, "learning_rate": 2.4966625045628615e-05, "loss": 1.4435, "step": 35380 }, { "epoch": 1359.0566037735848, "grad_norm": 1.455775463587072, "learning_rate": 2.4928031032511544e-05, "loss": 1.4554, "step": 35400 }, { "epoch": 1359.811320754717, "grad_norm": 1.6787988136879601, "learning_rate": 2.4889460853409974e-05, "loss": 1.4692, "step": 35420 }, { "epoch": 1360.566037735849, "grad_norm": 1.5640507822516196, "learning_rate": 2.485091456063916e-05, "loss": 1.4528, "step": 35440 }, { "epoch": 1361.3207547169811, "grad_norm": 1.396621608357886, "learning_rate": 2.4812392206481945e-05, "loss": 1.4371, "step": 35460 }, { "epoch": 1362.0754716981132, "grad_norm": 1.8537494645554213, "learning_rate": 2.477389384318876e-05, "loss": 1.4395, "step": 35480 }, { "epoch": 1362.8301886792453, "grad_norm": 1.512928732642698, "learning_rate": 2.4735419522977467e-05, "loss": 1.4914, "step": 35500 }, { "epoch": 1363.5849056603774, "grad_norm": 1.6340922613193214, "learning_rate": 2.46969692980333e-05, "loss": 1.4654, "step": 35520 }, { "epoch": 1364.3396226415093, "grad_norm": 1.5378015561259157, "learning_rate": 2.465854322050881e-05, "loss": 1.4246, "step": 35540 }, { "epoch": 1365.0943396226414, "grad_norm": 1.8471949838761705, "learning_rate": 2.462014134252384e-05, "loss": 1.4386, "step": 35560 }, { "epoch": 1365.8490566037735, "grad_norm": 2.139477793232749, "learning_rate": 2.4581763716165345e-05, "loss": 1.4314, "step": 35580 }, { "epoch": 1366.6037735849056, "grad_norm": 1.5366713623147805, "learning_rate": 2.454341039348746e-05, "loss": 1.4514, "step": 35600 }, { "epoch": 1367.3584905660377, "grad_norm": 2.6632963736452018, "learning_rate": 2.4505081426511286e-05, "loss": 1.4244, "step": 35620 }, { "epoch": 1368.1132075471698, "grad_norm": 1.7507517183403924, "learning_rate": 2.4466776867224914e-05, "loss": 1.4401, "step": 35640 }, { "epoch": 1368.867924528302, "grad_norm": 1.7263277038654796, "learning_rate": 2.4428496767583355e-05, "loss": 1.4569, "step": 35660 }, { "epoch": 1369.622641509434, "grad_norm": 1.664393561735168, "learning_rate": 2.4390241179508404e-05, "loss": 1.4387, "step": 35680 }, { "epoch": 1370.377358490566, "grad_norm": 1.723479394345894, "learning_rate": 2.435201015488865e-05, "loss": 1.4411, "step": 35700 }, { "epoch": 1371.132075471698, "grad_norm": 1.434976866992101, "learning_rate": 2.4313803745579318e-05, "loss": 1.4284, "step": 35720 }, { "epoch": 1371.8867924528302, "grad_norm": 1.4785579710843697, "learning_rate": 2.4275622003402272e-05, "loss": 1.442, "step": 35740 }, { "epoch": 1372.6415094339623, "grad_norm": 1.4377021405339876, "learning_rate": 2.4237464980145938e-05, "loss": 1.4585, "step": 35760 }, { "epoch": 1373.3962264150944, "grad_norm": 1.4439423657468624, "learning_rate": 2.4199332727565162e-05, "loss": 1.4415, "step": 35780 }, { "epoch": 1374.1509433962265, "grad_norm": 1.6286933767716432, "learning_rate": 2.4161225297381257e-05, "loss": 1.4191, "step": 35800 }, { "epoch": 1374.9056603773586, "grad_norm": 1.8061947503706157, "learning_rate": 2.412314274128181e-05, "loss": 1.4328, "step": 35820 }, { "epoch": 1375.6603773584907, "grad_norm": 1.4892866827277318, "learning_rate": 2.408508511092069e-05, "loss": 1.426, "step": 35840 }, { "epoch": 1376.4150943396226, "grad_norm": 2.1944517889347206, "learning_rate": 2.4047052457917976e-05, "loss": 1.4383, "step": 35860 }, { "epoch": 1377.1698113207547, "grad_norm": 1.657764612011157, "learning_rate": 2.4009044833859837e-05, "loss": 1.4335, "step": 35880 }, { "epoch": 1377.9245283018868, "grad_norm": 1.6641457685651413, "learning_rate": 2.397106229029853e-05, "loss": 1.449, "step": 35900 }, { "epoch": 1378.6792452830189, "grad_norm": 1.6180638342974163, "learning_rate": 2.3933104878752255e-05, "loss": 1.4531, "step": 35920 }, { "epoch": 1379.433962264151, "grad_norm": 1.4294375910343768, "learning_rate": 2.3895172650705135e-05, "loss": 1.394, "step": 35940 }, { "epoch": 1380.188679245283, "grad_norm": 1.8277501896092694, "learning_rate": 2.3857265657607175e-05, "loss": 1.3907, "step": 35960 }, { "epoch": 1380.9433962264152, "grad_norm": 1.498142714401942, "learning_rate": 2.381938395087408e-05, "loss": 1.427, "step": 35980 }, { "epoch": 1381.698113207547, "grad_norm": 1.6446695245077154, "learning_rate": 2.3781527581887328e-05, "loss": 1.4267, "step": 36000 }, { "epoch": 1382.4528301886792, "grad_norm": 2.126047948088478, "learning_rate": 2.3743696601993973e-05, "loss": 1.4513, "step": 36020 }, { "epoch": 1383.2075471698113, "grad_norm": 1.5906073184513956, "learning_rate": 2.3705891062506686e-05, "loss": 1.4468, "step": 36040 }, { "epoch": 1383.9622641509434, "grad_norm": 1.6659051387541641, "learning_rate": 2.366811101470359e-05, "loss": 1.4397, "step": 36060 }, { "epoch": 1384.7169811320755, "grad_norm": 1.7950603394090476, "learning_rate": 2.363035650982822e-05, "loss": 1.4314, "step": 36080 }, { "epoch": 1385.4716981132076, "grad_norm": 1.7227503126171113, "learning_rate": 2.359262759908953e-05, "loss": 1.4305, "step": 36100 }, { "epoch": 1386.2264150943397, "grad_norm": 1.5686879263532916, "learning_rate": 2.355492433366169e-05, "loss": 1.4606, "step": 36120 }, { "epoch": 1386.9811320754718, "grad_norm": 1.6010165898998077, "learning_rate": 2.3517246764684138e-05, "loss": 1.441, "step": 36140 }, { "epoch": 1387.7358490566037, "grad_norm": 3.491710911332113, "learning_rate": 2.3479594943261428e-05, "loss": 1.4341, "step": 36160 }, { "epoch": 1388.4905660377358, "grad_norm": 1.6931483101249463, "learning_rate": 2.3441968920463175e-05, "loss": 1.4059, "step": 36180 }, { "epoch": 1389.245283018868, "grad_norm": 1.5814288881168233, "learning_rate": 2.340436874732406e-05, "loss": 1.4494, "step": 36200 }, { "epoch": 1390.0, "grad_norm": 1.7550476965929234, "learning_rate": 2.3366794474843636e-05, "loss": 1.4461, "step": 36220 }, { "epoch": 1390.754716981132, "grad_norm": 1.6037325519139611, "learning_rate": 2.332924615398638e-05, "loss": 1.4324, "step": 36240 }, { "epoch": 1391.5094339622642, "grad_norm": 1.5872440902961078, "learning_rate": 2.3291723835681542e-05, "loss": 1.4229, "step": 36260 }, { "epoch": 1392.2641509433963, "grad_norm": 1.6075974238110624, "learning_rate": 2.3254227570823088e-05, "loss": 1.4319, "step": 36280 }, { "epoch": 1393.0188679245282, "grad_norm": 1.664082496030561, "learning_rate": 2.3216757410269688e-05, "loss": 1.4133, "step": 36300 }, { "epoch": 1393.7735849056603, "grad_norm": 1.868185444331913, "learning_rate": 2.3179313404844556e-05, "loss": 1.4303, "step": 36320 }, { "epoch": 1394.5283018867924, "grad_norm": 1.5709216565532331, "learning_rate": 2.314189560533549e-05, "loss": 1.4136, "step": 36340 }, { "epoch": 1395.2830188679245, "grad_norm": 1.6461901097795721, "learning_rate": 2.3104504062494673e-05, "loss": 1.4359, "step": 36360 }, { "epoch": 1396.0377358490566, "grad_norm": 1.4737937485692245, "learning_rate": 2.306713882703874e-05, "loss": 1.4417, "step": 36380 }, { "epoch": 1396.7924528301887, "grad_norm": 1.62600468664324, "learning_rate": 2.3029799949648578e-05, "loss": 1.4471, "step": 36400 }, { "epoch": 1397.5471698113208, "grad_norm": 2.4473530264247914, "learning_rate": 2.2992487480969405e-05, "loss": 1.4239, "step": 36420 }, { "epoch": 1398.301886792453, "grad_norm": 1.451788707298732, "learning_rate": 2.295520147161054e-05, "loss": 1.4213, "step": 36440 }, { "epoch": 1399.0566037735848, "grad_norm": 1.6561495842890779, "learning_rate": 2.2917941972145448e-05, "loss": 1.4289, "step": 36460 }, { "epoch": 1399.811320754717, "grad_norm": 1.7199804756862742, "learning_rate": 2.288070903311165e-05, "loss": 1.4089, "step": 36480 }, { "epoch": 1400.566037735849, "grad_norm": 1.3767860468778748, "learning_rate": 2.2843502705010602e-05, "loss": 1.43, "step": 36500 }, { "epoch": 1401.3207547169811, "grad_norm": 1.629044071752712, "learning_rate": 2.2806323038307724e-05, "loss": 1.4353, "step": 36520 }, { "epoch": 1402.0754716981132, "grad_norm": 1.5402931594748135, "learning_rate": 2.2769170083432224e-05, "loss": 1.4002, "step": 36540 }, { "epoch": 1402.8301886792453, "grad_norm": 1.6851610727649395, "learning_rate": 2.273204389077707e-05, "loss": 1.4303, "step": 36560 }, { "epoch": 1403.5849056603774, "grad_norm": 1.6351932980143555, "learning_rate": 2.2694944510698992e-05, "loss": 1.4324, "step": 36580 }, { "epoch": 1404.3396226415093, "grad_norm": 1.3360407707287731, "learning_rate": 2.265787199351829e-05, "loss": 1.4296, "step": 36600 }, { "epoch": 1405.0943396226414, "grad_norm": 1.6229856547835415, "learning_rate": 2.2620826389518878e-05, "loss": 1.4132, "step": 36620 }, { "epoch": 1405.8490566037735, "grad_norm": 1.5762261444691155, "learning_rate": 2.258380774894813e-05, "loss": 1.4189, "step": 36640 }, { "epoch": 1406.6037735849056, "grad_norm": 1.6330786646124598, "learning_rate": 2.254681612201684e-05, "loss": 1.4229, "step": 36660 }, { "epoch": 1407.3584905660377, "grad_norm": 1.6074464661210397, "learning_rate": 2.2509851558899212e-05, "loss": 1.4438, "step": 36680 }, { "epoch": 1408.1132075471698, "grad_norm": 2.912277484153031, "learning_rate": 2.2472914109732686e-05, "loss": 1.4195, "step": 36700 }, { "epoch": 1408.867924528302, "grad_norm": 1.6223740817719732, "learning_rate": 2.2436003824617963e-05, "loss": 1.4099, "step": 36720 }, { "epoch": 1409.622641509434, "grad_norm": 1.766781857646511, "learning_rate": 2.2399120753618896e-05, "loss": 1.4168, "step": 36740 }, { "epoch": 1410.377358490566, "grad_norm": 1.5296965456959557, "learning_rate": 2.2362264946762392e-05, "loss": 1.4118, "step": 36760 }, { "epoch": 1411.132075471698, "grad_norm": 1.6610041335566879, "learning_rate": 2.232543645403842e-05, "loss": 1.4166, "step": 36780 }, { "epoch": 1411.8867924528302, "grad_norm": 1.5205836616470723, "learning_rate": 2.228863532539987e-05, "loss": 1.4246, "step": 36800 }, { "epoch": 1412.6415094339623, "grad_norm": 2.018497485986653, "learning_rate": 2.2251861610762556e-05, "loss": 1.4219, "step": 36820 }, { "epoch": 1413.3962264150944, "grad_norm": 1.495393210690481, "learning_rate": 2.221511536000505e-05, "loss": 1.4201, "step": 36840 }, { "epoch": 1414.1509433962265, "grad_norm": 1.5817177891641536, "learning_rate": 2.2178396622968714e-05, "loss": 1.4301, "step": 36860 }, { "epoch": 1414.9056603773586, "grad_norm": 1.5602680564678848, "learning_rate": 2.2141705449457588e-05, "loss": 1.4246, "step": 36880 }, { "epoch": 1415.6603773584907, "grad_norm": 1.5687723652001904, "learning_rate": 2.2105041889238327e-05, "loss": 1.4291, "step": 36900 }, { "epoch": 1416.4150943396226, "grad_norm": 1.6516438298835592, "learning_rate": 2.2068405992040127e-05, "loss": 1.4186, "step": 36920 }, { "epoch": 1417.1698113207547, "grad_norm": 1.6972963029742167, "learning_rate": 2.2031797807554646e-05, "loss": 1.4026, "step": 36940 }, { "epoch": 1417.9245283018868, "grad_norm": 1.6936263753645908, "learning_rate": 2.1995217385435962e-05, "loss": 1.3882, "step": 36960 }, { "epoch": 1418.6792452830189, "grad_norm": 1.697372534880421, "learning_rate": 2.1958664775300517e-05, "loss": 1.4228, "step": 36980 }, { "epoch": 1419.433962264151, "grad_norm": 1.4972148012217616, "learning_rate": 2.192214002672703e-05, "loss": 1.3961, "step": 37000 }, { "epoch": 1420.188679245283, "grad_norm": 1.6926137674291781, "learning_rate": 2.1885643189256404e-05, "loss": 1.4005, "step": 37020 }, { "epoch": 1420.9433962264152, "grad_norm": 1.6182171449313734, "learning_rate": 2.1849174312391693e-05, "loss": 1.3939, "step": 37040 }, { "epoch": 1421.698113207547, "grad_norm": 1.6235165658387523, "learning_rate": 2.181273344559802e-05, "loss": 1.414, "step": 37060 }, { "epoch": 1422.4528301886792, "grad_norm": 1.7354641628437306, "learning_rate": 2.1776320638302533e-05, "loss": 1.4039, "step": 37080 }, { "epoch": 1423.2075471698113, "grad_norm": 1.7598777416483105, "learning_rate": 2.1739935939894332e-05, "loss": 1.4319, "step": 37100 }, { "epoch": 1423.9622641509434, "grad_norm": 1.6119817066147992, "learning_rate": 2.170357939972436e-05, "loss": 1.4083, "step": 37120 }, { "epoch": 1424.7169811320755, "grad_norm": 1.5177195143064601, "learning_rate": 2.1667251067105383e-05, "loss": 1.4084, "step": 37140 }, { "epoch": 1425.4716981132076, "grad_norm": 1.6531623474873094, "learning_rate": 2.1630950991311884e-05, "loss": 1.3961, "step": 37160 }, { "epoch": 1426.2264150943397, "grad_norm": 1.9866189092494402, "learning_rate": 2.159467922158006e-05, "loss": 1.4205, "step": 37180 }, { "epoch": 1426.9811320754718, "grad_norm": 1.6409536663163726, "learning_rate": 2.15584358071077e-05, "loss": 1.4065, "step": 37200 }, { "epoch": 1427.7358490566037, "grad_norm": 1.5972136032609723, "learning_rate": 2.1522220797054117e-05, "loss": 1.3999, "step": 37220 }, { "epoch": 1428.4905660377358, "grad_norm": 1.7176147072411343, "learning_rate": 2.1486034240540095e-05, "loss": 1.4077, "step": 37240 }, { "epoch": 1429.245283018868, "grad_norm": 2.2258677114656655, "learning_rate": 2.1449876186647868e-05, "loss": 1.4174, "step": 37260 }, { "epoch": 1430.0, "grad_norm": 1.702909141767608, "learning_rate": 2.1413746684420938e-05, "loss": 1.3745, "step": 37280 }, { "epoch": 1430.754716981132, "grad_norm": 2.1998514759828915, "learning_rate": 2.1377645782864164e-05, "loss": 1.421, "step": 37300 }, { "epoch": 1431.5094339622642, "grad_norm": 1.4634078104497494, "learning_rate": 2.134157353094355e-05, "loss": 1.4219, "step": 37320 }, { "epoch": 1432.2641509433963, "grad_norm": 1.7232746233155163, "learning_rate": 2.1305529977586244e-05, "loss": 1.4236, "step": 37340 }, { "epoch": 1433.0188679245282, "grad_norm": 1.7372503788909404, "learning_rate": 2.1269515171680505e-05, "loss": 1.391, "step": 37360 }, { "epoch": 1433.7735849056603, "grad_norm": 1.8471672382610358, "learning_rate": 2.1233529162075586e-05, "loss": 1.4087, "step": 37380 }, { "epoch": 1434.5283018867924, "grad_norm": 1.6217649320497987, "learning_rate": 2.1197571997581665e-05, "loss": 1.4239, "step": 37400 }, { "epoch": 1435.2830188679245, "grad_norm": 1.5296478429731253, "learning_rate": 2.1161643726969807e-05, "loss": 1.3958, "step": 37420 }, { "epoch": 1436.0377358490566, "grad_norm": 1.560888539193858, "learning_rate": 2.1125744398971865e-05, "loss": 1.3979, "step": 37440 }, { "epoch": 1436.7924528301887, "grad_norm": 4.524184692042414, "learning_rate": 2.1089874062280467e-05, "loss": 1.4068, "step": 37460 }, { "epoch": 1437.5471698113208, "grad_norm": 1.9907802095010148, "learning_rate": 2.1054032765548943e-05, "loss": 1.4128, "step": 37480 }, { "epoch": 1438.301886792453, "grad_norm": 1.5158745007996666, "learning_rate": 2.1018220557391152e-05, "loss": 1.4206, "step": 37500 }, { "epoch": 1439.0566037735848, "grad_norm": 1.922346737191028, "learning_rate": 2.0982437486381567e-05, "loss": 1.4155, "step": 37520 }, { "epoch": 1439.811320754717, "grad_norm": 1.6671399833246607, "learning_rate": 2.094668360105509e-05, "loss": 1.4052, "step": 37540 }, { "epoch": 1440.566037735849, "grad_norm": 1.6551308179910114, "learning_rate": 2.0910958949907086e-05, "loss": 1.3986, "step": 37560 }, { "epoch": 1441.3207547169811, "grad_norm": 1.5091823329163863, "learning_rate": 2.087526358139325e-05, "loss": 1.3842, "step": 37580 }, { "epoch": 1442.0754716981132, "grad_norm": 1.5775979624954766, "learning_rate": 2.0839597543929547e-05, "loss": 1.3695, "step": 37600 }, { "epoch": 1442.8301886792453, "grad_norm": 1.65888589339979, "learning_rate": 2.0803960885892166e-05, "loss": 1.4212, "step": 37620 }, { "epoch": 1443.5849056603774, "grad_norm": 1.7548961067858515, "learning_rate": 2.0768353655617437e-05, "loss": 1.4113, "step": 37640 }, { "epoch": 1444.3396226415093, "grad_norm": 1.6568444527605615, "learning_rate": 2.0732775901401787e-05, "loss": 1.4097, "step": 37660 }, { "epoch": 1445.0943396226414, "grad_norm": 1.569007851847122, "learning_rate": 2.0697227671501686e-05, "loss": 1.4025, "step": 37680 }, { "epoch": 1445.8490566037735, "grad_norm": 2.054795832820314, "learning_rate": 2.0661709014133507e-05, "loss": 1.379, "step": 37700 }, { "epoch": 1446.6037735849056, "grad_norm": 1.825419627550906, "learning_rate": 2.0626219977473546e-05, "loss": 1.4141, "step": 37720 }, { "epoch": 1447.3584905660377, "grad_norm": 1.5477059924334846, "learning_rate": 2.05907606096579e-05, "loss": 1.3764, "step": 37740 }, { "epoch": 1448.1132075471698, "grad_norm": 1.6756003225697567, "learning_rate": 2.0555330958782456e-05, "loss": 1.3943, "step": 37760 }, { "epoch": 1448.867924528302, "grad_norm": 1.6181024178942431, "learning_rate": 2.0519931072902775e-05, "loss": 1.3828, "step": 37780 }, { "epoch": 1449.622641509434, "grad_norm": 1.5075137093108786, "learning_rate": 2.0484561000034048e-05, "loss": 1.3993, "step": 37800 }, { "epoch": 1450.377358490566, "grad_norm": 1.7611806649373956, "learning_rate": 2.0449220788151017e-05, "loss": 1.4025, "step": 37820 }, { "epoch": 1451.132075471698, "grad_norm": 1.5899695714047575, "learning_rate": 2.0413910485187918e-05, "loss": 1.4011, "step": 37840 }, { "epoch": 1451.8867924528302, "grad_norm": 1.9368764939203147, "learning_rate": 2.0378630139038477e-05, "loss": 1.3914, "step": 37860 }, { "epoch": 1452.6415094339623, "grad_norm": 2.2298811573087938, "learning_rate": 2.0343379797555718e-05, "loss": 1.4096, "step": 37880 }, { "epoch": 1453.3962264150944, "grad_norm": 1.8812446095800621, "learning_rate": 2.0308159508552003e-05, "loss": 1.3994, "step": 37900 }, { "epoch": 1454.1509433962265, "grad_norm": 1.546640573436516, "learning_rate": 2.0272969319798898e-05, "loss": 1.3901, "step": 37920 }, { "epoch": 1454.9056603773586, "grad_norm": 1.6385943250375863, "learning_rate": 2.0237809279027187e-05, "loss": 1.3954, "step": 37940 }, { "epoch": 1455.6603773584907, "grad_norm": 1.5716563974399815, "learning_rate": 2.0202679433926757e-05, "loss": 1.3935, "step": 37960 }, { "epoch": 1456.4150943396226, "grad_norm": 1.735984609092, "learning_rate": 2.0167579832146505e-05, "loss": 1.4118, "step": 37980 }, { "epoch": 1457.1698113207547, "grad_norm": 1.7313373844094564, "learning_rate": 2.013251052129433e-05, "loss": 1.3767, "step": 38000 }, { "epoch": 1457.9245283018868, "grad_norm": 1.7960534040522838, "learning_rate": 2.0097471548937024e-05, "loss": 1.3803, "step": 38020 }, { "epoch": 1458.6792452830189, "grad_norm": 2.5283310401144434, "learning_rate": 2.0062462962600258e-05, "loss": 1.3763, "step": 38040 }, { "epoch": 1459.433962264151, "grad_norm": 1.5401697781512245, "learning_rate": 2.0027484809768506e-05, "loss": 1.3768, "step": 38060 }, { "epoch": 1460.188679245283, "grad_norm": 2.291171375246112, "learning_rate": 1.9992537137884905e-05, "loss": 1.389, "step": 38080 }, { "epoch": 1460.9433962264152, "grad_norm": 1.6878149956470094, "learning_rate": 1.9957619994351278e-05, "loss": 1.3978, "step": 38100 }, { "epoch": 1461.698113207547, "grad_norm": 1.9279881821004916, "learning_rate": 1.9922733426528033e-05, "loss": 1.3576, "step": 38120 }, { "epoch": 1462.4528301886792, "grad_norm": 1.6593792348690906, "learning_rate": 1.9887877481734122e-05, "loss": 1.3827, "step": 38140 }, { "epoch": 1463.2075471698113, "grad_norm": 1.6870370599265458, "learning_rate": 1.9853052207246967e-05, "loss": 1.3498, "step": 38160 }, { "epoch": 1463.9622641509434, "grad_norm": 1.5201114526632646, "learning_rate": 1.981825765030236e-05, "loss": 1.3972, "step": 38180 }, { "epoch": 1464.7169811320755, "grad_norm": 1.676738216954013, "learning_rate": 1.9783493858094444e-05, "loss": 1.3751, "step": 38200 }, { "epoch": 1465.4716981132076, "grad_norm": 1.7174190166635537, "learning_rate": 1.9748760877775622e-05, "loss": 1.3723, "step": 38220 }, { "epoch": 1466.2264150943397, "grad_norm": 1.6832142484740018, "learning_rate": 1.9714058756456533e-05, "loss": 1.383, "step": 38240 }, { "epoch": 1466.9811320754718, "grad_norm": 1.6409050107329164, "learning_rate": 1.9679387541205946e-05, "loss": 1.3868, "step": 38260 }, { "epoch": 1467.7358490566037, "grad_norm": 1.870348325922022, "learning_rate": 1.96447472790507e-05, "loss": 1.4093, "step": 38280 }, { "epoch": 1468.4905660377358, "grad_norm": 1.622451000807429, "learning_rate": 1.9610138016975643e-05, "loss": 1.3908, "step": 38300 }, { "epoch": 1469.245283018868, "grad_norm": 1.5580142525601877, "learning_rate": 1.9575559801923602e-05, "loss": 1.3519, "step": 38320 }, { "epoch": 1470.0, "grad_norm": 5.308564996067893, "learning_rate": 1.95410126807953e-05, "loss": 1.3806, "step": 38340 }, { "epoch": 1470.754716981132, "grad_norm": 1.697128624848967, "learning_rate": 1.9506496700449247e-05, "loss": 1.4021, "step": 38360 }, { "epoch": 1471.5094339622642, "grad_norm": 1.6407506564774348, "learning_rate": 1.9472011907701736e-05, "loss": 1.3889, "step": 38380 }, { "epoch": 1472.2641509433963, "grad_norm": 2.5227247039872567, "learning_rate": 1.9437558349326745e-05, "loss": 1.3656, "step": 38400 }, { "epoch": 1473.0188679245282, "grad_norm": 2.0102914189329155, "learning_rate": 1.9403136072055903e-05, "loss": 1.3631, "step": 38420 }, { "epoch": 1473.7735849056603, "grad_norm": 1.8549253610315775, "learning_rate": 1.9368745122578427e-05, "loss": 1.3835, "step": 38440 }, { "epoch": 1474.5283018867924, "grad_norm": 1.7474837425802672, "learning_rate": 1.9334385547541004e-05, "loss": 1.3876, "step": 38460 }, { "epoch": 1475.2830188679245, "grad_norm": 1.4221446206180013, "learning_rate": 1.930005739354778e-05, "loss": 1.3875, "step": 38480 }, { "epoch": 1476.0377358490566, "grad_norm": 1.567704112230289, "learning_rate": 1.926576070716028e-05, "loss": 1.3787, "step": 38500 }, { "epoch": 1476.7924528301887, "grad_norm": 1.8092743011121888, "learning_rate": 1.9231495534897356e-05, "loss": 1.3746, "step": 38520 }, { "epoch": 1477.5471698113208, "grad_norm": 1.9871542434365639, "learning_rate": 1.919726192323512e-05, "loss": 1.4062, "step": 38540 }, { "epoch": 1478.301886792453, "grad_norm": 2.100192891688555, "learning_rate": 1.916305991860687e-05, "loss": 1.372, "step": 38560 }, { "epoch": 1479.0566037735848, "grad_norm": 1.689968827696773, "learning_rate": 1.912888956740302e-05, "loss": 1.3994, "step": 38580 }, { "epoch": 1479.811320754717, "grad_norm": 1.59619952456533, "learning_rate": 1.9094750915971053e-05, "loss": 1.3547, "step": 38600 }, { "epoch": 1480.566037735849, "grad_norm": 3.206605320072948, "learning_rate": 1.9060644010615473e-05, "loss": 1.4052, "step": 38620 }, { "epoch": 1481.3207547169811, "grad_norm": 1.5795369303879008, "learning_rate": 1.9026568897597735e-05, "loss": 1.3921, "step": 38640 }, { "epoch": 1482.0754716981132, "grad_norm": 2.1910690965934467, "learning_rate": 1.8992525623136132e-05, "loss": 1.3563, "step": 38660 }, { "epoch": 1482.8301886792453, "grad_norm": 1.5353645456337577, "learning_rate": 1.8958514233405793e-05, "loss": 1.4077, "step": 38680 }, { "epoch": 1483.5849056603774, "grad_norm": 1.7836996022414107, "learning_rate": 1.8924534774538593e-05, "loss": 1.3824, "step": 38700 }, { "epoch": 1484.3396226415093, "grad_norm": 1.6136317181444138, "learning_rate": 1.8890587292623113e-05, "loss": 1.3511, "step": 38720 }, { "epoch": 1485.0943396226414, "grad_norm": 1.8211866581007339, "learning_rate": 1.8856671833704565e-05, "loss": 1.3725, "step": 38740 }, { "epoch": 1485.8490566037735, "grad_norm": 1.5979573815344084, "learning_rate": 1.8822788443784704e-05, "loss": 1.3571, "step": 38760 }, { "epoch": 1486.6037735849056, "grad_norm": 1.6365464316772047, "learning_rate": 1.878893716882177e-05, "loss": 1.3588, "step": 38780 }, { "epoch": 1487.3584905660377, "grad_norm": 2.811912712405292, "learning_rate": 1.8755118054730514e-05, "loss": 1.3823, "step": 38800 }, { "epoch": 1488.1132075471698, "grad_norm": 1.684830593576563, "learning_rate": 1.8721331147381986e-05, "loss": 1.3604, "step": 38820 }, { "epoch": 1488.867924528302, "grad_norm": 1.5101192259883982, "learning_rate": 1.868757649260362e-05, "loss": 1.3712, "step": 38840 }, { "epoch": 1489.622641509434, "grad_norm": 1.8516527403548584, "learning_rate": 1.8653854136179047e-05, "loss": 1.3576, "step": 38860 }, { "epoch": 1490.377358490566, "grad_norm": 1.5630443819078437, "learning_rate": 1.8620164123848113e-05, "loss": 1.3729, "step": 38880 }, { "epoch": 1491.132075471698, "grad_norm": 1.9558078371048477, "learning_rate": 1.8586506501306792e-05, "loss": 1.3466, "step": 38900 }, { "epoch": 1491.8867924528302, "grad_norm": 1.6581722869425195, "learning_rate": 1.8552881314207158e-05, "loss": 1.3547, "step": 38920 }, { "epoch": 1492.6415094339623, "grad_norm": 1.9162311420660751, "learning_rate": 1.8519288608157236e-05, "loss": 1.3995, "step": 38940 }, { "epoch": 1493.3962264150944, "grad_norm": 2.8463480242853874, "learning_rate": 1.8485728428721025e-05, "loss": 1.3609, "step": 38960 }, { "epoch": 1494.1509433962265, "grad_norm": 1.7832047879021928, "learning_rate": 1.845220082141838e-05, "loss": 1.3966, "step": 38980 }, { "epoch": 1494.9056603773586, "grad_norm": 1.628697490406908, "learning_rate": 1.841870583172502e-05, "loss": 1.3577, "step": 39000 }, { "epoch": 1495.6603773584907, "grad_norm": 1.7499682485349517, "learning_rate": 1.8385243505072403e-05, "loss": 1.3634, "step": 39020 }, { "epoch": 1496.4150943396226, "grad_norm": 2.369232208734949, "learning_rate": 1.835181388684767e-05, "loss": 1.3804, "step": 39040 }, { "epoch": 1497.1698113207547, "grad_norm": 2.002186669217615, "learning_rate": 1.8318417022393614e-05, "loss": 1.3775, "step": 39060 }, { "epoch": 1497.9245283018868, "grad_norm": 1.7745981101584183, "learning_rate": 1.8285052957008572e-05, "loss": 1.3678, "step": 39080 }, { "epoch": 1498.6792452830189, "grad_norm": 2.3506034698380027, "learning_rate": 1.825172173594644e-05, "loss": 1.3819, "step": 39100 }, { "epoch": 1499.433962264151, "grad_norm": 1.5587811175176152, "learning_rate": 1.8218423404416543e-05, "loss": 1.3623, "step": 39120 }, { "epoch": 1500.188679245283, "grad_norm": 1.5407388891782507, "learning_rate": 1.818515800758359e-05, "loss": 1.3737, "step": 39140 }, { "epoch": 1500.9433962264152, "grad_norm": 1.7105290658502008, "learning_rate": 1.8151925590567624e-05, "loss": 1.3416, "step": 39160 }, { "epoch": 1501.698113207547, "grad_norm": 2.1160472894699973, "learning_rate": 1.811872619844394e-05, "loss": 1.3596, "step": 39180 }, { "epoch": 1502.4528301886792, "grad_norm": 1.7134114803577327, "learning_rate": 1.8085559876243068e-05, "loss": 1.3486, "step": 39200 }, { "epoch": 1503.2075471698113, "grad_norm": 1.5742520539626361, "learning_rate": 1.805242666895068e-05, "loss": 1.3737, "step": 39220 }, { "epoch": 1503.9622641509434, "grad_norm": 1.6841300143409803, "learning_rate": 1.8019326621507504e-05, "loss": 1.3593, "step": 39240 }, { "epoch": 1504.7169811320755, "grad_norm": 1.4678089134086005, "learning_rate": 1.7986259778809304e-05, "loss": 1.3332, "step": 39260 }, { "epoch": 1505.4716981132076, "grad_norm": 1.5583137022685134, "learning_rate": 1.7953226185706828e-05, "loss": 1.3532, "step": 39280 }, { "epoch": 1506.2264150943397, "grad_norm": 1.754522974870956, "learning_rate": 1.7920225887005686e-05, "loss": 1.3969, "step": 39300 }, { "epoch": 1506.9811320754718, "grad_norm": 2.329959975485945, "learning_rate": 1.788725892746638e-05, "loss": 1.3693, "step": 39320 }, { "epoch": 1507.7358490566037, "grad_norm": 1.9736365954487893, "learning_rate": 1.7854325351804138e-05, "loss": 1.3545, "step": 39340 }, { "epoch": 1508.4905660377358, "grad_norm": 2.0873147005956274, "learning_rate": 1.782142520468893e-05, "loss": 1.357, "step": 39360 }, { "epoch": 1509.245283018868, "grad_norm": 1.8565982391914584, "learning_rate": 1.7788558530745406e-05, "loss": 1.3574, "step": 39380 }, { "epoch": 1510.0, "grad_norm": 1.3940404402455406, "learning_rate": 1.7755725374552767e-05, "loss": 1.3322, "step": 39400 }, { "epoch": 1510.754716981132, "grad_norm": 1.6757375238937267, "learning_rate": 1.772292578064481e-05, "loss": 1.3562, "step": 39420 }, { "epoch": 1511.5094339622642, "grad_norm": 2.021397727623104, "learning_rate": 1.769015979350977e-05, "loss": 1.3494, "step": 39440 }, { "epoch": 1512.2641509433963, "grad_norm": 1.9073518931594837, "learning_rate": 1.7657427457590277e-05, "loss": 1.3469, "step": 39460 }, { "epoch": 1513.0188679245282, "grad_norm": 1.5299281651503949, "learning_rate": 1.7624728817283386e-05, "loss": 1.3347, "step": 39480 }, { "epoch": 1513.7735849056603, "grad_norm": 1.6171286973533487, "learning_rate": 1.7592063916940385e-05, "loss": 1.3781, "step": 39500 }, { "epoch": 1514.5283018867924, "grad_norm": 2.3151167774892283, "learning_rate": 1.7559432800866844e-05, "loss": 1.3389, "step": 39520 }, { "epoch": 1515.2830188679245, "grad_norm": 1.8404517143557557, "learning_rate": 1.752683551332248e-05, "loss": 1.3809, "step": 39540 }, { "epoch": 1516.0377358490566, "grad_norm": 1.80165740372062, "learning_rate": 1.749427209852112e-05, "loss": 1.3647, "step": 39560 }, { "epoch": 1516.7924528301887, "grad_norm": 1.5503005965319303, "learning_rate": 1.7461742600630684e-05, "loss": 1.3553, "step": 39580 }, { "epoch": 1517.5471698113208, "grad_norm": 1.7389286642537964, "learning_rate": 1.7429247063773047e-05, "loss": 1.3566, "step": 39600 }, { "epoch": 1518.301886792453, "grad_norm": 1.5514338805704833, "learning_rate": 1.7396785532024062e-05, "loss": 1.3771, "step": 39620 }, { "epoch": 1519.0566037735848, "grad_norm": 1.738553891820026, "learning_rate": 1.7364358049413427e-05, "loss": 1.3608, "step": 39640 }, { "epoch": 1519.811320754717, "grad_norm": 2.2590021667446476, "learning_rate": 1.7331964659924647e-05, "loss": 1.3594, "step": 39660 }, { "epoch": 1520.566037735849, "grad_norm": 1.8008534873454645, "learning_rate": 1.729960540749503e-05, "loss": 1.3446, "step": 39680 }, { "epoch": 1521.3207547169811, "grad_norm": 1.9823359457338208, "learning_rate": 1.7267280336015543e-05, "loss": 1.3604, "step": 39700 }, { "epoch": 1522.0754716981132, "grad_norm": 1.6630862297023916, "learning_rate": 1.723498948933081e-05, "loss": 1.3831, "step": 39720 }, { "epoch": 1522.8301886792453, "grad_norm": 1.9271729195919085, "learning_rate": 1.720273291123901e-05, "loss": 1.3571, "step": 39740 }, { "epoch": 1523.5849056603774, "grad_norm": 1.6944904437475812, "learning_rate": 1.7170510645491884e-05, "loss": 1.3845, "step": 39760 }, { "epoch": 1524.3396226415093, "grad_norm": 2.0111059446030164, "learning_rate": 1.7138322735794582e-05, "loss": 1.3464, "step": 39780 }, { "epoch": 1525.0943396226414, "grad_norm": 1.6636863494806655, "learning_rate": 1.7106169225805703e-05, "loss": 1.3472, "step": 39800 }, { "epoch": 1525.8490566037735, "grad_norm": 1.654778862655826, "learning_rate": 1.7074050159137155e-05, "loss": 1.3517, "step": 39820 }, { "epoch": 1526.6037735849056, "grad_norm": 1.9255620043148591, "learning_rate": 1.7041965579354115e-05, "loss": 1.359, "step": 39840 }, { "epoch": 1527.3584905660377, "grad_norm": 1.7612527068488755, "learning_rate": 1.7009915529975046e-05, "loss": 1.3535, "step": 39860 }, { "epoch": 1528.1132075471698, "grad_norm": 1.6440495946289901, "learning_rate": 1.69779000544715e-05, "loss": 1.3275, "step": 39880 }, { "epoch": 1528.867924528302, "grad_norm": 1.8436152132956103, "learning_rate": 1.6945919196268195e-05, "loss": 1.3269, "step": 39900 }, { "epoch": 1529.622641509434, "grad_norm": 1.5249165119761414, "learning_rate": 1.6913972998742855e-05, "loss": 1.3528, "step": 39920 }, { "epoch": 1530.377358490566, "grad_norm": 1.7116936271233165, "learning_rate": 1.6882061505226197e-05, "loss": 1.3351, "step": 39940 }, { "epoch": 1531.132075471698, "grad_norm": 1.9127818443391411, "learning_rate": 1.68501847590019e-05, "loss": 1.3649, "step": 39960 }, { "epoch": 1531.8867924528302, "grad_norm": 1.7968703663627887, "learning_rate": 1.681834280330646e-05, "loss": 1.3664, "step": 39980 }, { "epoch": 1532.6415094339623, "grad_norm": 2.01086719476703, "learning_rate": 1.6786535681329242e-05, "loss": 1.3354, "step": 40000 }, { "epoch": 1533.3962264150944, "grad_norm": 1.8971146877595166, "learning_rate": 1.6754763436212318e-05, "loss": 1.3459, "step": 40020 }, { "epoch": 1534.1509433962265, "grad_norm": 1.5538558777058122, "learning_rate": 1.6723026111050465e-05, "loss": 1.348, "step": 40040 }, { "epoch": 1534.9056603773586, "grad_norm": 1.6899684943437072, "learning_rate": 1.6691323748891116e-05, "loss": 1.3219, "step": 40060 }, { "epoch": 1535.6603773584907, "grad_norm": 1.5696983044378243, "learning_rate": 1.6659656392734248e-05, "loss": 1.3523, "step": 40080 }, { "epoch": 1536.4150943396226, "grad_norm": 2.008558955924781, "learning_rate": 1.6628024085532394e-05, "loss": 1.3507, "step": 40100 }, { "epoch": 1537.1698113207547, "grad_norm": 1.6814099912171956, "learning_rate": 1.6596426870190517e-05, "loss": 1.3271, "step": 40120 }, { "epoch": 1537.9245283018868, "grad_norm": 1.6392939792056798, "learning_rate": 1.6564864789566017e-05, "loss": 1.3628, "step": 40140 }, { "epoch": 1538.6792452830189, "grad_norm": 1.6937327457602671, "learning_rate": 1.6533337886468593e-05, "loss": 1.3457, "step": 40160 }, { "epoch": 1539.433962264151, "grad_norm": 2.0312461746808674, "learning_rate": 1.650184620366025e-05, "loss": 1.345, "step": 40180 }, { "epoch": 1540.188679245283, "grad_norm": 1.6324403361347462, "learning_rate": 1.647038978385525e-05, "loss": 1.3614, "step": 40200 }, { "epoch": 1540.9433962264152, "grad_norm": 1.8937794347785448, "learning_rate": 1.643896866971998e-05, "loss": 1.3485, "step": 40220 }, { "epoch": 1541.698113207547, "grad_norm": 1.66458657626364, "learning_rate": 1.6407582903872977e-05, "loss": 1.3201, "step": 40240 }, { "epoch": 1542.4528301886792, "grad_norm": 1.637256903291043, "learning_rate": 1.637623252888481e-05, "loss": 1.3287, "step": 40260 }, { "epoch": 1543.2075471698113, "grad_norm": 1.771255607485422, "learning_rate": 1.634491758727804e-05, "loss": 1.3386, "step": 40280 }, { "epoch": 1543.9622641509434, "grad_norm": 2.294826209947056, "learning_rate": 1.6313638121527195e-05, "loss": 1.3443, "step": 40300 }, { "epoch": 1544.7169811320755, "grad_norm": 1.5369973618999444, "learning_rate": 1.6282394174058652e-05, "loss": 1.3199, "step": 40320 }, { "epoch": 1545.4716981132076, "grad_norm": 1.7805574251016163, "learning_rate": 1.6251185787250646e-05, "loss": 1.3427, "step": 40340 }, { "epoch": 1546.2264150943397, "grad_norm": 1.7055546669575088, "learning_rate": 1.6220013003433163e-05, "loss": 1.3595, "step": 40360 }, { "epoch": 1546.9811320754718, "grad_norm": 1.6493151345521173, "learning_rate": 1.618887586488787e-05, "loss": 1.3417, "step": 40380 }, { "epoch": 1547.7358490566037, "grad_norm": 1.7099299752279526, "learning_rate": 1.6157774413848147e-05, "loss": 1.3286, "step": 40400 }, { "epoch": 1548.4905660377358, "grad_norm": 1.6461054638879455, "learning_rate": 1.61267086924989e-05, "loss": 1.3651, "step": 40420 }, { "epoch": 1549.245283018868, "grad_norm": 2.239209937375333, "learning_rate": 1.6095678742976643e-05, "loss": 1.3402, "step": 40440 }, { "epoch": 1550.0, "grad_norm": 1.9293560078530108, "learning_rate": 1.6064684607369317e-05, "loss": 1.3566, "step": 40460 }, { "epoch": 1550.754716981132, "grad_norm": 1.7850904902946119, "learning_rate": 1.603372632771629e-05, "loss": 1.3522, "step": 40480 }, { "epoch": 1551.5094339622642, "grad_norm": 1.8694667431709797, "learning_rate": 1.6002803946008334e-05, "loss": 1.3254, "step": 40500 }, { "epoch": 1552.2641509433963, "grad_norm": 1.8970095488709016, "learning_rate": 1.5971917504187483e-05, "loss": 1.3456, "step": 40520 }, { "epoch": 1553.0188679245282, "grad_norm": 1.473985828394077, "learning_rate": 1.5941067044147068e-05, "loss": 1.3425, "step": 40540 }, { "epoch": 1553.7735849056603, "grad_norm": 2.4173810451888436, "learning_rate": 1.591025260773159e-05, "loss": 1.3616, "step": 40560 }, { "epoch": 1554.5283018867924, "grad_norm": 4.978545098723231, "learning_rate": 1.587947423673667e-05, "loss": 1.3302, "step": 40580 }, { "epoch": 1555.2830188679245, "grad_norm": 1.8210531663627934, "learning_rate": 1.5848731972909058e-05, "loss": 1.3208, "step": 40600 }, { "epoch": 1556.0377358490566, "grad_norm": 1.7214332519076236, "learning_rate": 1.5818025857946504e-05, "loss": 1.3429, "step": 40620 }, { "epoch": 1556.7924528301887, "grad_norm": 1.7715531344419837, "learning_rate": 1.5787355933497722e-05, "loss": 1.3236, "step": 40640 }, { "epoch": 1557.5471698113208, "grad_norm": 1.6774415891925254, "learning_rate": 1.5756722241162336e-05, "loss": 1.3038, "step": 40660 }, { "epoch": 1558.301886792453, "grad_norm": 1.7378768400910978, "learning_rate": 1.5726124822490856e-05, "loss": 1.3393, "step": 40680 }, { "epoch": 1559.0566037735848, "grad_norm": 1.5050468895919773, "learning_rate": 1.569556371898455e-05, "loss": 1.3169, "step": 40700 }, { "epoch": 1559.811320754717, "grad_norm": 1.7494465787076923, "learning_rate": 1.5665038972095462e-05, "loss": 1.3219, "step": 40720 }, { "epoch": 1560.566037735849, "grad_norm": 1.942070137365104, "learning_rate": 1.563455062322631e-05, "loss": 1.3331, "step": 40740 }, { "epoch": 1561.3207547169811, "grad_norm": 1.4760834184650184, "learning_rate": 1.560409871373043e-05, "loss": 1.3371, "step": 40760 }, { "epoch": 1562.0754716981132, "grad_norm": 1.794169465456889, "learning_rate": 1.5573683284911766e-05, "loss": 1.361, "step": 40780 }, { "epoch": 1562.8301886792453, "grad_norm": 1.5717564295595021, "learning_rate": 1.5543304378024745e-05, "loss": 1.3198, "step": 40800 }, { "epoch": 1563.5849056603774, "grad_norm": 1.679663629392091, "learning_rate": 1.5512962034274292e-05, "loss": 1.3225, "step": 40820 }, { "epoch": 1564.3396226415093, "grad_norm": 1.5963236435216681, "learning_rate": 1.5482656294815706e-05, "loss": 1.3475, "step": 40840 }, { "epoch": 1565.0943396226414, "grad_norm": 1.6282234240786269, "learning_rate": 1.5452387200754648e-05, "loss": 1.341, "step": 40860 }, { "epoch": 1565.8490566037735, "grad_norm": 1.8356401444891661, "learning_rate": 1.542215479314709e-05, "loss": 1.3093, "step": 40880 }, { "epoch": 1566.6037735849056, "grad_norm": 1.802086287293627, "learning_rate": 1.5391959112999222e-05, "loss": 1.3234, "step": 40900 }, { "epoch": 1567.3584905660377, "grad_norm": 1.596543322520551, "learning_rate": 1.536180020126744e-05, "loss": 1.3207, "step": 40920 }, { "epoch": 1568.1132075471698, "grad_norm": 1.5925040590351016, "learning_rate": 1.5331678098858253e-05, "loss": 1.3434, "step": 40940 }, { "epoch": 1568.867924528302, "grad_norm": 1.7805635796964523, "learning_rate": 1.5301592846628236e-05, "loss": 1.3189, "step": 40960 }, { "epoch": 1569.622641509434, "grad_norm": 2.270882122194477, "learning_rate": 1.5271544485384005e-05, "loss": 1.3331, "step": 40980 }, { "epoch": 1570.377358490566, "grad_norm": 1.5460374945916004, "learning_rate": 1.524153305588211e-05, "loss": 1.3307, "step": 41000 }, { "epoch": 1571.132075471698, "grad_norm": 1.5240728933202146, "learning_rate": 1.5211558598829046e-05, "loss": 1.3261, "step": 41020 }, { "epoch": 1571.8867924528302, "grad_norm": 1.6551356652204947, "learning_rate": 1.518162115488113e-05, "loss": 1.3444, "step": 41040 }, { "epoch": 1572.6415094339623, "grad_norm": 2.3662389207443897, "learning_rate": 1.5151720764644462e-05, "loss": 1.3078, "step": 41060 }, { "epoch": 1573.3962264150944, "grad_norm": 2.0201453815678336, "learning_rate": 1.5121857468674923e-05, "loss": 1.2931, "step": 41080 }, { "epoch": 1574.1509433962265, "grad_norm": 2.1833616270471428, "learning_rate": 1.509203130747807e-05, "loss": 1.3113, "step": 41100 }, { "epoch": 1574.9056603773586, "grad_norm": 1.6606152578025972, "learning_rate": 1.506224232150908e-05, "loss": 1.3488, "step": 41120 }, { "epoch": 1575.6603773584907, "grad_norm": 2.2621888669728776, "learning_rate": 1.5032490551172706e-05, "loss": 1.293, "step": 41140 }, { "epoch": 1576.4150943396226, "grad_norm": 1.8118753564672168, "learning_rate": 1.5002776036823215e-05, "loss": 1.3288, "step": 41160 }, { "epoch": 1577.1698113207547, "grad_norm": 1.4675478833771125, "learning_rate": 1.4973098818764368e-05, "loss": 1.3181, "step": 41180 }, { "epoch": 1577.9245283018868, "grad_norm": 1.5426718546178322, "learning_rate": 1.4943458937249337e-05, "loss": 1.3041, "step": 41200 }, { "epoch": 1578.6792452830189, "grad_norm": 1.7915801444691424, "learning_rate": 1.4913856432480624e-05, "loss": 1.3244, "step": 41220 }, { "epoch": 1579.433962264151, "grad_norm": 1.6284356095147676, "learning_rate": 1.4884291344610055e-05, "loss": 1.3623, "step": 41240 }, { "epoch": 1580.188679245283, "grad_norm": 2.018808164067539, "learning_rate": 1.4854763713738692e-05, "loss": 1.3265, "step": 41260 }, { "epoch": 1580.9433962264152, "grad_norm": 1.9982741446146173, "learning_rate": 1.48252735799168e-05, "loss": 1.3174, "step": 41280 }, { "epoch": 1581.698113207547, "grad_norm": 1.6089408891188777, "learning_rate": 1.4795820983143804e-05, "loss": 1.3054, "step": 41300 }, { "epoch": 1582.4528301886792, "grad_norm": 1.9322291254142352, "learning_rate": 1.4766405963368183e-05, "loss": 1.3288, "step": 41320 }, { "epoch": 1583.2075471698113, "grad_norm": 2.1500428010731105, "learning_rate": 1.4737028560487459e-05, "loss": 1.3251, "step": 41340 }, { "epoch": 1583.9622641509434, "grad_norm": 1.6754742169090076, "learning_rate": 1.470768881434812e-05, "loss": 1.3111, "step": 41360 }, { "epoch": 1584.7169811320755, "grad_norm": 2.0456793462392864, "learning_rate": 1.4678386764745604e-05, "loss": 1.2852, "step": 41380 }, { "epoch": 1585.4716981132076, "grad_norm": 1.6265186141557229, "learning_rate": 1.4649122451424216e-05, "loss": 1.3246, "step": 41400 }, { "epoch": 1586.2264150943397, "grad_norm": 1.7034501168928484, "learning_rate": 1.4619895914077052e-05, "loss": 1.3061, "step": 41420 }, { "epoch": 1586.9811320754718, "grad_norm": 2.5172683919046834, "learning_rate": 1.459070719234599e-05, "loss": 1.3287, "step": 41440 }, { "epoch": 1587.7358490566037, "grad_norm": 1.681004851075849, "learning_rate": 1.4561556325821593e-05, "loss": 1.307, "step": 41460 }, { "epoch": 1588.4905660377358, "grad_norm": 1.9545864689840218, "learning_rate": 1.4532443354043108e-05, "loss": 1.3015, "step": 41480 }, { "epoch": 1589.245283018868, "grad_norm": 2.2238938165489186, "learning_rate": 1.4503368316498385e-05, "loss": 1.3323, "step": 41500 }, { "epoch": 1590.0, "grad_norm": 2.0846781514374704, "learning_rate": 1.4474331252623795e-05, "loss": 1.3273, "step": 41520 }, { "epoch": 1590.754716981132, "grad_norm": 1.677709576953602, "learning_rate": 1.44453322018042e-05, "loss": 1.3035, "step": 41540 }, { "epoch": 1591.5094339622642, "grad_norm": 1.8283093943528037, "learning_rate": 1.4416371203372931e-05, "loss": 1.3261, "step": 41560 }, { "epoch": 1592.2641509433963, "grad_norm": 1.5077647603013566, "learning_rate": 1.4387448296611699e-05, "loss": 1.3039, "step": 41580 }, { "epoch": 1593.0188679245282, "grad_norm": 1.7410910545917078, "learning_rate": 1.4358563520750539e-05, "loss": 1.3073, "step": 41600 }, { "epoch": 1593.7735849056603, "grad_norm": 1.7856680678107866, "learning_rate": 1.4329716914967761e-05, "loss": 1.3128, "step": 41620 }, { "epoch": 1594.5283018867924, "grad_norm": 2.291591913470402, "learning_rate": 1.4300908518389904e-05, "loss": 1.3067, "step": 41640 }, { "epoch": 1595.2830188679245, "grad_norm": 1.785529683540441, "learning_rate": 1.42721383700917e-05, "loss": 1.3338, "step": 41660 }, { "epoch": 1596.0377358490566, "grad_norm": 1.5806566985978232, "learning_rate": 1.4243406509096e-05, "loss": 1.3212, "step": 41680 }, { "epoch": 1596.7924528301887, "grad_norm": 1.6779162023356309, "learning_rate": 1.4214712974373703e-05, "loss": 1.304, "step": 41700 }, { "epoch": 1597.5471698113208, "grad_norm": 1.7562533688231816, "learning_rate": 1.418605780484373e-05, "loss": 1.2875, "step": 41720 }, { "epoch": 1598.301886792453, "grad_norm": 1.7567734296175508, "learning_rate": 1.4157441039372966e-05, "loss": 1.3145, "step": 41740 }, { "epoch": 1599.0566037735848, "grad_norm": 1.722702892703264, "learning_rate": 1.4128862716776218e-05, "loss": 1.3019, "step": 41760 }, { "epoch": 1599.811320754717, "grad_norm": 1.9956191020255551, "learning_rate": 1.4100322875816148e-05, "loss": 1.3114, "step": 41780 }, { "epoch": 1600.566037735849, "grad_norm": 1.6531331340969986, "learning_rate": 1.4071821555203213e-05, "loss": 1.2966, "step": 41800 }, { "epoch": 1601.3207547169811, "grad_norm": 2.0591826060061376, "learning_rate": 1.4043358793595621e-05, "loss": 1.3015, "step": 41820 }, { "epoch": 1602.0754716981132, "grad_norm": 1.6984765409093496, "learning_rate": 1.4014934629599273e-05, "loss": 1.2995, "step": 41840 }, { "epoch": 1602.8301886792453, "grad_norm": 2.2995147164644165, "learning_rate": 1.3986549101767747e-05, "loss": 1.3184, "step": 41860 }, { "epoch": 1603.5849056603774, "grad_norm": 1.6563391450177631, "learning_rate": 1.39582022486022e-05, "loss": 1.3379, "step": 41880 }, { "epoch": 1604.3396226415093, "grad_norm": 1.8150129139182571, "learning_rate": 1.3929894108551327e-05, "loss": 1.2831, "step": 41900 }, { "epoch": 1605.0943396226414, "grad_norm": 1.6835080581169954, "learning_rate": 1.390162472001131e-05, "loss": 1.3275, "step": 41920 }, { "epoch": 1605.8490566037735, "grad_norm": 1.6294691180875247, "learning_rate": 1.3873394121325766e-05, "loss": 1.2913, "step": 41940 }, { "epoch": 1606.6037735849056, "grad_norm": 1.7180885898850626, "learning_rate": 1.3845202350785745e-05, "loss": 1.2965, "step": 41960 }, { "epoch": 1607.3584905660377, "grad_norm": 1.5583008263797746, "learning_rate": 1.3817049446629576e-05, "loss": 1.2832, "step": 41980 }, { "epoch": 1608.1132075471698, "grad_norm": 1.8079371040717394, "learning_rate": 1.3788935447042895e-05, "loss": 1.2954, "step": 42000 }, { "epoch": 1608.867924528302, "grad_norm": 1.8337600854116936, "learning_rate": 1.3760860390158554e-05, "loss": 1.309, "step": 42020 }, { "epoch": 1609.622641509434, "grad_norm": 1.6419903042017507, "learning_rate": 1.3732824314056604e-05, "loss": 1.3068, "step": 42040 }, { "epoch": 1610.377358490566, "grad_norm": 1.948536117708095, "learning_rate": 1.370482725676423e-05, "loss": 1.3399, "step": 42060 }, { "epoch": 1611.132075471698, "grad_norm": 1.9254279275726736, "learning_rate": 1.3676869256255669e-05, "loss": 1.3151, "step": 42080 }, { "epoch": 1611.8867924528302, "grad_norm": 1.802137918813917, "learning_rate": 1.3648950350452192e-05, "loss": 1.2844, "step": 42100 }, { "epoch": 1612.6415094339623, "grad_norm": 3.3181585562433358, "learning_rate": 1.3621070577222036e-05, "loss": 1.3125, "step": 42120 }, { "epoch": 1613.3962264150944, "grad_norm": 1.6631495329844195, "learning_rate": 1.3593229974380375e-05, "loss": 1.2908, "step": 42140 }, { "epoch": 1614.1509433962265, "grad_norm": 1.8984095064618975, "learning_rate": 1.3565428579689256e-05, "loss": 1.2937, "step": 42160 }, { "epoch": 1614.9056603773586, "grad_norm": 1.8422792147059388, "learning_rate": 1.3537666430857535e-05, "loss": 1.284, "step": 42180 }, { "epoch": 1615.6603773584907, "grad_norm": 1.6231527761557085, "learning_rate": 1.3509943565540833e-05, "loss": 1.297, "step": 42200 }, { "epoch": 1616.4150943396226, "grad_norm": 1.9453735113539294, "learning_rate": 1.3482260021341475e-05, "loss": 1.2902, "step": 42220 }, { "epoch": 1617.1698113207547, "grad_norm": 1.7641921684369601, "learning_rate": 1.345461583580849e-05, "loss": 1.282, "step": 42240 }, { "epoch": 1617.9245283018868, "grad_norm": 2.0387885601326228, "learning_rate": 1.3427011046437513e-05, "loss": 1.2898, "step": 42260 }, { "epoch": 1618.6792452830189, "grad_norm": 1.777342395003277, "learning_rate": 1.3399445690670713e-05, "loss": 1.3168, "step": 42280 }, { "epoch": 1619.433962264151, "grad_norm": 1.6370450829023924, "learning_rate": 1.33719198058968e-05, "loss": 1.3075, "step": 42300 }, { "epoch": 1620.188679245283, "grad_norm": 1.779662328060948, "learning_rate": 1.334443342945093e-05, "loss": 1.2919, "step": 42320 }, { "epoch": 1620.9433962264152, "grad_norm": 1.7581747425039895, "learning_rate": 1.3316986598614685e-05, "loss": 1.3074, "step": 42340 }, { "epoch": 1621.698113207547, "grad_norm": 1.8150800521846453, "learning_rate": 1.3289579350616015e-05, "loss": 1.2807, "step": 42360 }, { "epoch": 1622.4528301886792, "grad_norm": 1.7354020247655273, "learning_rate": 1.3262211722629166e-05, "loss": 1.2826, "step": 42380 }, { "epoch": 1623.2075471698113, "grad_norm": 1.468102174253192, "learning_rate": 1.3234883751774644e-05, "loss": 1.288, "step": 42400 }, { "epoch": 1623.9622641509434, "grad_norm": 1.657454196093378, "learning_rate": 1.3207595475119152e-05, "loss": 1.3106, "step": 42420 }, { "epoch": 1624.7169811320755, "grad_norm": 1.6557640300068772, "learning_rate": 1.3180346929675611e-05, "loss": 1.2731, "step": 42440 }, { "epoch": 1625.4716981132076, "grad_norm": 1.7735353149818989, "learning_rate": 1.3153138152402996e-05, "loss": 1.2763, "step": 42460 }, { "epoch": 1626.2264150943397, "grad_norm": 1.9382385391993158, "learning_rate": 1.3125969180206349e-05, "loss": 1.319, "step": 42480 }, { "epoch": 1626.9811320754718, "grad_norm": 2.136894507958651, "learning_rate": 1.3098840049936733e-05, "loss": 1.2805, "step": 42500 }, { "epoch": 1627.7358490566037, "grad_norm": 2.1483143587792366, "learning_rate": 1.3071750798391171e-05, "loss": 1.2853, "step": 42520 }, { "epoch": 1628.4905660377358, "grad_norm": 1.710874022513826, "learning_rate": 1.304470146231261e-05, "loss": 1.2806, "step": 42540 }, { "epoch": 1629.245283018868, "grad_norm": 1.8951457389499449, "learning_rate": 1.3017692078389823e-05, "loss": 1.2932, "step": 42560 }, { "epoch": 1630.0, "grad_norm": 1.5984669511746095, "learning_rate": 1.299072268325742e-05, "loss": 1.2931, "step": 42580 }, { "epoch": 1630.754716981132, "grad_norm": 1.8192427048671964, "learning_rate": 1.2963793313495747e-05, "loss": 1.2736, "step": 42600 }, { "epoch": 1631.5094339622642, "grad_norm": 1.7556743408681688, "learning_rate": 1.2936904005630886e-05, "loss": 1.2844, "step": 42620 }, { "epoch": 1632.2641509433963, "grad_norm": 1.9584621506348525, "learning_rate": 1.2910054796134588e-05, "loss": 1.2903, "step": 42640 }, { "epoch": 1633.0188679245282, "grad_norm": 1.818910778704821, "learning_rate": 1.2883245721424182e-05, "loss": 1.2982, "step": 42660 }, { "epoch": 1633.7735849056603, "grad_norm": 1.6564371207191282, "learning_rate": 1.2856476817862578e-05, "loss": 1.2719, "step": 42680 }, { "epoch": 1634.5283018867924, "grad_norm": 2.176105338983291, "learning_rate": 1.2829748121758186e-05, "loss": 1.2703, "step": 42700 }, { "epoch": 1635.2830188679245, "grad_norm": 2.053268530527867, "learning_rate": 1.280305966936491e-05, "loss": 1.2745, "step": 42720 }, { "epoch": 1636.0377358490566, "grad_norm": 1.9471448193829715, "learning_rate": 1.2776411496882053e-05, "loss": 1.2924, "step": 42740 }, { "epoch": 1636.7924528301887, "grad_norm": 1.7439666627316233, "learning_rate": 1.2749803640454274e-05, "loss": 1.2883, "step": 42760 }, { "epoch": 1637.5471698113208, "grad_norm": 1.7616849064246298, "learning_rate": 1.2723236136171557e-05, "loss": 1.2901, "step": 42780 }, { "epoch": 1638.301886792453, "grad_norm": 1.9785596426579124, "learning_rate": 1.2696709020069137e-05, "loss": 1.2806, "step": 42800 }, { "epoch": 1639.0566037735848, "grad_norm": 1.8267869166287358, "learning_rate": 1.2670222328127502e-05, "loss": 1.2915, "step": 42820 }, { "epoch": 1639.811320754717, "grad_norm": 1.8394119712662444, "learning_rate": 1.2643776096272298e-05, "loss": 1.2959, "step": 42840 }, { "epoch": 1640.566037735849, "grad_norm": 1.5745567144671013, "learning_rate": 1.2617370360374272e-05, "loss": 1.2542, "step": 42860 }, { "epoch": 1641.3207547169811, "grad_norm": 1.759927673486542, "learning_rate": 1.2591005156249265e-05, "loss": 1.2957, "step": 42880 }, { "epoch": 1642.0754716981132, "grad_norm": 1.8212375700098509, "learning_rate": 1.2564680519658124e-05, "loss": 1.2911, "step": 42900 }, { "epoch": 1642.8301886792453, "grad_norm": 1.7170892376270965, "learning_rate": 1.2538396486306685e-05, "loss": 1.2815, "step": 42920 }, { "epoch": 1643.5849056603774, "grad_norm": 1.9707121868823774, "learning_rate": 1.2512153091845724e-05, "loss": 1.2817, "step": 42940 }, { "epoch": 1644.3396226415093, "grad_norm": 1.6048036207691687, "learning_rate": 1.2485950371870873e-05, "loss": 1.2318, "step": 42960 }, { "epoch": 1645.0943396226414, "grad_norm": 1.7616652416059821, "learning_rate": 1.2459788361922582e-05, "loss": 1.2482, "step": 42980 }, { "epoch": 1645.8490566037735, "grad_norm": 1.971912319619838, "learning_rate": 1.2433667097486137e-05, "loss": 1.2732, "step": 43000 }, { "epoch": 1646.6037735849056, "grad_norm": 1.8554872676424141, "learning_rate": 1.2407586613991493e-05, "loss": 1.2862, "step": 43020 }, { "epoch": 1647.3584905660377, "grad_norm": 1.699680830714332, "learning_rate": 1.2381546946813345e-05, "loss": 1.2783, "step": 43040 }, { "epoch": 1648.1132075471698, "grad_norm": 2.0246817924294174, "learning_rate": 1.2355548131271e-05, "loss": 1.2913, "step": 43060 }, { "epoch": 1648.867924528302, "grad_norm": 1.8248553236680727, "learning_rate": 1.2329590202628339e-05, "loss": 1.2982, "step": 43080 }, { "epoch": 1649.622641509434, "grad_norm": 1.5949265423747017, "learning_rate": 1.2303673196093838e-05, "loss": 1.2682, "step": 43100 }, { "epoch": 1650.377358490566, "grad_norm": 1.8162411778047456, "learning_rate": 1.2277797146820398e-05, "loss": 1.2775, "step": 43120 }, { "epoch": 1651.132075471698, "grad_norm": 1.8713447665462608, "learning_rate": 1.225196208990544e-05, "loss": 1.2816, "step": 43140 }, { "epoch": 1651.8867924528302, "grad_norm": 1.7680462074180785, "learning_rate": 1.2226168060390733e-05, "loss": 1.2583, "step": 43160 }, { "epoch": 1652.6415094339623, "grad_norm": 1.7924027918393708, "learning_rate": 1.2200415093262394e-05, "loss": 1.2631, "step": 43180 }, { "epoch": 1653.3962264150944, "grad_norm": 1.8736863718504475, "learning_rate": 1.2174703223450895e-05, "loss": 1.2841, "step": 43200 }, { "epoch": 1654.1509433962265, "grad_norm": 2.4263803178613257, "learning_rate": 1.2149032485830917e-05, "loss": 1.2549, "step": 43220 }, { "epoch": 1654.9056603773586, "grad_norm": 2.0941904006879746, "learning_rate": 1.212340291522137e-05, "loss": 1.2723, "step": 43240 }, { "epoch": 1655.6603773584907, "grad_norm": 1.8402209538224543, "learning_rate": 1.2097814546385328e-05, "loss": 1.2974, "step": 43260 }, { "epoch": 1656.4150943396226, "grad_norm": 1.6993013644974577, "learning_rate": 1.2072267414029963e-05, "loss": 1.2513, "step": 43280 }, { "epoch": 1657.1698113207547, "grad_norm": 1.7170522867791609, "learning_rate": 1.2046761552806534e-05, "loss": 1.2805, "step": 43300 }, { "epoch": 1657.9245283018868, "grad_norm": 2.0350355441018007, "learning_rate": 1.2021296997310335e-05, "loss": 1.2705, "step": 43320 }, { "epoch": 1658.6792452830189, "grad_norm": 2.4347287101096127, "learning_rate": 1.1995873782080597e-05, "loss": 1.3121, "step": 43340 }, { "epoch": 1659.433962264151, "grad_norm": 2.4941517416955423, "learning_rate": 1.1970491941600483e-05, "loss": 1.283, "step": 43360 }, { "epoch": 1660.188679245283, "grad_norm": 1.5936769777901518, "learning_rate": 1.1945151510297077e-05, "loss": 1.3007, "step": 43380 }, { "epoch": 1660.9433962264152, "grad_norm": 1.7694343184984302, "learning_rate": 1.191985252254125e-05, "loss": 1.2624, "step": 43400 }, { "epoch": 1661.698113207547, "grad_norm": 1.795958862054882, "learning_rate": 1.1894595012647705e-05, "loss": 1.2845, "step": 43420 }, { "epoch": 1662.4528301886792, "grad_norm": 1.7885172471497286, "learning_rate": 1.1869379014874838e-05, "loss": 1.2917, "step": 43440 }, { "epoch": 1663.2075471698113, "grad_norm": 2.120293693613178, "learning_rate": 1.1844204563424761e-05, "loss": 1.2772, "step": 43460 }, { "epoch": 1663.9622641509434, "grad_norm": 1.9597227989228387, "learning_rate": 1.1819071692443259e-05, "loss": 1.2795, "step": 43480 }, { "epoch": 1664.7169811320755, "grad_norm": 1.6400049564311938, "learning_rate": 1.1793980436019665e-05, "loss": 1.2698, "step": 43500 }, { "epoch": 1665.4716981132076, "grad_norm": 1.8644080059944403, "learning_rate": 1.1768930828186929e-05, "loss": 1.2587, "step": 43520 }, { "epoch": 1666.2264150943397, "grad_norm": 1.8282116731915254, "learning_rate": 1.1743922902921463e-05, "loss": 1.3132, "step": 43540 }, { "epoch": 1666.9811320754718, "grad_norm": 1.7249443500233614, "learning_rate": 1.1718956694143148e-05, "loss": 1.2723, "step": 43560 }, { "epoch": 1667.7358490566037, "grad_norm": 2.2697663843146665, "learning_rate": 1.1694032235715316e-05, "loss": 1.2568, "step": 43580 }, { "epoch": 1668.4905660377358, "grad_norm": 1.530647627622187, "learning_rate": 1.1669149561444626e-05, "loss": 1.2717, "step": 43600 }, { "epoch": 1669.245283018868, "grad_norm": 1.9077370898342472, "learning_rate": 1.1644308705081098e-05, "loss": 1.252, "step": 43620 }, { "epoch": 1670.0, "grad_norm": 2.44039353912842, "learning_rate": 1.1619509700318012e-05, "loss": 1.2829, "step": 43640 }, { "epoch": 1670.754716981132, "grad_norm": 1.7743292552737207, "learning_rate": 1.159475258079188e-05, "loss": 1.2831, "step": 43660 }, { "epoch": 1671.5094339622642, "grad_norm": 1.747479187441525, "learning_rate": 1.1570037380082422e-05, "loss": 1.2933, "step": 43680 }, { "epoch": 1672.2641509433963, "grad_norm": 1.6300209643052874, "learning_rate": 1.154536413171247e-05, "loss": 1.262, "step": 43700 }, { "epoch": 1673.0188679245282, "grad_norm": 1.6492759638436003, "learning_rate": 1.1520732869147992e-05, "loss": 1.2733, "step": 43720 }, { "epoch": 1673.7735849056603, "grad_norm": 1.8650633960066672, "learning_rate": 1.149614362579798e-05, "loss": 1.2536, "step": 43740 }, { "epoch": 1674.5283018867924, "grad_norm": 2.1343348774015154, "learning_rate": 1.1471596435014422e-05, "loss": 1.2813, "step": 43760 }, { "epoch": 1675.2830188679245, "grad_norm": 1.8893198944715273, "learning_rate": 1.144709133009231e-05, "loss": 1.2563, "step": 43780 }, { "epoch": 1676.0377358490566, "grad_norm": 2.1120967291275416, "learning_rate": 1.1422628344269509e-05, "loss": 1.2821, "step": 43800 }, { "epoch": 1676.7924528301887, "grad_norm": 1.9495522685303381, "learning_rate": 1.1398207510726789e-05, "loss": 1.2517, "step": 43820 }, { "epoch": 1677.5471698113208, "grad_norm": 2.467904007991342, "learning_rate": 1.1373828862587707e-05, "loss": 1.2561, "step": 43840 }, { "epoch": 1678.301886792453, "grad_norm": 2.0894802109018364, "learning_rate": 1.1349492432918656e-05, "loss": 1.2435, "step": 43860 }, { "epoch": 1679.0566037735848, "grad_norm": 1.828171013483477, "learning_rate": 1.1325198254728714e-05, "loss": 1.2622, "step": 43880 }, { "epoch": 1679.811320754717, "grad_norm": 1.801681773387057, "learning_rate": 1.1300946360969663e-05, "loss": 1.2558, "step": 43900 }, { "epoch": 1680.566037735849, "grad_norm": 1.6735648229781173, "learning_rate": 1.127673678453596e-05, "loss": 1.2758, "step": 43920 }, { "epoch": 1681.3207547169811, "grad_norm": 1.8439021982026453, "learning_rate": 1.1252569558264623e-05, "loss": 1.2711, "step": 43940 }, { "epoch": 1682.0754716981132, "grad_norm": 2.1505206283933016, "learning_rate": 1.1228444714935267e-05, "loss": 1.2654, "step": 43960 }, { "epoch": 1682.8301886792453, "grad_norm": 2.3535064036412763, "learning_rate": 1.1204362287269989e-05, "loss": 1.2864, "step": 43980 }, { "epoch": 1683.5849056603774, "grad_norm": 1.9848669397447662, "learning_rate": 1.1180322307933367e-05, "loss": 1.2678, "step": 44000 }, { "epoch": 1684.3396226415093, "grad_norm": 1.9913924876704123, "learning_rate": 1.1156324809532414e-05, "loss": 1.2676, "step": 44020 }, { "epoch": 1685.0943396226414, "grad_norm": 1.7514208349630622, "learning_rate": 1.1132369824616499e-05, "loss": 1.2616, "step": 44040 }, { "epoch": 1685.8490566037735, "grad_norm": 2.2320848012235888, "learning_rate": 1.1108457385677357e-05, "loss": 1.2342, "step": 44060 }, { "epoch": 1686.6037735849056, "grad_norm": 1.5942465466549, "learning_rate": 1.1084587525148977e-05, "loss": 1.2645, "step": 44080 }, { "epoch": 1687.3584905660377, "grad_norm": 1.7930621881455668, "learning_rate": 1.1060760275407643e-05, "loss": 1.2534, "step": 44100 }, { "epoch": 1688.1132075471698, "grad_norm": 1.9526833117644506, "learning_rate": 1.1036975668771807e-05, "loss": 1.2609, "step": 44120 }, { "epoch": 1688.867924528302, "grad_norm": 2.7437511360247084, "learning_rate": 1.1013233737502087e-05, "loss": 1.2343, "step": 44140 }, { "epoch": 1689.622641509434, "grad_norm": 1.9223463912813794, "learning_rate": 1.098953451380124e-05, "loss": 1.2442, "step": 44160 }, { "epoch": 1690.377358490566, "grad_norm": 1.8610095181805815, "learning_rate": 1.0965878029814056e-05, "loss": 1.2754, "step": 44180 }, { "epoch": 1691.132075471698, "grad_norm": 1.8877853703015002, "learning_rate": 1.0942264317627406e-05, "loss": 1.2491, "step": 44200 }, { "epoch": 1691.8867924528302, "grad_norm": 1.7714097265467896, "learning_rate": 1.09186934092701e-05, "loss": 1.2405, "step": 44220 }, { "epoch": 1692.6415094339623, "grad_norm": 1.7637174019223203, "learning_rate": 1.0895165336712904e-05, "loss": 1.2829, "step": 44240 }, { "epoch": 1693.3962264150944, "grad_norm": 1.7656304793121242, "learning_rate": 1.087168013186851e-05, "loss": 1.2702, "step": 44260 }, { "epoch": 1694.1509433962265, "grad_norm": 1.7808808271916323, "learning_rate": 1.0848237826591417e-05, "loss": 1.2587, "step": 44280 }, { "epoch": 1694.9056603773586, "grad_norm": 2.0266053428110538, "learning_rate": 1.0824838452677987e-05, "loss": 1.2926, "step": 44300 }, { "epoch": 1695.6603773584907, "grad_norm": 1.8115058261411354, "learning_rate": 1.0801482041866307e-05, "loss": 1.2694, "step": 44320 }, { "epoch": 1696.4150943396226, "grad_norm": 2.4945233871026526, "learning_rate": 1.0778168625836231e-05, "loss": 1.2699, "step": 44340 }, { "epoch": 1697.1698113207547, "grad_norm": 1.6502310134668141, "learning_rate": 1.0754898236209268e-05, "loss": 1.2614, "step": 44360 }, { "epoch": 1697.9245283018868, "grad_norm": 1.8453532789577662, "learning_rate": 1.0731670904548564e-05, "loss": 1.2823, "step": 44380 }, { "epoch": 1698.6792452830189, "grad_norm": 1.860653653518071, "learning_rate": 1.070848666235889e-05, "loss": 1.2641, "step": 44400 }, { "epoch": 1699.433962264151, "grad_norm": 2.5399359657927856, "learning_rate": 1.0685345541086543e-05, "loss": 1.2654, "step": 44420 }, { "epoch": 1700.188679245283, "grad_norm": 2.0018966910355798, "learning_rate": 1.0662247572119366e-05, "loss": 1.2504, "step": 44440 }, { "epoch": 1700.9433962264152, "grad_norm": 1.9371452956267547, "learning_rate": 1.0639192786786632e-05, "loss": 1.2599, "step": 44460 }, { "epoch": 1701.698113207547, "grad_norm": 1.6521207456435931, "learning_rate": 1.061618121635906e-05, "loss": 1.2391, "step": 44480 }, { "epoch": 1702.4528301886792, "grad_norm": 1.8656871033709692, "learning_rate": 1.0593212892048769e-05, "loss": 1.2724, "step": 44500 }, { "epoch": 1703.2075471698113, "grad_norm": 1.8848695534792095, "learning_rate": 1.0570287845009191e-05, "loss": 1.2528, "step": 44520 }, { "epoch": 1703.9622641509434, "grad_norm": 1.9187788004054305, "learning_rate": 1.0547406106335084e-05, "loss": 1.2518, "step": 44540 }, { "epoch": 1704.7169811320755, "grad_norm": 1.6729690153958676, "learning_rate": 1.0524567707062449e-05, "loss": 1.2437, "step": 44560 }, { "epoch": 1705.4716981132076, "grad_norm": 1.8430409315959264, "learning_rate": 1.0501772678168493e-05, "loss": 1.2467, "step": 44580 }, { "epoch": 1706.2264150943397, "grad_norm": 1.8213698778380842, "learning_rate": 1.0479021050571638e-05, "loss": 1.261, "step": 44600 }, { "epoch": 1706.9811320754718, "grad_norm": 1.7756575044684015, "learning_rate": 1.0456312855131388e-05, "loss": 1.2278, "step": 44620 }, { "epoch": 1707.7358490566037, "grad_norm": 1.4850849315300283, "learning_rate": 1.0433648122648373e-05, "loss": 1.242, "step": 44640 }, { "epoch": 1708.4905660377358, "grad_norm": 1.9352490992820244, "learning_rate": 1.0411026883864254e-05, "loss": 1.2507, "step": 44660 }, { "epoch": 1709.245283018868, "grad_norm": 2.2842368933958634, "learning_rate": 1.0388449169461693e-05, "loss": 1.2614, "step": 44680 }, { "epoch": 1710.0, "grad_norm": 1.716195015782983, "learning_rate": 1.0365915010064342e-05, "loss": 1.2467, "step": 44700 }, { "epoch": 1710.754716981132, "grad_norm": 2.1393035099583524, "learning_rate": 1.0343424436236746e-05, "loss": 1.2697, "step": 44720 }, { "epoch": 1711.5094339622642, "grad_norm": 2.0773856374828354, "learning_rate": 1.0320977478484364e-05, "loss": 1.2642, "step": 44740 }, { "epoch": 1712.2641509433963, "grad_norm": 1.862877983575214, "learning_rate": 1.0298574167253475e-05, "loss": 1.2269, "step": 44760 }, { "epoch": 1713.0188679245282, "grad_norm": 1.8342101414521328, "learning_rate": 1.0276214532931146e-05, "loss": 1.2535, "step": 44780 }, { "epoch": 1713.7735849056603, "grad_norm": 1.9601396356391216, "learning_rate": 1.0253898605845225e-05, "loss": 1.2327, "step": 44800 }, { "epoch": 1714.5283018867924, "grad_norm": 4.66393766300096, "learning_rate": 1.0231626416264286e-05, "loss": 1.2503, "step": 44820 }, { "epoch": 1715.2830188679245, "grad_norm": 1.8180258292414466, "learning_rate": 1.020939799439755e-05, "loss": 1.2401, "step": 44840 }, { "epoch": 1716.0377358490566, "grad_norm": 2.106671537780403, "learning_rate": 1.0187213370394877e-05, "loss": 1.2536, "step": 44860 }, { "epoch": 1716.7924528301887, "grad_norm": 2.006353528787222, "learning_rate": 1.016507257434674e-05, "loss": 1.2669, "step": 44880 }, { "epoch": 1717.5471698113208, "grad_norm": 1.9080849159374786, "learning_rate": 1.0142975636284143e-05, "loss": 1.2509, "step": 44900 }, { "epoch": 1718.301886792453, "grad_norm": 1.8773707581872159, "learning_rate": 1.0120922586178633e-05, "loss": 1.2675, "step": 44920 }, { "epoch": 1719.0566037735848, "grad_norm": 2.0076660138565647, "learning_rate": 1.00989134539422e-05, "loss": 1.2534, "step": 44940 }, { "epoch": 1719.811320754717, "grad_norm": 2.33419651636862, "learning_rate": 1.0076948269427267e-05, "loss": 1.2397, "step": 44960 }, { "epoch": 1720.566037735849, "grad_norm": 2.1404808355187552, "learning_rate": 1.0055027062426677e-05, "loss": 1.2533, "step": 44980 }, { "epoch": 1721.3207547169811, "grad_norm": 1.8480400039657447, "learning_rate": 1.003314986267358e-05, "loss": 1.2493, "step": 45000 }, { "epoch": 1722.0754716981132, "grad_norm": 1.8827968491873732, "learning_rate": 1.0011316699841473e-05, "loss": 1.2622, "step": 45020 }, { "epoch": 1722.8301886792453, "grad_norm": 1.805703534242214, "learning_rate": 9.989527603544106e-06, "loss": 1.2363, "step": 45040 }, { "epoch": 1723.5849056603774, "grad_norm": 1.808082360236483, "learning_rate": 9.967782603335458e-06, "loss": 1.2487, "step": 45060 }, { "epoch": 1724.3396226415093, "grad_norm": 1.7375867158357146, "learning_rate": 9.946081728709704e-06, "loss": 1.2495, "step": 45080 }, { "epoch": 1725.0943396226414, "grad_norm": 1.9612535567440743, "learning_rate": 9.92442500910116e-06, "loss": 1.229, "step": 45100 }, { "epoch": 1725.8490566037735, "grad_norm": 1.7862147453258874, "learning_rate": 9.902812473884265e-06, "loss": 1.257, "step": 45120 }, { "epoch": 1726.6037735849056, "grad_norm": 1.9883007002332853, "learning_rate": 9.881244152373517e-06, "loss": 1.2449, "step": 45140 }, { "epoch": 1727.3584905660377, "grad_norm": 1.9502309547963228, "learning_rate": 9.859720073823439e-06, "loss": 1.224, "step": 45160 }, { "epoch": 1728.1132075471698, "grad_norm": 1.896645829195727, "learning_rate": 9.838240267428569e-06, "loss": 1.2396, "step": 45180 }, { "epoch": 1728.867924528302, "grad_norm": 1.629877819405046, "learning_rate": 9.816804762323362e-06, "loss": 1.2227, "step": 45200 }, { "epoch": 1729.622641509434, "grad_norm": 1.929708983579025, "learning_rate": 9.795413587582212e-06, "loss": 1.2516, "step": 45220 }, { "epoch": 1730.377358490566, "grad_norm": 2.0413627464070543, "learning_rate": 9.77406677221937e-06, "loss": 1.2514, "step": 45240 }, { "epoch": 1731.132075471698, "grad_norm": 1.9912183520226578, "learning_rate": 9.75276434518892e-06, "loss": 1.2414, "step": 45260 }, { "epoch": 1731.8867924528302, "grad_norm": 1.9880956960393557, "learning_rate": 9.731506335384743e-06, "loss": 1.2419, "step": 45280 }, { "epoch": 1732.6415094339623, "grad_norm": 1.7890362722548563, "learning_rate": 9.710292771640488e-06, "loss": 1.2369, "step": 45300 }, { "epoch": 1733.3962264150944, "grad_norm": 1.7651257661243038, "learning_rate": 9.689123682729494e-06, "loss": 1.2311, "step": 45320 }, { "epoch": 1734.1509433962265, "grad_norm": 2.2714752518928596, "learning_rate": 9.667999097364786e-06, "loss": 1.2367, "step": 45340 }, { "epoch": 1734.9056603773586, "grad_norm": 1.6713670044384341, "learning_rate": 9.646919044199022e-06, "loss": 1.2404, "step": 45360 }, { "epoch": 1735.6603773584907, "grad_norm": 1.6792236680717407, "learning_rate": 9.625883551824463e-06, "loss": 1.2196, "step": 45380 }, { "epoch": 1736.4150943396226, "grad_norm": 2.115412691262234, "learning_rate": 9.604892648772943e-06, "loss": 1.266, "step": 45400 }, { "epoch": 1737.1698113207547, "grad_norm": 1.8051416403865777, "learning_rate": 9.583946363515793e-06, "loss": 1.2226, "step": 45420 }, { "epoch": 1737.9245283018868, "grad_norm": 1.6009792370635079, "learning_rate": 9.563044724463834e-06, "loss": 1.2201, "step": 45440 }, { "epoch": 1738.6792452830189, "grad_norm": 1.6768644470720357, "learning_rate": 9.542187759967324e-06, "loss": 1.2421, "step": 45460 }, { "epoch": 1739.433962264151, "grad_norm": 1.8206836356027367, "learning_rate": 9.521375498315946e-06, "loss": 1.2329, "step": 45480 }, { "epoch": 1740.188679245283, "grad_norm": 1.938456827512391, "learning_rate": 9.500607967738736e-06, "loss": 1.2325, "step": 45500 }, { "epoch": 1740.9433962264152, "grad_norm": 2.2438430820956277, "learning_rate": 9.47988519640406e-06, "loss": 1.2354, "step": 45520 }, { "epoch": 1741.698113207547, "grad_norm": 1.8583994537682718, "learning_rate": 9.459207212419571e-06, "loss": 1.235, "step": 45540 }, { "epoch": 1742.4528301886792, "grad_norm": 1.917428400078255, "learning_rate": 9.438574043832166e-06, "loss": 1.224, "step": 45560 }, { "epoch": 1743.2075471698113, "grad_norm": 1.9824261322614047, "learning_rate": 9.417985718627978e-06, "loss": 1.2129, "step": 45580 }, { "epoch": 1743.9622641509434, "grad_norm": 1.612460364379856, "learning_rate": 9.397442264732312e-06, "loss": 1.2377, "step": 45600 }, { "epoch": 1744.7169811320755, "grad_norm": 1.887246888937515, "learning_rate": 9.376943710009596e-06, "loss": 1.239, "step": 45620 }, { "epoch": 1745.4716981132076, "grad_norm": 1.8662303877058588, "learning_rate": 9.35649008226336e-06, "loss": 1.2157, "step": 45640 }, { "epoch": 1746.2264150943397, "grad_norm": 1.8071426126559238, "learning_rate": 9.336081409236198e-06, "loss": 1.2428, "step": 45660 }, { "epoch": 1746.9811320754718, "grad_norm": 2.0833393122383828, "learning_rate": 9.315717718609757e-06, "loss": 1.2492, "step": 45680 }, { "epoch": 1747.7358490566037, "grad_norm": 1.6715168273471837, "learning_rate": 9.295399038004633e-06, "loss": 1.2266, "step": 45700 }, { "epoch": 1748.4905660377358, "grad_norm": 2.195357416639307, "learning_rate": 9.275125394980386e-06, "loss": 1.2253, "step": 45720 }, { "epoch": 1749.245283018868, "grad_norm": 1.6874680664457093, "learning_rate": 9.254896817035483e-06, "loss": 1.2173, "step": 45740 }, { "epoch": 1750.0, "grad_norm": 1.711392020263029, "learning_rate": 9.234713331607285e-06, "loss": 1.2454, "step": 45760 }, { "epoch": 1750.754716981132, "grad_norm": 2.3706684674067713, "learning_rate": 9.214574966071978e-06, "loss": 1.2308, "step": 45780 }, { "epoch": 1751.5094339622642, "grad_norm": 1.6753069249066932, "learning_rate": 9.19448174774455e-06, "loss": 1.2413, "step": 45800 }, { "epoch": 1752.2641509433963, "grad_norm": 1.769095060241516, "learning_rate": 9.174433703878748e-06, "loss": 1.2319, "step": 45820 }, { "epoch": 1753.0188679245282, "grad_norm": 2.3962310618426756, "learning_rate": 9.154430861667043e-06, "loss": 1.2352, "step": 45840 }, { "epoch": 1753.7735849056603, "grad_norm": 1.9412455526945074, "learning_rate": 9.134473248240613e-06, "loss": 1.2102, "step": 45860 }, { "epoch": 1754.5283018867924, "grad_norm": 1.8463733972517142, "learning_rate": 9.114560890669284e-06, "loss": 1.2332, "step": 45880 }, { "epoch": 1755.2830188679245, "grad_norm": 2.04310291045162, "learning_rate": 9.094693815961489e-06, "loss": 1.214, "step": 45900 }, { "epoch": 1756.0377358490566, "grad_norm": 1.6171015922091012, "learning_rate": 9.074872051064247e-06, "loss": 1.2699, "step": 45920 }, { "epoch": 1756.7924528301887, "grad_norm": 1.9834386299608673, "learning_rate": 9.05509562286311e-06, "loss": 1.2278, "step": 45940 }, { "epoch": 1757.5471698113208, "grad_norm": 1.821750331122801, "learning_rate": 9.035364558182156e-06, "loss": 1.2209, "step": 45960 }, { "epoch": 1758.301886792453, "grad_norm": 2.014697149659363, "learning_rate": 9.01567888378393e-06, "loss": 1.2467, "step": 45980 }, { "epoch": 1759.0566037735848, "grad_norm": 1.7691564521949696, "learning_rate": 8.9960386263694e-06, "loss": 1.2387, "step": 46000 }, { "epoch": 1759.811320754717, "grad_norm": 2.3970514513874353, "learning_rate": 8.976443812577933e-06, "loss": 1.2356, "step": 46020 }, { "epoch": 1760.566037735849, "grad_norm": 1.7866985162824316, "learning_rate": 8.956894468987255e-06, "loss": 1.2192, "step": 46040 }, { "epoch": 1761.3207547169811, "grad_norm": 1.4793276251372218, "learning_rate": 8.93739062211343e-06, "loss": 1.2255, "step": 46060 }, { "epoch": 1762.0754716981132, "grad_norm": 1.629080653433639, "learning_rate": 8.917932298410821e-06, "loss": 1.2293, "step": 46080 }, { "epoch": 1762.8301886792453, "grad_norm": 1.9159436924110016, "learning_rate": 8.898519524272015e-06, "loss": 1.2401, "step": 46100 }, { "epoch": 1763.5849056603774, "grad_norm": 2.067014892731833, "learning_rate": 8.879152326027837e-06, "loss": 1.2344, "step": 46120 }, { "epoch": 1764.3396226415093, "grad_norm": 1.8696210113324339, "learning_rate": 8.859830729947271e-06, "loss": 1.2223, "step": 46140 }, { "epoch": 1765.0943396226414, "grad_norm": 2.110486612271203, "learning_rate": 8.840554762237504e-06, "loss": 1.243, "step": 46160 }, { "epoch": 1765.8490566037735, "grad_norm": 2.123761822878677, "learning_rate": 8.821324449043775e-06, "loss": 1.219, "step": 46180 }, { "epoch": 1766.6037735849056, "grad_norm": 1.9704006034099235, "learning_rate": 8.802139816449425e-06, "loss": 1.2274, "step": 46200 }, { "epoch": 1767.3584905660377, "grad_norm": 2.577400619765411, "learning_rate": 8.783000890475817e-06, "loss": 1.2215, "step": 46220 }, { "epoch": 1768.1132075471698, "grad_norm": 1.9304613762583265, "learning_rate": 8.763907697082349e-06, "loss": 1.2278, "step": 46240 }, { "epoch": 1768.867924528302, "grad_norm": 9.077022357816322, "learning_rate": 8.744860262166374e-06, "loss": 1.2376, "step": 46260 }, { "epoch": 1769.622641509434, "grad_norm": 1.7013870498396941, "learning_rate": 8.72585861156318e-06, "loss": 1.2435, "step": 46280 }, { "epoch": 1770.377358490566, "grad_norm": 2.2733345450497597, "learning_rate": 8.706902771045942e-06, "loss": 1.2491, "step": 46300 }, { "epoch": 1771.132075471698, "grad_norm": 1.7197101765888114, "learning_rate": 8.687992766325712e-06, "loss": 1.2308, "step": 46320 }, { "epoch": 1771.8867924528302, "grad_norm": 1.722161318565123, "learning_rate": 8.669128623051374e-06, "loss": 1.2153, "step": 46340 }, { "epoch": 1772.6415094339623, "grad_norm": 1.768434935423491, "learning_rate": 8.650310366809618e-06, "loss": 1.231, "step": 46360 }, { "epoch": 1773.3962264150944, "grad_norm": 1.754239611346281, "learning_rate": 8.631538023124864e-06, "loss": 1.2132, "step": 46380 }, { "epoch": 1774.1509433962265, "grad_norm": 1.8552614353082573, "learning_rate": 8.612811617459285e-06, "loss": 1.2112, "step": 46400 }, { "epoch": 1774.9056603773586, "grad_norm": 2.0773862798469467, "learning_rate": 8.594131175212718e-06, "loss": 1.2189, "step": 46420 }, { "epoch": 1775.6603773584907, "grad_norm": 2.3280607387947905, "learning_rate": 8.57549672172269e-06, "loss": 1.238, "step": 46440 }, { "epoch": 1776.4150943396226, "grad_norm": 2.1755774262596717, "learning_rate": 8.556908282264332e-06, "loss": 1.2024, "step": 46460 }, { "epoch": 1777.1698113207547, "grad_norm": 1.7187738055157478, "learning_rate": 8.538365882050364e-06, "loss": 1.2234, "step": 46480 }, { "epoch": 1777.9245283018868, "grad_norm": 3.1309450039543165, "learning_rate": 8.51986954623106e-06, "loss": 1.2081, "step": 46500 }, { "epoch": 1778.6792452830189, "grad_norm": 2.1042177578345567, "learning_rate": 8.501419299894205e-06, "loss": 1.1976, "step": 46520 }, { "epoch": 1779.433962264151, "grad_norm": 2.4039696113928586, "learning_rate": 8.483015168065095e-06, "loss": 1.2068, "step": 46540 }, { "epoch": 1780.188679245283, "grad_norm": 2.0537571832378605, "learning_rate": 8.464657175706461e-06, "loss": 1.2143, "step": 46560 }, { "epoch": 1780.9433962264152, "grad_norm": 1.9918815720142324, "learning_rate": 8.44634534771845e-06, "loss": 1.2019, "step": 46580 }, { "epoch": 1781.698113207547, "grad_norm": 3.5070134161926214, "learning_rate": 8.428079708938597e-06, "loss": 1.2117, "step": 46600 }, { "epoch": 1782.4528301886792, "grad_norm": 1.9332698868995186, "learning_rate": 8.409860284141776e-06, "loss": 1.2109, "step": 46620 }, { "epoch": 1783.2075471698113, "grad_norm": 1.8649611050997916, "learning_rate": 8.391687098040202e-06, "loss": 1.2127, "step": 46640 }, { "epoch": 1783.9622641509434, "grad_norm": 2.1126115309707276, "learning_rate": 8.373560175283366e-06, "loss": 1.2071, "step": 46660 }, { "epoch": 1784.7169811320755, "grad_norm": 2.1198410570984145, "learning_rate": 8.355479540457997e-06, "loss": 1.2136, "step": 46680 }, { "epoch": 1785.4716981132076, "grad_norm": 1.6900109710024558, "learning_rate": 8.337445218088043e-06, "loss": 1.2524, "step": 46700 }, { "epoch": 1786.2264150943397, "grad_norm": 1.8630113220385771, "learning_rate": 8.31945723263464e-06, "loss": 1.2265, "step": 46720 }, { "epoch": 1786.9811320754718, "grad_norm": 1.8874455281957463, "learning_rate": 8.301515608496088e-06, "loss": 1.2177, "step": 46740 }, { "epoch": 1787.7358490566037, "grad_norm": 2.014600854617101, "learning_rate": 8.283620370007777e-06, "loss": 1.2181, "step": 46760 }, { "epoch": 1788.4905660377358, "grad_norm": 2.0564703961686885, "learning_rate": 8.2657715414422e-06, "loss": 1.234, "step": 46780 }, { "epoch": 1789.245283018868, "grad_norm": 1.7463019171504772, "learning_rate": 8.247969147008883e-06, "loss": 1.2357, "step": 46800 }, { "epoch": 1790.0, "grad_norm": 2.0207773867855345, "learning_rate": 8.230213210854395e-06, "loss": 1.2148, "step": 46820 }, { "epoch": 1790.754716981132, "grad_norm": 2.3337870810525168, "learning_rate": 8.21250375706228e-06, "loss": 1.237, "step": 46840 }, { "epoch": 1791.5094339622642, "grad_norm": 2.1435617881979563, "learning_rate": 8.194840809653027e-06, "loss": 1.2374, "step": 46860 }, { "epoch": 1792.2641509433963, "grad_norm": 1.9102469560838522, "learning_rate": 8.177224392584056e-06, "loss": 1.209, "step": 46880 }, { "epoch": 1793.0188679245282, "grad_norm": 2.1795923550151737, "learning_rate": 8.159654529749662e-06, "loss": 1.2063, "step": 46900 }, { "epoch": 1793.7735849056603, "grad_norm": 1.823175394536622, "learning_rate": 8.142131244981005e-06, "loss": 1.1934, "step": 46920 }, { "epoch": 1794.5283018867924, "grad_norm": 1.8053211353930545, "learning_rate": 8.12465456204608e-06, "loss": 1.2198, "step": 46940 }, { "epoch": 1795.2830188679245, "grad_norm": 2.2947577379489195, "learning_rate": 8.107224504649651e-06, "loss": 1.2309, "step": 46960 }, { "epoch": 1796.0377358490566, "grad_norm": 1.8475992608945049, "learning_rate": 8.089841096433251e-06, "loss": 1.2087, "step": 46980 }, { "epoch": 1796.7924528301887, "grad_norm": 1.8272879309025556, "learning_rate": 8.072504360975127e-06, "loss": 1.2136, "step": 47000 }, { "epoch": 1797.5471698113208, "grad_norm": 1.8165782997861282, "learning_rate": 8.055214321790241e-06, "loss": 1.1889, "step": 47020 }, { "epoch": 1798.301886792453, "grad_norm": 2.3340672269584726, "learning_rate": 8.03797100233022e-06, "loss": 1.221, "step": 47040 }, { "epoch": 1799.0566037735848, "grad_norm": 2.092467100741215, "learning_rate": 8.020774425983296e-06, "loss": 1.2128, "step": 47060 }, { "epoch": 1799.811320754717, "grad_norm": 2.3746119444632, "learning_rate": 8.003624616074315e-06, "loss": 1.2182, "step": 47080 }, { "epoch": 1800.566037735849, "grad_norm": 1.8281656528438364, "learning_rate": 7.9865215958647e-06, "loss": 1.2263, "step": 47100 }, { "epoch": 1801.3207547169811, "grad_norm": 1.7918154594625133, "learning_rate": 7.969465388552383e-06, "loss": 1.2213, "step": 47120 }, { "epoch": 1802.0754716981132, "grad_norm": 1.6967922986825377, "learning_rate": 7.95245601727184e-06, "loss": 1.2138, "step": 47140 }, { "epoch": 1802.8301886792453, "grad_norm": 2.1758444336626437, "learning_rate": 7.935493505093988e-06, "loss": 1.2148, "step": 47160 }, { "epoch": 1803.5849056603774, "grad_norm": 2.065548344188712, "learning_rate": 7.918577875026188e-06, "loss": 1.225, "step": 47180 }, { "epoch": 1804.3396226415093, "grad_norm": 2.285598146488397, "learning_rate": 7.901709150012234e-06, "loss": 1.2029, "step": 47200 }, { "epoch": 1805.0943396226414, "grad_norm": 1.845588983749011, "learning_rate": 7.884887352932272e-06, "loss": 1.2197, "step": 47220 }, { "epoch": 1805.8490566037735, "grad_norm": 2.1058361117020095, "learning_rate": 7.868112506602826e-06, "loss": 1.2153, "step": 47240 }, { "epoch": 1806.6037735849056, "grad_norm": 2.465710936967516, "learning_rate": 7.851384633776713e-06, "loss": 1.228, "step": 47260 }, { "epoch": 1807.3584905660377, "grad_norm": 2.5705709977723905, "learning_rate": 7.834703757143039e-06, "loss": 1.2098, "step": 47280 }, { "epoch": 1808.1132075471698, "grad_norm": 2.2374731183447105, "learning_rate": 7.818069899327187e-06, "loss": 1.2129, "step": 47300 }, { "epoch": 1808.867924528302, "grad_norm": 1.9504362821950096, "learning_rate": 7.801483082890734e-06, "loss": 1.1901, "step": 47320 }, { "epoch": 1809.622641509434, "grad_norm": 2.3420693435077813, "learning_rate": 7.784943330331486e-06, "loss": 1.211, "step": 47340 }, { "epoch": 1810.377358490566, "grad_norm": 2.031062915249881, "learning_rate": 7.768450664083389e-06, "loss": 1.2156, "step": 47360 }, { "epoch": 1811.132075471698, "grad_norm": 1.6147973470014159, "learning_rate": 7.752005106516516e-06, "loss": 1.2246, "step": 47380 }, { "epoch": 1811.8867924528302, "grad_norm": 3.287052917624636, "learning_rate": 7.735606679937075e-06, "loss": 1.2064, "step": 47400 }, { "epoch": 1812.6415094339623, "grad_norm": 2.2761297016427178, "learning_rate": 7.719255406587317e-06, "loss": 1.212, "step": 47420 }, { "epoch": 1813.3962264150944, "grad_norm": 1.7704387481824377, "learning_rate": 7.702951308645558e-06, "loss": 1.2085, "step": 47440 }, { "epoch": 1814.1509433962265, "grad_norm": 1.8350242336586524, "learning_rate": 7.68669440822611e-06, "loss": 1.222, "step": 47460 }, { "epoch": 1814.9056603773586, "grad_norm": 2.2387441733555202, "learning_rate": 7.67048472737927e-06, "loss": 1.2227, "step": 47480 }, { "epoch": 1815.6603773584907, "grad_norm": 2.0331586599518863, "learning_rate": 7.654322288091307e-06, "loss": 1.2105, "step": 47500 }, { "epoch": 1816.4150943396226, "grad_norm": 1.7302486664188137, "learning_rate": 7.638207112284387e-06, "loss": 1.2006, "step": 47520 }, { "epoch": 1817.1698113207547, "grad_norm": 2.102196000276882, "learning_rate": 7.622139221816588e-06, "loss": 1.2129, "step": 47540 }, { "epoch": 1817.9245283018868, "grad_norm": 2.367853919945459, "learning_rate": 7.606118638481834e-06, "loss": 1.2137, "step": 47560 }, { "epoch": 1818.6792452830189, "grad_norm": 1.7313717857059043, "learning_rate": 7.5901453840099084e-06, "loss": 1.1895, "step": 47580 }, { "epoch": 1819.433962264151, "grad_norm": 1.90549898399535, "learning_rate": 7.574219480066374e-06, "loss": 1.2056, "step": 47600 }, { "epoch": 1820.188679245283, "grad_norm": 2.037261933639343, "learning_rate": 7.55834094825259e-06, "loss": 1.2174, "step": 47620 }, { "epoch": 1820.9433962264152, "grad_norm": 2.106635441636325, "learning_rate": 7.542509810105648e-06, "loss": 1.1982, "step": 47640 }, { "epoch": 1821.698113207547, "grad_norm": 2.092038104009338, "learning_rate": 7.526726087098354e-06, "loss": 1.2218, "step": 47660 }, { "epoch": 1822.4528301886792, "grad_norm": 1.997516919579926, "learning_rate": 7.51098980063922e-06, "loss": 1.2219, "step": 47680 }, { "epoch": 1823.2075471698113, "grad_norm": 1.8136562199600643, "learning_rate": 7.49530097207239e-06, "loss": 1.1796, "step": 47700 }, { "epoch": 1823.9622641509434, "grad_norm": 2.5616204147227934, "learning_rate": 7.47965962267767e-06, "loss": 1.1939, "step": 47720 }, { "epoch": 1824.7169811320755, "grad_norm": 2.1387597203680815, "learning_rate": 7.464065773670437e-06, "loss": 1.1602, "step": 47740 }, { "epoch": 1825.4716981132076, "grad_norm": 1.7803739365612326, "learning_rate": 7.448519446201648e-06, "loss": 1.2392, "step": 47760 }, { "epoch": 1826.2264150943397, "grad_norm": 1.8173426913493826, "learning_rate": 7.433020661357822e-06, "loss": 1.1921, "step": 47780 }, { "epoch": 1826.9811320754718, "grad_norm": 1.7763793595591069, "learning_rate": 7.417569440160968e-06, "loss": 1.2139, "step": 47800 }, { "epoch": 1827.7358490566037, "grad_norm": 2.151890225358, "learning_rate": 7.402165803568603e-06, "loss": 1.1918, "step": 47820 }, { "epoch": 1828.4905660377358, "grad_norm": 2.241696527607786, "learning_rate": 7.386809772473682e-06, "loss": 1.199, "step": 47840 }, { "epoch": 1829.245283018868, "grad_norm": 1.904140122730207, "learning_rate": 7.371501367704594e-06, "loss": 1.175, "step": 47860 }, { "epoch": 1830.0, "grad_norm": 2.2057960272933035, "learning_rate": 7.356240610025147e-06, "loss": 1.2026, "step": 47880 }, { "epoch": 1830.754716981132, "grad_norm": 1.992268245473379, "learning_rate": 7.341027520134496e-06, "loss": 1.2226, "step": 47900 }, { "epoch": 1831.5094339622642, "grad_norm": 1.8558557959544568, "learning_rate": 7.325862118667166e-06, "loss": 1.1879, "step": 47920 }, { "epoch": 1832.2641509433963, "grad_norm": 2.649917575243484, "learning_rate": 7.3107444261929805e-06, "loss": 1.2128, "step": 47940 }, { "epoch": 1833.0188679245282, "grad_norm": 1.838198084617481, "learning_rate": 7.295674463217053e-06, "loss": 1.1932, "step": 47960 }, { "epoch": 1833.7735849056603, "grad_norm": 1.6953634210974582, "learning_rate": 7.280652250179774e-06, "loss": 1.1964, "step": 47980 }, { "epoch": 1834.5283018867924, "grad_norm": 1.6312994813012875, "learning_rate": 7.26567780745675e-06, "loss": 1.1941, "step": 48000 }, { "epoch": 1835.2830188679245, "grad_norm": 2.0702401300059528, "learning_rate": 7.250751155358808e-06, "loss": 1.2005, "step": 48020 }, { "epoch": 1836.0377358490566, "grad_norm": 2.2642885375841395, "learning_rate": 7.2358723141319396e-06, "loss": 1.1894, "step": 48040 }, { "epoch": 1836.7924528301887, "grad_norm": 1.9527405946827057, "learning_rate": 7.2210413039573e-06, "loss": 1.182, "step": 48060 }, { "epoch": 1837.5471698113208, "grad_norm": 2.3498816913200984, "learning_rate": 7.206258144951163e-06, "loss": 1.1913, "step": 48080 }, { "epoch": 1838.301886792453, "grad_norm": 2.123796744980879, "learning_rate": 7.1915228571648876e-06, "loss": 1.2076, "step": 48100 }, { "epoch": 1839.0566037735848, "grad_norm": 1.7252206439090503, "learning_rate": 7.176835460584927e-06, "loss": 1.1861, "step": 48120 }, { "epoch": 1839.811320754717, "grad_norm": 1.8734365315429182, "learning_rate": 7.162195975132747e-06, "loss": 1.1826, "step": 48140 }, { "epoch": 1840.566037735849, "grad_norm": 2.316186234026582, "learning_rate": 7.147604420664858e-06, "loss": 1.177, "step": 48160 }, { "epoch": 1841.3207547169811, "grad_norm": 1.788911685930357, "learning_rate": 7.133060816972735e-06, "loss": 1.1844, "step": 48180 }, { "epoch": 1842.0754716981132, "grad_norm": 1.9701957941688446, "learning_rate": 7.118565183782816e-06, "loss": 1.211, "step": 48200 }, { "epoch": 1842.8301886792453, "grad_norm": 1.721205749217039, "learning_rate": 7.104117540756494e-06, "loss": 1.2045, "step": 48220 }, { "epoch": 1843.5849056603774, "grad_norm": 1.73943762799037, "learning_rate": 7.089717907490048e-06, "loss": 1.2005, "step": 48240 }, { "epoch": 1844.3396226415093, "grad_norm": 2.2562973087741587, "learning_rate": 7.07536630351465e-06, "loss": 1.21, "step": 48260 }, { "epoch": 1845.0943396226414, "grad_norm": 2.10020840598067, "learning_rate": 7.061062748296323e-06, "loss": 1.191, "step": 48280 }, { "epoch": 1845.8490566037735, "grad_norm": 1.8070339824697725, "learning_rate": 7.0468072612359105e-06, "loss": 1.193, "step": 48300 }, { "epoch": 1846.6037735849056, "grad_norm": 1.997808139499102, "learning_rate": 7.032599861669077e-06, "loss": 1.2014, "step": 48320 }, { "epoch": 1847.3584905660377, "grad_norm": 2.0691188983956277, "learning_rate": 7.018440568866245e-06, "loss": 1.1966, "step": 48340 }, { "epoch": 1848.1132075471698, "grad_norm": 2.403979236635362, "learning_rate": 7.004329402032594e-06, "loss": 1.1782, "step": 48360 }, { "epoch": 1848.867924528302, "grad_norm": 2.6004080171234385, "learning_rate": 6.9902663803080305e-06, "loss": 1.1804, "step": 48380 }, { "epoch": 1849.622641509434, "grad_norm": 1.9209297506929766, "learning_rate": 6.976251522767146e-06, "loss": 1.1743, "step": 48400 }, { "epoch": 1850.377358490566, "grad_norm": 1.870852128682101, "learning_rate": 6.962284848419221e-06, "loss": 1.1968, "step": 48420 }, { "epoch": 1851.132075471698, "grad_norm": 2.335271083007723, "learning_rate": 6.948366376208161e-06, "loss": 1.1848, "step": 48440 }, { "epoch": 1851.8867924528302, "grad_norm": 1.6640538101181919, "learning_rate": 6.93449612501252e-06, "loss": 1.201, "step": 48460 }, { "epoch": 1852.6415094339623, "grad_norm": 1.8790103798312214, "learning_rate": 6.920674113645418e-06, "loss": 1.167, "step": 48480 }, { "epoch": 1853.3962264150944, "grad_norm": 1.7578289543420753, "learning_rate": 6.906900360854565e-06, "loss": 1.2007, "step": 48500 }, { "epoch": 1854.1509433962265, "grad_norm": 1.8615274702658844, "learning_rate": 6.893174885322198e-06, "loss": 1.2056, "step": 48520 }, { "epoch": 1854.9056603773586, "grad_norm": 2.3236275376684143, "learning_rate": 6.879497705665089e-06, "loss": 1.1716, "step": 48540 }, { "epoch": 1855.6603773584907, "grad_norm": 2.1963583105378213, "learning_rate": 6.865868840434493e-06, "loss": 1.1769, "step": 48560 }, { "epoch": 1856.4150943396226, "grad_norm": 2.129799150022101, "learning_rate": 6.852288308116133e-06, "loss": 1.1861, "step": 48580 }, { "epoch": 1857.1698113207547, "grad_norm": 1.978798574769679, "learning_rate": 6.8387561271301765e-06, "loss": 1.1971, "step": 48600 }, { "epoch": 1857.9245283018868, "grad_norm": 2.2936585531474596, "learning_rate": 6.8252723158312055e-06, "loss": 1.1911, "step": 48620 }, { "epoch": 1858.6792452830189, "grad_norm": 1.860277938482895, "learning_rate": 6.81183689250821e-06, "loss": 1.1566, "step": 48640 }, { "epoch": 1859.433962264151, "grad_norm": 2.017669140863562, "learning_rate": 6.79844987538453e-06, "loss": 1.1728, "step": 48660 }, { "epoch": 1860.188679245283, "grad_norm": 2.098145579199566, "learning_rate": 6.785111282617849e-06, "loss": 1.1934, "step": 48680 }, { "epoch": 1860.9433962264152, "grad_norm": 1.835823989245946, "learning_rate": 6.771821132300191e-06, "loss": 1.1621, "step": 48700 }, { "epoch": 1861.698113207547, "grad_norm": 1.9612631399268534, "learning_rate": 6.7585794424578464e-06, "loss": 1.1911, "step": 48720 }, { "epoch": 1862.4528301886792, "grad_norm": 2.0755400141270464, "learning_rate": 6.745386231051399e-06, "loss": 1.1804, "step": 48740 }, { "epoch": 1863.2075471698113, "grad_norm": 2.2075989618364984, "learning_rate": 6.732241515975663e-06, "loss": 1.1933, "step": 48760 }, { "epoch": 1863.9622641509434, "grad_norm": 1.9214092744343696, "learning_rate": 6.719145315059678e-06, "loss": 1.1913, "step": 48780 }, { "epoch": 1864.7169811320755, "grad_norm": 2.3557809190891703, "learning_rate": 6.7060976460666846e-06, "loss": 1.1905, "step": 48800 }, { "epoch": 1865.4716981132076, "grad_norm": 2.114305919520162, "learning_rate": 6.693098526694083e-06, "loss": 1.2047, "step": 48820 }, { "epoch": 1866.2264150943397, "grad_norm": 1.8242775313878226, "learning_rate": 6.680147974573452e-06, "loss": 1.1933, "step": 48840 }, { "epoch": 1866.9811320754718, "grad_norm": 2.1056639763813956, "learning_rate": 6.66724600727046e-06, "loss": 1.1808, "step": 48860 }, { "epoch": 1867.7358490566037, "grad_norm": 1.7165725449830957, "learning_rate": 6.654392642284892e-06, "loss": 1.1782, "step": 48880 }, { "epoch": 1868.4905660377358, "grad_norm": 1.7341902387718784, "learning_rate": 6.6415878970506175e-06, "loss": 1.179, "step": 48900 }, { "epoch": 1869.245283018868, "grad_norm": 1.7667425869444906, "learning_rate": 6.6288317889355535e-06, "loss": 1.1754, "step": 48920 }, { "epoch": 1870.0, "grad_norm": 2.2994252135110655, "learning_rate": 6.616124335241648e-06, "loss": 1.1992, "step": 48940 }, { "epoch": 1870.754716981132, "grad_norm": 1.7046658246235185, "learning_rate": 6.603465553204852e-06, "loss": 1.1811, "step": 48960 }, { "epoch": 1871.5094339622642, "grad_norm": 2.1898673540015428, "learning_rate": 6.5908554599951e-06, "loss": 1.178, "step": 48980 }, { "epoch": 1872.2641509433963, "grad_norm": 1.9804369829197095, "learning_rate": 6.578294072716292e-06, "loss": 1.1989, "step": 49000 }, { "epoch": 1873.0188679245282, "grad_norm": 2.065726596455928, "learning_rate": 6.565781408406267e-06, "loss": 1.1931, "step": 49020 }, { "epoch": 1873.7735849056603, "grad_norm": 2.2467788121970123, "learning_rate": 6.553317484036772e-06, "loss": 1.2074, "step": 49040 }, { "epoch": 1874.5283018867924, "grad_norm": 1.991691603079823, "learning_rate": 6.5409023165134424e-06, "loss": 1.1983, "step": 49060 }, { "epoch": 1875.2830188679245, "grad_norm": 2.106299625577455, "learning_rate": 6.528535922675781e-06, "loss": 1.1956, "step": 49080 }, { "epoch": 1876.0377358490566, "grad_norm": 2.037693251120139, "learning_rate": 6.516218319297147e-06, "loss": 1.185, "step": 49100 }, { "epoch": 1876.7924528301887, "grad_norm": 2.3718612692091763, "learning_rate": 6.503949523084718e-06, "loss": 1.1859, "step": 49120 }, { "epoch": 1877.5471698113208, "grad_norm": 1.9858435056818156, "learning_rate": 6.491729550679461e-06, "loss": 1.2076, "step": 49140 }, { "epoch": 1878.301886792453, "grad_norm": 2.147814028235424, "learning_rate": 6.479558418656134e-06, "loss": 1.1682, "step": 49160 }, { "epoch": 1879.0566037735848, "grad_norm": 1.7628164718106505, "learning_rate": 6.467436143523228e-06, "loss": 1.1791, "step": 49180 }, { "epoch": 1879.811320754717, "grad_norm": 1.9837896355936764, "learning_rate": 6.455362741722995e-06, "loss": 1.1977, "step": 49200 }, { "epoch": 1880.566037735849, "grad_norm": 2.0613808893064327, "learning_rate": 6.44333822963138e-06, "loss": 1.1738, "step": 49220 }, { "epoch": 1881.3207547169811, "grad_norm": 1.6736931908615154, "learning_rate": 6.431362623558018e-06, "loss": 1.1774, "step": 49240 }, { "epoch": 1882.0754716981132, "grad_norm": 1.691911714014794, "learning_rate": 6.4194359397462055e-06, "loss": 1.1666, "step": 49260 }, { "epoch": 1882.8301886792453, "grad_norm": 2.1345911027894138, "learning_rate": 6.4075581943728944e-06, "loss": 1.1973, "step": 49280 }, { "epoch": 1883.5849056603774, "grad_norm": 1.9512349129787812, "learning_rate": 6.395729403548645e-06, "loss": 1.1672, "step": 49300 }, { "epoch": 1884.3396226415093, "grad_norm": 2.6451924153676125, "learning_rate": 6.383949583317629e-06, "loss": 1.1695, "step": 49320 }, { "epoch": 1885.0943396226414, "grad_norm": 1.783294063259621, "learning_rate": 6.372218749657584e-06, "loss": 1.1648, "step": 49340 }, { "epoch": 1885.8490566037735, "grad_norm": 2.3799777683561967, "learning_rate": 6.360536918479806e-06, "loss": 1.1776, "step": 49360 }, { "epoch": 1886.6037735849056, "grad_norm": 2.0124780882138347, "learning_rate": 6.348904105629139e-06, "loss": 1.1884, "step": 49380 }, { "epoch": 1887.3584905660377, "grad_norm": 1.8426672524927896, "learning_rate": 6.3373203268839345e-06, "loss": 1.1842, "step": 49400 }, { "epoch": 1888.1132075471698, "grad_norm": 2.0963675882931274, "learning_rate": 6.325785597956021e-06, "loss": 1.1807, "step": 49420 }, { "epoch": 1888.867924528302, "grad_norm": 1.9137633109249375, "learning_rate": 6.314299934490717e-06, "loss": 1.1932, "step": 49440 }, { "epoch": 1889.622641509434, "grad_norm": 2.2927620018796033, "learning_rate": 6.3028633520667744e-06, "loss": 1.186, "step": 49460 }, { "epoch": 1890.377358490566, "grad_norm": 2.7433533031518182, "learning_rate": 6.291475866196384e-06, "loss": 1.1363, "step": 49480 }, { "epoch": 1891.132075471698, "grad_norm": 1.563437023715403, "learning_rate": 6.280137492325147e-06, "loss": 1.2093, "step": 49500 }, { "epoch": 1891.8867924528302, "grad_norm": 2.793822111662886, "learning_rate": 6.2688482458320434e-06, "loss": 1.1751, "step": 49520 }, { "epoch": 1892.6415094339623, "grad_norm": 1.8418670948197584, "learning_rate": 6.25760814202941e-06, "loss": 1.1658, "step": 49540 }, { "epoch": 1893.3962264150944, "grad_norm": 1.8332744098429328, "learning_rate": 6.246417196162944e-06, "loss": 1.1654, "step": 49560 }, { "epoch": 1894.1509433962265, "grad_norm": 1.7817661421186255, "learning_rate": 6.235275423411659e-06, "loss": 1.1764, "step": 49580 }, { "epoch": 1894.9056603773586, "grad_norm": 1.9495189221186473, "learning_rate": 6.224182838887876e-06, "loss": 1.1529, "step": 49600 }, { "epoch": 1895.6603773584907, "grad_norm": 2.4039058315851447, "learning_rate": 6.213139457637196e-06, "loss": 1.1747, "step": 49620 }, { "epoch": 1896.4150943396226, "grad_norm": 1.875771973172552, "learning_rate": 6.202145294638478e-06, "loss": 1.1821, "step": 49640 }, { "epoch": 1897.1698113207547, "grad_norm": 2.1999372490425393, "learning_rate": 6.191200364803824e-06, "loss": 1.1813, "step": 49660 }, { "epoch": 1897.9245283018868, "grad_norm": 11.988202176475387, "learning_rate": 6.180304682978568e-06, "loss": 1.1569, "step": 49680 }, { "epoch": 1898.6792452830189, "grad_norm": 1.8662829336756046, "learning_rate": 6.169458263941242e-06, "loss": 1.1816, "step": 49700 }, { "epoch": 1899.433962264151, "grad_norm": 2.3098966440534294, "learning_rate": 6.158661122403553e-06, "loss": 1.1581, "step": 49720 }, { "epoch": 1900.188679245283, "grad_norm": 2.332658522584547, "learning_rate": 6.1479132730103704e-06, "loss": 1.1946, "step": 49740 }, { "epoch": 1900.9433962264152, "grad_norm": 1.7105735490477962, "learning_rate": 6.137214730339707e-06, "loss": 1.1868, "step": 49760 }, { "epoch": 1901.698113207547, "grad_norm": 2.243808666742797, "learning_rate": 6.126565508902698e-06, "loss": 1.1599, "step": 49780 }, { "epoch": 1902.4528301886792, "grad_norm": 1.8783312097697262, "learning_rate": 6.115965623143589e-06, "loss": 1.1621, "step": 49800 }, { "epoch": 1903.2075471698113, "grad_norm": 1.7166955372139616, "learning_rate": 6.105415087439699e-06, "loss": 1.1862, "step": 49820 }, { "epoch": 1903.9622641509434, "grad_norm": 2.0340234917391524, "learning_rate": 6.094913916101413e-06, "loss": 1.1561, "step": 49840 }, { "epoch": 1904.7169811320755, "grad_norm": 1.8220266868042787, "learning_rate": 6.084462123372144e-06, "loss": 1.1749, "step": 49860 }, { "epoch": 1905.4716981132076, "grad_norm": 2.3373227334868973, "learning_rate": 6.07405972342837e-06, "loss": 1.2081, "step": 49880 }, { "epoch": 1906.2264150943397, "grad_norm": 2.31770817514565, "learning_rate": 6.063706730379534e-06, "loss": 1.1705, "step": 49900 }, { "epoch": 1906.9811320754718, "grad_norm": 1.749701769225866, "learning_rate": 6.053403158268086e-06, "loss": 1.1732, "step": 49920 }, { "epoch": 1907.7358490566037, "grad_norm": 2.0136702275524736, "learning_rate": 6.043149021069432e-06, "loss": 1.1789, "step": 49940 }, { "epoch": 1908.4905660377358, "grad_norm": 1.8991267563990468, "learning_rate": 6.032944332691932e-06, "loss": 1.1691, "step": 49960 }, { "epoch": 1909.245283018868, "grad_norm": 2.178284333271757, "learning_rate": 6.02278910697688e-06, "loss": 1.1698, "step": 49980 }, { "epoch": 1910.0, "grad_norm": 2.270879262505861, "learning_rate": 6.012683357698476e-06, "loss": 1.1424, "step": 50000 }, { "epoch": 1910.754716981132, "grad_norm": 1.6800724855002753, "learning_rate": 6.0026270985638094e-06, "loss": 1.1405, "step": 50020 }, { "epoch": 1911.5094339622642, "grad_norm": 1.9982510142589247, "learning_rate": 5.9926203432128405e-06, "loss": 1.1811, "step": 50040 }, { "epoch": 1912.2641509433963, "grad_norm": 2.072677046394058, "learning_rate": 5.98266310521839e-06, "loss": 1.1832, "step": 50060 }, { "epoch": 1913.0188679245282, "grad_norm": 1.8518177038658126, "learning_rate": 5.972755398086119e-06, "loss": 1.1768, "step": 50080 }, { "epoch": 1913.7735849056603, "grad_norm": 1.7233232781661019, "learning_rate": 5.9628972352545016e-06, "loss": 1.1916, "step": 50100 }, { "epoch": 1914.5283018867924, "grad_norm": 2.2438247684764776, "learning_rate": 5.953088630094804e-06, "loss": 1.1965, "step": 50120 }, { "epoch": 1915.2830188679245, "grad_norm": 2.354329582753457, "learning_rate": 5.943329595911085e-06, "loss": 1.1657, "step": 50140 }, { "epoch": 1916.0377358490566, "grad_norm": 2.0821470705714558, "learning_rate": 5.933620145940163e-06, "loss": 1.1733, "step": 50160 }, { "epoch": 1916.7924528301887, "grad_norm": 2.3851614247004513, "learning_rate": 5.92396029335161e-06, "loss": 1.1973, "step": 50180 }, { "epoch": 1917.5471698113208, "grad_norm": 2.165021041548156, "learning_rate": 5.91435005124771e-06, "loss": 1.1605, "step": 50200 }, { "epoch": 1918.301886792453, "grad_norm": 1.8316074131304803, "learning_rate": 5.904789432663471e-06, "loss": 1.175, "step": 50220 }, { "epoch": 1919.0566037735848, "grad_norm": 2.2444762984325517, "learning_rate": 5.8952784505665775e-06, "loss": 1.1546, "step": 50240 }, { "epoch": 1919.811320754717, "grad_norm": 1.990381226210719, "learning_rate": 5.885817117857409e-06, "loss": 1.1734, "step": 50260 }, { "epoch": 1920.566037735849, "grad_norm": 2.1012346230151935, "learning_rate": 5.876405447368989e-06, "loss": 1.1726, "step": 50280 }, { "epoch": 1921.3207547169811, "grad_norm": 2.036740240000707, "learning_rate": 5.867043451866989e-06, "loss": 1.1858, "step": 50300 }, { "epoch": 1922.0754716981132, "grad_norm": 1.9828813541843844, "learning_rate": 5.85773114404969e-06, "loss": 1.1523, "step": 50320 }, { "epoch": 1922.8301886792453, "grad_norm": 2.1278126328460196, "learning_rate": 5.848468536547991e-06, "loss": 1.1886, "step": 50340 }, { "epoch": 1923.5849056603774, "grad_norm": 2.315788726027488, "learning_rate": 5.8392556419253755e-06, "loss": 1.1686, "step": 50360 }, { "epoch": 1924.3396226415093, "grad_norm": 2.3735624423680117, "learning_rate": 5.830092472677899e-06, "loss": 1.1584, "step": 50380 }, { "epoch": 1925.0943396226414, "grad_norm": 2.3872016424634093, "learning_rate": 5.820979041234169e-06, "loss": 1.1859, "step": 50400 }, { "epoch": 1925.8490566037735, "grad_norm": 2.0316120352053115, "learning_rate": 5.811915359955322e-06, "loss": 1.1578, "step": 50420 }, { "epoch": 1926.6037735849056, "grad_norm": 1.9955741026809004, "learning_rate": 5.8029014411350336e-06, "loss": 1.1699, "step": 50440 }, { "epoch": 1927.3584905660377, "grad_norm": 2.671513853147586, "learning_rate": 5.793937296999476e-06, "loss": 1.1613, "step": 50460 }, { "epoch": 1928.1132075471698, "grad_norm": 2.085910842457962, "learning_rate": 5.785022939707302e-06, "loss": 1.1919, "step": 50480 }, { "epoch": 1928.867924528302, "grad_norm": 2.15354309947986, "learning_rate": 5.77615838134964e-06, "loss": 1.1766, "step": 50500 }, { "epoch": 1929.622641509434, "grad_norm": 2.5693650339132463, "learning_rate": 5.76734363395007e-06, "loss": 1.175, "step": 50520 }, { "epoch": 1930.377358490566, "grad_norm": 3.2479427163076533, "learning_rate": 5.7585787094646196e-06, "loss": 1.1703, "step": 50540 }, { "epoch": 1931.132075471698, "grad_norm": 2.1161416369904695, "learning_rate": 5.749863619781723e-06, "loss": 1.1657, "step": 50560 }, { "epoch": 1931.8867924528302, "grad_norm": 1.7714808950845444, "learning_rate": 5.7411983767222415e-06, "loss": 1.1717, "step": 50580 }, { "epoch": 1932.6415094339623, "grad_norm": 2.535706381586084, "learning_rate": 5.732582992039398e-06, "loss": 1.1553, "step": 50600 }, { "epoch": 1933.3962264150944, "grad_norm": 2.1037009756450527, "learning_rate": 5.724017477418814e-06, "loss": 1.1771, "step": 50620 }, { "epoch": 1934.1509433962265, "grad_norm": 1.849746935628885, "learning_rate": 5.7155018444784526e-06, "loss": 1.1422, "step": 50640 }, { "epoch": 1934.9056603773586, "grad_norm": 1.9210522037566025, "learning_rate": 5.707036104768635e-06, "loss": 1.1756, "step": 50660 }, { "epoch": 1935.6603773584907, "grad_norm": 1.819206038769788, "learning_rate": 5.698620269771997e-06, "loss": 1.1916, "step": 50680 }, { "epoch": 1936.4150943396226, "grad_norm": 2.5377205844625417, "learning_rate": 5.690254350903488e-06, "loss": 1.1619, "step": 50700 }, { "epoch": 1937.1698113207547, "grad_norm": 2.2007521096063902, "learning_rate": 5.681938359510347e-06, "loss": 1.1846, "step": 50720 }, { "epoch": 1937.9245283018868, "grad_norm": 2.192606880082283, "learning_rate": 5.673672306872103e-06, "loss": 1.1699, "step": 50740 }, { "epoch": 1938.6792452830189, "grad_norm": 2.1766069540448436, "learning_rate": 5.665456204200552e-06, "loss": 1.1871, "step": 50760 }, { "epoch": 1939.433962264151, "grad_norm": 1.9751649291014899, "learning_rate": 5.657290062639727e-06, "loss": 1.1474, "step": 50780 }, { "epoch": 1940.188679245283, "grad_norm": 1.890603847246591, "learning_rate": 5.6491738932659e-06, "loss": 1.1559, "step": 50800 }, { "epoch": 1940.9433962264152, "grad_norm": 1.975301210235016, "learning_rate": 5.641107707087573e-06, "loss": 1.1521, "step": 50820 }, { "epoch": 1941.698113207547, "grad_norm": 1.8441779800267277, "learning_rate": 5.6330915150454375e-06, "loss": 1.145, "step": 50840 }, { "epoch": 1942.4528301886792, "grad_norm": 2.4451642203033064, "learning_rate": 5.625125328012387e-06, "loss": 1.1791, "step": 50860 }, { "epoch": 1943.2075471698113, "grad_norm": 2.127622782788785, "learning_rate": 5.617209156793476e-06, "loss": 1.1471, "step": 50880 }, { "epoch": 1943.9622641509434, "grad_norm": 2.0779587981444427, "learning_rate": 5.609343012125934e-06, "loss": 1.1537, "step": 50900 }, { "epoch": 1944.7169811320755, "grad_norm": 2.1654459900473872, "learning_rate": 5.601526904679125e-06, "loss": 1.1609, "step": 50920 }, { "epoch": 1945.4716981132076, "grad_norm": 1.8696254811746238, "learning_rate": 5.593760845054552e-06, "loss": 1.1523, "step": 50940 }, { "epoch": 1946.2264150943397, "grad_norm": 1.7856722997496786, "learning_rate": 5.586044843785832e-06, "loss": 1.2012, "step": 50960 }, { "epoch": 1946.9811320754718, "grad_norm": 1.9401895010628936, "learning_rate": 5.578378911338684e-06, "loss": 1.1384, "step": 50980 }, { "epoch": 1947.7358490566037, "grad_norm": 2.1643993698581077, "learning_rate": 5.570763058110911e-06, "loss": 1.1645, "step": 51000 }, { "epoch": 1948.4905660377358, "grad_norm": 2.180473463448981, "learning_rate": 5.563197294432395e-06, "loss": 1.1382, "step": 51020 }, { "epoch": 1949.245283018868, "grad_norm": 1.9596617879790443, "learning_rate": 5.555681630565088e-06, "loss": 1.1539, "step": 51040 }, { "epoch": 1950.0, "grad_norm": 1.8975845056567062, "learning_rate": 5.548216076702974e-06, "loss": 1.144, "step": 51060 }, { "epoch": 1950.754716981132, "grad_norm": 2.188450696803476, "learning_rate": 5.540800642972071e-06, "loss": 1.1532, "step": 51080 }, { "epoch": 1951.5094339622642, "grad_norm": 1.8285975742024299, "learning_rate": 5.533435339430416e-06, "loss": 1.1949, "step": 51100 }, { "epoch": 1952.2641509433963, "grad_norm": 1.812418268110745, "learning_rate": 5.526120176068055e-06, "loss": 1.1613, "step": 51120 }, { "epoch": 1953.0188679245282, "grad_norm": 1.9694834187837782, "learning_rate": 5.518855162807036e-06, "loss": 1.1749, "step": 51140 }, { "epoch": 1953.7735849056603, "grad_norm": 2.014411336027095, "learning_rate": 5.511640309501359e-06, "loss": 1.1364, "step": 51160 }, { "epoch": 1954.5283018867924, "grad_norm": 1.663001146626253, "learning_rate": 5.504475625937011e-06, "loss": 1.1469, "step": 51180 }, { "epoch": 1955.2830188679245, "grad_norm": 3.7966652139269756, "learning_rate": 5.497361121831918e-06, "loss": 1.1634, "step": 51200 }, { "epoch": 1956.0377358490566, "grad_norm": 1.6321676665862368, "learning_rate": 5.490296806835955e-06, "loss": 1.1747, "step": 51220 }, { "epoch": 1956.7924528301887, "grad_norm": 1.8693551118320602, "learning_rate": 5.483282690530914e-06, "loss": 1.1513, "step": 51240 }, { "epoch": 1957.5471698113208, "grad_norm": 1.7254591597688926, "learning_rate": 5.476318782430499e-06, "loss": 1.1384, "step": 51260 }, { "epoch": 1958.301886792453, "grad_norm": 2.1059814541036284, "learning_rate": 5.469405091980319e-06, "loss": 1.145, "step": 51280 }, { "epoch": 1959.0566037735848, "grad_norm": 2.4150150564267956, "learning_rate": 5.462541628557862e-06, "loss": 1.1727, "step": 51300 }, { "epoch": 1959.811320754717, "grad_norm": 2.072986751089322, "learning_rate": 5.4557284014725005e-06, "loss": 1.1632, "step": 51320 }, { "epoch": 1960.566037735849, "grad_norm": 1.7011080715428424, "learning_rate": 5.448965419965458e-06, "loss": 1.1719, "step": 51340 }, { "epoch": 1961.3207547169811, "grad_norm": 2.050321684694806, "learning_rate": 5.442252693209813e-06, "loss": 1.1523, "step": 51360 }, { "epoch": 1962.0754716981132, "grad_norm": 2.3154046947609603, "learning_rate": 5.4355902303104744e-06, "loss": 1.1365, "step": 51380 }, { "epoch": 1962.8301886792453, "grad_norm": 2.292815295745735, "learning_rate": 5.4289780403041805e-06, "loss": 1.1595, "step": 51400 }, { "epoch": 1963.5849056603774, "grad_norm": 2.1444563447901253, "learning_rate": 5.422416132159477e-06, "loss": 1.1609, "step": 51420 }, { "epoch": 1964.3396226415093, "grad_norm": 1.8223405112306774, "learning_rate": 5.415904514776712e-06, "loss": 1.128, "step": 51440 }, { "epoch": 1965.0943396226414, "grad_norm": 1.9601698616488796, "learning_rate": 5.40944319698802e-06, "loss": 1.1785, "step": 51460 }, { "epoch": 1965.8490566037735, "grad_norm": 2.570580210466246, "learning_rate": 5.403032187557308e-06, "loss": 1.147, "step": 51480 }, { "epoch": 1966.6037735849056, "grad_norm": 2.2935471508100553, "learning_rate": 5.396671495180257e-06, "loss": 1.1777, "step": 51500 }, { "epoch": 1967.3584905660377, "grad_norm": 1.987678318177208, "learning_rate": 5.390361128484278e-06, "loss": 1.1283, "step": 51520 }, { "epoch": 1968.1132075471698, "grad_norm": 1.9732671384472393, "learning_rate": 5.38410109602855e-06, "loss": 1.1631, "step": 51540 }, { "epoch": 1968.867924528302, "grad_norm": 2.031999390800106, "learning_rate": 5.37789140630396e-06, "loss": 1.1498, "step": 51560 }, { "epoch": 1969.622641509434, "grad_norm": 3.612362619019224, "learning_rate": 5.3717320677331165e-06, "loss": 1.1449, "step": 51580 }, { "epoch": 1970.377358490566, "grad_norm": 2.7552728484649216, "learning_rate": 5.365623088670337e-06, "loss": 1.1221, "step": 51600 }, { "epoch": 1971.132075471698, "grad_norm": 2.095240315052155, "learning_rate": 5.359564477401625e-06, "loss": 1.1635, "step": 51620 }, { "epoch": 1971.8867924528302, "grad_norm": 2.051375314186468, "learning_rate": 5.353556242144684e-06, "loss": 1.1768, "step": 51640 }, { "epoch": 1972.6415094339623, "grad_norm": 1.7681282740339075, "learning_rate": 5.3475983910488705e-06, "loss": 1.1524, "step": 51660 }, { "epoch": 1973.3962264150944, "grad_norm": 1.9784394422194775, "learning_rate": 5.34169093219521e-06, "loss": 1.1694, "step": 51680 }, { "epoch": 1974.1509433962265, "grad_norm": 2.1190098778323887, "learning_rate": 5.3358338735963825e-06, "loss": 1.1546, "step": 51700 }, { "epoch": 1974.9056603773586, "grad_norm": 1.6461495183245571, "learning_rate": 5.3300272231966895e-06, "loss": 1.1597, "step": 51720 }, { "epoch": 1975.6603773584907, "grad_norm": 2.287937258261333, "learning_rate": 5.3242709888720875e-06, "loss": 1.1565, "step": 51740 }, { "epoch": 1976.4150943396226, "grad_norm": 1.971738312330891, "learning_rate": 5.318565178430121e-06, "loss": 1.1646, "step": 51760 }, { "epoch": 1977.1698113207547, "grad_norm": 1.733242389596805, "learning_rate": 5.312909799609962e-06, "loss": 1.1507, "step": 51780 }, { "epoch": 1977.9245283018868, "grad_norm": 2.2381830913006486, "learning_rate": 5.307304860082375e-06, "loss": 1.161, "step": 51800 }, { "epoch": 1978.6792452830189, "grad_norm": 1.7639744175828544, "learning_rate": 5.3017503674497e-06, "loss": 1.1639, "step": 51820 }, { "epoch": 1979.433962264151, "grad_norm": 2.142096606558369, "learning_rate": 5.296246329245867e-06, "loss": 1.145, "step": 51840 }, { "epoch": 1980.188679245283, "grad_norm": 2.1364940736571905, "learning_rate": 5.29079275293636e-06, "loss": 1.1445, "step": 51860 }, { "epoch": 1980.9433962264152, "grad_norm": 2.0408860512130063, "learning_rate": 5.285389645918224e-06, "loss": 1.1684, "step": 51880 }, { "epoch": 1981.698113207547, "grad_norm": 2.1484279394512984, "learning_rate": 5.280037015520047e-06, "loss": 1.1427, "step": 51900 }, { "epoch": 1982.4528301886792, "grad_norm": 1.8875817727112376, "learning_rate": 5.27473486900196e-06, "loss": 1.127, "step": 51920 }, { "epoch": 1983.2075471698113, "grad_norm": 1.9694696435513541, "learning_rate": 5.269483213555604e-06, "loss": 1.1631, "step": 51940 }, { "epoch": 1983.9622641509434, "grad_norm": 1.8852852930999937, "learning_rate": 5.264282056304144e-06, "loss": 1.1476, "step": 51960 }, { "epoch": 1984.7169811320755, "grad_norm": 2.0442189239889488, "learning_rate": 5.259131404302259e-06, "loss": 1.1772, "step": 51980 }, { "epoch": 1985.4716981132076, "grad_norm": 2.115147564108749, "learning_rate": 5.254031264536109e-06, "loss": 1.1451, "step": 52000 } ], "logging_steps": 20, "max_steps": 54000, "num_input_tokens_seen": 0, "num_train_epochs": 2077, "save_steps": 4000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8771778183168000.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }