diff --git "a/checkpoint-15000/trainer_state.json" "b/checkpoint-15000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-15000/trainer_state.json" @@ -0,0 +1,10661 @@ +{ + "best_global_step": 15000, + "best_metric": 1.7887755632400513, + "best_model_checkpoint": "./qwen3-30m-tinystories-checkpoints/checkpoint-15000", + "epoch": 0.8776035572197519, + "eval_steps": 1000, + "global_step": 15000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 5.850690381465013e-05, + "grad_norm": Infinity, + "learning_rate": 0.0, + "loss": 11.3909, + "step": 1 + }, + { + "epoch": 0.0005850690381465013, + "grad_norm": 74.58138275146484, + "learning_rate": 6.825273010920438e-08, + "loss": 11.4747, + "step": 10 + }, + { + "epoch": 0.0011701380762930026, + "grad_norm": 70.98603057861328, + "learning_rate": 1.6575663026521062e-07, + "loss": 11.4193, + "step": 20 + }, + { + "epoch": 0.0017552071144395038, + "grad_norm": 67.07854461669922, + "learning_rate": 2.632605304212169e-07, + "loss": 11.3702, + "step": 30 + }, + { + "epoch": 0.002340276152586005, + "grad_norm": 65.53945922851562, + "learning_rate": 3.6076443057722313e-07, + "loss": 11.1989, + "step": 40 + }, + { + "epoch": 0.0029253451907325064, + "grad_norm": 56.668155670166016, + "learning_rate": 4.582683307332293e-07, + "loss": 10.894, + "step": 50 + }, + { + "epoch": 0.0035104142288790077, + "grad_norm": 47.96501159667969, + "learning_rate": 5.557722308892356e-07, + "loss": 10.6112, + "step": 60 + }, + { + "epoch": 0.004095483267025509, + "grad_norm": 36.578582763671875, + "learning_rate": 6.532761310452418e-07, + "loss": 10.2903, + "step": 70 + }, + { + "epoch": 0.00468055230517201, + "grad_norm": 30.915164947509766, + "learning_rate": 7.507800312012481e-07, + "loss": 9.9734, + "step": 80 + }, + { + "epoch": 0.0052656213433185115, + "grad_norm": 24.249662399291992, + "learning_rate": 8.482839313572544e-07, + "loss": 9.7175, + "step": 90 + }, + { + "epoch": 0.005850690381465013, + "grad_norm": 17.660844802856445, + "learning_rate": 9.457878315132606e-07, + "loss": 9.4751, + "step": 100 + }, + { + "epoch": 0.006435759419611514, + "grad_norm": 14.554573059082031, + "learning_rate": 1.043291731669267e-06, + "loss": 9.2603, + "step": 110 + }, + { + "epoch": 0.007020828457758015, + "grad_norm": 12.098219871520996, + "learning_rate": 1.140795631825273e-06, + "loss": 9.0319, + "step": 120 + }, + { + "epoch": 0.007605897495904517, + "grad_norm": 10.675163269042969, + "learning_rate": 1.2382995319812794e-06, + "loss": 8.8657, + "step": 130 + }, + { + "epoch": 0.008190966534051018, + "grad_norm": 9.221511840820312, + "learning_rate": 1.3358034321372856e-06, + "loss": 8.6798, + "step": 140 + }, + { + "epoch": 0.00877603557219752, + "grad_norm": 8.275217056274414, + "learning_rate": 1.4333073322932917e-06, + "loss": 8.5072, + "step": 150 + }, + { + "epoch": 0.00936110461034402, + "grad_norm": 7.335099220275879, + "learning_rate": 1.5308112324492981e-06, + "loss": 8.3525, + "step": 160 + }, + { + "epoch": 0.009946173648490523, + "grad_norm": 6.6783833503723145, + "learning_rate": 1.6283151326053041e-06, + "loss": 8.2143, + "step": 170 + }, + { + "epoch": 0.010531242686637023, + "grad_norm": 5.890490531921387, + "learning_rate": 1.7258190327613103e-06, + "loss": 8.1013, + "step": 180 + }, + { + "epoch": 0.011116311724783525, + "grad_norm": 5.453526496887207, + "learning_rate": 1.823322932917317e-06, + "loss": 7.9714, + "step": 190 + }, + { + "epoch": 0.011701380762930026, + "grad_norm": 4.6703081130981445, + "learning_rate": 1.920826833073323e-06, + "loss": 7.8631, + "step": 200 + }, + { + "epoch": 0.012286449801076528, + "grad_norm": 4.414166450500488, + "learning_rate": 2.0183307332293293e-06, + "loss": 7.7655, + "step": 210 + }, + { + "epoch": 0.012871518839223028, + "grad_norm": 4.081826686859131, + "learning_rate": 2.1158346333853355e-06, + "loss": 7.6717, + "step": 220 + }, + { + "epoch": 0.01345658787736953, + "grad_norm": 4.213140487670898, + "learning_rate": 2.2133385335413417e-06, + "loss": 7.5924, + "step": 230 + }, + { + "epoch": 0.01404165691551603, + "grad_norm": 3.859170436859131, + "learning_rate": 2.310842433697348e-06, + "loss": 7.511, + "step": 240 + }, + { + "epoch": 0.014626725953662533, + "grad_norm": 3.9012818336486816, + "learning_rate": 2.408346333853354e-06, + "loss": 7.4278, + "step": 250 + }, + { + "epoch": 0.015211794991809033, + "grad_norm": 3.8075311183929443, + "learning_rate": 2.5058502340093603e-06, + "loss": 7.3492, + "step": 260 + }, + { + "epoch": 0.015796864029955535, + "grad_norm": 3.5343337059020996, + "learning_rate": 2.603354134165367e-06, + "loss": 7.2719, + "step": 270 + }, + { + "epoch": 0.016381933068102036, + "grad_norm": 3.5580379962921143, + "learning_rate": 2.700858034321373e-06, + "loss": 7.2373, + "step": 280 + }, + { + "epoch": 0.016967002106248536, + "grad_norm": 3.4421544075012207, + "learning_rate": 2.7983619344773792e-06, + "loss": 7.1697, + "step": 290 + }, + { + "epoch": 0.01755207114439504, + "grad_norm": 3.7167348861694336, + "learning_rate": 2.8958658346333854e-06, + "loss": 7.0948, + "step": 300 + }, + { + "epoch": 0.01813714018254154, + "grad_norm": 4.353510856628418, + "learning_rate": 2.993369734789392e-06, + "loss": 7.0373, + "step": 310 + }, + { + "epoch": 0.01872220922068804, + "grad_norm": 4.191625118255615, + "learning_rate": 3.0908736349453982e-06, + "loss": 6.9602, + "step": 320 + }, + { + "epoch": 0.01930727825883454, + "grad_norm": 4.205361843109131, + "learning_rate": 3.188377535101404e-06, + "loss": 6.903, + "step": 330 + }, + { + "epoch": 0.019892347296981045, + "grad_norm": 4.100222110748291, + "learning_rate": 3.28588143525741e-06, + "loss": 6.8439, + "step": 340 + }, + { + "epoch": 0.020477416335127546, + "grad_norm": 4.314079284667969, + "learning_rate": 3.3833853354134164e-06, + "loss": 6.7787, + "step": 350 + }, + { + "epoch": 0.021062485373274046, + "grad_norm": 6.652833938598633, + "learning_rate": 3.4808892355694226e-06, + "loss": 6.7226, + "step": 360 + }, + { + "epoch": 0.021647554411420546, + "grad_norm": 4.056787014007568, + "learning_rate": 3.5783931357254296e-06, + "loss": 6.664, + "step": 370 + }, + { + "epoch": 0.02223262344956705, + "grad_norm": 4.815715312957764, + "learning_rate": 3.675897035881436e-06, + "loss": 6.6125, + "step": 380 + }, + { + "epoch": 0.02281769248771355, + "grad_norm": 4.876301288604736, + "learning_rate": 3.773400936037442e-06, + "loss": 6.5621, + "step": 390 + }, + { + "epoch": 0.02340276152586005, + "grad_norm": 4.692805290222168, + "learning_rate": 3.870904836193448e-06, + "loss": 6.5075, + "step": 400 + }, + { + "epoch": 0.02398783056400655, + "grad_norm": 4.293951034545898, + "learning_rate": 3.968408736349454e-06, + "loss": 6.4574, + "step": 410 + }, + { + "epoch": 0.024572899602153055, + "grad_norm": 4.072322845458984, + "learning_rate": 4.06591263650546e-06, + "loss": 6.3977, + "step": 420 + }, + { + "epoch": 0.025157968640299556, + "grad_norm": 5.009401321411133, + "learning_rate": 4.163416536661466e-06, + "loss": 6.3332, + "step": 430 + }, + { + "epoch": 0.025743037678446056, + "grad_norm": 4.7960286140441895, + "learning_rate": 4.2609204368174725e-06, + "loss": 6.2778, + "step": 440 + }, + { + "epoch": 0.026328106716592557, + "grad_norm": 3.423039197921753, + "learning_rate": 4.358424336973479e-06, + "loss": 6.2403, + "step": 450 + }, + { + "epoch": 0.02691317575473906, + "grad_norm": 4.546107292175293, + "learning_rate": 4.455928237129486e-06, + "loss": 6.1845, + "step": 460 + }, + { + "epoch": 0.02749824479288556, + "grad_norm": 5.357204914093018, + "learning_rate": 4.553432137285492e-06, + "loss": 6.1027, + "step": 470 + }, + { + "epoch": 0.02808331383103206, + "grad_norm": 4.221200942993164, + "learning_rate": 4.650936037441498e-06, + "loss": 6.0522, + "step": 480 + }, + { + "epoch": 0.02866838286917856, + "grad_norm": 3.5187366008758545, + "learning_rate": 4.748439937597504e-06, + "loss": 6.0063, + "step": 490 + }, + { + "epoch": 0.029253451907325066, + "grad_norm": 4.514864444732666, + "learning_rate": 4.8459438377535105e-06, + "loss": 5.9795, + "step": 500 + }, + { + "epoch": 0.029838520945471566, + "grad_norm": 3.3947811126708984, + "learning_rate": 4.943447737909517e-06, + "loss": 5.9023, + "step": 510 + }, + { + "epoch": 0.030423589983618066, + "grad_norm": 5.161691665649414, + "learning_rate": 5.040951638065523e-06, + "loss": 5.8573, + "step": 520 + }, + { + "epoch": 0.031008659021764567, + "grad_norm": 5.400602340698242, + "learning_rate": 5.138455538221529e-06, + "loss": 5.8028, + "step": 530 + }, + { + "epoch": 0.03159372805991107, + "grad_norm": 4.410223007202148, + "learning_rate": 5.235959438377535e-06, + "loss": 5.7999, + "step": 540 + }, + { + "epoch": 0.03217879709805757, + "grad_norm": 3.708657741546631, + "learning_rate": 5.3334633385335414e-06, + "loss": 5.7253, + "step": 550 + }, + { + "epoch": 0.03276386613620407, + "grad_norm": 5.148003101348877, + "learning_rate": 5.430967238689548e-06, + "loss": 5.6967, + "step": 560 + }, + { + "epoch": 0.03334893517435057, + "grad_norm": 5.288032054901123, + "learning_rate": 5.528471138845554e-06, + "loss": 5.6411, + "step": 570 + }, + { + "epoch": 0.03393400421249707, + "grad_norm": 3.524038314819336, + "learning_rate": 5.62597503900156e-06, + "loss": 5.6076, + "step": 580 + }, + { + "epoch": 0.03451907325064357, + "grad_norm": 3.5057692527770996, + "learning_rate": 5.723478939157566e-06, + "loss": 5.5643, + "step": 590 + }, + { + "epoch": 0.03510414228879008, + "grad_norm": 4.593623161315918, + "learning_rate": 5.820982839313572e-06, + "loss": 5.5258, + "step": 600 + }, + { + "epoch": 0.03568921132693658, + "grad_norm": 3.617720127105713, + "learning_rate": 5.918486739469579e-06, + "loss": 5.4787, + "step": 610 + }, + { + "epoch": 0.03627428036508308, + "grad_norm": 3.6132466793060303, + "learning_rate": 6.015990639625586e-06, + "loss": 5.4579, + "step": 620 + }, + { + "epoch": 0.03685934940322958, + "grad_norm": 4.541604518890381, + "learning_rate": 6.113494539781592e-06, + "loss": 5.4188, + "step": 630 + }, + { + "epoch": 0.03744441844137608, + "grad_norm": 4.4366254806518555, + "learning_rate": 6.210998439937598e-06, + "loss": 5.3733, + "step": 640 + }, + { + "epoch": 0.03802948747952258, + "grad_norm": 4.232390403747559, + "learning_rate": 6.308502340093604e-06, + "loss": 5.3406, + "step": 650 + }, + { + "epoch": 0.03861455651766908, + "grad_norm": 3.4591336250305176, + "learning_rate": 6.4060062402496095e-06, + "loss": 5.2955, + "step": 660 + }, + { + "epoch": 0.03919962555581558, + "grad_norm": 5.510738849639893, + "learning_rate": 6.5035101404056166e-06, + "loss": 5.2574, + "step": 670 + }, + { + "epoch": 0.03978469459396209, + "grad_norm": 5.914499282836914, + "learning_rate": 6.601014040561624e-06, + "loss": 5.2101, + "step": 680 + }, + { + "epoch": 0.04036976363210859, + "grad_norm": 4.961345672607422, + "learning_rate": 6.698517940717629e-06, + "loss": 5.2078, + "step": 690 + }, + { + "epoch": 0.04095483267025509, + "grad_norm": 4.458784580230713, + "learning_rate": 6.796021840873636e-06, + "loss": 5.1342, + "step": 700 + }, + { + "epoch": 0.04153990170840159, + "grad_norm": 4.344489574432373, + "learning_rate": 6.893525741029641e-06, + "loss": 5.1115, + "step": 710 + }, + { + "epoch": 0.04212497074654809, + "grad_norm": 3.6247169971466064, + "learning_rate": 6.991029641185648e-06, + "loss": 5.1027, + "step": 720 + }, + { + "epoch": 0.04271003978469459, + "grad_norm": 4.234716415405273, + "learning_rate": 7.088533541341654e-06, + "loss": 5.0847, + "step": 730 + }, + { + "epoch": 0.04329510882284109, + "grad_norm": 3.6908812522888184, + "learning_rate": 7.18603744149766e-06, + "loss": 5.0225, + "step": 740 + }, + { + "epoch": 0.04388017786098759, + "grad_norm": 4.514861583709717, + "learning_rate": 7.283541341653667e-06, + "loss": 5.0108, + "step": 750 + }, + { + "epoch": 0.0444652468991341, + "grad_norm": 4.391042709350586, + "learning_rate": 7.381045241809672e-06, + "loss": 4.9743, + "step": 760 + }, + { + "epoch": 0.0450503159372806, + "grad_norm": 4.511202335357666, + "learning_rate": 7.478549141965679e-06, + "loss": 4.964, + "step": 770 + }, + { + "epoch": 0.0456353849754271, + "grad_norm": 4.672698497772217, + "learning_rate": 7.576053042121685e-06, + "loss": 4.9322, + "step": 780 + }, + { + "epoch": 0.0462204540135736, + "grad_norm": 5.7832207679748535, + "learning_rate": 7.673556942277693e-06, + "loss": 4.9004, + "step": 790 + }, + { + "epoch": 0.0468055230517201, + "grad_norm": 5.457101345062256, + "learning_rate": 7.771060842433697e-06, + "loss": 4.8727, + "step": 800 + }, + { + "epoch": 0.0473905920898666, + "grad_norm": 5.284400463104248, + "learning_rate": 7.868564742589705e-06, + "loss": 4.8428, + "step": 810 + }, + { + "epoch": 0.0479756611280131, + "grad_norm": 5.2293829917907715, + "learning_rate": 7.96606864274571e-06, + "loss": 4.8219, + "step": 820 + }, + { + "epoch": 0.04856073016615961, + "grad_norm": 4.629702091217041, + "learning_rate": 8.063572542901716e-06, + "loss": 4.7985, + "step": 830 + }, + { + "epoch": 0.04914579920430611, + "grad_norm": 4.912006378173828, + "learning_rate": 8.161076443057723e-06, + "loss": 4.8071, + "step": 840 + }, + { + "epoch": 0.04973086824245261, + "grad_norm": 4.5750346183776855, + "learning_rate": 8.258580343213728e-06, + "loss": 4.7561, + "step": 850 + }, + { + "epoch": 0.05031593728059911, + "grad_norm": 4.587753772735596, + "learning_rate": 8.356084243369736e-06, + "loss": 4.7323, + "step": 860 + }, + { + "epoch": 0.05090100631874561, + "grad_norm": 4.676549434661865, + "learning_rate": 8.45358814352574e-06, + "loss": 4.6787, + "step": 870 + }, + { + "epoch": 0.05148607535689211, + "grad_norm": 4.039685249328613, + "learning_rate": 8.551092043681748e-06, + "loss": 4.6808, + "step": 880 + }, + { + "epoch": 0.05207114439503861, + "grad_norm": 4.448736667633057, + "learning_rate": 8.648595943837753e-06, + "loss": 4.6849, + "step": 890 + }, + { + "epoch": 0.05265621343318511, + "grad_norm": 4.613405227661133, + "learning_rate": 8.74609984399376e-06, + "loss": 4.6309, + "step": 900 + }, + { + "epoch": 0.05324128247133162, + "grad_norm": 4.352312088012695, + "learning_rate": 8.843603744149765e-06, + "loss": 4.6045, + "step": 910 + }, + { + "epoch": 0.05382635150947812, + "grad_norm": 7.403280258178711, + "learning_rate": 8.941107644305773e-06, + "loss": 4.6065, + "step": 920 + }, + { + "epoch": 0.05441142054762462, + "grad_norm": 6.342307090759277, + "learning_rate": 9.03861154446178e-06, + "loss": 4.5865, + "step": 930 + }, + { + "epoch": 0.05499648958577112, + "grad_norm": 4.280608177185059, + "learning_rate": 9.136115444617785e-06, + "loss": 4.5471, + "step": 940 + }, + { + "epoch": 0.05558155862391762, + "grad_norm": 5.916503429412842, + "learning_rate": 9.233619344773792e-06, + "loss": 4.5704, + "step": 950 + }, + { + "epoch": 0.05616662766206412, + "grad_norm": 5.4643940925598145, + "learning_rate": 9.331123244929798e-06, + "loss": 4.5332, + "step": 960 + }, + { + "epoch": 0.05675169670021062, + "grad_norm": 6.90775728225708, + "learning_rate": 9.428627145085804e-06, + "loss": 4.5466, + "step": 970 + }, + { + "epoch": 0.05733676573835712, + "grad_norm": 4.458855152130127, + "learning_rate": 9.52613104524181e-06, + "loss": 4.4947, + "step": 980 + }, + { + "epoch": 0.05792183477650363, + "grad_norm": 4.308935642242432, + "learning_rate": 9.623634945397816e-06, + "loss": 4.4815, + "step": 990 + }, + { + "epoch": 0.05850690381465013, + "grad_norm": 4.152543544769287, + "learning_rate": 9.721138845553823e-06, + "loss": 4.4704, + "step": 1000 + }, + { + "epoch": 0.05850690381465013, + "eval_loss": 4.4797258377075195, + "eval_runtime": 32.8353, + "eval_samples_per_second": 687.889, + "eval_steps_per_second": 5.391, + "step": 1000 + }, + { + "epoch": 0.05909197285279663, + "grad_norm": 4.928531646728516, + "learning_rate": 9.818642745709829e-06, + "loss": 4.4228, + "step": 1010 + }, + { + "epoch": 0.05967704189094313, + "grad_norm": 3.708007335662842, + "learning_rate": 9.916146645865835e-06, + "loss": 4.43, + "step": 1020 + }, + { + "epoch": 0.06026211092908963, + "grad_norm": 4.5416693687438965, + "learning_rate": 1.0013650546021841e-05, + "loss": 4.4053, + "step": 1030 + }, + { + "epoch": 0.06084717996723613, + "grad_norm": 4.850225925445557, + "learning_rate": 1.0111154446177847e-05, + "loss": 4.3714, + "step": 1040 + }, + { + "epoch": 0.06143224900538263, + "grad_norm": 4.977739334106445, + "learning_rate": 1.0208658346333853e-05, + "loss": 4.3931, + "step": 1050 + }, + { + "epoch": 0.062017318043529134, + "grad_norm": 6.426198959350586, + "learning_rate": 1.030616224648986e-05, + "loss": 4.3624, + "step": 1060 + }, + { + "epoch": 0.06260238708167563, + "grad_norm": 3.6765670776367188, + "learning_rate": 1.0403666146645866e-05, + "loss": 4.3326, + "step": 1070 + }, + { + "epoch": 0.06318745611982214, + "grad_norm": 6.291701316833496, + "learning_rate": 1.0501170046801872e-05, + "loss": 4.3467, + "step": 1080 + }, + { + "epoch": 0.06377252515796863, + "grad_norm": 5.068548679351807, + "learning_rate": 1.059867394695788e-05, + "loss": 4.3091, + "step": 1090 + }, + { + "epoch": 0.06435759419611514, + "grad_norm": 4.617797374725342, + "learning_rate": 1.0696177847113884e-05, + "loss": 4.2529, + "step": 1100 + }, + { + "epoch": 0.06494266323426165, + "grad_norm": 5.207705974578857, + "learning_rate": 1.0793681747269892e-05, + "loss": 4.2795, + "step": 1110 + }, + { + "epoch": 0.06552773227240814, + "grad_norm": 4.547387599945068, + "learning_rate": 1.0891185647425897e-05, + "loss": 4.2496, + "step": 1120 + }, + { + "epoch": 0.06611280131055465, + "grad_norm": 5.261926174163818, + "learning_rate": 1.0988689547581905e-05, + "loss": 4.2402, + "step": 1130 + }, + { + "epoch": 0.06669787034870114, + "grad_norm": 5.240683555603027, + "learning_rate": 1.108619344773791e-05, + "loss": 4.2273, + "step": 1140 + }, + { + "epoch": 0.06728293938684765, + "grad_norm": 7.20740270614624, + "learning_rate": 1.1183697347893917e-05, + "loss": 4.2256, + "step": 1150 + }, + { + "epoch": 0.06786800842499414, + "grad_norm": 4.675016403198242, + "learning_rate": 1.1281201248049922e-05, + "loss": 4.2221, + "step": 1160 + }, + { + "epoch": 0.06845307746314065, + "grad_norm": 4.353231906890869, + "learning_rate": 1.137870514820593e-05, + "loss": 4.2094, + "step": 1170 + }, + { + "epoch": 0.06903814650128715, + "grad_norm": 5.121499538421631, + "learning_rate": 1.1476209048361936e-05, + "loss": 4.1881, + "step": 1180 + }, + { + "epoch": 0.06962321553943365, + "grad_norm": 6.290802478790283, + "learning_rate": 1.1573712948517942e-05, + "loss": 4.1887, + "step": 1190 + }, + { + "epoch": 0.07020828457758016, + "grad_norm": 3.925096273422241, + "learning_rate": 1.1671216848673948e-05, + "loss": 4.1641, + "step": 1200 + }, + { + "epoch": 0.07079335361572665, + "grad_norm": 5.288980960845947, + "learning_rate": 1.1768720748829954e-05, + "loss": 4.1546, + "step": 1210 + }, + { + "epoch": 0.07137842265387316, + "grad_norm": 5.591447353363037, + "learning_rate": 1.186622464898596e-05, + "loss": 4.1311, + "step": 1220 + }, + { + "epoch": 0.07196349169201965, + "grad_norm": 5.622387409210205, + "learning_rate": 1.1963728549141965e-05, + "loss": 4.0918, + "step": 1230 + }, + { + "epoch": 0.07254856073016616, + "grad_norm": 4.414048194885254, + "learning_rate": 1.2061232449297973e-05, + "loss": 4.1011, + "step": 1240 + }, + { + "epoch": 0.07313362976831266, + "grad_norm": 5.496423721313477, + "learning_rate": 1.2158736349453979e-05, + "loss": 4.0819, + "step": 1250 + }, + { + "epoch": 0.07371869880645916, + "grad_norm": 4.800556182861328, + "learning_rate": 1.2256240249609985e-05, + "loss": 4.0849, + "step": 1260 + }, + { + "epoch": 0.07430376784460567, + "grad_norm": 4.899653911590576, + "learning_rate": 1.2353744149765991e-05, + "loss": 4.0787, + "step": 1270 + }, + { + "epoch": 0.07488883688275216, + "grad_norm": 6.266668319702148, + "learning_rate": 1.2451248049921998e-05, + "loss": 4.0591, + "step": 1280 + }, + { + "epoch": 0.07547390592089867, + "grad_norm": 5.078396320343018, + "learning_rate": 1.2548751950078002e-05, + "loss": 4.0174, + "step": 1290 + }, + { + "epoch": 0.07605897495904516, + "grad_norm": 7.149772644042969, + "learning_rate": 1.264625585023401e-05, + "loss": 4.022, + "step": 1300 + }, + { + "epoch": 0.07664404399719167, + "grad_norm": 5.792842864990234, + "learning_rate": 1.2743759750390016e-05, + "loss": 4.0116, + "step": 1310 + }, + { + "epoch": 0.07722911303533816, + "grad_norm": 4.636885643005371, + "learning_rate": 1.2841263650546024e-05, + "loss": 4.0259, + "step": 1320 + }, + { + "epoch": 0.07781418207348467, + "grad_norm": 5.287436485290527, + "learning_rate": 1.293876755070203e-05, + "loss": 4.0002, + "step": 1330 + }, + { + "epoch": 0.07839925111163117, + "grad_norm": 4.564724922180176, + "learning_rate": 1.3036271450858035e-05, + "loss": 3.9647, + "step": 1340 + }, + { + "epoch": 0.07898432014977767, + "grad_norm": 4.668666362762451, + "learning_rate": 1.313377535101404e-05, + "loss": 3.9654, + "step": 1350 + }, + { + "epoch": 0.07956938918792418, + "grad_norm": 5.322603702545166, + "learning_rate": 1.3231279251170047e-05, + "loss": 3.962, + "step": 1360 + }, + { + "epoch": 0.08015445822607067, + "grad_norm": 4.6104888916015625, + "learning_rate": 1.3328783151326055e-05, + "loss": 3.9548, + "step": 1370 + }, + { + "epoch": 0.08073952726421718, + "grad_norm": 4.587207317352295, + "learning_rate": 1.342628705148206e-05, + "loss": 3.9535, + "step": 1380 + }, + { + "epoch": 0.08132459630236367, + "grad_norm": 7.504576206207275, + "learning_rate": 1.3523790951638066e-05, + "loss": 3.9444, + "step": 1390 + }, + { + "epoch": 0.08190966534051018, + "grad_norm": 6.192226886749268, + "learning_rate": 1.3621294851794072e-05, + "loss": 3.8986, + "step": 1400 + }, + { + "epoch": 0.08249473437865668, + "grad_norm": 5.548381805419922, + "learning_rate": 1.371879875195008e-05, + "loss": 3.9252, + "step": 1410 + }, + { + "epoch": 0.08307980341680318, + "grad_norm": 5.169741153717041, + "learning_rate": 1.3816302652106086e-05, + "loss": 3.8782, + "step": 1420 + }, + { + "epoch": 0.08366487245494969, + "grad_norm": 4.358245372772217, + "learning_rate": 1.391380655226209e-05, + "loss": 3.8844, + "step": 1430 + }, + { + "epoch": 0.08424994149309618, + "grad_norm": 5.795071125030518, + "learning_rate": 1.4011310452418097e-05, + "loss": 3.8661, + "step": 1440 + }, + { + "epoch": 0.08483501053124269, + "grad_norm": 6.101123809814453, + "learning_rate": 1.4108814352574104e-05, + "loss": 3.8696, + "step": 1450 + }, + { + "epoch": 0.08542007956938918, + "grad_norm": 6.722472190856934, + "learning_rate": 1.420631825273011e-05, + "loss": 3.872, + "step": 1460 + }, + { + "epoch": 0.08600514860753569, + "grad_norm": 8.680535316467285, + "learning_rate": 1.4303822152886115e-05, + "loss": 3.8475, + "step": 1470 + }, + { + "epoch": 0.08659021764568219, + "grad_norm": 5.416397571563721, + "learning_rate": 1.4401326053042121e-05, + "loss": 3.8446, + "step": 1480 + }, + { + "epoch": 0.08717528668382869, + "grad_norm": 5.110279083251953, + "learning_rate": 1.449882995319813e-05, + "loss": 3.8312, + "step": 1490 + }, + { + "epoch": 0.08776035572197519, + "grad_norm": 6.0525712966918945, + "learning_rate": 1.4596333853354135e-05, + "loss": 3.8214, + "step": 1500 + }, + { + "epoch": 0.0883454247601217, + "grad_norm": 4.44586706161499, + "learning_rate": 1.4693837753510142e-05, + "loss": 3.8205, + "step": 1510 + }, + { + "epoch": 0.0889304937982682, + "grad_norm": 6.613223075866699, + "learning_rate": 1.4791341653666146e-05, + "loss": 3.814, + "step": 1520 + }, + { + "epoch": 0.0895155628364147, + "grad_norm": 5.635190010070801, + "learning_rate": 1.4888845553822154e-05, + "loss": 3.8014, + "step": 1530 + }, + { + "epoch": 0.0901006318745612, + "grad_norm": 5.6791090965271, + "learning_rate": 1.498634945397816e-05, + "loss": 3.7774, + "step": 1540 + }, + { + "epoch": 0.0906857009127077, + "grad_norm": 6.853143692016602, + "learning_rate": 1.5083853354134166e-05, + "loss": 3.7525, + "step": 1550 + }, + { + "epoch": 0.0912707699508542, + "grad_norm": 5.072564125061035, + "learning_rate": 1.518135725429017e-05, + "loss": 3.7553, + "step": 1560 + }, + { + "epoch": 0.0918558389890007, + "grad_norm": 5.333250522613525, + "learning_rate": 1.527886115444618e-05, + "loss": 3.7588, + "step": 1570 + }, + { + "epoch": 0.0924409080271472, + "grad_norm": 6.792341709136963, + "learning_rate": 1.5376365054602185e-05, + "loss": 3.7355, + "step": 1580 + }, + { + "epoch": 0.09302597706529371, + "grad_norm": 7.016251564025879, + "learning_rate": 1.547386895475819e-05, + "loss": 3.7462, + "step": 1590 + }, + { + "epoch": 0.0936110461034402, + "grad_norm": 5.578188419342041, + "learning_rate": 1.5571372854914197e-05, + "loss": 3.7223, + "step": 1600 + }, + { + "epoch": 0.09419611514158671, + "grad_norm": 6.51141881942749, + "learning_rate": 1.5668876755070203e-05, + "loss": 3.7188, + "step": 1610 + }, + { + "epoch": 0.0947811841797332, + "grad_norm": 4.939900875091553, + "learning_rate": 1.576638065522621e-05, + "loss": 3.7198, + "step": 1620 + }, + { + "epoch": 0.09536625321787971, + "grad_norm": 4.815907001495361, + "learning_rate": 1.5863884555382216e-05, + "loss": 3.706, + "step": 1630 + }, + { + "epoch": 0.0959513222560262, + "grad_norm": 5.851882457733154, + "learning_rate": 1.5961388455538222e-05, + "loss": 3.6968, + "step": 1640 + }, + { + "epoch": 0.09653639129417271, + "grad_norm": 6.580109119415283, + "learning_rate": 1.6058892355694228e-05, + "loss": 3.7002, + "step": 1650 + }, + { + "epoch": 0.09712146033231922, + "grad_norm": 5.999516010284424, + "learning_rate": 1.6156396255850234e-05, + "loss": 3.6531, + "step": 1660 + }, + { + "epoch": 0.09770652937046571, + "grad_norm": 5.747854709625244, + "learning_rate": 1.625390015600624e-05, + "loss": 3.6531, + "step": 1670 + }, + { + "epoch": 0.09829159840861222, + "grad_norm": 7.057636260986328, + "learning_rate": 1.6351404056162247e-05, + "loss": 3.6653, + "step": 1680 + }, + { + "epoch": 0.09887666744675871, + "grad_norm": 5.716787815093994, + "learning_rate": 1.6448907956318253e-05, + "loss": 3.6589, + "step": 1690 + }, + { + "epoch": 0.09946173648490522, + "grad_norm": 5.70604133605957, + "learning_rate": 1.654641185647426e-05, + "loss": 3.6695, + "step": 1700 + }, + { + "epoch": 0.10004680552305172, + "grad_norm": 5.715747356414795, + "learning_rate": 1.6643915756630265e-05, + "loss": 3.6642, + "step": 1710 + }, + { + "epoch": 0.10063187456119822, + "grad_norm": 6.552218437194824, + "learning_rate": 1.674141965678627e-05, + "loss": 3.6694, + "step": 1720 + }, + { + "epoch": 0.10121694359934472, + "grad_norm": 5.934008598327637, + "learning_rate": 1.6838923556942278e-05, + "loss": 3.605, + "step": 1730 + }, + { + "epoch": 0.10180201263749122, + "grad_norm": 6.701034069061279, + "learning_rate": 1.6936427457098284e-05, + "loss": 3.6276, + "step": 1740 + }, + { + "epoch": 0.10238708167563773, + "grad_norm": 6.235359191894531, + "learning_rate": 1.703393135725429e-05, + "loss": 3.6091, + "step": 1750 + }, + { + "epoch": 0.10297215071378422, + "grad_norm": 6.0574750900268555, + "learning_rate": 1.7131435257410296e-05, + "loss": 3.5924, + "step": 1760 + }, + { + "epoch": 0.10355721975193073, + "grad_norm": 6.675718784332275, + "learning_rate": 1.7228939157566302e-05, + "loss": 3.5904, + "step": 1770 + }, + { + "epoch": 0.10414228879007723, + "grad_norm": 5.328856468200684, + "learning_rate": 1.7326443057722312e-05, + "loss": 3.5826, + "step": 1780 + }, + { + "epoch": 0.10472735782822373, + "grad_norm": 5.248366832733154, + "learning_rate": 1.7423946957878315e-05, + "loss": 3.5998, + "step": 1790 + }, + { + "epoch": 0.10531242686637023, + "grad_norm": 8.010108947753906, + "learning_rate": 1.752145085803432e-05, + "loss": 3.5727, + "step": 1800 + }, + { + "epoch": 0.10589749590451673, + "grad_norm": 6.607870578765869, + "learning_rate": 1.7618954758190327e-05, + "loss": 3.5791, + "step": 1810 + }, + { + "epoch": 0.10648256494266324, + "grad_norm": 6.370628356933594, + "learning_rate": 1.7716458658346337e-05, + "loss": 3.5593, + "step": 1820 + }, + { + "epoch": 0.10706763398080973, + "grad_norm": 6.906359672546387, + "learning_rate": 1.7813962558502343e-05, + "loss": 3.5493, + "step": 1830 + }, + { + "epoch": 0.10765270301895624, + "grad_norm": 6.07284688949585, + "learning_rate": 1.7911466458658346e-05, + "loss": 3.5577, + "step": 1840 + }, + { + "epoch": 0.10823777205710274, + "grad_norm": 5.982476711273193, + "learning_rate": 1.8008970358814352e-05, + "loss": 3.567, + "step": 1850 + }, + { + "epoch": 0.10882284109524924, + "grad_norm": 5.691832542419434, + "learning_rate": 1.810647425897036e-05, + "loss": 3.5337, + "step": 1860 + }, + { + "epoch": 0.10940791013339574, + "grad_norm": 6.514523506164551, + "learning_rate": 1.8203978159126368e-05, + "loss": 3.5472, + "step": 1870 + }, + { + "epoch": 0.10999297917154224, + "grad_norm": 4.859272003173828, + "learning_rate": 1.830148205928237e-05, + "loss": 3.5104, + "step": 1880 + }, + { + "epoch": 0.11057804820968874, + "grad_norm": 6.464576244354248, + "learning_rate": 1.8398985959438377e-05, + "loss": 3.5434, + "step": 1890 + }, + { + "epoch": 0.11116311724783524, + "grad_norm": 6.936394691467285, + "learning_rate": 1.8496489859594386e-05, + "loss": 3.5109, + "step": 1900 + }, + { + "epoch": 0.11174818628598175, + "grad_norm": 7.985333442687988, + "learning_rate": 1.8593993759750393e-05, + "loss": 3.491, + "step": 1910 + }, + { + "epoch": 0.11233325532412824, + "grad_norm": 6.374166488647461, + "learning_rate": 1.86914976599064e-05, + "loss": 3.4826, + "step": 1920 + }, + { + "epoch": 0.11291832436227475, + "grad_norm": 6.567166328430176, + "learning_rate": 1.87890015600624e-05, + "loss": 3.4878, + "step": 1930 + }, + { + "epoch": 0.11350339340042125, + "grad_norm": 7.407885551452637, + "learning_rate": 1.8886505460218408e-05, + "loss": 3.477, + "step": 1940 + }, + { + "epoch": 0.11408846243856775, + "grad_norm": 4.784425735473633, + "learning_rate": 1.8984009360374417e-05, + "loss": 3.4835, + "step": 1950 + }, + { + "epoch": 0.11467353147671425, + "grad_norm": 7.632740020751953, + "learning_rate": 1.9081513260530423e-05, + "loss": 3.4784, + "step": 1960 + }, + { + "epoch": 0.11525860051486075, + "grad_norm": 6.997206211090088, + "learning_rate": 1.9179017160686426e-05, + "loss": 3.4736, + "step": 1970 + }, + { + "epoch": 0.11584366955300726, + "grad_norm": 8.513856887817383, + "learning_rate": 1.9276521060842432e-05, + "loss": 3.455, + "step": 1980 + }, + { + "epoch": 0.11642873859115375, + "grad_norm": 6.568483829498291, + "learning_rate": 1.9374024960998442e-05, + "loss": 3.4535, + "step": 1990 + }, + { + "epoch": 0.11701380762930026, + "grad_norm": 6.571296691894531, + "learning_rate": 1.9471528861154448e-05, + "loss": 3.45, + "step": 2000 + }, + { + "epoch": 0.11701380762930026, + "eval_loss": 3.477719783782959, + "eval_runtime": 32.9063, + "eval_samples_per_second": 686.403, + "eval_steps_per_second": 5.379, + "step": 2000 + }, + { + "epoch": 0.11759887666744676, + "grad_norm": 7.265345573425293, + "learning_rate": 1.9569032761310454e-05, + "loss": 3.4645, + "step": 2010 + }, + { + "epoch": 0.11818394570559326, + "grad_norm": 5.9302849769592285, + "learning_rate": 1.9666536661466457e-05, + "loss": 3.4507, + "step": 2020 + }, + { + "epoch": 0.11876901474373976, + "grad_norm": 7.448586463928223, + "learning_rate": 1.9764040561622467e-05, + "loss": 3.4309, + "step": 2030 + }, + { + "epoch": 0.11935408378188626, + "grad_norm": 6.897288799285889, + "learning_rate": 1.9861544461778473e-05, + "loss": 3.4249, + "step": 2040 + }, + { + "epoch": 0.11993915282003276, + "grad_norm": 6.407149314880371, + "learning_rate": 1.995904836193448e-05, + "loss": 3.4136, + "step": 2050 + }, + { + "epoch": 0.12052422185817926, + "grad_norm": 5.520359992980957, + "learning_rate": 2.0056552262090482e-05, + "loss": 3.4011, + "step": 2060 + }, + { + "epoch": 0.12110929089632577, + "grad_norm": 7.033049583435059, + "learning_rate": 2.015405616224649e-05, + "loss": 3.397, + "step": 2070 + }, + { + "epoch": 0.12169435993447227, + "grad_norm": 6.583874702453613, + "learning_rate": 2.0251560062402498e-05, + "loss": 3.4259, + "step": 2080 + }, + { + "epoch": 0.12227942897261877, + "grad_norm": 7.739867210388184, + "learning_rate": 2.0349063962558504e-05, + "loss": 3.4007, + "step": 2090 + }, + { + "epoch": 0.12286449801076527, + "grad_norm": 5.324331283569336, + "learning_rate": 2.044656786271451e-05, + "loss": 3.3776, + "step": 2100 + }, + { + "epoch": 0.12344956704891177, + "grad_norm": 8.3421049118042, + "learning_rate": 2.0544071762870516e-05, + "loss": 3.3819, + "step": 2110 + }, + { + "epoch": 0.12403463608705827, + "grad_norm": 5.55806303024292, + "learning_rate": 2.0641575663026522e-05, + "loss": 3.4016, + "step": 2120 + }, + { + "epoch": 0.12461970512520477, + "grad_norm": 6.500893592834473, + "learning_rate": 2.073907956318253e-05, + "loss": 3.3746, + "step": 2130 + }, + { + "epoch": 0.12520477416335127, + "grad_norm": 8.402754783630371, + "learning_rate": 2.0836583463338535e-05, + "loss": 3.3958, + "step": 2140 + }, + { + "epoch": 0.12578984320149778, + "grad_norm": 6.717187881469727, + "learning_rate": 2.093408736349454e-05, + "loss": 3.3622, + "step": 2150 + }, + { + "epoch": 0.12637491223964428, + "grad_norm": 8.816388130187988, + "learning_rate": 2.1031591263650547e-05, + "loss": 3.3356, + "step": 2160 + }, + { + "epoch": 0.1269599812777908, + "grad_norm": 6.36826229095459, + "learning_rate": 2.1129095163806553e-05, + "loss": 3.3459, + "step": 2170 + }, + { + "epoch": 0.12754505031593727, + "grad_norm": 6.849374771118164, + "learning_rate": 2.122659906396256e-05, + "loss": 3.3458, + "step": 2180 + }, + { + "epoch": 0.12813011935408378, + "grad_norm": 9.058792114257812, + "learning_rate": 2.1324102964118566e-05, + "loss": 3.3629, + "step": 2190 + }, + { + "epoch": 0.12871518839223028, + "grad_norm": 7.605003833770752, + "learning_rate": 2.1421606864274572e-05, + "loss": 3.3279, + "step": 2200 + }, + { + "epoch": 0.1293002574303768, + "grad_norm": 5.16264533996582, + "learning_rate": 2.1519110764430578e-05, + "loss": 3.3407, + "step": 2210 + }, + { + "epoch": 0.1298853264685233, + "grad_norm": 6.889904499053955, + "learning_rate": 2.1616614664586584e-05, + "loss": 3.305, + "step": 2220 + }, + { + "epoch": 0.13047039550666978, + "grad_norm": 6.7097487449646, + "learning_rate": 2.171411856474259e-05, + "loss": 3.3061, + "step": 2230 + }, + { + "epoch": 0.13105546454481629, + "grad_norm": 5.797562122344971, + "learning_rate": 2.1811622464898597e-05, + "loss": 3.3216, + "step": 2240 + }, + { + "epoch": 0.1316405335829628, + "grad_norm": 6.799828052520752, + "learning_rate": 2.1909126365054603e-05, + "loss": 3.3102, + "step": 2250 + }, + { + "epoch": 0.1322256026211093, + "grad_norm": 6.1274094581604, + "learning_rate": 2.200663026521061e-05, + "loss": 3.3222, + "step": 2260 + }, + { + "epoch": 0.13281067165925578, + "grad_norm": 7.176873683929443, + "learning_rate": 2.2104134165366615e-05, + "loss": 3.2857, + "step": 2270 + }, + { + "epoch": 0.1333957406974023, + "grad_norm": 6.563827991485596, + "learning_rate": 2.220163806552262e-05, + "loss": 3.2899, + "step": 2280 + }, + { + "epoch": 0.1339808097355488, + "grad_norm": 6.872474670410156, + "learning_rate": 2.2299141965678628e-05, + "loss": 3.2911, + "step": 2290 + }, + { + "epoch": 0.1345658787736953, + "grad_norm": 7.1769609451293945, + "learning_rate": 2.2396645865834634e-05, + "loss": 3.2967, + "step": 2300 + }, + { + "epoch": 0.1351509478118418, + "grad_norm": 7.671145915985107, + "learning_rate": 2.249414976599064e-05, + "loss": 3.2594, + "step": 2310 + }, + { + "epoch": 0.1357360168499883, + "grad_norm": 5.75607967376709, + "learning_rate": 2.2591653666146646e-05, + "loss": 3.2746, + "step": 2320 + }, + { + "epoch": 0.1363210858881348, + "grad_norm": 8.109906196594238, + "learning_rate": 2.2689157566302656e-05, + "loss": 3.2532, + "step": 2330 + }, + { + "epoch": 0.1369061549262813, + "grad_norm": 7.200730323791504, + "learning_rate": 2.278666146645866e-05, + "loss": 3.2873, + "step": 2340 + }, + { + "epoch": 0.1374912239644278, + "grad_norm": 6.151474952697754, + "learning_rate": 2.2884165366614665e-05, + "loss": 3.2456, + "step": 2350 + }, + { + "epoch": 0.1380762930025743, + "grad_norm": 6.946084022521973, + "learning_rate": 2.298166926677067e-05, + "loss": 3.2469, + "step": 2360 + }, + { + "epoch": 0.1386613620407208, + "grad_norm": 7.4252142906188965, + "learning_rate": 2.307917316692668e-05, + "loss": 3.2722, + "step": 2370 + }, + { + "epoch": 0.1392464310788673, + "grad_norm": 7.327793598175049, + "learning_rate": 2.3176677067082683e-05, + "loss": 3.2286, + "step": 2380 + }, + { + "epoch": 0.1398315001170138, + "grad_norm": 7.8824896812438965, + "learning_rate": 2.327418096723869e-05, + "loss": 3.2182, + "step": 2390 + }, + { + "epoch": 0.14041656915516032, + "grad_norm": 6.346813678741455, + "learning_rate": 2.3371684867394696e-05, + "loss": 3.2365, + "step": 2400 + }, + { + "epoch": 0.1410016381933068, + "grad_norm": 6.541561603546143, + "learning_rate": 2.3469188767550705e-05, + "loss": 3.2289, + "step": 2410 + }, + { + "epoch": 0.1415867072314533, + "grad_norm": 7.458307266235352, + "learning_rate": 2.356669266770671e-05, + "loss": 3.2201, + "step": 2420 + }, + { + "epoch": 0.14217177626959981, + "grad_norm": 6.524603366851807, + "learning_rate": 2.3664196567862714e-05, + "loss": 3.2025, + "step": 2430 + }, + { + "epoch": 0.14275684530774632, + "grad_norm": 6.503066062927246, + "learning_rate": 2.376170046801872e-05, + "loss": 3.1871, + "step": 2440 + }, + { + "epoch": 0.14334191434589283, + "grad_norm": 6.842957973480225, + "learning_rate": 2.385920436817473e-05, + "loss": 3.2161, + "step": 2450 + }, + { + "epoch": 0.1439269833840393, + "grad_norm": 7.761510848999023, + "learning_rate": 2.3956708268330736e-05, + "loss": 3.1938, + "step": 2460 + }, + { + "epoch": 0.14451205242218582, + "grad_norm": 6.82220983505249, + "learning_rate": 2.405421216848674e-05, + "loss": 3.2107, + "step": 2470 + }, + { + "epoch": 0.14509712146033232, + "grad_norm": 8.268922805786133, + "learning_rate": 2.4151716068642745e-05, + "loss": 3.2027, + "step": 2480 + }, + { + "epoch": 0.14568219049847883, + "grad_norm": 6.681731224060059, + "learning_rate": 2.424921996879875e-05, + "loss": 3.1807, + "step": 2490 + }, + { + "epoch": 0.1462672595366253, + "grad_norm": 7.034046649932861, + "learning_rate": 2.434672386895476e-05, + "loss": 3.2013, + "step": 2500 + }, + { + "epoch": 0.14685232857477182, + "grad_norm": 6.686898231506348, + "learning_rate": 2.4444227769110767e-05, + "loss": 3.1643, + "step": 2510 + }, + { + "epoch": 0.14743739761291833, + "grad_norm": 7.608445167541504, + "learning_rate": 2.454173166926677e-05, + "loss": 3.179, + "step": 2520 + }, + { + "epoch": 0.14802246665106483, + "grad_norm": 9.023140907287598, + "learning_rate": 2.4639235569422776e-05, + "loss": 3.1696, + "step": 2530 + }, + { + "epoch": 0.14860753568921134, + "grad_norm": 7.529286861419678, + "learning_rate": 2.4736739469578786e-05, + "loss": 3.1734, + "step": 2540 + }, + { + "epoch": 0.14919260472735782, + "grad_norm": 8.38886833190918, + "learning_rate": 2.4834243369734792e-05, + "loss": 3.1719, + "step": 2550 + }, + { + "epoch": 0.14977767376550433, + "grad_norm": 9.776044845581055, + "learning_rate": 2.4931747269890795e-05, + "loss": 3.1446, + "step": 2560 + }, + { + "epoch": 0.15036274280365083, + "grad_norm": 8.278759002685547, + "learning_rate": 2.50292511700468e-05, + "loss": 3.1607, + "step": 2570 + }, + { + "epoch": 0.15094781184179734, + "grad_norm": 7.201677322387695, + "learning_rate": 2.512675507020281e-05, + "loss": 3.1515, + "step": 2580 + }, + { + "epoch": 0.15153288087994382, + "grad_norm": 7.762319564819336, + "learning_rate": 2.5224258970358817e-05, + "loss": 3.1467, + "step": 2590 + }, + { + "epoch": 0.15211794991809033, + "grad_norm": 5.101878643035889, + "learning_rate": 2.532176287051482e-05, + "loss": 3.1523, + "step": 2600 + }, + { + "epoch": 0.15270301895623684, + "grad_norm": 7.687345504760742, + "learning_rate": 2.541926677067083e-05, + "loss": 3.1665, + "step": 2610 + }, + { + "epoch": 0.15328808799438334, + "grad_norm": 6.708488941192627, + "learning_rate": 2.5516770670826835e-05, + "loss": 3.1428, + "step": 2620 + }, + { + "epoch": 0.15387315703252985, + "grad_norm": 7.85712194442749, + "learning_rate": 2.5614274570982838e-05, + "loss": 3.1536, + "step": 2630 + }, + { + "epoch": 0.15445822607067633, + "grad_norm": 7.619500637054443, + "learning_rate": 2.5711778471138848e-05, + "loss": 3.124, + "step": 2640 + }, + { + "epoch": 0.15504329510882284, + "grad_norm": 5.959424018859863, + "learning_rate": 2.580928237129485e-05, + "loss": 3.1177, + "step": 2650 + }, + { + "epoch": 0.15562836414696934, + "grad_norm": 8.099101066589355, + "learning_rate": 2.590678627145086e-05, + "loss": 3.1004, + "step": 2660 + }, + { + "epoch": 0.15621343318511585, + "grad_norm": 6.553966045379639, + "learning_rate": 2.6004290171606866e-05, + "loss": 3.0941, + "step": 2670 + }, + { + "epoch": 0.15679850222326233, + "grad_norm": 6.345914363861084, + "learning_rate": 2.610179407176287e-05, + "loss": 3.1036, + "step": 2680 + }, + { + "epoch": 0.15738357126140884, + "grad_norm": 9.28085994720459, + "learning_rate": 2.619929797191888e-05, + "loss": 3.1054, + "step": 2690 + }, + { + "epoch": 0.15796864029955535, + "grad_norm": 5.908251762390137, + "learning_rate": 2.6296801872074885e-05, + "loss": 3.1045, + "step": 2700 + }, + { + "epoch": 0.15855370933770185, + "grad_norm": 5.917749404907227, + "learning_rate": 2.6394305772230894e-05, + "loss": 3.096, + "step": 2710 + }, + { + "epoch": 0.15913877837584836, + "grad_norm": 5.648446083068848, + "learning_rate": 2.6491809672386897e-05, + "loss": 3.0965, + "step": 2720 + }, + { + "epoch": 0.15972384741399484, + "grad_norm": 7.2419843673706055, + "learning_rate": 2.65893135725429e-05, + "loss": 3.1189, + "step": 2730 + }, + { + "epoch": 0.16030891645214135, + "grad_norm": 7.118249416351318, + "learning_rate": 2.668681747269891e-05, + "loss": 3.0865, + "step": 2740 + }, + { + "epoch": 0.16089398549028786, + "grad_norm": 7.298707962036133, + "learning_rate": 2.6784321372854916e-05, + "loss": 3.0812, + "step": 2750 + }, + { + "epoch": 0.16147905452843436, + "grad_norm": 7.682481288909912, + "learning_rate": 2.6881825273010925e-05, + "loss": 3.0833, + "step": 2760 + }, + { + "epoch": 0.16206412356658087, + "grad_norm": 7.511553764343262, + "learning_rate": 2.6979329173166928e-05, + "loss": 3.0839, + "step": 2770 + }, + { + "epoch": 0.16264919260472735, + "grad_norm": 8.631921768188477, + "learning_rate": 2.707683307332293e-05, + "loss": 3.073, + "step": 2780 + }, + { + "epoch": 0.16323426164287386, + "grad_norm": 6.621910095214844, + "learning_rate": 2.717433697347894e-05, + "loss": 3.0545, + "step": 2790 + }, + { + "epoch": 0.16381933068102036, + "grad_norm": 7.988832473754883, + "learning_rate": 2.7271840873634947e-05, + "loss": 3.0738, + "step": 2800 + }, + { + "epoch": 0.16440439971916687, + "grad_norm": 7.478857040405273, + "learning_rate": 2.736934477379095e-05, + "loss": 3.07, + "step": 2810 + }, + { + "epoch": 0.16498946875731335, + "grad_norm": 8.259140014648438, + "learning_rate": 2.746684867394696e-05, + "loss": 3.061, + "step": 2820 + }, + { + "epoch": 0.16557453779545986, + "grad_norm": 9.011025428771973, + "learning_rate": 2.7564352574102965e-05, + "loss": 3.0468, + "step": 2830 + }, + { + "epoch": 0.16615960683360637, + "grad_norm": 7.9366912841796875, + "learning_rate": 2.7661856474258975e-05, + "loss": 3.0628, + "step": 2840 + }, + { + "epoch": 0.16674467587175287, + "grad_norm": 7.532613277435303, + "learning_rate": 2.7759360374414978e-05, + "loss": 3.0544, + "step": 2850 + }, + { + "epoch": 0.16732974490989938, + "grad_norm": 7.861001968383789, + "learning_rate": 2.785686427457098e-05, + "loss": 3.0413, + "step": 2860 + }, + { + "epoch": 0.16791481394804586, + "grad_norm": 7.128969192504883, + "learning_rate": 2.795436817472699e-05, + "loss": 3.0316, + "step": 2870 + }, + { + "epoch": 0.16849988298619237, + "grad_norm": 7.6734747886657715, + "learning_rate": 2.8051872074882996e-05, + "loss": 3.0389, + "step": 2880 + }, + { + "epoch": 0.16908495202433887, + "grad_norm": 7.53655481338501, + "learning_rate": 2.8149375975039006e-05, + "loss": 3.0291, + "step": 2890 + }, + { + "epoch": 0.16967002106248538, + "grad_norm": 6.852996349334717, + "learning_rate": 2.824687987519501e-05, + "loss": 3.0447, + "step": 2900 + }, + { + "epoch": 0.17025509010063186, + "grad_norm": 6.887439727783203, + "learning_rate": 2.8344383775351015e-05, + "loss": 3.0401, + "step": 2910 + }, + { + "epoch": 0.17084015913877837, + "grad_norm": 6.632650852203369, + "learning_rate": 2.8441887675507024e-05, + "loss": 3.0279, + "step": 2920 + }, + { + "epoch": 0.17142522817692488, + "grad_norm": 7.538325786590576, + "learning_rate": 2.8539391575663027e-05, + "loss": 3.0225, + "step": 2930 + }, + { + "epoch": 0.17201029721507138, + "grad_norm": 7.271528720855713, + "learning_rate": 2.8636895475819037e-05, + "loss": 3.0125, + "step": 2940 + }, + { + "epoch": 0.1725953662532179, + "grad_norm": 7.787014961242676, + "learning_rate": 2.873439937597504e-05, + "loss": 2.989, + "step": 2950 + }, + { + "epoch": 0.17318043529136437, + "grad_norm": 8.948431015014648, + "learning_rate": 2.8831903276131046e-05, + "loss": 3.0003, + "step": 2960 + }, + { + "epoch": 0.17376550432951088, + "grad_norm": 9.557765007019043, + "learning_rate": 2.8929407176287055e-05, + "loss": 3.0116, + "step": 2970 + }, + { + "epoch": 0.17435057336765739, + "grad_norm": 7.760405540466309, + "learning_rate": 2.9026911076443058e-05, + "loss": 3.0149, + "step": 2980 + }, + { + "epoch": 0.1749356424058039, + "grad_norm": 5.9020094871521, + "learning_rate": 2.912441497659906e-05, + "loss": 2.9782, + "step": 2990 + }, + { + "epoch": 0.17552071144395037, + "grad_norm": 8.86624526977539, + "learning_rate": 2.922191887675507e-05, + "loss": 2.9796, + "step": 3000 + }, + { + "epoch": 0.17552071144395037, + "eval_loss": 3.022794485092163, + "eval_runtime": 33.2732, + "eval_samples_per_second": 678.835, + "eval_steps_per_second": 5.32, + "step": 3000 + }, + { + "epoch": 0.17610578048209688, + "grad_norm": 8.000489234924316, + "learning_rate": 2.9319422776911077e-05, + "loss": 2.9897, + "step": 3010 + }, + { + "epoch": 0.1766908495202434, + "grad_norm": 7.482810974121094, + "learning_rate": 2.9416926677067086e-05, + "loss": 2.9763, + "step": 3020 + }, + { + "epoch": 0.1772759185583899, + "grad_norm": 6.712230682373047, + "learning_rate": 2.951443057722309e-05, + "loss": 2.9875, + "step": 3030 + }, + { + "epoch": 0.1778609875965364, + "grad_norm": 6.222019195556641, + "learning_rate": 2.9611934477379095e-05, + "loss": 2.9817, + "step": 3040 + }, + { + "epoch": 0.17844605663468288, + "grad_norm": 7.0089216232299805, + "learning_rate": 2.9709438377535105e-05, + "loss": 2.9648, + "step": 3050 + }, + { + "epoch": 0.1790311256728294, + "grad_norm": 6.764308929443359, + "learning_rate": 2.9806942277691108e-05, + "loss": 2.9491, + "step": 3060 + }, + { + "epoch": 0.1796161947109759, + "grad_norm": 6.860990524291992, + "learning_rate": 2.9904446177847117e-05, + "loss": 2.9482, + "step": 3070 + }, + { + "epoch": 0.1802012637491224, + "grad_norm": 8.082242965698242, + "learning_rate": 3.000195007800312e-05, + "loss": 2.9424, + "step": 3080 + }, + { + "epoch": 0.1807863327872689, + "grad_norm": 6.335153579711914, + "learning_rate": 3.0099453978159126e-05, + "loss": 2.9491, + "step": 3090 + }, + { + "epoch": 0.1813714018254154, + "grad_norm": 5.9129438400268555, + "learning_rate": 3.0196957878315136e-05, + "loss": 2.9399, + "step": 3100 + }, + { + "epoch": 0.1819564708635619, + "grad_norm": 7.502929210662842, + "learning_rate": 3.029446177847114e-05, + "loss": 2.9699, + "step": 3110 + }, + { + "epoch": 0.1825415399017084, + "grad_norm": 7.992349147796631, + "learning_rate": 3.0391965678627148e-05, + "loss": 2.947, + "step": 3120 + }, + { + "epoch": 0.1831266089398549, + "grad_norm": 6.969609260559082, + "learning_rate": 3.0489469578783154e-05, + "loss": 2.9572, + "step": 3130 + }, + { + "epoch": 0.1837116779780014, + "grad_norm": 6.793538570404053, + "learning_rate": 3.058697347893916e-05, + "loss": 2.9342, + "step": 3140 + }, + { + "epoch": 0.1842967470161479, + "grad_norm": 7.198190689086914, + "learning_rate": 3.0684477379095163e-05, + "loss": 2.9144, + "step": 3150 + }, + { + "epoch": 0.1848818160542944, + "grad_norm": 7.203789234161377, + "learning_rate": 3.078198127925117e-05, + "loss": 2.9352, + "step": 3160 + }, + { + "epoch": 0.18546688509244091, + "grad_norm": 6.604976177215576, + "learning_rate": 3.087948517940718e-05, + "loss": 2.9147, + "step": 3170 + }, + { + "epoch": 0.18605195413058742, + "grad_norm": 8.041712760925293, + "learning_rate": 3.097698907956318e-05, + "loss": 2.917, + "step": 3180 + }, + { + "epoch": 0.1866370231687339, + "grad_norm": 7.25447416305542, + "learning_rate": 3.107449297971919e-05, + "loss": 2.932, + "step": 3190 + }, + { + "epoch": 0.1872220922068804, + "grad_norm": 5.998739719390869, + "learning_rate": 3.11719968798752e-05, + "loss": 2.9249, + "step": 3200 + }, + { + "epoch": 0.18780716124502692, + "grad_norm": 6.867847919464111, + "learning_rate": 3.12695007800312e-05, + "loss": 2.8971, + "step": 3210 + }, + { + "epoch": 0.18839223028317342, + "grad_norm": 7.965513229370117, + "learning_rate": 3.136700468018721e-05, + "loss": 2.9193, + "step": 3220 + }, + { + "epoch": 0.1889772993213199, + "grad_norm": 7.230633735656738, + "learning_rate": 3.146450858034321e-05, + "loss": 2.8864, + "step": 3230 + }, + { + "epoch": 0.1895623683594664, + "grad_norm": 7.703425407409668, + "learning_rate": 3.156201248049922e-05, + "loss": 2.9086, + "step": 3240 + }, + { + "epoch": 0.19014743739761292, + "grad_norm": 7.888120651245117, + "learning_rate": 3.165951638065523e-05, + "loss": 2.8991, + "step": 3250 + }, + { + "epoch": 0.19073250643575942, + "grad_norm": 6.019379615783691, + "learning_rate": 3.175702028081123e-05, + "loss": 2.8756, + "step": 3260 + }, + { + "epoch": 0.19131757547390593, + "grad_norm": 7.056241989135742, + "learning_rate": 3.185452418096724e-05, + "loss": 2.9023, + "step": 3270 + }, + { + "epoch": 0.1919026445120524, + "grad_norm": 6.490424156188965, + "learning_rate": 3.195202808112325e-05, + "loss": 2.8787, + "step": 3280 + }, + { + "epoch": 0.19248771355019892, + "grad_norm": 7.560011386871338, + "learning_rate": 3.204953198127925e-05, + "loss": 2.8739, + "step": 3290 + }, + { + "epoch": 0.19307278258834543, + "grad_norm": 7.446630001068115, + "learning_rate": 3.214703588143526e-05, + "loss": 2.8792, + "step": 3300 + }, + { + "epoch": 0.19365785162649193, + "grad_norm": 7.2592620849609375, + "learning_rate": 3.224453978159126e-05, + "loss": 2.8681, + "step": 3310 + }, + { + "epoch": 0.19424292066463844, + "grad_norm": 7.847536563873291, + "learning_rate": 3.234204368174727e-05, + "loss": 2.8696, + "step": 3320 + }, + { + "epoch": 0.19482798970278492, + "grad_norm": 6.161529064178467, + "learning_rate": 3.243954758190328e-05, + "loss": 2.8651, + "step": 3330 + }, + { + "epoch": 0.19541305874093143, + "grad_norm": 8.979445457458496, + "learning_rate": 3.253705148205928e-05, + "loss": 2.8547, + "step": 3340 + }, + { + "epoch": 0.19599812777907794, + "grad_norm": 7.220419883728027, + "learning_rate": 3.2634555382215294e-05, + "loss": 2.8705, + "step": 3350 + }, + { + "epoch": 0.19658319681722444, + "grad_norm": 9.62748908996582, + "learning_rate": 3.27320592823713e-05, + "loss": 2.8542, + "step": 3360 + }, + { + "epoch": 0.19716826585537092, + "grad_norm": 8.37350845336914, + "learning_rate": 3.28295631825273e-05, + "loss": 2.8604, + "step": 3370 + }, + { + "epoch": 0.19775333489351743, + "grad_norm": 5.47288703918457, + "learning_rate": 3.292706708268331e-05, + "loss": 2.8356, + "step": 3380 + }, + { + "epoch": 0.19833840393166394, + "grad_norm": 6.462109565734863, + "learning_rate": 3.302457098283931e-05, + "loss": 2.8626, + "step": 3390 + }, + { + "epoch": 0.19892347296981044, + "grad_norm": 6.416049003601074, + "learning_rate": 3.312207488299532e-05, + "loss": 2.8567, + "step": 3400 + }, + { + "epoch": 0.19950854200795695, + "grad_norm": 7.844700336456299, + "learning_rate": 3.321957878315133e-05, + "loss": 2.8428, + "step": 3410 + }, + { + "epoch": 0.20009361104610343, + "grad_norm": 6.428390979766846, + "learning_rate": 3.331708268330733e-05, + "loss": 2.8543, + "step": 3420 + }, + { + "epoch": 0.20067868008424994, + "grad_norm": 6.168091297149658, + "learning_rate": 3.3414586583463343e-05, + "loss": 2.8253, + "step": 3430 + }, + { + "epoch": 0.20126374912239645, + "grad_norm": 6.891341686248779, + "learning_rate": 3.351209048361934e-05, + "loss": 2.8438, + "step": 3440 + }, + { + "epoch": 0.20184881816054295, + "grad_norm": 6.019260406494141, + "learning_rate": 3.360959438377535e-05, + "loss": 2.8195, + "step": 3450 + }, + { + "epoch": 0.20243388719868943, + "grad_norm": 7.159237384796143, + "learning_rate": 3.370709828393136e-05, + "loss": 2.8399, + "step": 3460 + }, + { + "epoch": 0.20301895623683594, + "grad_norm": 8.080097198486328, + "learning_rate": 3.380460218408736e-05, + "loss": 2.8269, + "step": 3470 + }, + { + "epoch": 0.20360402527498245, + "grad_norm": 6.480908393859863, + "learning_rate": 3.3902106084243374e-05, + "loss": 2.8172, + "step": 3480 + }, + { + "epoch": 0.20418909431312895, + "grad_norm": 8.009683609008789, + "learning_rate": 3.399960998439938e-05, + "loss": 2.8099, + "step": 3490 + }, + { + "epoch": 0.20477416335127546, + "grad_norm": 7.531735420227051, + "learning_rate": 3.409711388455538e-05, + "loss": 2.8329, + "step": 3500 + }, + { + "epoch": 0.20535923238942194, + "grad_norm": 7.069326400756836, + "learning_rate": 3.419461778471139e-05, + "loss": 2.8049, + "step": 3510 + }, + { + "epoch": 0.20594430142756845, + "grad_norm": 6.012268543243408, + "learning_rate": 3.429212168486739e-05, + "loss": 2.7883, + "step": 3520 + }, + { + "epoch": 0.20652937046571496, + "grad_norm": 6.834994792938232, + "learning_rate": 3.4389625585023405e-05, + "loss": 2.7889, + "step": 3530 + }, + { + "epoch": 0.20711443950386146, + "grad_norm": 6.855382442474365, + "learning_rate": 3.448712948517941e-05, + "loss": 2.7876, + "step": 3540 + }, + { + "epoch": 0.20769950854200794, + "grad_norm": 7.750663757324219, + "learning_rate": 3.458463338533541e-05, + "loss": 2.8055, + "step": 3550 + }, + { + "epoch": 0.20828457758015445, + "grad_norm": 7.690601348876953, + "learning_rate": 3.4682137285491424e-05, + "loss": 2.8125, + "step": 3560 + }, + { + "epoch": 0.20886964661830096, + "grad_norm": 7.2279744148254395, + "learning_rate": 3.477964118564743e-05, + "loss": 2.8021, + "step": 3570 + }, + { + "epoch": 0.20945471565644747, + "grad_norm": 9.441347122192383, + "learning_rate": 3.4877145085803436e-05, + "loss": 2.7889, + "step": 3580 + }, + { + "epoch": 0.21003978469459397, + "grad_norm": 8.27785873413086, + "learning_rate": 3.497464898595944e-05, + "loss": 2.757, + "step": 3590 + }, + { + "epoch": 0.21062485373274045, + "grad_norm": 9.117860794067383, + "learning_rate": 3.507215288611544e-05, + "loss": 2.7889, + "step": 3600 + }, + { + "epoch": 0.21120992277088696, + "grad_norm": 7.041175365447998, + "learning_rate": 3.5169656786271455e-05, + "loss": 2.783, + "step": 3610 + }, + { + "epoch": 0.21179499180903347, + "grad_norm": 7.442233085632324, + "learning_rate": 3.526716068642746e-05, + "loss": 2.7633, + "step": 3620 + }, + { + "epoch": 0.21238006084717997, + "grad_norm": 6.172733306884766, + "learning_rate": 3.536466458658346e-05, + "loss": 2.764, + "step": 3630 + }, + { + "epoch": 0.21296512988532648, + "grad_norm": 7.142703056335449, + "learning_rate": 3.5462168486739473e-05, + "loss": 2.7791, + "step": 3640 + }, + { + "epoch": 0.21355019892347296, + "grad_norm": 5.7952189445495605, + "learning_rate": 3.555967238689547e-05, + "loss": 2.769, + "step": 3650 + }, + { + "epoch": 0.21413526796161947, + "grad_norm": 10.157326698303223, + "learning_rate": 3.5657176287051486e-05, + "loss": 2.7411, + "step": 3660 + }, + { + "epoch": 0.21472033699976598, + "grad_norm": 6.530250072479248, + "learning_rate": 3.575468018720749e-05, + "loss": 2.7762, + "step": 3670 + }, + { + "epoch": 0.21530540603791248, + "grad_norm": 7.70296049118042, + "learning_rate": 3.585218408736349e-05, + "loss": 2.7366, + "step": 3680 + }, + { + "epoch": 0.21589047507605896, + "grad_norm": 7.455280780792236, + "learning_rate": 3.5949687987519504e-05, + "loss": 2.7634, + "step": 3690 + }, + { + "epoch": 0.21647554411420547, + "grad_norm": 7.036149501800537, + "learning_rate": 3.604719188767551e-05, + "loss": 2.7319, + "step": 3700 + }, + { + "epoch": 0.21706061315235198, + "grad_norm": 6.096189498901367, + "learning_rate": 3.614469578783152e-05, + "loss": 2.7485, + "step": 3710 + }, + { + "epoch": 0.21764568219049849, + "grad_norm": 7.679635524749756, + "learning_rate": 3.624219968798752e-05, + "loss": 2.7359, + "step": 3720 + }, + { + "epoch": 0.218230751228645, + "grad_norm": 6.784409046173096, + "learning_rate": 3.633970358814352e-05, + "loss": 2.7361, + "step": 3730 + }, + { + "epoch": 0.21881582026679147, + "grad_norm": 8.628121376037598, + "learning_rate": 3.6437207488299535e-05, + "loss": 2.7352, + "step": 3740 + }, + { + "epoch": 0.21940088930493798, + "grad_norm": 7.44931697845459, + "learning_rate": 3.653471138845554e-05, + "loss": 2.7748, + "step": 3750 + }, + { + "epoch": 0.2199859583430845, + "grad_norm": 8.921013832092285, + "learning_rate": 3.663221528861155e-05, + "loss": 2.7673, + "step": 3760 + }, + { + "epoch": 0.220571027381231, + "grad_norm": 6.315057754516602, + "learning_rate": 3.6729719188767554e-05, + "loss": 2.7237, + "step": 3770 + }, + { + "epoch": 0.22115609641937747, + "grad_norm": 6.82656717300415, + "learning_rate": 3.682722308892356e-05, + "loss": 2.7249, + "step": 3780 + }, + { + "epoch": 0.22174116545752398, + "grad_norm": 6.658133506774902, + "learning_rate": 3.6924726989079566e-05, + "loss": 2.719, + "step": 3790 + }, + { + "epoch": 0.2223262344956705, + "grad_norm": 6.600283145904541, + "learning_rate": 3.702223088923557e-05, + "loss": 2.7274, + "step": 3800 + }, + { + "epoch": 0.222911303533817, + "grad_norm": 6.567437648773193, + "learning_rate": 3.711973478939157e-05, + "loss": 2.7205, + "step": 3810 + }, + { + "epoch": 0.2234963725719635, + "grad_norm": 7.027734756469727, + "learning_rate": 3.7217238689547585e-05, + "loss": 2.6725, + "step": 3820 + }, + { + "epoch": 0.22408144161010998, + "grad_norm": 8.393385887145996, + "learning_rate": 3.731474258970359e-05, + "loss": 2.6966, + "step": 3830 + }, + { + "epoch": 0.2246665106482565, + "grad_norm": 6.2752275466918945, + "learning_rate": 3.74122464898596e-05, + "loss": 2.7036, + "step": 3840 + }, + { + "epoch": 0.225251579686403, + "grad_norm": 6.748968124389648, + "learning_rate": 3.7509750390015603e-05, + "loss": 2.7048, + "step": 3850 + }, + { + "epoch": 0.2258366487245495, + "grad_norm": 7.115694522857666, + "learning_rate": 3.760725429017161e-05, + "loss": 2.7172, + "step": 3860 + }, + { + "epoch": 0.22642171776269598, + "grad_norm": 6.330199718475342, + "learning_rate": 3.7704758190327616e-05, + "loss": 2.6938, + "step": 3870 + }, + { + "epoch": 0.2270067868008425, + "grad_norm": 5.923182487487793, + "learning_rate": 3.780226209048362e-05, + "loss": 2.6663, + "step": 3880 + }, + { + "epoch": 0.227591855838989, + "grad_norm": 6.957324504852295, + "learning_rate": 3.789976599063963e-05, + "loss": 2.6835, + "step": 3890 + }, + { + "epoch": 0.2281769248771355, + "grad_norm": 6.654111862182617, + "learning_rate": 3.7997269890795634e-05, + "loss": 2.6754, + "step": 3900 + }, + { + "epoch": 0.228761993915282, + "grad_norm": 7.712104797363281, + "learning_rate": 3.809477379095164e-05, + "loss": 2.6705, + "step": 3910 + }, + { + "epoch": 0.2293470629534285, + "grad_norm": 6.342864990234375, + "learning_rate": 3.819227769110765e-05, + "loss": 2.6945, + "step": 3920 + }, + { + "epoch": 0.229932131991575, + "grad_norm": 7.208116054534912, + "learning_rate": 3.828978159126365e-05, + "loss": 2.6806, + "step": 3930 + }, + { + "epoch": 0.2305172010297215, + "grad_norm": 5.796600818634033, + "learning_rate": 3.838728549141966e-05, + "loss": 2.6801, + "step": 3940 + }, + { + "epoch": 0.23110227006786802, + "grad_norm": 7.374347686767578, + "learning_rate": 3.8484789391575665e-05, + "loss": 2.6737, + "step": 3950 + }, + { + "epoch": 0.23168733910601452, + "grad_norm": 6.4351725578308105, + "learning_rate": 3.858229329173167e-05, + "loss": 2.6821, + "step": 3960 + }, + { + "epoch": 0.232272408144161, + "grad_norm": 5.877326011657715, + "learning_rate": 3.867979719188768e-05, + "loss": 2.682, + "step": 3970 + }, + { + "epoch": 0.2328574771823075, + "grad_norm": 7.407878875732422, + "learning_rate": 3.8777301092043684e-05, + "loss": 2.6574, + "step": 3980 + }, + { + "epoch": 0.23344254622045402, + "grad_norm": 6.586123943328857, + "learning_rate": 3.887480499219969e-05, + "loss": 2.6619, + "step": 3990 + }, + { + "epoch": 0.23402761525860052, + "grad_norm": 8.05532169342041, + "learning_rate": 3.8972308892355696e-05, + "loss": 2.6809, + "step": 4000 + }, + { + "epoch": 0.23402761525860052, + "eval_loss": 2.6895411014556885, + "eval_runtime": 32.9559, + "eval_samples_per_second": 685.371, + "eval_steps_per_second": 5.371, + "step": 4000 + }, + { + "epoch": 0.234612684296747, + "grad_norm": 7.000280857086182, + "learning_rate": 3.90698127925117e-05, + "loss": 2.6634, + "step": 4010 + }, + { + "epoch": 0.2351977533348935, + "grad_norm": 6.516899585723877, + "learning_rate": 3.916731669266771e-05, + "loss": 2.6467, + "step": 4020 + }, + { + "epoch": 0.23578282237304002, + "grad_norm": 7.567678451538086, + "learning_rate": 3.9264820592823715e-05, + "loss": 2.6625, + "step": 4030 + }, + { + "epoch": 0.23636789141118653, + "grad_norm": 7.418173313140869, + "learning_rate": 3.936232449297972e-05, + "loss": 2.6255, + "step": 4040 + }, + { + "epoch": 0.23695296044933303, + "grad_norm": 7.466084957122803, + "learning_rate": 3.945982839313573e-05, + "loss": 2.6319, + "step": 4050 + }, + { + "epoch": 0.2375380294874795, + "grad_norm": 5.958834171295166, + "learning_rate": 3.9557332293291733e-05, + "loss": 2.6463, + "step": 4060 + }, + { + "epoch": 0.23812309852562602, + "grad_norm": 6.8126115798950195, + "learning_rate": 3.965483619344774e-05, + "loss": 2.6294, + "step": 4070 + }, + { + "epoch": 0.23870816756377253, + "grad_norm": 5.8045172691345215, + "learning_rate": 3.9752340093603746e-05, + "loss": 2.635, + "step": 4080 + }, + { + "epoch": 0.23929323660191903, + "grad_norm": 5.712719440460205, + "learning_rate": 3.984984399375975e-05, + "loss": 2.6208, + "step": 4090 + }, + { + "epoch": 0.23987830564006551, + "grad_norm": 6.578694820404053, + "learning_rate": 3.994734789391576e-05, + "loss": 2.6243, + "step": 4100 + }, + { + "epoch": 0.24046337467821202, + "grad_norm": 7.061134338378906, + "learning_rate": 4.0044851794071764e-05, + "loss": 2.6237, + "step": 4110 + }, + { + "epoch": 0.24104844371635853, + "grad_norm": 7.11826229095459, + "learning_rate": 4.014235569422777e-05, + "loss": 2.6098, + "step": 4120 + }, + { + "epoch": 0.24163351275450504, + "grad_norm": 6.498807430267334, + "learning_rate": 4.023985959438378e-05, + "loss": 2.5917, + "step": 4130 + }, + { + "epoch": 0.24221858179265154, + "grad_norm": 8.083340644836426, + "learning_rate": 4.033736349453978e-05, + "loss": 2.6241, + "step": 4140 + }, + { + "epoch": 0.24280365083079802, + "grad_norm": 6.658701419830322, + "learning_rate": 4.043486739469579e-05, + "loss": 2.6111, + "step": 4150 + }, + { + "epoch": 0.24338871986894453, + "grad_norm": 6.658374309539795, + "learning_rate": 4.0532371294851795e-05, + "loss": 2.5974, + "step": 4160 + }, + { + "epoch": 0.24397378890709104, + "grad_norm": 5.500592231750488, + "learning_rate": 4.062987519500781e-05, + "loss": 2.6055, + "step": 4170 + }, + { + "epoch": 0.24455885794523755, + "grad_norm": 7.191110134124756, + "learning_rate": 4.072737909516381e-05, + "loss": 2.586, + "step": 4180 + }, + { + "epoch": 0.24514392698338405, + "grad_norm": 5.601481914520264, + "learning_rate": 4.0824882995319814e-05, + "loss": 2.5838, + "step": 4190 + }, + { + "epoch": 0.24572899602153053, + "grad_norm": 7.243915557861328, + "learning_rate": 4.092238689547582e-05, + "loss": 2.6021, + "step": 4200 + }, + { + "epoch": 0.24631406505967704, + "grad_norm": 6.745823383331299, + "learning_rate": 4.1019890795631826e-05, + "loss": 2.5894, + "step": 4210 + }, + { + "epoch": 0.24689913409782355, + "grad_norm": 6.750791549682617, + "learning_rate": 4.111739469578783e-05, + "loss": 2.5906, + "step": 4220 + }, + { + "epoch": 0.24748420313597005, + "grad_norm": 5.49812126159668, + "learning_rate": 4.121489859594384e-05, + "loss": 2.5963, + "step": 4230 + }, + { + "epoch": 0.24806927217411653, + "grad_norm": 6.068818092346191, + "learning_rate": 4.1312402496099845e-05, + "loss": 2.5714, + "step": 4240 + }, + { + "epoch": 0.24865434121226304, + "grad_norm": 8.156566619873047, + "learning_rate": 4.140990639625585e-05, + "loss": 2.5775, + "step": 4250 + }, + { + "epoch": 0.24923941025040955, + "grad_norm": 7.217544078826904, + "learning_rate": 4.150741029641186e-05, + "loss": 2.598, + "step": 4260 + }, + { + "epoch": 0.24982447928855606, + "grad_norm": 6.333588600158691, + "learning_rate": 4.1604914196567863e-05, + "loss": 2.5666, + "step": 4270 + }, + { + "epoch": 0.25040954832670254, + "grad_norm": 8.524089813232422, + "learning_rate": 4.170241809672387e-05, + "loss": 2.5593, + "step": 4280 + }, + { + "epoch": 0.25099461736484907, + "grad_norm": 6.358284950256348, + "learning_rate": 4.1799921996879876e-05, + "loss": 2.5823, + "step": 4290 + }, + { + "epoch": 0.25157968640299555, + "grad_norm": 7.729922771453857, + "learning_rate": 4.189742589703589e-05, + "loss": 2.5721, + "step": 4300 + }, + { + "epoch": 0.25216475544114203, + "grad_norm": 6.976046085357666, + "learning_rate": 4.199492979719189e-05, + "loss": 2.5428, + "step": 4310 + }, + { + "epoch": 0.25274982447928857, + "grad_norm": 5.643190860748291, + "learning_rate": 4.2092433697347894e-05, + "loss": 2.5569, + "step": 4320 + }, + { + "epoch": 0.25333489351743504, + "grad_norm": 6.348603248596191, + "learning_rate": 4.21899375975039e-05, + "loss": 2.5518, + "step": 4330 + }, + { + "epoch": 0.2539199625555816, + "grad_norm": 6.93218469619751, + "learning_rate": 4.228744149765991e-05, + "loss": 2.5556, + "step": 4340 + }, + { + "epoch": 0.25450503159372806, + "grad_norm": 5.46391487121582, + "learning_rate": 4.238494539781592e-05, + "loss": 2.5624, + "step": 4350 + }, + { + "epoch": 0.25509010063187454, + "grad_norm": 5.609133720397949, + "learning_rate": 4.248244929797192e-05, + "loss": 2.5607, + "step": 4360 + }, + { + "epoch": 0.2556751696700211, + "grad_norm": 5.519454479217529, + "learning_rate": 4.2579953198127925e-05, + "loss": 2.5633, + "step": 4370 + }, + { + "epoch": 0.25626023870816755, + "grad_norm": 6.445557117462158, + "learning_rate": 4.267745709828394e-05, + "loss": 2.5425, + "step": 4380 + }, + { + "epoch": 0.2568453077463141, + "grad_norm": 5.752540588378906, + "learning_rate": 4.277496099843994e-05, + "loss": 2.5257, + "step": 4390 + }, + { + "epoch": 0.25743037678446057, + "grad_norm": 5.979530334472656, + "learning_rate": 4.2872464898595944e-05, + "loss": 2.539, + "step": 4400 + }, + { + "epoch": 0.25801544582260705, + "grad_norm": 6.049847602844238, + "learning_rate": 4.296996879875195e-05, + "loss": 2.5352, + "step": 4410 + }, + { + "epoch": 0.2586005148607536, + "grad_norm": 6.667181968688965, + "learning_rate": 4.3067472698907956e-05, + "loss": 2.5467, + "step": 4420 + }, + { + "epoch": 0.25918558389890006, + "grad_norm": 7.697716236114502, + "learning_rate": 4.316497659906397e-05, + "loss": 2.5361, + "step": 4430 + }, + { + "epoch": 0.2597706529370466, + "grad_norm": 6.354778289794922, + "learning_rate": 4.326248049921997e-05, + "loss": 2.5129, + "step": 4440 + }, + { + "epoch": 0.2603557219751931, + "grad_norm": 6.5811967849731445, + "learning_rate": 4.3359984399375975e-05, + "loss": 2.5464, + "step": 4450 + }, + { + "epoch": 0.26094079101333956, + "grad_norm": 7.0095391273498535, + "learning_rate": 4.345748829953198e-05, + "loss": 2.5355, + "step": 4460 + }, + { + "epoch": 0.2615258600514861, + "grad_norm": 5.985919952392578, + "learning_rate": 4.355499219968799e-05, + "loss": 2.5218, + "step": 4470 + }, + { + "epoch": 0.26211092908963257, + "grad_norm": 5.000303268432617, + "learning_rate": 4.3652496099844e-05, + "loss": 2.495, + "step": 4480 + }, + { + "epoch": 0.26269599812777905, + "grad_norm": 6.065436363220215, + "learning_rate": 4.375e-05, + "loss": 2.5211, + "step": 4490 + }, + { + "epoch": 0.2632810671659256, + "grad_norm": 5.958404064178467, + "learning_rate": 4.3847503900156006e-05, + "loss": 2.5218, + "step": 4500 + }, + { + "epoch": 0.26386613620407207, + "grad_norm": 7.154194355010986, + "learning_rate": 4.394500780031202e-05, + "loss": 2.5217, + "step": 4510 + }, + { + "epoch": 0.2644512052422186, + "grad_norm": 6.142179012298584, + "learning_rate": 4.404251170046802e-05, + "loss": 2.5182, + "step": 4520 + }, + { + "epoch": 0.2650362742803651, + "grad_norm": 5.819743633270264, + "learning_rate": 4.414001560062403e-05, + "loss": 2.4883, + "step": 4530 + }, + { + "epoch": 0.26562134331851156, + "grad_norm": 5.624899387359619, + "learning_rate": 4.423751950078003e-05, + "loss": 2.4936, + "step": 4540 + }, + { + "epoch": 0.2662064123566581, + "grad_norm": 6.046993732452393, + "learning_rate": 4.433502340093604e-05, + "loss": 2.4942, + "step": 4550 + }, + { + "epoch": 0.2667914813948046, + "grad_norm": 5.426690578460693, + "learning_rate": 4.443252730109205e-05, + "loss": 2.5079, + "step": 4560 + }, + { + "epoch": 0.2673765504329511, + "grad_norm": 7.749000072479248, + "learning_rate": 4.453003120124805e-05, + "loss": 2.5156, + "step": 4570 + }, + { + "epoch": 0.2679616194710976, + "grad_norm": 5.698111057281494, + "learning_rate": 4.462753510140406e-05, + "loss": 2.4932, + "step": 4580 + }, + { + "epoch": 0.26854668850924407, + "grad_norm": 6.094777584075928, + "learning_rate": 4.472503900156007e-05, + "loss": 2.4866, + "step": 4590 + }, + { + "epoch": 0.2691317575473906, + "grad_norm": 6.259509563446045, + "learning_rate": 4.482254290171607e-05, + "loss": 2.5026, + "step": 4600 + }, + { + "epoch": 0.2697168265855371, + "grad_norm": 6.997308254241943, + "learning_rate": 4.492004680187208e-05, + "loss": 2.4985, + "step": 4610 + }, + { + "epoch": 0.2703018956236836, + "grad_norm": 7.453635215759277, + "learning_rate": 4.501755070202808e-05, + "loss": 2.4726, + "step": 4620 + }, + { + "epoch": 0.2708869646618301, + "grad_norm": 7.336577892303467, + "learning_rate": 4.5115054602184086e-05, + "loss": 2.515, + "step": 4630 + }, + { + "epoch": 0.2714720336999766, + "grad_norm": 5.836348056793213, + "learning_rate": 4.52125585023401e-05, + "loss": 2.4867, + "step": 4640 + }, + { + "epoch": 0.2720571027381231, + "grad_norm": 5.77445125579834, + "learning_rate": 4.53100624024961e-05, + "loss": 2.4811, + "step": 4650 + }, + { + "epoch": 0.2726421717762696, + "grad_norm": 7.142433166503906, + "learning_rate": 4.540756630265211e-05, + "loss": 2.4438, + "step": 4660 + }, + { + "epoch": 0.27322724081441613, + "grad_norm": 6.94520902633667, + "learning_rate": 4.550507020280812e-05, + "loss": 2.4817, + "step": 4670 + }, + { + "epoch": 0.2738123098525626, + "grad_norm": 7.556786060333252, + "learning_rate": 4.560257410296412e-05, + "loss": 2.459, + "step": 4680 + }, + { + "epoch": 0.2743973788907091, + "grad_norm": 6.7388410568237305, + "learning_rate": 4.570007800312013e-05, + "loss": 2.458, + "step": 4690 + }, + { + "epoch": 0.2749824479288556, + "grad_norm": 5.675704002380371, + "learning_rate": 4.579758190327613e-05, + "loss": 2.4655, + "step": 4700 + }, + { + "epoch": 0.2755675169670021, + "grad_norm": 5.095277309417725, + "learning_rate": 4.589508580343214e-05, + "loss": 2.4585, + "step": 4710 + }, + { + "epoch": 0.2761525860051486, + "grad_norm": 6.4241557121276855, + "learning_rate": 4.599258970358815e-05, + "loss": 2.4709, + "step": 4720 + }, + { + "epoch": 0.2767376550432951, + "grad_norm": 6.560291290283203, + "learning_rate": 4.609009360374415e-05, + "loss": 2.4494, + "step": 4730 + }, + { + "epoch": 0.2773227240814416, + "grad_norm": 6.607519149780273, + "learning_rate": 4.618759750390016e-05, + "loss": 2.4727, + "step": 4740 + }, + { + "epoch": 0.27790779311958813, + "grad_norm": 5.489260673522949, + "learning_rate": 4.628510140405616e-05, + "loss": 2.4726, + "step": 4750 + }, + { + "epoch": 0.2784928621577346, + "grad_norm": 6.01768159866333, + "learning_rate": 4.6382605304212173e-05, + "loss": 2.4309, + "step": 4760 + }, + { + "epoch": 0.2790779311958811, + "grad_norm": 5.07488489151001, + "learning_rate": 4.648010920436818e-05, + "loss": 2.4317, + "step": 4770 + }, + { + "epoch": 0.2796630002340276, + "grad_norm": 6.565321922302246, + "learning_rate": 4.657761310452418e-05, + "loss": 2.4372, + "step": 4780 + }, + { + "epoch": 0.2802480692721741, + "grad_norm": 6.225085258483887, + "learning_rate": 4.667511700468019e-05, + "loss": 2.4559, + "step": 4790 + }, + { + "epoch": 0.28083313831032064, + "grad_norm": 7.050281047821045, + "learning_rate": 4.67726209048362e-05, + "loss": 2.4525, + "step": 4800 + }, + { + "epoch": 0.2814182073484671, + "grad_norm": 8.226614952087402, + "learning_rate": 4.68701248049922e-05, + "loss": 2.4321, + "step": 4810 + }, + { + "epoch": 0.2820032763866136, + "grad_norm": 5.672790050506592, + "learning_rate": 4.696762870514821e-05, + "loss": 2.4455, + "step": 4820 + }, + { + "epoch": 0.28258834542476013, + "grad_norm": 4.979031562805176, + "learning_rate": 4.706513260530421e-05, + "loss": 2.4338, + "step": 4830 + }, + { + "epoch": 0.2831734144629066, + "grad_norm": 5.17700719833374, + "learning_rate": 4.716263650546022e-05, + "loss": 2.4459, + "step": 4840 + }, + { + "epoch": 0.28375848350105315, + "grad_norm": 5.427080154418945, + "learning_rate": 4.726014040561623e-05, + "loss": 2.4253, + "step": 4850 + }, + { + "epoch": 0.28434355253919963, + "grad_norm": 4.842417240142822, + "learning_rate": 4.735764430577223e-05, + "loss": 2.4411, + "step": 4860 + }, + { + "epoch": 0.2849286215773461, + "grad_norm": 5.447112083435059, + "learning_rate": 4.745514820592824e-05, + "loss": 2.4178, + "step": 4870 + }, + { + "epoch": 0.28551369061549264, + "grad_norm": 5.686522483825684, + "learning_rate": 4.755265210608425e-05, + "loss": 2.4088, + "step": 4880 + }, + { + "epoch": 0.2860987596536391, + "grad_norm": 5.213565349578857, + "learning_rate": 4.7650156006240254e-05, + "loss": 2.4149, + "step": 4890 + }, + { + "epoch": 0.28668382869178566, + "grad_norm": 5.793909072875977, + "learning_rate": 4.774765990639626e-05, + "loss": 2.4147, + "step": 4900 + }, + { + "epoch": 0.28726889772993214, + "grad_norm": 5.464046001434326, + "learning_rate": 4.784516380655226e-05, + "loss": 2.4118, + "step": 4910 + }, + { + "epoch": 0.2878539667680786, + "grad_norm": 4.820284843444824, + "learning_rate": 4.794266770670827e-05, + "loss": 2.4352, + "step": 4920 + }, + { + "epoch": 0.28843903580622515, + "grad_norm": 5.264814376831055, + "learning_rate": 4.804017160686428e-05, + "loss": 2.3934, + "step": 4930 + }, + { + "epoch": 0.28902410484437163, + "grad_norm": 5.475646495819092, + "learning_rate": 4.8137675507020285e-05, + "loss": 2.4216, + "step": 4940 + }, + { + "epoch": 0.2896091738825181, + "grad_norm": 6.288509368896484, + "learning_rate": 4.823517940717629e-05, + "loss": 2.3894, + "step": 4950 + }, + { + "epoch": 0.29019424292066465, + "grad_norm": 6.081909656524658, + "learning_rate": 4.833268330733229e-05, + "loss": 2.4193, + "step": 4960 + }, + { + "epoch": 0.2907793119588111, + "grad_norm": 4.7590436935424805, + "learning_rate": 4.8430187207488303e-05, + "loss": 2.3887, + "step": 4970 + }, + { + "epoch": 0.29136438099695766, + "grad_norm": 5.303084373474121, + "learning_rate": 4.852769110764431e-05, + "loss": 2.402, + "step": 4980 + }, + { + "epoch": 0.29194945003510414, + "grad_norm": 6.459832191467285, + "learning_rate": 4.8625195007800316e-05, + "loss": 2.3999, + "step": 4990 + }, + { + "epoch": 0.2925345190732506, + "grad_norm": 5.453962326049805, + "learning_rate": 4.872269890795632e-05, + "loss": 2.4185, + "step": 5000 + }, + { + "epoch": 0.2925345190732506, + "eval_loss": 2.4351727962493896, + "eval_runtime": 32.8715, + "eval_samples_per_second": 687.13, + "eval_steps_per_second": 5.385, + "step": 5000 + }, + { + "epoch": 0.29311958811139716, + "grad_norm": 4.966829299926758, + "learning_rate": 4.882020280811233e-05, + "loss": 2.3748, + "step": 5010 + }, + { + "epoch": 0.29370465714954364, + "grad_norm": 4.37595796585083, + "learning_rate": 4.8917706708268334e-05, + "loss": 2.3654, + "step": 5020 + }, + { + "epoch": 0.29428972618769017, + "grad_norm": 5.104454040527344, + "learning_rate": 4.901521060842434e-05, + "loss": 2.3867, + "step": 5030 + }, + { + "epoch": 0.29487479522583665, + "grad_norm": 5.938244819641113, + "learning_rate": 4.911271450858034e-05, + "loss": 2.3949, + "step": 5040 + }, + { + "epoch": 0.29545986426398313, + "grad_norm": 5.23139762878418, + "learning_rate": 4.921021840873635e-05, + "loss": 2.396, + "step": 5050 + }, + { + "epoch": 0.29604493330212966, + "grad_norm": 6.198992729187012, + "learning_rate": 4.930772230889236e-05, + "loss": 2.3883, + "step": 5060 + }, + { + "epoch": 0.29663000234027614, + "grad_norm": 5.254321575164795, + "learning_rate": 4.9405226209048365e-05, + "loss": 2.3867, + "step": 5070 + }, + { + "epoch": 0.2972150713784227, + "grad_norm": 5.424856662750244, + "learning_rate": 4.950273010920437e-05, + "loss": 2.375, + "step": 5080 + }, + { + "epoch": 0.29780014041656916, + "grad_norm": 5.270451545715332, + "learning_rate": 4.960023400936038e-05, + "loss": 2.389, + "step": 5090 + }, + { + "epoch": 0.29838520945471564, + "grad_norm": 5.924045085906982, + "learning_rate": 4.9697737909516384e-05, + "loss": 2.3755, + "step": 5100 + }, + { + "epoch": 0.2989702784928622, + "grad_norm": 5.89667272567749, + "learning_rate": 4.979524180967239e-05, + "loss": 2.3807, + "step": 5110 + }, + { + "epoch": 0.29955534753100865, + "grad_norm": 5.598618984222412, + "learning_rate": 4.9892745709828396e-05, + "loss": 2.3751, + "step": 5120 + }, + { + "epoch": 0.30014041656915513, + "grad_norm": 4.600474834442139, + "learning_rate": 4.99902496099844e-05, + "loss": 2.3626, + "step": 5130 + }, + { + "epoch": 0.30072548560730167, + "grad_norm": 6.219119548797607, + "learning_rate": 4.999024876484355e-05, + "loss": 2.3667, + "step": 5140 + }, + { + "epoch": 0.30131055464544815, + "grad_norm": 5.180619716644287, + "learning_rate": 4.9979414059114154e-05, + "loss": 2.3687, + "step": 5150 + }, + { + "epoch": 0.3018956236835947, + "grad_norm": 5.201261043548584, + "learning_rate": 4.9968579353384765e-05, + "loss": 2.3502, + "step": 5160 + }, + { + "epoch": 0.30248069272174116, + "grad_norm": 5.303366184234619, + "learning_rate": 4.9957744647655376e-05, + "loss": 2.3569, + "step": 5170 + }, + { + "epoch": 0.30306576175988764, + "grad_norm": 5.428822040557861, + "learning_rate": 4.994690994192598e-05, + "loss": 2.3348, + "step": 5180 + }, + { + "epoch": 0.3036508307980342, + "grad_norm": 5.351830005645752, + "learning_rate": 4.993607523619659e-05, + "loss": 2.3803, + "step": 5190 + }, + { + "epoch": 0.30423589983618066, + "grad_norm": 6.0211405754089355, + "learning_rate": 4.9925240530467194e-05, + "loss": 2.3644, + "step": 5200 + }, + { + "epoch": 0.3048209688743272, + "grad_norm": 7.071911811828613, + "learning_rate": 4.9914405824737805e-05, + "loss": 2.3277, + "step": 5210 + }, + { + "epoch": 0.30540603791247367, + "grad_norm": 5.648981094360352, + "learning_rate": 4.990357111900841e-05, + "loss": 2.3591, + "step": 5220 + }, + { + "epoch": 0.30599110695062015, + "grad_norm": 6.790600776672363, + "learning_rate": 4.989273641327902e-05, + "loss": 2.3524, + "step": 5230 + }, + { + "epoch": 0.3065761759887667, + "grad_norm": 4.542527675628662, + "learning_rate": 4.988190170754963e-05, + "loss": 2.3719, + "step": 5240 + }, + { + "epoch": 0.30716124502691317, + "grad_norm": 5.31064510345459, + "learning_rate": 4.9871067001820234e-05, + "loss": 2.3564, + "step": 5250 + }, + { + "epoch": 0.3077463140650597, + "grad_norm": 4.80157995223999, + "learning_rate": 4.9860232296090844e-05, + "loss": 2.3403, + "step": 5260 + }, + { + "epoch": 0.3083313831032062, + "grad_norm": 4.676851749420166, + "learning_rate": 4.984939759036145e-05, + "loss": 2.3512, + "step": 5270 + }, + { + "epoch": 0.30891645214135266, + "grad_norm": 5.491031646728516, + "learning_rate": 4.983856288463206e-05, + "loss": 2.3655, + "step": 5280 + }, + { + "epoch": 0.3095015211794992, + "grad_norm": 4.503449440002441, + "learning_rate": 4.982772817890266e-05, + "loss": 2.3377, + "step": 5290 + }, + { + "epoch": 0.3100865902176457, + "grad_norm": 5.480625629425049, + "learning_rate": 4.9816893473173274e-05, + "loss": 2.3358, + "step": 5300 + }, + { + "epoch": 0.3106716592557922, + "grad_norm": 6.19779634475708, + "learning_rate": 4.980605876744388e-05, + "loss": 2.3275, + "step": 5310 + }, + { + "epoch": 0.3112567282939387, + "grad_norm": 5.464353561401367, + "learning_rate": 4.979522406171449e-05, + "loss": 2.343, + "step": 5320 + }, + { + "epoch": 0.31184179733208517, + "grad_norm": 4.754026412963867, + "learning_rate": 4.978438935598509e-05, + "loss": 2.3409, + "step": 5330 + }, + { + "epoch": 0.3124268663702317, + "grad_norm": 4.763670921325684, + "learning_rate": 4.97735546502557e-05, + "loss": 2.3531, + "step": 5340 + }, + { + "epoch": 0.3130119354083782, + "grad_norm": 4.90180778503418, + "learning_rate": 4.9762719944526306e-05, + "loss": 2.3175, + "step": 5350 + }, + { + "epoch": 0.31359700444652466, + "grad_norm": 5.929008960723877, + "learning_rate": 4.975188523879692e-05, + "loss": 2.3216, + "step": 5360 + }, + { + "epoch": 0.3141820734846712, + "grad_norm": 4.891270637512207, + "learning_rate": 4.974105053306752e-05, + "loss": 2.299, + "step": 5370 + }, + { + "epoch": 0.3147671425228177, + "grad_norm": 5.027853488922119, + "learning_rate": 4.973021582733813e-05, + "loss": 2.3193, + "step": 5380 + }, + { + "epoch": 0.3153522115609642, + "grad_norm": 5.234158515930176, + "learning_rate": 4.9719381121608735e-05, + "loss": 2.3083, + "step": 5390 + }, + { + "epoch": 0.3159372805991107, + "grad_norm": 4.940448760986328, + "learning_rate": 4.9708546415879346e-05, + "loss": 2.3116, + "step": 5400 + }, + { + "epoch": 0.31652234963725717, + "grad_norm": 5.376123905181885, + "learning_rate": 4.969771171014995e-05, + "loss": 2.3172, + "step": 5410 + }, + { + "epoch": 0.3171074186754037, + "grad_norm": 5.094320297241211, + "learning_rate": 4.968687700442056e-05, + "loss": 2.3062, + "step": 5420 + }, + { + "epoch": 0.3176924877135502, + "grad_norm": 5.1166815757751465, + "learning_rate": 4.967604229869117e-05, + "loss": 2.3297, + "step": 5430 + }, + { + "epoch": 0.3182775567516967, + "grad_norm": 5.867978572845459, + "learning_rate": 4.9665207592961775e-05, + "loss": 2.3023, + "step": 5440 + }, + { + "epoch": 0.3188626257898432, + "grad_norm": 4.700488567352295, + "learning_rate": 4.9654372887232386e-05, + "loss": 2.333, + "step": 5450 + }, + { + "epoch": 0.3194476948279897, + "grad_norm": 4.4576802253723145, + "learning_rate": 4.964353818150299e-05, + "loss": 2.3016, + "step": 5460 + }, + { + "epoch": 0.3200327638661362, + "grad_norm": 4.902178764343262, + "learning_rate": 4.96327034757736e-05, + "loss": 2.3115, + "step": 5470 + }, + { + "epoch": 0.3206178329042827, + "grad_norm": 5.662380695343018, + "learning_rate": 4.9621868770044204e-05, + "loss": 2.3296, + "step": 5480 + }, + { + "epoch": 0.32120290194242923, + "grad_norm": 5.537148952484131, + "learning_rate": 4.9611034064314815e-05, + "loss": 2.3112, + "step": 5490 + }, + { + "epoch": 0.3217879709805757, + "grad_norm": 5.4465460777282715, + "learning_rate": 4.9600199358585425e-05, + "loss": 2.3103, + "step": 5500 + }, + { + "epoch": 0.3223730400187222, + "grad_norm": 4.8127007484436035, + "learning_rate": 4.958936465285603e-05, + "loss": 2.285, + "step": 5510 + }, + { + "epoch": 0.3229581090568687, + "grad_norm": 5.794064044952393, + "learning_rate": 4.957852994712664e-05, + "loss": 2.3158, + "step": 5520 + }, + { + "epoch": 0.3235431780950152, + "grad_norm": 4.6998419761657715, + "learning_rate": 4.9567695241397244e-05, + "loss": 2.3033, + "step": 5530 + }, + { + "epoch": 0.32412824713316174, + "grad_norm": 5.208774089813232, + "learning_rate": 4.9556860535667854e-05, + "loss": 2.3017, + "step": 5540 + }, + { + "epoch": 0.3247133161713082, + "grad_norm": 4.647343635559082, + "learning_rate": 4.9546025829938465e-05, + "loss": 2.3064, + "step": 5550 + }, + { + "epoch": 0.3252983852094547, + "grad_norm": 4.691363334655762, + "learning_rate": 4.953519112420907e-05, + "loss": 2.2911, + "step": 5560 + }, + { + "epoch": 0.32588345424760123, + "grad_norm": 5.073220252990723, + "learning_rate": 4.952435641847968e-05, + "loss": 2.3136, + "step": 5570 + }, + { + "epoch": 0.3264685232857477, + "grad_norm": 4.9619855880737305, + "learning_rate": 4.9513521712750283e-05, + "loss": 2.2836, + "step": 5580 + }, + { + "epoch": 0.3270535923238942, + "grad_norm": 4.865107536315918, + "learning_rate": 4.9502687007020894e-05, + "loss": 2.2882, + "step": 5590 + }, + { + "epoch": 0.32763866136204073, + "grad_norm": 5.411105155944824, + "learning_rate": 4.94918523012915e-05, + "loss": 2.2829, + "step": 5600 + }, + { + "epoch": 0.3282237304001872, + "grad_norm": 4.278438568115234, + "learning_rate": 4.948101759556211e-05, + "loss": 2.2705, + "step": 5610 + }, + { + "epoch": 0.32880879943833374, + "grad_norm": 4.657479763031006, + "learning_rate": 4.947018288983272e-05, + "loss": 2.2696, + "step": 5620 + }, + { + "epoch": 0.3293938684764802, + "grad_norm": 4.844244003295898, + "learning_rate": 4.945934818410332e-05, + "loss": 2.2699, + "step": 5630 + }, + { + "epoch": 0.3299789375146267, + "grad_norm": 5.175852298736572, + "learning_rate": 4.9448513478373934e-05, + "loss": 2.2651, + "step": 5640 + }, + { + "epoch": 0.33056400655277324, + "grad_norm": 5.476375102996826, + "learning_rate": 4.943767877264454e-05, + "loss": 2.281, + "step": 5650 + }, + { + "epoch": 0.3311490755909197, + "grad_norm": 4.5073161125183105, + "learning_rate": 4.942684406691515e-05, + "loss": 2.2873, + "step": 5660 + }, + { + "epoch": 0.33173414462906625, + "grad_norm": 4.299874782562256, + "learning_rate": 4.941600936118575e-05, + "loss": 2.2748, + "step": 5670 + }, + { + "epoch": 0.33231921366721273, + "grad_norm": 4.959343433380127, + "learning_rate": 4.940517465545636e-05, + "loss": 2.2561, + "step": 5680 + }, + { + "epoch": 0.3329042827053592, + "grad_norm": 3.562264919281006, + "learning_rate": 4.939433994972697e-05, + "loss": 2.2633, + "step": 5690 + }, + { + "epoch": 0.33348935174350575, + "grad_norm": 5.098947048187256, + "learning_rate": 4.938350524399758e-05, + "loss": 2.253, + "step": 5700 + }, + { + "epoch": 0.3340744207816522, + "grad_norm": 4.744395732879639, + "learning_rate": 4.937267053826818e-05, + "loss": 2.2502, + "step": 5710 + }, + { + "epoch": 0.33465948981979876, + "grad_norm": 4.796469211578369, + "learning_rate": 4.936183583253879e-05, + "loss": 2.2544, + "step": 5720 + }, + { + "epoch": 0.33524455885794524, + "grad_norm": 5.252956867218018, + "learning_rate": 4.9351001126809396e-05, + "loss": 2.2763, + "step": 5730 + }, + { + "epoch": 0.3358296278960917, + "grad_norm": 4.890120983123779, + "learning_rate": 4.9340166421080006e-05, + "loss": 2.2714, + "step": 5740 + }, + { + "epoch": 0.33641469693423826, + "grad_norm": 4.864804744720459, + "learning_rate": 4.932933171535061e-05, + "loss": 2.2417, + "step": 5750 + }, + { + "epoch": 0.33699976597238473, + "grad_norm": 4.437565803527832, + "learning_rate": 4.931849700962122e-05, + "loss": 2.2642, + "step": 5760 + }, + { + "epoch": 0.33758483501053127, + "grad_norm": 4.026050090789795, + "learning_rate": 4.9307662303891825e-05, + "loss": 2.2637, + "step": 5770 + }, + { + "epoch": 0.33816990404867775, + "grad_norm": 5.084876537322998, + "learning_rate": 4.9296827598162435e-05, + "loss": 2.2418, + "step": 5780 + }, + { + "epoch": 0.33875497308682423, + "grad_norm": 4.453270435333252, + "learning_rate": 4.928599289243304e-05, + "loss": 2.2494, + "step": 5790 + }, + { + "epoch": 0.33934004212497076, + "grad_norm": 4.903584957122803, + "learning_rate": 4.927515818670365e-05, + "loss": 2.2554, + "step": 5800 + }, + { + "epoch": 0.33992511116311724, + "grad_norm": 4.715571880340576, + "learning_rate": 4.926432348097426e-05, + "loss": 2.2412, + "step": 5810 + }, + { + "epoch": 0.3405101802012637, + "grad_norm": 4.284157752990723, + "learning_rate": 4.9253488775244864e-05, + "loss": 2.227, + "step": 5820 + }, + { + "epoch": 0.34109524923941026, + "grad_norm": 5.13087797164917, + "learning_rate": 4.9242654069515475e-05, + "loss": 2.2568, + "step": 5830 + }, + { + "epoch": 0.34168031827755674, + "grad_norm": 4.200436115264893, + "learning_rate": 4.923181936378608e-05, + "loss": 2.2428, + "step": 5840 + }, + { + "epoch": 0.3422653873157033, + "grad_norm": 4.256232261657715, + "learning_rate": 4.922098465805669e-05, + "loss": 2.2348, + "step": 5850 + }, + { + "epoch": 0.34285045635384975, + "grad_norm": 4.816508769989014, + "learning_rate": 4.9210149952327293e-05, + "loss": 2.251, + "step": 5860 + }, + { + "epoch": 0.34343552539199623, + "grad_norm": 4.752202987670898, + "learning_rate": 4.9199315246597904e-05, + "loss": 2.2462, + "step": 5870 + }, + { + "epoch": 0.34402059443014277, + "grad_norm": 5.202882289886475, + "learning_rate": 4.9188480540868515e-05, + "loss": 2.2369, + "step": 5880 + }, + { + "epoch": 0.34460566346828925, + "grad_norm": 4.4648332595825195, + "learning_rate": 4.917764583513912e-05, + "loss": 2.2206, + "step": 5890 + }, + { + "epoch": 0.3451907325064358, + "grad_norm": 4.554943561553955, + "learning_rate": 4.916681112940973e-05, + "loss": 2.2265, + "step": 5900 + }, + { + "epoch": 0.34577580154458226, + "grad_norm": 5.3754143714904785, + "learning_rate": 4.915597642368033e-05, + "loss": 2.2444, + "step": 5910 + }, + { + "epoch": 0.34636087058272874, + "grad_norm": 4.252150058746338, + "learning_rate": 4.9145141717950944e-05, + "loss": 2.2416, + "step": 5920 + }, + { + "epoch": 0.3469459396208753, + "grad_norm": 4.432397365570068, + "learning_rate": 4.9134307012221554e-05, + "loss": 2.2314, + "step": 5930 + }, + { + "epoch": 0.34753100865902176, + "grad_norm": 4.826864242553711, + "learning_rate": 4.912347230649216e-05, + "loss": 2.2394, + "step": 5940 + }, + { + "epoch": 0.3481160776971683, + "grad_norm": 4.455190658569336, + "learning_rate": 4.911263760076277e-05, + "loss": 2.2337, + "step": 5950 + }, + { + "epoch": 0.34870114673531477, + "grad_norm": 4.309401512145996, + "learning_rate": 4.910180289503337e-05, + "loss": 2.2299, + "step": 5960 + }, + { + "epoch": 0.34928621577346125, + "grad_norm": 3.90104079246521, + "learning_rate": 4.9090968189303983e-05, + "loss": 2.2187, + "step": 5970 + }, + { + "epoch": 0.3498712848116078, + "grad_norm": 3.97798490524292, + "learning_rate": 4.908013348357459e-05, + "loss": 2.2252, + "step": 5980 + }, + { + "epoch": 0.35045635384975427, + "grad_norm": 3.915165424346924, + "learning_rate": 4.90692987778452e-05, + "loss": 2.229, + "step": 5990 + }, + { + "epoch": 0.35104142288790074, + "grad_norm": 4.552550315856934, + "learning_rate": 4.905846407211581e-05, + "loss": 2.2271, + "step": 6000 + }, + { + "epoch": 0.35104142288790074, + "eval_loss": 2.2562904357910156, + "eval_runtime": 32.878, + "eval_samples_per_second": 686.995, + "eval_steps_per_second": 5.384, + "step": 6000 + }, + { + "epoch": 0.3516264919260473, + "grad_norm": 4.913104057312012, + "learning_rate": 4.904762936638641e-05, + "loss": 2.2402, + "step": 6010 + }, + { + "epoch": 0.35221156096419376, + "grad_norm": 4.51350736618042, + "learning_rate": 4.903679466065702e-05, + "loss": 2.2288, + "step": 6020 + }, + { + "epoch": 0.3527966300023403, + "grad_norm": 4.2252516746521, + "learning_rate": 4.902595995492763e-05, + "loss": 2.2299, + "step": 6030 + }, + { + "epoch": 0.3533816990404868, + "grad_norm": 4.424293041229248, + "learning_rate": 4.901512524919824e-05, + "loss": 2.2126, + "step": 6040 + }, + { + "epoch": 0.35396676807863325, + "grad_norm": 4.1710405349731445, + "learning_rate": 4.900429054346884e-05, + "loss": 2.226, + "step": 6050 + }, + { + "epoch": 0.3545518371167798, + "grad_norm": 5.2848711013793945, + "learning_rate": 4.899345583773945e-05, + "loss": 2.2067, + "step": 6060 + }, + { + "epoch": 0.35513690615492627, + "grad_norm": 4.0850958824157715, + "learning_rate": 4.8982621132010056e-05, + "loss": 2.2219, + "step": 6070 + }, + { + "epoch": 0.3557219751930728, + "grad_norm": 4.319845199584961, + "learning_rate": 4.897178642628067e-05, + "loss": 2.2306, + "step": 6080 + }, + { + "epoch": 0.3563070442312193, + "grad_norm": 4.531248569488525, + "learning_rate": 4.896095172055127e-05, + "loss": 2.2022, + "step": 6090 + }, + { + "epoch": 0.35689211326936576, + "grad_norm": 4.507875919342041, + "learning_rate": 4.895011701482188e-05, + "loss": 2.2074, + "step": 6100 + }, + { + "epoch": 0.3574771823075123, + "grad_norm": 4.480566501617432, + "learning_rate": 4.8939282309092485e-05, + "loss": 2.2137, + "step": 6110 + }, + { + "epoch": 0.3580622513456588, + "grad_norm": 4.958229064941406, + "learning_rate": 4.892844760336309e-05, + "loss": 2.2075, + "step": 6120 + }, + { + "epoch": 0.3586473203838053, + "grad_norm": 5.9645094871521, + "learning_rate": 4.89176128976337e-05, + "loss": 2.2138, + "step": 6130 + }, + { + "epoch": 0.3592323894219518, + "grad_norm": 3.6096811294555664, + "learning_rate": 4.890677819190431e-05, + "loss": 2.2238, + "step": 6140 + }, + { + "epoch": 0.35981745846009827, + "grad_norm": 3.838747501373291, + "learning_rate": 4.8895943486174914e-05, + "loss": 2.1938, + "step": 6150 + }, + { + "epoch": 0.3604025274982448, + "grad_norm": 4.420722007751465, + "learning_rate": 4.8885108780445525e-05, + "loss": 2.198, + "step": 6160 + }, + { + "epoch": 0.3609875965363913, + "grad_norm": 4.5280070304870605, + "learning_rate": 4.887427407471613e-05, + "loss": 2.2023, + "step": 6170 + }, + { + "epoch": 0.3615726655745378, + "grad_norm": 4.701565742492676, + "learning_rate": 4.886343936898674e-05, + "loss": 2.1884, + "step": 6180 + }, + { + "epoch": 0.3621577346126843, + "grad_norm": 4.535452842712402, + "learning_rate": 4.885260466325735e-05, + "loss": 2.1856, + "step": 6190 + }, + { + "epoch": 0.3627428036508308, + "grad_norm": 4.0062456130981445, + "learning_rate": 4.8841769957527954e-05, + "loss": 2.1899, + "step": 6200 + }, + { + "epoch": 0.3633278726889773, + "grad_norm": 4.14393424987793, + "learning_rate": 4.8830935251798564e-05, + "loss": 2.2034, + "step": 6210 + }, + { + "epoch": 0.3639129417271238, + "grad_norm": 4.001403331756592, + "learning_rate": 4.882010054606917e-05, + "loss": 2.2284, + "step": 6220 + }, + { + "epoch": 0.3644980107652703, + "grad_norm": 3.7970728874206543, + "learning_rate": 4.880926584033978e-05, + "loss": 2.2015, + "step": 6230 + }, + { + "epoch": 0.3650830798034168, + "grad_norm": 3.886841297149658, + "learning_rate": 4.879843113461038e-05, + "loss": 2.1859, + "step": 6240 + }, + { + "epoch": 0.3656681488415633, + "grad_norm": 4.358884334564209, + "learning_rate": 4.8787596428880993e-05, + "loss": 2.1725, + "step": 6250 + }, + { + "epoch": 0.3662532178797098, + "grad_norm": 3.984745979309082, + "learning_rate": 4.8776761723151604e-05, + "loss": 2.1993, + "step": 6260 + }, + { + "epoch": 0.3668382869178563, + "grad_norm": 4.465595722198486, + "learning_rate": 4.876592701742221e-05, + "loss": 2.1947, + "step": 6270 + }, + { + "epoch": 0.3674233559560028, + "grad_norm": 4.184237003326416, + "learning_rate": 4.875509231169282e-05, + "loss": 2.1927, + "step": 6280 + }, + { + "epoch": 0.3680084249941493, + "grad_norm": 4.188620567321777, + "learning_rate": 4.874425760596342e-05, + "loss": 2.201, + "step": 6290 + }, + { + "epoch": 0.3685934940322958, + "grad_norm": 4.463057041168213, + "learning_rate": 4.873342290023403e-05, + "loss": 2.1683, + "step": 6300 + }, + { + "epoch": 0.36917856307044233, + "grad_norm": 3.967475414276123, + "learning_rate": 4.872258819450464e-05, + "loss": 2.1873, + "step": 6310 + }, + { + "epoch": 0.3697636321085888, + "grad_norm": 4.6598286628723145, + "learning_rate": 4.871175348877525e-05, + "loss": 2.1928, + "step": 6320 + }, + { + "epoch": 0.3703487011467353, + "grad_norm": 3.6281821727752686, + "learning_rate": 4.870091878304586e-05, + "loss": 2.1729, + "step": 6330 + }, + { + "epoch": 0.37093377018488183, + "grad_norm": 3.9503822326660156, + "learning_rate": 4.869008407731646e-05, + "loss": 2.1854, + "step": 6340 + }, + { + "epoch": 0.3715188392230283, + "grad_norm": 4.612656593322754, + "learning_rate": 4.867924937158707e-05, + "loss": 2.1945, + "step": 6350 + }, + { + "epoch": 0.37210390826117484, + "grad_norm": 3.8910892009735107, + "learning_rate": 4.866841466585768e-05, + "loss": 2.1868, + "step": 6360 + }, + { + "epoch": 0.3726889772993213, + "grad_norm": 3.7996418476104736, + "learning_rate": 4.865757996012829e-05, + "loss": 2.1787, + "step": 6370 + }, + { + "epoch": 0.3732740463374678, + "grad_norm": 3.87351655960083, + "learning_rate": 4.86467452543989e-05, + "loss": 2.1948, + "step": 6380 + }, + { + "epoch": 0.37385911537561434, + "grad_norm": 3.9160208702087402, + "learning_rate": 4.86359105486695e-05, + "loss": 2.169, + "step": 6390 + }, + { + "epoch": 0.3744441844137608, + "grad_norm": 4.206494331359863, + "learning_rate": 4.862507584294011e-05, + "loss": 2.1668, + "step": 6400 + }, + { + "epoch": 0.37502925345190735, + "grad_norm": 4.287564277648926, + "learning_rate": 4.8614241137210716e-05, + "loss": 2.147, + "step": 6410 + }, + { + "epoch": 0.37561432249005383, + "grad_norm": 3.2932798862457275, + "learning_rate": 4.860340643148133e-05, + "loss": 2.1728, + "step": 6420 + }, + { + "epoch": 0.3761993915282003, + "grad_norm": 3.478147506713867, + "learning_rate": 4.859257172575193e-05, + "loss": 2.149, + "step": 6430 + }, + { + "epoch": 0.37678446056634685, + "grad_norm": 3.8248722553253174, + "learning_rate": 4.858173702002254e-05, + "loss": 2.1654, + "step": 6440 + }, + { + "epoch": 0.3773695296044933, + "grad_norm": 3.995995283126831, + "learning_rate": 4.8570902314293145e-05, + "loss": 2.1433, + "step": 6450 + }, + { + "epoch": 0.3779545986426398, + "grad_norm": 4.023491382598877, + "learning_rate": 4.8560067608563756e-05, + "loss": 2.1474, + "step": 6460 + }, + { + "epoch": 0.37853966768078634, + "grad_norm": 3.819854497909546, + "learning_rate": 4.854923290283436e-05, + "loss": 2.1426, + "step": 6470 + }, + { + "epoch": 0.3791247367189328, + "grad_norm": 4.172093391418457, + "learning_rate": 4.853839819710497e-05, + "loss": 2.1618, + "step": 6480 + }, + { + "epoch": 0.37970980575707936, + "grad_norm": 4.433021068572998, + "learning_rate": 4.8527563491375574e-05, + "loss": 2.1748, + "step": 6490 + }, + { + "epoch": 0.38029487479522583, + "grad_norm": 3.5362703800201416, + "learning_rate": 4.851672878564618e-05, + "loss": 2.159, + "step": 6500 + }, + { + "epoch": 0.3808799438333723, + "grad_norm": 4.867645740509033, + "learning_rate": 4.850589407991679e-05, + "loss": 2.1408, + "step": 6510 + }, + { + "epoch": 0.38146501287151885, + "grad_norm": 3.868028163909912, + "learning_rate": 4.84950593741874e-05, + "loss": 2.1691, + "step": 6520 + }, + { + "epoch": 0.38205008190966533, + "grad_norm": 4.7892632484436035, + "learning_rate": 4.8484224668458003e-05, + "loss": 2.1385, + "step": 6530 + }, + { + "epoch": 0.38263515094781186, + "grad_norm": 4.342502117156982, + "learning_rate": 4.8473389962728614e-05, + "loss": 2.1568, + "step": 6540 + }, + { + "epoch": 0.38322021998595834, + "grad_norm": 3.181288480758667, + "learning_rate": 4.846255525699922e-05, + "loss": 2.1646, + "step": 6550 + }, + { + "epoch": 0.3838052890241048, + "grad_norm": 3.6439156532287598, + "learning_rate": 4.845172055126983e-05, + "loss": 2.1437, + "step": 6560 + }, + { + "epoch": 0.38439035806225136, + "grad_norm": 3.674603223800659, + "learning_rate": 4.844088584554044e-05, + "loss": 2.139, + "step": 6570 + }, + { + "epoch": 0.38497542710039784, + "grad_norm": 3.6404075622558594, + "learning_rate": 4.843005113981104e-05, + "loss": 2.1575, + "step": 6580 + }, + { + "epoch": 0.3855604961385444, + "grad_norm": 4.105569362640381, + "learning_rate": 4.8419216434081654e-05, + "loss": 2.1287, + "step": 6590 + }, + { + "epoch": 0.38614556517669085, + "grad_norm": 3.6591904163360596, + "learning_rate": 4.840838172835226e-05, + "loss": 2.1481, + "step": 6600 + }, + { + "epoch": 0.38673063421483733, + "grad_norm": 3.716917037963867, + "learning_rate": 4.839754702262287e-05, + "loss": 2.1453, + "step": 6610 + }, + { + "epoch": 0.38731570325298387, + "grad_norm": 3.096949815750122, + "learning_rate": 4.838671231689347e-05, + "loss": 2.1441, + "step": 6620 + }, + { + "epoch": 0.38790077229113035, + "grad_norm": 4.8269877433776855, + "learning_rate": 4.837587761116408e-05, + "loss": 2.1459, + "step": 6630 + }, + { + "epoch": 0.3884858413292769, + "grad_norm": 3.99076509475708, + "learning_rate": 4.8365042905434693e-05, + "loss": 2.1447, + "step": 6640 + }, + { + "epoch": 0.38907091036742336, + "grad_norm": 4.299583435058594, + "learning_rate": 4.83542081997053e-05, + "loss": 2.1487, + "step": 6650 + }, + { + "epoch": 0.38965597940556984, + "grad_norm": 3.4991822242736816, + "learning_rate": 4.834337349397591e-05, + "loss": 2.1478, + "step": 6660 + }, + { + "epoch": 0.3902410484437164, + "grad_norm": 3.737506866455078, + "learning_rate": 4.833253878824651e-05, + "loss": 2.1309, + "step": 6670 + }, + { + "epoch": 0.39082611748186286, + "grad_norm": 3.6791441440582275, + "learning_rate": 4.832170408251712e-05, + "loss": 2.1178, + "step": 6680 + }, + { + "epoch": 0.39141118652000934, + "grad_norm": 3.7794225215911865, + "learning_rate": 4.8310869376787726e-05, + "loss": 2.1259, + "step": 6690 + }, + { + "epoch": 0.39199625555815587, + "grad_norm": 3.5005481243133545, + "learning_rate": 4.830003467105834e-05, + "loss": 2.1157, + "step": 6700 + }, + { + "epoch": 0.39258132459630235, + "grad_norm": 3.5226516723632812, + "learning_rate": 4.828919996532895e-05, + "loss": 2.1316, + "step": 6710 + }, + { + "epoch": 0.3931663936344489, + "grad_norm": 3.876497507095337, + "learning_rate": 4.827836525959955e-05, + "loss": 2.1423, + "step": 6720 + }, + { + "epoch": 0.39375146267259536, + "grad_norm": 3.6278369426727295, + "learning_rate": 4.826753055387016e-05, + "loss": 2.126, + "step": 6730 + }, + { + "epoch": 0.39433653171074184, + "grad_norm": 3.727219820022583, + "learning_rate": 4.8256695848140766e-05, + "loss": 2.1351, + "step": 6740 + }, + { + "epoch": 0.3949216007488884, + "grad_norm": 3.3571794033050537, + "learning_rate": 4.824586114241138e-05, + "loss": 2.1124, + "step": 6750 + }, + { + "epoch": 0.39550666978703486, + "grad_norm": 3.536644220352173, + "learning_rate": 4.823502643668198e-05, + "loss": 2.1305, + "step": 6760 + }, + { + "epoch": 0.3960917388251814, + "grad_norm": 3.635080575942993, + "learning_rate": 4.822419173095259e-05, + "loss": 2.1437, + "step": 6770 + }, + { + "epoch": 0.3966768078633279, + "grad_norm": 3.8800854682922363, + "learning_rate": 4.82133570252232e-05, + "loss": 2.1182, + "step": 6780 + }, + { + "epoch": 0.39726187690147435, + "grad_norm": 3.610511541366577, + "learning_rate": 4.8202522319493806e-05, + "loss": 2.1224, + "step": 6790 + }, + { + "epoch": 0.3978469459396209, + "grad_norm": 3.46185040473938, + "learning_rate": 4.8191687613764416e-05, + "loss": 2.1232, + "step": 6800 + }, + { + "epoch": 0.39843201497776737, + "grad_norm": 4.061455249786377, + "learning_rate": 4.818085290803502e-05, + "loss": 2.114, + "step": 6810 + }, + { + "epoch": 0.3990170840159139, + "grad_norm": 3.367604970932007, + "learning_rate": 4.817001820230563e-05, + "loss": 2.1408, + "step": 6820 + }, + { + "epoch": 0.3996021530540604, + "grad_norm": 3.236743927001953, + "learning_rate": 4.8159183496576235e-05, + "loss": 2.1306, + "step": 6830 + }, + { + "epoch": 0.40018722209220686, + "grad_norm": 3.9895989894866943, + "learning_rate": 4.8148348790846845e-05, + "loss": 2.1159, + "step": 6840 + }, + { + "epoch": 0.4007722911303534, + "grad_norm": 3.605041027069092, + "learning_rate": 4.813751408511745e-05, + "loss": 2.1265, + "step": 6850 + }, + { + "epoch": 0.4013573601684999, + "grad_norm": 3.9209108352661133, + "learning_rate": 4.812667937938806e-05, + "loss": 2.1244, + "step": 6860 + }, + { + "epoch": 0.40194242920664636, + "grad_norm": 3.7729952335357666, + "learning_rate": 4.8115844673658664e-05, + "loss": 2.1278, + "step": 6870 + }, + { + "epoch": 0.4025274982447929, + "grad_norm": 3.1840415000915527, + "learning_rate": 4.810500996792927e-05, + "loss": 2.1081, + "step": 6880 + }, + { + "epoch": 0.40311256728293937, + "grad_norm": 3.7455289363861084, + "learning_rate": 4.809417526219988e-05, + "loss": 2.1137, + "step": 6890 + }, + { + "epoch": 0.4036976363210859, + "grad_norm": 3.829472303390503, + "learning_rate": 4.808334055647049e-05, + "loss": 2.1014, + "step": 6900 + }, + { + "epoch": 0.4042827053592324, + "grad_norm": 3.576166868209839, + "learning_rate": 4.807250585074109e-05, + "loss": 2.0974, + "step": 6910 + }, + { + "epoch": 0.40486777439737887, + "grad_norm": 3.469914197921753, + "learning_rate": 4.8061671145011703e-05, + "loss": 2.1062, + "step": 6920 + }, + { + "epoch": 0.4054528434355254, + "grad_norm": 3.674126386642456, + "learning_rate": 4.805083643928231e-05, + "loss": 2.1468, + "step": 6930 + }, + { + "epoch": 0.4060379124736719, + "grad_norm": 3.9619503021240234, + "learning_rate": 4.804000173355292e-05, + "loss": 2.1047, + "step": 6940 + }, + { + "epoch": 0.4066229815118184, + "grad_norm": 4.336227893829346, + "learning_rate": 4.802916702782352e-05, + "loss": 2.1179, + "step": 6950 + }, + { + "epoch": 0.4072080505499649, + "grad_norm": 3.4930968284606934, + "learning_rate": 4.801833232209413e-05, + "loss": 2.1162, + "step": 6960 + }, + { + "epoch": 0.4077931195881114, + "grad_norm": 4.0153093338012695, + "learning_rate": 4.800749761636474e-05, + "loss": 2.1258, + "step": 6970 + }, + { + "epoch": 0.4083781886262579, + "grad_norm": 4.021363258361816, + "learning_rate": 4.799666291063535e-05, + "loss": 2.1077, + "step": 6980 + }, + { + "epoch": 0.4089632576644044, + "grad_norm": 3.5499660968780518, + "learning_rate": 4.798582820490596e-05, + "loss": 2.0913, + "step": 6990 + }, + { + "epoch": 0.4095483267025509, + "grad_norm": 3.684610605239868, + "learning_rate": 4.797499349917656e-05, + "loss": 2.1125, + "step": 7000 + }, + { + "epoch": 0.4095483267025509, + "eval_loss": 2.1340787410736084, + "eval_runtime": 33.2107, + "eval_samples_per_second": 680.113, + "eval_steps_per_second": 5.33, + "step": 7000 + }, + { + "epoch": 0.4101333957406974, + "grad_norm": 3.1665306091308594, + "learning_rate": 4.796415879344717e-05, + "loss": 2.1156, + "step": 7010 + }, + { + "epoch": 0.4107184647788439, + "grad_norm": 3.7780699729919434, + "learning_rate": 4.795332408771778e-05, + "loss": 2.1167, + "step": 7020 + }, + { + "epoch": 0.4113035338169904, + "grad_norm": 3.646026372909546, + "learning_rate": 4.794248938198839e-05, + "loss": 2.0914, + "step": 7030 + }, + { + "epoch": 0.4118886028551369, + "grad_norm": 3.2241885662078857, + "learning_rate": 4.7931654676259e-05, + "loss": 2.0817, + "step": 7040 + }, + { + "epoch": 0.41247367189328343, + "grad_norm": 3.7400856018066406, + "learning_rate": 4.79208199705296e-05, + "loss": 2.1034, + "step": 7050 + }, + { + "epoch": 0.4130587409314299, + "grad_norm": 3.5327847003936768, + "learning_rate": 4.790998526480021e-05, + "loss": 2.1021, + "step": 7060 + }, + { + "epoch": 0.4136438099695764, + "grad_norm": 3.391343116760254, + "learning_rate": 4.7899150559070816e-05, + "loss": 2.0921, + "step": 7070 + }, + { + "epoch": 0.4142288790077229, + "grad_norm": 3.5834646224975586, + "learning_rate": 4.7888315853341426e-05, + "loss": 2.1011, + "step": 7080 + }, + { + "epoch": 0.4148139480458694, + "grad_norm": 3.5589120388031006, + "learning_rate": 4.787748114761204e-05, + "loss": 2.1088, + "step": 7090 + }, + { + "epoch": 0.4153990170840159, + "grad_norm": 3.714026689529419, + "learning_rate": 4.786664644188264e-05, + "loss": 2.0905, + "step": 7100 + }, + { + "epoch": 0.4159840861221624, + "grad_norm": 3.1687493324279785, + "learning_rate": 4.785581173615325e-05, + "loss": 2.0949, + "step": 7110 + }, + { + "epoch": 0.4165691551603089, + "grad_norm": 3.405789375305176, + "learning_rate": 4.7844977030423855e-05, + "loss": 2.1164, + "step": 7120 + }, + { + "epoch": 0.41715422419845544, + "grad_norm": 3.6965200901031494, + "learning_rate": 4.7834142324694466e-05, + "loss": 2.0818, + "step": 7130 + }, + { + "epoch": 0.4177392932366019, + "grad_norm": 3.331749677658081, + "learning_rate": 4.782330761896507e-05, + "loss": 2.0815, + "step": 7140 + }, + { + "epoch": 0.4183243622747484, + "grad_norm": 3.4534761905670166, + "learning_rate": 4.781247291323568e-05, + "loss": 2.0866, + "step": 7150 + }, + { + "epoch": 0.41890943131289493, + "grad_norm": 3.4378294944763184, + "learning_rate": 4.780163820750629e-05, + "loss": 2.0796, + "step": 7160 + }, + { + "epoch": 0.4194945003510414, + "grad_norm": 3.429190158843994, + "learning_rate": 4.7790803501776895e-05, + "loss": 2.1054, + "step": 7170 + }, + { + "epoch": 0.42007956938918795, + "grad_norm": 3.732862949371338, + "learning_rate": 4.7779968796047506e-05, + "loss": 2.0766, + "step": 7180 + }, + { + "epoch": 0.4206646384273344, + "grad_norm": 3.3273720741271973, + "learning_rate": 4.776913409031811e-05, + "loss": 2.0829, + "step": 7190 + }, + { + "epoch": 0.4212497074654809, + "grad_norm": 3.6161081790924072, + "learning_rate": 4.775829938458872e-05, + "loss": 2.0964, + "step": 7200 + }, + { + "epoch": 0.42183477650362744, + "grad_norm": 3.734306573867798, + "learning_rate": 4.7747464678859324e-05, + "loss": 2.0983, + "step": 7210 + }, + { + "epoch": 0.4224198455417739, + "grad_norm": 3.6056926250457764, + "learning_rate": 4.7736629973129935e-05, + "loss": 2.0983, + "step": 7220 + }, + { + "epoch": 0.42300491457992045, + "grad_norm": 4.19010591506958, + "learning_rate": 4.772579526740054e-05, + "loss": 2.088, + "step": 7230 + }, + { + "epoch": 0.42358998361806693, + "grad_norm": 3.8854901790618896, + "learning_rate": 4.771496056167115e-05, + "loss": 2.0525, + "step": 7240 + }, + { + "epoch": 0.4241750526562134, + "grad_norm": 2.9379961490631104, + "learning_rate": 4.770412585594175e-05, + "loss": 2.0687, + "step": 7250 + }, + { + "epoch": 0.42476012169435995, + "grad_norm": 3.3113505840301514, + "learning_rate": 4.769329115021236e-05, + "loss": 2.0761, + "step": 7260 + }, + { + "epoch": 0.42534519073250643, + "grad_norm": 3.4135236740112305, + "learning_rate": 4.768245644448297e-05, + "loss": 2.0778, + "step": 7270 + }, + { + "epoch": 0.42593025977065296, + "grad_norm": 3.078312635421753, + "learning_rate": 4.767162173875358e-05, + "loss": 2.0794, + "step": 7280 + }, + { + "epoch": 0.42651532880879944, + "grad_norm": 3.3383445739746094, + "learning_rate": 4.766078703302418e-05, + "loss": 2.1035, + "step": 7290 + }, + { + "epoch": 0.4271003978469459, + "grad_norm": 3.7967700958251953, + "learning_rate": 4.764995232729479e-05, + "loss": 2.0653, + "step": 7300 + }, + { + "epoch": 0.42768546688509246, + "grad_norm": 3.5977187156677246, + "learning_rate": 4.76391176215654e-05, + "loss": 2.0886, + "step": 7310 + }, + { + "epoch": 0.42827053592323894, + "grad_norm": 3.4763169288635254, + "learning_rate": 4.762828291583601e-05, + "loss": 2.0761, + "step": 7320 + }, + { + "epoch": 0.4288556049613854, + "grad_norm": 3.5627222061157227, + "learning_rate": 4.761744821010661e-05, + "loss": 2.0738, + "step": 7330 + }, + { + "epoch": 0.42944067399953195, + "grad_norm": 3.332817554473877, + "learning_rate": 4.760661350437722e-05, + "loss": 2.0494, + "step": 7340 + }, + { + "epoch": 0.43002574303767843, + "grad_norm": 3.426633834838867, + "learning_rate": 4.759577879864783e-05, + "loss": 2.085, + "step": 7350 + }, + { + "epoch": 0.43061081207582497, + "grad_norm": 4.186905860900879, + "learning_rate": 4.7584944092918436e-05, + "loss": 2.0687, + "step": 7360 + }, + { + "epoch": 0.43119588111397145, + "grad_norm": 3.312936544418335, + "learning_rate": 4.757410938718905e-05, + "loss": 2.083, + "step": 7370 + }, + { + "epoch": 0.4317809501521179, + "grad_norm": 3.5514976978302, + "learning_rate": 4.756327468145965e-05, + "loss": 2.0851, + "step": 7380 + }, + { + "epoch": 0.43236601919026446, + "grad_norm": 3.7130894660949707, + "learning_rate": 4.755243997573026e-05, + "loss": 2.0737, + "step": 7390 + }, + { + "epoch": 0.43295108822841094, + "grad_norm": 3.208683490753174, + "learning_rate": 4.7541605270000865e-05, + "loss": 2.0561, + "step": 7400 + }, + { + "epoch": 0.4335361572665575, + "grad_norm": 2.9311470985412598, + "learning_rate": 4.7530770564271476e-05, + "loss": 2.0777, + "step": 7410 + }, + { + "epoch": 0.43412122630470396, + "grad_norm": 3.604767322540283, + "learning_rate": 4.751993585854209e-05, + "loss": 2.0724, + "step": 7420 + }, + { + "epoch": 0.43470629534285044, + "grad_norm": 3.267223596572876, + "learning_rate": 4.750910115281269e-05, + "loss": 2.0586, + "step": 7430 + }, + { + "epoch": 0.43529136438099697, + "grad_norm": 3.4856209754943848, + "learning_rate": 4.74982664470833e-05, + "loss": 2.0406, + "step": 7440 + }, + { + "epoch": 0.43587643341914345, + "grad_norm": 3.0627779960632324, + "learning_rate": 4.7487431741353905e-05, + "loss": 2.0815, + "step": 7450 + }, + { + "epoch": 0.43646150245729, + "grad_norm": 3.3373963832855225, + "learning_rate": 4.7476597035624516e-05, + "loss": 2.0607, + "step": 7460 + }, + { + "epoch": 0.43704657149543646, + "grad_norm": 3.3001632690429688, + "learning_rate": 4.7465762329895126e-05, + "loss": 2.0863, + "step": 7470 + }, + { + "epoch": 0.43763164053358294, + "grad_norm": 3.293153762817383, + "learning_rate": 4.745492762416573e-05, + "loss": 2.0566, + "step": 7480 + }, + { + "epoch": 0.4382167095717295, + "grad_norm": 3.0837268829345703, + "learning_rate": 4.744409291843634e-05, + "loss": 2.0595, + "step": 7490 + }, + { + "epoch": 0.43880177860987596, + "grad_norm": 3.005446672439575, + "learning_rate": 4.7433258212706945e-05, + "loss": 2.0765, + "step": 7500 + }, + { + "epoch": 0.4393868476480225, + "grad_norm": 3.1165568828582764, + "learning_rate": 4.7422423506977555e-05, + "loss": 2.0745, + "step": 7510 + }, + { + "epoch": 0.439971916686169, + "grad_norm": 3.6857714653015137, + "learning_rate": 4.741158880124816e-05, + "loss": 2.0614, + "step": 7520 + }, + { + "epoch": 0.44055698572431545, + "grad_norm": 3.5658390522003174, + "learning_rate": 4.740075409551877e-05, + "loss": 2.0433, + "step": 7530 + }, + { + "epoch": 0.441142054762462, + "grad_norm": 3.83376407623291, + "learning_rate": 4.738991938978938e-05, + "loss": 2.0543, + "step": 7540 + }, + { + "epoch": 0.44172712380060847, + "grad_norm": 3.5536608695983887, + "learning_rate": 4.7379084684059984e-05, + "loss": 2.06, + "step": 7550 + }, + { + "epoch": 0.44231219283875495, + "grad_norm": 3.2147018909454346, + "learning_rate": 4.7368249978330595e-05, + "loss": 2.0606, + "step": 7560 + }, + { + "epoch": 0.4428972618769015, + "grad_norm": 3.2198257446289062, + "learning_rate": 4.73574152726012e-05, + "loss": 2.0558, + "step": 7570 + }, + { + "epoch": 0.44348233091504796, + "grad_norm": 3.0399866104125977, + "learning_rate": 4.734658056687181e-05, + "loss": 2.0628, + "step": 7580 + }, + { + "epoch": 0.4440673999531945, + "grad_norm": 3.3716390132904053, + "learning_rate": 4.733574586114241e-05, + "loss": 2.0527, + "step": 7590 + }, + { + "epoch": 0.444652468991341, + "grad_norm": 4.297920227050781, + "learning_rate": 4.7324911155413024e-05, + "loss": 2.0598, + "step": 7600 + }, + { + "epoch": 0.44523753802948746, + "grad_norm": 3.2941293716430664, + "learning_rate": 4.731407644968363e-05, + "loss": 2.0409, + "step": 7610 + }, + { + "epoch": 0.445822607067634, + "grad_norm": 3.186908006668091, + "learning_rate": 4.730324174395424e-05, + "loss": 2.0494, + "step": 7620 + }, + { + "epoch": 0.44640767610578047, + "grad_norm": 3.0207295417785645, + "learning_rate": 4.729240703822484e-05, + "loss": 2.0312, + "step": 7630 + }, + { + "epoch": 0.446992745143927, + "grad_norm": 3.2845561504364014, + "learning_rate": 4.7281572332495446e-05, + "loss": 2.0638, + "step": 7640 + }, + { + "epoch": 0.4475778141820735, + "grad_norm": 3.0266265869140625, + "learning_rate": 4.727073762676606e-05, + "loss": 2.0337, + "step": 7650 + }, + { + "epoch": 0.44816288322021997, + "grad_norm": 3.1072146892547607, + "learning_rate": 4.725990292103667e-05, + "loss": 2.0494, + "step": 7660 + }, + { + "epoch": 0.4487479522583665, + "grad_norm": 3.867323398590088, + "learning_rate": 4.724906821530727e-05, + "loss": 2.0274, + "step": 7670 + }, + { + "epoch": 0.449333021296513, + "grad_norm": 3.6978070735931396, + "learning_rate": 4.723823350957788e-05, + "loss": 2.0433, + "step": 7680 + }, + { + "epoch": 0.4499180903346595, + "grad_norm": 2.7239317893981934, + "learning_rate": 4.7227398803848486e-05, + "loss": 2.0591, + "step": 7690 + }, + { + "epoch": 0.450503159372806, + "grad_norm": 3.2210021018981934, + "learning_rate": 4.7216564098119097e-05, + "loss": 2.0546, + "step": 7700 + }, + { + "epoch": 0.4510882284109525, + "grad_norm": 3.0967519283294678, + "learning_rate": 4.72057293923897e-05, + "loss": 2.0294, + "step": 7710 + }, + { + "epoch": 0.451673297449099, + "grad_norm": 2.9180734157562256, + "learning_rate": 4.719489468666031e-05, + "loss": 2.0485, + "step": 7720 + }, + { + "epoch": 0.4522583664872455, + "grad_norm": 3.779921054840088, + "learning_rate": 4.718405998093092e-05, + "loss": 2.0204, + "step": 7730 + }, + { + "epoch": 0.45284343552539197, + "grad_norm": 3.4785077571868896, + "learning_rate": 4.7173225275201526e-05, + "loss": 2.0232, + "step": 7740 + }, + { + "epoch": 0.4534285045635385, + "grad_norm": 3.6493630409240723, + "learning_rate": 4.7162390569472136e-05, + "loss": 2.0414, + "step": 7750 + }, + { + "epoch": 0.454013573601685, + "grad_norm": 3.237908124923706, + "learning_rate": 4.715155586374274e-05, + "loss": 2.0499, + "step": 7760 + }, + { + "epoch": 0.4545986426398315, + "grad_norm": 3.2096498012542725, + "learning_rate": 4.714072115801335e-05, + "loss": 2.0469, + "step": 7770 + }, + { + "epoch": 0.455183711677978, + "grad_norm": 3.6893773078918457, + "learning_rate": 4.7129886452283955e-05, + "loss": 2.0415, + "step": 7780 + }, + { + "epoch": 0.4557687807161245, + "grad_norm": 3.96789288520813, + "learning_rate": 4.7119051746554565e-05, + "loss": 2.0342, + "step": 7790 + }, + { + "epoch": 0.456353849754271, + "grad_norm": 2.856855630874634, + "learning_rate": 4.7108217040825176e-05, + "loss": 2.0066, + "step": 7800 + }, + { + "epoch": 0.4569389187924175, + "grad_norm": 2.9286069869995117, + "learning_rate": 4.709738233509578e-05, + "loss": 2.0206, + "step": 7810 + }, + { + "epoch": 0.457523987830564, + "grad_norm": 2.9229488372802734, + "learning_rate": 4.708654762936639e-05, + "loss": 2.031, + "step": 7820 + }, + { + "epoch": 0.4581090568687105, + "grad_norm": 2.753567695617676, + "learning_rate": 4.7075712923636994e-05, + "loss": 2.0576, + "step": 7830 + }, + { + "epoch": 0.458694125906857, + "grad_norm": 3.075653553009033, + "learning_rate": 4.7064878217907605e-05, + "loss": 2.0372, + "step": 7840 + }, + { + "epoch": 0.4592791949450035, + "grad_norm": 3.623875379562378, + "learning_rate": 4.7054043512178216e-05, + "loss": 2.0189, + "step": 7850 + }, + { + "epoch": 0.45986426398315, + "grad_norm": 3.7254042625427246, + "learning_rate": 4.704320880644882e-05, + "loss": 2.0444, + "step": 7860 + }, + { + "epoch": 0.46044933302129654, + "grad_norm": 3.1463561058044434, + "learning_rate": 4.703237410071943e-05, + "loss": 2.0362, + "step": 7870 + }, + { + "epoch": 0.461034402059443, + "grad_norm": 2.980988025665283, + "learning_rate": 4.7021539394990034e-05, + "loss": 2.0029, + "step": 7880 + }, + { + "epoch": 0.4616194710975895, + "grad_norm": 2.9668774604797363, + "learning_rate": 4.7010704689260645e-05, + "loss": 2.01, + "step": 7890 + }, + { + "epoch": 0.46220454013573603, + "grad_norm": 2.988330602645874, + "learning_rate": 4.699986998353125e-05, + "loss": 2.0128, + "step": 7900 + }, + { + "epoch": 0.4627896091738825, + "grad_norm": 3.223737955093384, + "learning_rate": 4.698903527780186e-05, + "loss": 2.0377, + "step": 7910 + }, + { + "epoch": 0.46337467821202905, + "grad_norm": 3.488316774368286, + "learning_rate": 4.697820057207247e-05, + "loss": 2.0406, + "step": 7920 + }, + { + "epoch": 0.4639597472501755, + "grad_norm": 3.1316211223602295, + "learning_rate": 4.6967365866343074e-05, + "loss": 2.0445, + "step": 7930 + }, + { + "epoch": 0.464544816288322, + "grad_norm": 3.0853753089904785, + "learning_rate": 4.6956531160613684e-05, + "loss": 2.018, + "step": 7940 + }, + { + "epoch": 0.46512988532646854, + "grad_norm": 3.0089550018310547, + "learning_rate": 4.694569645488429e-05, + "loss": 2.0484, + "step": 7950 + }, + { + "epoch": 0.465714954364615, + "grad_norm": 3.0519258975982666, + "learning_rate": 4.69348617491549e-05, + "loss": 2.0253, + "step": 7960 + }, + { + "epoch": 0.4663000234027615, + "grad_norm": 3.008615255355835, + "learning_rate": 4.69240270434255e-05, + "loss": 2.0241, + "step": 7970 + }, + { + "epoch": 0.46688509244090803, + "grad_norm": 3.0559892654418945, + "learning_rate": 4.691319233769611e-05, + "loss": 2.0306, + "step": 7980 + }, + { + "epoch": 0.4674701614790545, + "grad_norm": 3.179555654525757, + "learning_rate": 4.690235763196672e-05, + "loss": 2.0247, + "step": 7990 + }, + { + "epoch": 0.46805523051720105, + "grad_norm": 2.9754326343536377, + "learning_rate": 4.689152292623733e-05, + "loss": 2.0181, + "step": 8000 + }, + { + "epoch": 0.46805523051720105, + "eval_loss": 2.0491013526916504, + "eval_runtime": 32.8892, + "eval_samples_per_second": 686.761, + "eval_steps_per_second": 5.382, + "step": 8000 + }, + { + "epoch": 0.46864029955534753, + "grad_norm": 3.0024614334106445, + "learning_rate": 4.688177169108087e-05, + "loss": 2.0308, + "step": 8010 + }, + { + "epoch": 0.469225368593494, + "grad_norm": 3.2817656993865967, + "learning_rate": 4.687093698535148e-05, + "loss": 2.0252, + "step": 8020 + }, + { + "epoch": 0.46981043763164054, + "grad_norm": 3.4774303436279297, + "learning_rate": 4.6860102279622084e-05, + "loss": 2.0253, + "step": 8030 + }, + { + "epoch": 0.470395506669787, + "grad_norm": 3.1436619758605957, + "learning_rate": 4.6849267573892694e-05, + "loss": 1.9974, + "step": 8040 + }, + { + "epoch": 0.47098057570793356, + "grad_norm": 3.2040257453918457, + "learning_rate": 4.68384328681633e-05, + "loss": 2.0182, + "step": 8050 + }, + { + "epoch": 0.47156564474608004, + "grad_norm": 2.7292635440826416, + "learning_rate": 4.682759816243391e-05, + "loss": 2.0051, + "step": 8060 + }, + { + "epoch": 0.4721507137842265, + "grad_norm": 3.0518476963043213, + "learning_rate": 4.681676345670451e-05, + "loss": 2.0317, + "step": 8070 + }, + { + "epoch": 0.47273578282237305, + "grad_norm": 3.0847766399383545, + "learning_rate": 4.6805928750975124e-05, + "loss": 2.0107, + "step": 8080 + }, + { + "epoch": 0.47332085186051953, + "grad_norm": 3.142925262451172, + "learning_rate": 4.6795094045245734e-05, + "loss": 1.9974, + "step": 8090 + }, + { + "epoch": 0.47390592089866607, + "grad_norm": 3.2043142318725586, + "learning_rate": 4.678425933951634e-05, + "loss": 2.004, + "step": 8100 + }, + { + "epoch": 0.47449098993681255, + "grad_norm": 3.5305492877960205, + "learning_rate": 4.677342463378695e-05, + "loss": 2.0057, + "step": 8110 + }, + { + "epoch": 0.475076058974959, + "grad_norm": 3.114525556564331, + "learning_rate": 4.676258992805755e-05, + "loss": 2.0039, + "step": 8120 + }, + { + "epoch": 0.47566112801310556, + "grad_norm": 3.0347917079925537, + "learning_rate": 4.675175522232816e-05, + "loss": 2.0058, + "step": 8130 + }, + { + "epoch": 0.47624619705125204, + "grad_norm": 3.3880183696746826, + "learning_rate": 4.6740920516598774e-05, + "loss": 1.9881, + "step": 8140 + }, + { + "epoch": 0.4768312660893986, + "grad_norm": 3.2141289710998535, + "learning_rate": 4.673008581086938e-05, + "loss": 2.0171, + "step": 8150 + }, + { + "epoch": 0.47741633512754506, + "grad_norm": 3.0941874980926514, + "learning_rate": 4.671925110513999e-05, + "loss": 2.0026, + "step": 8160 + }, + { + "epoch": 0.47800140416569153, + "grad_norm": 3.1300344467163086, + "learning_rate": 4.670841639941059e-05, + "loss": 2.0248, + "step": 8170 + }, + { + "epoch": 0.47858647320383807, + "grad_norm": 2.854504108428955, + "learning_rate": 4.66975816936812e-05, + "loss": 2.0093, + "step": 8180 + }, + { + "epoch": 0.47917154224198455, + "grad_norm": 3.4655094146728516, + "learning_rate": 4.668674698795181e-05, + "loss": 2.0159, + "step": 8190 + }, + { + "epoch": 0.47975661128013103, + "grad_norm": 3.1394827365875244, + "learning_rate": 4.667591228222242e-05, + "loss": 2.0063, + "step": 8200 + }, + { + "epoch": 0.48034168031827756, + "grad_norm": 2.783161163330078, + "learning_rate": 4.666507757649303e-05, + "loss": 2.0216, + "step": 8210 + }, + { + "epoch": 0.48092674935642404, + "grad_norm": 2.9278018474578857, + "learning_rate": 4.665424287076363e-05, + "loss": 1.987, + "step": 8220 + }, + { + "epoch": 0.4815118183945706, + "grad_norm": 2.9299025535583496, + "learning_rate": 4.664340816503424e-05, + "loss": 1.9977, + "step": 8230 + }, + { + "epoch": 0.48209688743271706, + "grad_norm": 3.176558494567871, + "learning_rate": 4.6632573459304846e-05, + "loss": 2.0113, + "step": 8240 + }, + { + "epoch": 0.48268195647086354, + "grad_norm": 2.812845230102539, + "learning_rate": 4.662173875357546e-05, + "loss": 2.0071, + "step": 8250 + }, + { + "epoch": 0.4832670255090101, + "grad_norm": 2.8802714347839355, + "learning_rate": 4.661090404784606e-05, + "loss": 1.9992, + "step": 8260 + }, + { + "epoch": 0.48385209454715655, + "grad_norm": 3.153393507003784, + "learning_rate": 4.660006934211667e-05, + "loss": 1.997, + "step": 8270 + }, + { + "epoch": 0.4844371635853031, + "grad_norm": 3.5300729274749756, + "learning_rate": 4.658923463638728e-05, + "loss": 2.0098, + "step": 8280 + }, + { + "epoch": 0.48502223262344957, + "grad_norm": 3.140918731689453, + "learning_rate": 4.6578399930657886e-05, + "loss": 2.0247, + "step": 8290 + }, + { + "epoch": 0.48560730166159605, + "grad_norm": 2.7743959426879883, + "learning_rate": 4.65675652249285e-05, + "loss": 1.9984, + "step": 8300 + }, + { + "epoch": 0.4861923706997426, + "grad_norm": 3.316230297088623, + "learning_rate": 4.65567305191991e-05, + "loss": 1.9915, + "step": 8310 + }, + { + "epoch": 0.48677743973788906, + "grad_norm": 2.7156434059143066, + "learning_rate": 4.654589581346971e-05, + "loss": 1.9722, + "step": 8320 + }, + { + "epoch": 0.4873625087760356, + "grad_norm": 2.750061511993408, + "learning_rate": 4.653506110774032e-05, + "loss": 1.9787, + "step": 8330 + }, + { + "epoch": 0.4879475778141821, + "grad_norm": 2.9723360538482666, + "learning_rate": 4.6524226402010926e-05, + "loss": 1.9958, + "step": 8340 + }, + { + "epoch": 0.48853264685232856, + "grad_norm": 3.052290678024292, + "learning_rate": 4.6513391696281536e-05, + "loss": 1.9954, + "step": 8350 + }, + { + "epoch": 0.4891177158904751, + "grad_norm": 2.863341808319092, + "learning_rate": 4.650255699055214e-05, + "loss": 1.9895, + "step": 8360 + }, + { + "epoch": 0.48970278492862157, + "grad_norm": 2.7923905849456787, + "learning_rate": 4.6491722284822744e-05, + "loss": 2.0055, + "step": 8370 + }, + { + "epoch": 0.4902878539667681, + "grad_norm": 2.772634506225586, + "learning_rate": 4.6480887579093355e-05, + "loss": 2.0262, + "step": 8380 + }, + { + "epoch": 0.4908729230049146, + "grad_norm": 2.662858247756958, + "learning_rate": 4.647005287336396e-05, + "loss": 1.9983, + "step": 8390 + }, + { + "epoch": 0.49145799204306106, + "grad_norm": 3.000667095184326, + "learning_rate": 4.645921816763457e-05, + "loss": 2.0038, + "step": 8400 + }, + { + "epoch": 0.4920430610812076, + "grad_norm": 3.1513285636901855, + "learning_rate": 4.644838346190517e-05, + "loss": 1.982, + "step": 8410 + }, + { + "epoch": 0.4926281301193541, + "grad_norm": 3.7106268405914307, + "learning_rate": 4.6437548756175784e-05, + "loss": 1.9832, + "step": 8420 + }, + { + "epoch": 0.49321319915750056, + "grad_norm": 2.8524065017700195, + "learning_rate": 4.642671405044639e-05, + "loss": 1.9971, + "step": 8430 + }, + { + "epoch": 0.4937982681956471, + "grad_norm": 3.094773769378662, + "learning_rate": 4.6415879344717e-05, + "loss": 1.9896, + "step": 8440 + }, + { + "epoch": 0.4943833372337936, + "grad_norm": 2.696147918701172, + "learning_rate": 4.64050446389876e-05, + "loss": 1.9827, + "step": 8450 + }, + { + "epoch": 0.4949684062719401, + "grad_norm": 2.760470151901245, + "learning_rate": 4.639420993325821e-05, + "loss": 2.0084, + "step": 8460 + }, + { + "epoch": 0.4955534753100866, + "grad_norm": 3.0959277153015137, + "learning_rate": 4.6383375227528823e-05, + "loss": 1.9848, + "step": 8470 + }, + { + "epoch": 0.49613854434823307, + "grad_norm": 3.3799521923065186, + "learning_rate": 4.637254052179943e-05, + "loss": 1.9857, + "step": 8480 + }, + { + "epoch": 0.4967236133863796, + "grad_norm": 2.89595890045166, + "learning_rate": 4.636170581607004e-05, + "loss": 1.9954, + "step": 8490 + }, + { + "epoch": 0.4973086824245261, + "grad_norm": 3.051623582839966, + "learning_rate": 4.635087111034064e-05, + "loss": 1.9872, + "step": 8500 + }, + { + "epoch": 0.4978937514626726, + "grad_norm": 3.40632700920105, + "learning_rate": 4.634003640461125e-05, + "loss": 1.9807, + "step": 8510 + }, + { + "epoch": 0.4984788205008191, + "grad_norm": 3.2578604221343994, + "learning_rate": 4.632920169888186e-05, + "loss": 1.9818, + "step": 8520 + }, + { + "epoch": 0.4990638895389656, + "grad_norm": 2.9621846675872803, + "learning_rate": 4.631836699315247e-05, + "loss": 1.9688, + "step": 8530 + }, + { + "epoch": 0.4996489585771121, + "grad_norm": 2.722932815551758, + "learning_rate": 4.630753228742308e-05, + "loss": 2.0045, + "step": 8540 + }, + { + "epoch": 0.5002340276152586, + "grad_norm": 2.9824562072753906, + "learning_rate": 4.629669758169368e-05, + "loss": 1.9938, + "step": 8550 + }, + { + "epoch": 0.5008190966534051, + "grad_norm": 3.1173856258392334, + "learning_rate": 4.628586287596429e-05, + "loss": 2.0003, + "step": 8560 + }, + { + "epoch": 0.5014041656915516, + "grad_norm": 3.090847969055176, + "learning_rate": 4.6275028170234896e-05, + "loss": 1.9714, + "step": 8570 + }, + { + "epoch": 0.5019892347296981, + "grad_norm": 2.8026397228240967, + "learning_rate": 4.626419346450551e-05, + "loss": 1.9752, + "step": 8580 + }, + { + "epoch": 0.5025743037678446, + "grad_norm": 2.6576125621795654, + "learning_rate": 4.625335875877612e-05, + "loss": 1.9743, + "step": 8590 + }, + { + "epoch": 0.5031593728059911, + "grad_norm": 3.0443480014801025, + "learning_rate": 4.624252405304672e-05, + "loss": 1.968, + "step": 8600 + }, + { + "epoch": 0.5037444418441376, + "grad_norm": 2.8763580322265625, + "learning_rate": 4.623168934731733e-05, + "loss": 1.9871, + "step": 8610 + }, + { + "epoch": 0.5043295108822841, + "grad_norm": 2.9126877784729004, + "learning_rate": 4.6220854641587936e-05, + "loss": 1.9721, + "step": 8620 + }, + { + "epoch": 0.5049145799204307, + "grad_norm": 3.264709234237671, + "learning_rate": 4.6210019935858546e-05, + "loss": 1.9518, + "step": 8630 + }, + { + "epoch": 0.5054996489585771, + "grad_norm": 2.7619988918304443, + "learning_rate": 4.619918523012915e-05, + "loss": 1.9594, + "step": 8640 + }, + { + "epoch": 0.5060847179967236, + "grad_norm": 3.709038019180298, + "learning_rate": 4.618835052439976e-05, + "loss": 1.9734, + "step": 8650 + }, + { + "epoch": 0.5066697870348701, + "grad_norm": 2.939643383026123, + "learning_rate": 4.617751581867037e-05, + "loss": 1.9619, + "step": 8660 + }, + { + "epoch": 0.5072548560730166, + "grad_norm": 2.5877466201782227, + "learning_rate": 4.6166681112940975e-05, + "loss": 1.9743, + "step": 8670 + }, + { + "epoch": 0.5078399251111632, + "grad_norm": 3.3611338138580322, + "learning_rate": 4.6155846407211586e-05, + "loss": 1.9482, + "step": 8680 + }, + { + "epoch": 0.5084249941493096, + "grad_norm": 3.017352819442749, + "learning_rate": 4.614501170148219e-05, + "loss": 1.9766, + "step": 8690 + }, + { + "epoch": 0.5090100631874561, + "grad_norm": 3.222299814224243, + "learning_rate": 4.61341769957528e-05, + "loss": 1.9736, + "step": 8700 + }, + { + "epoch": 0.5095951322256026, + "grad_norm": 2.6856226921081543, + "learning_rate": 4.6123342290023404e-05, + "loss": 1.9739, + "step": 8710 + }, + { + "epoch": 0.5101802012637491, + "grad_norm": 2.8677382469177246, + "learning_rate": 4.6112507584294015e-05, + "loss": 1.9684, + "step": 8720 + }, + { + "epoch": 0.5107652703018957, + "grad_norm": 2.7384872436523438, + "learning_rate": 4.6101672878564626e-05, + "loss": 1.9567, + "step": 8730 + }, + { + "epoch": 0.5113503393400421, + "grad_norm": 2.9494054317474365, + "learning_rate": 4.609083817283523e-05, + "loss": 1.9759, + "step": 8740 + }, + { + "epoch": 0.5119354083781886, + "grad_norm": 2.8027944564819336, + "learning_rate": 4.6080003467105833e-05, + "loss": 1.973, + "step": 8750 + }, + { + "epoch": 0.5125204774163351, + "grad_norm": 2.8359196186065674, + "learning_rate": 4.6069168761376444e-05, + "loss": 1.9602, + "step": 8760 + }, + { + "epoch": 0.5131055464544816, + "grad_norm": 2.939121723175049, + "learning_rate": 4.605833405564705e-05, + "loss": 1.9683, + "step": 8770 + }, + { + "epoch": 0.5136906154926282, + "grad_norm": 2.6717183589935303, + "learning_rate": 4.604749934991766e-05, + "loss": 1.9844, + "step": 8780 + }, + { + "epoch": 0.5142756845307747, + "grad_norm": 2.7001607418060303, + "learning_rate": 4.603666464418826e-05, + "loss": 1.9681, + "step": 8790 + }, + { + "epoch": 0.5148607535689211, + "grad_norm": 2.741868495941162, + "learning_rate": 4.602582993845887e-05, + "loss": 1.9712, + "step": 8800 + }, + { + "epoch": 0.5154458226070676, + "grad_norm": 2.8736281394958496, + "learning_rate": 4.601499523272948e-05, + "loss": 1.9481, + "step": 8810 + }, + { + "epoch": 0.5160308916452141, + "grad_norm": 2.995229482650757, + "learning_rate": 4.600416052700009e-05, + "loss": 1.967, + "step": 8820 + }, + { + "epoch": 0.5166159606833607, + "grad_norm": 2.766552209854126, + "learning_rate": 4.599332582127069e-05, + "loss": 1.944, + "step": 8830 + }, + { + "epoch": 0.5172010297215072, + "grad_norm": 2.873028039932251, + "learning_rate": 4.59824911155413e-05, + "loss": 1.9683, + "step": 8840 + }, + { + "epoch": 0.5177860987596536, + "grad_norm": 2.9468166828155518, + "learning_rate": 4.597165640981191e-05, + "loss": 1.9673, + "step": 8850 + }, + { + "epoch": 0.5183711677978001, + "grad_norm": 2.9273269176483154, + "learning_rate": 4.596082170408252e-05, + "loss": 1.9673, + "step": 8860 + }, + { + "epoch": 0.5189562368359466, + "grad_norm": 2.883070945739746, + "learning_rate": 4.594998699835313e-05, + "loss": 1.9756, + "step": 8870 + }, + { + "epoch": 0.5195413058740932, + "grad_norm": 2.8058786392211914, + "learning_rate": 4.593915229262373e-05, + "loss": 1.9754, + "step": 8880 + }, + { + "epoch": 0.5201263749122397, + "grad_norm": 2.772822380065918, + "learning_rate": 4.592831758689434e-05, + "loss": 1.9542, + "step": 8890 + }, + { + "epoch": 0.5207114439503862, + "grad_norm": 3.028837203979492, + "learning_rate": 4.5917482881164946e-05, + "loss": 1.9623, + "step": 8900 + }, + { + "epoch": 0.5212965129885326, + "grad_norm": 2.7821860313415527, + "learning_rate": 4.5906648175435556e-05, + "loss": 1.9397, + "step": 8910 + }, + { + "epoch": 0.5218815820266791, + "grad_norm": 2.968295097351074, + "learning_rate": 4.589581346970617e-05, + "loss": 1.9706, + "step": 8920 + }, + { + "epoch": 0.5224666510648257, + "grad_norm": 2.7903077602386475, + "learning_rate": 4.588497876397677e-05, + "loss": 1.9597, + "step": 8930 + }, + { + "epoch": 0.5230517201029722, + "grad_norm": 2.9452643394470215, + "learning_rate": 4.587414405824738e-05, + "loss": 1.9538, + "step": 8940 + }, + { + "epoch": 0.5236367891411187, + "grad_norm": 2.813835620880127, + "learning_rate": 4.5863309352517985e-05, + "loss": 1.9779, + "step": 8950 + }, + { + "epoch": 0.5242218581792651, + "grad_norm": 3.3261756896972656, + "learning_rate": 4.5852474646788596e-05, + "loss": 1.9583, + "step": 8960 + }, + { + "epoch": 0.5248069272174116, + "grad_norm": 3.1700422763824463, + "learning_rate": 4.584163994105921e-05, + "loss": 1.9702, + "step": 8970 + }, + { + "epoch": 0.5253919962555581, + "grad_norm": 2.9026143550872803, + "learning_rate": 4.583080523532981e-05, + "loss": 1.9575, + "step": 8980 + }, + { + "epoch": 0.5259770652937047, + "grad_norm": 2.6958534717559814, + "learning_rate": 4.581997052960042e-05, + "loss": 1.9403, + "step": 8990 + }, + { + "epoch": 0.5265621343318512, + "grad_norm": 2.888125419616699, + "learning_rate": 4.5809135823871025e-05, + "loss": 1.9619, + "step": 9000 + }, + { + "epoch": 0.5265621343318512, + "eval_loss": 1.988519549369812, + "eval_runtime": 32.7943, + "eval_samples_per_second": 688.748, + "eval_steps_per_second": 5.397, + "step": 9000 + }, + { + "epoch": 0.5271472033699977, + "grad_norm": 2.8204216957092285, + "learning_rate": 4.5798301118141636e-05, + "loss": 1.9552, + "step": 9010 + }, + { + "epoch": 0.5277322724081441, + "grad_norm": 2.705501079559326, + "learning_rate": 4.578746641241224e-05, + "loss": 1.9331, + "step": 9020 + }, + { + "epoch": 0.5283173414462906, + "grad_norm": 3.4426255226135254, + "learning_rate": 4.577663170668285e-05, + "loss": 1.9633, + "step": 9030 + }, + { + "epoch": 0.5289024104844372, + "grad_norm": 3.1119680404663086, + "learning_rate": 4.576579700095346e-05, + "loss": 1.9535, + "step": 9040 + }, + { + "epoch": 0.5294874795225837, + "grad_norm": 2.6733815670013428, + "learning_rate": 4.5754962295224065e-05, + "loss": 1.9565, + "step": 9050 + }, + { + "epoch": 0.5300725485607302, + "grad_norm": 2.6338391304016113, + "learning_rate": 4.5744127589494675e-05, + "loss": 1.9432, + "step": 9060 + }, + { + "epoch": 0.5306576175988766, + "grad_norm": 3.1127588748931885, + "learning_rate": 4.573329288376528e-05, + "loss": 1.9497, + "step": 9070 + }, + { + "epoch": 0.5312426866370231, + "grad_norm": 2.75769305229187, + "learning_rate": 4.572245817803589e-05, + "loss": 1.9616, + "step": 9080 + }, + { + "epoch": 0.5318277556751697, + "grad_norm": 2.7948832511901855, + "learning_rate": 4.5711623472306494e-05, + "loss": 1.9256, + "step": 9090 + }, + { + "epoch": 0.5324128247133162, + "grad_norm": 3.516594171524048, + "learning_rate": 4.5700788766577104e-05, + "loss": 1.9509, + "step": 9100 + }, + { + "epoch": 0.5329978937514627, + "grad_norm": 3.1064114570617676, + "learning_rate": 4.5689954060847715e-05, + "loss": 1.9562, + "step": 9110 + }, + { + "epoch": 0.5335829627896091, + "grad_norm": 2.7173867225646973, + "learning_rate": 4.567911935511832e-05, + "loss": 1.9294, + "step": 9120 + }, + { + "epoch": 0.5341680318277556, + "grad_norm": 2.5651350021362305, + "learning_rate": 4.566828464938892e-05, + "loss": 1.9345, + "step": 9130 + }, + { + "epoch": 0.5347531008659022, + "grad_norm": 3.0915350914001465, + "learning_rate": 4.5657449943659533e-05, + "loss": 1.963, + "step": 9140 + }, + { + "epoch": 0.5353381699040487, + "grad_norm": 3.119758129119873, + "learning_rate": 4.564661523793014e-05, + "loss": 1.9615, + "step": 9150 + }, + { + "epoch": 0.5359232389421952, + "grad_norm": 3.0232558250427246, + "learning_rate": 4.563578053220075e-05, + "loss": 1.9472, + "step": 9160 + }, + { + "epoch": 0.5365083079803417, + "grad_norm": 3.1270885467529297, + "learning_rate": 4.562494582647135e-05, + "loss": 1.9578, + "step": 9170 + }, + { + "epoch": 0.5370933770184881, + "grad_norm": 2.6653079986572266, + "learning_rate": 4.561411112074196e-05, + "loss": 1.9559, + "step": 9180 + }, + { + "epoch": 0.5376784460566347, + "grad_norm": 2.5003676414489746, + "learning_rate": 4.5603276415012566e-05, + "loss": 1.9506, + "step": 9190 + }, + { + "epoch": 0.5382635150947812, + "grad_norm": 3.149050712585449, + "learning_rate": 4.559244170928318e-05, + "loss": 1.9515, + "step": 9200 + }, + { + "epoch": 0.5388485841329277, + "grad_norm": 3.065380573272705, + "learning_rate": 4.558160700355378e-05, + "loss": 1.9637, + "step": 9210 + }, + { + "epoch": 0.5394336531710742, + "grad_norm": 3.0215847492218018, + "learning_rate": 4.557077229782439e-05, + "loss": 1.9436, + "step": 9220 + }, + { + "epoch": 0.5400187222092206, + "grad_norm": 2.449134111404419, + "learning_rate": 4.5559937592095e-05, + "loss": 1.9449, + "step": 9230 + }, + { + "epoch": 0.5406037912473672, + "grad_norm": 2.719134569168091, + "learning_rate": 4.5549102886365606e-05, + "loss": 1.9448, + "step": 9240 + }, + { + "epoch": 0.5411888602855137, + "grad_norm": 3.018068552017212, + "learning_rate": 4.553826818063622e-05, + "loss": 1.9425, + "step": 9250 + }, + { + "epoch": 0.5417739293236602, + "grad_norm": 3.179966926574707, + "learning_rate": 4.552743347490682e-05, + "loss": 1.9558, + "step": 9260 + }, + { + "epoch": 0.5423589983618067, + "grad_norm": 2.709228992462158, + "learning_rate": 4.551659876917743e-05, + "loss": 1.937, + "step": 9270 + }, + { + "epoch": 0.5429440673999532, + "grad_norm": 3.194866180419922, + "learning_rate": 4.5505764063448035e-05, + "loss": 1.9557, + "step": 9280 + }, + { + "epoch": 0.5435291364380997, + "grad_norm": 2.693955183029175, + "learning_rate": 4.5494929357718646e-05, + "loss": 1.9344, + "step": 9290 + }, + { + "epoch": 0.5441142054762462, + "grad_norm": 2.6477768421173096, + "learning_rate": 4.5484094651989256e-05, + "loss": 1.9447, + "step": 9300 + }, + { + "epoch": 0.5446992745143927, + "grad_norm": 2.8476829528808594, + "learning_rate": 4.547325994625986e-05, + "loss": 1.9544, + "step": 9310 + }, + { + "epoch": 0.5452843435525392, + "grad_norm": 3.310645341873169, + "learning_rate": 4.546242524053047e-05, + "loss": 1.931, + "step": 9320 + }, + { + "epoch": 0.5458694125906857, + "grad_norm": 3.0192174911499023, + "learning_rate": 4.5451590534801075e-05, + "loss": 1.9239, + "step": 9330 + }, + { + "epoch": 0.5464544816288323, + "grad_norm": 3.089679002761841, + "learning_rate": 4.5440755829071685e-05, + "loss": 1.9569, + "step": 9340 + }, + { + "epoch": 0.5470395506669787, + "grad_norm": 2.737011194229126, + "learning_rate": 4.542992112334229e-05, + "loss": 1.9615, + "step": 9350 + }, + { + "epoch": 0.5476246197051252, + "grad_norm": 2.5949594974517822, + "learning_rate": 4.54190864176129e-05, + "loss": 1.9555, + "step": 9360 + }, + { + "epoch": 0.5482096887432717, + "grad_norm": 2.5985922813415527, + "learning_rate": 4.540825171188351e-05, + "loss": 1.9418, + "step": 9370 + }, + { + "epoch": 0.5487947577814182, + "grad_norm": 2.8382458686828613, + "learning_rate": 4.5397417006154114e-05, + "loss": 1.9498, + "step": 9380 + }, + { + "epoch": 0.5493798268195647, + "grad_norm": 2.902757167816162, + "learning_rate": 4.5386582300424725e-05, + "loss": 1.9241, + "step": 9390 + }, + { + "epoch": 0.5499648958577112, + "grad_norm": 2.6310393810272217, + "learning_rate": 4.537574759469533e-05, + "loss": 1.9382, + "step": 9400 + }, + { + "epoch": 0.5505499648958577, + "grad_norm": 2.8555400371551514, + "learning_rate": 4.536491288896594e-05, + "loss": 1.9393, + "step": 9410 + }, + { + "epoch": 0.5511350339340042, + "grad_norm": 2.8217170238494873, + "learning_rate": 4.535407818323655e-05, + "loss": 1.9474, + "step": 9420 + }, + { + "epoch": 0.5517201029721507, + "grad_norm": 2.6833455562591553, + "learning_rate": 4.5343243477507154e-05, + "loss": 1.935, + "step": 9430 + }, + { + "epoch": 0.5523051720102972, + "grad_norm": 2.5669028759002686, + "learning_rate": 4.5332408771777765e-05, + "loss": 1.9242, + "step": 9440 + }, + { + "epoch": 0.5528902410484438, + "grad_norm": 2.365229368209839, + "learning_rate": 4.532157406604837e-05, + "loss": 1.9181, + "step": 9450 + }, + { + "epoch": 0.5534753100865902, + "grad_norm": 2.3770415782928467, + "learning_rate": 4.531073936031898e-05, + "loss": 1.9343, + "step": 9460 + }, + { + "epoch": 0.5540603791247367, + "grad_norm": 2.888568639755249, + "learning_rate": 4.529990465458958e-05, + "loss": 1.9239, + "step": 9470 + }, + { + "epoch": 0.5546454481628832, + "grad_norm": 2.6745684146881104, + "learning_rate": 4.5289069948860194e-05, + "loss": 1.9198, + "step": 9480 + }, + { + "epoch": 0.5552305172010297, + "grad_norm": 2.769791603088379, + "learning_rate": 4.52782352431308e-05, + "loss": 1.9259, + "step": 9490 + }, + { + "epoch": 0.5558155862391763, + "grad_norm": 2.7529733180999756, + "learning_rate": 4.526740053740141e-05, + "loss": 1.9309, + "step": 9500 + }, + { + "epoch": 0.5564006552773227, + "grad_norm": 2.5826923847198486, + "learning_rate": 4.525656583167201e-05, + "loss": 1.9223, + "step": 9510 + }, + { + "epoch": 0.5569857243154692, + "grad_norm": 2.7464563846588135, + "learning_rate": 4.524573112594262e-05, + "loss": 1.9407, + "step": 9520 + }, + { + "epoch": 0.5575707933536157, + "grad_norm": 2.6886658668518066, + "learning_rate": 4.523489642021323e-05, + "loss": 1.9291, + "step": 9530 + }, + { + "epoch": 0.5581558623917622, + "grad_norm": 2.545024871826172, + "learning_rate": 4.522406171448384e-05, + "loss": 1.938, + "step": 9540 + }, + { + "epoch": 0.5587409314299088, + "grad_norm": 3.003643751144409, + "learning_rate": 4.521322700875444e-05, + "loss": 1.9306, + "step": 9550 + }, + { + "epoch": 0.5593260004680553, + "grad_norm": 2.937375068664551, + "learning_rate": 4.520239230302505e-05, + "loss": 1.9159, + "step": 9560 + }, + { + "epoch": 0.5599110695062017, + "grad_norm": 2.6693642139434814, + "learning_rate": 4.5191557597295656e-05, + "loss": 1.9212, + "step": 9570 + }, + { + "epoch": 0.5604961385443482, + "grad_norm": 2.9933528900146484, + "learning_rate": 4.5180722891566266e-05, + "loss": 1.9405, + "step": 9580 + }, + { + "epoch": 0.5610812075824947, + "grad_norm": 2.6250669956207275, + "learning_rate": 4.516988818583687e-05, + "loss": 1.9365, + "step": 9590 + }, + { + "epoch": 0.5616662766206413, + "grad_norm": 3.2932937145233154, + "learning_rate": 4.515905348010748e-05, + "loss": 1.9049, + "step": 9600 + }, + { + "epoch": 0.5622513456587878, + "grad_norm": 2.7064127922058105, + "learning_rate": 4.514821877437809e-05, + "loss": 1.9152, + "step": 9610 + }, + { + "epoch": 0.5628364146969342, + "grad_norm": 3.106694221496582, + "learning_rate": 4.5137384068648695e-05, + "loss": 1.9308, + "step": 9620 + }, + { + "epoch": 0.5634214837350807, + "grad_norm": 2.67669415473938, + "learning_rate": 4.5126549362919306e-05, + "loss": 1.924, + "step": 9630 + }, + { + "epoch": 0.5640065527732272, + "grad_norm": 2.5048086643218994, + "learning_rate": 4.511571465718991e-05, + "loss": 1.9272, + "step": 9640 + }, + { + "epoch": 0.5645916218113738, + "grad_norm": 2.565117359161377, + "learning_rate": 4.510487995146052e-05, + "loss": 1.911, + "step": 9650 + }, + { + "epoch": 0.5651766908495203, + "grad_norm": 2.791459798812866, + "learning_rate": 4.5094045245731124e-05, + "loss": 1.9435, + "step": 9660 + }, + { + "epoch": 0.5657617598876667, + "grad_norm": 2.540802240371704, + "learning_rate": 4.5083210540001735e-05, + "loss": 1.9202, + "step": 9670 + }, + { + "epoch": 0.5663468289258132, + "grad_norm": 2.574413776397705, + "learning_rate": 4.5072375834272346e-05, + "loss": 1.9054, + "step": 9680 + }, + { + "epoch": 0.5669318979639597, + "grad_norm": 2.7606639862060547, + "learning_rate": 4.506154112854295e-05, + "loss": 1.9112, + "step": 9690 + }, + { + "epoch": 0.5675169670021063, + "grad_norm": 2.577214002609253, + "learning_rate": 4.505070642281356e-05, + "loss": 1.9383, + "step": 9700 + }, + { + "epoch": 0.5681020360402528, + "grad_norm": 3.2329492568969727, + "learning_rate": 4.5039871717084164e-05, + "loss": 1.9186, + "step": 9710 + }, + { + "epoch": 0.5686871050783993, + "grad_norm": 2.3580706119537354, + "learning_rate": 4.5029037011354775e-05, + "loss": 1.9207, + "step": 9720 + }, + { + "epoch": 0.5692721741165457, + "grad_norm": 2.7489230632781982, + "learning_rate": 4.501820230562538e-05, + "loss": 1.9074, + "step": 9730 + }, + { + "epoch": 0.5698572431546922, + "grad_norm": 2.715179443359375, + "learning_rate": 4.500736759989599e-05, + "loss": 1.9226, + "step": 9740 + }, + { + "epoch": 0.5704423121928388, + "grad_norm": 2.5996034145355225, + "learning_rate": 4.49965328941666e-05, + "loss": 1.9195, + "step": 9750 + }, + { + "epoch": 0.5710273812309853, + "grad_norm": 2.2735533714294434, + "learning_rate": 4.4985698188437204e-05, + "loss": 1.9123, + "step": 9760 + }, + { + "epoch": 0.5716124502691318, + "grad_norm": 2.8914954662323, + "learning_rate": 4.4974863482707814e-05, + "loss": 1.8968, + "step": 9770 + }, + { + "epoch": 0.5721975193072782, + "grad_norm": 2.3669464588165283, + "learning_rate": 4.496402877697842e-05, + "loss": 1.9019, + "step": 9780 + }, + { + "epoch": 0.5727825883454247, + "grad_norm": 2.299055576324463, + "learning_rate": 4.495319407124903e-05, + "loss": 1.9097, + "step": 9790 + }, + { + "epoch": 0.5733676573835713, + "grad_norm": 2.7267439365386963, + "learning_rate": 4.494235936551964e-05, + "loss": 1.9099, + "step": 9800 + }, + { + "epoch": 0.5739527264217178, + "grad_norm": 2.719928503036499, + "learning_rate": 4.4931524659790243e-05, + "loss": 1.9153, + "step": 9810 + }, + { + "epoch": 0.5745377954598643, + "grad_norm": 2.3605523109436035, + "learning_rate": 4.4920689954060854e-05, + "loss": 1.9114, + "step": 9820 + }, + { + "epoch": 0.5751228644980108, + "grad_norm": 2.6353838443756104, + "learning_rate": 4.490985524833146e-05, + "loss": 1.9038, + "step": 9830 + }, + { + "epoch": 0.5757079335361572, + "grad_norm": 2.8869564533233643, + "learning_rate": 4.489902054260207e-05, + "loss": 1.9231, + "step": 9840 + }, + { + "epoch": 0.5762930025743037, + "grad_norm": 2.4160549640655518, + "learning_rate": 4.488818583687267e-05, + "loss": 1.9169, + "step": 9850 + }, + { + "epoch": 0.5768780716124503, + "grad_norm": 2.7771711349487305, + "learning_rate": 4.487735113114328e-05, + "loss": 1.9102, + "step": 9860 + }, + { + "epoch": 0.5774631406505968, + "grad_norm": 2.596022129058838, + "learning_rate": 4.486651642541389e-05, + "loss": 1.8996, + "step": 9870 + }, + { + "epoch": 0.5780482096887433, + "grad_norm": 2.697697401046753, + "learning_rate": 4.48556817196845e-05, + "loss": 1.9214, + "step": 9880 + }, + { + "epoch": 0.5786332787268897, + "grad_norm": 2.8675873279571533, + "learning_rate": 4.48448470139551e-05, + "loss": 1.9225, + "step": 9890 + }, + { + "epoch": 0.5792183477650362, + "grad_norm": 2.576171636581421, + "learning_rate": 4.483401230822571e-05, + "loss": 1.8994, + "step": 9900 + }, + { + "epoch": 0.5798034168031828, + "grad_norm": 2.5201292037963867, + "learning_rate": 4.4823177602496316e-05, + "loss": 1.924, + "step": 9910 + }, + { + "epoch": 0.5803884858413293, + "grad_norm": 2.743922472000122, + "learning_rate": 4.481234289676693e-05, + "loss": 1.917, + "step": 9920 + }, + { + "epoch": 0.5809735548794758, + "grad_norm": 2.5058281421661377, + "learning_rate": 4.480150819103753e-05, + "loss": 1.9088, + "step": 9930 + }, + { + "epoch": 0.5815586239176223, + "grad_norm": 2.6733038425445557, + "learning_rate": 4.479067348530814e-05, + "loss": 1.9063, + "step": 9940 + }, + { + "epoch": 0.5821436929557687, + "grad_norm": 2.307478904724121, + "learning_rate": 4.4779838779578745e-05, + "loss": 1.9141, + "step": 9950 + }, + { + "epoch": 0.5827287619939153, + "grad_norm": 2.5298328399658203, + "learning_rate": 4.4769004073849356e-05, + "loss": 1.8878, + "step": 9960 + }, + { + "epoch": 0.5833138310320618, + "grad_norm": 2.2040703296661377, + "learning_rate": 4.475816936811996e-05, + "loss": 1.9078, + "step": 9970 + }, + { + "epoch": 0.5838989000702083, + "grad_norm": 2.5069639682769775, + "learning_rate": 4.474733466239057e-05, + "loss": 1.905, + "step": 9980 + }, + { + "epoch": 0.5844839691083548, + "grad_norm": 2.407330274581909, + "learning_rate": 4.473649995666118e-05, + "loss": 1.8948, + "step": 9990 + }, + { + "epoch": 0.5850690381465012, + "grad_norm": 2.486588478088379, + "learning_rate": 4.4725665250931785e-05, + "loss": 1.8696, + "step": 10000 + }, + { + "epoch": 0.5850690381465012, + "eval_loss": 1.9392573833465576, + "eval_runtime": 32.9186, + "eval_samples_per_second": 686.147, + "eval_steps_per_second": 5.377, + "step": 10000 + }, + { + "epoch": 0.5856541071846478, + "grad_norm": 2.3855130672454834, + "learning_rate": 4.4714830545202395e-05, + "loss": 1.9053, + "step": 10010 + }, + { + "epoch": 0.5862391762227943, + "grad_norm": 2.5895402431488037, + "learning_rate": 4.4703995839473e-05, + "loss": 1.9227, + "step": 10020 + }, + { + "epoch": 0.5868242452609408, + "grad_norm": 2.6328535079956055, + "learning_rate": 4.469316113374361e-05, + "loss": 1.9124, + "step": 10030 + }, + { + "epoch": 0.5874093142990873, + "grad_norm": 2.5550880432128906, + "learning_rate": 4.4682326428014214e-05, + "loss": 1.9004, + "step": 10040 + }, + { + "epoch": 0.5879943833372338, + "grad_norm": 3.0855913162231445, + "learning_rate": 4.4671491722284824e-05, + "loss": 1.907, + "step": 10050 + }, + { + "epoch": 0.5885794523753803, + "grad_norm": 2.6303694248199463, + "learning_rate": 4.4660657016555435e-05, + "loss": 1.9061, + "step": 10060 + }, + { + "epoch": 0.5891645214135268, + "grad_norm": 2.308624267578125, + "learning_rate": 4.464982231082604e-05, + "loss": 1.9015, + "step": 10070 + }, + { + "epoch": 0.5897495904516733, + "grad_norm": 2.56894850730896, + "learning_rate": 4.463898760509665e-05, + "loss": 1.9148, + "step": 10080 + }, + { + "epoch": 0.5903346594898198, + "grad_norm": 2.979659080505371, + "learning_rate": 4.4628152899367253e-05, + "loss": 1.8973, + "step": 10090 + }, + { + "epoch": 0.5909197285279663, + "grad_norm": 2.4559600353240967, + "learning_rate": 4.46184016642108e-05, + "loss": 1.8978, + "step": 10100 + }, + { + "epoch": 0.5915047975661128, + "grad_norm": 2.296816110610962, + "learning_rate": 4.460756695848141e-05, + "loss": 1.883, + "step": 10110 + }, + { + "epoch": 0.5920898666042593, + "grad_norm": 2.6355438232421875, + "learning_rate": 4.4596732252752016e-05, + "loss": 1.8959, + "step": 10120 + }, + { + "epoch": 0.5926749356424058, + "grad_norm": 2.155111312866211, + "learning_rate": 4.458589754702263e-05, + "loss": 1.896, + "step": 10130 + }, + { + "epoch": 0.5932600046805523, + "grad_norm": 2.830075979232788, + "learning_rate": 4.457506284129323e-05, + "loss": 1.9077, + "step": 10140 + }, + { + "epoch": 0.5938450737186988, + "grad_norm": 2.871953010559082, + "learning_rate": 4.456422813556384e-05, + "loss": 1.8999, + "step": 10150 + }, + { + "epoch": 0.5944301427568454, + "grad_norm": 2.53003191947937, + "learning_rate": 4.455339342983445e-05, + "loss": 1.8856, + "step": 10160 + }, + { + "epoch": 0.5950152117949918, + "grad_norm": 2.4554378986358643, + "learning_rate": 4.4542558724105056e-05, + "loss": 1.9078, + "step": 10170 + }, + { + "epoch": 0.5956002808331383, + "grad_norm": 2.6662404537200928, + "learning_rate": 4.4531724018375666e-05, + "loss": 1.888, + "step": 10180 + }, + { + "epoch": 0.5961853498712848, + "grad_norm": 2.813572406768799, + "learning_rate": 4.452088931264627e-05, + "loss": 1.8859, + "step": 10190 + }, + { + "epoch": 0.5967704189094313, + "grad_norm": 2.7059476375579834, + "learning_rate": 4.451005460691688e-05, + "loss": 1.8968, + "step": 10200 + }, + { + "epoch": 0.5973554879475779, + "grad_norm": 2.3598759174346924, + "learning_rate": 4.4499219901187485e-05, + "loss": 1.8915, + "step": 10210 + }, + { + "epoch": 0.5979405569857243, + "grad_norm": 2.571669340133667, + "learning_rate": 4.4488385195458095e-05, + "loss": 1.8792, + "step": 10220 + }, + { + "epoch": 0.5985256260238708, + "grad_norm": 2.730414867401123, + "learning_rate": 4.4477550489728706e-05, + "loss": 1.9103, + "step": 10230 + }, + { + "epoch": 0.5991106950620173, + "grad_norm": 2.474123477935791, + "learning_rate": 4.446671578399931e-05, + "loss": 1.8858, + "step": 10240 + }, + { + "epoch": 0.5996957641001638, + "grad_norm": 2.308326005935669, + "learning_rate": 4.445588107826992e-05, + "loss": 1.8906, + "step": 10250 + }, + { + "epoch": 0.6002808331383103, + "grad_norm": 2.3476154804229736, + "learning_rate": 4.4445046372540525e-05, + "loss": 1.888, + "step": 10260 + }, + { + "epoch": 0.6008659021764569, + "grad_norm": 2.4964070320129395, + "learning_rate": 4.4434211666811135e-05, + "loss": 1.8849, + "step": 10270 + }, + { + "epoch": 0.6014509712146033, + "grad_norm": 2.56335186958313, + "learning_rate": 4.442337696108174e-05, + "loss": 1.8751, + "step": 10280 + }, + { + "epoch": 0.6020360402527498, + "grad_norm": 2.5936009883880615, + "learning_rate": 4.441254225535234e-05, + "loss": 1.8851, + "step": 10290 + }, + { + "epoch": 0.6026211092908963, + "grad_norm": 2.2149081230163574, + "learning_rate": 4.4401707549622954e-05, + "loss": 1.8863, + "step": 10300 + }, + { + "epoch": 0.6032061783290428, + "grad_norm": 2.2840425968170166, + "learning_rate": 4.439087284389356e-05, + "loss": 1.8773, + "step": 10310 + }, + { + "epoch": 0.6037912473671894, + "grad_norm": 2.238036632537842, + "learning_rate": 4.438003813816417e-05, + "loss": 1.9078, + "step": 10320 + }, + { + "epoch": 0.6043763164053358, + "grad_norm": 2.526669502258301, + "learning_rate": 4.436920343243477e-05, + "loss": 1.8912, + "step": 10330 + }, + { + "epoch": 0.6049613854434823, + "grad_norm": 2.7186319828033447, + "learning_rate": 4.435836872670538e-05, + "loss": 1.8733, + "step": 10340 + }, + { + "epoch": 0.6055464544816288, + "grad_norm": 2.6983120441436768, + "learning_rate": 4.434753402097599e-05, + "loss": 1.9117, + "step": 10350 + }, + { + "epoch": 0.6061315235197753, + "grad_norm": 2.5510473251342773, + "learning_rate": 4.43366993152466e-05, + "loss": 1.9081, + "step": 10360 + }, + { + "epoch": 0.6067165925579219, + "grad_norm": 2.3356246948242188, + "learning_rate": 4.432586460951721e-05, + "loss": 1.8865, + "step": 10370 + }, + { + "epoch": 0.6073016615960684, + "grad_norm": 2.245009422302246, + "learning_rate": 4.431502990378781e-05, + "loss": 1.9001, + "step": 10380 + }, + { + "epoch": 0.6078867306342148, + "grad_norm": 2.486527919769287, + "learning_rate": 4.430419519805842e-05, + "loss": 1.8764, + "step": 10390 + }, + { + "epoch": 0.6084717996723613, + "grad_norm": 2.660658359527588, + "learning_rate": 4.4293360492329026e-05, + "loss": 1.8654, + "step": 10400 + }, + { + "epoch": 0.6090568687105078, + "grad_norm": 2.2322134971618652, + "learning_rate": 4.428252578659964e-05, + "loss": 1.8745, + "step": 10410 + }, + { + "epoch": 0.6096419377486544, + "grad_norm": 2.4662458896636963, + "learning_rate": 4.427169108087025e-05, + "loss": 1.8974, + "step": 10420 + }, + { + "epoch": 0.6102270067868009, + "grad_norm": 2.390181064605713, + "learning_rate": 4.426085637514085e-05, + "loss": 1.8979, + "step": 10430 + }, + { + "epoch": 0.6108120758249473, + "grad_norm": 2.6455304622650146, + "learning_rate": 4.425002166941146e-05, + "loss": 1.907, + "step": 10440 + }, + { + "epoch": 0.6113971448630938, + "grad_norm": 2.3246021270751953, + "learning_rate": 4.4239186963682066e-05, + "loss": 1.8709, + "step": 10450 + }, + { + "epoch": 0.6119822139012403, + "grad_norm": 2.330378770828247, + "learning_rate": 4.4228352257952676e-05, + "loss": 1.9014, + "step": 10460 + }, + { + "epoch": 0.6125672829393869, + "grad_norm": 2.7336392402648926, + "learning_rate": 4.421751755222328e-05, + "loss": 1.8781, + "step": 10470 + }, + { + "epoch": 0.6131523519775334, + "grad_norm": 2.1353819370269775, + "learning_rate": 4.420668284649389e-05, + "loss": 1.8962, + "step": 10480 + }, + { + "epoch": 0.6137374210156799, + "grad_norm": 2.481919765472412, + "learning_rate": 4.41958481407645e-05, + "loss": 1.8746, + "step": 10490 + }, + { + "epoch": 0.6143224900538263, + "grad_norm": 2.19250226020813, + "learning_rate": 4.4185013435035105e-05, + "loss": 1.864, + "step": 10500 + }, + { + "epoch": 0.6149075590919728, + "grad_norm": 2.465822219848633, + "learning_rate": 4.4174178729305716e-05, + "loss": 1.8635, + "step": 10510 + }, + { + "epoch": 0.6154926281301194, + "grad_norm": 2.599414587020874, + "learning_rate": 4.416334402357632e-05, + "loss": 1.8891, + "step": 10520 + }, + { + "epoch": 0.6160776971682659, + "grad_norm": 2.6029112339019775, + "learning_rate": 4.415250931784693e-05, + "loss": 1.8769, + "step": 10530 + }, + { + "epoch": 0.6166627662064124, + "grad_norm": 2.596947193145752, + "learning_rate": 4.414167461211754e-05, + "loss": 1.873, + "step": 10540 + }, + { + "epoch": 0.6172478352445588, + "grad_norm": 2.2264811992645264, + "learning_rate": 4.4130839906388145e-05, + "loss": 1.8814, + "step": 10550 + }, + { + "epoch": 0.6178329042827053, + "grad_norm": 2.8780710697174072, + "learning_rate": 4.4120005200658756e-05, + "loss": 1.8985, + "step": 10560 + }, + { + "epoch": 0.6184179733208519, + "grad_norm": 2.2392032146453857, + "learning_rate": 4.410917049492936e-05, + "loss": 1.8803, + "step": 10570 + }, + { + "epoch": 0.6190030423589984, + "grad_norm": 2.5182511806488037, + "learning_rate": 4.409833578919997e-05, + "loss": 1.9045, + "step": 10580 + }, + { + "epoch": 0.6195881113971449, + "grad_norm": 2.4167566299438477, + "learning_rate": 4.4087501083470574e-05, + "loss": 1.8722, + "step": 10590 + }, + { + "epoch": 0.6201731804352913, + "grad_norm": 2.725717067718506, + "learning_rate": 4.4076666377741185e-05, + "loss": 1.9027, + "step": 10600 + }, + { + "epoch": 0.6207582494734378, + "grad_norm": 2.6926050186157227, + "learning_rate": 4.4065831672011795e-05, + "loss": 1.8795, + "step": 10610 + }, + { + "epoch": 0.6213433185115844, + "grad_norm": 2.424448251724243, + "learning_rate": 4.40549969662824e-05, + "loss": 1.9109, + "step": 10620 + }, + { + "epoch": 0.6219283875497309, + "grad_norm": 2.562350034713745, + "learning_rate": 4.404416226055301e-05, + "loss": 1.8793, + "step": 10630 + }, + { + "epoch": 0.6225134565878774, + "grad_norm": 2.0050010681152344, + "learning_rate": 4.4033327554823614e-05, + "loss": 1.8771, + "step": 10640 + }, + { + "epoch": 0.6230985256260239, + "grad_norm": 2.676021099090576, + "learning_rate": 4.4022492849094224e-05, + "loss": 1.8417, + "step": 10650 + }, + { + "epoch": 0.6236835946641703, + "grad_norm": 2.1573829650878906, + "learning_rate": 4.401165814336483e-05, + "loss": 1.8918, + "step": 10660 + }, + { + "epoch": 0.6242686637023169, + "grad_norm": 2.3537485599517822, + "learning_rate": 4.400082343763543e-05, + "loss": 1.8624, + "step": 10670 + }, + { + "epoch": 0.6248537327404634, + "grad_norm": 2.4822888374328613, + "learning_rate": 4.398998873190604e-05, + "loss": 1.8879, + "step": 10680 + }, + { + "epoch": 0.6254388017786099, + "grad_norm": 2.568939208984375, + "learning_rate": 4.397915402617665e-05, + "loss": 1.8706, + "step": 10690 + }, + { + "epoch": 0.6260238708167564, + "grad_norm": 2.765045166015625, + "learning_rate": 4.396831932044726e-05, + "loss": 1.8539, + "step": 10700 + }, + { + "epoch": 0.6266089398549028, + "grad_norm": 2.2909529209136963, + "learning_rate": 4.395748461471786e-05, + "loss": 1.8762, + "step": 10710 + }, + { + "epoch": 0.6271940088930493, + "grad_norm": 2.5994081497192383, + "learning_rate": 4.394664990898847e-05, + "loss": 1.8795, + "step": 10720 + }, + { + "epoch": 0.6277790779311959, + "grad_norm": 2.7127110958099365, + "learning_rate": 4.393581520325908e-05, + "loss": 1.895, + "step": 10730 + }, + { + "epoch": 0.6283641469693424, + "grad_norm": 2.851942539215088, + "learning_rate": 4.3924980497529686e-05, + "loss": 1.8717, + "step": 10740 + }, + { + "epoch": 0.6289492160074889, + "grad_norm": 2.775986433029175, + "learning_rate": 4.39141457918003e-05, + "loss": 1.8928, + "step": 10750 + }, + { + "epoch": 0.6295342850456354, + "grad_norm": 2.636653184890747, + "learning_rate": 4.39033110860709e-05, + "loss": 1.9088, + "step": 10760 + }, + { + "epoch": 0.6301193540837818, + "grad_norm": 3.055116653442383, + "learning_rate": 4.389247638034151e-05, + "loss": 1.8883, + "step": 10770 + }, + { + "epoch": 0.6307044231219284, + "grad_norm": 2.1687746047973633, + "learning_rate": 4.3881641674612115e-05, + "loss": 1.8692, + "step": 10780 + }, + { + "epoch": 0.6312894921600749, + "grad_norm": 2.5500965118408203, + "learning_rate": 4.3870806968882726e-05, + "loss": 1.8771, + "step": 10790 + }, + { + "epoch": 0.6318745611982214, + "grad_norm": 2.5698626041412354, + "learning_rate": 4.385997226315334e-05, + "loss": 1.8848, + "step": 10800 + }, + { + "epoch": 0.6324596302363679, + "grad_norm": 2.6392934322357178, + "learning_rate": 4.384913755742394e-05, + "loss": 1.8927, + "step": 10810 + }, + { + "epoch": 0.6330446992745143, + "grad_norm": 2.3182108402252197, + "learning_rate": 4.383830285169455e-05, + "loss": 1.8699, + "step": 10820 + }, + { + "epoch": 0.6336297683126609, + "grad_norm": 2.579468250274658, + "learning_rate": 4.3827468145965155e-05, + "loss": 1.873, + "step": 10830 + }, + { + "epoch": 0.6342148373508074, + "grad_norm": 2.428497791290283, + "learning_rate": 4.3816633440235766e-05, + "loss": 1.8768, + "step": 10840 + }, + { + "epoch": 0.6347999063889539, + "grad_norm": 2.2186734676361084, + "learning_rate": 4.380579873450637e-05, + "loss": 1.8667, + "step": 10850 + }, + { + "epoch": 0.6353849754271004, + "grad_norm": 2.3525521755218506, + "learning_rate": 4.379496402877698e-05, + "loss": 1.8637, + "step": 10860 + }, + { + "epoch": 0.6359700444652469, + "grad_norm": 2.1592352390289307, + "learning_rate": 4.378412932304759e-05, + "loss": 1.8808, + "step": 10870 + }, + { + "epoch": 0.6365551135033934, + "grad_norm": 2.6208226680755615, + "learning_rate": 4.3773294617318195e-05, + "loss": 1.8842, + "step": 10880 + }, + { + "epoch": 0.6371401825415399, + "grad_norm": 2.305011510848999, + "learning_rate": 4.3762459911588805e-05, + "loss": 1.8726, + "step": 10890 + }, + { + "epoch": 0.6377252515796864, + "grad_norm": 2.504804849624634, + "learning_rate": 4.375162520585941e-05, + "loss": 1.8585, + "step": 10900 + }, + { + "epoch": 0.6383103206178329, + "grad_norm": 2.5403716564178467, + "learning_rate": 4.374079050013002e-05, + "loss": 1.8732, + "step": 10910 + }, + { + "epoch": 0.6388953896559794, + "grad_norm": 2.566857099533081, + "learning_rate": 4.372995579440063e-05, + "loss": 1.8803, + "step": 10920 + }, + { + "epoch": 0.639480458694126, + "grad_norm": 2.641162633895874, + "learning_rate": 4.3719121088671234e-05, + "loss": 1.8783, + "step": 10930 + }, + { + "epoch": 0.6400655277322724, + "grad_norm": 2.552086591720581, + "learning_rate": 4.3708286382941845e-05, + "loss": 1.8715, + "step": 10940 + }, + { + "epoch": 0.6406505967704189, + "grad_norm": 2.5366320610046387, + "learning_rate": 4.369745167721245e-05, + "loss": 1.8654, + "step": 10950 + }, + { + "epoch": 0.6412356658085654, + "grad_norm": 2.456026792526245, + "learning_rate": 4.368661697148306e-05, + "loss": 1.8727, + "step": 10960 + }, + { + "epoch": 0.6418207348467119, + "grad_norm": 2.3273329734802246, + "learning_rate": 4.3675782265753664e-05, + "loss": 1.8537, + "step": 10970 + }, + { + "epoch": 0.6424058038848585, + "grad_norm": 2.406642436981201, + "learning_rate": 4.3664947560024274e-05, + "loss": 1.8519, + "step": 10980 + }, + { + "epoch": 0.6429908729230049, + "grad_norm": 2.6327714920043945, + "learning_rate": 4.3654112854294885e-05, + "loss": 1.8403, + "step": 10990 + }, + { + "epoch": 0.6435759419611514, + "grad_norm": 2.2505338191986084, + "learning_rate": 4.364327814856549e-05, + "loss": 1.8842, + "step": 11000 + }, + { + "epoch": 0.6435759419611514, + "eval_loss": 1.8975489139556885, + "eval_runtime": 32.9108, + "eval_samples_per_second": 686.311, + "eval_steps_per_second": 5.378, + "step": 11000 + }, + { + "epoch": 0.6441610109992979, + "grad_norm": 2.600928783416748, + "learning_rate": 4.36324434428361e-05, + "loss": 1.8724, + "step": 11010 + }, + { + "epoch": 0.6447460800374444, + "grad_norm": 2.502467632293701, + "learning_rate": 4.36216087371067e-05, + "loss": 1.8703, + "step": 11020 + }, + { + "epoch": 0.645331149075591, + "grad_norm": 2.466125011444092, + "learning_rate": 4.3610774031377314e-05, + "loss": 1.852, + "step": 11030 + }, + { + "epoch": 0.6459162181137375, + "grad_norm": 2.447097063064575, + "learning_rate": 4.359993932564792e-05, + "loss": 1.8621, + "step": 11040 + }, + { + "epoch": 0.6465012871518839, + "grad_norm": 2.538102626800537, + "learning_rate": 4.358910461991852e-05, + "loss": 1.8726, + "step": 11050 + }, + { + "epoch": 0.6470863561900304, + "grad_norm": 2.245022773742676, + "learning_rate": 4.357826991418913e-05, + "loss": 1.856, + "step": 11060 + }, + { + "epoch": 0.6476714252281769, + "grad_norm": 2.4832162857055664, + "learning_rate": 4.3567435208459736e-05, + "loss": 1.8597, + "step": 11070 + }, + { + "epoch": 0.6482564942663235, + "grad_norm": 2.479484796524048, + "learning_rate": 4.355660050273035e-05, + "loss": 1.857, + "step": 11080 + }, + { + "epoch": 0.64884156330447, + "grad_norm": 2.3473899364471436, + "learning_rate": 4.354576579700095e-05, + "loss": 1.8353, + "step": 11090 + }, + { + "epoch": 0.6494266323426164, + "grad_norm": 2.569596529006958, + "learning_rate": 4.353493109127156e-05, + "loss": 1.8579, + "step": 11100 + }, + { + "epoch": 0.6500117013807629, + "grad_norm": 2.2029175758361816, + "learning_rate": 4.352409638554217e-05, + "loss": 1.8419, + "step": 11110 + }, + { + "epoch": 0.6505967704189094, + "grad_norm": 2.508474349975586, + "learning_rate": 4.3513261679812776e-05, + "loss": 1.8523, + "step": 11120 + }, + { + "epoch": 0.6511818394570559, + "grad_norm": 2.4294273853302, + "learning_rate": 4.3502426974083386e-05, + "loss": 1.8679, + "step": 11130 + }, + { + "epoch": 0.6517669084952025, + "grad_norm": 2.5722177028656006, + "learning_rate": 4.349159226835399e-05, + "loss": 1.8768, + "step": 11140 + }, + { + "epoch": 0.652351977533349, + "grad_norm": 2.3803882598876953, + "learning_rate": 4.34807575626246e-05, + "loss": 1.8343, + "step": 11150 + }, + { + "epoch": 0.6529370465714954, + "grad_norm": 2.3920538425445557, + "learning_rate": 4.3469922856895205e-05, + "loss": 1.8438, + "step": 11160 + }, + { + "epoch": 0.6535221156096419, + "grad_norm": 2.182251214981079, + "learning_rate": 4.3459088151165815e-05, + "loss": 1.8577, + "step": 11170 + }, + { + "epoch": 0.6541071846477884, + "grad_norm": 2.1784307956695557, + "learning_rate": 4.3448253445436426e-05, + "loss": 1.877, + "step": 11180 + }, + { + "epoch": 0.654692253685935, + "grad_norm": 2.4287562370300293, + "learning_rate": 4.343741873970703e-05, + "loss": 1.8683, + "step": 11190 + }, + { + "epoch": 0.6552773227240815, + "grad_norm": 2.4880290031433105, + "learning_rate": 4.342658403397764e-05, + "loss": 1.8574, + "step": 11200 + }, + { + "epoch": 0.6558623917622279, + "grad_norm": 2.514012098312378, + "learning_rate": 4.3415749328248244e-05, + "loss": 1.8639, + "step": 11210 + }, + { + "epoch": 0.6564474608003744, + "grad_norm": 2.2680182456970215, + "learning_rate": 4.3404914622518855e-05, + "loss": 1.8618, + "step": 11220 + }, + { + "epoch": 0.6570325298385209, + "grad_norm": 2.5025250911712646, + "learning_rate": 4.339407991678946e-05, + "loss": 1.8699, + "step": 11230 + }, + { + "epoch": 0.6576175988766675, + "grad_norm": 2.5042147636413574, + "learning_rate": 4.338324521106007e-05, + "loss": 1.8475, + "step": 11240 + }, + { + "epoch": 0.658202667914814, + "grad_norm": 2.2626304626464844, + "learning_rate": 4.337241050533068e-05, + "loss": 1.8628, + "step": 11250 + }, + { + "epoch": 0.6587877369529604, + "grad_norm": 2.4627599716186523, + "learning_rate": 4.3361575799601284e-05, + "loss": 1.8707, + "step": 11260 + }, + { + "epoch": 0.6593728059911069, + "grad_norm": 2.3688318729400635, + "learning_rate": 4.3350741093871895e-05, + "loss": 1.8508, + "step": 11270 + }, + { + "epoch": 0.6599578750292534, + "grad_norm": 2.445836305618286, + "learning_rate": 4.33399063881425e-05, + "loss": 1.8499, + "step": 11280 + }, + { + "epoch": 0.6605429440674, + "grad_norm": 2.743366241455078, + "learning_rate": 4.332907168241311e-05, + "loss": 1.8519, + "step": 11290 + }, + { + "epoch": 0.6611280131055465, + "grad_norm": 2.3247838020324707, + "learning_rate": 4.331823697668371e-05, + "loss": 1.8557, + "step": 11300 + }, + { + "epoch": 0.661713082143693, + "grad_norm": 2.600133180618286, + "learning_rate": 4.3307402270954324e-05, + "loss": 1.8404, + "step": 11310 + }, + { + "epoch": 0.6622981511818394, + "grad_norm": 2.2469217777252197, + "learning_rate": 4.3296567565224934e-05, + "loss": 1.8488, + "step": 11320 + }, + { + "epoch": 0.6628832202199859, + "grad_norm": 2.316084384918213, + "learning_rate": 4.328573285949554e-05, + "loss": 1.8428, + "step": 11330 + }, + { + "epoch": 0.6634682892581325, + "grad_norm": 2.4769036769866943, + "learning_rate": 4.327489815376615e-05, + "loss": 1.8213, + "step": 11340 + }, + { + "epoch": 0.664053358296279, + "grad_norm": 2.5895142555236816, + "learning_rate": 4.326406344803675e-05, + "loss": 1.8419, + "step": 11350 + }, + { + "epoch": 0.6646384273344255, + "grad_norm": 2.3412725925445557, + "learning_rate": 4.3253228742307363e-05, + "loss": 1.8595, + "step": 11360 + }, + { + "epoch": 0.6652234963725719, + "grad_norm": 2.1715099811553955, + "learning_rate": 4.3242394036577974e-05, + "loss": 1.8533, + "step": 11370 + }, + { + "epoch": 0.6658085654107184, + "grad_norm": 2.392047882080078, + "learning_rate": 4.323155933084858e-05, + "loss": 1.8457, + "step": 11380 + }, + { + "epoch": 0.666393634448865, + "grad_norm": 2.389444351196289, + "learning_rate": 4.322072462511919e-05, + "loss": 1.8344, + "step": 11390 + }, + { + "epoch": 0.6669787034870115, + "grad_norm": 2.534252643585205, + "learning_rate": 4.320988991938979e-05, + "loss": 1.8545, + "step": 11400 + }, + { + "epoch": 0.667563772525158, + "grad_norm": 2.231694221496582, + "learning_rate": 4.31990552136604e-05, + "loss": 1.8378, + "step": 11410 + }, + { + "epoch": 0.6681488415633045, + "grad_norm": 2.2439799308776855, + "learning_rate": 4.318822050793101e-05, + "loss": 1.8514, + "step": 11420 + }, + { + "epoch": 0.6687339106014509, + "grad_norm": 2.46791672706604, + "learning_rate": 4.317738580220161e-05, + "loss": 1.8713, + "step": 11430 + }, + { + "epoch": 0.6693189796395975, + "grad_norm": 2.191307783126831, + "learning_rate": 4.316655109647222e-05, + "loss": 1.8355, + "step": 11440 + }, + { + "epoch": 0.669904048677744, + "grad_norm": 2.6098711490631104, + "learning_rate": 4.3155716390742825e-05, + "loss": 1.8486, + "step": 11450 + }, + { + "epoch": 0.6704891177158905, + "grad_norm": 2.4625377655029297, + "learning_rate": 4.3144881685013436e-05, + "loss": 1.8524, + "step": 11460 + }, + { + "epoch": 0.671074186754037, + "grad_norm": 2.25602388381958, + "learning_rate": 4.313404697928404e-05, + "loss": 1.8731, + "step": 11470 + }, + { + "epoch": 0.6716592557921834, + "grad_norm": 2.372896671295166, + "learning_rate": 4.312321227355465e-05, + "loss": 1.855, + "step": 11480 + }, + { + "epoch": 0.67224432483033, + "grad_norm": 2.2534735202789307, + "learning_rate": 4.3112377567825254e-05, + "loss": 1.8266, + "step": 11490 + }, + { + "epoch": 0.6728293938684765, + "grad_norm": 2.086245536804199, + "learning_rate": 4.3101542862095865e-05, + "loss": 1.8575, + "step": 11500 + }, + { + "epoch": 0.673414462906623, + "grad_norm": 2.378171443939209, + "learning_rate": 4.3090708156366476e-05, + "loss": 1.8575, + "step": 11510 + }, + { + "epoch": 0.6739995319447695, + "grad_norm": 2.7426085472106934, + "learning_rate": 4.307987345063708e-05, + "loss": 1.8456, + "step": 11520 + }, + { + "epoch": 0.674584600982916, + "grad_norm": 2.460563898086548, + "learning_rate": 4.306903874490769e-05, + "loss": 1.8514, + "step": 11530 + }, + { + "epoch": 0.6751696700210625, + "grad_norm": 2.62347412109375, + "learning_rate": 4.3058204039178294e-05, + "loss": 1.8143, + "step": 11540 + }, + { + "epoch": 0.675754739059209, + "grad_norm": 2.5109753608703613, + "learning_rate": 4.3047369333448905e-05, + "loss": 1.8411, + "step": 11550 + }, + { + "epoch": 0.6763398080973555, + "grad_norm": 2.1473875045776367, + "learning_rate": 4.3036534627719515e-05, + "loss": 1.8687, + "step": 11560 + }, + { + "epoch": 0.676924877135502, + "grad_norm": 2.2462000846862793, + "learning_rate": 4.302569992199012e-05, + "loss": 1.832, + "step": 11570 + }, + { + "epoch": 0.6775099461736485, + "grad_norm": 2.349515676498413, + "learning_rate": 4.301486521626073e-05, + "loss": 1.8549, + "step": 11580 + }, + { + "epoch": 0.6780950152117949, + "grad_norm": 2.465280771255493, + "learning_rate": 4.3004030510531334e-05, + "loss": 1.8387, + "step": 11590 + }, + { + "epoch": 0.6786800842499415, + "grad_norm": 2.32588267326355, + "learning_rate": 4.2993195804801944e-05, + "loss": 1.8221, + "step": 11600 + }, + { + "epoch": 0.679265153288088, + "grad_norm": 2.281193256378174, + "learning_rate": 4.298236109907255e-05, + "loss": 1.8229, + "step": 11610 + }, + { + "epoch": 0.6798502223262345, + "grad_norm": 2.3777170181274414, + "learning_rate": 4.297152639334316e-05, + "loss": 1.8443, + "step": 11620 + }, + { + "epoch": 0.680435291364381, + "grad_norm": 2.4681644439697266, + "learning_rate": 4.296069168761377e-05, + "loss": 1.8465, + "step": 11630 + }, + { + "epoch": 0.6810203604025274, + "grad_norm": 2.4217424392700195, + "learning_rate": 4.2949856981884373e-05, + "loss": 1.833, + "step": 11640 + }, + { + "epoch": 0.681605429440674, + "grad_norm": 2.2018558979034424, + "learning_rate": 4.2939022276154984e-05, + "loss": 1.8302, + "step": 11650 + }, + { + "epoch": 0.6821904984788205, + "grad_norm": 2.3427114486694336, + "learning_rate": 4.292818757042559e-05, + "loss": 1.8142, + "step": 11660 + }, + { + "epoch": 0.682775567516967, + "grad_norm": 2.2417588233947754, + "learning_rate": 4.29173528646962e-05, + "loss": 1.8423, + "step": 11670 + }, + { + "epoch": 0.6833606365551135, + "grad_norm": 2.4857912063598633, + "learning_rate": 4.29065181589668e-05, + "loss": 1.8411, + "step": 11680 + }, + { + "epoch": 0.68394570559326, + "grad_norm": 2.2812535762786865, + "learning_rate": 4.289568345323741e-05, + "loss": 1.8427, + "step": 11690 + }, + { + "epoch": 0.6845307746314065, + "grad_norm": 2.4076366424560547, + "learning_rate": 4.2884848747508024e-05, + "loss": 1.8212, + "step": 11700 + }, + { + "epoch": 0.685115843669553, + "grad_norm": 2.2592592239379883, + "learning_rate": 4.287401404177863e-05, + "loss": 1.8392, + "step": 11710 + }, + { + "epoch": 0.6857009127076995, + "grad_norm": 2.708106279373169, + "learning_rate": 4.286317933604924e-05, + "loss": 1.8437, + "step": 11720 + }, + { + "epoch": 0.686285981745846, + "grad_norm": 2.5834312438964844, + "learning_rate": 4.285234463031984e-05, + "loss": 1.8423, + "step": 11730 + }, + { + "epoch": 0.6868710507839925, + "grad_norm": 2.069519519805908, + "learning_rate": 4.284150992459045e-05, + "loss": 1.828, + "step": 11740 + }, + { + "epoch": 0.6874561198221391, + "grad_norm": 2.4198737144470215, + "learning_rate": 4.2830675218861063e-05, + "loss": 1.837, + "step": 11750 + }, + { + "epoch": 0.6880411888602855, + "grad_norm": 2.548240900039673, + "learning_rate": 4.281984051313167e-05, + "loss": 1.8202, + "step": 11760 + }, + { + "epoch": 0.688626257898432, + "grad_norm": 2.1774189472198486, + "learning_rate": 4.280900580740228e-05, + "loss": 1.8393, + "step": 11770 + }, + { + "epoch": 0.6892113269365785, + "grad_norm": 2.4470956325531006, + "learning_rate": 4.279817110167288e-05, + "loss": 1.8353, + "step": 11780 + }, + { + "epoch": 0.689796395974725, + "grad_norm": 2.182567596435547, + "learning_rate": 4.278733639594349e-05, + "loss": 1.8389, + "step": 11790 + }, + { + "epoch": 0.6903814650128716, + "grad_norm": 2.361693859100342, + "learning_rate": 4.2776501690214096e-05, + "loss": 1.8228, + "step": 11800 + }, + { + "epoch": 0.690966534051018, + "grad_norm": 2.4118566513061523, + "learning_rate": 4.27656669844847e-05, + "loss": 1.8262, + "step": 11810 + }, + { + "epoch": 0.6915516030891645, + "grad_norm": 2.080482006072998, + "learning_rate": 4.275483227875531e-05, + "loss": 1.8124, + "step": 11820 + }, + { + "epoch": 0.692136672127311, + "grad_norm": 2.473153829574585, + "learning_rate": 4.2743997573025915e-05, + "loss": 1.835, + "step": 11830 + }, + { + "epoch": 0.6927217411654575, + "grad_norm": 2.421844959259033, + "learning_rate": 4.2733162867296525e-05, + "loss": 1.8292, + "step": 11840 + }, + { + "epoch": 0.6933068102036041, + "grad_norm": 2.239104747772217, + "learning_rate": 4.272232816156713e-05, + "loss": 1.843, + "step": 11850 + }, + { + "epoch": 0.6938918792417506, + "grad_norm": 2.295302391052246, + "learning_rate": 4.271149345583774e-05, + "loss": 1.8352, + "step": 11860 + }, + { + "epoch": 0.694476948279897, + "grad_norm": 2.262066602706909, + "learning_rate": 4.2700658750108344e-05, + "loss": 1.8375, + "step": 11870 + }, + { + "epoch": 0.6950620173180435, + "grad_norm": 2.059486150741577, + "learning_rate": 4.2689824044378954e-05, + "loss": 1.8411, + "step": 11880 + }, + { + "epoch": 0.69564708635619, + "grad_norm": 2.3972668647766113, + "learning_rate": 4.2678989338649565e-05, + "loss": 1.8457, + "step": 11890 + }, + { + "epoch": 0.6962321553943366, + "grad_norm": 2.273416519165039, + "learning_rate": 4.266815463292017e-05, + "loss": 1.822, + "step": 11900 + }, + { + "epoch": 0.6968172244324831, + "grad_norm": 2.123689889907837, + "learning_rate": 4.265731992719078e-05, + "loss": 1.843, + "step": 11910 + }, + { + "epoch": 0.6974022934706295, + "grad_norm": 2.390474557876587, + "learning_rate": 4.2646485221461383e-05, + "loss": 1.8302, + "step": 11920 + }, + { + "epoch": 0.697987362508776, + "grad_norm": 2.23185658454895, + "learning_rate": 4.2635650515731994e-05, + "loss": 1.8417, + "step": 11930 + }, + { + "epoch": 0.6985724315469225, + "grad_norm": 2.342500686645508, + "learning_rate": 4.26248158100026e-05, + "loss": 1.8378, + "step": 11940 + }, + { + "epoch": 0.6991575005850691, + "grad_norm": 2.2459075450897217, + "learning_rate": 4.261398110427321e-05, + "loss": 1.8489, + "step": 11950 + }, + { + "epoch": 0.6997425696232156, + "grad_norm": 2.178471803665161, + "learning_rate": 4.260314639854382e-05, + "loss": 1.8229, + "step": 11960 + }, + { + "epoch": 0.700327638661362, + "grad_norm": 2.1089115142822266, + "learning_rate": 4.259231169281442e-05, + "loss": 1.8302, + "step": 11970 + }, + { + "epoch": 0.7009127076995085, + "grad_norm": 2.2374985218048096, + "learning_rate": 4.2581476987085034e-05, + "loss": 1.8297, + "step": 11980 + }, + { + "epoch": 0.701497776737655, + "grad_norm": 2.225367784500122, + "learning_rate": 4.257064228135564e-05, + "loss": 1.8288, + "step": 11990 + }, + { + "epoch": 0.7020828457758015, + "grad_norm": 2.2500967979431152, + "learning_rate": 4.255980757562625e-05, + "loss": 1.8283, + "step": 12000 + }, + { + "epoch": 0.7020828457758015, + "eval_loss": 1.8661189079284668, + "eval_runtime": 32.9292, + "eval_samples_per_second": 685.927, + "eval_steps_per_second": 5.375, + "step": 12000 + }, + { + "epoch": 0.7026679148139481, + "grad_norm": 2.4038751125335693, + "learning_rate": 4.254897286989686e-05, + "loss": 1.8379, + "step": 12010 + }, + { + "epoch": 0.7032529838520946, + "grad_norm": 2.396484136581421, + "learning_rate": 4.253813816416746e-05, + "loss": 1.8321, + "step": 12020 + }, + { + "epoch": 0.703838052890241, + "grad_norm": 2.5604639053344727, + "learning_rate": 4.2527303458438073e-05, + "loss": 1.8237, + "step": 12030 + }, + { + "epoch": 0.7044231219283875, + "grad_norm": 2.269286632537842, + "learning_rate": 4.251646875270868e-05, + "loss": 1.8364, + "step": 12040 + }, + { + "epoch": 0.705008190966534, + "grad_norm": 2.2743921279907227, + "learning_rate": 4.250563404697929e-05, + "loss": 1.844, + "step": 12050 + }, + { + "epoch": 0.7055932600046806, + "grad_norm": 2.3710079193115234, + "learning_rate": 4.249479934124989e-05, + "loss": 1.8166, + "step": 12060 + }, + { + "epoch": 0.7061783290428271, + "grad_norm": 2.226456642150879, + "learning_rate": 4.24839646355205e-05, + "loss": 1.8098, + "step": 12070 + }, + { + "epoch": 0.7067633980809735, + "grad_norm": 2.327904462814331, + "learning_rate": 4.247312992979111e-05, + "loss": 1.825, + "step": 12080 + }, + { + "epoch": 0.70734846711912, + "grad_norm": 2.1387457847595215, + "learning_rate": 4.246229522406172e-05, + "loss": 1.8278, + "step": 12090 + }, + { + "epoch": 0.7079335361572665, + "grad_norm": 2.3355820178985596, + "learning_rate": 4.245146051833233e-05, + "loss": 1.8392, + "step": 12100 + }, + { + "epoch": 0.7085186051954131, + "grad_norm": 2.237257719039917, + "learning_rate": 4.244062581260293e-05, + "loss": 1.8317, + "step": 12110 + }, + { + "epoch": 0.7091036742335596, + "grad_norm": 2.287775993347168, + "learning_rate": 4.242979110687354e-05, + "loss": 1.8117, + "step": 12120 + }, + { + "epoch": 0.7096887432717061, + "grad_norm": 2.1789908409118652, + "learning_rate": 4.2418956401144146e-05, + "loss": 1.8155, + "step": 12130 + }, + { + "epoch": 0.7102738123098525, + "grad_norm": 2.241799831390381, + "learning_rate": 4.240812169541476e-05, + "loss": 1.8132, + "step": 12140 + }, + { + "epoch": 0.710858881347999, + "grad_norm": 2.4363460540771484, + "learning_rate": 4.239728698968537e-05, + "loss": 1.8346, + "step": 12150 + }, + { + "epoch": 0.7114439503861456, + "grad_norm": 2.3826022148132324, + "learning_rate": 4.238645228395597e-05, + "loss": 1.8396, + "step": 12160 + }, + { + "epoch": 0.7120290194242921, + "grad_norm": 2.4969215393066406, + "learning_rate": 4.237670104879952e-05, + "loss": 1.8225, + "step": 12170 + }, + { + "epoch": 0.7126140884624386, + "grad_norm": 2.394442319869995, + "learning_rate": 4.236586634307012e-05, + "loss": 1.8237, + "step": 12180 + }, + { + "epoch": 0.713199157500585, + "grad_norm": 2.1934778690338135, + "learning_rate": 4.2355031637340734e-05, + "loss": 1.8109, + "step": 12190 + }, + { + "epoch": 0.7137842265387315, + "grad_norm": 2.405651807785034, + "learning_rate": 4.234419693161134e-05, + "loss": 1.8112, + "step": 12200 + }, + { + "epoch": 0.7143692955768781, + "grad_norm": 2.3921384811401367, + "learning_rate": 4.233336222588195e-05, + "loss": 1.8124, + "step": 12210 + }, + { + "epoch": 0.7149543646150246, + "grad_norm": 2.2401416301727295, + "learning_rate": 4.232252752015255e-05, + "loss": 1.8123, + "step": 12220 + }, + { + "epoch": 0.7155394336531711, + "grad_norm": 2.2835497856140137, + "learning_rate": 4.231169281442316e-05, + "loss": 1.8422, + "step": 12230 + }, + { + "epoch": 0.7161245026913176, + "grad_norm": 2.071018695831299, + "learning_rate": 4.230085810869377e-05, + "loss": 1.8257, + "step": 12240 + }, + { + "epoch": 0.716709571729464, + "grad_norm": 2.0772836208343506, + "learning_rate": 4.229002340296438e-05, + "loss": 1.846, + "step": 12250 + }, + { + "epoch": 0.7172946407676106, + "grad_norm": 2.2114098072052, + "learning_rate": 4.227918869723498e-05, + "loss": 1.8082, + "step": 12260 + }, + { + "epoch": 0.7178797098057571, + "grad_norm": 2.2233076095581055, + "learning_rate": 4.226835399150559e-05, + "loss": 1.8094, + "step": 12270 + }, + { + "epoch": 0.7184647788439036, + "grad_norm": 2.1712427139282227, + "learning_rate": 4.2257519285776196e-05, + "loss": 1.8191, + "step": 12280 + }, + { + "epoch": 0.7190498478820501, + "grad_norm": 2.3484385013580322, + "learning_rate": 4.2246684580046806e-05, + "loss": 1.8029, + "step": 12290 + }, + { + "epoch": 0.7196349169201965, + "grad_norm": 2.663083553314209, + "learning_rate": 4.223584987431742e-05, + "loss": 1.8055, + "step": 12300 + }, + { + "epoch": 0.7202199859583431, + "grad_norm": 2.2003955841064453, + "learning_rate": 4.222501516858802e-05, + "loss": 1.8025, + "step": 12310 + }, + { + "epoch": 0.7208050549964896, + "grad_norm": 2.600931167602539, + "learning_rate": 4.221418046285863e-05, + "loss": 1.8286, + "step": 12320 + }, + { + "epoch": 0.7213901240346361, + "grad_norm": 2.159895896911621, + "learning_rate": 4.2203345757129236e-05, + "loss": 1.8291, + "step": 12330 + }, + { + "epoch": 0.7219751930727826, + "grad_norm": 2.146587610244751, + "learning_rate": 4.2192511051399846e-05, + "loss": 1.833, + "step": 12340 + }, + { + "epoch": 0.722560262110929, + "grad_norm": 2.00009822845459, + "learning_rate": 4.218167634567045e-05, + "loss": 1.8351, + "step": 12350 + }, + { + "epoch": 0.7231453311490756, + "grad_norm": 2.445350408554077, + "learning_rate": 4.217084163994106e-05, + "loss": 1.8033, + "step": 12360 + }, + { + "epoch": 0.7237304001872221, + "grad_norm": 2.214927911758423, + "learning_rate": 4.216000693421167e-05, + "loss": 1.8374, + "step": 12370 + }, + { + "epoch": 0.7243154692253686, + "grad_norm": 2.0212528705596924, + "learning_rate": 4.2149172228482275e-05, + "loss": 1.8254, + "step": 12380 + }, + { + "epoch": 0.7249005382635151, + "grad_norm": 2.297071933746338, + "learning_rate": 4.2138337522752886e-05, + "loss": 1.8206, + "step": 12390 + }, + { + "epoch": 0.7254856073016616, + "grad_norm": 2.245631456375122, + "learning_rate": 4.212750281702349e-05, + "loss": 1.8102, + "step": 12400 + }, + { + "epoch": 0.7260706763398082, + "grad_norm": 2.130162477493286, + "learning_rate": 4.21166681112941e-05, + "loss": 1.7988, + "step": 12410 + }, + { + "epoch": 0.7266557453779546, + "grad_norm": 2.2783031463623047, + "learning_rate": 4.2105833405564704e-05, + "loss": 1.8331, + "step": 12420 + }, + { + "epoch": 0.7272408144161011, + "grad_norm": 2.159310817718506, + "learning_rate": 4.2094998699835315e-05, + "loss": 1.807, + "step": 12430 + }, + { + "epoch": 0.7278258834542476, + "grad_norm": 2.1670851707458496, + "learning_rate": 4.2084163994105926e-05, + "loss": 1.8178, + "step": 12440 + }, + { + "epoch": 0.7284109524923941, + "grad_norm": 2.1287641525268555, + "learning_rate": 4.207332928837653e-05, + "loss": 1.7967, + "step": 12450 + }, + { + "epoch": 0.7289960215305405, + "grad_norm": 2.221125602722168, + "learning_rate": 4.206249458264714e-05, + "loss": 1.8099, + "step": 12460 + }, + { + "epoch": 0.7295810905686871, + "grad_norm": 2.233140468597412, + "learning_rate": 4.2051659876917744e-05, + "loss": 1.8204, + "step": 12470 + }, + { + "epoch": 0.7301661596068336, + "grad_norm": 2.361776828765869, + "learning_rate": 4.2040825171188355e-05, + "loss": 1.7967, + "step": 12480 + }, + { + "epoch": 0.7307512286449801, + "grad_norm": 2.2732789516448975, + "learning_rate": 4.2029990465458965e-05, + "loss": 1.8025, + "step": 12490 + }, + { + "epoch": 0.7313362976831266, + "grad_norm": 2.416445255279541, + "learning_rate": 4.201915575972957e-05, + "loss": 1.8168, + "step": 12500 + }, + { + "epoch": 0.7319213667212731, + "grad_norm": 2.4886395931243896, + "learning_rate": 4.200832105400018e-05, + "loss": 1.8307, + "step": 12510 + }, + { + "epoch": 0.7325064357594196, + "grad_norm": 2.198817014694214, + "learning_rate": 4.1997486348270784e-05, + "loss": 1.7854, + "step": 12520 + }, + { + "epoch": 0.7330915047975661, + "grad_norm": 2.3238842487335205, + "learning_rate": 4.1986651642541394e-05, + "loss": 1.818, + "step": 12530 + }, + { + "epoch": 0.7336765738357126, + "grad_norm": 2.1258158683776855, + "learning_rate": 4.1975816936812e-05, + "loss": 1.8094, + "step": 12540 + }, + { + "epoch": 0.7342616428738591, + "grad_norm": 2.285569190979004, + "learning_rate": 4.196498223108261e-05, + "loss": 1.8373, + "step": 12550 + }, + { + "epoch": 0.7348467119120056, + "grad_norm": 2.2520456314086914, + "learning_rate": 4.195414752535321e-05, + "loss": 1.8051, + "step": 12560 + }, + { + "epoch": 0.7354317809501522, + "grad_norm": 2.1500024795532227, + "learning_rate": 4.194331281962382e-05, + "loss": 1.8016, + "step": 12570 + }, + { + "epoch": 0.7360168499882986, + "grad_norm": 2.357511043548584, + "learning_rate": 4.193247811389443e-05, + "loss": 1.7934, + "step": 12580 + }, + { + "epoch": 0.7366019190264451, + "grad_norm": 2.78424072265625, + "learning_rate": 4.192164340816504e-05, + "loss": 1.818, + "step": 12590 + }, + { + "epoch": 0.7371869880645916, + "grad_norm": 2.2109782695770264, + "learning_rate": 4.191080870243564e-05, + "loss": 1.8194, + "step": 12600 + }, + { + "epoch": 0.7377720571027381, + "grad_norm": 2.16252064704895, + "learning_rate": 4.1899973996706246e-05, + "loss": 1.8262, + "step": 12610 + }, + { + "epoch": 0.7383571261408847, + "grad_norm": 2.298553705215454, + "learning_rate": 4.1889139290976856e-05, + "loss": 1.8306, + "step": 12620 + }, + { + "epoch": 0.7389421951790311, + "grad_norm": 2.263638496398926, + "learning_rate": 4.187830458524747e-05, + "loss": 1.812, + "step": 12630 + }, + { + "epoch": 0.7395272642171776, + "grad_norm": 2.3591926097869873, + "learning_rate": 4.186746987951807e-05, + "loss": 1.821, + "step": 12640 + }, + { + "epoch": 0.7401123332553241, + "grad_norm": 2.2733230590820312, + "learning_rate": 4.185663517378868e-05, + "loss": 1.7971, + "step": 12650 + }, + { + "epoch": 0.7406974022934706, + "grad_norm": 2.2241532802581787, + "learning_rate": 4.1845800468059285e-05, + "loss": 1.8038, + "step": 12660 + }, + { + "epoch": 0.7412824713316172, + "grad_norm": 2.2268617153167725, + "learning_rate": 4.1834965762329896e-05, + "loss": 1.8177, + "step": 12670 + }, + { + "epoch": 0.7418675403697637, + "grad_norm": 2.2764389514923096, + "learning_rate": 4.1824131056600506e-05, + "loss": 1.8028, + "step": 12680 + }, + { + "epoch": 0.7424526094079101, + "grad_norm": 2.1625442504882812, + "learning_rate": 4.181329635087111e-05, + "loss": 1.7934, + "step": 12690 + }, + { + "epoch": 0.7430376784460566, + "grad_norm": 2.1627142429351807, + "learning_rate": 4.180246164514172e-05, + "loss": 1.8098, + "step": 12700 + }, + { + "epoch": 0.7436227474842031, + "grad_norm": 2.154620409011841, + "learning_rate": 4.1791626939412325e-05, + "loss": 1.7991, + "step": 12710 + }, + { + "epoch": 0.7442078165223497, + "grad_norm": 2.3641762733459473, + "learning_rate": 4.1780792233682935e-05, + "loss": 1.8189, + "step": 12720 + }, + { + "epoch": 0.7447928855604962, + "grad_norm": 2.147836923599243, + "learning_rate": 4.176995752795354e-05, + "loss": 1.816, + "step": 12730 + }, + { + "epoch": 0.7453779545986426, + "grad_norm": 2.1792352199554443, + "learning_rate": 4.175912282222415e-05, + "loss": 1.8134, + "step": 12740 + }, + { + "epoch": 0.7459630236367891, + "grad_norm": 2.1565744876861572, + "learning_rate": 4.174828811649476e-05, + "loss": 1.8068, + "step": 12750 + }, + { + "epoch": 0.7465480926749356, + "grad_norm": 2.180833578109741, + "learning_rate": 4.1737453410765365e-05, + "loss": 1.8147, + "step": 12760 + }, + { + "epoch": 0.7471331617130822, + "grad_norm": 2.168684244155884, + "learning_rate": 4.1726618705035975e-05, + "loss": 1.8169, + "step": 12770 + }, + { + "epoch": 0.7477182307512287, + "grad_norm": 2.1652324199676514, + "learning_rate": 4.171578399930658e-05, + "loss": 1.8089, + "step": 12780 + }, + { + "epoch": 0.7483032997893752, + "grad_norm": 2.0710554122924805, + "learning_rate": 4.170494929357719e-05, + "loss": 1.8271, + "step": 12790 + }, + { + "epoch": 0.7488883688275216, + "grad_norm": 2.2925968170166016, + "learning_rate": 4.1694114587847794e-05, + "loss": 1.8002, + "step": 12800 + }, + { + "epoch": 0.7494734378656681, + "grad_norm": 2.288947820663452, + "learning_rate": 4.1683279882118404e-05, + "loss": 1.7926, + "step": 12810 + }, + { + "epoch": 0.7500585069038147, + "grad_norm": 2.253843069076538, + "learning_rate": 4.1672445176389015e-05, + "loss": 1.8134, + "step": 12820 + }, + { + "epoch": 0.7506435759419612, + "grad_norm": 2.0366721153259277, + "learning_rate": 4.166161047065962e-05, + "loss": 1.8008, + "step": 12830 + }, + { + "epoch": 0.7512286449801077, + "grad_norm": 2.075852155685425, + "learning_rate": 4.165077576493023e-05, + "loss": 1.8109, + "step": 12840 + }, + { + "epoch": 0.7518137140182541, + "grad_norm": 1.9715243577957153, + "learning_rate": 4.163994105920083e-05, + "loss": 1.7818, + "step": 12850 + }, + { + "epoch": 0.7523987830564006, + "grad_norm": 2.028249979019165, + "learning_rate": 4.1629106353471444e-05, + "loss": 1.8159, + "step": 12860 + }, + { + "epoch": 0.7529838520945471, + "grad_norm": 2.1904890537261963, + "learning_rate": 4.1618271647742055e-05, + "loss": 1.8009, + "step": 12870 + }, + { + "epoch": 0.7535689211326937, + "grad_norm": 2.268209934234619, + "learning_rate": 4.160743694201266e-05, + "loss": 1.8159, + "step": 12880 + }, + { + "epoch": 0.7541539901708402, + "grad_norm": 2.372553586959839, + "learning_rate": 4.159660223628327e-05, + "loss": 1.7993, + "step": 12890 + }, + { + "epoch": 0.7547390592089867, + "grad_norm": 2.107041597366333, + "learning_rate": 4.158576753055387e-05, + "loss": 1.8115, + "step": 12900 + }, + { + "epoch": 0.7553241282471331, + "grad_norm": 2.11906099319458, + "learning_rate": 4.1574932824824484e-05, + "loss": 1.7929, + "step": 12910 + }, + { + "epoch": 0.7559091972852796, + "grad_norm": 2.265476703643799, + "learning_rate": 4.156409811909509e-05, + "loss": 1.7985, + "step": 12920 + }, + { + "epoch": 0.7564942663234262, + "grad_norm": 2.1723320484161377, + "learning_rate": 4.15532634133657e-05, + "loss": 1.8124, + "step": 12930 + }, + { + "epoch": 0.7570793353615727, + "grad_norm": 2.0519330501556396, + "learning_rate": 4.15424287076363e-05, + "loss": 1.8189, + "step": 12940 + }, + { + "epoch": 0.7576644043997192, + "grad_norm": 2.3468713760375977, + "learning_rate": 4.153159400190691e-05, + "loss": 1.8122, + "step": 12950 + }, + { + "epoch": 0.7582494734378656, + "grad_norm": 2.240692615509033, + "learning_rate": 4.1520759296177516e-05, + "loss": 1.8048, + "step": 12960 + }, + { + "epoch": 0.7588345424760121, + "grad_norm": 2.0491087436676025, + "learning_rate": 4.150992459044813e-05, + "loss": 1.7927, + "step": 12970 + }, + { + "epoch": 0.7594196115141587, + "grad_norm": 2.0549869537353516, + "learning_rate": 4.149908988471873e-05, + "loss": 1.7974, + "step": 12980 + }, + { + "epoch": 0.7600046805523052, + "grad_norm": 2.093815326690674, + "learning_rate": 4.1488255178989335e-05, + "loss": 1.8029, + "step": 12990 + }, + { + "epoch": 0.7605897495904517, + "grad_norm": 2.4611542224884033, + "learning_rate": 4.1477420473259945e-05, + "loss": 1.7951, + "step": 13000 + }, + { + "epoch": 0.7605897495904517, + "eval_loss": 1.8347017765045166, + "eval_runtime": 33.3944, + "eval_samples_per_second": 676.37, + "eval_steps_per_second": 5.3, + "step": 13000 + }, + { + "epoch": 0.7611748186285981, + "grad_norm": 2.0336036682128906, + "learning_rate": 4.1466585767530556e-05, + "loss": 1.7923, + "step": 13010 + }, + { + "epoch": 0.7617598876667446, + "grad_norm": 2.2139673233032227, + "learning_rate": 4.145575106180116e-05, + "loss": 1.7899, + "step": 13020 + }, + { + "epoch": 0.7623449567048912, + "grad_norm": 2.075909376144409, + "learning_rate": 4.144491635607177e-05, + "loss": 1.798, + "step": 13030 + }, + { + "epoch": 0.7629300257430377, + "grad_norm": 2.1344552040100098, + "learning_rate": 4.1434081650342375e-05, + "loss": 1.7982, + "step": 13040 + }, + { + "epoch": 0.7635150947811842, + "grad_norm": 2.187000036239624, + "learning_rate": 4.1423246944612985e-05, + "loss": 1.7939, + "step": 13050 + }, + { + "epoch": 0.7641001638193307, + "grad_norm": 2.181638479232788, + "learning_rate": 4.1412412238883596e-05, + "loss": 1.7855, + "step": 13060 + }, + { + "epoch": 0.7646852328574771, + "grad_norm": 2.079551935195923, + "learning_rate": 4.14015775331542e-05, + "loss": 1.7806, + "step": 13070 + }, + { + "epoch": 0.7652703018956237, + "grad_norm": 2.2021992206573486, + "learning_rate": 4.139074282742481e-05, + "loss": 1.7865, + "step": 13080 + }, + { + "epoch": 0.7658553709337702, + "grad_norm": 2.2345454692840576, + "learning_rate": 4.1379908121695414e-05, + "loss": 1.7904, + "step": 13090 + }, + { + "epoch": 0.7664404399719167, + "grad_norm": 2.190561056137085, + "learning_rate": 4.1369073415966025e-05, + "loss": 1.8026, + "step": 13100 + }, + { + "epoch": 0.7670255090100632, + "grad_norm": 2.0615131855010986, + "learning_rate": 4.135823871023663e-05, + "loss": 1.801, + "step": 13110 + }, + { + "epoch": 0.7676105780482096, + "grad_norm": 2.3059847354888916, + "learning_rate": 4.134740400450724e-05, + "loss": 1.8018, + "step": 13120 + }, + { + "epoch": 0.7681956470863562, + "grad_norm": 2.2317423820495605, + "learning_rate": 4.133656929877785e-05, + "loss": 1.7966, + "step": 13130 + }, + { + "epoch": 0.7687807161245027, + "grad_norm": 2.5545566082000732, + "learning_rate": 4.1325734593048454e-05, + "loss": 1.7725, + "step": 13140 + }, + { + "epoch": 0.7693657851626492, + "grad_norm": 2.356539249420166, + "learning_rate": 4.1314899887319065e-05, + "loss": 1.8047, + "step": 13150 + }, + { + "epoch": 0.7699508542007957, + "grad_norm": 2.217050313949585, + "learning_rate": 4.130406518158967e-05, + "loss": 1.8081, + "step": 13160 + }, + { + "epoch": 0.7705359232389422, + "grad_norm": 2.1588828563690186, + "learning_rate": 4.129323047586028e-05, + "loss": 1.7845, + "step": 13170 + }, + { + "epoch": 0.7711209922770887, + "grad_norm": 2.007859468460083, + "learning_rate": 4.128239577013088e-05, + "loss": 1.7837, + "step": 13180 + }, + { + "epoch": 0.7717060613152352, + "grad_norm": 2.427004814147949, + "learning_rate": 4.1271561064401494e-05, + "loss": 1.7961, + "step": 13190 + }, + { + "epoch": 0.7722911303533817, + "grad_norm": 2.4226276874542236, + "learning_rate": 4.1260726358672104e-05, + "loss": 1.8009, + "step": 13200 + }, + { + "epoch": 0.7728761993915282, + "grad_norm": 2.206604242324829, + "learning_rate": 4.124989165294271e-05, + "loss": 1.7972, + "step": 13210 + }, + { + "epoch": 0.7734612684296747, + "grad_norm": 2.162381172180176, + "learning_rate": 4.123905694721332e-05, + "loss": 1.7926, + "step": 13220 + }, + { + "epoch": 0.7740463374678213, + "grad_norm": 2.2817039489746094, + "learning_rate": 4.122822224148392e-05, + "loss": 1.8021, + "step": 13230 + }, + { + "epoch": 0.7746314065059677, + "grad_norm": 2.210137128829956, + "learning_rate": 4.121738753575453e-05, + "loss": 1.7927, + "step": 13240 + }, + { + "epoch": 0.7752164755441142, + "grad_norm": 2.3482539653778076, + "learning_rate": 4.120655283002514e-05, + "loss": 1.79, + "step": 13250 + }, + { + "epoch": 0.7758015445822607, + "grad_norm": 2.2445783615112305, + "learning_rate": 4.119571812429575e-05, + "loss": 1.815, + "step": 13260 + }, + { + "epoch": 0.7763866136204072, + "grad_norm": 2.1627860069274902, + "learning_rate": 4.118488341856636e-05, + "loss": 1.8104, + "step": 13270 + }, + { + "epoch": 0.7769716826585538, + "grad_norm": 2.144334316253662, + "learning_rate": 4.117404871283696e-05, + "loss": 1.7931, + "step": 13280 + }, + { + "epoch": 0.7775567516967002, + "grad_norm": 2.036248207092285, + "learning_rate": 4.116321400710757e-05, + "loss": 1.8053, + "step": 13290 + }, + { + "epoch": 0.7781418207348467, + "grad_norm": 2.200634002685547, + "learning_rate": 4.115237930137818e-05, + "loss": 1.7837, + "step": 13300 + }, + { + "epoch": 0.7787268897729932, + "grad_norm": 1.9898886680603027, + "learning_rate": 4.114154459564879e-05, + "loss": 1.7929, + "step": 13310 + }, + { + "epoch": 0.7793119588111397, + "grad_norm": 2.052625894546509, + "learning_rate": 4.113070988991939e-05, + "loss": 1.793, + "step": 13320 + }, + { + "epoch": 0.7798970278492862, + "grad_norm": 1.878947138786316, + "learning_rate": 4.111987518419e-05, + "loss": 1.7819, + "step": 13330 + }, + { + "epoch": 0.7804820968874328, + "grad_norm": 2.1028265953063965, + "learning_rate": 4.1109040478460606e-05, + "loss": 1.7882, + "step": 13340 + }, + { + "epoch": 0.7810671659255792, + "grad_norm": 2.019439220428467, + "learning_rate": 4.1098205772731216e-05, + "loss": 1.7984, + "step": 13350 + }, + { + "epoch": 0.7816522349637257, + "grad_norm": 2.273960828781128, + "learning_rate": 4.108737106700182e-05, + "loss": 1.7773, + "step": 13360 + }, + { + "epoch": 0.7822373040018722, + "grad_norm": 2.0112483501434326, + "learning_rate": 4.1076536361272424e-05, + "loss": 1.7824, + "step": 13370 + }, + { + "epoch": 0.7828223730400187, + "grad_norm": 2.0834248065948486, + "learning_rate": 4.1065701655543035e-05, + "loss": 1.7849, + "step": 13380 + }, + { + "epoch": 0.7834074420781653, + "grad_norm": 2.0201098918914795, + "learning_rate": 4.1054866949813645e-05, + "loss": 1.8134, + "step": 13390 + }, + { + "epoch": 0.7839925111163117, + "grad_norm": 2.0641140937805176, + "learning_rate": 4.104403224408425e-05, + "loss": 1.7887, + "step": 13400 + }, + { + "epoch": 0.7845775801544582, + "grad_norm": 2.317190408706665, + "learning_rate": 4.103319753835486e-05, + "loss": 1.7729, + "step": 13410 + }, + { + "epoch": 0.7851626491926047, + "grad_norm": 2.0303170680999756, + "learning_rate": 4.1022362832625464e-05, + "loss": 1.7876, + "step": 13420 + }, + { + "epoch": 0.7857477182307512, + "grad_norm": 2.0011823177337646, + "learning_rate": 4.1011528126896074e-05, + "loss": 1.7875, + "step": 13430 + }, + { + "epoch": 0.7863327872688978, + "grad_norm": 2.155177116394043, + "learning_rate": 4.100069342116668e-05, + "loss": 1.7997, + "step": 13440 + }, + { + "epoch": 0.7869178563070443, + "grad_norm": 2.2054944038391113, + "learning_rate": 4.098985871543729e-05, + "loss": 1.7987, + "step": 13450 + }, + { + "epoch": 0.7875029253451907, + "grad_norm": 2.164259910583496, + "learning_rate": 4.09790240097079e-05, + "loss": 1.7965, + "step": 13460 + }, + { + "epoch": 0.7880879943833372, + "grad_norm": 2.407395601272583, + "learning_rate": 4.0968189303978504e-05, + "loss": 1.7776, + "step": 13470 + }, + { + "epoch": 0.7886730634214837, + "grad_norm": 2.186887264251709, + "learning_rate": 4.0957354598249114e-05, + "loss": 1.8011, + "step": 13480 + }, + { + "epoch": 0.7892581324596303, + "grad_norm": 2.1906139850616455, + "learning_rate": 4.094651989251972e-05, + "loss": 1.7958, + "step": 13490 + }, + { + "epoch": 0.7898432014977768, + "grad_norm": 1.9954781532287598, + "learning_rate": 4.093568518679033e-05, + "loss": 1.7921, + "step": 13500 + }, + { + "epoch": 0.7904282705359232, + "grad_norm": 2.1564385890960693, + "learning_rate": 4.092485048106094e-05, + "loss": 1.7999, + "step": 13510 + }, + { + "epoch": 0.7910133395740697, + "grad_norm": 1.9688284397125244, + "learning_rate": 4.091401577533154e-05, + "loss": 1.803, + "step": 13520 + }, + { + "epoch": 0.7915984086122162, + "grad_norm": 2.140964984893799, + "learning_rate": 4.0903181069602154e-05, + "loss": 1.7948, + "step": 13530 + }, + { + "epoch": 0.7921834776503628, + "grad_norm": 2.2255685329437256, + "learning_rate": 4.089234636387276e-05, + "loss": 1.7724, + "step": 13540 + }, + { + "epoch": 0.7927685466885093, + "grad_norm": 2.1159400939941406, + "learning_rate": 4.088151165814337e-05, + "loss": 1.7722, + "step": 13550 + }, + { + "epoch": 0.7933536157266557, + "grad_norm": 2.3552114963531494, + "learning_rate": 4.087067695241397e-05, + "loss": 1.7931, + "step": 13560 + }, + { + "epoch": 0.7939386847648022, + "grad_norm": 2.264587879180908, + "learning_rate": 4.085984224668458e-05, + "loss": 1.7901, + "step": 13570 + }, + { + "epoch": 0.7945237538029487, + "grad_norm": 2.120084285736084, + "learning_rate": 4.0849007540955194e-05, + "loss": 1.7841, + "step": 13580 + }, + { + "epoch": 0.7951088228410953, + "grad_norm": 2.56168532371521, + "learning_rate": 4.08381728352258e-05, + "loss": 1.7755, + "step": 13590 + }, + { + "epoch": 0.7956938918792418, + "grad_norm": 2.0523221492767334, + "learning_rate": 4.082733812949641e-05, + "loss": 1.7983, + "step": 13600 + }, + { + "epoch": 0.7962789609173883, + "grad_norm": 2.218662738800049, + "learning_rate": 4.081650342376701e-05, + "loss": 1.7997, + "step": 13610 + }, + { + "epoch": 0.7968640299555347, + "grad_norm": 2.318753957748413, + "learning_rate": 4.080566871803762e-05, + "loss": 1.7881, + "step": 13620 + }, + { + "epoch": 0.7974490989936812, + "grad_norm": 2.2360517978668213, + "learning_rate": 4.0794834012308226e-05, + "loss": 1.7723, + "step": 13630 + }, + { + "epoch": 0.7980341680318278, + "grad_norm": 2.156599521636963, + "learning_rate": 4.078399930657884e-05, + "loss": 1.7965, + "step": 13640 + }, + { + "epoch": 0.7986192370699743, + "grad_norm": 2.4479095935821533, + "learning_rate": 4.077316460084945e-05, + "loss": 1.7659, + "step": 13650 + }, + { + "epoch": 0.7992043061081208, + "grad_norm": 2.3190088272094727, + "learning_rate": 4.076232989512005e-05, + "loss": 1.7738, + "step": 13660 + }, + { + "epoch": 0.7997893751462672, + "grad_norm": 2.1156094074249268, + "learning_rate": 4.075149518939066e-05, + "loss": 1.7843, + "step": 13670 + }, + { + "epoch": 0.8003744441844137, + "grad_norm": 1.9653937816619873, + "learning_rate": 4.0740660483661266e-05, + "loss": 1.7926, + "step": 13680 + }, + { + "epoch": 0.8009595132225603, + "grad_norm": 2.0302317142486572, + "learning_rate": 4.072982577793188e-05, + "loss": 1.7808, + "step": 13690 + }, + { + "epoch": 0.8015445822607068, + "grad_norm": 1.9557278156280518, + "learning_rate": 4.071899107220248e-05, + "loss": 1.7832, + "step": 13700 + }, + { + "epoch": 0.8021296512988533, + "grad_norm": 2.1157281398773193, + "learning_rate": 4.070815636647309e-05, + "loss": 1.7904, + "step": 13710 + }, + { + "epoch": 0.8027147203369998, + "grad_norm": 1.9550879001617432, + "learning_rate": 4.0697321660743695e-05, + "loss": 1.7717, + "step": 13720 + }, + { + "epoch": 0.8032997893751462, + "grad_norm": 2.03267240524292, + "learning_rate": 4.0686486955014306e-05, + "loss": 1.7846, + "step": 13730 + }, + { + "epoch": 0.8038848584132927, + "grad_norm": 2.071942090988159, + "learning_rate": 4.067565224928491e-05, + "loss": 1.7852, + "step": 13740 + }, + { + "epoch": 0.8044699274514393, + "grad_norm": 2.1972901821136475, + "learning_rate": 4.0664817543555513e-05, + "loss": 1.7884, + "step": 13750 + }, + { + "epoch": 0.8050549964895858, + "grad_norm": 2.365922451019287, + "learning_rate": 4.0653982837826124e-05, + "loss": 1.7784, + "step": 13760 + }, + { + "epoch": 0.8056400655277323, + "grad_norm": 1.9824973344802856, + "learning_rate": 4.0643148132096735e-05, + "loss": 1.7822, + "step": 13770 + }, + { + "epoch": 0.8062251345658787, + "grad_norm": 2.0482287406921387, + "learning_rate": 4.063231342636734e-05, + "loss": 1.7823, + "step": 13780 + }, + { + "epoch": 0.8068102036040252, + "grad_norm": 2.3646090030670166, + "learning_rate": 4.062147872063795e-05, + "loss": 1.7755, + "step": 13790 + }, + { + "epoch": 0.8073952726421718, + "grad_norm": 2.076508045196533, + "learning_rate": 4.061064401490855e-05, + "loss": 1.797, + "step": 13800 + }, + { + "epoch": 0.8079803416803183, + "grad_norm": 2.0609309673309326, + "learning_rate": 4.0599809309179164e-05, + "loss": 1.7724, + "step": 13810 + }, + { + "epoch": 0.8085654107184648, + "grad_norm": 1.9612531661987305, + "learning_rate": 4.058897460344977e-05, + "loss": 1.7911, + "step": 13820 + }, + { + "epoch": 0.8091504797566113, + "grad_norm": 2.5766351222991943, + "learning_rate": 4.057813989772038e-05, + "loss": 1.7697, + "step": 13830 + }, + { + "epoch": 0.8097355487947577, + "grad_norm": 2.2884480953216553, + "learning_rate": 4.056730519199099e-05, + "loss": 1.7841, + "step": 13840 + }, + { + "epoch": 0.8103206178329043, + "grad_norm": 2.225314140319824, + "learning_rate": 4.055647048626159e-05, + "loss": 1.7791, + "step": 13850 + }, + { + "epoch": 0.8109056868710508, + "grad_norm": 2.3383941650390625, + "learning_rate": 4.0545635780532203e-05, + "loss": 1.7866, + "step": 13860 + }, + { + "epoch": 0.8114907559091973, + "grad_norm": 1.9902864694595337, + "learning_rate": 4.053480107480281e-05, + "loss": 1.775, + "step": 13870 + }, + { + "epoch": 0.8120758249473438, + "grad_norm": 2.2651472091674805, + "learning_rate": 4.052396636907342e-05, + "loss": 1.8184, + "step": 13880 + }, + { + "epoch": 0.8126608939854902, + "grad_norm": 1.9579837322235107, + "learning_rate": 4.051313166334402e-05, + "loss": 1.786, + "step": 13890 + }, + { + "epoch": 0.8132459630236368, + "grad_norm": 1.9739134311676025, + "learning_rate": 4.050229695761463e-05, + "loss": 1.7822, + "step": 13900 + }, + { + "epoch": 0.8138310320617833, + "grad_norm": 2.2460858821868896, + "learning_rate": 4.049146225188524e-05, + "loss": 1.7817, + "step": 13910 + }, + { + "epoch": 0.8144161010999298, + "grad_norm": 2.289285659790039, + "learning_rate": 4.048062754615585e-05, + "loss": 1.7826, + "step": 13920 + }, + { + "epoch": 0.8150011701380763, + "grad_norm": 2.1503984928131104, + "learning_rate": 4.046979284042646e-05, + "loss": 1.7621, + "step": 13930 + }, + { + "epoch": 0.8155862391762227, + "grad_norm": 1.753171682357788, + "learning_rate": 4.045895813469706e-05, + "loss": 1.7543, + "step": 13940 + }, + { + "epoch": 0.8161713082143693, + "grad_norm": 2.041449785232544, + "learning_rate": 4.044812342896767e-05, + "loss": 1.792, + "step": 13950 + }, + { + "epoch": 0.8167563772525158, + "grad_norm": 2.147860288619995, + "learning_rate": 4.043728872323828e-05, + "loss": 1.7714, + "step": 13960 + }, + { + "epoch": 0.8173414462906623, + "grad_norm": 2.1903316974639893, + "learning_rate": 4.042645401750889e-05, + "loss": 1.7842, + "step": 13970 + }, + { + "epoch": 0.8179265153288088, + "grad_norm": 2.1307685375213623, + "learning_rate": 4.04156193117795e-05, + "loss": 1.7687, + "step": 13980 + }, + { + "epoch": 0.8185115843669553, + "grad_norm": 2.311890125274658, + "learning_rate": 4.04047846060501e-05, + "loss": 1.7802, + "step": 13990 + }, + { + "epoch": 0.8190966534051018, + "grad_norm": 2.1691980361938477, + "learning_rate": 4.039394990032071e-05, + "loss": 1.7758, + "step": 14000 + }, + { + "epoch": 0.8190966534051018, + "eval_loss": 1.811227798461914, + "eval_runtime": 32.8991, + "eval_samples_per_second": 686.553, + "eval_steps_per_second": 5.38, + "step": 14000 + }, + { + "epoch": 0.8196817224432483, + "grad_norm": 1.8881202936172485, + "learning_rate": 4.0383115194591316e-05, + "loss": 1.7686, + "step": 14010 + }, + { + "epoch": 0.8202667914813948, + "grad_norm": 2.0305862426757812, + "learning_rate": 4.0372280488861926e-05, + "loss": 1.7613, + "step": 14020 + }, + { + "epoch": 0.8208518605195413, + "grad_norm": 2.0115017890930176, + "learning_rate": 4.036144578313254e-05, + "loss": 1.7541, + "step": 14030 + }, + { + "epoch": 0.8214369295576878, + "grad_norm": 2.1282949447631836, + "learning_rate": 4.035061107740314e-05, + "loss": 1.7744, + "step": 14040 + }, + { + "epoch": 0.8220219985958344, + "grad_norm": 1.9637404680252075, + "learning_rate": 4.033977637167375e-05, + "loss": 1.7798, + "step": 14050 + }, + { + "epoch": 0.8226070676339808, + "grad_norm": 2.001265048980713, + "learning_rate": 4.0328941665944355e-05, + "loss": 1.7799, + "step": 14060 + }, + { + "epoch": 0.8231921366721273, + "grad_norm": 1.8603737354278564, + "learning_rate": 4.0318106960214966e-05, + "loss": 1.7811, + "step": 14070 + }, + { + "epoch": 0.8237772057102738, + "grad_norm": 2.1149039268493652, + "learning_rate": 4.030727225448557e-05, + "loss": 1.7676, + "step": 14080 + }, + { + "epoch": 0.8243622747484203, + "grad_norm": 2.1059296131134033, + "learning_rate": 4.029643754875618e-05, + "loss": 1.7742, + "step": 14090 + }, + { + "epoch": 0.8249473437865669, + "grad_norm": 2.0027103424072266, + "learning_rate": 4.0285602843026784e-05, + "loss": 1.7582, + "step": 14100 + }, + { + "epoch": 0.8255324128247133, + "grad_norm": 1.8544400930404663, + "learning_rate": 4.0274768137297395e-05, + "loss": 1.7632, + "step": 14110 + }, + { + "epoch": 0.8261174818628598, + "grad_norm": 2.018418550491333, + "learning_rate": 4.0263933431568e-05, + "loss": 1.7764, + "step": 14120 + }, + { + "epoch": 0.8267025509010063, + "grad_norm": 2.0599818229675293, + "learning_rate": 4.02530987258386e-05, + "loss": 1.7597, + "step": 14130 + }, + { + "epoch": 0.8272876199391528, + "grad_norm": 1.9878023862838745, + "learning_rate": 4.0242264020109213e-05, + "loss": 1.7765, + "step": 14140 + }, + { + "epoch": 0.8278726889772994, + "grad_norm": 2.118743896484375, + "learning_rate": 4.0231429314379824e-05, + "loss": 1.7919, + "step": 14150 + }, + { + "epoch": 0.8284577580154459, + "grad_norm": 2.013658285140991, + "learning_rate": 4.022059460865043e-05, + "loss": 1.7845, + "step": 14160 + }, + { + "epoch": 0.8290428270535923, + "grad_norm": 1.9822301864624023, + "learning_rate": 4.020975990292104e-05, + "loss": 1.7593, + "step": 14170 + }, + { + "epoch": 0.8296278960917388, + "grad_norm": 2.015939235687256, + "learning_rate": 4.019892519719164e-05, + "loss": 1.7671, + "step": 14180 + }, + { + "epoch": 0.8302129651298853, + "grad_norm": 2.0594239234924316, + "learning_rate": 4.018809049146225e-05, + "loss": 1.7704, + "step": 14190 + }, + { + "epoch": 0.8307980341680318, + "grad_norm": 2.036076545715332, + "learning_rate": 4.017725578573286e-05, + "loss": 1.7792, + "step": 14200 + }, + { + "epoch": 0.8313831032061784, + "grad_norm": 2.0017409324645996, + "learning_rate": 4.016642108000347e-05, + "loss": 1.788, + "step": 14210 + }, + { + "epoch": 0.8319681722443248, + "grad_norm": 2.090041399002075, + "learning_rate": 4.015558637427408e-05, + "loss": 1.7835, + "step": 14220 + }, + { + "epoch": 0.8325532412824713, + "grad_norm": 2.0856072902679443, + "learning_rate": 4.014475166854468e-05, + "loss": 1.777, + "step": 14230 + }, + { + "epoch": 0.8331383103206178, + "grad_norm": 2.120039701461792, + "learning_rate": 4.013391696281529e-05, + "loss": 1.7865, + "step": 14240 + }, + { + "epoch": 0.8337233793587643, + "grad_norm": 1.795345664024353, + "learning_rate": 4.01230822570859e-05, + "loss": 1.7734, + "step": 14250 + }, + { + "epoch": 0.8343084483969109, + "grad_norm": 2.060981273651123, + "learning_rate": 4.011224755135651e-05, + "loss": 1.7667, + "step": 14260 + }, + { + "epoch": 0.8348935174350574, + "grad_norm": 2.2059693336486816, + "learning_rate": 4.010141284562711e-05, + "loss": 1.7681, + "step": 14270 + }, + { + "epoch": 0.8354785864732038, + "grad_norm": 2.347867488861084, + "learning_rate": 4.009057813989772e-05, + "loss": 1.7676, + "step": 14280 + }, + { + "epoch": 0.8360636555113503, + "grad_norm": 2.1035139560699463, + "learning_rate": 4.007974343416833e-05, + "loss": 1.7774, + "step": 14290 + }, + { + "epoch": 0.8366487245494968, + "grad_norm": 2.080390214920044, + "learning_rate": 4.0068908728438936e-05, + "loss": 1.784, + "step": 14300 + }, + { + "epoch": 0.8372337935876434, + "grad_norm": 1.8486807346343994, + "learning_rate": 4.005807402270955e-05, + "loss": 1.7672, + "step": 14310 + }, + { + "epoch": 0.8378188626257899, + "grad_norm": 2.0855510234832764, + "learning_rate": 4.004723931698015e-05, + "loss": 1.7722, + "step": 14320 + }, + { + "epoch": 0.8384039316639363, + "grad_norm": 2.106159210205078, + "learning_rate": 4.003640461125076e-05, + "loss": 1.7789, + "step": 14330 + }, + { + "epoch": 0.8389890007020828, + "grad_norm": 1.996299386024475, + "learning_rate": 4.002556990552137e-05, + "loss": 1.7627, + "step": 14340 + }, + { + "epoch": 0.8395740697402293, + "grad_norm": 1.917320966720581, + "learning_rate": 4.0014735199791976e-05, + "loss": 1.7481, + "step": 14350 + }, + { + "epoch": 0.8401591387783759, + "grad_norm": 1.979839563369751, + "learning_rate": 4.000390049406259e-05, + "loss": 1.7679, + "step": 14360 + }, + { + "epoch": 0.8407442078165224, + "grad_norm": 1.9442750215530396, + "learning_rate": 3.999306578833319e-05, + "loss": 1.7429, + "step": 14370 + }, + { + "epoch": 0.8413292768546689, + "grad_norm": 2.063513994216919, + "learning_rate": 3.99822310826038e-05, + "loss": 1.7493, + "step": 14380 + }, + { + "epoch": 0.8419143458928153, + "grad_norm": 2.130357027053833, + "learning_rate": 3.9971396376874405e-05, + "loss": 1.7665, + "step": 14390 + }, + { + "epoch": 0.8424994149309618, + "grad_norm": 1.9091861248016357, + "learning_rate": 3.9960561671145016e-05, + "loss": 1.7788, + "step": 14400 + }, + { + "epoch": 0.8430844839691084, + "grad_norm": 2.1781694889068604, + "learning_rate": 3.9949726965415626e-05, + "loss": 1.7675, + "step": 14410 + }, + { + "epoch": 0.8436695530072549, + "grad_norm": 2.1577939987182617, + "learning_rate": 3.993889225968623e-05, + "loss": 1.8043, + "step": 14420 + }, + { + "epoch": 0.8442546220454014, + "grad_norm": 2.245948553085327, + "learning_rate": 3.992805755395684e-05, + "loss": 1.7799, + "step": 14430 + }, + { + "epoch": 0.8448396910835478, + "grad_norm": 2.1553475856781006, + "learning_rate": 3.9917222848227445e-05, + "loss": 1.7751, + "step": 14440 + }, + { + "epoch": 0.8454247601216943, + "grad_norm": 2.1432957649230957, + "learning_rate": 3.9906388142498055e-05, + "loss": 1.7715, + "step": 14450 + }, + { + "epoch": 0.8460098291598409, + "grad_norm": 2.2047276496887207, + "learning_rate": 3.989555343676866e-05, + "loss": 1.7764, + "step": 14460 + }, + { + "epoch": 0.8465948981979874, + "grad_norm": 2.033047914505005, + "learning_rate": 3.988471873103927e-05, + "loss": 1.7581, + "step": 14470 + }, + { + "epoch": 0.8471799672361339, + "grad_norm": 2.1348419189453125, + "learning_rate": 3.9873884025309874e-05, + "loss": 1.769, + "step": 14480 + }, + { + "epoch": 0.8477650362742803, + "grad_norm": 1.9743399620056152, + "learning_rate": 3.9863049319580484e-05, + "loss": 1.769, + "step": 14490 + }, + { + "epoch": 0.8483501053124268, + "grad_norm": 2.0515522956848145, + "learning_rate": 3.985221461385109e-05, + "loss": 1.7826, + "step": 14500 + }, + { + "epoch": 0.8489351743505734, + "grad_norm": 2.083169460296631, + "learning_rate": 3.9842463378694637e-05, + "loss": 1.762, + "step": 14510 + }, + { + "epoch": 0.8495202433887199, + "grad_norm": 1.9494489431381226, + "learning_rate": 3.983162867296524e-05, + "loss": 1.746, + "step": 14520 + }, + { + "epoch": 0.8501053124268664, + "grad_norm": 2.033761501312256, + "learning_rate": 3.982079396723585e-05, + "loss": 1.7545, + "step": 14530 + }, + { + "epoch": 0.8506903814650129, + "grad_norm": 2.093231439590454, + "learning_rate": 3.9809959261506455e-05, + "loss": 1.7654, + "step": 14540 + }, + { + "epoch": 0.8512754505031593, + "grad_norm": 2.190495014190674, + "learning_rate": 3.9799124555777066e-05, + "loss": 1.7648, + "step": 14550 + }, + { + "epoch": 0.8518605195413059, + "grad_norm": 2.08601713180542, + "learning_rate": 3.978828985004767e-05, + "loss": 1.7716, + "step": 14560 + }, + { + "epoch": 0.8524455885794524, + "grad_norm": 2.0569472312927246, + "learning_rate": 3.977745514431828e-05, + "loss": 1.7338, + "step": 14570 + }, + { + "epoch": 0.8530306576175989, + "grad_norm": 2.0191664695739746, + "learning_rate": 3.976662043858889e-05, + "loss": 1.7566, + "step": 14580 + }, + { + "epoch": 0.8536157266557454, + "grad_norm": 1.9392517805099487, + "learning_rate": 3.9755785732859495e-05, + "loss": 1.7725, + "step": 14590 + }, + { + "epoch": 0.8542007956938918, + "grad_norm": 2.015103816986084, + "learning_rate": 3.9744951027130105e-05, + "loss": 1.7681, + "step": 14600 + }, + { + "epoch": 0.8547858647320383, + "grad_norm": 2.0415878295898438, + "learning_rate": 3.973411632140071e-05, + "loss": 1.7668, + "step": 14610 + }, + { + "epoch": 0.8553709337701849, + "grad_norm": 2.0121288299560547, + "learning_rate": 3.972328161567132e-05, + "loss": 1.7694, + "step": 14620 + }, + { + "epoch": 0.8559560028083314, + "grad_norm": 1.8269927501678467, + "learning_rate": 3.971244690994193e-05, + "loss": 1.7649, + "step": 14630 + }, + { + "epoch": 0.8565410718464779, + "grad_norm": 1.9186128377914429, + "learning_rate": 3.9701612204212534e-05, + "loss": 1.7646, + "step": 14640 + }, + { + "epoch": 0.8571261408846244, + "grad_norm": 2.2198486328125, + "learning_rate": 3.9690777498483145e-05, + "loss": 1.7547, + "step": 14650 + }, + { + "epoch": 0.8577112099227708, + "grad_norm": 2.064858913421631, + "learning_rate": 3.967994279275375e-05, + "loss": 1.7534, + "step": 14660 + }, + { + "epoch": 0.8582962789609174, + "grad_norm": 2.0336713790893555, + "learning_rate": 3.966910808702436e-05, + "loss": 1.7727, + "step": 14670 + }, + { + "epoch": 0.8588813479990639, + "grad_norm": 2.0075385570526123, + "learning_rate": 3.965827338129496e-05, + "loss": 1.7648, + "step": 14680 + }, + { + "epoch": 0.8594664170372104, + "grad_norm": 1.9816136360168457, + "learning_rate": 3.9647438675565574e-05, + "loss": 1.7584, + "step": 14690 + }, + { + "epoch": 0.8600514860753569, + "grad_norm": 2.010467290878296, + "learning_rate": 3.9636603969836185e-05, + "loss": 1.7599, + "step": 14700 + }, + { + "epoch": 0.8606365551135033, + "grad_norm": 1.93532395362854, + "learning_rate": 3.962576926410679e-05, + "loss": 1.7582, + "step": 14710 + }, + { + "epoch": 0.8612216241516499, + "grad_norm": 2.2819302082061768, + "learning_rate": 3.96149345583774e-05, + "loss": 1.7521, + "step": 14720 + }, + { + "epoch": 0.8618066931897964, + "grad_norm": 1.9902760982513428, + "learning_rate": 3.9604099852648e-05, + "loss": 1.7847, + "step": 14730 + }, + { + "epoch": 0.8623917622279429, + "grad_norm": 1.9093637466430664, + "learning_rate": 3.9593265146918614e-05, + "loss": 1.7531, + "step": 14740 + }, + { + "epoch": 0.8629768312660894, + "grad_norm": 2.047818899154663, + "learning_rate": 3.958243044118922e-05, + "loss": 1.7574, + "step": 14750 + }, + { + "epoch": 0.8635619003042359, + "grad_norm": 1.7359449863433838, + "learning_rate": 3.957159573545983e-05, + "loss": 1.7517, + "step": 14760 + }, + { + "epoch": 0.8641469693423824, + "grad_norm": 1.9739172458648682, + "learning_rate": 3.956076102973044e-05, + "loss": 1.7479, + "step": 14770 + }, + { + "epoch": 0.8647320383805289, + "grad_norm": 2.036006450653076, + "learning_rate": 3.954992632400104e-05, + "loss": 1.7445, + "step": 14780 + }, + { + "epoch": 0.8653171074186754, + "grad_norm": 2.089719772338867, + "learning_rate": 3.953909161827165e-05, + "loss": 1.7544, + "step": 14790 + }, + { + "epoch": 0.8659021764568219, + "grad_norm": 2.240567207336426, + "learning_rate": 3.952825691254226e-05, + "loss": 1.7681, + "step": 14800 + }, + { + "epoch": 0.8664872454949684, + "grad_norm": 1.9674317836761475, + "learning_rate": 3.951742220681287e-05, + "loss": 1.7579, + "step": 14810 + }, + { + "epoch": 0.867072314533115, + "grad_norm": 2.033182144165039, + "learning_rate": 3.950658750108348e-05, + "loss": 1.7352, + "step": 14820 + }, + { + "epoch": 0.8676573835712614, + "grad_norm": 2.0038890838623047, + "learning_rate": 3.949575279535408e-05, + "loss": 1.7516, + "step": 14830 + }, + { + "epoch": 0.8682424526094079, + "grad_norm": 1.9181404113769531, + "learning_rate": 3.9484918089624686e-05, + "loss": 1.7462, + "step": 14840 + }, + { + "epoch": 0.8688275216475544, + "grad_norm": 2.1039316654205322, + "learning_rate": 3.94740833838953e-05, + "loss": 1.7515, + "step": 14850 + }, + { + "epoch": 0.8694125906857009, + "grad_norm": 1.826553463935852, + "learning_rate": 3.94632486781659e-05, + "loss": 1.7653, + "step": 14860 + }, + { + "epoch": 0.8699976597238475, + "grad_norm": 2.0121426582336426, + "learning_rate": 3.945241397243651e-05, + "loss": 1.7538, + "step": 14870 + }, + { + "epoch": 0.8705827287619939, + "grad_norm": 2.070495843887329, + "learning_rate": 3.9441579266707115e-05, + "loss": 1.7494, + "step": 14880 + }, + { + "epoch": 0.8711677978001404, + "grad_norm": 1.9261424541473389, + "learning_rate": 3.9430744560977726e-05, + "loss": 1.7616, + "step": 14890 + }, + { + "epoch": 0.8717528668382869, + "grad_norm": 1.9813910722732544, + "learning_rate": 3.941990985524833e-05, + "loss": 1.7398, + "step": 14900 + }, + { + "epoch": 0.8723379358764334, + "grad_norm": 2.116058588027954, + "learning_rate": 3.940907514951894e-05, + "loss": 1.7468, + "step": 14910 + }, + { + "epoch": 0.87292300491458, + "grad_norm": 1.9075366258621216, + "learning_rate": 3.9398240443789544e-05, + "loss": 1.7734, + "step": 14920 + }, + { + "epoch": 0.8735080739527264, + "grad_norm": 1.8735154867172241, + "learning_rate": 3.9387405738060155e-05, + "loss": 1.7419, + "step": 14930 + }, + { + "epoch": 0.8740931429908729, + "grad_norm": 2.1218554973602295, + "learning_rate": 3.937657103233076e-05, + "loss": 1.7409, + "step": 14940 + }, + { + "epoch": 0.8746782120290194, + "grad_norm": 1.801939606666565, + "learning_rate": 3.936573632660137e-05, + "loss": 1.7643, + "step": 14950 + }, + { + "epoch": 0.8752632810671659, + "grad_norm": 2.1351418495178223, + "learning_rate": 3.935490162087198e-05, + "loss": 1.755, + "step": 14960 + }, + { + "epoch": 0.8758483501053125, + "grad_norm": 2.1627988815307617, + "learning_rate": 3.9344066915142584e-05, + "loss": 1.7517, + "step": 14970 + }, + { + "epoch": 0.876433419143459, + "grad_norm": 1.993920922279358, + "learning_rate": 3.9333232209413195e-05, + "loss": 1.7439, + "step": 14980 + }, + { + "epoch": 0.8770184881816054, + "grad_norm": 1.961388111114502, + "learning_rate": 3.93223975036838e-05, + "loss": 1.7561, + "step": 14990 + }, + { + "epoch": 0.8776035572197519, + "grad_norm": 2.0150465965270996, + "learning_rate": 3.931156279795441e-05, + "loss": 1.7675, + "step": 15000 + }, + { + "epoch": 0.8776035572197519, + "eval_loss": 1.7887755632400513, + "eval_runtime": 33.0265, + "eval_samples_per_second": 683.904, + "eval_steps_per_second": 5.359, + "step": 15000 + } + ], + "logging_steps": 10, + "max_steps": 51276, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.380120498266112e+17, + "train_batch_size": 128, + "trial_name": null, + "trial_params": null +}