diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,16597 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.995776135163675, + "eval_steps": 500, + "global_step": 2365, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0021119324181626186, + "grad_norm": 8.875621183840114, + "learning_rate": 3.3755274261603377e-07, + "loss": 1.8303, + "step": 1 + }, + { + "epoch": 0.004223864836325237, + "grad_norm": 8.855071534538828, + "learning_rate": 6.751054852320675e-07, + "loss": 1.8342, + "step": 2 + }, + { + "epoch": 0.006335797254487857, + "grad_norm": 8.800447854784716, + "learning_rate": 1.0126582278481013e-06, + "loss": 1.8528, + "step": 3 + }, + { + "epoch": 0.008447729672650475, + "grad_norm": 8.6759101289337, + "learning_rate": 1.350210970464135e-06, + "loss": 1.8386, + "step": 4 + }, + { + "epoch": 0.010559662090813094, + "grad_norm": 8.181714875885277, + "learning_rate": 1.6877637130801689e-06, + "loss": 1.8274, + "step": 5 + }, + { + "epoch": 0.012671594508975714, + "grad_norm": 8.013634134064644, + "learning_rate": 2.0253164556962026e-06, + "loss": 1.8286, + "step": 6 + }, + { + "epoch": 0.014783526927138331, + "grad_norm": 6.408224349834773, + "learning_rate": 2.3628691983122364e-06, + "loss": 1.7907, + "step": 7 + }, + { + "epoch": 0.01689545934530095, + "grad_norm": 5.926318476667063, + "learning_rate": 2.70042194092827e-06, + "loss": 1.7865, + "step": 8 + }, + { + "epoch": 0.01900739176346357, + "grad_norm": 3.397894558555903, + "learning_rate": 3.037974683544304e-06, + "loss": 1.7288, + "step": 9 + }, + { + "epoch": 0.021119324181626188, + "grad_norm": 2.876773270018399, + "learning_rate": 3.3755274261603377e-06, + "loss": 1.7315, + "step": 10 + }, + { + "epoch": 0.023231256599788808, + "grad_norm": 2.5798689449840646, + "learning_rate": 3.713080168776372e-06, + "loss": 1.705, + "step": 11 + }, + { + "epoch": 0.025343189017951427, + "grad_norm": 6.125711108707957, + "learning_rate": 4.050632911392405e-06, + "loss": 1.7221, + "step": 12 + }, + { + "epoch": 0.027455121436114043, + "grad_norm": 6.227969526346888, + "learning_rate": 4.3881856540084394e-06, + "loss": 1.7034, + "step": 13 + }, + { + "epoch": 0.029567053854276663, + "grad_norm": 5.734227495516121, + "learning_rate": 4.725738396624473e-06, + "loss": 1.7033, + "step": 14 + }, + { + "epoch": 0.03167898627243928, + "grad_norm": 4.730692132813407, + "learning_rate": 5.063291139240507e-06, + "loss": 1.6944, + "step": 15 + }, + { + "epoch": 0.0337909186906019, + "grad_norm": 3.625249929360866, + "learning_rate": 5.40084388185654e-06, + "loss": 1.6269, + "step": 16 + }, + { + "epoch": 0.03590285110876452, + "grad_norm": 3.0713059946821524, + "learning_rate": 5.7383966244725745e-06, + "loss": 1.6425, + "step": 17 + }, + { + "epoch": 0.03801478352692714, + "grad_norm": 2.346404475616461, + "learning_rate": 6.075949367088608e-06, + "loss": 1.5935, + "step": 18 + }, + { + "epoch": 0.04012671594508976, + "grad_norm": 1.6661572782681875, + "learning_rate": 6.413502109704642e-06, + "loss": 1.5977, + "step": 19 + }, + { + "epoch": 0.042238648363252376, + "grad_norm": 1.4717571971742118, + "learning_rate": 6.751054852320675e-06, + "loss": 1.5748, + "step": 20 + }, + { + "epoch": 0.044350580781414996, + "grad_norm": 1.5638427898277212, + "learning_rate": 7.08860759493671e-06, + "loss": 1.5797, + "step": 21 + }, + { + "epoch": 0.046462513199577615, + "grad_norm": 1.7473243626420212, + "learning_rate": 7.426160337552744e-06, + "loss": 1.5488, + "step": 22 + }, + { + "epoch": 0.048574445617740235, + "grad_norm": 1.748269807810292, + "learning_rate": 7.763713080168777e-06, + "loss": 1.5495, + "step": 23 + }, + { + "epoch": 0.050686378035902854, + "grad_norm": 1.3993867024408402, + "learning_rate": 8.10126582278481e-06, + "loss": 1.5263, + "step": 24 + }, + { + "epoch": 0.05279831045406547, + "grad_norm": 1.1713343109696195, + "learning_rate": 8.438818565400846e-06, + "loss": 1.5186, + "step": 25 + }, + { + "epoch": 0.054910242872228086, + "grad_norm": 1.1938543525159353, + "learning_rate": 8.776371308016879e-06, + "loss": 1.5195, + "step": 26 + }, + { + "epoch": 0.057022175290390706, + "grad_norm": 1.0324469466037556, + "learning_rate": 9.113924050632912e-06, + "loss": 1.5197, + "step": 27 + }, + { + "epoch": 0.059134107708553325, + "grad_norm": 0.9811498199425162, + "learning_rate": 9.451476793248946e-06, + "loss": 1.4907, + "step": 28 + }, + { + "epoch": 0.061246040126715945, + "grad_norm": 0.8676189326169521, + "learning_rate": 9.78902953586498e-06, + "loss": 1.4846, + "step": 29 + }, + { + "epoch": 0.06335797254487856, + "grad_norm": 0.8423360758117718, + "learning_rate": 1.0126582278481014e-05, + "loss": 1.4761, + "step": 30 + }, + { + "epoch": 0.06546990496304118, + "grad_norm": 0.762958358372631, + "learning_rate": 1.0464135021097049e-05, + "loss": 1.4939, + "step": 31 + }, + { + "epoch": 0.0675818373812038, + "grad_norm": 0.6823273951508456, + "learning_rate": 1.080168776371308e-05, + "loss": 1.4677, + "step": 32 + }, + { + "epoch": 0.06969376979936642, + "grad_norm": 0.7505346208372009, + "learning_rate": 1.1139240506329114e-05, + "loss": 1.4733, + "step": 33 + }, + { + "epoch": 0.07180570221752904, + "grad_norm": 0.6831593394469089, + "learning_rate": 1.1476793248945149e-05, + "loss": 1.4797, + "step": 34 + }, + { + "epoch": 0.07391763463569166, + "grad_norm": 0.6273264264650537, + "learning_rate": 1.1814345991561182e-05, + "loss": 1.4615, + "step": 35 + }, + { + "epoch": 0.07602956705385427, + "grad_norm": 0.6322401003010556, + "learning_rate": 1.2151898734177216e-05, + "loss": 1.448, + "step": 36 + }, + { + "epoch": 0.0781414994720169, + "grad_norm": 0.617104608977321, + "learning_rate": 1.248945147679325e-05, + "loss": 1.4474, + "step": 37 + }, + { + "epoch": 0.08025343189017951, + "grad_norm": 0.5448718323959989, + "learning_rate": 1.2827004219409284e-05, + "loss": 1.4574, + "step": 38 + }, + { + "epoch": 0.08236536430834214, + "grad_norm": 0.6262410871873982, + "learning_rate": 1.3164556962025317e-05, + "loss": 1.4304, + "step": 39 + }, + { + "epoch": 0.08447729672650475, + "grad_norm": 0.5549746387187513, + "learning_rate": 1.350210970464135e-05, + "loss": 1.4401, + "step": 40 + }, + { + "epoch": 0.08658922914466737, + "grad_norm": 0.5403542960312616, + "learning_rate": 1.3839662447257384e-05, + "loss": 1.4314, + "step": 41 + }, + { + "epoch": 0.08870116156282999, + "grad_norm": 0.5324851215134767, + "learning_rate": 1.417721518987342e-05, + "loss": 1.4624, + "step": 42 + }, + { + "epoch": 0.0908130939809926, + "grad_norm": 0.5095380199717499, + "learning_rate": 1.4514767932489453e-05, + "loss": 1.4481, + "step": 43 + }, + { + "epoch": 0.09292502639915523, + "grad_norm": 0.43362914045385265, + "learning_rate": 1.4852320675105488e-05, + "loss": 1.4363, + "step": 44 + }, + { + "epoch": 0.09503695881731784, + "grad_norm": 0.5628031755150383, + "learning_rate": 1.5189873417721521e-05, + "loss": 1.4314, + "step": 45 + }, + { + "epoch": 0.09714889123548047, + "grad_norm": 0.4505425762378718, + "learning_rate": 1.5527426160337554e-05, + "loss": 1.4301, + "step": 46 + }, + { + "epoch": 0.09926082365364308, + "grad_norm": 0.5919671158809601, + "learning_rate": 1.5864978902953586e-05, + "loss": 1.4075, + "step": 47 + }, + { + "epoch": 0.10137275607180571, + "grad_norm": 0.55646833694591, + "learning_rate": 1.620253164556962e-05, + "loss": 1.4314, + "step": 48 + }, + { + "epoch": 0.10348468848996832, + "grad_norm": 0.5724596847940949, + "learning_rate": 1.6540084388185656e-05, + "loss": 1.4345, + "step": 49 + }, + { + "epoch": 0.10559662090813093, + "grad_norm": 0.5193329867778335, + "learning_rate": 1.687763713080169e-05, + "loss": 1.4036, + "step": 50 + }, + { + "epoch": 0.10770855332629356, + "grad_norm": 0.586219029774943, + "learning_rate": 1.7215189873417723e-05, + "loss": 1.4078, + "step": 51 + }, + { + "epoch": 0.10982048574445617, + "grad_norm": 0.6639111028852875, + "learning_rate": 1.7552742616033758e-05, + "loss": 1.4258, + "step": 52 + }, + { + "epoch": 0.1119324181626188, + "grad_norm": 1.0168671981344457, + "learning_rate": 1.789029535864979e-05, + "loss": 1.4145, + "step": 53 + }, + { + "epoch": 0.11404435058078141, + "grad_norm": 1.3140770456516093, + "learning_rate": 1.8227848101265824e-05, + "loss": 1.4226, + "step": 54 + }, + { + "epoch": 0.11615628299894404, + "grad_norm": 0.961694236657747, + "learning_rate": 1.856540084388186e-05, + "loss": 1.4075, + "step": 55 + }, + { + "epoch": 0.11826821541710665, + "grad_norm": 1.085718199347348, + "learning_rate": 1.890295358649789e-05, + "loss": 1.4092, + "step": 56 + }, + { + "epoch": 0.12038014783526928, + "grad_norm": 1.1388345625989433, + "learning_rate": 1.9240506329113926e-05, + "loss": 1.3971, + "step": 57 + }, + { + "epoch": 0.12249208025343189, + "grad_norm": 0.9526207233157661, + "learning_rate": 1.957805907172996e-05, + "loss": 1.3927, + "step": 58 + }, + { + "epoch": 0.1246040126715945, + "grad_norm": 0.9079764534596166, + "learning_rate": 1.9915611814345993e-05, + "loss": 1.3886, + "step": 59 + }, + { + "epoch": 0.12671594508975711, + "grad_norm": 1.386865721327574, + "learning_rate": 2.0253164556962028e-05, + "loss": 1.4144, + "step": 60 + }, + { + "epoch": 0.12882787750791974, + "grad_norm": 0.7366271862062238, + "learning_rate": 2.0590717299578063e-05, + "loss": 1.4068, + "step": 61 + }, + { + "epoch": 0.13093980992608237, + "grad_norm": 1.0083572674534806, + "learning_rate": 2.0928270042194098e-05, + "loss": 1.39, + "step": 62 + }, + { + "epoch": 0.133051742344245, + "grad_norm": 1.7333989646284906, + "learning_rate": 2.1265822784810126e-05, + "loss": 1.4075, + "step": 63 + }, + { + "epoch": 0.1351636747624076, + "grad_norm": 0.6743386892916848, + "learning_rate": 2.160337552742616e-05, + "loss": 1.3832, + "step": 64 + }, + { + "epoch": 0.13727560718057022, + "grad_norm": 2.4510174691280047, + "learning_rate": 2.1940928270042193e-05, + "loss": 1.4017, + "step": 65 + }, + { + "epoch": 0.13938753959873285, + "grad_norm": 1.550393444240206, + "learning_rate": 2.2278481012658228e-05, + "loss": 1.4023, + "step": 66 + }, + { + "epoch": 0.14149947201689547, + "grad_norm": 2.5159931566538236, + "learning_rate": 2.2616033755274263e-05, + "loss": 1.3853, + "step": 67 + }, + { + "epoch": 0.14361140443505807, + "grad_norm": 2.721071354156489, + "learning_rate": 2.2953586497890298e-05, + "loss": 1.4249, + "step": 68 + }, + { + "epoch": 0.1457233368532207, + "grad_norm": 0.9669780013382273, + "learning_rate": 2.329113924050633e-05, + "loss": 1.395, + "step": 69 + }, + { + "epoch": 0.14783526927138332, + "grad_norm": 2.8548557980824265, + "learning_rate": 2.3628691983122365e-05, + "loss": 1.3965, + "step": 70 + }, + { + "epoch": 0.14994720168954592, + "grad_norm": 2.727611626815892, + "learning_rate": 2.39662447257384e-05, + "loss": 1.4209, + "step": 71 + }, + { + "epoch": 0.15205913410770855, + "grad_norm": 1.3838915008321806, + "learning_rate": 2.430379746835443e-05, + "loss": 1.402, + "step": 72 + }, + { + "epoch": 0.15417106652587118, + "grad_norm": 2.029519792420893, + "learning_rate": 2.4641350210970467e-05, + "loss": 1.4029, + "step": 73 + }, + { + "epoch": 0.1562829989440338, + "grad_norm": 2.0207434197609886, + "learning_rate": 2.49789029535865e-05, + "loss": 1.4202, + "step": 74 + }, + { + "epoch": 0.1583949313621964, + "grad_norm": 1.662021405461945, + "learning_rate": 2.5316455696202537e-05, + "loss": 1.3869, + "step": 75 + }, + { + "epoch": 0.16050686378035903, + "grad_norm": 1.765916824452665, + "learning_rate": 2.5654008438818568e-05, + "loss": 1.4078, + "step": 76 + }, + { + "epoch": 0.16261879619852165, + "grad_norm": 1.9777575579912845, + "learning_rate": 2.59915611814346e-05, + "loss": 1.4175, + "step": 77 + }, + { + "epoch": 0.16473072861668428, + "grad_norm": 1.3350714005443987, + "learning_rate": 2.6329113924050635e-05, + "loss": 1.3896, + "step": 78 + }, + { + "epoch": 0.16684266103484688, + "grad_norm": 1.4775525479032736, + "learning_rate": 2.6666666666666667e-05, + "loss": 1.3933, + "step": 79 + }, + { + "epoch": 0.1689545934530095, + "grad_norm": 1.5028927786904727, + "learning_rate": 2.70042194092827e-05, + "loss": 1.3782, + "step": 80 + }, + { + "epoch": 0.17106652587117213, + "grad_norm": 0.8528874592298704, + "learning_rate": 2.7341772151898737e-05, + "loss": 1.3772, + "step": 81 + }, + { + "epoch": 0.17317845828933473, + "grad_norm": 1.2548349577069091, + "learning_rate": 2.767932489451477e-05, + "loss": 1.3746, + "step": 82 + }, + { + "epoch": 0.17529039070749736, + "grad_norm": 1.4542442843016439, + "learning_rate": 2.8016877637130803e-05, + "loss": 1.3816, + "step": 83 + }, + { + "epoch": 0.17740232312565998, + "grad_norm": 1.7349916380092472, + "learning_rate": 2.835443037974684e-05, + "loss": 1.3731, + "step": 84 + }, + { + "epoch": 0.1795142555438226, + "grad_norm": 1.072734496962205, + "learning_rate": 2.8691983122362873e-05, + "loss": 1.3911, + "step": 85 + }, + { + "epoch": 0.1816261879619852, + "grad_norm": 1.2061487437038363, + "learning_rate": 2.9029535864978905e-05, + "loss": 1.3893, + "step": 86 + }, + { + "epoch": 0.18373812038014783, + "grad_norm": 1.3018792384648255, + "learning_rate": 2.936708860759494e-05, + "loss": 1.3732, + "step": 87 + }, + { + "epoch": 0.18585005279831046, + "grad_norm": 1.2142356827564, + "learning_rate": 2.9704641350210975e-05, + "loss": 1.3718, + "step": 88 + }, + { + "epoch": 0.18796198521647306, + "grad_norm": 0.9999566781186233, + "learning_rate": 3.0042194092827007e-05, + "loss": 1.3919, + "step": 89 + }, + { + "epoch": 0.1900739176346357, + "grad_norm": 1.5476669449347846, + "learning_rate": 3.0379746835443042e-05, + "loss": 1.3815, + "step": 90 + }, + { + "epoch": 0.1921858500527983, + "grad_norm": 1.7581489613600996, + "learning_rate": 3.0717299578059074e-05, + "loss": 1.3795, + "step": 91 + }, + { + "epoch": 0.19429778247096094, + "grad_norm": 0.9845281359526621, + "learning_rate": 3.105485232067511e-05, + "loss": 1.3792, + "step": 92 + }, + { + "epoch": 0.19640971488912354, + "grad_norm": 1.2758285817268824, + "learning_rate": 3.1392405063291144e-05, + "loss": 1.3747, + "step": 93 + }, + { + "epoch": 0.19852164730728616, + "grad_norm": 1.4227668442471215, + "learning_rate": 3.172995780590717e-05, + "loss": 1.3592, + "step": 94 + }, + { + "epoch": 0.2006335797254488, + "grad_norm": 1.5035048426426103, + "learning_rate": 3.206751054852321e-05, + "loss": 1.3786, + "step": 95 + }, + { + "epoch": 0.20274551214361142, + "grad_norm": 1.377023529421612, + "learning_rate": 3.240506329113924e-05, + "loss": 1.3591, + "step": 96 + }, + { + "epoch": 0.20485744456177402, + "grad_norm": 1.6036945428541038, + "learning_rate": 3.274261603375528e-05, + "loss": 1.388, + "step": 97 + }, + { + "epoch": 0.20696937697993664, + "grad_norm": 1.1607222404025872, + "learning_rate": 3.308016877637131e-05, + "loss": 1.3676, + "step": 98 + }, + { + "epoch": 0.20908130939809927, + "grad_norm": 1.7888255859761832, + "learning_rate": 3.341772151898735e-05, + "loss": 1.3675, + "step": 99 + }, + { + "epoch": 0.21119324181626187, + "grad_norm": 0.9855891037226862, + "learning_rate": 3.375527426160338e-05, + "loss": 1.3882, + "step": 100 + }, + { + "epoch": 0.2133051742344245, + "grad_norm": 2.054162678544088, + "learning_rate": 3.409282700421941e-05, + "loss": 1.3676, + "step": 101 + }, + { + "epoch": 0.21541710665258712, + "grad_norm": 0.9324932865471639, + "learning_rate": 3.4430379746835445e-05, + "loss": 1.3759, + "step": 102 + }, + { + "epoch": 0.21752903907074975, + "grad_norm": 1.250991969020926, + "learning_rate": 3.476793248945148e-05, + "loss": 1.3539, + "step": 103 + }, + { + "epoch": 0.21964097148891235, + "grad_norm": 2.1308042144139776, + "learning_rate": 3.5105485232067516e-05, + "loss": 1.3805, + "step": 104 + }, + { + "epoch": 0.22175290390707497, + "grad_norm": 1.4790813621786558, + "learning_rate": 3.5443037974683544e-05, + "loss": 1.3887, + "step": 105 + }, + { + "epoch": 0.2238648363252376, + "grad_norm": 1.632780171651607, + "learning_rate": 3.578059071729958e-05, + "loss": 1.3654, + "step": 106 + }, + { + "epoch": 0.22597676874340022, + "grad_norm": 2.215286012392554, + "learning_rate": 3.6118143459915614e-05, + "loss": 1.3671, + "step": 107 + }, + { + "epoch": 0.22808870116156282, + "grad_norm": 1.3500743200827587, + "learning_rate": 3.645569620253165e-05, + "loss": 1.3751, + "step": 108 + }, + { + "epoch": 0.23020063357972545, + "grad_norm": 2.8632694550325137, + "learning_rate": 3.6793248945147684e-05, + "loss": 1.3798, + "step": 109 + }, + { + "epoch": 0.23231256599788808, + "grad_norm": 2.047401721238556, + "learning_rate": 3.713080168776372e-05, + "loss": 1.3668, + "step": 110 + }, + { + "epoch": 0.23442449841605068, + "grad_norm": 3.4760968561138488, + "learning_rate": 3.746835443037975e-05, + "loss": 1.3794, + "step": 111 + }, + { + "epoch": 0.2365364308342133, + "grad_norm": 3.3182133675100425, + "learning_rate": 3.780590717299578e-05, + "loss": 1.3877, + "step": 112 + }, + { + "epoch": 0.23864836325237593, + "grad_norm": 1.5651469605126829, + "learning_rate": 3.814345991561182e-05, + "loss": 1.3844, + "step": 113 + }, + { + "epoch": 0.24076029567053855, + "grad_norm": 2.3595477331601558, + "learning_rate": 3.848101265822785e-05, + "loss": 1.3742, + "step": 114 + }, + { + "epoch": 0.24287222808870115, + "grad_norm": 1.655160862854714, + "learning_rate": 3.881856540084389e-05, + "loss": 1.3486, + "step": 115 + }, + { + "epoch": 0.24498416050686378, + "grad_norm": 1.755659565669462, + "learning_rate": 3.915611814345992e-05, + "loss": 1.3573, + "step": 116 + }, + { + "epoch": 0.2470960929250264, + "grad_norm": 1.9046064438937027, + "learning_rate": 3.949367088607596e-05, + "loss": 1.371, + "step": 117 + }, + { + "epoch": 0.249208025343189, + "grad_norm": 1.4844979054169858, + "learning_rate": 3.9831223628691986e-05, + "loss": 1.3732, + "step": 118 + }, + { + "epoch": 0.25131995776135163, + "grad_norm": 1.6986275953789156, + "learning_rate": 4.016877637130802e-05, + "loss": 1.3824, + "step": 119 + }, + { + "epoch": 0.25343189017951423, + "grad_norm": 1.534853186493837, + "learning_rate": 4.0506329113924056e-05, + "loss": 1.3813, + "step": 120 + }, + { + "epoch": 0.2555438225976769, + "grad_norm": 1.3350204438337796, + "learning_rate": 4.0843881856540084e-05, + "loss": 1.3651, + "step": 121 + }, + { + "epoch": 0.2576557550158395, + "grad_norm": 2.2899522850853975, + "learning_rate": 4.1181434599156126e-05, + "loss": 1.3617, + "step": 122 + }, + { + "epoch": 0.25976768743400214, + "grad_norm": 1.3866772333412911, + "learning_rate": 4.1518987341772154e-05, + "loss": 1.3793, + "step": 123 + }, + { + "epoch": 0.26187961985216474, + "grad_norm": 2.099440294813428, + "learning_rate": 4.1856540084388196e-05, + "loss": 1.3731, + "step": 124 + }, + { + "epoch": 0.26399155227032733, + "grad_norm": 2.15524186114956, + "learning_rate": 4.2194092827004224e-05, + "loss": 1.3483, + "step": 125 + }, + { + "epoch": 0.26610348468849, + "grad_norm": 1.7376672118493466, + "learning_rate": 4.253164556962025e-05, + "loss": 1.3632, + "step": 126 + }, + { + "epoch": 0.2682154171066526, + "grad_norm": 1.6171550309737637, + "learning_rate": 4.286919831223629e-05, + "loss": 1.3635, + "step": 127 + }, + { + "epoch": 0.2703273495248152, + "grad_norm": 2.188408016001274, + "learning_rate": 4.320675105485232e-05, + "loss": 1.3637, + "step": 128 + }, + { + "epoch": 0.27243928194297784, + "grad_norm": 1.194380414173649, + "learning_rate": 4.354430379746836e-05, + "loss": 1.3571, + "step": 129 + }, + { + "epoch": 0.27455121436114044, + "grad_norm": 2.472701659825226, + "learning_rate": 4.3881856540084386e-05, + "loss": 1.3785, + "step": 130 + }, + { + "epoch": 0.27666314677930304, + "grad_norm": 1.9463539367698568, + "learning_rate": 4.421940928270043e-05, + "loss": 1.3558, + "step": 131 + }, + { + "epoch": 0.2787750791974657, + "grad_norm": 1.7875083416475672, + "learning_rate": 4.4556962025316456e-05, + "loss": 1.3642, + "step": 132 + }, + { + "epoch": 0.2808870116156283, + "grad_norm": 1.7913617358156209, + "learning_rate": 4.48945147679325e-05, + "loss": 1.3414, + "step": 133 + }, + { + "epoch": 0.28299894403379094, + "grad_norm": 1.326629959577606, + "learning_rate": 4.5232067510548526e-05, + "loss": 1.3559, + "step": 134 + }, + { + "epoch": 0.28511087645195354, + "grad_norm": 1.371960133641923, + "learning_rate": 4.556962025316456e-05, + "loss": 1.3741, + "step": 135 + }, + { + "epoch": 0.28722280887011614, + "grad_norm": 1.88679941724459, + "learning_rate": 4.5907172995780596e-05, + "loss": 1.3665, + "step": 136 + }, + { + "epoch": 0.2893347412882788, + "grad_norm": 1.759298568489704, + "learning_rate": 4.624472573839663e-05, + "loss": 1.3875, + "step": 137 + }, + { + "epoch": 0.2914466737064414, + "grad_norm": 1.6088516337019414, + "learning_rate": 4.658227848101266e-05, + "loss": 1.3711, + "step": 138 + }, + { + "epoch": 0.293558606124604, + "grad_norm": 1.6611308563463725, + "learning_rate": 4.69198312236287e-05, + "loss": 1.3579, + "step": 139 + }, + { + "epoch": 0.29567053854276665, + "grad_norm": 1.6108060067643875, + "learning_rate": 4.725738396624473e-05, + "loss": 1.3613, + "step": 140 + }, + { + "epoch": 0.29778247096092925, + "grad_norm": 1.4557221091315724, + "learning_rate": 4.759493670886076e-05, + "loss": 1.369, + "step": 141 + }, + { + "epoch": 0.29989440337909185, + "grad_norm": 2.7547946832025274, + "learning_rate": 4.79324894514768e-05, + "loss": 1.3671, + "step": 142 + }, + { + "epoch": 0.3020063357972545, + "grad_norm": 1.497592604330233, + "learning_rate": 4.827004219409283e-05, + "loss": 1.3656, + "step": 143 + }, + { + "epoch": 0.3041182682154171, + "grad_norm": 2.1591028670246275, + "learning_rate": 4.860759493670886e-05, + "loss": 1.3491, + "step": 144 + }, + { + "epoch": 0.30623020063357975, + "grad_norm": 1.6147663931223117, + "learning_rate": 4.89451476793249e-05, + "loss": 1.3564, + "step": 145 + }, + { + "epoch": 0.30834213305174235, + "grad_norm": 2.0885522757355828, + "learning_rate": 4.928270042194093e-05, + "loss": 1.3687, + "step": 146 + }, + { + "epoch": 0.31045406546990495, + "grad_norm": 1.9744051177243023, + "learning_rate": 4.962025316455696e-05, + "loss": 1.3547, + "step": 147 + }, + { + "epoch": 0.3125659978880676, + "grad_norm": 1.8616527312452302, + "learning_rate": 4.9957805907173e-05, + "loss": 1.3532, + "step": 148 + }, + { + "epoch": 0.3146779303062302, + "grad_norm": 1.6028312511570681, + "learning_rate": 5.029535864978903e-05, + "loss": 1.3535, + "step": 149 + }, + { + "epoch": 0.3167898627243928, + "grad_norm": 1.7291810687501137, + "learning_rate": 5.063291139240507e-05, + "loss": 1.3506, + "step": 150 + }, + { + "epoch": 0.31890179514255546, + "grad_norm": 1.7325450760779524, + "learning_rate": 5.09704641350211e-05, + "loss": 1.3335, + "step": 151 + }, + { + "epoch": 0.32101372756071805, + "grad_norm": 2.6449972078089696, + "learning_rate": 5.1308016877637136e-05, + "loss": 1.3556, + "step": 152 + }, + { + "epoch": 0.32312565997888065, + "grad_norm": 1.3107962566902684, + "learning_rate": 5.164556962025317e-05, + "loss": 1.3556, + "step": 153 + }, + { + "epoch": 0.3252375923970433, + "grad_norm": 2.7905338803387676, + "learning_rate": 5.19831223628692e-05, + "loss": 1.3604, + "step": 154 + }, + { + "epoch": 0.3273495248152059, + "grad_norm": 2.170413165259546, + "learning_rate": 5.2320675105485235e-05, + "loss": 1.368, + "step": 155 + }, + { + "epoch": 0.32946145723336856, + "grad_norm": 1.774213241825499, + "learning_rate": 5.265822784810127e-05, + "loss": 1.3454, + "step": 156 + }, + { + "epoch": 0.33157338965153116, + "grad_norm": 1.9732890291104068, + "learning_rate": 5.2995780590717305e-05, + "loss": 1.3518, + "step": 157 + }, + { + "epoch": 0.33368532206969376, + "grad_norm": 1.434864878503445, + "learning_rate": 5.333333333333333e-05, + "loss": 1.3602, + "step": 158 + }, + { + "epoch": 0.3357972544878564, + "grad_norm": 2.5650305192709766, + "learning_rate": 5.3670886075949375e-05, + "loss": 1.3608, + "step": 159 + }, + { + "epoch": 0.337909186906019, + "grad_norm": 1.8233456796530711, + "learning_rate": 5.40084388185654e-05, + "loss": 1.3594, + "step": 160 + }, + { + "epoch": 0.3400211193241816, + "grad_norm": 1.5464772601160979, + "learning_rate": 5.434599156118144e-05, + "loss": 1.3421, + "step": 161 + }, + { + "epoch": 0.34213305174234426, + "grad_norm": 3.736497054418843, + "learning_rate": 5.468354430379747e-05, + "loss": 1.3557, + "step": 162 + }, + { + "epoch": 0.34424498416050686, + "grad_norm": 2.6809110884933336, + "learning_rate": 5.502109704641351e-05, + "loss": 1.3716, + "step": 163 + }, + { + "epoch": 0.34635691657866946, + "grad_norm": 3.2428407358178717, + "learning_rate": 5.535864978902954e-05, + "loss": 1.3614, + "step": 164 + }, + { + "epoch": 0.3484688489968321, + "grad_norm": 2.427263438574737, + "learning_rate": 5.569620253164558e-05, + "loss": 1.3459, + "step": 165 + }, + { + "epoch": 0.3505807814149947, + "grad_norm": 2.9513639147927884, + "learning_rate": 5.603375527426161e-05, + "loss": 1.3607, + "step": 166 + }, + { + "epoch": 0.3526927138331573, + "grad_norm": 1.7876620596482466, + "learning_rate": 5.637130801687765e-05, + "loss": 1.3596, + "step": 167 + }, + { + "epoch": 0.35480464625131997, + "grad_norm": 2.6465916807017016, + "learning_rate": 5.670886075949368e-05, + "loss": 1.3472, + "step": 168 + }, + { + "epoch": 0.35691657866948256, + "grad_norm": 1.597833531550829, + "learning_rate": 5.7046413502109705e-05, + "loss": 1.349, + "step": 169 + }, + { + "epoch": 0.3590285110876452, + "grad_norm": 2.3416997744760626, + "learning_rate": 5.738396624472575e-05, + "loss": 1.3573, + "step": 170 + }, + { + "epoch": 0.3611404435058078, + "grad_norm": 1.695333134138557, + "learning_rate": 5.7721518987341775e-05, + "loss": 1.3547, + "step": 171 + }, + { + "epoch": 0.3632523759239704, + "grad_norm": 2.219647614930492, + "learning_rate": 5.805907172995781e-05, + "loss": 1.3559, + "step": 172 + }, + { + "epoch": 0.36536430834213307, + "grad_norm": 1.9112908587349349, + "learning_rate": 5.839662447257384e-05, + "loss": 1.3378, + "step": 173 + }, + { + "epoch": 0.36747624076029567, + "grad_norm": 1.4631138068686715, + "learning_rate": 5.873417721518988e-05, + "loss": 1.3467, + "step": 174 + }, + { + "epoch": 0.36958817317845827, + "grad_norm": 1.979719081174038, + "learning_rate": 5.907172995780591e-05, + "loss": 1.3436, + "step": 175 + }, + { + "epoch": 0.3717001055966209, + "grad_norm": 1.2894848753750747, + "learning_rate": 5.940928270042195e-05, + "loss": 1.3493, + "step": 176 + }, + { + "epoch": 0.3738120380147835, + "grad_norm": 1.702060920632648, + "learning_rate": 5.974683544303798e-05, + "loss": 1.3614, + "step": 177 + }, + { + "epoch": 0.3759239704329461, + "grad_norm": 1.1509900827661992, + "learning_rate": 6.0084388185654014e-05, + "loss": 1.3568, + "step": 178 + }, + { + "epoch": 0.3780359028511088, + "grad_norm": 2.1190000714400274, + "learning_rate": 6.042194092827005e-05, + "loss": 1.3667, + "step": 179 + }, + { + "epoch": 0.3801478352692714, + "grad_norm": 1.674259001492176, + "learning_rate": 6.0759493670886084e-05, + "loss": 1.3686, + "step": 180 + }, + { + "epoch": 0.382259767687434, + "grad_norm": 1.156565811165061, + "learning_rate": 6.109704641350211e-05, + "loss": 1.3611, + "step": 181 + }, + { + "epoch": 0.3843717001055966, + "grad_norm": 1.7965164795880233, + "learning_rate": 6.143459915611815e-05, + "loss": 1.3382, + "step": 182 + }, + { + "epoch": 0.3864836325237592, + "grad_norm": 1.3529784654727606, + "learning_rate": 6.177215189873418e-05, + "loss": 1.3543, + "step": 183 + }, + { + "epoch": 0.3885955649419219, + "grad_norm": 2.0824291028247837, + "learning_rate": 6.210970464135022e-05, + "loss": 1.3754, + "step": 184 + }, + { + "epoch": 0.3907074973600845, + "grad_norm": 1.9634845490222723, + "learning_rate": 6.244725738396625e-05, + "loss": 1.3745, + "step": 185 + }, + { + "epoch": 0.3928194297782471, + "grad_norm": 1.4654900764984071, + "learning_rate": 6.278481012658229e-05, + "loss": 1.3587, + "step": 186 + }, + { + "epoch": 0.39493136219640973, + "grad_norm": 2.231609281209892, + "learning_rate": 6.312236286919832e-05, + "loss": 1.3525, + "step": 187 + }, + { + "epoch": 0.39704329461457233, + "grad_norm": 1.3611917446100434, + "learning_rate": 6.345991561181434e-05, + "loss": 1.353, + "step": 188 + }, + { + "epoch": 0.3991552270327349, + "grad_norm": 1.9950108641395976, + "learning_rate": 6.379746835443039e-05, + "loss": 1.3452, + "step": 189 + }, + { + "epoch": 0.4012671594508976, + "grad_norm": 1.2926417197410858, + "learning_rate": 6.413502109704641e-05, + "loss": 1.3494, + "step": 190 + }, + { + "epoch": 0.4033790918690602, + "grad_norm": 1.4533539244069502, + "learning_rate": 6.447257383966245e-05, + "loss": 1.3487, + "step": 191 + }, + { + "epoch": 0.40549102428722283, + "grad_norm": 1.2680444286572334, + "learning_rate": 6.481012658227848e-05, + "loss": 1.3452, + "step": 192 + }, + { + "epoch": 0.40760295670538543, + "grad_norm": 1.7951865267043807, + "learning_rate": 6.514767932489452e-05, + "loss": 1.3623, + "step": 193 + }, + { + "epoch": 0.40971488912354803, + "grad_norm": 1.7808683015898121, + "learning_rate": 6.548523206751055e-05, + "loss": 1.3558, + "step": 194 + }, + { + "epoch": 0.4118268215417107, + "grad_norm": 1.3892107648633607, + "learning_rate": 6.582278481012659e-05, + "loss": 1.3638, + "step": 195 + }, + { + "epoch": 0.4139387539598733, + "grad_norm": 1.8151073240323714, + "learning_rate": 6.616033755274262e-05, + "loss": 1.3605, + "step": 196 + }, + { + "epoch": 0.4160506863780359, + "grad_norm": 1.2847639978205745, + "learning_rate": 6.649789029535865e-05, + "loss": 1.3447, + "step": 197 + }, + { + "epoch": 0.41816261879619854, + "grad_norm": 1.8055087198836253, + "learning_rate": 6.68354430379747e-05, + "loss": 1.3519, + "step": 198 + }, + { + "epoch": 0.42027455121436114, + "grad_norm": 1.4356298103142857, + "learning_rate": 6.717299578059072e-05, + "loss": 1.3379, + "step": 199 + }, + { + "epoch": 0.42238648363252373, + "grad_norm": 1.5548017045814744, + "learning_rate": 6.751054852320676e-05, + "loss": 1.3409, + "step": 200 + }, + { + "epoch": 0.4244984160506864, + "grad_norm": 1.529499374192984, + "learning_rate": 6.784810126582279e-05, + "loss": 1.3404, + "step": 201 + }, + { + "epoch": 0.426610348468849, + "grad_norm": 1.1865906308075251, + "learning_rate": 6.818565400843882e-05, + "loss": 1.3403, + "step": 202 + }, + { + "epoch": 0.42872228088701164, + "grad_norm": 1.5507592339402745, + "learning_rate": 6.852320675105486e-05, + "loss": 1.3445, + "step": 203 + }, + { + "epoch": 0.43083421330517424, + "grad_norm": 1.723572793294543, + "learning_rate": 6.886075949367089e-05, + "loss": 1.3371, + "step": 204 + }, + { + "epoch": 0.43294614572333684, + "grad_norm": 1.8373052820190752, + "learning_rate": 6.919831223628693e-05, + "loss": 1.3548, + "step": 205 + }, + { + "epoch": 0.4350580781414995, + "grad_norm": 0.8562785954071918, + "learning_rate": 6.953586497890296e-05, + "loss": 1.3257, + "step": 206 + }, + { + "epoch": 0.4371700105596621, + "grad_norm": 1.384901225136291, + "learning_rate": 6.9873417721519e-05, + "loss": 1.345, + "step": 207 + }, + { + "epoch": 0.4392819429778247, + "grad_norm": 1.412421463638516, + "learning_rate": 7.021097046413503e-05, + "loss": 1.3513, + "step": 208 + }, + { + "epoch": 0.44139387539598735, + "grad_norm": 1.5418742575859916, + "learning_rate": 7.054852320675107e-05, + "loss": 1.346, + "step": 209 + }, + { + "epoch": 0.44350580781414994, + "grad_norm": 1.8275963237999167, + "learning_rate": 7.088607594936709e-05, + "loss": 1.3543, + "step": 210 + }, + { + "epoch": 0.44561774023231254, + "grad_norm": 1.2402715573469387, + "learning_rate": 7.122362869198312e-05, + "loss": 1.3534, + "step": 211 + }, + { + "epoch": 0.4477296726504752, + "grad_norm": 1.5966567800193525, + "learning_rate": 7.156118143459916e-05, + "loss": 1.3416, + "step": 212 + }, + { + "epoch": 0.4498416050686378, + "grad_norm": 1.8725837238307474, + "learning_rate": 7.189873417721519e-05, + "loss": 1.3449, + "step": 213 + }, + { + "epoch": 0.45195353748680045, + "grad_norm": 1.2774757600016737, + "learning_rate": 7.223628691983123e-05, + "loss": 1.3448, + "step": 214 + }, + { + "epoch": 0.45406546990496305, + "grad_norm": 2.0235045633637947, + "learning_rate": 7.257383966244726e-05, + "loss": 1.3428, + "step": 215 + }, + { + "epoch": 0.45617740232312565, + "grad_norm": 1.0251031709477145, + "learning_rate": 7.29113924050633e-05, + "loss": 1.3437, + "step": 216 + }, + { + "epoch": 0.4582893347412883, + "grad_norm": 2.245913622634812, + "learning_rate": 7.324894514767933e-05, + "loss": 1.3527, + "step": 217 + }, + { + "epoch": 0.4604012671594509, + "grad_norm": 1.3939520111538766, + "learning_rate": 7.358649789029537e-05, + "loss": 1.3476, + "step": 218 + }, + { + "epoch": 0.4625131995776135, + "grad_norm": 1.7796929422862167, + "learning_rate": 7.39240506329114e-05, + "loss": 1.3476, + "step": 219 + }, + { + "epoch": 0.46462513199577615, + "grad_norm": 1.4816818892215315, + "learning_rate": 7.426160337552744e-05, + "loss": 1.3616, + "step": 220 + }, + { + "epoch": 0.46673706441393875, + "grad_norm": 1.4378065943052305, + "learning_rate": 7.459915611814347e-05, + "loss": 1.3436, + "step": 221 + }, + { + "epoch": 0.46884899683210135, + "grad_norm": 1.3976215855892826, + "learning_rate": 7.49367088607595e-05, + "loss": 1.346, + "step": 222 + }, + { + "epoch": 0.470960929250264, + "grad_norm": 1.8869184041183378, + "learning_rate": 7.527426160337554e-05, + "loss": 1.3474, + "step": 223 + }, + { + "epoch": 0.4730728616684266, + "grad_norm": 1.621466880552275, + "learning_rate": 7.561181434599156e-05, + "loss": 1.3529, + "step": 224 + }, + { + "epoch": 0.4751847940865892, + "grad_norm": 0.898118934721064, + "learning_rate": 7.59493670886076e-05, + "loss": 1.3331, + "step": 225 + }, + { + "epoch": 0.47729672650475186, + "grad_norm": 1.6713209385522612, + "learning_rate": 7.628691983122363e-05, + "loss": 1.3355, + "step": 226 + }, + { + "epoch": 0.47940865892291445, + "grad_norm": 1.8933958617105142, + "learning_rate": 7.662447257383967e-05, + "loss": 1.3588, + "step": 227 + }, + { + "epoch": 0.4815205913410771, + "grad_norm": 1.1948885380609904, + "learning_rate": 7.69620253164557e-05, + "loss": 1.352, + "step": 228 + }, + { + "epoch": 0.4836325237592397, + "grad_norm": 1.8240549642563384, + "learning_rate": 7.729957805907174e-05, + "loss": 1.3434, + "step": 229 + }, + { + "epoch": 0.4857444561774023, + "grad_norm": 1.1287257039006235, + "learning_rate": 7.763713080168777e-05, + "loss": 1.3494, + "step": 230 + }, + { + "epoch": 0.48785638859556496, + "grad_norm": 2.645825140391552, + "learning_rate": 7.79746835443038e-05, + "loss": 1.3627, + "step": 231 + }, + { + "epoch": 0.48996832101372756, + "grad_norm": 1.7246408819208252, + "learning_rate": 7.831223628691984e-05, + "loss": 1.3665, + "step": 232 + }, + { + "epoch": 0.49208025343189016, + "grad_norm": 2.1948788649987803, + "learning_rate": 7.864978902953587e-05, + "loss": 1.362, + "step": 233 + }, + { + "epoch": 0.4941921858500528, + "grad_norm": 1.5174686333931684, + "learning_rate": 7.898734177215191e-05, + "loss": 1.3613, + "step": 234 + }, + { + "epoch": 0.4963041182682154, + "grad_norm": 1.7766243631709684, + "learning_rate": 7.932489451476794e-05, + "loss": 1.3515, + "step": 235 + }, + { + "epoch": 0.498416050686378, + "grad_norm": 1.3715344298002186, + "learning_rate": 7.966244725738397e-05, + "loss": 1.3573, + "step": 236 + }, + { + "epoch": 0.5005279831045406, + "grad_norm": 1.4069645390055168, + "learning_rate": 8e-05, + "loss": 1.3404, + "step": 237 + }, + { + "epoch": 0.5026399155227033, + "grad_norm": 1.085566236238456, + "learning_rate": 7.999995641004559e-05, + "loss": 1.3203, + "step": 238 + }, + { + "epoch": 0.5047518479408659, + "grad_norm": 1.8993048560272996, + "learning_rate": 7.999982564027732e-05, + "loss": 1.3595, + "step": 239 + }, + { + "epoch": 0.5068637803590285, + "grad_norm": 1.4237461004075913, + "learning_rate": 7.999960769098022e-05, + "loss": 1.3443, + "step": 240 + }, + { + "epoch": 0.5089757127771911, + "grad_norm": 1.2947805963655645, + "learning_rate": 7.999930256262932e-05, + "loss": 1.3323, + "step": 241 + }, + { + "epoch": 0.5110876451953538, + "grad_norm": 1.137586440209036, + "learning_rate": 7.999891025588963e-05, + "loss": 1.3494, + "step": 242 + }, + { + "epoch": 0.5131995776135164, + "grad_norm": 2.2047443415471646, + "learning_rate": 7.999843077161619e-05, + "loss": 1.3379, + "step": 243 + }, + { + "epoch": 0.515311510031679, + "grad_norm": 1.5248734511312991, + "learning_rate": 7.999786411085404e-05, + "loss": 1.3573, + "step": 244 + }, + { + "epoch": 0.5174234424498416, + "grad_norm": 1.4748633787034717, + "learning_rate": 7.999721027483819e-05, + "loss": 1.3424, + "step": 245 + }, + { + "epoch": 0.5195353748680043, + "grad_norm": 1.8179065827383007, + "learning_rate": 7.999646926499371e-05, + "loss": 1.3403, + "step": 246 + }, + { + "epoch": 0.5216473072861668, + "grad_norm": 1.1508883748717864, + "learning_rate": 7.999564108293562e-05, + "loss": 1.3414, + "step": 247 + }, + { + "epoch": 0.5237592397043295, + "grad_norm": 2.0262867353937875, + "learning_rate": 7.999472573046892e-05, + "loss": 1.337, + "step": 248 + }, + { + "epoch": 0.5258711721224921, + "grad_norm": 1.4720882119095748, + "learning_rate": 7.999372320958863e-05, + "loss": 1.3692, + "step": 249 + }, + { + "epoch": 0.5279831045406547, + "grad_norm": 1.6324958515053671, + "learning_rate": 7.999263352247976e-05, + "loss": 1.3433, + "step": 250 + }, + { + "epoch": 0.5300950369588173, + "grad_norm": 1.457587140332806, + "learning_rate": 7.999145667151725e-05, + "loss": 1.3496, + "step": 251 + }, + { + "epoch": 0.53220696937698, + "grad_norm": 1.53961911453376, + "learning_rate": 7.999019265926606e-05, + "loss": 1.3494, + "step": 252 + }, + { + "epoch": 0.5343189017951425, + "grad_norm": 0.9669777771177444, + "learning_rate": 7.99888414884811e-05, + "loss": 1.3477, + "step": 253 + }, + { + "epoch": 0.5364308342133052, + "grad_norm": 1.9272341586869124, + "learning_rate": 7.998740316210726e-05, + "loss": 1.3525, + "step": 254 + }, + { + "epoch": 0.5385427666314678, + "grad_norm": 1.0621888989613304, + "learning_rate": 7.998587768327933e-05, + "loss": 1.3439, + "step": 255 + }, + { + "epoch": 0.5406546990496304, + "grad_norm": 1.977594063303179, + "learning_rate": 7.998426505532213e-05, + "loss": 1.3478, + "step": 256 + }, + { + "epoch": 0.542766631467793, + "grad_norm": 1.587097878811253, + "learning_rate": 7.998256528175034e-05, + "loss": 1.3649, + "step": 257 + }, + { + "epoch": 0.5448785638859557, + "grad_norm": 1.5214904479004552, + "learning_rate": 7.998077836626865e-05, + "loss": 1.3627, + "step": 258 + }, + { + "epoch": 0.5469904963041182, + "grad_norm": 1.2970335087623621, + "learning_rate": 7.997890431277161e-05, + "loss": 1.3525, + "step": 259 + }, + { + "epoch": 0.5491024287222809, + "grad_norm": 0.897791991066211, + "learning_rate": 7.997694312534374e-05, + "loss": 1.3254, + "step": 260 + }, + { + "epoch": 0.5512143611404435, + "grad_norm": 1.1834532927483181, + "learning_rate": 7.997489480825941e-05, + "loss": 1.3291, + "step": 261 + }, + { + "epoch": 0.5533262935586061, + "grad_norm": 1.6297150554515962, + "learning_rate": 7.997275936598295e-05, + "loss": 1.3357, + "step": 262 + }, + { + "epoch": 0.5554382259767687, + "grad_norm": 1.3051228253368348, + "learning_rate": 7.997053680316855e-05, + "loss": 1.3313, + "step": 263 + }, + { + "epoch": 0.5575501583949314, + "grad_norm": 1.0342902681361341, + "learning_rate": 7.996822712466026e-05, + "loss": 1.3433, + "step": 264 + }, + { + "epoch": 0.5596620908130939, + "grad_norm": 1.2837864979313294, + "learning_rate": 7.996583033549204e-05, + "loss": 1.3246, + "step": 265 + }, + { + "epoch": 0.5617740232312566, + "grad_norm": 1.4935382914062, + "learning_rate": 7.996334644088769e-05, + "loss": 1.3533, + "step": 266 + }, + { + "epoch": 0.5638859556494192, + "grad_norm": 1.2390189770091729, + "learning_rate": 7.996077544626083e-05, + "loss": 1.3249, + "step": 267 + }, + { + "epoch": 0.5659978880675819, + "grad_norm": 1.2330031320856942, + "learning_rate": 7.995811735721495e-05, + "loss": 1.3124, + "step": 268 + }, + { + "epoch": 0.5681098204857444, + "grad_norm": 1.158941936934339, + "learning_rate": 7.995537217954336e-05, + "loss": 1.3293, + "step": 269 + }, + { + "epoch": 0.5702217529039071, + "grad_norm": 1.4262056232096023, + "learning_rate": 7.995253991922914e-05, + "loss": 1.3378, + "step": 270 + }, + { + "epoch": 0.5723336853220697, + "grad_norm": 1.3593425661210137, + "learning_rate": 7.994962058244522e-05, + "loss": 1.3405, + "step": 271 + }, + { + "epoch": 0.5744456177402323, + "grad_norm": 0.9927410956602071, + "learning_rate": 7.994661417555428e-05, + "loss": 1.3321, + "step": 272 + }, + { + "epoch": 0.5765575501583949, + "grad_norm": 1.2580315449566972, + "learning_rate": 7.994352070510878e-05, + "loss": 1.3307, + "step": 273 + }, + { + "epoch": 0.5786694825765576, + "grad_norm": 0.8137936386866628, + "learning_rate": 7.994034017785092e-05, + "loss": 1.3307, + "step": 274 + }, + { + "epoch": 0.5807814149947201, + "grad_norm": 1.3953512693634094, + "learning_rate": 7.993707260071268e-05, + "loss": 1.3162, + "step": 275 + }, + { + "epoch": 0.5828933474128828, + "grad_norm": 1.3075720794030763, + "learning_rate": 7.993371798081569e-05, + "loss": 1.351, + "step": 276 + }, + { + "epoch": 0.5850052798310454, + "grad_norm": 1.0488753129747648, + "learning_rate": 7.993027632547138e-05, + "loss": 1.3182, + "step": 277 + }, + { + "epoch": 0.587117212249208, + "grad_norm": 1.3104635926791215, + "learning_rate": 7.992674764218081e-05, + "loss": 1.3202, + "step": 278 + }, + { + "epoch": 0.5892291446673706, + "grad_norm": 1.1247808916912587, + "learning_rate": 7.992313193863473e-05, + "loss": 1.3216, + "step": 279 + }, + { + "epoch": 0.5913410770855333, + "grad_norm": 1.4263669759525615, + "learning_rate": 7.991942922271358e-05, + "loss": 1.332, + "step": 280 + }, + { + "epoch": 0.5934530095036958, + "grad_norm": 0.8896507999525592, + "learning_rate": 7.99156395024874e-05, + "loss": 1.3274, + "step": 281 + }, + { + "epoch": 0.5955649419218585, + "grad_norm": 1.1490109722846542, + "learning_rate": 7.991176278621589e-05, + "loss": 1.3332, + "step": 282 + }, + { + "epoch": 0.5976768743400211, + "grad_norm": 1.3005454613101521, + "learning_rate": 7.990779908234835e-05, + "loss": 1.3217, + "step": 283 + }, + { + "epoch": 0.5997888067581837, + "grad_norm": 1.0906476060379882, + "learning_rate": 7.990374839952364e-05, + "loss": 1.3423, + "step": 284 + }, + { + "epoch": 0.6019007391763463, + "grad_norm": 1.437631905830119, + "learning_rate": 7.989961074657024e-05, + "loss": 1.3391, + "step": 285 + }, + { + "epoch": 0.604012671594509, + "grad_norm": 0.7437858110971602, + "learning_rate": 7.989538613250613e-05, + "loss": 1.3138, + "step": 286 + }, + { + "epoch": 0.6061246040126715, + "grad_norm": 0.9507644324103506, + "learning_rate": 7.989107456653888e-05, + "loss": 1.314, + "step": 287 + }, + { + "epoch": 0.6082365364308342, + "grad_norm": 1.0515642748560732, + "learning_rate": 7.988667605806549e-05, + "loss": 1.3426, + "step": 288 + }, + { + "epoch": 0.6103484688489969, + "grad_norm": 1.4584764910436843, + "learning_rate": 7.988219061667254e-05, + "loss": 1.3379, + "step": 289 + }, + { + "epoch": 0.6124604012671595, + "grad_norm": 1.121826568394146, + "learning_rate": 7.987761825213603e-05, + "loss": 1.3091, + "step": 290 + }, + { + "epoch": 0.614572333685322, + "grad_norm": 1.267429296243072, + "learning_rate": 7.987295897442141e-05, + "loss": 1.3236, + "step": 291 + }, + { + "epoch": 0.6166842661034847, + "grad_norm": 0.8738313693099367, + "learning_rate": 7.986821279368358e-05, + "loss": 1.3098, + "step": 292 + }, + { + "epoch": 0.6187961985216474, + "grad_norm": 1.2382288951800307, + "learning_rate": 7.98633797202668e-05, + "loss": 1.3426, + "step": 293 + }, + { + "epoch": 0.6209081309398099, + "grad_norm": 0.8862458929735162, + "learning_rate": 7.985845976470478e-05, + "loss": 1.3214, + "step": 294 + }, + { + "epoch": 0.6230200633579726, + "grad_norm": 1.626546679668042, + "learning_rate": 7.985345293772052e-05, + "loss": 1.3179, + "step": 295 + }, + { + "epoch": 0.6251319957761352, + "grad_norm": 0.8225349354088443, + "learning_rate": 7.984835925022641e-05, + "loss": 1.3394, + "step": 296 + }, + { + "epoch": 0.6272439281942978, + "grad_norm": 1.1643126255885599, + "learning_rate": 7.984317871332412e-05, + "loss": 1.3384, + "step": 297 + }, + { + "epoch": 0.6293558606124604, + "grad_norm": 1.1215919913147396, + "learning_rate": 7.983791133830463e-05, + "loss": 1.3166, + "step": 298 + }, + { + "epoch": 0.6314677930306231, + "grad_norm": 1.1530928795594722, + "learning_rate": 7.983255713664814e-05, + "loss": 1.3518, + "step": 299 + }, + { + "epoch": 0.6335797254487856, + "grad_norm": 1.5100853191136279, + "learning_rate": 7.982711612002415e-05, + "loss": 1.3381, + "step": 300 + }, + { + "epoch": 0.6356916578669483, + "grad_norm": 0.8940577303185709, + "learning_rate": 7.982158830029136e-05, + "loss": 1.3292, + "step": 301 + }, + { + "epoch": 0.6378035902851109, + "grad_norm": 1.5289012707181917, + "learning_rate": 7.98159736894976e-05, + "loss": 1.3314, + "step": 302 + }, + { + "epoch": 0.6399155227032735, + "grad_norm": 0.9787740693552307, + "learning_rate": 7.98102722998799e-05, + "loss": 1.3097, + "step": 303 + }, + { + "epoch": 0.6420274551214361, + "grad_norm": 1.8462467587502853, + "learning_rate": 7.980448414386447e-05, + "loss": 1.3373, + "step": 304 + }, + { + "epoch": 0.6441393875395988, + "grad_norm": 1.0735168572959468, + "learning_rate": 7.979860923406654e-05, + "loss": 1.3389, + "step": 305 + }, + { + "epoch": 0.6462513199577613, + "grad_norm": 2.153463792780476, + "learning_rate": 7.979264758329048e-05, + "loss": 1.3531, + "step": 306 + }, + { + "epoch": 0.648363252375924, + "grad_norm": 1.5775546641299552, + "learning_rate": 7.978659920452972e-05, + "loss": 1.3381, + "step": 307 + }, + { + "epoch": 0.6504751847940866, + "grad_norm": 1.6347089367568972, + "learning_rate": 7.978046411096663e-05, + "loss": 1.3361, + "step": 308 + }, + { + "epoch": 0.6525871172122492, + "grad_norm": 1.1760326019474914, + "learning_rate": 7.977424231597268e-05, + "loss": 1.3291, + "step": 309 + }, + { + "epoch": 0.6546990496304118, + "grad_norm": 1.2087769881072643, + "learning_rate": 7.976793383310822e-05, + "loss": 1.3215, + "step": 310 + }, + { + "epoch": 0.6568109820485745, + "grad_norm": 1.1465506183563698, + "learning_rate": 7.976153867612261e-05, + "loss": 1.3313, + "step": 311 + }, + { + "epoch": 0.6589229144667371, + "grad_norm": 1.1144693888088653, + "learning_rate": 7.97550568589541e-05, + "loss": 1.3353, + "step": 312 + }, + { + "epoch": 0.6610348468848997, + "grad_norm": 1.0405130402164826, + "learning_rate": 7.974848839572971e-05, + "loss": 1.3416, + "step": 313 + }, + { + "epoch": 0.6631467793030623, + "grad_norm": 1.1072724908197504, + "learning_rate": 7.974183330076546e-05, + "loss": 1.3464, + "step": 314 + }, + { + "epoch": 0.665258711721225, + "grad_norm": 1.1425298680962128, + "learning_rate": 7.97350915885661e-05, + "loss": 1.3338, + "step": 315 + }, + { + "epoch": 0.6673706441393875, + "grad_norm": 1.0416275787794114, + "learning_rate": 7.972826327382516e-05, + "loss": 1.306, + "step": 316 + }, + { + "epoch": 0.6694825765575502, + "grad_norm": 0.6950478210848091, + "learning_rate": 7.972134837142497e-05, + "loss": 1.3257, + "step": 317 + }, + { + "epoch": 0.6715945089757128, + "grad_norm": 0.8497405788804049, + "learning_rate": 7.971434689643652e-05, + "loss": 1.3294, + "step": 318 + }, + { + "epoch": 0.6737064413938754, + "grad_norm": 1.1725518441789529, + "learning_rate": 7.97072588641195e-05, + "loss": 1.3337, + "step": 319 + }, + { + "epoch": 0.675818373812038, + "grad_norm": 1.0576372140999108, + "learning_rate": 7.970008428992228e-05, + "loss": 1.3382, + "step": 320 + }, + { + "epoch": 0.6779303062302007, + "grad_norm": 0.9405243392016331, + "learning_rate": 7.96928231894818e-05, + "loss": 1.3189, + "step": 321 + }, + { + "epoch": 0.6800422386483632, + "grad_norm": 1.1108802232453396, + "learning_rate": 7.968547557862366e-05, + "loss": 1.337, + "step": 322 + }, + { + "epoch": 0.6821541710665259, + "grad_norm": 1.1233220118818836, + "learning_rate": 7.967804147336191e-05, + "loss": 1.3251, + "step": 323 + }, + { + "epoch": 0.6842661034846885, + "grad_norm": 0.9357414115659357, + "learning_rate": 7.967052088989921e-05, + "loss": 1.3157, + "step": 324 + }, + { + "epoch": 0.6863780359028511, + "grad_norm": 1.281072933350799, + "learning_rate": 7.966291384462664e-05, + "loss": 1.315, + "step": 325 + }, + { + "epoch": 0.6884899683210137, + "grad_norm": 0.930197305038493, + "learning_rate": 7.965522035412371e-05, + "loss": 1.3234, + "step": 326 + }, + { + "epoch": 0.6906019007391764, + "grad_norm": 0.7776528836323324, + "learning_rate": 7.964744043515839e-05, + "loss": 1.3292, + "step": 327 + }, + { + "epoch": 0.6927138331573389, + "grad_norm": 0.8116549984014394, + "learning_rate": 7.963957410468698e-05, + "loss": 1.3169, + "step": 328 + }, + { + "epoch": 0.6948257655755016, + "grad_norm": 1.1294044308286046, + "learning_rate": 7.963162137985416e-05, + "loss": 1.3194, + "step": 329 + }, + { + "epoch": 0.6969376979936642, + "grad_norm": 1.0663867769679873, + "learning_rate": 7.962358227799286e-05, + "loss": 1.3069, + "step": 330 + }, + { + "epoch": 0.6990496304118268, + "grad_norm": 1.002884479012273, + "learning_rate": 7.961545681662427e-05, + "loss": 1.3195, + "step": 331 + }, + { + "epoch": 0.7011615628299894, + "grad_norm": 1.136425663647768, + "learning_rate": 7.960724501345783e-05, + "loss": 1.3289, + "step": 332 + }, + { + "epoch": 0.7032734952481521, + "grad_norm": 1.0081136508005393, + "learning_rate": 7.959894688639115e-05, + "loss": 1.3184, + "step": 333 + }, + { + "epoch": 0.7053854276663146, + "grad_norm": 0.9692736799962353, + "learning_rate": 7.959056245350996e-05, + "loss": 1.3273, + "step": 334 + }, + { + "epoch": 0.7074973600844773, + "grad_norm": 0.829905875378608, + "learning_rate": 7.958209173308811e-05, + "loss": 1.3203, + "step": 335 + }, + { + "epoch": 0.7096092925026399, + "grad_norm": 0.7947625433996741, + "learning_rate": 7.957353474358755e-05, + "loss": 1.3155, + "step": 336 + }, + { + "epoch": 0.7117212249208026, + "grad_norm": 1.0638389249689606, + "learning_rate": 7.95648915036582e-05, + "loss": 1.3259, + "step": 337 + }, + { + "epoch": 0.7138331573389651, + "grad_norm": 0.884541331674712, + "learning_rate": 7.955616203213797e-05, + "loss": 1.3024, + "step": 338 + }, + { + "epoch": 0.7159450897571278, + "grad_norm": 0.7171795600348532, + "learning_rate": 7.954734634805274e-05, + "loss": 1.3048, + "step": 339 + }, + { + "epoch": 0.7180570221752904, + "grad_norm": 0.7598542130206587, + "learning_rate": 7.953844447061626e-05, + "loss": 1.3098, + "step": 340 + }, + { + "epoch": 0.720168954593453, + "grad_norm": 0.956331105232553, + "learning_rate": 7.952945641923015e-05, + "loss": 1.3117, + "step": 341 + }, + { + "epoch": 0.7222808870116156, + "grad_norm": 1.4039441824297616, + "learning_rate": 7.952038221348387e-05, + "loss": 1.3225, + "step": 342 + }, + { + "epoch": 0.7243928194297783, + "grad_norm": 0.5070644157300858, + "learning_rate": 7.951122187315462e-05, + "loss": 1.317, + "step": 343 + }, + { + "epoch": 0.7265047518479408, + "grad_norm": 0.8698898731850847, + "learning_rate": 7.950197541820732e-05, + "loss": 1.3085, + "step": 344 + }, + { + "epoch": 0.7286166842661035, + "grad_norm": 1.4960738321606875, + "learning_rate": 7.949264286879462e-05, + "loss": 1.2978, + "step": 345 + }, + { + "epoch": 0.7307286166842661, + "grad_norm": 0.480220839945915, + "learning_rate": 7.948322424525679e-05, + "loss": 1.3184, + "step": 346 + }, + { + "epoch": 0.7328405491024287, + "grad_norm": 1.1475311961104322, + "learning_rate": 7.94737195681217e-05, + "loss": 1.3329, + "step": 347 + }, + { + "epoch": 0.7349524815205913, + "grad_norm": 1.3346418551409345, + "learning_rate": 7.946412885810475e-05, + "loss": 1.3007, + "step": 348 + }, + { + "epoch": 0.737064413938754, + "grad_norm": 0.6306070419857764, + "learning_rate": 7.94544521361089e-05, + "loss": 1.3077, + "step": 349 + }, + { + "epoch": 0.7391763463569165, + "grad_norm": 1.1394318754084258, + "learning_rate": 7.944468942322453e-05, + "loss": 1.3114, + "step": 350 + }, + { + "epoch": 0.7412882787750792, + "grad_norm": 1.0772937795373183, + "learning_rate": 7.943484074072943e-05, + "loss": 1.3095, + "step": 351 + }, + { + "epoch": 0.7434002111932418, + "grad_norm": 0.7063316376193721, + "learning_rate": 7.942490611008882e-05, + "loss": 1.3282, + "step": 352 + }, + { + "epoch": 0.7455121436114044, + "grad_norm": 0.8856002168663664, + "learning_rate": 7.94148855529552e-05, + "loss": 1.3184, + "step": 353 + }, + { + "epoch": 0.747624076029567, + "grad_norm": 0.705597029269222, + "learning_rate": 7.94047790911683e-05, + "loss": 1.3194, + "step": 354 + }, + { + "epoch": 0.7497360084477297, + "grad_norm": 0.5877708260776283, + "learning_rate": 7.939458674675519e-05, + "loss": 1.3061, + "step": 355 + }, + { + "epoch": 0.7518479408658922, + "grad_norm": 0.7230795149715871, + "learning_rate": 7.938430854193005e-05, + "loss": 1.3097, + "step": 356 + }, + { + "epoch": 0.7539598732840549, + "grad_norm": 0.6046032201199484, + "learning_rate": 7.937394449909418e-05, + "loss": 1.313, + "step": 357 + }, + { + "epoch": 0.7560718057022175, + "grad_norm": 0.588798182291123, + "learning_rate": 7.9363494640836e-05, + "loss": 1.3051, + "step": 358 + }, + { + "epoch": 0.7581837381203802, + "grad_norm": 0.4895404918180025, + "learning_rate": 7.935295898993096e-05, + "loss": 1.3374, + "step": 359 + }, + { + "epoch": 0.7602956705385427, + "grad_norm": 0.5988336111768012, + "learning_rate": 7.934233756934147e-05, + "loss": 1.3323, + "step": 360 + }, + { + "epoch": 0.7624076029567054, + "grad_norm": 1.0610912030892126, + "learning_rate": 7.933163040221692e-05, + "loss": 1.3261, + "step": 361 + }, + { + "epoch": 0.764519535374868, + "grad_norm": 0.859581434972512, + "learning_rate": 7.932083751189352e-05, + "loss": 1.3096, + "step": 362 + }, + { + "epoch": 0.7666314677930306, + "grad_norm": 0.9567146686397326, + "learning_rate": 7.930995892189439e-05, + "loss": 1.31, + "step": 363 + }, + { + "epoch": 0.7687434002111933, + "grad_norm": 1.110818168894519, + "learning_rate": 7.929899465592934e-05, + "loss": 1.3488, + "step": 364 + }, + { + "epoch": 0.7708553326293559, + "grad_norm": 0.9334702635814248, + "learning_rate": 7.928794473789503e-05, + "loss": 1.3248, + "step": 365 + }, + { + "epoch": 0.7729672650475184, + "grad_norm": 0.5876351161718532, + "learning_rate": 7.927680919187467e-05, + "loss": 1.292, + "step": 366 + }, + { + "epoch": 0.7750791974656811, + "grad_norm": 0.9549019053489745, + "learning_rate": 7.926558804213818e-05, + "loss": 1.3229, + "step": 367 + }, + { + "epoch": 0.7771911298838438, + "grad_norm": 0.854133454304937, + "learning_rate": 7.925428131314205e-05, + "loss": 1.3659, + "step": 368 + }, + { + "epoch": 0.7793030623020063, + "grad_norm": 0.8941238133570847, + "learning_rate": 7.924288902952925e-05, + "loss": 1.3196, + "step": 369 + }, + { + "epoch": 0.781414994720169, + "grad_norm": 1.1432662439554775, + "learning_rate": 7.923141121612922e-05, + "loss": 1.3334, + "step": 370 + }, + { + "epoch": 0.7835269271383316, + "grad_norm": 1.1109659683264685, + "learning_rate": 7.921984789795787e-05, + "loss": 1.3504, + "step": 371 + }, + { + "epoch": 0.7856388595564942, + "grad_norm": 1.1104292140447694, + "learning_rate": 7.920819910021738e-05, + "loss": 1.3354, + "step": 372 + }, + { + "epoch": 0.7877507919746568, + "grad_norm": 1.0677731456882777, + "learning_rate": 7.919646484829631e-05, + "loss": 1.3293, + "step": 373 + }, + { + "epoch": 0.7898627243928195, + "grad_norm": 0.8959145912849347, + "learning_rate": 7.918464516776943e-05, + "loss": 1.3217, + "step": 374 + }, + { + "epoch": 0.791974656810982, + "grad_norm": 0.8521020471331988, + "learning_rate": 7.91727400843977e-05, + "loss": 1.323, + "step": 375 + }, + { + "epoch": 0.7940865892291447, + "grad_norm": 1.1887432569059493, + "learning_rate": 7.916074962412823e-05, + "loss": 1.3244, + "step": 376 + }, + { + "epoch": 0.7961985216473073, + "grad_norm": 0.9053663035450702, + "learning_rate": 7.914867381309418e-05, + "loss": 1.3204, + "step": 377 + }, + { + "epoch": 0.7983104540654699, + "grad_norm": 0.6255051036391813, + "learning_rate": 7.913651267761478e-05, + "loss": 1.3171, + "step": 378 + }, + { + "epoch": 0.8004223864836325, + "grad_norm": 0.6586883343133585, + "learning_rate": 7.912426624419518e-05, + "loss": 1.3131, + "step": 379 + }, + { + "epoch": 0.8025343189017952, + "grad_norm": 0.6411875182984956, + "learning_rate": 7.911193453952646e-05, + "loss": 1.3231, + "step": 380 + }, + { + "epoch": 0.8046462513199577, + "grad_norm": 0.7071556004236775, + "learning_rate": 7.909951759048555e-05, + "loss": 1.3118, + "step": 381 + }, + { + "epoch": 0.8067581837381204, + "grad_norm": 0.7235283164285223, + "learning_rate": 7.908701542413515e-05, + "loss": 1.3049, + "step": 382 + }, + { + "epoch": 0.808870116156283, + "grad_norm": 0.8849669606315279, + "learning_rate": 7.90744280677237e-05, + "loss": 1.3331, + "step": 383 + }, + { + "epoch": 0.8109820485744457, + "grad_norm": 1.083862082794503, + "learning_rate": 7.906175554868531e-05, + "loss": 1.3168, + "step": 384 + }, + { + "epoch": 0.8130939809926082, + "grad_norm": 0.8107664479933251, + "learning_rate": 7.904899789463974e-05, + "loss": 1.3024, + "step": 385 + }, + { + "epoch": 0.8152059134107709, + "grad_norm": 0.8118663560165448, + "learning_rate": 7.903615513339224e-05, + "loss": 1.2967, + "step": 386 + }, + { + "epoch": 0.8173178458289335, + "grad_norm": 0.7769174744639543, + "learning_rate": 7.902322729293357e-05, + "loss": 1.3132, + "step": 387 + }, + { + "epoch": 0.8194297782470961, + "grad_norm": 0.6750894531330086, + "learning_rate": 7.901021440143995e-05, + "loss": 1.3053, + "step": 388 + }, + { + "epoch": 0.8215417106652587, + "grad_norm": 0.6654628219195626, + "learning_rate": 7.899711648727294e-05, + "loss": 1.3412, + "step": 389 + }, + { + "epoch": 0.8236536430834214, + "grad_norm": 0.580587918734637, + "learning_rate": 7.898393357897942e-05, + "loss": 1.326, + "step": 390 + }, + { + "epoch": 0.8257655755015839, + "grad_norm": 0.44881166020807506, + "learning_rate": 7.897066570529151e-05, + "loss": 1.3304, + "step": 391 + }, + { + "epoch": 0.8278775079197466, + "grad_norm": 0.8451040367703937, + "learning_rate": 7.895731289512649e-05, + "loss": 1.3154, + "step": 392 + }, + { + "epoch": 0.8299894403379092, + "grad_norm": 0.5621690877389763, + "learning_rate": 7.894387517758679e-05, + "loss": 1.3212, + "step": 393 + }, + { + "epoch": 0.8321013727560718, + "grad_norm": 0.7271727347779741, + "learning_rate": 7.89303525819599e-05, + "loss": 1.3383, + "step": 394 + }, + { + "epoch": 0.8342133051742344, + "grad_norm": 0.785074782841274, + "learning_rate": 7.891674513771827e-05, + "loss": 1.3182, + "step": 395 + }, + { + "epoch": 0.8363252375923971, + "grad_norm": 0.7944515968959052, + "learning_rate": 7.89030528745193e-05, + "loss": 1.3375, + "step": 396 + }, + { + "epoch": 0.8384371700105596, + "grad_norm": 0.8973272203551529, + "learning_rate": 7.888927582220523e-05, + "loss": 1.3168, + "step": 397 + }, + { + "epoch": 0.8405491024287223, + "grad_norm": 1.0524607318144934, + "learning_rate": 7.887541401080313e-05, + "loss": 1.3188, + "step": 398 + }, + { + "epoch": 0.8426610348468849, + "grad_norm": 0.9148070069832123, + "learning_rate": 7.886146747052479e-05, + "loss": 1.3104, + "step": 399 + }, + { + "epoch": 0.8447729672650475, + "grad_norm": 20.543074777699186, + "learning_rate": 7.884743623176666e-05, + "loss": 1.3749, + "step": 400 + }, + { + "epoch": 0.8468848996832101, + "grad_norm": 1.713571585046419, + "learning_rate": 7.883332032510979e-05, + "loss": 1.3366, + "step": 401 + }, + { + "epoch": 0.8489968321013728, + "grad_norm": 0.9332097777384384, + "learning_rate": 7.881911978131977e-05, + "loss": 1.3326, + "step": 402 + }, + { + "epoch": 0.8511087645195353, + "grad_norm": 0.8979569919352449, + "learning_rate": 7.880483463134663e-05, + "loss": 1.3464, + "step": 403 + }, + { + "epoch": 0.853220696937698, + "grad_norm": 1.0693999428630778, + "learning_rate": 7.879046490632487e-05, + "loss": 1.3424, + "step": 404 + }, + { + "epoch": 0.8553326293558606, + "grad_norm": 0.9391570852799092, + "learning_rate": 7.877601063757323e-05, + "loss": 1.3489, + "step": 405 + }, + { + "epoch": 0.8574445617740233, + "grad_norm": 0.7694727611907659, + "learning_rate": 7.876147185659476e-05, + "loss": 1.3425, + "step": 406 + }, + { + "epoch": 0.8595564941921858, + "grad_norm": 0.6505087944865335, + "learning_rate": 7.874684859507672e-05, + "loss": 1.3143, + "step": 407 + }, + { + "epoch": 0.8616684266103485, + "grad_norm": 0.6956206227053953, + "learning_rate": 7.873214088489047e-05, + "loss": 1.3329, + "step": 408 + }, + { + "epoch": 0.8637803590285111, + "grad_norm": 0.6388004537665296, + "learning_rate": 7.871734875809142e-05, + "loss": 1.3106, + "step": 409 + }, + { + "epoch": 0.8658922914466737, + "grad_norm": 0.889108094353506, + "learning_rate": 7.870247224691897e-05, + "loss": 1.3267, + "step": 410 + }, + { + "epoch": 0.8680042238648363, + "grad_norm": 0.6398574850874186, + "learning_rate": 7.868751138379647e-05, + "loss": 1.318, + "step": 411 + }, + { + "epoch": 0.870116156282999, + "grad_norm": 0.7584123575304376, + "learning_rate": 7.867246620133107e-05, + "loss": 1.3035, + "step": 412 + }, + { + "epoch": 0.8722280887011615, + "grad_norm": 0.5430321031501266, + "learning_rate": 7.865733673231372e-05, + "loss": 1.3175, + "step": 413 + }, + { + "epoch": 0.8743400211193242, + "grad_norm": 0.5008539505505039, + "learning_rate": 7.864212300971904e-05, + "loss": 1.3189, + "step": 414 + }, + { + "epoch": 0.8764519535374868, + "grad_norm": 0.6166529914117741, + "learning_rate": 7.862682506670532e-05, + "loss": 1.3131, + "step": 415 + }, + { + "epoch": 0.8785638859556494, + "grad_norm": 0.5973998674051332, + "learning_rate": 7.86114429366144e-05, + "loss": 1.3111, + "step": 416 + }, + { + "epoch": 0.880675818373812, + "grad_norm": 0.583051395255283, + "learning_rate": 7.859597665297159e-05, + "loss": 1.3254, + "step": 417 + }, + { + "epoch": 0.8827877507919747, + "grad_norm": 0.6677307575074054, + "learning_rate": 7.858042624948563e-05, + "loss": 1.3127, + "step": 418 + }, + { + "epoch": 0.8848996832101372, + "grad_norm": 1.6327795562424998, + "learning_rate": 7.856479176004854e-05, + "loss": 1.3205, + "step": 419 + }, + { + "epoch": 0.8870116156282999, + "grad_norm": 626.4701248932723, + "learning_rate": 7.854907321873573e-05, + "loss": 6.688, + "step": 420 + }, + { + "epoch": 0.8891235480464625, + "grad_norm": 8.109575419851813, + "learning_rate": 7.853327065980567e-05, + "loss": 1.4205, + "step": 421 + }, + { + "epoch": 0.8912354804646251, + "grad_norm": 2.2087945165728606, + "learning_rate": 7.851738411770003e-05, + "loss": 1.3607, + "step": 422 + }, + { + "epoch": 0.8933474128827877, + "grad_norm": 0.7951752239838442, + "learning_rate": 7.850141362704346e-05, + "loss": 1.3449, + "step": 423 + }, + { + "epoch": 0.8954593453009504, + "grad_norm": 0.8682751310988109, + "learning_rate": 7.848535922264365e-05, + "loss": 1.333, + "step": 424 + }, + { + "epoch": 0.8975712777191129, + "grad_norm": 1.1943821418977494, + "learning_rate": 7.846922093949109e-05, + "loss": 1.3234, + "step": 425 + }, + { + "epoch": 0.8996832101372756, + "grad_norm": 0.8211550853193312, + "learning_rate": 7.84529988127592e-05, + "loss": 1.3402, + "step": 426 + }, + { + "epoch": 0.9017951425554382, + "grad_norm": 0.7529544514900285, + "learning_rate": 7.843669287780399e-05, + "loss": 1.3293, + "step": 427 + }, + { + "epoch": 0.9039070749736009, + "grad_norm": 1.0662537164881922, + "learning_rate": 7.842030317016425e-05, + "loss": 1.3331, + "step": 428 + }, + { + "epoch": 0.9060190073917634, + "grad_norm": 1.0450891853253332, + "learning_rate": 7.840382972556132e-05, + "loss": 1.3195, + "step": 429 + }, + { + "epoch": 0.9081309398099261, + "grad_norm": 0.8936402808950642, + "learning_rate": 7.8387272579899e-05, + "loss": 1.3097, + "step": 430 + }, + { + "epoch": 0.9102428722280888, + "grad_norm": 1.0263228373428297, + "learning_rate": 7.837063176926357e-05, + "loss": 1.3346, + "step": 431 + }, + { + "epoch": 0.9123548046462513, + "grad_norm": 1.212510707414097, + "learning_rate": 7.835390732992366e-05, + "loss": 1.3571, + "step": 432 + }, + { + "epoch": 0.914466737064414, + "grad_norm": 1.2652467372925484, + "learning_rate": 7.833709929833012e-05, + "loss": 1.3418, + "step": 433 + }, + { + "epoch": 0.9165786694825766, + "grad_norm": 1.2501926006313295, + "learning_rate": 7.832020771111602e-05, + "loss": 1.3232, + "step": 434 + }, + { + "epoch": 0.9186906019007391, + "grad_norm": 0.6971152532776265, + "learning_rate": 7.830323260509653e-05, + "loss": 1.3181, + "step": 435 + }, + { + "epoch": 0.9208025343189018, + "grad_norm": 1.2367261828614264, + "learning_rate": 7.828617401726887e-05, + "loss": 1.3228, + "step": 436 + }, + { + "epoch": 0.9229144667370645, + "grad_norm": 0.872173294223612, + "learning_rate": 7.82690319848122e-05, + "loss": 1.3325, + "step": 437 + }, + { + "epoch": 0.925026399155227, + "grad_norm": 1.0886366603697517, + "learning_rate": 7.825180654508752e-05, + "loss": 1.3243, + "step": 438 + }, + { + "epoch": 0.9271383315733897, + "grad_norm": 0.6577656229865265, + "learning_rate": 7.823449773563763e-05, + "loss": 1.323, + "step": 439 + }, + { + "epoch": 0.9292502639915523, + "grad_norm": 0.7751320448683581, + "learning_rate": 7.821710559418707e-05, + "loss": 1.3085, + "step": 440 + }, + { + "epoch": 0.9313621964097148, + "grad_norm": 0.8258405714055443, + "learning_rate": 7.819963015864196e-05, + "loss": 1.3429, + "step": 441 + }, + { + "epoch": 0.9334741288278775, + "grad_norm": 0.7414979549995195, + "learning_rate": 7.818207146708997e-05, + "loss": 1.3172, + "step": 442 + }, + { + "epoch": 0.9355860612460402, + "grad_norm": 0.4798304432235842, + "learning_rate": 7.816442955780024e-05, + "loss": 1.3142, + "step": 443 + }, + { + "epoch": 0.9376979936642027, + "grad_norm": 0.4565828748403837, + "learning_rate": 7.814670446922324e-05, + "loss": 1.3052, + "step": 444 + }, + { + "epoch": 0.9398099260823654, + "grad_norm": 0.4305961908134174, + "learning_rate": 7.812889623999077e-05, + "loss": 1.3067, + "step": 445 + }, + { + "epoch": 0.941921858500528, + "grad_norm": 0.4777869621273663, + "learning_rate": 7.811100490891586e-05, + "loss": 1.3261, + "step": 446 + }, + { + "epoch": 0.9440337909186906, + "grad_norm": 0.46954082387644963, + "learning_rate": 7.809303051499259e-05, + "loss": 1.3187, + "step": 447 + }, + { + "epoch": 0.9461457233368532, + "grad_norm": 0.3501849886405323, + "learning_rate": 7.807497309739612e-05, + "loss": 1.3155, + "step": 448 + }, + { + "epoch": 0.9482576557550159, + "grad_norm": 0.3860355708599927, + "learning_rate": 7.805683269548255e-05, + "loss": 1.3154, + "step": 449 + }, + { + "epoch": 0.9503695881731784, + "grad_norm": 0.31558666471175423, + "learning_rate": 7.803860934878885e-05, + "loss": 1.3251, + "step": 450 + }, + { + "epoch": 0.9524815205913411, + "grad_norm": 0.39267789066283365, + "learning_rate": 7.802030309703276e-05, + "loss": 1.3029, + "step": 451 + }, + { + "epoch": 0.9545934530095037, + "grad_norm": 0.3734427534042282, + "learning_rate": 7.80019139801127e-05, + "loss": 1.3122, + "step": 452 + }, + { + "epoch": 0.9567053854276664, + "grad_norm": 0.36094260356241653, + "learning_rate": 7.798344203810774e-05, + "loss": 1.3099, + "step": 453 + }, + { + "epoch": 0.9588173178458289, + "grad_norm": 0.9600654765176504, + "learning_rate": 7.796488731127739e-05, + "loss": 1.336, + "step": 454 + }, + { + "epoch": 0.9609292502639916, + "grad_norm": 0.41396627257546437, + "learning_rate": 7.794624984006169e-05, + "loss": 1.3102, + "step": 455 + }, + { + "epoch": 0.9630411826821542, + "grad_norm": 0.5089675827004326, + "learning_rate": 7.792752966508091e-05, + "loss": 1.3118, + "step": 456 + }, + { + "epoch": 0.9651531151003168, + "grad_norm": 0.4877642561354866, + "learning_rate": 7.790872682713567e-05, + "loss": 1.3226, + "step": 457 + }, + { + "epoch": 0.9672650475184794, + "grad_norm": 0.4301255926710275, + "learning_rate": 7.78898413672067e-05, + "loss": 1.3122, + "step": 458 + }, + { + "epoch": 0.9693769799366421, + "grad_norm": 0.5463750726046798, + "learning_rate": 7.787087332645481e-05, + "loss": 1.3179, + "step": 459 + }, + { + "epoch": 0.9714889123548046, + "grad_norm": 0.5301072933823587, + "learning_rate": 7.785182274622081e-05, + "loss": 1.3005, + "step": 460 + }, + { + "epoch": 0.9736008447729673, + "grad_norm": 0.7675898366918714, + "learning_rate": 7.783268966802539e-05, + "loss": 1.327, + "step": 461 + }, + { + "epoch": 0.9757127771911299, + "grad_norm": 0.6591011737571214, + "learning_rate": 7.781347413356906e-05, + "loss": 1.3209, + "step": 462 + }, + { + "epoch": 0.9778247096092925, + "grad_norm": 0.6744174847762054, + "learning_rate": 7.779417618473203e-05, + "loss": 1.3113, + "step": 463 + }, + { + "epoch": 0.9799366420274551, + "grad_norm": 0.6816013769625404, + "learning_rate": 7.777479586357412e-05, + "loss": 1.2957, + "step": 464 + }, + { + "epoch": 0.9820485744456178, + "grad_norm": 0.6732303902758084, + "learning_rate": 7.775533321233471e-05, + "loss": 1.3076, + "step": 465 + }, + { + "epoch": 0.9841605068637803, + "grad_norm": 0.5955988713207605, + "learning_rate": 7.77357882734326e-05, + "loss": 1.3375, + "step": 466 + }, + { + "epoch": 0.986272439281943, + "grad_norm": 0.7140853644885363, + "learning_rate": 7.771616108946596e-05, + "loss": 1.308, + "step": 467 + }, + { + "epoch": 0.9883843717001056, + "grad_norm": 0.7436744869245421, + "learning_rate": 7.769645170321216e-05, + "loss": 1.3313, + "step": 468 + }, + { + "epoch": 0.9904963041182682, + "grad_norm": 0.7229665088632112, + "learning_rate": 7.767666015762776e-05, + "loss": 1.3147, + "step": 469 + }, + { + "epoch": 0.9926082365364308, + "grad_norm": 0.488973429711688, + "learning_rate": 7.765678649584843e-05, + "loss": 1.2932, + "step": 470 + }, + { + "epoch": 0.9947201689545935, + "grad_norm": 0.4987705039613546, + "learning_rate": 7.763683076118875e-05, + "loss": 1.3195, + "step": 471 + }, + { + "epoch": 0.996832101372756, + "grad_norm": 0.48665051225806194, + "learning_rate": 7.761679299714218e-05, + "loss": 1.3188, + "step": 472 + }, + { + "epoch": 0.9989440337909187, + "grad_norm": 0.5562076281616162, + "learning_rate": 7.7596673247381e-05, + "loss": 1.3032, + "step": 473 + }, + { + "epoch": 1.0013199577613516, + "grad_norm": 0.6112036195489211, + "learning_rate": 7.757647155575616e-05, + "loss": 1.3046, + "step": 474 + }, + { + "epoch": 1.0034318901795143, + "grad_norm": 0.6415367157825724, + "learning_rate": 7.755618796629719e-05, + "loss": 1.2819, + "step": 475 + }, + { + "epoch": 1.005543822597677, + "grad_norm": 0.5945749049879571, + "learning_rate": 7.753582252321213e-05, + "loss": 1.3082, + "step": 476 + }, + { + "epoch": 1.0076557550158396, + "grad_norm": 0.6658655436436091, + "learning_rate": 7.751537527088742e-05, + "loss": 1.2865, + "step": 477 + }, + { + "epoch": 1.009767687434002, + "grad_norm": 0.6941503312430217, + "learning_rate": 7.749484625388781e-05, + "loss": 1.2722, + "step": 478 + }, + { + "epoch": 1.0118796198521647, + "grad_norm": 0.8966515104657662, + "learning_rate": 7.747423551695621e-05, + "loss": 1.3049, + "step": 479 + }, + { + "epoch": 1.0139915522703273, + "grad_norm": 0.8892336256546287, + "learning_rate": 7.745354310501371e-05, + "loss": 1.2937, + "step": 480 + }, + { + "epoch": 1.01610348468849, + "grad_norm": 0.9909123165024439, + "learning_rate": 7.743276906315937e-05, + "loss": 1.2797, + "step": 481 + }, + { + "epoch": 1.0182154171066526, + "grad_norm": 0.955196011844625, + "learning_rate": 7.741191343667016e-05, + "loss": 1.2998, + "step": 482 + }, + { + "epoch": 1.0203273495248153, + "grad_norm": 0.7594879225996365, + "learning_rate": 7.739097627100088e-05, + "loss": 1.2908, + "step": 483 + }, + { + "epoch": 1.0224392819429777, + "grad_norm": 0.45208098058940765, + "learning_rate": 7.736995761178399e-05, + "loss": 1.2891, + "step": 484 + }, + { + "epoch": 1.0245512143611404, + "grad_norm": 0.42389767144555923, + "learning_rate": 7.734885750482967e-05, + "loss": 1.2676, + "step": 485 + }, + { + "epoch": 1.026663146779303, + "grad_norm": 0.7091847326658339, + "learning_rate": 7.732767599612555e-05, + "loss": 1.2854, + "step": 486 + }, + { + "epoch": 1.0287750791974657, + "grad_norm": 0.7356174671368765, + "learning_rate": 7.730641313183662e-05, + "loss": 1.285, + "step": 487 + }, + { + "epoch": 1.0308870116156283, + "grad_norm": 0.6341008672911264, + "learning_rate": 7.728506895830531e-05, + "loss": 1.2967, + "step": 488 + }, + { + "epoch": 1.032998944033791, + "grad_norm": 0.3783423091662912, + "learning_rate": 7.726364352205117e-05, + "loss": 1.2848, + "step": 489 + }, + { + "epoch": 1.0351108764519534, + "grad_norm": 0.36978169997901056, + "learning_rate": 7.724213686977088e-05, + "loss": 1.2954, + "step": 490 + }, + { + "epoch": 1.037222808870116, + "grad_norm": 0.47235493057446765, + "learning_rate": 7.722054904833816e-05, + "loss": 1.2972, + "step": 491 + }, + { + "epoch": 1.0393347412882787, + "grad_norm": 0.5747896208196642, + "learning_rate": 7.719888010480361e-05, + "loss": 1.2839, + "step": 492 + }, + { + "epoch": 1.0414466737064414, + "grad_norm": 0.5479122331846492, + "learning_rate": 7.717713008639464e-05, + "loss": 1.289, + "step": 493 + }, + { + "epoch": 1.043558606124604, + "grad_norm": 0.6106380957438209, + "learning_rate": 7.715529904051536e-05, + "loss": 1.3055, + "step": 494 + }, + { + "epoch": 1.0456705385427667, + "grad_norm": 0.4769879668256277, + "learning_rate": 7.71333870147465e-05, + "loss": 1.2838, + "step": 495 + }, + { + "epoch": 1.0477824709609294, + "grad_norm": 0.7434701327718548, + "learning_rate": 7.711139405684527e-05, + "loss": 1.287, + "step": 496 + }, + { + "epoch": 1.0498944033790918, + "grad_norm": 0.7915968669528708, + "learning_rate": 7.708932021474524e-05, + "loss": 1.2952, + "step": 497 + }, + { + "epoch": 1.0520063357972544, + "grad_norm": 0.9126436938305715, + "learning_rate": 7.706716553655636e-05, + "loss": 1.2886, + "step": 498 + }, + { + "epoch": 1.054118268215417, + "grad_norm": 0.33560649972456624, + "learning_rate": 7.704493007056463e-05, + "loss": 1.2877, + "step": 499 + }, + { + "epoch": 1.0562302006335798, + "grad_norm": 0.6378383690764622, + "learning_rate": 7.702261386523224e-05, + "loss": 1.2985, + "step": 500 + }, + { + "epoch": 1.0583421330517424, + "grad_norm": 0.9294967842271373, + "learning_rate": 7.700021696919732e-05, + "loss": 1.292, + "step": 501 + }, + { + "epoch": 1.060454065469905, + "grad_norm": 0.8681396418925423, + "learning_rate": 7.697773943127381e-05, + "loss": 1.2872, + "step": 502 + }, + { + "epoch": 1.0625659978880675, + "grad_norm": 0.6055349171445156, + "learning_rate": 7.695518130045147e-05, + "loss": 1.2984, + "step": 503 + }, + { + "epoch": 1.0646779303062301, + "grad_norm": 0.7302298080193907, + "learning_rate": 7.693254262589572e-05, + "loss": 1.3065, + "step": 504 + }, + { + "epoch": 1.0667898627243928, + "grad_norm": 0.6739755949862284, + "learning_rate": 7.690982345694748e-05, + "loss": 1.3036, + "step": 505 + }, + { + "epoch": 1.0689017951425555, + "grad_norm": 0.628671804780483, + "learning_rate": 7.688702384312311e-05, + "loss": 1.3135, + "step": 506 + }, + { + "epoch": 1.071013727560718, + "grad_norm": 0.3988726990898124, + "learning_rate": 7.686414383411434e-05, + "loss": 1.3001, + "step": 507 + }, + { + "epoch": 1.0731256599788808, + "grad_norm": 0.38842901402323227, + "learning_rate": 7.68411834797881e-05, + "loss": 1.2739, + "step": 508 + }, + { + "epoch": 1.0752375923970432, + "grad_norm": 0.4161437116523836, + "learning_rate": 7.681814283018641e-05, + "loss": 1.3068, + "step": 509 + }, + { + "epoch": 1.0773495248152059, + "grad_norm": 0.4264775779134912, + "learning_rate": 7.679502193552634e-05, + "loss": 1.2871, + "step": 510 + }, + { + "epoch": 1.0794614572333685, + "grad_norm": 0.39587499065666, + "learning_rate": 7.67718208461998e-05, + "loss": 1.2805, + "step": 511 + }, + { + "epoch": 1.0815733896515312, + "grad_norm": 0.33058331464454865, + "learning_rate": 7.674853961277352e-05, + "loss": 1.2984, + "step": 512 + }, + { + "epoch": 1.0836853220696938, + "grad_norm": 0.28215364211696325, + "learning_rate": 7.672517828598891e-05, + "loss": 1.2965, + "step": 513 + }, + { + "epoch": 1.0857972544878565, + "grad_norm": 0.34947284059336886, + "learning_rate": 7.67017369167619e-05, + "loss": 1.2991, + "step": 514 + }, + { + "epoch": 1.087909186906019, + "grad_norm": 0.40260211240376065, + "learning_rate": 7.667821555618293e-05, + "loss": 1.3007, + "step": 515 + }, + { + "epoch": 1.0900211193241816, + "grad_norm": 0.3618280771569929, + "learning_rate": 7.665461425551673e-05, + "loss": 1.2914, + "step": 516 + }, + { + "epoch": 1.0921330517423442, + "grad_norm": 0.3926547817491922, + "learning_rate": 7.663093306620231e-05, + "loss": 1.2818, + "step": 517 + }, + { + "epoch": 1.0942449841605069, + "grad_norm": 0.5659615412472125, + "learning_rate": 7.660717203985273e-05, + "loss": 1.3085, + "step": 518 + }, + { + "epoch": 1.0963569165786695, + "grad_norm": 0.5837822227867112, + "learning_rate": 7.65833312282551e-05, + "loss": 1.2804, + "step": 519 + }, + { + "epoch": 1.0984688489968322, + "grad_norm": 0.589510290514974, + "learning_rate": 7.655941068337046e-05, + "loss": 1.281, + "step": 520 + }, + { + "epoch": 1.1005807814149948, + "grad_norm": 0.6003329624559646, + "learning_rate": 7.653541045733353e-05, + "loss": 1.3015, + "step": 521 + }, + { + "epoch": 1.1026927138331573, + "grad_norm": 0.6370282116444752, + "learning_rate": 7.651133060245276e-05, + "loss": 1.2829, + "step": 522 + }, + { + "epoch": 1.10480464625132, + "grad_norm": 0.6884335764258969, + "learning_rate": 7.648717117121016e-05, + "loss": 1.2838, + "step": 523 + }, + { + "epoch": 1.1069165786694826, + "grad_norm": 0.701664140770596, + "learning_rate": 7.646293221626112e-05, + "loss": 1.2844, + "step": 524 + }, + { + "epoch": 1.1090285110876452, + "grad_norm": 0.5207752048979116, + "learning_rate": 7.643861379043443e-05, + "loss": 1.296, + "step": 525 + }, + { + "epoch": 1.1111404435058079, + "grad_norm": 0.30941187808093756, + "learning_rate": 7.641421594673202e-05, + "loss": 1.3068, + "step": 526 + }, + { + "epoch": 1.1132523759239705, + "grad_norm": 0.2727924152322591, + "learning_rate": 7.638973873832892e-05, + "loss": 1.2819, + "step": 527 + }, + { + "epoch": 1.115364308342133, + "grad_norm": 0.3101893193053521, + "learning_rate": 7.636518221857318e-05, + "loss": 1.289, + "step": 528 + }, + { + "epoch": 1.1174762407602956, + "grad_norm": 0.34787472872635233, + "learning_rate": 7.634054644098566e-05, + "loss": 1.2719, + "step": 529 + }, + { + "epoch": 1.1195881731784583, + "grad_norm": 0.34490053948837146, + "learning_rate": 7.631583145925998e-05, + "loss": 1.2905, + "step": 530 + }, + { + "epoch": 1.121700105596621, + "grad_norm": 0.4166497614273231, + "learning_rate": 7.629103732726241e-05, + "loss": 1.2958, + "step": 531 + }, + { + "epoch": 1.1238120380147836, + "grad_norm": 0.35820363113917364, + "learning_rate": 7.626616409903166e-05, + "loss": 1.2823, + "step": 532 + }, + { + "epoch": 1.1259239704329462, + "grad_norm": 0.32952400814908384, + "learning_rate": 7.624121182877893e-05, + "loss": 1.2782, + "step": 533 + }, + { + "epoch": 1.1280359028511087, + "grad_norm": 0.36518248578976603, + "learning_rate": 7.621618057088759e-05, + "loss": 1.2957, + "step": 534 + }, + { + "epoch": 1.1301478352692713, + "grad_norm": 0.3656254053795771, + "learning_rate": 7.619107037991323e-05, + "loss": 1.3079, + "step": 535 + }, + { + "epoch": 1.132259767687434, + "grad_norm": 0.3496517312904986, + "learning_rate": 7.616588131058346e-05, + "loss": 1.2792, + "step": 536 + }, + { + "epoch": 1.1343717001055966, + "grad_norm": 0.2971702608753361, + "learning_rate": 7.614061341779778e-05, + "loss": 1.2798, + "step": 537 + }, + { + "epoch": 1.1364836325237593, + "grad_norm": 0.4000062542173163, + "learning_rate": 7.611526675662751e-05, + "loss": 1.2803, + "step": 538 + }, + { + "epoch": 1.138595564941922, + "grad_norm": 0.5093545735794924, + "learning_rate": 7.608984138231566e-05, + "loss": 1.2867, + "step": 539 + }, + { + "epoch": 1.1407074973600846, + "grad_norm": 0.44566618648074996, + "learning_rate": 7.606433735027675e-05, + "loss": 1.2864, + "step": 540 + }, + { + "epoch": 1.142819429778247, + "grad_norm": 0.3841618176796274, + "learning_rate": 7.603875471609677e-05, + "loss": 1.2907, + "step": 541 + }, + { + "epoch": 1.1449313621964097, + "grad_norm": 0.440468305700516, + "learning_rate": 7.601309353553302e-05, + "loss": 1.2961, + "step": 542 + }, + { + "epoch": 1.1470432946145723, + "grad_norm": 0.5221378975701709, + "learning_rate": 7.598735386451397e-05, + "loss": 1.2843, + "step": 543 + }, + { + "epoch": 1.149155227032735, + "grad_norm": 0.4914640486431547, + "learning_rate": 7.596153575913919e-05, + "loss": 1.2727, + "step": 544 + }, + { + "epoch": 1.1512671594508976, + "grad_norm": 0.4480797554460926, + "learning_rate": 7.593563927567916e-05, + "loss": 1.2804, + "step": 545 + }, + { + "epoch": 1.15337909186906, + "grad_norm": 0.4529800889226529, + "learning_rate": 7.590966447057524e-05, + "loss": 1.2876, + "step": 546 + }, + { + "epoch": 1.1554910242872227, + "grad_norm": 0.4289265584942922, + "learning_rate": 7.588361140043941e-05, + "loss": 1.2892, + "step": 547 + }, + { + "epoch": 1.1576029567053854, + "grad_norm": 0.5286668584693756, + "learning_rate": 7.585748012205433e-05, + "loss": 1.2932, + "step": 548 + }, + { + "epoch": 1.159714889123548, + "grad_norm": 0.5394314286932536, + "learning_rate": 7.583127069237303e-05, + "loss": 1.2948, + "step": 549 + }, + { + "epoch": 1.1618268215417107, + "grad_norm": 0.5464002222947238, + "learning_rate": 7.580498316851891e-05, + "loss": 1.2779, + "step": 550 + }, + { + "epoch": 1.1639387539598733, + "grad_norm": 0.4236259869310714, + "learning_rate": 7.577861760778557e-05, + "loss": 1.2745, + "step": 551 + }, + { + "epoch": 1.166050686378036, + "grad_norm": 0.4695909850127443, + "learning_rate": 7.575217406763669e-05, + "loss": 1.2737, + "step": 552 + }, + { + "epoch": 1.1681626187961984, + "grad_norm": 0.4839198945897, + "learning_rate": 7.572565260570588e-05, + "loss": 1.2775, + "step": 553 + }, + { + "epoch": 1.170274551214361, + "grad_norm": 0.39874416229694915, + "learning_rate": 7.569905327979667e-05, + "loss": 1.2831, + "step": 554 + }, + { + "epoch": 1.1723864836325237, + "grad_norm": 0.3692045605725901, + "learning_rate": 7.567237614788216e-05, + "loss": 1.2771, + "step": 555 + }, + { + "epoch": 1.1744984160506864, + "grad_norm": 0.45297064220498945, + "learning_rate": 7.564562126810513e-05, + "loss": 1.2912, + "step": 556 + }, + { + "epoch": 1.176610348468849, + "grad_norm": 0.44716148933124583, + "learning_rate": 7.561878869877779e-05, + "loss": 1.2795, + "step": 557 + }, + { + "epoch": 1.1787222808870117, + "grad_norm": 0.3324011250574207, + "learning_rate": 7.559187849838164e-05, + "loss": 1.2825, + "step": 558 + }, + { + "epoch": 1.1808342133051744, + "grad_norm": 0.2910699225973643, + "learning_rate": 7.556489072556741e-05, + "loss": 1.266, + "step": 559 + }, + { + "epoch": 1.1829461457233368, + "grad_norm": 0.4070361309647773, + "learning_rate": 7.55378254391549e-05, + "loss": 1.2822, + "step": 560 + }, + { + "epoch": 1.1850580781414994, + "grad_norm": 0.5257260744478912, + "learning_rate": 7.551068269813282e-05, + "loss": 1.2703, + "step": 561 + }, + { + "epoch": 1.187170010559662, + "grad_norm": 0.8168470236264866, + "learning_rate": 7.548346256165873e-05, + "loss": 1.2747, + "step": 562 + }, + { + "epoch": 1.1892819429778247, + "grad_norm": 1.0368994558843532, + "learning_rate": 7.545616508905884e-05, + "loss": 1.2752, + "step": 563 + }, + { + "epoch": 1.1913938753959874, + "grad_norm": 0.9382603081356848, + "learning_rate": 7.542879033982795e-05, + "loss": 1.2877, + "step": 564 + }, + { + "epoch": 1.1935058078141498, + "grad_norm": 0.7504145623980235, + "learning_rate": 7.540133837362924e-05, + "loss": 1.2803, + "step": 565 + }, + { + "epoch": 1.1956177402323125, + "grad_norm": 0.5203379303522989, + "learning_rate": 7.537380925029423e-05, + "loss": 1.276, + "step": 566 + }, + { + "epoch": 1.1977296726504751, + "grad_norm": 0.4280362356562467, + "learning_rate": 7.534620302982255e-05, + "loss": 1.2836, + "step": 567 + }, + { + "epoch": 1.1998416050686378, + "grad_norm": 0.5715100434949246, + "learning_rate": 7.531851977238194e-05, + "loss": 1.3112, + "step": 568 + }, + { + "epoch": 1.2019535374868004, + "grad_norm": 0.529650090729069, + "learning_rate": 7.529075953830795e-05, + "loss": 1.2917, + "step": 569 + }, + { + "epoch": 1.204065469904963, + "grad_norm": 0.4240759893671528, + "learning_rate": 7.526292238810398e-05, + "loss": 1.2796, + "step": 570 + }, + { + "epoch": 1.2061774023231258, + "grad_norm": 0.3351830983131411, + "learning_rate": 7.523500838244103e-05, + "loss": 1.2659, + "step": 571 + }, + { + "epoch": 1.2082893347412882, + "grad_norm": 0.3875651544845907, + "learning_rate": 7.520701758215759e-05, + "loss": 1.28, + "step": 572 + }, + { + "epoch": 1.2104012671594508, + "grad_norm": 0.4723044898817449, + "learning_rate": 7.517895004825956e-05, + "loss": 1.2777, + "step": 573 + }, + { + "epoch": 1.2125131995776135, + "grad_norm": 0.4409175530200886, + "learning_rate": 7.515080584192009e-05, + "loss": 1.262, + "step": 574 + }, + { + "epoch": 1.2146251319957762, + "grad_norm": 0.401351246905006, + "learning_rate": 7.512258502447937e-05, + "loss": 1.2821, + "step": 575 + }, + { + "epoch": 1.2167370644139388, + "grad_norm": 0.4392999027411052, + "learning_rate": 7.509428765744464e-05, + "loss": 1.2864, + "step": 576 + }, + { + "epoch": 1.2188489968321015, + "grad_norm": 0.4823742447409453, + "learning_rate": 7.506591380248992e-05, + "loss": 1.294, + "step": 577 + }, + { + "epoch": 1.2209609292502641, + "grad_norm": 0.7842833475362873, + "learning_rate": 7.503746352145598e-05, + "loss": 1.2993, + "step": 578 + }, + { + "epoch": 1.2230728616684265, + "grad_norm": 0.5448821355015401, + "learning_rate": 7.500893687635015e-05, + "loss": 1.2848, + "step": 579 + }, + { + "epoch": 1.2251847940865892, + "grad_norm": 0.40160908670751816, + "learning_rate": 7.498033392934619e-05, + "loss": 1.2815, + "step": 580 + }, + { + "epoch": 1.2272967265047519, + "grad_norm": 0.25206012460538446, + "learning_rate": 7.495165474278411e-05, + "loss": 1.2788, + "step": 581 + }, + { + "epoch": 1.2294086589229145, + "grad_norm": 0.34242675034948983, + "learning_rate": 7.492289937917019e-05, + "loss": 1.2755, + "step": 582 + }, + { + "epoch": 1.2315205913410772, + "grad_norm": 0.39581738713469844, + "learning_rate": 7.489406790117666e-05, + "loss": 1.2883, + "step": 583 + }, + { + "epoch": 1.2336325237592396, + "grad_norm": 0.3352612228411988, + "learning_rate": 7.486516037164166e-05, + "loss": 1.2686, + "step": 584 + }, + { + "epoch": 1.2357444561774023, + "grad_norm": 0.3054627288246552, + "learning_rate": 7.483617685356906e-05, + "loss": 1.2633, + "step": 585 + }, + { + "epoch": 1.237856388595565, + "grad_norm": 0.3695841001211128, + "learning_rate": 7.48071174101284e-05, + "loss": 1.3058, + "step": 586 + }, + { + "epoch": 1.2399683210137276, + "grad_norm": 0.4446084194760978, + "learning_rate": 7.477798210465467e-05, + "loss": 1.2785, + "step": 587 + }, + { + "epoch": 1.2420802534318902, + "grad_norm": 0.4269750861176684, + "learning_rate": 7.474877100064821e-05, + "loss": 1.2861, + "step": 588 + }, + { + "epoch": 1.2441921858500529, + "grad_norm": 0.39168340302050403, + "learning_rate": 7.471948416177453e-05, + "loss": 1.282, + "step": 589 + }, + { + "epoch": 1.2463041182682155, + "grad_norm": 0.4184183935321298, + "learning_rate": 7.469012165186425e-05, + "loss": 1.2701, + "step": 590 + }, + { + "epoch": 1.248416050686378, + "grad_norm": 0.37832889573834116, + "learning_rate": 7.466068353491287e-05, + "loss": 1.2726, + "step": 591 + }, + { + "epoch": 1.2505279831045406, + "grad_norm": 0.38649556458594686, + "learning_rate": 7.46311698750807e-05, + "loss": 1.2842, + "step": 592 + }, + { + "epoch": 1.2526399155227033, + "grad_norm": 0.4564674922915447, + "learning_rate": 7.460158073669272e-05, + "loss": 1.2664, + "step": 593 + }, + { + "epoch": 1.254751847940866, + "grad_norm": 0.4983295254737727, + "learning_rate": 7.457191618423838e-05, + "loss": 1.2732, + "step": 594 + }, + { + "epoch": 1.2568637803590286, + "grad_norm": 0.5157116782412413, + "learning_rate": 7.45421762823715e-05, + "loss": 1.2838, + "step": 595 + }, + { + "epoch": 1.258975712777191, + "grad_norm": 0.5314821014150098, + "learning_rate": 7.451236109591013e-05, + "loss": 1.2807, + "step": 596 + }, + { + "epoch": 1.2610876451953539, + "grad_norm": 0.5177405104140198, + "learning_rate": 7.448247068983639e-05, + "loss": 1.2737, + "step": 597 + }, + { + "epoch": 1.2631995776135163, + "grad_norm": 0.4154389983405361, + "learning_rate": 7.445250512929637e-05, + "loss": 1.2735, + "step": 598 + }, + { + "epoch": 1.265311510031679, + "grad_norm": 0.36464102229904516, + "learning_rate": 7.442246447959992e-05, + "loss": 1.289, + "step": 599 + }, + { + "epoch": 1.2674234424498416, + "grad_norm": 0.32766598727120955, + "learning_rate": 7.439234880622059e-05, + "loss": 1.2786, + "step": 600 + }, + { + "epoch": 1.2695353748680043, + "grad_norm": 0.41529483308165593, + "learning_rate": 7.436215817479541e-05, + "loss": 1.2651, + "step": 601 + }, + { + "epoch": 1.271647307286167, + "grad_norm": 0.4794047879949704, + "learning_rate": 7.43318926511248e-05, + "loss": 1.2942, + "step": 602 + }, + { + "epoch": 1.2737592397043294, + "grad_norm": 0.2876168025788088, + "learning_rate": 7.430155230117239e-05, + "loss": 1.2919, + "step": 603 + }, + { + "epoch": 1.275871172122492, + "grad_norm": 0.3300546111683379, + "learning_rate": 7.427113719106491e-05, + "loss": 1.2985, + "step": 604 + }, + { + "epoch": 1.2779831045406547, + "grad_norm": 0.4521148236134709, + "learning_rate": 7.424064738709201e-05, + "loss": 1.2896, + "step": 605 + }, + { + "epoch": 1.2800950369588173, + "grad_norm": 0.4161674602371336, + "learning_rate": 7.421008295570616e-05, + "loss": 1.2782, + "step": 606 + }, + { + "epoch": 1.28220696937698, + "grad_norm": 0.3566896396838749, + "learning_rate": 7.417944396352248e-05, + "loss": 1.2833, + "step": 607 + }, + { + "epoch": 1.2843189017951426, + "grad_norm": 0.3165161094848006, + "learning_rate": 7.414873047731856e-05, + "loss": 1.2741, + "step": 608 + }, + { + "epoch": 1.2864308342133053, + "grad_norm": 0.6856638872537303, + "learning_rate": 7.41179425640344e-05, + "loss": 1.2833, + "step": 609 + }, + { + "epoch": 1.2885427666314677, + "grad_norm": 0.5077010348105481, + "learning_rate": 7.408708029077214e-05, + "loss": 1.2767, + "step": 610 + }, + { + "epoch": 1.2906546990496304, + "grad_norm": 0.6194182002241105, + "learning_rate": 7.40561437247961e-05, + "loss": 1.2653, + "step": 611 + }, + { + "epoch": 1.292766631467793, + "grad_norm": 0.7133964128822143, + "learning_rate": 7.40251329335324e-05, + "loss": 1.2756, + "step": 612 + }, + { + "epoch": 1.2948785638859557, + "grad_norm": 0.7574183129410421, + "learning_rate": 7.399404798456902e-05, + "loss": 1.2652, + "step": 613 + }, + { + "epoch": 1.2969904963041183, + "grad_norm": 0.7826302918543997, + "learning_rate": 7.396288894565551e-05, + "loss": 1.2851, + "step": 614 + }, + { + "epoch": 1.2991024287222808, + "grad_norm": 0.7177995625352279, + "learning_rate": 7.393165588470295e-05, + "loss": 1.2782, + "step": 615 + }, + { + "epoch": 1.3012143611404436, + "grad_norm": 0.8123119970558077, + "learning_rate": 7.390034886978372e-05, + "loss": 1.2766, + "step": 616 + }, + { + "epoch": 1.303326293558606, + "grad_norm": 0.8353212587687888, + "learning_rate": 7.386896796913137e-05, + "loss": 1.2844, + "step": 617 + }, + { + "epoch": 1.3054382259767687, + "grad_norm": 0.7108384489057616, + "learning_rate": 7.383751325114053e-05, + "loss": 1.3019, + "step": 618 + }, + { + "epoch": 1.3075501583949314, + "grad_norm": 0.7447936412899987, + "learning_rate": 7.380598478436665e-05, + "loss": 1.2844, + "step": 619 + }, + { + "epoch": 1.309662090813094, + "grad_norm": 0.8207469437896129, + "learning_rate": 7.377438263752598e-05, + "loss": 1.2877, + "step": 620 + }, + { + "epoch": 1.3117740232312567, + "grad_norm": 0.5375760605114689, + "learning_rate": 7.374270687949531e-05, + "loss": 1.288, + "step": 621 + }, + { + "epoch": 1.3138859556494191, + "grad_norm": 0.29046744233389915, + "learning_rate": 7.37109575793119e-05, + "loss": 1.27, + "step": 622 + }, + { + "epoch": 1.3159978880675818, + "grad_norm": 1.9657888891573572, + "learning_rate": 7.367913480617326e-05, + "loss": 1.2789, + "step": 623 + }, + { + "epoch": 1.3181098204857444, + "grad_norm": 0.3399053549119543, + "learning_rate": 7.364723862943707e-05, + "loss": 1.2758, + "step": 624 + }, + { + "epoch": 1.320221752903907, + "grad_norm": 0.507517871938796, + "learning_rate": 7.361526911862095e-05, + "loss": 1.2732, + "step": 625 + }, + { + "epoch": 1.3223336853220697, + "grad_norm": 0.4706702020662118, + "learning_rate": 7.358322634340241e-05, + "loss": 1.2831, + "step": 626 + }, + { + "epoch": 1.3244456177402324, + "grad_norm": 0.6080290493213188, + "learning_rate": 7.355111037361857e-05, + "loss": 1.2888, + "step": 627 + }, + { + "epoch": 1.326557550158395, + "grad_norm": 0.7327814203236522, + "learning_rate": 7.351892127926613e-05, + "loss": 1.2966, + "step": 628 + }, + { + "epoch": 1.3286694825765575, + "grad_norm": 0.8139927225686161, + "learning_rate": 7.348665913050115e-05, + "loss": 1.2817, + "step": 629 + }, + { + "epoch": 1.3307814149947201, + "grad_norm": 0.5035111094814956, + "learning_rate": 7.34543239976389e-05, + "loss": 1.2978, + "step": 630 + }, + { + "epoch": 1.3328933474128828, + "grad_norm": 0.4693600251038984, + "learning_rate": 7.342191595115374e-05, + "loss": 1.3008, + "step": 631 + }, + { + "epoch": 1.3350052798310454, + "grad_norm": 0.4876872731875356, + "learning_rate": 7.338943506167893e-05, + "loss": 1.2759, + "step": 632 + }, + { + "epoch": 1.337117212249208, + "grad_norm": 0.3017451985121869, + "learning_rate": 7.33568814000065e-05, + "loss": 1.2832, + "step": 633 + }, + { + "epoch": 1.3392291446673705, + "grad_norm": 0.6909687482891133, + "learning_rate": 7.332425503708705e-05, + "loss": 1.283, + "step": 634 + }, + { + "epoch": 1.3413410770855332, + "grad_norm": 250.04395622671262, + "learning_rate": 7.32915560440297e-05, + "loss": 7.5705, + "step": 635 + }, + { + "epoch": 1.3434530095036958, + "grad_norm": 0.9751983721914335, + "learning_rate": 7.325878449210182e-05, + "loss": 1.2924, + "step": 636 + }, + { + "epoch": 1.3455649419218585, + "grad_norm": 1.5550555037390674, + "learning_rate": 7.322594045272892e-05, + "loss": 1.2891, + "step": 637 + }, + { + "epoch": 1.3476768743400211, + "grad_norm": 0.7317233095035209, + "learning_rate": 7.319302399749453e-05, + "loss": 1.3048, + "step": 638 + }, + { + "epoch": 1.3497888067581838, + "grad_norm": 1.2286488800068556, + "learning_rate": 7.316003519813999e-05, + "loss": 1.3078, + "step": 639 + }, + { + "epoch": 1.3519007391763465, + "grad_norm": 0.8809389368034148, + "learning_rate": 7.312697412656427e-05, + "loss": 1.2776, + "step": 640 + }, + { + "epoch": 1.3540126715945089, + "grad_norm": 1.0099574658969992, + "learning_rate": 7.309384085482396e-05, + "loss": 1.29, + "step": 641 + }, + { + "epoch": 1.3561246040126715, + "grad_norm": 0.678526302329501, + "learning_rate": 7.306063545513293e-05, + "loss": 1.292, + "step": 642 + }, + { + "epoch": 1.3582365364308342, + "grad_norm": 0.8959161450930594, + "learning_rate": 7.302735799986224e-05, + "loss": 1.3036, + "step": 643 + }, + { + "epoch": 1.3603484688489969, + "grad_norm": 0.4679909274107738, + "learning_rate": 7.299400856154007e-05, + "loss": 1.2813, + "step": 644 + }, + { + "epoch": 1.3624604012671595, + "grad_norm": 0.7758415567722067, + "learning_rate": 7.296058721285142e-05, + "loss": 1.3041, + "step": 645 + }, + { + "epoch": 1.364572333685322, + "grad_norm": 0.620666598432961, + "learning_rate": 7.292709402663805e-05, + "loss": 1.294, + "step": 646 + }, + { + "epoch": 1.3666842661034848, + "grad_norm": 0.6610375814341847, + "learning_rate": 7.28935290758983e-05, + "loss": 1.3384, + "step": 647 + }, + { + "epoch": 1.3687961985216472, + "grad_norm": 0.43955055085685735, + "learning_rate": 7.285989243378686e-05, + "loss": 1.3017, + "step": 648 + }, + { + "epoch": 1.37090813093981, + "grad_norm": 0.4211069269666333, + "learning_rate": 7.282618417361476e-05, + "loss": 1.2956, + "step": 649 + }, + { + "epoch": 1.3730200633579726, + "grad_norm": 0.44182368239991154, + "learning_rate": 7.279240436884908e-05, + "loss": 1.2907, + "step": 650 + }, + { + "epoch": 1.3751319957761352, + "grad_norm": 0.44125250651352066, + "learning_rate": 7.275855309311277e-05, + "loss": 1.2973, + "step": 651 + }, + { + "epoch": 1.3772439281942979, + "grad_norm": 0.3678427744696407, + "learning_rate": 7.272463042018466e-05, + "loss": 1.3027, + "step": 652 + }, + { + "epoch": 1.3793558606124603, + "grad_norm": 0.3150023271714157, + "learning_rate": 7.269063642399912e-05, + "loss": 1.2938, + "step": 653 + }, + { + "epoch": 1.381467793030623, + "grad_norm": 0.354370426383399, + "learning_rate": 7.2656571178646e-05, + "loss": 1.2734, + "step": 654 + }, + { + "epoch": 1.3835797254487856, + "grad_norm": 0.35410838530151373, + "learning_rate": 7.262243475837041e-05, + "loss": 1.2781, + "step": 655 + }, + { + "epoch": 1.3856916578669483, + "grad_norm": 0.3286381553080614, + "learning_rate": 7.258822723757261e-05, + "loss": 1.2774, + "step": 656 + }, + { + "epoch": 1.387803590285111, + "grad_norm": 0.2996986744998582, + "learning_rate": 7.25539486908078e-05, + "loss": 1.2793, + "step": 657 + }, + { + "epoch": 1.3899155227032736, + "grad_norm": 0.37137517916024465, + "learning_rate": 7.251959919278602e-05, + "loss": 1.2793, + "step": 658 + }, + { + "epoch": 1.3920274551214362, + "grad_norm": 0.3905982793258464, + "learning_rate": 7.248517881837189e-05, + "loss": 1.2677, + "step": 659 + }, + { + "epoch": 1.3941393875395987, + "grad_norm": 0.39977433552776076, + "learning_rate": 7.245068764258456e-05, + "loss": 1.2903, + "step": 660 + }, + { + "epoch": 1.3962513199577613, + "grad_norm": 0.3563985004674569, + "learning_rate": 7.241612574059746e-05, + "loss": 1.2901, + "step": 661 + }, + { + "epoch": 1.398363252375924, + "grad_norm": 0.6746707880731194, + "learning_rate": 7.238149318773819e-05, + "loss": 1.2803, + "step": 662 + }, + { + "epoch": 1.4004751847940866, + "grad_norm": 0.6379394960820778, + "learning_rate": 7.234679005948829e-05, + "loss": 1.2743, + "step": 663 + }, + { + "epoch": 1.4025871172122493, + "grad_norm": 202.05150354797905, + "learning_rate": 7.231201643148319e-05, + "loss": 1.6473, + "step": 664 + }, + { + "epoch": 1.4046990496304117, + "grad_norm": 0.7331797074874938, + "learning_rate": 7.22771723795119e-05, + "loss": 1.3273, + "step": 665 + }, + { + "epoch": 1.4068109820485746, + "grad_norm": 0.8730024918777102, + "learning_rate": 7.224225797951697e-05, + "loss": 1.2967, + "step": 666 + }, + { + "epoch": 1.408922914466737, + "grad_norm": 0.9617367551857442, + "learning_rate": 7.220727330759424e-05, + "loss": 1.2892, + "step": 667 + }, + { + "epoch": 1.4110348468848997, + "grad_norm": 0.9432692589244119, + "learning_rate": 7.217221843999274e-05, + "loss": 1.2903, + "step": 668 + }, + { + "epoch": 1.4131467793030623, + "grad_norm": 1.1090088613943099, + "learning_rate": 7.213709345311446e-05, + "loss": 1.2912, + "step": 669 + }, + { + "epoch": 1.415258711721225, + "grad_norm": 1.283801427667681, + "learning_rate": 7.210189842351423e-05, + "loss": 1.2983, + "step": 670 + }, + { + "epoch": 1.4173706441393876, + "grad_norm": 0.6154480600951517, + "learning_rate": 7.206663342789954e-05, + "loss": 1.2964, + "step": 671 + }, + { + "epoch": 1.41948257655755, + "grad_norm": 1.1327634827953843, + "learning_rate": 7.203129854313038e-05, + "loss": 1.2855, + "step": 672 + }, + { + "epoch": 1.4215945089757127, + "grad_norm": 0.7801473707540039, + "learning_rate": 7.199589384621904e-05, + "loss": 1.2773, + "step": 673 + }, + { + "epoch": 1.4237064413938754, + "grad_norm": 0.539599574954038, + "learning_rate": 7.196041941432998e-05, + "loss": 1.2845, + "step": 674 + }, + { + "epoch": 1.425818373812038, + "grad_norm": 0.48439972104513523, + "learning_rate": 7.192487532477962e-05, + "loss": 1.2793, + "step": 675 + }, + { + "epoch": 1.4279303062302007, + "grad_norm": 0.4254997696299013, + "learning_rate": 7.188926165503625e-05, + "loss": 1.298, + "step": 676 + }, + { + "epoch": 1.4300422386483633, + "grad_norm": 0.525525126641831, + "learning_rate": 7.185357848271978e-05, + "loss": 1.2828, + "step": 677 + }, + { + "epoch": 1.432154171066526, + "grad_norm": 0.5463974113855428, + "learning_rate": 7.181782588560157e-05, + "loss": 1.2751, + "step": 678 + }, + { + "epoch": 1.4342661034846884, + "grad_norm": 0.43138033400283227, + "learning_rate": 7.178200394160439e-05, + "loss": 1.2759, + "step": 679 + }, + { + "epoch": 1.436378035902851, + "grad_norm": 0.38191722344123696, + "learning_rate": 7.174611272880203e-05, + "loss": 1.28, + "step": 680 + }, + { + "epoch": 1.4384899683210137, + "grad_norm": 0.3941853139176621, + "learning_rate": 7.17101523254193e-05, + "loss": 1.2809, + "step": 681 + }, + { + "epoch": 1.4406019007391764, + "grad_norm": 0.3712685903359914, + "learning_rate": 7.167412280983186e-05, + "loss": 1.2889, + "step": 682 + }, + { + "epoch": 1.442713833157339, + "grad_norm": 0.35448277578851217, + "learning_rate": 7.163802426056594e-05, + "loss": 1.2758, + "step": 683 + }, + { + "epoch": 1.4448257655755015, + "grad_norm": 0.32093578720388627, + "learning_rate": 7.160185675629824e-05, + "loss": 1.2931, + "step": 684 + }, + { + "epoch": 1.4469376979936643, + "grad_norm": 0.3148937106983001, + "learning_rate": 7.156562037585576e-05, + "loss": 1.2741, + "step": 685 + }, + { + "epoch": 1.4490496304118268, + "grad_norm": 0.2784503869996258, + "learning_rate": 7.152931519821559e-05, + "loss": 1.3097, + "step": 686 + }, + { + "epoch": 1.4511615628299894, + "grad_norm": 0.3795391414206824, + "learning_rate": 7.14929413025048e-05, + "loss": 1.2678, + "step": 687 + }, + { + "epoch": 1.453273495248152, + "grad_norm": 0.404641735420034, + "learning_rate": 7.145649876800021e-05, + "loss": 1.2781, + "step": 688 + }, + { + "epoch": 1.4553854276663147, + "grad_norm": 4.467657145479155, + "learning_rate": 7.141998767412824e-05, + "loss": 1.3808, + "step": 689 + }, + { + "epoch": 1.4574973600844774, + "grad_norm": 0.8100481717814243, + "learning_rate": 7.138340810046473e-05, + "loss": 1.2937, + "step": 690 + }, + { + "epoch": 1.4596092925026398, + "grad_norm": 2.4068818082813115, + "learning_rate": 7.13467601267348e-05, + "loss": 1.3063, + "step": 691 + }, + { + "epoch": 1.4617212249208025, + "grad_norm": 1.313211049743803, + "learning_rate": 7.131004383281259e-05, + "loss": 1.2945, + "step": 692 + }, + { + "epoch": 1.4638331573389651, + "grad_norm": 1.5557470074924693, + "learning_rate": 7.12732592987212e-05, + "loss": 1.2795, + "step": 693 + }, + { + "epoch": 1.4659450897571278, + "grad_norm": 0.8826925554302014, + "learning_rate": 7.123640660463244e-05, + "loss": 1.2735, + "step": 694 + }, + { + "epoch": 1.4680570221752904, + "grad_norm": 1.836504725821882, + "learning_rate": 7.119948583086666e-05, + "loss": 1.3136, + "step": 695 + }, + { + "epoch": 1.470168954593453, + "grad_norm": 1.2204371811789174, + "learning_rate": 7.116249705789262e-05, + "loss": 1.2857, + "step": 696 + }, + { + "epoch": 1.4722808870116157, + "grad_norm": 1.791240035118859, + "learning_rate": 7.112544036632726e-05, + "loss": 1.3122, + "step": 697 + }, + { + "epoch": 1.4743928194297782, + "grad_norm": 1.4289115518711695, + "learning_rate": 7.108831583693554e-05, + "loss": 1.2916, + "step": 698 + }, + { + "epoch": 1.4765047518479408, + "grad_norm": 1.369750545961057, + "learning_rate": 7.10511235506303e-05, + "loss": 1.2844, + "step": 699 + }, + { + "epoch": 1.4786166842661035, + "grad_norm": 0.8616220958615717, + "learning_rate": 7.101386358847205e-05, + "loss": 1.2911, + "step": 700 + }, + { + "epoch": 1.4807286166842661, + "grad_norm": 1.2786398819912985, + "learning_rate": 7.097653603166878e-05, + "loss": 1.2884, + "step": 701 + }, + { + "epoch": 1.4828405491024288, + "grad_norm": 0.6802077640551776, + "learning_rate": 7.093914096157582e-05, + "loss": 1.2923, + "step": 702 + }, + { + "epoch": 1.4849524815205912, + "grad_norm": 0.8996074862687778, + "learning_rate": 7.090167845969564e-05, + "loss": 1.2948, + "step": 703 + }, + { + "epoch": 1.4870644139387539, + "grad_norm": 0.7032659018911217, + "learning_rate": 7.086414860767767e-05, + "loss": 1.2983, + "step": 704 + }, + { + "epoch": 1.4891763463569165, + "grad_norm": 0.5402753239737249, + "learning_rate": 7.082655148731816e-05, + "loss": 1.2743, + "step": 705 + }, + { + "epoch": 1.4912882787750792, + "grad_norm": 0.6669177639888252, + "learning_rate": 7.078888718055992e-05, + "loss": 1.2757, + "step": 706 + }, + { + "epoch": 1.4934002111932418, + "grad_norm": 2.4203924727356636, + "learning_rate": 7.075115576949224e-05, + "loss": 1.2903, + "step": 707 + }, + { + "epoch": 1.4955121436114045, + "grad_norm": 0.8698029922968441, + "learning_rate": 7.071335733635063e-05, + "loss": 1.3005, + "step": 708 + }, + { + "epoch": 1.4976240760295672, + "grad_norm": 0.9249953977514133, + "learning_rate": 7.06754919635167e-05, + "loss": 1.2869, + "step": 709 + }, + { + "epoch": 1.4997360084477296, + "grad_norm": 0.48290509030852746, + "learning_rate": 7.063755973351794e-05, + "loss": 1.2988, + "step": 710 + }, + { + "epoch": 1.5018479408658922, + "grad_norm": 0.7735467814354282, + "learning_rate": 7.059956072902756e-05, + "loss": 1.2933, + "step": 711 + }, + { + "epoch": 1.503959873284055, + "grad_norm": 0.5470322570675089, + "learning_rate": 7.05614950328643e-05, + "loss": 1.2754, + "step": 712 + }, + { + "epoch": 1.5060718057022175, + "grad_norm": 0.5071142069011031, + "learning_rate": 7.052336272799227e-05, + "loss": 1.2852, + "step": 713 + }, + { + "epoch": 1.5081837381203802, + "grad_norm": 0.5887988109677051, + "learning_rate": 7.048516389752072e-05, + "loss": 1.2908, + "step": 714 + }, + { + "epoch": 1.5102956705385426, + "grad_norm": 0.42873224004003097, + "learning_rate": 7.044689862470393e-05, + "loss": 1.3031, + "step": 715 + }, + { + "epoch": 1.5124076029567055, + "grad_norm": 0.49992899642310573, + "learning_rate": 7.040856699294098e-05, + "loss": 1.2652, + "step": 716 + }, + { + "epoch": 1.514519535374868, + "grad_norm": 0.4212353438225059, + "learning_rate": 7.037016908577555e-05, + "loss": 1.2739, + "step": 717 + }, + { + "epoch": 1.5166314677930306, + "grad_norm": 0.41994023191503416, + "learning_rate": 7.033170498689581e-05, + "loss": 1.2755, + "step": 718 + }, + { + "epoch": 1.5187434002111933, + "grad_norm": 0.34392346569624044, + "learning_rate": 7.029317478013419e-05, + "loss": 1.2846, + "step": 719 + }, + { + "epoch": 1.520855332629356, + "grad_norm": 0.42294357606346183, + "learning_rate": 7.025457854946715e-05, + "loss": 1.2787, + "step": 720 + }, + { + "epoch": 1.5229672650475186, + "grad_norm": 0.3377994868433744, + "learning_rate": 7.021591637901511e-05, + "loss": 1.2864, + "step": 721 + }, + { + "epoch": 1.525079197465681, + "grad_norm": 0.29092170648929383, + "learning_rate": 7.017718835304219e-05, + "loss": 1.2856, + "step": 722 + }, + { + "epoch": 1.5271911298838439, + "grad_norm": 0.37853643835081957, + "learning_rate": 7.013839455595602e-05, + "loss": 1.2901, + "step": 723 + }, + { + "epoch": 1.5293030623020063, + "grad_norm": 0.3630595279649805, + "learning_rate": 7.009953507230758e-05, + "loss": 1.2741, + "step": 724 + }, + { + "epoch": 1.531414994720169, + "grad_norm": 0.35314210849743743, + "learning_rate": 7.006060998679106e-05, + "loss": 1.2791, + "step": 725 + }, + { + "epoch": 1.5335269271383316, + "grad_norm": 0.27021176022765303, + "learning_rate": 7.002161938424356e-05, + "loss": 1.2715, + "step": 726 + }, + { + "epoch": 1.535638859556494, + "grad_norm": 0.2617634675253953, + "learning_rate": 6.998256334964506e-05, + "loss": 1.2773, + "step": 727 + }, + { + "epoch": 1.537750791974657, + "grad_norm": 0.2728357371598455, + "learning_rate": 6.994344196811803e-05, + "loss": 1.2675, + "step": 728 + }, + { + "epoch": 1.5398627243928193, + "grad_norm": 0.23730909738308, + "learning_rate": 6.990425532492747e-05, + "loss": 1.2833, + "step": 729 + }, + { + "epoch": 1.541974656810982, + "grad_norm": 0.23106144532710876, + "learning_rate": 6.98650035054806e-05, + "loss": 1.2631, + "step": 730 + }, + { + "epoch": 1.5440865892291447, + "grad_norm": 0.679051848153465, + "learning_rate": 6.982568659532663e-05, + "loss": 1.2876, + "step": 731 + }, + { + "epoch": 1.5461985216473073, + "grad_norm": 0.20617652720475904, + "learning_rate": 6.97863046801567e-05, + "loss": 1.268, + "step": 732 + }, + { + "epoch": 1.54831045406547, + "grad_norm": 0.24790412035204185, + "learning_rate": 6.97468578458036e-05, + "loss": 1.2646, + "step": 733 + }, + { + "epoch": 1.5504223864836324, + "grad_norm": 0.22160760349555336, + "learning_rate": 6.97073461782416e-05, + "loss": 1.2943, + "step": 734 + }, + { + "epoch": 1.5525343189017953, + "grad_norm": 0.19171321226486226, + "learning_rate": 6.966776976358631e-05, + "loss": 1.2895, + "step": 735 + }, + { + "epoch": 1.5546462513199577, + "grad_norm": 0.21752576511557972, + "learning_rate": 6.962812868809443e-05, + "loss": 1.2726, + "step": 736 + }, + { + "epoch": 1.5567581837381204, + "grad_norm": 0.22446028766348675, + "learning_rate": 6.95884230381636e-05, + "loss": 1.2873, + "step": 737 + }, + { + "epoch": 1.558870116156283, + "grad_norm": 0.22397311028974853, + "learning_rate": 6.954865290033218e-05, + "loss": 1.2824, + "step": 738 + }, + { + "epoch": 1.5609820485744457, + "grad_norm": 0.26733897242530225, + "learning_rate": 6.950881836127909e-05, + "loss": 1.2919, + "step": 739 + }, + { + "epoch": 1.5630939809926083, + "grad_norm": 0.29773363136035647, + "learning_rate": 6.946891950782363e-05, + "loss": 1.276, + "step": 740 + }, + { + "epoch": 1.5652059134107708, + "grad_norm": 0.20776919900626573, + "learning_rate": 6.942895642692527e-05, + "loss": 1.2787, + "step": 741 + }, + { + "epoch": 1.5673178458289336, + "grad_norm": 0.2465488980585706, + "learning_rate": 6.938892920568343e-05, + "loss": 1.2687, + "step": 742 + }, + { + "epoch": 1.569429778247096, + "grad_norm": 0.23429357268990428, + "learning_rate": 6.934883793133737e-05, + "loss": 1.2772, + "step": 743 + }, + { + "epoch": 1.5715417106652587, + "grad_norm": 0.28722331852376787, + "learning_rate": 6.930868269126591e-05, + "loss": 1.2837, + "step": 744 + }, + { + "epoch": 1.5736536430834214, + "grad_norm": 0.18922849890338658, + "learning_rate": 6.926846357298733e-05, + "loss": 1.2779, + "step": 745 + }, + { + "epoch": 1.5757655755015838, + "grad_norm": 0.19543932497694178, + "learning_rate": 6.922818066415907e-05, + "loss": 1.27, + "step": 746 + }, + { + "epoch": 1.5778775079197467, + "grad_norm": 0.20531800561135824, + "learning_rate": 6.918783405257767e-05, + "loss": 1.2812, + "step": 747 + }, + { + "epoch": 1.5799894403379091, + "grad_norm": 0.2298782338241903, + "learning_rate": 6.914742382617847e-05, + "loss": 1.2846, + "step": 748 + }, + { + "epoch": 1.5821013727560718, + "grad_norm": 0.214993620246805, + "learning_rate": 6.910695007303546e-05, + "loss": 1.294, + "step": 749 + }, + { + "epoch": 1.5842133051742344, + "grad_norm": 0.1741107583839997, + "learning_rate": 6.906641288136109e-05, + "loss": 1.2751, + "step": 750 + }, + { + "epoch": 1.586325237592397, + "grad_norm": 0.1976600814048005, + "learning_rate": 6.902581233950608e-05, + "loss": 1.2691, + "step": 751 + }, + { + "epoch": 1.5884371700105597, + "grad_norm": 0.28246960558427325, + "learning_rate": 6.898514853595923e-05, + "loss": 1.295, + "step": 752 + }, + { + "epoch": 1.5905491024287222, + "grad_norm": 0.22179176867353234, + "learning_rate": 6.89444215593472e-05, + "loss": 1.2786, + "step": 753 + }, + { + "epoch": 1.592661034846885, + "grad_norm": 0.21377311656555895, + "learning_rate": 6.890363149843434e-05, + "loss": 1.2833, + "step": 754 + }, + { + "epoch": 1.5947729672650475, + "grad_norm": 0.19606414083417922, + "learning_rate": 6.88627784421225e-05, + "loss": 1.2723, + "step": 755 + }, + { + "epoch": 1.5968848996832101, + "grad_norm": 0.19732017488067358, + "learning_rate": 6.882186247945081e-05, + "loss": 1.2595, + "step": 756 + }, + { + "epoch": 1.5989968321013728, + "grad_norm": 0.24181799812150206, + "learning_rate": 6.878088369959553e-05, + "loss": 1.2736, + "step": 757 + }, + { + "epoch": 1.6011087645195352, + "grad_norm": 0.19406560772938114, + "learning_rate": 6.87398421918698e-05, + "loss": 1.2748, + "step": 758 + }, + { + "epoch": 1.603220696937698, + "grad_norm": 0.2134885951463712, + "learning_rate": 6.869873804572352e-05, + "loss": 1.282, + "step": 759 + }, + { + "epoch": 1.6053326293558605, + "grad_norm": 0.22098971465039285, + "learning_rate": 6.865757135074309e-05, + "loss": 1.2703, + "step": 760 + }, + { + "epoch": 1.6074445617740234, + "grad_norm": 0.27754559577971377, + "learning_rate": 6.861634219665118e-05, + "loss": 1.2727, + "step": 761 + }, + { + "epoch": 1.6095564941921858, + "grad_norm": 0.33042159954776607, + "learning_rate": 6.857505067330667e-05, + "loss": 1.2856, + "step": 762 + }, + { + "epoch": 1.6116684266103485, + "grad_norm": 0.23503909769098338, + "learning_rate": 6.853369687070433e-05, + "loss": 1.2737, + "step": 763 + }, + { + "epoch": 1.6137803590285111, + "grad_norm": 0.18624476924219, + "learning_rate": 6.84922808789747e-05, + "loss": 1.2834, + "step": 764 + }, + { + "epoch": 1.6158922914466736, + "grad_norm": 0.21759001832493485, + "learning_rate": 6.845080278838382e-05, + "loss": 1.2744, + "step": 765 + }, + { + "epoch": 1.6180042238648364, + "grad_norm": 0.21773047648973404, + "learning_rate": 6.840926268933308e-05, + "loss": 1.2855, + "step": 766 + }, + { + "epoch": 1.6201161562829989, + "grad_norm": 0.24863657206156972, + "learning_rate": 6.836766067235906e-05, + "loss": 1.264, + "step": 767 + }, + { + "epoch": 1.6222280887011615, + "grad_norm": 0.18753181213659226, + "learning_rate": 6.832599682813324e-05, + "loss": 1.262, + "step": 768 + }, + { + "epoch": 1.6243400211193242, + "grad_norm": 0.18665730649633688, + "learning_rate": 6.828427124746191e-05, + "loss": 1.2944, + "step": 769 + }, + { + "epoch": 1.6264519535374868, + "grad_norm": 0.19137923955295882, + "learning_rate": 6.824248402128584e-05, + "loss": 1.2624, + "step": 770 + }, + { + "epoch": 1.6285638859556495, + "grad_norm": 0.1994957398032779, + "learning_rate": 6.820063524068018e-05, + "loss": 1.277, + "step": 771 + }, + { + "epoch": 1.630675818373812, + "grad_norm": 0.1819500391066722, + "learning_rate": 6.81587249968543e-05, + "loss": 1.2661, + "step": 772 + }, + { + "epoch": 1.6327877507919748, + "grad_norm": 0.20956202210627548, + "learning_rate": 6.811675338115147e-05, + "loss": 1.2718, + "step": 773 + }, + { + "epoch": 1.6348996832101372, + "grad_norm": 0.21862944188405084, + "learning_rate": 6.807472048504871e-05, + "loss": 1.2943, + "step": 774 + }, + { + "epoch": 1.6370116156282999, + "grad_norm": 0.22541077670190493, + "learning_rate": 6.803262640015663e-05, + "loss": 1.2678, + "step": 775 + }, + { + "epoch": 1.6391235480464625, + "grad_norm": 0.23737311209278808, + "learning_rate": 6.79904712182192e-05, + "loss": 1.2718, + "step": 776 + }, + { + "epoch": 1.641235480464625, + "grad_norm": 0.20168760341896647, + "learning_rate": 6.794825503111352e-05, + "loss": 1.2785, + "step": 777 + }, + { + "epoch": 1.6433474128827879, + "grad_norm": 0.23163655490327442, + "learning_rate": 6.790597793084968e-05, + "loss": 1.278, + "step": 778 + }, + { + "epoch": 1.6454593453009503, + "grad_norm": 0.2211585258788502, + "learning_rate": 6.786364000957057e-05, + "loss": 1.2761, + "step": 779 + }, + { + "epoch": 1.647571277719113, + "grad_norm": 0.22463598649758829, + "learning_rate": 6.782124135955153e-05, + "loss": 1.2757, + "step": 780 + }, + { + "epoch": 1.6496832101372756, + "grad_norm": 0.21004305527494382, + "learning_rate": 6.777878207320035e-05, + "loss": 1.2771, + "step": 781 + }, + { + "epoch": 1.6517951425554382, + "grad_norm": 0.2444016364035034, + "learning_rate": 6.773626224305694e-05, + "loss": 1.281, + "step": 782 + }, + { + "epoch": 1.653907074973601, + "grad_norm": 0.2525609640130888, + "learning_rate": 6.769368196179319e-05, + "loss": 1.2787, + "step": 783 + }, + { + "epoch": 1.6560190073917633, + "grad_norm": 0.23360022984424214, + "learning_rate": 6.765104132221272e-05, + "loss": 1.2835, + "step": 784 + }, + { + "epoch": 1.6581309398099262, + "grad_norm": 0.23532536569909107, + "learning_rate": 6.760834041725068e-05, + "loss": 1.278, + "step": 785 + }, + { + "epoch": 1.6602428722280886, + "grad_norm": 0.18846114634223005, + "learning_rate": 6.756557933997363e-05, + "loss": 1.2779, + "step": 786 + }, + { + "epoch": 1.6623548046462513, + "grad_norm": 1.1113827453495886, + "learning_rate": 6.752275818357923e-05, + "loss": 1.3087, + "step": 787 + }, + { + "epoch": 1.664466737064414, + "grad_norm": 0.24566817876254954, + "learning_rate": 6.747987704139607e-05, + "loss": 1.2877, + "step": 788 + }, + { + "epoch": 1.6665786694825766, + "grad_norm": 0.2615668411486279, + "learning_rate": 6.743693600688353e-05, + "loss": 1.2776, + "step": 789 + }, + { + "epoch": 1.6686906019007393, + "grad_norm": 0.25940959613011355, + "learning_rate": 6.73939351736315e-05, + "loss": 1.2703, + "step": 790 + }, + { + "epoch": 1.6708025343189017, + "grad_norm": 0.2420012517004281, + "learning_rate": 6.735087463536017e-05, + "loss": 1.2823, + "step": 791 + }, + { + "epoch": 1.6729144667370646, + "grad_norm": 0.24699707373117374, + "learning_rate": 6.73077544859199e-05, + "loss": 1.29, + "step": 792 + }, + { + "epoch": 1.675026399155227, + "grad_norm": 0.34119050383870536, + "learning_rate": 6.726457481929096e-05, + "loss": 1.2952, + "step": 793 + }, + { + "epoch": 1.6771383315733897, + "grad_norm": 0.3456267970240915, + "learning_rate": 6.722133572958333e-05, + "loss": 1.2648, + "step": 794 + }, + { + "epoch": 1.6792502639915523, + "grad_norm": 0.35349544496679774, + "learning_rate": 6.717803731103653e-05, + "loss": 1.2859, + "step": 795 + }, + { + "epoch": 1.6813621964097147, + "grad_norm": 0.7599419342604081, + "learning_rate": 6.713467965801933e-05, + "loss": 1.3006, + "step": 796 + }, + { + "epoch": 1.6834741288278776, + "grad_norm": 0.3504423046309135, + "learning_rate": 6.709126286502965e-05, + "loss": 1.2721, + "step": 797 + }, + { + "epoch": 1.68558606124604, + "grad_norm": 0.39659555921977124, + "learning_rate": 6.70477870266943e-05, + "loss": 1.2692, + "step": 798 + }, + { + "epoch": 1.6876979936642027, + "grad_norm": 0.3483844507047085, + "learning_rate": 6.700425223776876e-05, + "loss": 1.2766, + "step": 799 + }, + { + "epoch": 1.6898099260823654, + "grad_norm": 0.3322222399317103, + "learning_rate": 6.696065859313699e-05, + "loss": 1.2875, + "step": 800 + }, + { + "epoch": 1.691921858500528, + "grad_norm": 0.3044793854248822, + "learning_rate": 6.691700618781127e-05, + "loss": 1.2861, + "step": 801 + }, + { + "epoch": 1.6940337909186907, + "grad_norm": 0.3498581250316946, + "learning_rate": 6.687329511693192e-05, + "loss": 1.2754, + "step": 802 + }, + { + "epoch": 1.696145723336853, + "grad_norm": 0.2981644695869502, + "learning_rate": 6.682952547576707e-05, + "loss": 1.2692, + "step": 803 + }, + { + "epoch": 1.698257655755016, + "grad_norm": 0.2339645754233329, + "learning_rate": 6.678569735971261e-05, + "loss": 1.2933, + "step": 804 + }, + { + "epoch": 1.7003695881731784, + "grad_norm": 0.24947310724991595, + "learning_rate": 6.674181086429178e-05, + "loss": 1.2868, + "step": 805 + }, + { + "epoch": 1.702481520591341, + "grad_norm": 0.34713412494361795, + "learning_rate": 6.669786608515512e-05, + "loss": 1.2889, + "step": 806 + }, + { + "epoch": 1.7045934530095037, + "grad_norm": 0.34381840393346413, + "learning_rate": 6.665386311808017e-05, + "loss": 1.291, + "step": 807 + }, + { + "epoch": 1.7067053854276664, + "grad_norm": 0.33439420115330215, + "learning_rate": 6.660980205897128e-05, + "loss": 1.2692, + "step": 808 + }, + { + "epoch": 1.708817317845829, + "grad_norm": 0.2643065725018699, + "learning_rate": 6.656568300385945e-05, + "loss": 1.2639, + "step": 809 + }, + { + "epoch": 1.7109292502639915, + "grad_norm": 0.1973950414051402, + "learning_rate": 6.652150604890206e-05, + "loss": 1.2601, + "step": 810 + }, + { + "epoch": 1.7130411826821543, + "grad_norm": 0.22674270567137866, + "learning_rate": 6.647727129038266e-05, + "loss": 1.283, + "step": 811 + }, + { + "epoch": 1.7151531151003168, + "grad_norm": 0.3592576409053149, + "learning_rate": 6.643297882471084e-05, + "loss": 1.2756, + "step": 812 + }, + { + "epoch": 1.7172650475184794, + "grad_norm": 0.39167197486494204, + "learning_rate": 6.63886287484219e-05, + "loss": 1.27, + "step": 813 + }, + { + "epoch": 1.719376979936642, + "grad_norm": 0.3110452305979911, + "learning_rate": 6.634422115817673e-05, + "loss": 1.2816, + "step": 814 + }, + { + "epoch": 1.7214889123548045, + "grad_norm": 0.445273259678396, + "learning_rate": 6.629975615076158e-05, + "loss": 1.2919, + "step": 815 + }, + { + "epoch": 1.7236008447729674, + "grad_norm": 0.22738954860856714, + "learning_rate": 6.625523382308784e-05, + "loss": 1.3034, + "step": 816 + }, + { + "epoch": 1.7257127771911298, + "grad_norm": 0.35015749572526067, + "learning_rate": 6.621065427219181e-05, + "loss": 1.2892, + "step": 817 + }, + { + "epoch": 1.7278247096092925, + "grad_norm": 0.35658811064902657, + "learning_rate": 6.616601759523453e-05, + "loss": 1.273, + "step": 818 + }, + { + "epoch": 1.7299366420274551, + "grad_norm": 0.6229517913499475, + "learning_rate": 6.612132388950152e-05, + "loss": 1.2717, + "step": 819 + }, + { + "epoch": 1.7320485744456178, + "grad_norm": 0.34837817671837257, + "learning_rate": 6.607657325240263e-05, + "loss": 1.2666, + "step": 820 + }, + { + "epoch": 1.7341605068637804, + "grad_norm": 0.27952841671039724, + "learning_rate": 6.603176578147174e-05, + "loss": 1.2843, + "step": 821 + }, + { + "epoch": 1.7362724392819429, + "grad_norm": 0.2202581987470998, + "learning_rate": 6.598690157436668e-05, + "loss": 1.276, + "step": 822 + }, + { + "epoch": 1.7383843717001057, + "grad_norm": 0.258470012508921, + "learning_rate": 6.594198072886885e-05, + "loss": 1.2866, + "step": 823 + }, + { + "epoch": 1.7404963041182682, + "grad_norm": 0.2908376254282354, + "learning_rate": 6.589700334288314e-05, + "loss": 1.272, + "step": 824 + }, + { + "epoch": 1.7426082365364308, + "grad_norm": 0.983653700073602, + "learning_rate": 6.585196951443763e-05, + "loss": 1.2915, + "step": 825 + }, + { + "epoch": 1.7447201689545935, + "grad_norm": 0.28424596828372284, + "learning_rate": 6.580687934168352e-05, + "loss": 1.2776, + "step": 826 + }, + { + "epoch": 1.746832101372756, + "grad_norm": 0.29459436934540817, + "learning_rate": 6.576173292289467e-05, + "loss": 1.2804, + "step": 827 + }, + { + "epoch": 1.7489440337909188, + "grad_norm": 0.3439079175917186, + "learning_rate": 6.571653035646762e-05, + "loss": 1.2784, + "step": 828 + }, + { + "epoch": 1.7510559662090812, + "grad_norm": 0.3682932594889793, + "learning_rate": 6.567127174092127e-05, + "loss": 1.2787, + "step": 829 + }, + { + "epoch": 1.753167898627244, + "grad_norm": 0.38513529893272264, + "learning_rate": 6.562595717489666e-05, + "loss": 1.2722, + "step": 830 + }, + { + "epoch": 1.7552798310454065, + "grad_norm": 0.4315827593313158, + "learning_rate": 6.558058675715678e-05, + "loss": 1.2871, + "step": 831 + }, + { + "epoch": 1.7573917634635692, + "grad_norm": 0.2932779887163237, + "learning_rate": 6.553516058658636e-05, + "loss": 1.3033, + "step": 832 + }, + { + "epoch": 1.7595036958817318, + "grad_norm": 0.2558916666268171, + "learning_rate": 6.548967876219163e-05, + "loss": 1.2571, + "step": 833 + }, + { + "epoch": 1.7616156282998943, + "grad_norm": 0.25813055132040713, + "learning_rate": 6.544414138310014e-05, + "loss": 1.2852, + "step": 834 + }, + { + "epoch": 1.7637275607180571, + "grad_norm": 0.23434598182411936, + "learning_rate": 6.539854854856048e-05, + "loss": 1.2747, + "step": 835 + }, + { + "epoch": 1.7658394931362196, + "grad_norm": 0.30536404373729326, + "learning_rate": 6.535290035794213e-05, + "loss": 1.2696, + "step": 836 + }, + { + "epoch": 1.7679514255543822, + "grad_norm": 0.2401786178245981, + "learning_rate": 6.530719691073523e-05, + "loss": 1.2797, + "step": 837 + }, + { + "epoch": 1.7700633579725449, + "grad_norm": 0.2514036403723258, + "learning_rate": 6.526143830655033e-05, + "loss": 1.2806, + "step": 838 + }, + { + "epoch": 1.7721752903907075, + "grad_norm": 0.2647941450048387, + "learning_rate": 6.521562464511821e-05, + "loss": 1.2798, + "step": 839 + }, + { + "epoch": 1.7742872228088702, + "grad_norm": 0.21497335694760678, + "learning_rate": 6.516975602628964e-05, + "loss": 1.306, + "step": 840 + }, + { + "epoch": 1.7763991552270326, + "grad_norm": 0.23009588674515416, + "learning_rate": 6.512383255003517e-05, + "loss": 1.2781, + "step": 841 + }, + { + "epoch": 1.7785110876451955, + "grad_norm": 0.23886517048681297, + "learning_rate": 6.507785431644491e-05, + "loss": 1.2888, + "step": 842 + }, + { + "epoch": 1.780623020063358, + "grad_norm": 0.2624099890949477, + "learning_rate": 6.503182142572831e-05, + "loss": 1.2902, + "step": 843 + }, + { + "epoch": 1.7827349524815206, + "grad_norm": 0.24998408001680877, + "learning_rate": 6.498573397821396e-05, + "loss": 1.2743, + "step": 844 + }, + { + "epoch": 1.7848468848996832, + "grad_norm": 0.2570649603268964, + "learning_rate": 6.493959207434934e-05, + "loss": 1.267, + "step": 845 + }, + { + "epoch": 1.7869588173178457, + "grad_norm": 0.21859772009355397, + "learning_rate": 6.489339581470065e-05, + "loss": 1.2776, + "step": 846 + }, + { + "epoch": 1.7890707497360085, + "grad_norm": 0.2195282004685981, + "learning_rate": 6.484714529995247e-05, + "loss": 1.2875, + "step": 847 + }, + { + "epoch": 1.791182682154171, + "grad_norm": 0.24633198667169848, + "learning_rate": 6.480084063090775e-05, + "loss": 1.2849, + "step": 848 + }, + { + "epoch": 1.7932946145723336, + "grad_norm": 0.2720940678910248, + "learning_rate": 6.475448190848741e-05, + "loss": 1.284, + "step": 849 + }, + { + "epoch": 1.7954065469904963, + "grad_norm": 0.24523532328325837, + "learning_rate": 6.470806923373015e-05, + "loss": 1.2765, + "step": 850 + }, + { + "epoch": 1.797518479408659, + "grad_norm": 0.244756249838113, + "learning_rate": 6.46616027077923e-05, + "loss": 1.2732, + "step": 851 + }, + { + "epoch": 1.7996304118268216, + "grad_norm": 0.2231328437458323, + "learning_rate": 6.461508243194754e-05, + "loss": 1.2568, + "step": 852 + }, + { + "epoch": 1.801742344244984, + "grad_norm": 0.241957878127689, + "learning_rate": 6.456850850758673e-05, + "loss": 1.277, + "step": 853 + }, + { + "epoch": 1.803854276663147, + "grad_norm": 0.8304333749273339, + "learning_rate": 6.452188103621759e-05, + "loss": 1.265, + "step": 854 + }, + { + "epoch": 1.8059662090813093, + "grad_norm": 0.24348076434773927, + "learning_rate": 6.447520011946463e-05, + "loss": 1.281, + "step": 855 + }, + { + "epoch": 1.808078141499472, + "grad_norm": 0.4187867327042092, + "learning_rate": 6.442846585906878e-05, + "loss": 1.2775, + "step": 856 + }, + { + "epoch": 1.8101900739176346, + "grad_norm": 0.33834059841736397, + "learning_rate": 6.438167835688726e-05, + "loss": 1.2653, + "step": 857 + }, + { + "epoch": 1.8123020063357973, + "grad_norm": 0.3138666881551426, + "learning_rate": 6.433483771489333e-05, + "loss": 1.2712, + "step": 858 + }, + { + "epoch": 1.81441393875396, + "grad_norm": 0.27757144456733124, + "learning_rate": 6.428794403517606e-05, + "loss": 1.28, + "step": 859 + }, + { + "epoch": 1.8165258711721224, + "grad_norm": 0.27070121358154886, + "learning_rate": 6.424099741994009e-05, + "loss": 1.2745, + "step": 860 + }, + { + "epoch": 1.8186378035902853, + "grad_norm": 0.22567819894273644, + "learning_rate": 6.419399797150551e-05, + "loss": 1.2582, + "step": 861 + }, + { + "epoch": 1.8207497360084477, + "grad_norm": 0.35556668657098167, + "learning_rate": 6.414694579230747e-05, + "loss": 1.2974, + "step": 862 + }, + { + "epoch": 1.8228616684266103, + "grad_norm": 0.22539778660780374, + "learning_rate": 6.409984098489611e-05, + "loss": 1.2746, + "step": 863 + }, + { + "epoch": 1.824973600844773, + "grad_norm": 0.22586828378051058, + "learning_rate": 6.405268365193624e-05, + "loss": 1.27, + "step": 864 + }, + { + "epoch": 1.8270855332629354, + "grad_norm": 0.21466748101528774, + "learning_rate": 6.400547389620716e-05, + "loss": 1.2762, + "step": 865 + }, + { + "epoch": 1.8291974656810983, + "grad_norm": 0.2468905995376267, + "learning_rate": 6.395821182060244e-05, + "loss": 1.2673, + "step": 866 + }, + { + "epoch": 1.8313093980992607, + "grad_norm": 0.24662072439773686, + "learning_rate": 6.391089752812963e-05, + "loss": 1.2667, + "step": 867 + }, + { + "epoch": 1.8334213305174234, + "grad_norm": 0.20237194741411185, + "learning_rate": 6.386353112191017e-05, + "loss": 1.2676, + "step": 868 + }, + { + "epoch": 1.835533262935586, + "grad_norm": 0.2001295793432772, + "learning_rate": 6.3816112705179e-05, + "loss": 1.2917, + "step": 869 + }, + { + "epoch": 1.8376451953537487, + "grad_norm": 0.20224112760905033, + "learning_rate": 6.376864238128447e-05, + "loss": 1.271, + "step": 870 + }, + { + "epoch": 1.8397571277719114, + "grad_norm": 0.23130131510433, + "learning_rate": 6.372112025368802e-05, + "loss": 1.2682, + "step": 871 + }, + { + "epoch": 1.8418690601900738, + "grad_norm": 0.18865132892239897, + "learning_rate": 6.367354642596406e-05, + "loss": 1.2619, + "step": 872 + }, + { + "epoch": 1.8439809926082367, + "grad_norm": 0.21405417870130442, + "learning_rate": 6.362592100179958e-05, + "loss": 1.269, + "step": 873 + }, + { + "epoch": 1.846092925026399, + "grad_norm": 0.19588222467536098, + "learning_rate": 6.357824408499414e-05, + "loss": 1.2909, + "step": 874 + }, + { + "epoch": 1.8482048574445618, + "grad_norm": 0.17116733132993237, + "learning_rate": 6.353051577945945e-05, + "loss": 1.2701, + "step": 875 + }, + { + "epoch": 1.8503167898627244, + "grad_norm": 0.20475389882877484, + "learning_rate": 6.348273618921923e-05, + "loss": 1.2743, + "step": 876 + }, + { + "epoch": 1.852428722280887, + "grad_norm": 0.21406462193332704, + "learning_rate": 6.343490541840899e-05, + "loss": 1.2753, + "step": 877 + }, + { + "epoch": 1.8545406546990497, + "grad_norm": 0.19016954176324458, + "learning_rate": 6.338702357127581e-05, + "loss": 1.2817, + "step": 878 + }, + { + "epoch": 1.8566525871172121, + "grad_norm": 0.17006986052220266, + "learning_rate": 6.333909075217804e-05, + "loss": 1.2581, + "step": 879 + }, + { + "epoch": 1.858764519535375, + "grad_norm": 0.1959809466326926, + "learning_rate": 6.329110706558519e-05, + "loss": 1.2827, + "step": 880 + }, + { + "epoch": 1.8608764519535375, + "grad_norm": 0.22266415529760764, + "learning_rate": 6.324307261607754e-05, + "loss": 1.2661, + "step": 881 + }, + { + "epoch": 1.8629883843717001, + "grad_norm": 0.2434197719567273, + "learning_rate": 6.31949875083461e-05, + "loss": 1.2861, + "step": 882 + }, + { + "epoch": 1.8651003167898628, + "grad_norm": 0.21476252725737421, + "learning_rate": 6.314685184719224e-05, + "loss": 1.2678, + "step": 883 + }, + { + "epoch": 1.8672122492080252, + "grad_norm": 0.17983619277408608, + "learning_rate": 6.309866573752752e-05, + "loss": 1.2608, + "step": 884 + }, + { + "epoch": 1.869324181626188, + "grad_norm": 0.31642591749899646, + "learning_rate": 6.305042928437347e-05, + "loss": 1.2614, + "step": 885 + }, + { + "epoch": 1.8714361140443505, + "grad_norm": 0.20756647606391307, + "learning_rate": 6.300214259286132e-05, + "loss": 1.2702, + "step": 886 + }, + { + "epoch": 1.8735480464625132, + "grad_norm": 0.24133572167853992, + "learning_rate": 6.295380576823182e-05, + "loss": 1.2743, + "step": 887 + }, + { + "epoch": 1.8756599788806758, + "grad_norm": 0.28169154920266476, + "learning_rate": 6.290541891583495e-05, + "loss": 1.2811, + "step": 888 + }, + { + "epoch": 1.8777719112988385, + "grad_norm": 0.25686912974096127, + "learning_rate": 6.285698214112975e-05, + "loss": 1.2655, + "step": 889 + }, + { + "epoch": 1.8798838437170011, + "grad_norm": 0.25398401311767305, + "learning_rate": 6.280849554968406e-05, + "loss": 1.2758, + "step": 890 + }, + { + "epoch": 1.8819957761351636, + "grad_norm": 0.2966553952606397, + "learning_rate": 6.275995924717429e-05, + "loss": 1.2498, + "step": 891 + }, + { + "epoch": 1.8841077085533264, + "grad_norm": 0.2622657311899408, + "learning_rate": 6.271137333938521e-05, + "loss": 1.2696, + "step": 892 + }, + { + "epoch": 1.8862196409714889, + "grad_norm": 0.218248532461383, + "learning_rate": 6.26627379322097e-05, + "loss": 1.2636, + "step": 893 + }, + { + "epoch": 1.8883315733896515, + "grad_norm": 0.25483702764397875, + "learning_rate": 6.26140531316485e-05, + "loss": 1.271, + "step": 894 + }, + { + "epoch": 1.8904435058078142, + "grad_norm": 0.2998688445119493, + "learning_rate": 6.256531904381006e-05, + "loss": 1.2852, + "step": 895 + }, + { + "epoch": 1.8925554382259766, + "grad_norm": 0.34264960856212334, + "learning_rate": 6.251653577491015e-05, + "loss": 1.2698, + "step": 896 + }, + { + "epoch": 1.8946673706441395, + "grad_norm": 0.3774012858632406, + "learning_rate": 6.246770343127186e-05, + "loss": 1.2626, + "step": 897 + }, + { + "epoch": 1.896779303062302, + "grad_norm": 0.3647447881720372, + "learning_rate": 6.241882211932513e-05, + "loss": 1.2543, + "step": 898 + }, + { + "epoch": 1.8988912354804648, + "grad_norm": 0.27958116633267405, + "learning_rate": 6.236989194560667e-05, + "loss": 1.2709, + "step": 899 + }, + { + "epoch": 1.9010031678986272, + "grad_norm": 0.20373508357348444, + "learning_rate": 6.23209130167597e-05, + "loss": 1.2786, + "step": 900 + }, + { + "epoch": 1.9031151003167899, + "grad_norm": 0.20391654565184672, + "learning_rate": 6.227188543953368e-05, + "loss": 1.2801, + "step": 901 + }, + { + "epoch": 1.9052270327349525, + "grad_norm": 0.294300893373528, + "learning_rate": 6.22228093207841e-05, + "loss": 1.2593, + "step": 902 + }, + { + "epoch": 1.907338965153115, + "grad_norm": 0.3042345932817949, + "learning_rate": 6.217368476747223e-05, + "loss": 1.2708, + "step": 903 + }, + { + "epoch": 1.9094508975712778, + "grad_norm": 0.4537221502859941, + "learning_rate": 6.212451188666496e-05, + "loss": 1.3005, + "step": 904 + }, + { + "epoch": 1.9115628299894403, + "grad_norm": 0.27472746295767725, + "learning_rate": 6.207529078553445e-05, + "loss": 1.2666, + "step": 905 + }, + { + "epoch": 1.913674762407603, + "grad_norm": 0.3181053687427052, + "learning_rate": 6.202602157135797e-05, + "loss": 1.2879, + "step": 906 + }, + { + "epoch": 1.9157866948257656, + "grad_norm": 0.2990638425271841, + "learning_rate": 6.197670435151767e-05, + "loss": 1.2797, + "step": 907 + }, + { + "epoch": 1.9178986272439282, + "grad_norm": 0.2826893466791449, + "learning_rate": 6.192733923350032e-05, + "loss": 1.2862, + "step": 908 + }, + { + "epoch": 1.9200105596620909, + "grad_norm": 0.8751895117980362, + "learning_rate": 6.18779263248971e-05, + "loss": 1.3002, + "step": 909 + }, + { + "epoch": 1.9221224920802533, + "grad_norm": 0.22253032664710404, + "learning_rate": 6.182846573340326e-05, + "loss": 1.273, + "step": 910 + }, + { + "epoch": 1.9242344244984162, + "grad_norm": 0.2555711960994068, + "learning_rate": 6.177895756681815e-05, + "loss": 1.2888, + "step": 911 + }, + { + "epoch": 1.9263463569165786, + "grad_norm": 0.2478669127073892, + "learning_rate": 6.172940193304462e-05, + "loss": 1.264, + "step": 912 + }, + { + "epoch": 1.9284582893347413, + "grad_norm": 0.3216611546261824, + "learning_rate": 6.167979894008911e-05, + "loss": 1.2782, + "step": 913 + }, + { + "epoch": 1.930570221752904, + "grad_norm": 0.37835855749460073, + "learning_rate": 6.163014869606122e-05, + "loss": 1.2756, + "step": 914 + }, + { + "epoch": 1.9326821541710664, + "grad_norm": 0.32566209775101357, + "learning_rate": 6.158045130917352e-05, + "loss": 1.2693, + "step": 915 + }, + { + "epoch": 1.9347940865892292, + "grad_norm": 0.2397661581734682, + "learning_rate": 6.153070688774138e-05, + "loss": 1.2664, + "step": 916 + }, + { + "epoch": 1.9369060190073917, + "grad_norm": 0.2401442752120567, + "learning_rate": 6.148091554018265e-05, + "loss": 1.2742, + "step": 917 + }, + { + "epoch": 1.9390179514255543, + "grad_norm": 0.24307007860206983, + "learning_rate": 6.143107737501745e-05, + "loss": 1.2837, + "step": 918 + }, + { + "epoch": 1.941129883843717, + "grad_norm": 0.20355877067445932, + "learning_rate": 6.138119250086793e-05, + "loss": 1.2766, + "step": 919 + }, + { + "epoch": 1.9432418162618796, + "grad_norm": 0.2386192318252359, + "learning_rate": 6.13312610264581e-05, + "loss": 1.276, + "step": 920 + }, + { + "epoch": 1.9453537486800423, + "grad_norm": 0.28043143663328396, + "learning_rate": 6.128128306061347e-05, + "loss": 1.2701, + "step": 921 + }, + { + "epoch": 1.9474656810982047, + "grad_norm": 0.3425427319290159, + "learning_rate": 6.123125871226091e-05, + "loss": 1.2912, + "step": 922 + }, + { + "epoch": 1.9495776135163676, + "grad_norm": 0.3783451159284105, + "learning_rate": 6.118118809042836e-05, + "loss": 1.2718, + "step": 923 + }, + { + "epoch": 1.95168954593453, + "grad_norm": 0.2789886763546799, + "learning_rate": 6.113107130424463e-05, + "loss": 1.2745, + "step": 924 + }, + { + "epoch": 1.9538014783526927, + "grad_norm": 0.24298238074828069, + "learning_rate": 6.108090846293916e-05, + "loss": 1.2805, + "step": 925 + }, + { + "epoch": 1.9559134107708553, + "grad_norm": 0.27271930513705794, + "learning_rate": 6.103069967584172e-05, + "loss": 1.2672, + "step": 926 + }, + { + "epoch": 1.958025343189018, + "grad_norm": 0.2333902212493314, + "learning_rate": 6.098044505238228e-05, + "loss": 1.2802, + "step": 927 + }, + { + "epoch": 1.9601372756071807, + "grad_norm": 0.23232406765406288, + "learning_rate": 6.093014470209065e-05, + "loss": 1.283, + "step": 928 + }, + { + "epoch": 1.962249208025343, + "grad_norm": 0.24510631321984616, + "learning_rate": 6.0879798734596344e-05, + "loss": 1.2905, + "step": 929 + }, + { + "epoch": 1.964361140443506, + "grad_norm": 0.3241311215748556, + "learning_rate": 6.082940725962827e-05, + "loss": 1.2836, + "step": 930 + }, + { + "epoch": 1.9664730728616684, + "grad_norm": 0.2727572199302141, + "learning_rate": 6.077897038701455e-05, + "loss": 1.2688, + "step": 931 + }, + { + "epoch": 1.968585005279831, + "grad_norm": 0.2505430321304983, + "learning_rate": 6.0728488226682205e-05, + "loss": 1.2769, + "step": 932 + }, + { + "epoch": 1.9706969376979937, + "grad_norm": 0.27424228949274115, + "learning_rate": 6.067796088865702e-05, + "loss": 1.2734, + "step": 933 + }, + { + "epoch": 1.9728088701161561, + "grad_norm": 0.2732766441273058, + "learning_rate": 6.062738848306321e-05, + "loss": 1.2751, + "step": 934 + }, + { + "epoch": 1.974920802534319, + "grad_norm": 0.2823736995117376, + "learning_rate": 6.05767711201232e-05, + "loss": 1.2682, + "step": 935 + }, + { + "epoch": 1.9770327349524814, + "grad_norm": 0.4071567125852377, + "learning_rate": 6.052610891015743e-05, + "loss": 1.2931, + "step": 936 + }, + { + "epoch": 1.979144667370644, + "grad_norm": 0.24167166372511595, + "learning_rate": 6.047540196358405e-05, + "loss": 1.2737, + "step": 937 + }, + { + "epoch": 1.9812565997888067, + "grad_norm": 0.29195691293389836, + "learning_rate": 6.042465039091875e-05, + "loss": 1.2675, + "step": 938 + }, + { + "epoch": 1.9833685322069694, + "grad_norm": 0.3072111590376307, + "learning_rate": 6.0373854302774484e-05, + "loss": 1.2802, + "step": 939 + }, + { + "epoch": 1.985480464625132, + "grad_norm": 0.20456589971848202, + "learning_rate": 6.0323013809861185e-05, + "loss": 1.2626, + "step": 940 + }, + { + "epoch": 1.9875923970432945, + "grad_norm": 0.2266291376672384, + "learning_rate": 6.02721290229856e-05, + "loss": 1.271, + "step": 941 + }, + { + "epoch": 1.9897043294614574, + "grad_norm": 0.23774319218905987, + "learning_rate": 6.022120005305101e-05, + "loss": 1.2587, + "step": 942 + }, + { + "epoch": 1.9918162618796198, + "grad_norm": 0.2518709250339977, + "learning_rate": 6.017022701105698e-05, + "loss": 1.2704, + "step": 943 + }, + { + "epoch": 1.9939281942977825, + "grad_norm": 0.22166598001508955, + "learning_rate": 6.011921000809915e-05, + "loss": 1.2523, + "step": 944 + }, + { + "epoch": 1.996040126715945, + "grad_norm": 0.23442717366253985, + "learning_rate": 6.006814915536895e-05, + "loss": 1.2634, + "step": 945 + }, + { + "epoch": 1.9981520591341078, + "grad_norm": 0.25607206914793956, + "learning_rate": 6.001704456415341e-05, + "loss": 1.2672, + "step": 946 + }, + { + "epoch": 2.000527983104541, + "grad_norm": 0.24553797481554412, + "learning_rate": 5.996589634583485e-05, + "loss": 1.2544, + "step": 947 + }, + { + "epoch": 2.0026399155227033, + "grad_norm": 0.24064471608912932, + "learning_rate": 5.9914704611890694e-05, + "loss": 1.2452, + "step": 948 + }, + { + "epoch": 2.0047518479408657, + "grad_norm": 0.2423636029000241, + "learning_rate": 5.986346947389323e-05, + "loss": 1.2447, + "step": 949 + }, + { + "epoch": 2.0068637803590286, + "grad_norm": 0.30735263747301533, + "learning_rate": 5.9812191043509316e-05, + "loss": 1.2546, + "step": 950 + }, + { + "epoch": 2.008975712777191, + "grad_norm": 0.36230263202234414, + "learning_rate": 5.976086943250017e-05, + "loss": 1.2441, + "step": 951 + }, + { + "epoch": 2.011087645195354, + "grad_norm": 0.3225182267838951, + "learning_rate": 5.970950475272112e-05, + "loss": 1.262, + "step": 952 + }, + { + "epoch": 2.0131995776135163, + "grad_norm": 0.2510050030080928, + "learning_rate": 5.9658097116121375e-05, + "loss": 1.2494, + "step": 953 + }, + { + "epoch": 2.015311510031679, + "grad_norm": 0.2518487977519238, + "learning_rate": 5.9606646634743775e-05, + "loss": 1.2618, + "step": 954 + }, + { + "epoch": 2.0174234424498416, + "grad_norm": 0.3394272651452207, + "learning_rate": 5.955515342072451e-05, + "loss": 1.2395, + "step": 955 + }, + { + "epoch": 2.019535374868004, + "grad_norm": 0.3720310911988316, + "learning_rate": 5.950361758629292e-05, + "loss": 1.2659, + "step": 956 + }, + { + "epoch": 2.021647307286167, + "grad_norm": 0.2839470817619634, + "learning_rate": 5.945203924377126e-05, + "loss": 1.2342, + "step": 957 + }, + { + "epoch": 2.0237592397043294, + "grad_norm": 0.2643050607272122, + "learning_rate": 5.9400418505574394e-05, + "loss": 1.2292, + "step": 958 + }, + { + "epoch": 2.0258711721224922, + "grad_norm": 0.3424805668823396, + "learning_rate": 5.9348755484209597e-05, + "loss": 1.2534, + "step": 959 + }, + { + "epoch": 2.0279831045406547, + "grad_norm": 0.3399447635419856, + "learning_rate": 5.929705029227632e-05, + "loss": 1.2622, + "step": 960 + }, + { + "epoch": 2.030095036958817, + "grad_norm": 0.3357916134607048, + "learning_rate": 5.924530304246591e-05, + "loss": 1.2401, + "step": 961 + }, + { + "epoch": 2.03220696937698, + "grad_norm": 0.3268803682782474, + "learning_rate": 5.9193513847561364e-05, + "loss": 1.242, + "step": 962 + }, + { + "epoch": 2.0343189017951424, + "grad_norm": 0.3092994615776028, + "learning_rate": 5.914168282043714e-05, + "loss": 1.2626, + "step": 963 + }, + { + "epoch": 2.0364308342133053, + "grad_norm": 0.33534854700116284, + "learning_rate": 5.908981007405881e-05, + "loss": 1.2495, + "step": 964 + }, + { + "epoch": 2.0385427666314677, + "grad_norm": 0.7664179400772683, + "learning_rate": 5.903789572148295e-05, + "loss": 1.2502, + "step": 965 + }, + { + "epoch": 2.0406546990496306, + "grad_norm": 0.2834695018806711, + "learning_rate": 5.898593987585674e-05, + "loss": 1.2482, + "step": 966 + }, + { + "epoch": 2.042766631467793, + "grad_norm": 0.28742417674458504, + "learning_rate": 5.893394265041783e-05, + "loss": 1.2339, + "step": 967 + }, + { + "epoch": 2.0448785638859555, + "grad_norm": 0.2811491786688612, + "learning_rate": 5.888190415849407e-05, + "loss": 1.2416, + "step": 968 + }, + { + "epoch": 2.0469904963041183, + "grad_norm": 0.3155622029323847, + "learning_rate": 5.882982451350321e-05, + "loss": 1.2502, + "step": 969 + }, + { + "epoch": 2.0491024287222808, + "grad_norm": 0.38831632301603436, + "learning_rate": 5.877770382895275e-05, + "loss": 1.2627, + "step": 970 + }, + { + "epoch": 2.0512143611404436, + "grad_norm": 0.48592365885580646, + "learning_rate": 5.872554221843958e-05, + "loss": 1.2514, + "step": 971 + }, + { + "epoch": 2.053326293558606, + "grad_norm": 0.6207327699454414, + "learning_rate": 5.867333979564982e-05, + "loss": 1.242, + "step": 972 + }, + { + "epoch": 2.055438225976769, + "grad_norm": 0.7443819089005387, + "learning_rate": 5.862109667435854e-05, + "loss": 1.2464, + "step": 973 + }, + { + "epoch": 2.0575501583949314, + "grad_norm": 0.7621810602514473, + "learning_rate": 5.8568812968429466e-05, + "loss": 1.2571, + "step": 974 + }, + { + "epoch": 2.059662090813094, + "grad_norm": 0.5681520397018405, + "learning_rate": 5.8516488791814864e-05, + "loss": 1.2497, + "step": 975 + }, + { + "epoch": 2.0617740232312567, + "grad_norm": 0.2463021907050864, + "learning_rate": 5.846412425855512e-05, + "loss": 1.241, + "step": 976 + }, + { + "epoch": 2.063885955649419, + "grad_norm": 0.36579765573954703, + "learning_rate": 5.8411719482778646e-05, + "loss": 1.2508, + "step": 977 + }, + { + "epoch": 2.065997888067582, + "grad_norm": 0.450767593446072, + "learning_rate": 5.835927457870151e-05, + "loss": 1.2413, + "step": 978 + }, + { + "epoch": 2.0681098204857444, + "grad_norm": 0.39619693065139316, + "learning_rate": 5.830678966062727e-05, + "loss": 1.2461, + "step": 979 + }, + { + "epoch": 2.070221752903907, + "grad_norm": 0.3216075847193361, + "learning_rate": 5.825426484294669e-05, + "loss": 1.2478, + "step": 980 + }, + { + "epoch": 2.0723336853220697, + "grad_norm": 0.27781641111207334, + "learning_rate": 5.820170024013747e-05, + "loss": 1.2604, + "step": 981 + }, + { + "epoch": 2.074445617740232, + "grad_norm": 0.29813143120925995, + "learning_rate": 5.8149095966764065e-05, + "loss": 1.2504, + "step": 982 + }, + { + "epoch": 2.076557550158395, + "grad_norm": 0.3046300352736453, + "learning_rate": 5.8096452137477354e-05, + "loss": 1.2393, + "step": 983 + }, + { + "epoch": 2.0786694825765575, + "grad_norm": 0.2529818487940805, + "learning_rate": 5.804376886701445e-05, + "loss": 1.2648, + "step": 984 + }, + { + "epoch": 2.0807814149947204, + "grad_norm": 0.20360800503004425, + "learning_rate": 5.799104627019841e-05, + "loss": 1.2415, + "step": 985 + }, + { + "epoch": 2.082893347412883, + "grad_norm": 0.24816795453814627, + "learning_rate": 5.7938284461938034e-05, + "loss": 1.2525, + "step": 986 + }, + { + "epoch": 2.085005279831045, + "grad_norm": 0.24907806258513168, + "learning_rate": 5.788548355722755e-05, + "loss": 1.2398, + "step": 987 + }, + { + "epoch": 2.087117212249208, + "grad_norm": 0.20744097182519683, + "learning_rate": 5.783264367114641e-05, + "loss": 1.2453, + "step": 988 + }, + { + "epoch": 2.0892291446673705, + "grad_norm": 0.251570319052998, + "learning_rate": 5.777976491885904e-05, + "loss": 1.2316, + "step": 989 + }, + { + "epoch": 2.0913410770855334, + "grad_norm": 0.23117498543160564, + "learning_rate": 5.7726847415614516e-05, + "loss": 1.2353, + "step": 990 + }, + { + "epoch": 2.093453009503696, + "grad_norm": 0.28388537534618546, + "learning_rate": 5.767389127674647e-05, + "loss": 1.2413, + "step": 991 + }, + { + "epoch": 2.0955649419218587, + "grad_norm": 0.27086911637878275, + "learning_rate": 5.762089661767265e-05, + "loss": 1.2623, + "step": 992 + }, + { + "epoch": 2.097676874340021, + "grad_norm": 0.18739309454863132, + "learning_rate": 5.756786355389482e-05, + "loss": 1.2476, + "step": 993 + }, + { + "epoch": 2.0997888067581836, + "grad_norm": 0.18256190268147293, + "learning_rate": 5.75147922009984e-05, + "loss": 1.2589, + "step": 994 + }, + { + "epoch": 2.1019007391763465, + "grad_norm": 0.1935117804858815, + "learning_rate": 5.74616826746523e-05, + "loss": 1.2564, + "step": 995 + }, + { + "epoch": 2.104012671594509, + "grad_norm": 0.2141518540128478, + "learning_rate": 5.740853509060859e-05, + "loss": 1.2473, + "step": 996 + }, + { + "epoch": 2.1061246040126718, + "grad_norm": 0.20467242862479754, + "learning_rate": 5.735534956470233e-05, + "loss": 1.2562, + "step": 997 + }, + { + "epoch": 2.108236536430834, + "grad_norm": 0.2481433496350472, + "learning_rate": 5.730212621285124e-05, + "loss": 1.2612, + "step": 998 + }, + { + "epoch": 2.1103484688489966, + "grad_norm": 0.23113834846015066, + "learning_rate": 5.72488651510555e-05, + "loss": 1.2427, + "step": 999 + }, + { + "epoch": 2.1124604012671595, + "grad_norm": 0.22111635582458883, + "learning_rate": 5.719556649539748e-05, + "loss": 1.2414, + "step": 1000 + }, + { + "epoch": 2.114572333685322, + "grad_norm": 0.30045367308101867, + "learning_rate": 5.714223036204144e-05, + "loss": 1.2457, + "step": 1001 + }, + { + "epoch": 2.116684266103485, + "grad_norm": 0.2768983456553527, + "learning_rate": 5.7088856867233424e-05, + "loss": 1.2494, + "step": 1002 + }, + { + "epoch": 2.1187961985216472, + "grad_norm": 0.19869667576678127, + "learning_rate": 5.70354461273008e-05, + "loss": 1.2556, + "step": 1003 + }, + { + "epoch": 2.12090813093981, + "grad_norm": 0.15632274063057292, + "learning_rate": 5.6981998258652155e-05, + "loss": 1.2532, + "step": 1004 + }, + { + "epoch": 2.1230200633579726, + "grad_norm": 0.17209274247301762, + "learning_rate": 5.6928513377777004e-05, + "loss": 1.2409, + "step": 1005 + }, + { + "epoch": 2.125131995776135, + "grad_norm": 0.1988685321833541, + "learning_rate": 5.687499160124552e-05, + "loss": 1.2477, + "step": 1006 + }, + { + "epoch": 2.127243928194298, + "grad_norm": 0.22542275444994228, + "learning_rate": 5.6821433045708305e-05, + "loss": 1.2467, + "step": 1007 + }, + { + "epoch": 2.1293558606124603, + "grad_norm": 0.24870004879348137, + "learning_rate": 5.67678378278961e-05, + "loss": 1.2353, + "step": 1008 + }, + { + "epoch": 2.131467793030623, + "grad_norm": 0.2544197070648611, + "learning_rate": 5.6714206064619564e-05, + "loss": 1.2556, + "step": 1009 + }, + { + "epoch": 2.1335797254487856, + "grad_norm": 0.19993034657424727, + "learning_rate": 5.6660537872769e-05, + "loss": 1.2526, + "step": 1010 + }, + { + "epoch": 2.135691657866948, + "grad_norm": 0.17130333842778386, + "learning_rate": 5.66068333693141e-05, + "loss": 1.2356, + "step": 1011 + }, + { + "epoch": 2.137803590285111, + "grad_norm": 0.19982074347183873, + "learning_rate": 5.655309267130371e-05, + "loss": 1.2367, + "step": 1012 + }, + { + "epoch": 2.1399155227032733, + "grad_norm": 0.21737898936533367, + "learning_rate": 5.649931589586557e-05, + "loss": 1.2436, + "step": 1013 + }, + { + "epoch": 2.142027455121436, + "grad_norm": 0.2690985714436049, + "learning_rate": 5.6445503160206045e-05, + "loss": 1.2441, + "step": 1014 + }, + { + "epoch": 2.1441393875395987, + "grad_norm": 0.29262064958745215, + "learning_rate": 5.639165458160985e-05, + "loss": 1.2418, + "step": 1015 + }, + { + "epoch": 2.1462513199577615, + "grad_norm": 0.22294687221057685, + "learning_rate": 5.6337770277439854e-05, + "loss": 1.245, + "step": 1016 + }, + { + "epoch": 2.148363252375924, + "grad_norm": 0.19395839124345676, + "learning_rate": 5.628385036513677e-05, + "loss": 1.2553, + "step": 1017 + }, + { + "epoch": 2.1504751847940864, + "grad_norm": 0.19468865359138895, + "learning_rate": 5.622989496221891e-05, + "loss": 1.2447, + "step": 1018 + }, + { + "epoch": 2.1525871172122493, + "grad_norm": 0.227805473228302, + "learning_rate": 5.617590418628198e-05, + "loss": 1.26, + "step": 1019 + }, + { + "epoch": 2.1546990496304117, + "grad_norm": 0.254988848514442, + "learning_rate": 5.612187815499873e-05, + "loss": 1.246, + "step": 1020 + }, + { + "epoch": 2.1568109820485746, + "grad_norm": 0.17527818769311249, + "learning_rate": 5.606781698611879e-05, + "loss": 1.2512, + "step": 1021 + }, + { + "epoch": 2.158922914466737, + "grad_norm": 0.18244293179055326, + "learning_rate": 5.601372079746833e-05, + "loss": 1.2549, + "step": 1022 + }, + { + "epoch": 2.1610348468849, + "grad_norm": 0.2089712246515339, + "learning_rate": 5.5959589706949895e-05, + "loss": 1.2453, + "step": 1023 + }, + { + "epoch": 2.1631467793030623, + "grad_norm": 0.214157232799511, + "learning_rate": 5.590542383254207e-05, + "loss": 1.2537, + "step": 1024 + }, + { + "epoch": 2.1652587117212247, + "grad_norm": 0.18054044771216582, + "learning_rate": 5.585122329229924e-05, + "loss": 1.2356, + "step": 1025 + }, + { + "epoch": 2.1673706441393876, + "grad_norm": 0.1556681597036634, + "learning_rate": 5.5796988204351356e-05, + "loss": 1.2502, + "step": 1026 + }, + { + "epoch": 2.16948257655755, + "grad_norm": 0.16045959845262117, + "learning_rate": 5.574271868690369e-05, + "loss": 1.2503, + "step": 1027 + }, + { + "epoch": 2.171594508975713, + "grad_norm": 0.476067744863914, + "learning_rate": 5.568841485823652e-05, + "loss": 1.2449, + "step": 1028 + }, + { + "epoch": 2.1737064413938754, + "grad_norm": 0.15535945813498897, + "learning_rate": 5.563407683670491e-05, + "loss": 1.2508, + "step": 1029 + }, + { + "epoch": 2.175818373812038, + "grad_norm": 0.1783560451933153, + "learning_rate": 5.5579704740738474e-05, + "loss": 1.2618, + "step": 1030 + }, + { + "epoch": 2.1779303062302007, + "grad_norm": 0.3860193372348054, + "learning_rate": 5.5525298688841064e-05, + "loss": 1.2549, + "step": 1031 + }, + { + "epoch": 2.180042238648363, + "grad_norm": 0.18867553503708998, + "learning_rate": 5.547085879959054e-05, + "loss": 1.2644, + "step": 1032 + }, + { + "epoch": 2.182154171066526, + "grad_norm": 0.1996135367367514, + "learning_rate": 5.54163851916385e-05, + "loss": 1.2449, + "step": 1033 + }, + { + "epoch": 2.1842661034846884, + "grad_norm": 0.2108948061933035, + "learning_rate": 5.536187798371008e-05, + "loss": 1.2394, + "step": 1034 + }, + { + "epoch": 2.1863780359028513, + "grad_norm": 0.1640578904792832, + "learning_rate": 5.5307337294603595e-05, + "loss": 1.2557, + "step": 1035 + }, + { + "epoch": 2.1884899683210137, + "grad_norm": 0.19987728079195727, + "learning_rate": 5.525276324319037e-05, + "loss": 1.2308, + "step": 1036 + }, + { + "epoch": 2.190601900739176, + "grad_norm": 0.20805330343113868, + "learning_rate": 5.51981559484144e-05, + "loss": 1.2637, + "step": 1037 + }, + { + "epoch": 2.192713833157339, + "grad_norm": 0.22119418609931205, + "learning_rate": 5.514351552929218e-05, + "loss": 1.265, + "step": 1038 + }, + { + "epoch": 2.1948257655755015, + "grad_norm": 0.1708095804422749, + "learning_rate": 5.5088842104912385e-05, + "loss": 1.2472, + "step": 1039 + }, + { + "epoch": 2.1969376979936643, + "grad_norm": 0.29900386600960477, + "learning_rate": 5.50341357944356e-05, + "loss": 1.2504, + "step": 1040 + }, + { + "epoch": 2.1990496304118268, + "grad_norm": 0.2201948492404822, + "learning_rate": 5.497939671709411e-05, + "loss": 1.2392, + "step": 1041 + }, + { + "epoch": 2.2011615628299896, + "grad_norm": 0.1893447660401754, + "learning_rate": 5.492462499219161e-05, + "loss": 1.2415, + "step": 1042 + }, + { + "epoch": 2.203273495248152, + "grad_norm": 0.161482686773741, + "learning_rate": 5.486982073910295e-05, + "loss": 1.2626, + "step": 1043 + }, + { + "epoch": 2.2053854276663145, + "grad_norm": 0.16906064441794516, + "learning_rate": 5.481498407727387e-05, + "loss": 1.2477, + "step": 1044 + }, + { + "epoch": 2.2074973600844774, + "grad_norm": 0.18758900873888765, + "learning_rate": 5.476011512622076e-05, + "loss": 1.2379, + "step": 1045 + }, + { + "epoch": 2.20960929250264, + "grad_norm": 0.21300180473189598, + "learning_rate": 5.470521400553038e-05, + "loss": 1.2609, + "step": 1046 + }, + { + "epoch": 2.2117212249208027, + "grad_norm": 0.21740376989004997, + "learning_rate": 5.4650280834859586e-05, + "loss": 1.2468, + "step": 1047 + }, + { + "epoch": 2.213833157338965, + "grad_norm": 0.2051530389854009, + "learning_rate": 5.4595315733935085e-05, + "loss": 1.2643, + "step": 1048 + }, + { + "epoch": 2.2159450897571276, + "grad_norm": 0.21599174079250838, + "learning_rate": 5.454031882255319e-05, + "loss": 1.2285, + "step": 1049 + }, + { + "epoch": 2.2180570221752904, + "grad_norm": 0.2705770974861844, + "learning_rate": 5.448529022057958e-05, + "loss": 1.2662, + "step": 1050 + }, + { + "epoch": 2.220168954593453, + "grad_norm": 0.20332016876476244, + "learning_rate": 5.443023004794894e-05, + "loss": 1.2414, + "step": 1051 + }, + { + "epoch": 2.2222808870116157, + "grad_norm": 0.17516821279014874, + "learning_rate": 5.4375138424664784e-05, + "loss": 1.2452, + "step": 1052 + }, + { + "epoch": 2.224392819429778, + "grad_norm": 0.18392850223967008, + "learning_rate": 5.4320015470799204e-05, + "loss": 1.2532, + "step": 1053 + }, + { + "epoch": 2.226504751847941, + "grad_norm": 0.2003226566214449, + "learning_rate": 5.4264861306492525e-05, + "loss": 1.2386, + "step": 1054 + }, + { + "epoch": 2.2286166842661035, + "grad_norm": 0.2247984896173137, + "learning_rate": 5.4209676051953137e-05, + "loss": 1.2559, + "step": 1055 + }, + { + "epoch": 2.230728616684266, + "grad_norm": 0.2063192484644091, + "learning_rate": 5.415445982745719e-05, + "loss": 1.2362, + "step": 1056 + }, + { + "epoch": 2.232840549102429, + "grad_norm": 0.24405714813172402, + "learning_rate": 5.4099212753348296e-05, + "loss": 1.2408, + "step": 1057 + }, + { + "epoch": 2.2349524815205912, + "grad_norm": 0.2728079001044463, + "learning_rate": 5.4043934950037337e-05, + "loss": 1.2346, + "step": 1058 + }, + { + "epoch": 2.237064413938754, + "grad_norm": 0.2565276104938986, + "learning_rate": 5.398862653800215e-05, + "loss": 1.2448, + "step": 1059 + }, + { + "epoch": 2.2391763463569165, + "grad_norm": 0.22587163142601074, + "learning_rate": 5.39332876377873e-05, + "loss": 1.2419, + "step": 1060 + }, + { + "epoch": 2.2412882787750794, + "grad_norm": 0.39828901861150157, + "learning_rate": 5.3877918370003816e-05, + "loss": 1.2543, + "step": 1061 + }, + { + "epoch": 2.243400211193242, + "grad_norm": 0.17727302773104608, + "learning_rate": 5.382251885532886e-05, + "loss": 1.2455, + "step": 1062 + }, + { + "epoch": 2.2455121436114043, + "grad_norm": 0.1655621682788704, + "learning_rate": 5.376708921450555e-05, + "loss": 1.2413, + "step": 1063 + }, + { + "epoch": 2.247624076029567, + "grad_norm": 0.21523876564955063, + "learning_rate": 5.371162956834267e-05, + "loss": 1.2356, + "step": 1064 + }, + { + "epoch": 2.2497360084477296, + "grad_norm": 0.1805812758764724, + "learning_rate": 5.365614003771439e-05, + "loss": 1.2468, + "step": 1065 + }, + { + "epoch": 2.2518479408658925, + "grad_norm": 0.19214953840649984, + "learning_rate": 5.360062074356004e-05, + "loss": 1.2447, + "step": 1066 + }, + { + "epoch": 2.253959873284055, + "grad_norm": 0.2374240774845078, + "learning_rate": 5.3545071806883745e-05, + "loss": 1.2659, + "step": 1067 + }, + { + "epoch": 2.2560718057022173, + "grad_norm": 0.17671575868699227, + "learning_rate": 5.3489493348754335e-05, + "loss": 1.2478, + "step": 1068 + }, + { + "epoch": 2.25818373812038, + "grad_norm": 0.17405859001969776, + "learning_rate": 5.3433885490304916e-05, + "loss": 1.2439, + "step": 1069 + }, + { + "epoch": 2.2602956705385426, + "grad_norm": 0.18056582280674702, + "learning_rate": 5.337824835273266e-05, + "loss": 1.2472, + "step": 1070 + }, + { + "epoch": 2.2624076029567055, + "grad_norm": 0.18826561304577497, + "learning_rate": 5.332258205729862e-05, + "loss": 1.2584, + "step": 1071 + }, + { + "epoch": 2.264519535374868, + "grad_norm": 0.21066920618228502, + "learning_rate": 5.326688672532735e-05, + "loss": 1.2387, + "step": 1072 + }, + { + "epoch": 2.2666314677930304, + "grad_norm": 0.27030205975847615, + "learning_rate": 5.321116247820669e-05, + "loss": 1.232, + "step": 1073 + }, + { + "epoch": 2.2687434002111933, + "grad_norm": 0.2652314661203708, + "learning_rate": 5.315540943738752e-05, + "loss": 1.2568, + "step": 1074 + }, + { + "epoch": 2.2708553326293557, + "grad_norm": 0.16964857597034322, + "learning_rate": 5.3099627724383453e-05, + "loss": 1.2394, + "step": 1075 + }, + { + "epoch": 2.2729672650475186, + "grad_norm": 0.24933754575930006, + "learning_rate": 5.304381746077061e-05, + "loss": 1.2351, + "step": 1076 + }, + { + "epoch": 2.275079197465681, + "grad_norm": 0.23882134520829693, + "learning_rate": 5.298797876818735e-05, + "loss": 1.244, + "step": 1077 + }, + { + "epoch": 2.277191129883844, + "grad_norm": 0.16925509612875952, + "learning_rate": 5.293211176833395e-05, + "loss": 1.2275, + "step": 1078 + }, + { + "epoch": 2.2793030623020063, + "grad_norm": 0.1576260348868354, + "learning_rate": 5.287621658297243e-05, + "loss": 1.247, + "step": 1079 + }, + { + "epoch": 2.281414994720169, + "grad_norm": 0.2461828765307337, + "learning_rate": 5.28202933339262e-05, + "loss": 1.2489, + "step": 1080 + }, + { + "epoch": 2.2835269271383316, + "grad_norm": 0.1883317760587402, + "learning_rate": 5.276434214307987e-05, + "loss": 1.234, + "step": 1081 + }, + { + "epoch": 2.285638859556494, + "grad_norm": 0.17863655503832898, + "learning_rate": 5.270836313237892e-05, + "loss": 1.26, + "step": 1082 + }, + { + "epoch": 2.287750791974657, + "grad_norm": 0.18012322120696866, + "learning_rate": 5.2652356423829494e-05, + "loss": 1.2509, + "step": 1083 + }, + { + "epoch": 2.2898627243928193, + "grad_norm": 0.25023689625800205, + "learning_rate": 5.2596322139498076e-05, + "loss": 1.2524, + "step": 1084 + }, + { + "epoch": 2.2919746568109822, + "grad_norm": 0.20413034182277892, + "learning_rate": 5.254026040151126e-05, + "loss": 1.2408, + "step": 1085 + }, + { + "epoch": 2.2940865892291447, + "grad_norm": 0.2666876578793213, + "learning_rate": 5.2484171332055464e-05, + "loss": 1.2506, + "step": 1086 + }, + { + "epoch": 2.296198521647307, + "grad_norm": 0.3440469730993779, + "learning_rate": 5.242805505337671e-05, + "loss": 1.2429, + "step": 1087 + }, + { + "epoch": 2.29831045406547, + "grad_norm": 0.3350436700822338, + "learning_rate": 5.237191168778028e-05, + "loss": 1.2533, + "step": 1088 + }, + { + "epoch": 2.3004223864836324, + "grad_norm": 0.3270808415513926, + "learning_rate": 5.231574135763053e-05, + "loss": 1.2496, + "step": 1089 + }, + { + "epoch": 2.3025343189017953, + "grad_norm": 0.3481745373480984, + "learning_rate": 5.2259544185350545e-05, + "loss": 1.2492, + "step": 1090 + }, + { + "epoch": 2.3046462513199577, + "grad_norm": 0.22730367388302586, + "learning_rate": 5.220332029342196e-05, + "loss": 1.2689, + "step": 1091 + }, + { + "epoch": 2.30675818373812, + "grad_norm": 0.20493607688388416, + "learning_rate": 5.214706980438459e-05, + "loss": 1.2433, + "step": 1092 + }, + { + "epoch": 2.308870116156283, + "grad_norm": 0.2716388087970246, + "learning_rate": 5.2090792840836275e-05, + "loss": 1.2532, + "step": 1093 + }, + { + "epoch": 2.3109820485744454, + "grad_norm": 0.2717925506017584, + "learning_rate": 5.2034489525432516e-05, + "loss": 1.2424, + "step": 1094 + }, + { + "epoch": 2.3130939809926083, + "grad_norm": 0.1761204019499454, + "learning_rate": 5.1978159980886255e-05, + "loss": 1.2486, + "step": 1095 + }, + { + "epoch": 2.3152059134107708, + "grad_norm": 0.22010701228487328, + "learning_rate": 5.192180432996761e-05, + "loss": 1.2496, + "step": 1096 + }, + { + "epoch": 2.3173178458289336, + "grad_norm": 0.26120472661937827, + "learning_rate": 5.18654226955036e-05, + "loss": 1.247, + "step": 1097 + }, + { + "epoch": 2.319429778247096, + "grad_norm": 0.2540546992154857, + "learning_rate": 5.180901520037787e-05, + "loss": 1.2513, + "step": 1098 + }, + { + "epoch": 2.321541710665259, + "grad_norm": 0.25765625862038866, + "learning_rate": 5.1752581967530416e-05, + "loss": 1.2768, + "step": 1099 + }, + { + "epoch": 2.3236536430834214, + "grad_norm": 0.1728416111650792, + "learning_rate": 5.1696123119957346e-05, + "loss": 1.2434, + "step": 1100 + }, + { + "epoch": 2.325765575501584, + "grad_norm": 0.1909793373428338, + "learning_rate": 5.163963878071059e-05, + "loss": 1.2403, + "step": 1101 + }, + { + "epoch": 2.3278775079197467, + "grad_norm": 0.22531951890607435, + "learning_rate": 5.1583129072897624e-05, + "loss": 1.2536, + "step": 1102 + }, + { + "epoch": 2.329989440337909, + "grad_norm": 0.2630715040758262, + "learning_rate": 5.1526594119681255e-05, + "loss": 1.2368, + "step": 1103 + }, + { + "epoch": 2.332101372756072, + "grad_norm": 0.6552173927505186, + "learning_rate": 5.147003404427926e-05, + "loss": 1.2509, + "step": 1104 + }, + { + "epoch": 2.3342133051742344, + "grad_norm": 0.22258366186661344, + "learning_rate": 5.141344896996422e-05, + "loss": 1.2481, + "step": 1105 + }, + { + "epoch": 2.336325237592397, + "grad_norm": 0.3128723491427478, + "learning_rate": 5.135683902006316e-05, + "loss": 1.2606, + "step": 1106 + }, + { + "epoch": 2.3384371700105597, + "grad_norm": 0.3100109057129654, + "learning_rate": 5.1300204317957315e-05, + "loss": 1.2517, + "step": 1107 + }, + { + "epoch": 2.340549102428722, + "grad_norm": 0.27748785248578167, + "learning_rate": 5.124354498708192e-05, + "loss": 1.2502, + "step": 1108 + }, + { + "epoch": 2.342661034846885, + "grad_norm": 0.23366609697723223, + "learning_rate": 5.118686115092585e-05, + "loss": 1.2582, + "step": 1109 + }, + { + "epoch": 2.3447729672650475, + "grad_norm": 0.23278010233922508, + "learning_rate": 5.113015293303139e-05, + "loss": 1.2501, + "step": 1110 + }, + { + "epoch": 2.34688489968321, + "grad_norm": 0.2685786547170507, + "learning_rate": 5.107342045699397e-05, + "loss": 1.264, + "step": 1111 + }, + { + "epoch": 2.348996832101373, + "grad_norm": 0.2375670649167216, + "learning_rate": 5.1016663846461926e-05, + "loss": 1.2714, + "step": 1112 + }, + { + "epoch": 2.351108764519535, + "grad_norm": 0.23217538754895975, + "learning_rate": 5.095988322513611e-05, + "loss": 1.2589, + "step": 1113 + }, + { + "epoch": 2.353220696937698, + "grad_norm": 0.247067879500134, + "learning_rate": 5.0903078716769794e-05, + "loss": 1.2599, + "step": 1114 + }, + { + "epoch": 2.3553326293558605, + "grad_norm": 0.21355571268835735, + "learning_rate": 5.084625044516825e-05, + "loss": 1.2304, + "step": 1115 + }, + { + "epoch": 2.3574445617740234, + "grad_norm": 0.17985048231751646, + "learning_rate": 5.078939853418858e-05, + "loss": 1.2474, + "step": 1116 + }, + { + "epoch": 2.359556494192186, + "grad_norm": 0.1990360399795899, + "learning_rate": 5.07325231077394e-05, + "loss": 1.2559, + "step": 1117 + }, + { + "epoch": 2.3616684266103487, + "grad_norm": 0.19101687497049258, + "learning_rate": 5.067562428978055e-05, + "loss": 1.2647, + "step": 1118 + }, + { + "epoch": 2.363780359028511, + "grad_norm": 0.20990979030593176, + "learning_rate": 5.0618702204322896e-05, + "loss": 1.2497, + "step": 1119 + }, + { + "epoch": 2.3658922914466736, + "grad_norm": 0.2098944730518184, + "learning_rate": 5.0561756975428e-05, + "loss": 1.2418, + "step": 1120 + }, + { + "epoch": 2.3680042238648364, + "grad_norm": 0.23041848456025402, + "learning_rate": 5.050478872720782e-05, + "loss": 1.2486, + "step": 1121 + }, + { + "epoch": 2.370116156282999, + "grad_norm": 0.204523242565548, + "learning_rate": 5.044779758382456e-05, + "loss": 1.2341, + "step": 1122 + }, + { + "epoch": 2.3722280887011618, + "grad_norm": 0.15996793417608082, + "learning_rate": 5.039078366949027e-05, + "loss": 1.2697, + "step": 1123 + }, + { + "epoch": 2.374340021119324, + "grad_norm": 0.19267646661941712, + "learning_rate": 5.033374710846666e-05, + "loss": 1.2404, + "step": 1124 + }, + { + "epoch": 2.3764519535374866, + "grad_norm": 0.28362143811672574, + "learning_rate": 5.027668802506477e-05, + "loss": 1.2579, + "step": 1125 + }, + { + "epoch": 2.3785638859556495, + "grad_norm": 0.21596111271137067, + "learning_rate": 5.021960654364475e-05, + "loss": 1.2436, + "step": 1126 + }, + { + "epoch": 2.380675818373812, + "grad_norm": 0.20094655848119677, + "learning_rate": 5.0162502788615557e-05, + "loss": 1.2492, + "step": 1127 + }, + { + "epoch": 2.382787750791975, + "grad_norm": 0.18154639192503888, + "learning_rate": 5.0105376884434694e-05, + "loss": 1.2631, + "step": 1128 + }, + { + "epoch": 2.3848996832101372, + "grad_norm": 0.2164119658225382, + "learning_rate": 5.0048228955607944e-05, + "loss": 1.2508, + "step": 1129 + }, + { + "epoch": 2.3870116156282997, + "grad_norm": 0.20768439465743524, + "learning_rate": 4.999105912668908e-05, + "loss": 1.2576, + "step": 1130 + }, + { + "epoch": 2.3891235480464625, + "grad_norm": 0.22484403138169642, + "learning_rate": 4.9933867522279624e-05, + "loss": 1.2472, + "step": 1131 + }, + { + "epoch": 2.391235480464625, + "grad_norm": 0.22359069908434262, + "learning_rate": 4.987665426702853e-05, + "loss": 1.2545, + "step": 1132 + }, + { + "epoch": 2.393347412882788, + "grad_norm": 0.2102983794257633, + "learning_rate": 4.981941948563197e-05, + "loss": 1.2283, + "step": 1133 + }, + { + "epoch": 2.3954593453009503, + "grad_norm": 0.1545387520253174, + "learning_rate": 4.9762163302833017e-05, + "loss": 1.256, + "step": 1134 + }, + { + "epoch": 2.397571277719113, + "grad_norm": 0.17393014056057302, + "learning_rate": 4.970488584342141e-05, + "loss": 1.2574, + "step": 1135 + }, + { + "epoch": 2.3996832101372756, + "grad_norm": 0.2510407499922059, + "learning_rate": 4.9647587232233205e-05, + "loss": 1.2472, + "step": 1136 + }, + { + "epoch": 2.4017951425554385, + "grad_norm": 0.4359822178768595, + "learning_rate": 4.959026759415062e-05, + "loss": 1.2666, + "step": 1137 + }, + { + "epoch": 2.403907074973601, + "grad_norm": 0.1785301561813968, + "learning_rate": 4.953292705410166e-05, + "loss": 1.2656, + "step": 1138 + }, + { + "epoch": 2.4060190073917633, + "grad_norm": 0.2866187842881373, + "learning_rate": 4.947556573705991e-05, + "loss": 1.2437, + "step": 1139 + }, + { + "epoch": 2.408130939809926, + "grad_norm": 0.2873026410127267, + "learning_rate": 4.941818376804423e-05, + "loss": 1.2683, + "step": 1140 + }, + { + "epoch": 2.4102428722280886, + "grad_norm": 0.23707614907913305, + "learning_rate": 4.9360781272118494e-05, + "loss": 1.2491, + "step": 1141 + }, + { + "epoch": 2.4123548046462515, + "grad_norm": 0.27946523520702, + "learning_rate": 4.930335837439131e-05, + "loss": 1.2625, + "step": 1142 + }, + { + "epoch": 2.414466737064414, + "grad_norm": 0.27709482664494195, + "learning_rate": 4.924591520001576e-05, + "loss": 1.2486, + "step": 1143 + }, + { + "epoch": 2.4165786694825764, + "grad_norm": 0.22269313213495537, + "learning_rate": 4.9188451874189085e-05, + "loss": 1.2613, + "step": 1144 + }, + { + "epoch": 2.4186906019007393, + "grad_norm": 0.17544385254508066, + "learning_rate": 4.9130968522152485e-05, + "loss": 1.2413, + "step": 1145 + }, + { + "epoch": 2.4208025343189017, + "grad_norm": 0.2071047348199058, + "learning_rate": 4.9073465269190806e-05, + "loss": 1.2571, + "step": 1146 + }, + { + "epoch": 2.4229144667370646, + "grad_norm": 0.2666109501919446, + "learning_rate": 4.9015942240632256e-05, + "loss": 1.2501, + "step": 1147 + }, + { + "epoch": 2.425026399155227, + "grad_norm": 0.5030840816767344, + "learning_rate": 4.8958399561848134e-05, + "loss": 1.2688, + "step": 1148 + }, + { + "epoch": 2.4271383315733894, + "grad_norm": 0.24748859883610416, + "learning_rate": 4.890083735825258e-05, + "loss": 1.2353, + "step": 1149 + }, + { + "epoch": 2.4292502639915523, + "grad_norm": 0.22507118459875936, + "learning_rate": 4.8843255755302285e-05, + "loss": 1.2377, + "step": 1150 + }, + { + "epoch": 2.4313621964097147, + "grad_norm": 0.18889474850439067, + "learning_rate": 4.8785654878496234e-05, + "loss": 1.2367, + "step": 1151 + }, + { + "epoch": 2.4334741288278776, + "grad_norm": 0.18171476809962037, + "learning_rate": 4.8728034853375386e-05, + "loss": 1.2501, + "step": 1152 + }, + { + "epoch": 2.43558606124604, + "grad_norm": 0.2304694685448346, + "learning_rate": 4.867039580552248e-05, + "loss": 1.2443, + "step": 1153 + }, + { + "epoch": 2.437697993664203, + "grad_norm": 0.20632260647592277, + "learning_rate": 4.861273786056165e-05, + "loss": 1.2365, + "step": 1154 + }, + { + "epoch": 2.4398099260823654, + "grad_norm": 0.3322301507471067, + "learning_rate": 4.855506114415829e-05, + "loss": 1.249, + "step": 1155 + }, + { + "epoch": 2.4419218585005282, + "grad_norm": 1.2791992597792383, + "learning_rate": 4.849736578201866e-05, + "loss": 1.2792, + "step": 1156 + }, + { + "epoch": 2.4440337909186907, + "grad_norm": 0.2298539964091799, + "learning_rate": 4.8439651899889696e-05, + "loss": 1.2605, + "step": 1157 + }, + { + "epoch": 2.446145723336853, + "grad_norm": 0.28273316703071977, + "learning_rate": 4.838191962355863e-05, + "loss": 1.2497, + "step": 1158 + }, + { + "epoch": 2.448257655755016, + "grad_norm": 0.21042904989505587, + "learning_rate": 4.832416907885284e-05, + "loss": 1.2515, + "step": 1159 + }, + { + "epoch": 2.4503695881731784, + "grad_norm": 0.17165633003789918, + "learning_rate": 4.8266400391639516e-05, + "loss": 1.2233, + "step": 1160 + }, + { + "epoch": 2.4524815205913413, + "grad_norm": 0.9530719562793561, + "learning_rate": 4.820861368782537e-05, + "loss": 1.2455, + "step": 1161 + }, + { + "epoch": 2.4545934530095037, + "grad_norm": 0.2165751241173494, + "learning_rate": 4.815080909335641e-05, + "loss": 1.256, + "step": 1162 + }, + { + "epoch": 2.456705385427666, + "grad_norm": 0.20833748843344618, + "learning_rate": 4.8092986734217595e-05, + "loss": 1.2321, + "step": 1163 + }, + { + "epoch": 2.458817317845829, + "grad_norm": 1.4891778004503624, + "learning_rate": 4.803514673643265e-05, + "loss": 1.259, + "step": 1164 + }, + { + "epoch": 2.4609292502639915, + "grad_norm": 0.21224925271759607, + "learning_rate": 4.7977289226063704e-05, + "loss": 1.2642, + "step": 1165 + }, + { + "epoch": 2.4630411826821543, + "grad_norm": 0.36887223810039727, + "learning_rate": 4.791941432921105e-05, + "loss": 1.2537, + "step": 1166 + }, + { + "epoch": 2.4651531151003168, + "grad_norm": 0.5114817293854793, + "learning_rate": 4.7861522172012926e-05, + "loss": 1.2481, + "step": 1167 + }, + { + "epoch": 2.467265047518479, + "grad_norm": 0.35831364378927555, + "learning_rate": 4.780361288064514e-05, + "loss": 1.2401, + "step": 1168 + }, + { + "epoch": 2.469376979936642, + "grad_norm": 0.41492253751802605, + "learning_rate": 4.774568658132087e-05, + "loss": 1.2667, + "step": 1169 + }, + { + "epoch": 2.4714889123548045, + "grad_norm": 0.4015541006655434, + "learning_rate": 4.7687743400290334e-05, + "loss": 1.2533, + "step": 1170 + }, + { + "epoch": 2.4736008447729674, + "grad_norm": 0.3792381657310599, + "learning_rate": 4.762978346384057e-05, + "loss": 1.2336, + "step": 1171 + }, + { + "epoch": 2.47571277719113, + "grad_norm": 0.31841715077036975, + "learning_rate": 4.757180689829516e-05, + "loss": 1.2483, + "step": 1172 + }, + { + "epoch": 2.4778247096092927, + "grad_norm": 0.2967641412763141, + "learning_rate": 4.751381383001386e-05, + "loss": 1.2661, + "step": 1173 + }, + { + "epoch": 2.479936642027455, + "grad_norm": 0.2883225209947814, + "learning_rate": 4.745580438539243e-05, + "loss": 1.2638, + "step": 1174 + }, + { + "epoch": 2.4820485744456176, + "grad_norm": 0.2674122423602831, + "learning_rate": 4.739777869086235e-05, + "loss": 1.2618, + "step": 1175 + }, + { + "epoch": 2.4841605068637804, + "grad_norm": 0.3427265434960991, + "learning_rate": 4.7339736872890446e-05, + "loss": 1.2669, + "step": 1176 + }, + { + "epoch": 2.486272439281943, + "grad_norm": 0.22237778127081625, + "learning_rate": 4.728167905797877e-05, + "loss": 1.2369, + "step": 1177 + }, + { + "epoch": 2.4883843717001057, + "grad_norm": 0.22729515936506317, + "learning_rate": 4.722360537266417e-05, + "loss": 1.2491, + "step": 1178 + }, + { + "epoch": 2.490496304118268, + "grad_norm": 0.26856971528392914, + "learning_rate": 4.716551594351814e-05, + "loss": 1.2403, + "step": 1179 + }, + { + "epoch": 2.492608236536431, + "grad_norm": 0.1947159254533651, + "learning_rate": 4.710741089714645e-05, + "loss": 1.2471, + "step": 1180 + }, + { + "epoch": 2.4947201689545935, + "grad_norm": 0.20892297887661074, + "learning_rate": 4.704929036018888e-05, + "loss": 1.2631, + "step": 1181 + }, + { + "epoch": 2.496832101372756, + "grad_norm": 0.24454079463563133, + "learning_rate": 4.699115445931903e-05, + "loss": 1.2502, + "step": 1182 + }, + { + "epoch": 2.498944033790919, + "grad_norm": 0.2412350094542188, + "learning_rate": 4.693300332124398e-05, + "loss": 1.2496, + "step": 1183 + }, + { + "epoch": 2.501055966209081, + "grad_norm": 0.19531214837042782, + "learning_rate": 4.687483707270399e-05, + "loss": 1.2456, + "step": 1184 + }, + { + "epoch": 2.503167898627244, + "grad_norm": 0.23690246526636335, + "learning_rate": 4.6816655840472276e-05, + "loss": 1.2559, + "step": 1185 + }, + { + "epoch": 2.5052798310454065, + "grad_norm": 0.2348503319124506, + "learning_rate": 4.6758459751354685e-05, + "loss": 1.23, + "step": 1186 + }, + { + "epoch": 2.507391763463569, + "grad_norm": 0.18633064626779913, + "learning_rate": 4.670024893218946e-05, + "loss": 1.2536, + "step": 1187 + }, + { + "epoch": 2.509503695881732, + "grad_norm": 0.18702479810071668, + "learning_rate": 4.664202350984696e-05, + "loss": 1.2487, + "step": 1188 + }, + { + "epoch": 2.5116156282998943, + "grad_norm": 0.2141621450120758, + "learning_rate": 4.658378361122936e-05, + "loss": 1.2526, + "step": 1189 + }, + { + "epoch": 2.513727560718057, + "grad_norm": 0.20621119068491564, + "learning_rate": 4.652552936327039e-05, + "loss": 1.2569, + "step": 1190 + }, + { + "epoch": 2.5158394931362196, + "grad_norm": 0.17011648450290637, + "learning_rate": 4.646726089293503e-05, + "loss": 1.2451, + "step": 1191 + }, + { + "epoch": 2.517951425554382, + "grad_norm": 0.18242623499328212, + "learning_rate": 4.640897832721929e-05, + "loss": 1.2567, + "step": 1192 + }, + { + "epoch": 2.520063357972545, + "grad_norm": 0.20534422161506397, + "learning_rate": 4.635068179314989e-05, + "loss": 1.2574, + "step": 1193 + }, + { + "epoch": 2.5221752903907078, + "grad_norm": 0.1807366290905649, + "learning_rate": 4.629237141778402e-05, + "loss": 1.2568, + "step": 1194 + }, + { + "epoch": 2.52428722280887, + "grad_norm": 0.18204973783160128, + "learning_rate": 4.623404732820896e-05, + "loss": 1.2515, + "step": 1195 + }, + { + "epoch": 2.5263991552270326, + "grad_norm": 0.19596390491925145, + "learning_rate": 4.6175709651541955e-05, + "loss": 1.2571, + "step": 1196 + }, + { + "epoch": 2.5285110876451955, + "grad_norm": 0.2859496965557565, + "learning_rate": 4.6117358514929847e-05, + "loss": 1.2475, + "step": 1197 + }, + { + "epoch": 2.530623020063358, + "grad_norm": 0.21294331908755035, + "learning_rate": 4.605899404554878e-05, + "loss": 1.2576, + "step": 1198 + }, + { + "epoch": 2.532734952481521, + "grad_norm": 0.14949005294162412, + "learning_rate": 4.600061637060401e-05, + "loss": 1.2591, + "step": 1199 + }, + { + "epoch": 2.5348468848996832, + "grad_norm": 0.2784421451785545, + "learning_rate": 4.594222561732954e-05, + "loss": 1.2633, + "step": 1200 + }, + { + "epoch": 2.5369588173178457, + "grad_norm": 0.26071979017010494, + "learning_rate": 4.588382191298787e-05, + "loss": 1.2559, + "step": 1201 + }, + { + "epoch": 2.5390707497360085, + "grad_norm": 0.16518705063613232, + "learning_rate": 4.582540538486976e-05, + "loss": 1.2527, + "step": 1202 + }, + { + "epoch": 2.541182682154171, + "grad_norm": 0.2587116689597209, + "learning_rate": 4.5766976160293875e-05, + "loss": 1.2415, + "step": 1203 + }, + { + "epoch": 2.543294614572334, + "grad_norm": 0.21217321717169924, + "learning_rate": 4.57085343666066e-05, + "loss": 1.2604, + "step": 1204 + }, + { + "epoch": 2.5454065469904963, + "grad_norm": 0.22556765116743738, + "learning_rate": 4.565008013118168e-05, + "loss": 1.2516, + "step": 1205 + }, + { + "epoch": 2.5475184794086587, + "grad_norm": 0.20396892262424146, + "learning_rate": 4.5591613581419984e-05, + "loss": 1.2616, + "step": 1206 + }, + { + "epoch": 2.5496304118268216, + "grad_norm": 0.20439036509971908, + "learning_rate": 4.553313484474924e-05, + "loss": 1.2441, + "step": 1207 + }, + { + "epoch": 2.551742344244984, + "grad_norm": 0.19944849483917157, + "learning_rate": 4.54746440486237e-05, + "loss": 1.2559, + "step": 1208 + }, + { + "epoch": 2.553854276663147, + "grad_norm": 0.20160184200084566, + "learning_rate": 4.5416141320523934e-05, + "loss": 1.2485, + "step": 1209 + }, + { + "epoch": 2.5559662090813093, + "grad_norm": 0.2608232686358766, + "learning_rate": 4.535762678795651e-05, + "loss": 1.2468, + "step": 1210 + }, + { + "epoch": 2.5580781414994718, + "grad_norm": 0.47810581406282004, + "learning_rate": 4.529910057845371e-05, + "loss": 1.2464, + "step": 1211 + }, + { + "epoch": 2.5601900739176346, + "grad_norm": 0.20668135058392803, + "learning_rate": 4.524056281957327e-05, + "loss": 1.2393, + "step": 1212 + }, + { + "epoch": 2.5623020063357975, + "grad_norm": 0.24900039108108693, + "learning_rate": 4.5182013638898105e-05, + "loss": 1.2369, + "step": 1213 + }, + { + "epoch": 2.56441393875396, + "grad_norm": 0.2198803876275304, + "learning_rate": 4.512345316403602e-05, + "loss": 1.2457, + "step": 1214 + }, + { + "epoch": 2.5665258711721224, + "grad_norm": 0.280449292461708, + "learning_rate": 4.506488152261945e-05, + "loss": 1.2351, + "step": 1215 + }, + { + "epoch": 2.5686378035902853, + "grad_norm": 0.2474853558095518, + "learning_rate": 4.500629884230513e-05, + "loss": 1.2526, + "step": 1216 + }, + { + "epoch": 2.5707497360084477, + "grad_norm": 0.20123042210510086, + "learning_rate": 4.494770525077392e-05, + "loss": 1.2498, + "step": 1217 + }, + { + "epoch": 2.5728616684266106, + "grad_norm": 0.21012697005024505, + "learning_rate": 4.4889100875730366e-05, + "loss": 1.2487, + "step": 1218 + }, + { + "epoch": 2.574973600844773, + "grad_norm": 0.2090951719563702, + "learning_rate": 4.4830485844902594e-05, + "loss": 1.2558, + "step": 1219 + }, + { + "epoch": 2.5770855332629354, + "grad_norm": 0.20795655025944507, + "learning_rate": 4.477186028604194e-05, + "loss": 1.261, + "step": 1220 + }, + { + "epoch": 2.5791974656810983, + "grad_norm": 0.18092602642325412, + "learning_rate": 4.471322432692266e-05, + "loss": 1.2499, + "step": 1221 + }, + { + "epoch": 2.5813093980992607, + "grad_norm": 0.22238180672192892, + "learning_rate": 4.465457809534171e-05, + "loss": 1.2517, + "step": 1222 + }, + { + "epoch": 2.5834213305174236, + "grad_norm": 0.19326697310054206, + "learning_rate": 4.4595921719118404e-05, + "loss": 1.2555, + "step": 1223 + }, + { + "epoch": 2.585533262935586, + "grad_norm": 0.21809634718781648, + "learning_rate": 4.453725532609419e-05, + "loss": 1.2429, + "step": 1224 + }, + { + "epoch": 2.5876451953537485, + "grad_norm": 0.16764358938375948, + "learning_rate": 4.4478579044132314e-05, + "loss": 1.2536, + "step": 1225 + }, + { + "epoch": 2.5897571277719114, + "grad_norm": 0.1937868027936208, + "learning_rate": 4.4419893001117635e-05, + "loss": 1.2416, + "step": 1226 + }, + { + "epoch": 2.591869060190074, + "grad_norm": 0.17965337611682097, + "learning_rate": 4.4361197324956225e-05, + "loss": 1.2423, + "step": 1227 + }, + { + "epoch": 2.5939809926082367, + "grad_norm": 0.15239294044626062, + "learning_rate": 4.4302492143575184e-05, + "loss": 1.2416, + "step": 1228 + }, + { + "epoch": 2.596092925026399, + "grad_norm": 0.18012343658818428, + "learning_rate": 4.424377758492233e-05, + "loss": 1.2611, + "step": 1229 + }, + { + "epoch": 2.5982048574445615, + "grad_norm": 0.15264160197210302, + "learning_rate": 4.418505377696589e-05, + "loss": 1.2532, + "step": 1230 + }, + { + "epoch": 2.6003167898627244, + "grad_norm": 0.18531454098508315, + "learning_rate": 4.412632084769428e-05, + "loss": 1.2411, + "step": 1231 + }, + { + "epoch": 2.6024287222808873, + "grad_norm": 0.2259737675154307, + "learning_rate": 4.4067578925115796e-05, + "loss": 1.2415, + "step": 1232 + }, + { + "epoch": 2.6045406546990497, + "grad_norm": 0.2372855432549408, + "learning_rate": 4.40088281372583e-05, + "loss": 1.2612, + "step": 1233 + }, + { + "epoch": 2.606652587117212, + "grad_norm": 0.21932256426407334, + "learning_rate": 4.395006861216903e-05, + "loss": 1.2603, + "step": 1234 + }, + { + "epoch": 2.608764519535375, + "grad_norm": 0.2150312009730113, + "learning_rate": 4.38913004779142e-05, + "loss": 1.2429, + "step": 1235 + }, + { + "epoch": 2.6108764519535375, + "grad_norm": 0.20780701081527142, + "learning_rate": 4.383252386257886e-05, + "loss": 1.2468, + "step": 1236 + }, + { + "epoch": 2.6129883843717003, + "grad_norm": 0.20152124461237875, + "learning_rate": 4.3773738894266494e-05, + "loss": 1.2545, + "step": 1237 + }, + { + "epoch": 2.6151003167898628, + "grad_norm": 0.40623226207867474, + "learning_rate": 4.3714945701098807e-05, + "loss": 1.2509, + "step": 1238 + }, + { + "epoch": 2.617212249208025, + "grad_norm": 0.18350085019395168, + "learning_rate": 4.365614441121544e-05, + "loss": 1.2297, + "step": 1239 + }, + { + "epoch": 2.619324181626188, + "grad_norm": 0.18907090152930897, + "learning_rate": 4.359733515277365e-05, + "loss": 1.2726, + "step": 1240 + }, + { + "epoch": 2.6214361140443505, + "grad_norm": 0.16715509385331273, + "learning_rate": 4.353851805394809e-05, + "loss": 1.2374, + "step": 1241 + }, + { + "epoch": 2.6235480464625134, + "grad_norm": 0.1505857396212693, + "learning_rate": 4.34796932429305e-05, + "loss": 1.2495, + "step": 1242 + }, + { + "epoch": 2.625659978880676, + "grad_norm": 0.3008134801215598, + "learning_rate": 4.342086084792941e-05, + "loss": 1.2411, + "step": 1243 + }, + { + "epoch": 2.6277719112988382, + "grad_norm": 0.22493309617702645, + "learning_rate": 4.336202099716991e-05, + "loss": 1.25, + "step": 1244 + }, + { + "epoch": 2.629883843717001, + "grad_norm": 0.1844772424959283, + "learning_rate": 4.33031738188933e-05, + "loss": 1.2604, + "step": 1245 + }, + { + "epoch": 2.6319957761351636, + "grad_norm": 0.16266968413017036, + "learning_rate": 4.324431944135688e-05, + "loss": 1.2578, + "step": 1246 + }, + { + "epoch": 2.6341077085533264, + "grad_norm": 0.1640984846561993, + "learning_rate": 4.318545799283363e-05, + "loss": 1.2446, + "step": 1247 + }, + { + "epoch": 2.636219640971489, + "grad_norm": 0.2656789116543793, + "learning_rate": 4.3126589601611945e-05, + "loss": 1.2402, + "step": 1248 + }, + { + "epoch": 2.6383315733896513, + "grad_norm": 0.18100012004558239, + "learning_rate": 4.306771439599535e-05, + "loss": 1.2458, + "step": 1249 + }, + { + "epoch": 2.640443505807814, + "grad_norm": 0.15130356633472192, + "learning_rate": 4.3008832504302215e-05, + "loss": 1.2608, + "step": 1250 + }, + { + "epoch": 2.6425554382259766, + "grad_norm": 0.1641863653154754, + "learning_rate": 4.2949944054865496e-05, + "loss": 1.2449, + "step": 1251 + }, + { + "epoch": 2.6446673706441395, + "grad_norm": 0.15798148112806584, + "learning_rate": 4.289104917603243e-05, + "loss": 1.2438, + "step": 1252 + }, + { + "epoch": 2.646779303062302, + "grad_norm": 0.14758663322901172, + "learning_rate": 4.2832147996164287e-05, + "loss": 1.234, + "step": 1253 + }, + { + "epoch": 2.648891235480465, + "grad_norm": 0.14529639893492569, + "learning_rate": 4.277324064363603e-05, + "loss": 1.2595, + "step": 1254 + }, + { + "epoch": 2.651003167898627, + "grad_norm": 0.18584647318260675, + "learning_rate": 4.27143272468361e-05, + "loss": 1.2466, + "step": 1255 + }, + { + "epoch": 2.65311510031679, + "grad_norm": 0.22283672931351609, + "learning_rate": 4.2655407934166126e-05, + "loss": 1.2398, + "step": 1256 + }, + { + "epoch": 2.6552270327349525, + "grad_norm": 0.13681138643568688, + "learning_rate": 4.2596482834040616e-05, + "loss": 1.2433, + "step": 1257 + }, + { + "epoch": 2.657338965153115, + "grad_norm": 0.1487383582947486, + "learning_rate": 4.2537552074886684e-05, + "loss": 1.2484, + "step": 1258 + }, + { + "epoch": 2.659450897571278, + "grad_norm": 0.16768296194209623, + "learning_rate": 4.247861578514379e-05, + "loss": 1.2685, + "step": 1259 + }, + { + "epoch": 2.6615628299894403, + "grad_norm": 0.18273027955717852, + "learning_rate": 4.2419674093263435e-05, + "loss": 1.2393, + "step": 1260 + }, + { + "epoch": 2.663674762407603, + "grad_norm": 0.15778775093750105, + "learning_rate": 4.2360727127708916e-05, + "loss": 1.2324, + "step": 1261 + }, + { + "epoch": 2.6657866948257656, + "grad_norm": 0.14206434869133824, + "learning_rate": 4.2301775016955e-05, + "loss": 1.2388, + "step": 1262 + }, + { + "epoch": 2.667898627243928, + "grad_norm": 0.162289067636816, + "learning_rate": 4.2242817889487676e-05, + "loss": 1.2319, + "step": 1263 + }, + { + "epoch": 2.670010559662091, + "grad_norm": 0.1527094707727967, + "learning_rate": 4.2183855873803876e-05, + "loss": 1.2495, + "step": 1264 + }, + { + "epoch": 2.6721224920802533, + "grad_norm": 0.1386426220742963, + "learning_rate": 4.212488909841118e-05, + "loss": 1.2536, + "step": 1265 + }, + { + "epoch": 2.674234424498416, + "grad_norm": 0.15414473715745036, + "learning_rate": 4.206591769182753e-05, + "loss": 1.2467, + "step": 1266 + }, + { + "epoch": 2.6763463569165786, + "grad_norm": 0.16090949238373514, + "learning_rate": 4.200694178258097e-05, + "loss": 1.2538, + "step": 1267 + }, + { + "epoch": 2.678458289334741, + "grad_norm": 0.1750064511080838, + "learning_rate": 4.194796149920938e-05, + "loss": 1.223, + "step": 1268 + }, + { + "epoch": 2.680570221752904, + "grad_norm": 0.14858243014474026, + "learning_rate": 4.188897697026014e-05, + "loss": 1.2388, + "step": 1269 + }, + { + "epoch": 2.6826821541710664, + "grad_norm": 0.136229170085759, + "learning_rate": 4.1829988324289896e-05, + "loss": 1.2455, + "step": 1270 + }, + { + "epoch": 2.6847940865892292, + "grad_norm": 0.1429146861635407, + "learning_rate": 4.177099568986427e-05, + "loss": 1.2592, + "step": 1271 + }, + { + "epoch": 2.6869060190073917, + "grad_norm": 0.12719689771729453, + "learning_rate": 4.1711999195557567e-05, + "loss": 1.2532, + "step": 1272 + }, + { + "epoch": 2.689017951425554, + "grad_norm": 0.14474300606459736, + "learning_rate": 4.165299896995253e-05, + "loss": 1.2666, + "step": 1273 + }, + { + "epoch": 2.691129883843717, + "grad_norm": 0.16305536173245808, + "learning_rate": 4.1593995141640004e-05, + "loss": 1.2413, + "step": 1274 + }, + { + "epoch": 2.69324181626188, + "grad_norm": 0.1607097812387902, + "learning_rate": 4.153498783921869e-05, + "loss": 1.2417, + "step": 1275 + }, + { + "epoch": 2.6953537486800423, + "grad_norm": 0.18635822745956204, + "learning_rate": 4.14759771912949e-05, + "loss": 1.2312, + "step": 1276 + }, + { + "epoch": 2.6974656810982047, + "grad_norm": 0.18877404228136124, + "learning_rate": 4.141696332648217e-05, + "loss": 1.2338, + "step": 1277 + }, + { + "epoch": 2.6995776135163676, + "grad_norm": 0.17456937434970463, + "learning_rate": 4.135794637340109e-05, + "loss": 1.238, + "step": 1278 + }, + { + "epoch": 2.70168954593453, + "grad_norm": 0.19285857830448844, + "learning_rate": 4.129892646067899e-05, + "loss": 1.2463, + "step": 1279 + }, + { + "epoch": 2.703801478352693, + "grad_norm": 0.16574779297029807, + "learning_rate": 4.123990371694963e-05, + "loss": 1.261, + "step": 1280 + }, + { + "epoch": 2.7059134107708553, + "grad_norm": 0.15925602968282257, + "learning_rate": 4.118087827085295e-05, + "loss": 1.2444, + "step": 1281 + }, + { + "epoch": 2.7080253431890178, + "grad_norm": 0.41627664119455543, + "learning_rate": 4.112185025103476e-05, + "loss": 1.2877, + "step": 1282 + }, + { + "epoch": 2.7101372756071807, + "grad_norm": 0.21112920459548892, + "learning_rate": 4.106281978614651e-05, + "loss": 1.2409, + "step": 1283 + }, + { + "epoch": 2.712249208025343, + "grad_norm": 0.13803877039144863, + "learning_rate": 4.100378700484497e-05, + "loss": 1.2417, + "step": 1284 + }, + { + "epoch": 2.714361140443506, + "grad_norm": 0.20826933330179756, + "learning_rate": 4.094475203579192e-05, + "loss": 1.2549, + "step": 1285 + }, + { + "epoch": 2.7164730728616684, + "grad_norm": 0.2001236861552444, + "learning_rate": 4.0885715007653974e-05, + "loss": 1.2555, + "step": 1286 + }, + { + "epoch": 2.718585005279831, + "grad_norm": 0.17628778629110142, + "learning_rate": 4.082667604910218e-05, + "loss": 1.2688, + "step": 1287 + }, + { + "epoch": 2.7206969376979937, + "grad_norm": 0.3004202354027872, + "learning_rate": 4.0767635288811816e-05, + "loss": 1.2732, + "step": 1288 + }, + { + "epoch": 2.722808870116156, + "grad_norm": 0.18557583916890574, + "learning_rate": 4.0708592855462094e-05, + "loss": 1.2656, + "step": 1289 + }, + { + "epoch": 2.724920802534319, + "grad_norm": 0.18458833833181457, + "learning_rate": 4.0649548877735875e-05, + "loss": 1.2484, + "step": 1290 + }, + { + "epoch": 2.7270327349524814, + "grad_norm": 0.2009979597703134, + "learning_rate": 4.059050348431934e-05, + "loss": 1.2373, + "step": 1291 + }, + { + "epoch": 2.729144667370644, + "grad_norm": 0.14584546827413772, + "learning_rate": 4.053145680390181e-05, + "loss": 1.2404, + "step": 1292 + }, + { + "epoch": 2.7312565997888067, + "grad_norm": 0.19440629141488908, + "learning_rate": 4.047240896517539e-05, + "loss": 1.2473, + "step": 1293 + }, + { + "epoch": 2.7333685322069696, + "grad_norm": 0.1631058588822202, + "learning_rate": 4.0413360096834696e-05, + "loss": 1.2551, + "step": 1294 + }, + { + "epoch": 2.735480464625132, + "grad_norm": 0.1706377854170495, + "learning_rate": 4.035431032757662e-05, + "loss": 1.2511, + "step": 1295 + }, + { + "epoch": 2.7375923970432945, + "grad_norm": 0.1676702534932309, + "learning_rate": 4.0295259786099994e-05, + "loss": 1.2531, + "step": 1296 + }, + { + "epoch": 2.7397043294614574, + "grad_norm": 0.15344973842004275, + "learning_rate": 4.0236208601105335e-05, + "loss": 1.2576, + "step": 1297 + }, + { + "epoch": 2.74181626187962, + "grad_norm": 0.16008971424742968, + "learning_rate": 4.017715690129458e-05, + "loss": 1.2465, + "step": 1298 + }, + { + "epoch": 2.7439281942977827, + "grad_norm": 0.15435086305404416, + "learning_rate": 4.011810481537074e-05, + "loss": 1.2399, + "step": 1299 + }, + { + "epoch": 2.746040126715945, + "grad_norm": 0.16832554202402972, + "learning_rate": 4.005905247203774e-05, + "loss": 1.2419, + "step": 1300 + }, + { + "epoch": 2.7481520591341075, + "grad_norm": 0.17390901223896177, + "learning_rate": 4e-05, + "loss": 1.231, + "step": 1301 + }, + { + "epoch": 2.7502639915522704, + "grad_norm": 0.18325280849926004, + "learning_rate": 3.994094752796227e-05, + "loss": 1.2388, + "step": 1302 + }, + { + "epoch": 2.752375923970433, + "grad_norm": 0.24402082963579863, + "learning_rate": 3.988189518462927e-05, + "loss": 1.2353, + "step": 1303 + }, + { + "epoch": 2.7544878563885957, + "grad_norm": 0.22169399361700795, + "learning_rate": 3.982284309870543e-05, + "loss": 1.25, + "step": 1304 + }, + { + "epoch": 2.756599788806758, + "grad_norm": 0.46443318629283575, + "learning_rate": 3.976379139889468e-05, + "loss": 1.2426, + "step": 1305 + }, + { + "epoch": 2.7587117212249206, + "grad_norm": 0.2424981789951497, + "learning_rate": 3.970474021390002e-05, + "loss": 1.2538, + "step": 1306 + }, + { + "epoch": 2.7608236536430835, + "grad_norm": 0.19460603807747523, + "learning_rate": 3.964568967242338e-05, + "loss": 1.2322, + "step": 1307 + }, + { + "epoch": 2.762935586061246, + "grad_norm": 0.16263868453814642, + "learning_rate": 3.958663990316532e-05, + "loss": 1.2285, + "step": 1308 + }, + { + "epoch": 2.7650475184794088, + "grad_norm": 0.21579779199600754, + "learning_rate": 3.952759103482462e-05, + "loss": 1.241, + "step": 1309 + }, + { + "epoch": 2.767159450897571, + "grad_norm": 0.1920605003032425, + "learning_rate": 3.946854319609821e-05, + "loss": 1.2574, + "step": 1310 + }, + { + "epoch": 2.7692713833157336, + "grad_norm": 0.17671356917848932, + "learning_rate": 3.940949651568067e-05, + "loss": 1.247, + "step": 1311 + }, + { + "epoch": 2.7713833157338965, + "grad_norm": 0.1631620576422733, + "learning_rate": 3.935045112226414e-05, + "loss": 1.2359, + "step": 1312 + }, + { + "epoch": 2.7734952481520594, + "grad_norm": 0.16738647794875786, + "learning_rate": 3.929140714453791e-05, + "loss": 1.2406, + "step": 1313 + }, + { + "epoch": 2.775607180570222, + "grad_norm": 0.15463685319555867, + "learning_rate": 3.9232364711188184e-05, + "loss": 1.2578, + "step": 1314 + }, + { + "epoch": 2.7777191129883843, + "grad_norm": 0.16279139484248137, + "learning_rate": 3.917332395089784e-05, + "loss": 1.2505, + "step": 1315 + }, + { + "epoch": 2.779831045406547, + "grad_norm": 0.16086974064401302, + "learning_rate": 3.911428499234604e-05, + "loss": 1.2207, + "step": 1316 + }, + { + "epoch": 2.7819429778247096, + "grad_norm": 0.16524482867425364, + "learning_rate": 3.9055247964208084e-05, + "loss": 1.2473, + "step": 1317 + }, + { + "epoch": 2.7840549102428724, + "grad_norm": 0.14660324001566058, + "learning_rate": 3.8996212995155046e-05, + "loss": 1.257, + "step": 1318 + }, + { + "epoch": 2.786166842661035, + "grad_norm": 0.1394125946226826, + "learning_rate": 3.8937180213853494e-05, + "loss": 1.2385, + "step": 1319 + }, + { + "epoch": 2.7882787750791973, + "grad_norm": 0.16306277455232898, + "learning_rate": 3.8878149748965245e-05, + "loss": 1.2485, + "step": 1320 + }, + { + "epoch": 2.79039070749736, + "grad_norm": 0.18380780937749647, + "learning_rate": 3.881912172914706e-05, + "loss": 1.2432, + "step": 1321 + }, + { + "epoch": 2.7925026399155226, + "grad_norm": 0.2315175474346091, + "learning_rate": 3.876009628305039e-05, + "loss": 1.2444, + "step": 1322 + }, + { + "epoch": 2.7946145723336855, + "grad_norm": 0.17077499124948442, + "learning_rate": 3.870107353932102e-05, + "loss": 1.2357, + "step": 1323 + }, + { + "epoch": 2.796726504751848, + "grad_norm": 0.1915223457645163, + "learning_rate": 3.8642053626598917e-05, + "loss": 1.2512, + "step": 1324 + }, + { + "epoch": 2.7988384371700104, + "grad_norm": 0.17896032452380545, + "learning_rate": 3.858303667351785e-05, + "loss": 1.2513, + "step": 1325 + }, + { + "epoch": 2.8009503695881732, + "grad_norm": 0.47069662590133143, + "learning_rate": 3.852402280870511e-05, + "loss": 1.2451, + "step": 1326 + }, + { + "epoch": 2.8030623020063357, + "grad_norm": 0.24993446541541084, + "learning_rate": 3.846501216078132e-05, + "loss": 1.2497, + "step": 1327 + }, + { + "epoch": 2.8051742344244985, + "grad_norm": 0.18721990084009657, + "learning_rate": 3.840600485836001e-05, + "loss": 1.2654, + "step": 1328 + }, + { + "epoch": 2.807286166842661, + "grad_norm": 0.17806668754833868, + "learning_rate": 3.834700103004747e-05, + "loss": 1.2537, + "step": 1329 + }, + { + "epoch": 2.8093980992608234, + "grad_norm": 0.16937617866710103, + "learning_rate": 3.828800080444244e-05, + "loss": 1.2493, + "step": 1330 + }, + { + "epoch": 2.8115100316789863, + "grad_norm": 0.17165198046379243, + "learning_rate": 3.822900431013574e-05, + "loss": 1.243, + "step": 1331 + }, + { + "epoch": 2.813621964097149, + "grad_norm": 0.16251526824128915, + "learning_rate": 3.817001167571012e-05, + "loss": 1.2561, + "step": 1332 + }, + { + "epoch": 2.8157338965153116, + "grad_norm": 0.17440112625855608, + "learning_rate": 3.811102302973987e-05, + "loss": 1.2582, + "step": 1333 + }, + { + "epoch": 2.817845828933474, + "grad_norm": 0.16622174029432177, + "learning_rate": 3.8052038500790623e-05, + "loss": 1.2668, + "step": 1334 + }, + { + "epoch": 2.819957761351637, + "grad_norm": 0.20264748690177628, + "learning_rate": 3.7993058217419045e-05, + "loss": 1.2611, + "step": 1335 + }, + { + "epoch": 2.8220696937697993, + "grad_norm": 0.1968884953611614, + "learning_rate": 3.7934082308172484e-05, + "loss": 1.2436, + "step": 1336 + }, + { + "epoch": 2.824181626187962, + "grad_norm": 0.17246120349253796, + "learning_rate": 3.787511090158884e-05, + "loss": 1.2389, + "step": 1337 + }, + { + "epoch": 2.8262935586061246, + "grad_norm": 0.17496918559975405, + "learning_rate": 3.781614412619614e-05, + "loss": 1.2542, + "step": 1338 + }, + { + "epoch": 2.828405491024287, + "grad_norm": 0.180261595058359, + "learning_rate": 3.775718211051233e-05, + "loss": 1.233, + "step": 1339 + }, + { + "epoch": 2.83051742344245, + "grad_norm": 0.15783575956081317, + "learning_rate": 3.769822498304501e-05, + "loss": 1.2539, + "step": 1340 + }, + { + "epoch": 2.8326293558606124, + "grad_norm": 0.15980325392061756, + "learning_rate": 3.76392728722911e-05, + "loss": 1.2723, + "step": 1341 + }, + { + "epoch": 2.8347412882787753, + "grad_norm": 0.1671378733562258, + "learning_rate": 3.758032590673657e-05, + "loss": 1.2488, + "step": 1342 + }, + { + "epoch": 2.8368532206969377, + "grad_norm": 0.1377817800465948, + "learning_rate": 3.752138421485622e-05, + "loss": 1.2593, + "step": 1343 + }, + { + "epoch": 2.8389651531151, + "grad_norm": 0.15286973059079462, + "learning_rate": 3.7462447925113316e-05, + "loss": 1.2547, + "step": 1344 + }, + { + "epoch": 2.841077085533263, + "grad_norm": 0.17201017270878505, + "learning_rate": 3.740351716595939e-05, + "loss": 1.2546, + "step": 1345 + }, + { + "epoch": 2.8431890179514254, + "grad_norm": 0.17283668262630822, + "learning_rate": 3.7344592065833874e-05, + "loss": 1.2442, + "step": 1346 + }, + { + "epoch": 2.8453009503695883, + "grad_norm": 0.17485012699989297, + "learning_rate": 3.7285672753163914e-05, + "loss": 1.2591, + "step": 1347 + }, + { + "epoch": 2.8474128827877507, + "grad_norm": 0.1747239606256227, + "learning_rate": 3.722675935636399e-05, + "loss": 1.2562, + "step": 1348 + }, + { + "epoch": 2.849524815205913, + "grad_norm": 0.15261145932221679, + "learning_rate": 3.716785200383573e-05, + "loss": 1.2293, + "step": 1349 + }, + { + "epoch": 2.851636747624076, + "grad_norm": 0.5949112434774901, + "learning_rate": 3.710895082396758e-05, + "loss": 1.2812, + "step": 1350 + }, + { + "epoch": 2.853748680042239, + "grad_norm": 0.17117764014307688, + "learning_rate": 3.705005594513451e-05, + "loss": 1.2415, + "step": 1351 + }, + { + "epoch": 2.8558606124604013, + "grad_norm": 0.16708323332235744, + "learning_rate": 3.69911674956978e-05, + "loss": 1.2476, + "step": 1352 + }, + { + "epoch": 2.857972544878564, + "grad_norm": 0.19328096044732057, + "learning_rate": 3.6932285604004664e-05, + "loss": 1.2522, + "step": 1353 + }, + { + "epoch": 2.8600844772967267, + "grad_norm": 0.1913717836972792, + "learning_rate": 3.6873410398388075e-05, + "loss": 1.2502, + "step": 1354 + }, + { + "epoch": 2.862196409714889, + "grad_norm": 0.17725717935287896, + "learning_rate": 3.681454200716638e-05, + "loss": 1.2382, + "step": 1355 + }, + { + "epoch": 2.864308342133052, + "grad_norm": 0.18380103339431025, + "learning_rate": 3.6755680558643135e-05, + "loss": 1.2593, + "step": 1356 + }, + { + "epoch": 2.8664202745512144, + "grad_norm": 0.17545966015171377, + "learning_rate": 3.669682618110671e-05, + "loss": 1.2482, + "step": 1357 + }, + { + "epoch": 2.868532206969377, + "grad_norm": 0.2280199769637377, + "learning_rate": 3.6637979002830106e-05, + "loss": 1.2318, + "step": 1358 + }, + { + "epoch": 2.8706441393875397, + "grad_norm": 0.17900090283903328, + "learning_rate": 3.657913915207061e-05, + "loss": 1.2541, + "step": 1359 + }, + { + "epoch": 2.872756071805702, + "grad_norm": 0.19200211234308043, + "learning_rate": 3.652030675706952e-05, + "loss": 1.249, + "step": 1360 + }, + { + "epoch": 2.874868004223865, + "grad_norm": 0.24518131607988217, + "learning_rate": 3.6461481946051917e-05, + "loss": 1.2582, + "step": 1361 + }, + { + "epoch": 2.8769799366420274, + "grad_norm": 0.16970705203420663, + "learning_rate": 3.640266484722637e-05, + "loss": 1.2467, + "step": 1362 + }, + { + "epoch": 2.87909186906019, + "grad_norm": 0.16236768684798242, + "learning_rate": 3.634385558878457e-05, + "loss": 1.2605, + "step": 1363 + }, + { + "epoch": 2.8812038014783528, + "grad_norm": 0.1649360416198374, + "learning_rate": 3.62850542989012e-05, + "loss": 1.256, + "step": 1364 + }, + { + "epoch": 2.883315733896515, + "grad_norm": 0.16421960745621672, + "learning_rate": 3.622626110573351e-05, + "loss": 1.2494, + "step": 1365 + }, + { + "epoch": 2.885427666314678, + "grad_norm": 0.15823773231040744, + "learning_rate": 3.616747613742114e-05, + "loss": 1.2467, + "step": 1366 + }, + { + "epoch": 2.8875395987328405, + "grad_norm": 0.14750199515460344, + "learning_rate": 3.6108699522085805e-05, + "loss": 1.2541, + "step": 1367 + }, + { + "epoch": 2.889651531151003, + "grad_norm": 0.14704843764395795, + "learning_rate": 3.604993138783098e-05, + "loss": 1.257, + "step": 1368 + }, + { + "epoch": 2.891763463569166, + "grad_norm": 0.1517189977333485, + "learning_rate": 3.599117186274172e-05, + "loss": 1.2416, + "step": 1369 + }, + { + "epoch": 2.8938753959873287, + "grad_norm": 0.15081652267373385, + "learning_rate": 3.593242107488422e-05, + "loss": 1.2534, + "step": 1370 + }, + { + "epoch": 2.895987328405491, + "grad_norm": 0.1658126870322483, + "learning_rate": 3.5873679152305724e-05, + "loss": 1.2599, + "step": 1371 + }, + { + "epoch": 2.8980992608236535, + "grad_norm": 0.16652331528840855, + "learning_rate": 3.581494622303413e-05, + "loss": 1.233, + "step": 1372 + }, + { + "epoch": 2.9002111932418164, + "grad_norm": 0.15733317782342027, + "learning_rate": 3.575622241507768e-05, + "loss": 1.2381, + "step": 1373 + }, + { + "epoch": 2.902323125659979, + "grad_norm": 0.1649705149757225, + "learning_rate": 3.569750785642483e-05, + "loss": 1.2418, + "step": 1374 + }, + { + "epoch": 2.9044350580781417, + "grad_norm": 0.19129224035433043, + "learning_rate": 3.563880267504378e-05, + "loss": 1.2658, + "step": 1375 + }, + { + "epoch": 2.906546990496304, + "grad_norm": 0.15171568929039042, + "learning_rate": 3.558010699888237e-05, + "loss": 1.2413, + "step": 1376 + }, + { + "epoch": 2.9086589229144666, + "grad_norm": 0.3944188088795608, + "learning_rate": 3.552142095586769e-05, + "loss": 1.2744, + "step": 1377 + }, + { + "epoch": 2.9107708553326295, + "grad_norm": 0.19484617059270434, + "learning_rate": 3.546274467390583e-05, + "loss": 1.2577, + "step": 1378 + }, + { + "epoch": 2.912882787750792, + "grad_norm": 0.1653843471402946, + "learning_rate": 3.540407828088161e-05, + "loss": 1.2481, + "step": 1379 + }, + { + "epoch": 2.914994720168955, + "grad_norm": 0.17652932532096136, + "learning_rate": 3.53454219046583e-05, + "loss": 1.2643, + "step": 1380 + }, + { + "epoch": 2.917106652587117, + "grad_norm": 0.18966487943431148, + "learning_rate": 3.528677567307734e-05, + "loss": 1.2362, + "step": 1381 + }, + { + "epoch": 2.9192185850052796, + "grad_norm": 0.316650469822133, + "learning_rate": 3.522813971395808e-05, + "loss": 1.2564, + "step": 1382 + }, + { + "epoch": 2.9213305174234425, + "grad_norm": 0.19617393566337205, + "learning_rate": 3.5169514155097405e-05, + "loss": 1.2514, + "step": 1383 + }, + { + "epoch": 2.923442449841605, + "grad_norm": 0.19480226497186165, + "learning_rate": 3.5110899124269654e-05, + "loss": 1.257, + "step": 1384 + }, + { + "epoch": 2.925554382259768, + "grad_norm": 0.15620513082799156, + "learning_rate": 3.5052294749226094e-05, + "loss": 1.2421, + "step": 1385 + }, + { + "epoch": 2.9276663146779303, + "grad_norm": 0.15804900382955278, + "learning_rate": 3.4993701157694874e-05, + "loss": 1.2657, + "step": 1386 + }, + { + "epoch": 2.9297782470960927, + "grad_norm": 0.15679777586977608, + "learning_rate": 3.4935118477380565e-05, + "loss": 1.2598, + "step": 1387 + }, + { + "epoch": 2.9318901795142556, + "grad_norm": 0.1896381031281462, + "learning_rate": 3.4876546835963985e-05, + "loss": 1.2643, + "step": 1388 + }, + { + "epoch": 2.934002111932418, + "grad_norm": 0.13647943392484213, + "learning_rate": 3.4817986361101915e-05, + "loss": 1.2419, + "step": 1389 + }, + { + "epoch": 2.936114044350581, + "grad_norm": 0.14997995967250793, + "learning_rate": 3.4759437180426745e-05, + "loss": 1.2457, + "step": 1390 + }, + { + "epoch": 2.9382259767687433, + "grad_norm": 0.1360877019346515, + "learning_rate": 3.470089942154632e-05, + "loss": 1.2459, + "step": 1391 + }, + { + "epoch": 2.940337909186906, + "grad_norm": 0.1431882072720808, + "learning_rate": 3.46423732120435e-05, + "loss": 1.2524, + "step": 1392 + }, + { + "epoch": 2.9424498416050686, + "grad_norm": 0.14814289297380318, + "learning_rate": 3.458385867947607e-05, + "loss": 1.2455, + "step": 1393 + }, + { + "epoch": 2.9445617740232315, + "grad_norm": 0.20887198165807616, + "learning_rate": 3.452535595137631e-05, + "loss": 1.2447, + "step": 1394 + }, + { + "epoch": 2.946673706441394, + "grad_norm": 0.14548008856160743, + "learning_rate": 3.4466865155250776e-05, + "loss": 1.2441, + "step": 1395 + }, + { + "epoch": 2.9487856388595564, + "grad_norm": 0.16970986648577802, + "learning_rate": 3.4408386418580036e-05, + "loss": 1.2471, + "step": 1396 + }, + { + "epoch": 2.9508975712777192, + "grad_norm": 0.19535303688998287, + "learning_rate": 3.4349919868818336e-05, + "loss": 1.2445, + "step": 1397 + }, + { + "epoch": 2.9530095036958817, + "grad_norm": 0.16599324745072103, + "learning_rate": 3.429146563339341e-05, + "loss": 1.2498, + "step": 1398 + }, + { + "epoch": 2.9551214361140445, + "grad_norm": 0.18272052468427682, + "learning_rate": 3.423302383970613e-05, + "loss": 1.2426, + "step": 1399 + }, + { + "epoch": 2.957233368532207, + "grad_norm": 0.1895387870651528, + "learning_rate": 3.417459461513025e-05, + "loss": 1.2602, + "step": 1400 + }, + { + "epoch": 2.9593453009503694, + "grad_norm": 0.15562551211774833, + "learning_rate": 3.411617808701214e-05, + "loss": 1.2455, + "step": 1401 + }, + { + "epoch": 2.9614572333685323, + "grad_norm": 0.18380887667156742, + "learning_rate": 3.405777438267047e-05, + "loss": 1.2323, + "step": 1402 + }, + { + "epoch": 2.9635691657866947, + "grad_norm": 0.16157140772732223, + "learning_rate": 3.3999383629395995e-05, + "loss": 1.2377, + "step": 1403 + }, + { + "epoch": 2.9656810982048576, + "grad_norm": 0.17908652476948922, + "learning_rate": 3.3941005954451226e-05, + "loss": 1.262, + "step": 1404 + }, + { + "epoch": 2.96779303062302, + "grad_norm": 0.14196068174183477, + "learning_rate": 3.388264148507016e-05, + "loss": 1.2503, + "step": 1405 + }, + { + "epoch": 2.9699049630411825, + "grad_norm": 0.29064774195069637, + "learning_rate": 3.3824290348458065e-05, + "loss": 1.2556, + "step": 1406 + }, + { + "epoch": 2.9720168954593453, + "grad_norm": 0.1406063097899163, + "learning_rate": 3.376595267179106e-05, + "loss": 1.247, + "step": 1407 + }, + { + "epoch": 2.9741288278775078, + "grad_norm": 0.18187034657231363, + "learning_rate": 3.3707628582215996e-05, + "loss": 1.2361, + "step": 1408 + }, + { + "epoch": 2.9762407602956706, + "grad_norm": 0.15774826765508418, + "learning_rate": 3.364931820685012e-05, + "loss": 1.2626, + "step": 1409 + }, + { + "epoch": 2.978352692713833, + "grad_norm": 0.1632684092663418, + "learning_rate": 3.3591021672780714e-05, + "loss": 1.2457, + "step": 1410 + }, + { + "epoch": 2.9804646251319955, + "grad_norm": 0.177678261182772, + "learning_rate": 3.353273910706499e-05, + "loss": 1.2472, + "step": 1411 + }, + { + "epoch": 2.9825765575501584, + "grad_norm": 0.1687761341388544, + "learning_rate": 3.347447063672963e-05, + "loss": 1.2398, + "step": 1412 + }, + { + "epoch": 2.9846884899683213, + "grad_norm": 0.1886770863829917, + "learning_rate": 3.341621638877064e-05, + "loss": 1.2397, + "step": 1413 + }, + { + "epoch": 2.9868004223864837, + "grad_norm": 0.17075215206270264, + "learning_rate": 3.3357976490153046e-05, + "loss": 1.2574, + "step": 1414 + }, + { + "epoch": 2.988912354804646, + "grad_norm": 0.14517857752169927, + "learning_rate": 3.329975106781055e-05, + "loss": 1.2456, + "step": 1415 + }, + { + "epoch": 2.991024287222809, + "grad_norm": 0.16779368838229222, + "learning_rate": 3.324154024864533e-05, + "loss": 1.2548, + "step": 1416 + }, + { + "epoch": 2.9931362196409714, + "grad_norm": 0.16447829380446374, + "learning_rate": 3.318334415952774e-05, + "loss": 1.2589, + "step": 1417 + }, + { + "epoch": 2.9952481520591343, + "grad_norm": 0.1832469855257109, + "learning_rate": 3.312516292729602e-05, + "loss": 1.2371, + "step": 1418 + }, + { + "epoch": 2.9973600844772967, + "grad_norm": 0.17937932911146878, + "learning_rate": 3.3066996678756025e-05, + "loss": 1.2547, + "step": 1419 + }, + { + "epoch": 2.999472016895459, + "grad_norm": 0.19393395423294793, + "learning_rate": 3.300884554068097e-05, + "loss": 1.2436, + "step": 1420 + }, + { + "epoch": 3.0018479408658925, + "grad_norm": 0.17582740320571208, + "learning_rate": 3.2950709639811135e-05, + "loss": 1.2286, + "step": 1421 + }, + { + "epoch": 3.003959873284055, + "grad_norm": 0.20677802173314594, + "learning_rate": 3.289258910285357e-05, + "loss": 1.2173, + "step": 1422 + }, + { + "epoch": 3.0060718057022173, + "grad_norm": 0.18822313762068393, + "learning_rate": 3.283448405648187e-05, + "loss": 1.2089, + "step": 1423 + }, + { + "epoch": 3.00818373812038, + "grad_norm": 0.19113479122721141, + "learning_rate": 3.2776394627335834e-05, + "loss": 1.2262, + "step": 1424 + }, + { + "epoch": 3.0102956705385426, + "grad_norm": 0.19588750363003146, + "learning_rate": 3.271832094202124e-05, + "loss": 1.2127, + "step": 1425 + }, + { + "epoch": 3.0124076029567055, + "grad_norm": 0.19860030969520354, + "learning_rate": 3.266026312710957e-05, + "loss": 1.2181, + "step": 1426 + }, + { + "epoch": 3.014519535374868, + "grad_norm": 0.17297610484647122, + "learning_rate": 3.2602221309137666e-05, + "loss": 1.2389, + "step": 1427 + }, + { + "epoch": 3.016631467793031, + "grad_norm": 0.17001945084021206, + "learning_rate": 3.2544195614607586e-05, + "loss": 1.2264, + "step": 1428 + }, + { + "epoch": 3.0187434002111933, + "grad_norm": 0.14893478847951844, + "learning_rate": 3.248618616998616e-05, + "loss": 1.2303, + "step": 1429 + }, + { + "epoch": 3.0208553326293557, + "grad_norm": 0.18541555911294172, + "learning_rate": 3.2428193101704855e-05, + "loss": 1.2255, + "step": 1430 + }, + { + "epoch": 3.0229672650475186, + "grad_norm": 0.14604305304015458, + "learning_rate": 3.2370216536159436e-05, + "loss": 1.2252, + "step": 1431 + }, + { + "epoch": 3.025079197465681, + "grad_norm": 0.17683358032200713, + "learning_rate": 3.231225659970968e-05, + "loss": 1.2183, + "step": 1432 + }, + { + "epoch": 3.027191129883844, + "grad_norm": 0.14961387357236167, + "learning_rate": 3.225431341867916e-05, + "loss": 1.2279, + "step": 1433 + }, + { + "epoch": 3.0293030623020063, + "grad_norm": 0.1784974732642884, + "learning_rate": 3.219638711935488e-05, + "loss": 1.2393, + "step": 1434 + }, + { + "epoch": 3.031414994720169, + "grad_norm": 0.1973064471517419, + "learning_rate": 3.213847782798708e-05, + "loss": 1.2175, + "step": 1435 + }, + { + "epoch": 3.0335269271383316, + "grad_norm": 0.14959625951598324, + "learning_rate": 3.2080585670788964e-05, + "loss": 1.2172, + "step": 1436 + }, + { + "epoch": 3.035638859556494, + "grad_norm": 0.17459678456865269, + "learning_rate": 3.202271077393631e-05, + "loss": 1.2234, + "step": 1437 + }, + { + "epoch": 3.037750791974657, + "grad_norm": 0.14096792901557148, + "learning_rate": 3.196485326356737e-05, + "loss": 1.2237, + "step": 1438 + }, + { + "epoch": 3.0398627243928193, + "grad_norm": 0.1624527217438897, + "learning_rate": 3.190701326578241e-05, + "loss": 1.2177, + "step": 1439 + }, + { + "epoch": 3.0419746568109822, + "grad_norm": 0.1441736918571477, + "learning_rate": 3.1849190906643595e-05, + "loss": 1.1996, + "step": 1440 + }, + { + "epoch": 3.0440865892291447, + "grad_norm": 0.13709776062720713, + "learning_rate": 3.179138631217463e-05, + "loss": 1.2248, + "step": 1441 + }, + { + "epoch": 3.046198521647307, + "grad_norm": 0.14298623325421855, + "learning_rate": 3.173359960836049e-05, + "loss": 1.2389, + "step": 1442 + }, + { + "epoch": 3.04831045406547, + "grad_norm": 0.15955067352968158, + "learning_rate": 3.167583092114717e-05, + "loss": 1.2287, + "step": 1443 + }, + { + "epoch": 3.0504223864836324, + "grad_norm": 0.1484782109490905, + "learning_rate": 3.1618080376441384e-05, + "loss": 1.2249, + "step": 1444 + }, + { + "epoch": 3.0525343189017953, + "grad_norm": 0.17862914934591118, + "learning_rate": 3.156034810011032e-05, + "loss": 1.233, + "step": 1445 + }, + { + "epoch": 3.0546462513199577, + "grad_norm": 0.16271770267049285, + "learning_rate": 3.1502634217981346e-05, + "loss": 1.2154, + "step": 1446 + }, + { + "epoch": 3.0567581837381206, + "grad_norm": 0.13144878840684637, + "learning_rate": 3.1444938855841716e-05, + "loss": 1.2235, + "step": 1447 + }, + { + "epoch": 3.058870116156283, + "grad_norm": 0.12081753523589324, + "learning_rate": 3.138726213943837e-05, + "loss": 1.2165, + "step": 1448 + }, + { + "epoch": 3.0609820485744454, + "grad_norm": 0.11068649331786819, + "learning_rate": 3.132960419447754e-05, + "loss": 1.2283, + "step": 1449 + }, + { + "epoch": 3.0630939809926083, + "grad_norm": 0.1373571355374936, + "learning_rate": 3.127196514662461e-05, + "loss": 1.2263, + "step": 1450 + }, + { + "epoch": 3.0652059134107708, + "grad_norm": 0.12907674384697604, + "learning_rate": 3.121434512150378e-05, + "loss": 1.2228, + "step": 1451 + }, + { + "epoch": 3.0673178458289336, + "grad_norm": 0.13312674531566382, + "learning_rate": 3.115674424469772e-05, + "loss": 1.2306, + "step": 1452 + }, + { + "epoch": 3.069429778247096, + "grad_norm": 0.14323403767720924, + "learning_rate": 3.109916264174743e-05, + "loss": 1.2175, + "step": 1453 + }, + { + "epoch": 3.071541710665259, + "grad_norm": 0.1367902075706264, + "learning_rate": 3.104160043815187e-05, + "loss": 1.2265, + "step": 1454 + }, + { + "epoch": 3.0736536430834214, + "grad_norm": 0.1534954521574034, + "learning_rate": 3.0984057759367764e-05, + "loss": 1.2239, + "step": 1455 + }, + { + "epoch": 3.075765575501584, + "grad_norm": 0.12379808453679084, + "learning_rate": 3.09265347308092e-05, + "loss": 1.226, + "step": 1456 + }, + { + "epoch": 3.0778775079197467, + "grad_norm": 0.14804578051423758, + "learning_rate": 3.0869031477847515e-05, + "loss": 1.2281, + "step": 1457 + }, + { + "epoch": 3.079989440337909, + "grad_norm": 0.172957977893228, + "learning_rate": 3.081154812581093e-05, + "loss": 1.2151, + "step": 1458 + }, + { + "epoch": 3.082101372756072, + "grad_norm": 0.14697144507084708, + "learning_rate": 3.075408479998425e-05, + "loss": 1.2333, + "step": 1459 + }, + { + "epoch": 3.0842133051742344, + "grad_norm": 0.21233195195468527, + "learning_rate": 3.06966416256087e-05, + "loss": 1.2401, + "step": 1460 + }, + { + "epoch": 3.086325237592397, + "grad_norm": 0.23436693661650576, + "learning_rate": 3.063921872788151e-05, + "loss": 1.244, + "step": 1461 + }, + { + "epoch": 3.0884371700105597, + "grad_norm": 0.17472954991642695, + "learning_rate": 3.058181623195577e-05, + "loss": 1.2274, + "step": 1462 + }, + { + "epoch": 3.090549102428722, + "grad_norm": 0.16046276924257835, + "learning_rate": 3.05244342629401e-05, + "loss": 1.226, + "step": 1463 + }, + { + "epoch": 3.092661034846885, + "grad_norm": 0.19456740706090947, + "learning_rate": 3.046707294589835e-05, + "loss": 1.2277, + "step": 1464 + }, + { + "epoch": 3.0947729672650475, + "grad_norm": 0.14175183809399627, + "learning_rate": 3.0409732405849405e-05, + "loss": 1.2225, + "step": 1465 + }, + { + "epoch": 3.0968848996832103, + "grad_norm": 0.16169259061904662, + "learning_rate": 3.0352412767766805e-05, + "loss": 1.2302, + "step": 1466 + }, + { + "epoch": 3.098996832101373, + "grad_norm": 0.12654657300362096, + "learning_rate": 3.0295114156578606e-05, + "loss": 1.2326, + "step": 1467 + }, + { + "epoch": 3.101108764519535, + "grad_norm": 0.1676959173528841, + "learning_rate": 3.0237836697166997e-05, + "loss": 1.2359, + "step": 1468 + }, + { + "epoch": 3.103220696937698, + "grad_norm": 0.15431335000223506, + "learning_rate": 3.0180580514368037e-05, + "loss": 1.2294, + "step": 1469 + }, + { + "epoch": 3.1053326293558605, + "grad_norm": 0.15420644578382214, + "learning_rate": 3.012334573297149e-05, + "loss": 1.221, + "step": 1470 + }, + { + "epoch": 3.1074445617740234, + "grad_norm": 0.13740922495271773, + "learning_rate": 3.0066132477720393e-05, + "loss": 1.2174, + "step": 1471 + }, + { + "epoch": 3.109556494192186, + "grad_norm": 0.16474387552158157, + "learning_rate": 3.000894087331092e-05, + "loss": 1.2299, + "step": 1472 + }, + { + "epoch": 3.1116684266103483, + "grad_norm": 0.12723514211647963, + "learning_rate": 2.995177104439207e-05, + "loss": 1.2193, + "step": 1473 + }, + { + "epoch": 3.113780359028511, + "grad_norm": 0.1627194201785723, + "learning_rate": 2.9894623115565316e-05, + "loss": 1.2299, + "step": 1474 + }, + { + "epoch": 3.1158922914466736, + "grad_norm": 0.13463445899319115, + "learning_rate": 2.9837497211384457e-05, + "loss": 1.2425, + "step": 1475 + }, + { + "epoch": 3.1180042238648364, + "grad_norm": 0.1384794756695371, + "learning_rate": 2.978039345635526e-05, + "loss": 1.2261, + "step": 1476 + }, + { + "epoch": 3.120116156282999, + "grad_norm": 0.14529496089085508, + "learning_rate": 2.9723311974935235e-05, + "loss": 1.2124, + "step": 1477 + }, + { + "epoch": 3.1222280887011618, + "grad_norm": 0.1410362294486071, + "learning_rate": 2.9666252891533354e-05, + "loss": 1.2185, + "step": 1478 + }, + { + "epoch": 3.124340021119324, + "grad_norm": 0.16837557798429187, + "learning_rate": 2.960921633050973e-05, + "loss": 1.217, + "step": 1479 + }, + { + "epoch": 3.1264519535374866, + "grad_norm": 0.1355447342408687, + "learning_rate": 2.9552202416175456e-05, + "loss": 1.2336, + "step": 1480 + }, + { + "epoch": 3.1285638859556495, + "grad_norm": 0.13868555296260204, + "learning_rate": 2.9495211272792184e-05, + "loss": 1.2313, + "step": 1481 + }, + { + "epoch": 3.130675818373812, + "grad_norm": 0.14596763563232837, + "learning_rate": 2.943824302457202e-05, + "loss": 1.2284, + "step": 1482 + }, + { + "epoch": 3.132787750791975, + "grad_norm": 0.1391828626041363, + "learning_rate": 2.938129779567712e-05, + "loss": 1.2196, + "step": 1483 + }, + { + "epoch": 3.1348996832101372, + "grad_norm": 0.13915663251192065, + "learning_rate": 2.9324375710219455e-05, + "loss": 1.2384, + "step": 1484 + }, + { + "epoch": 3.1370116156282997, + "grad_norm": 0.12888363447912315, + "learning_rate": 2.9267476892260625e-05, + "loss": 1.2339, + "step": 1485 + }, + { + "epoch": 3.1391235480464625, + "grad_norm": 0.1442487585335087, + "learning_rate": 2.9210601465811428e-05, + "loss": 1.214, + "step": 1486 + }, + { + "epoch": 3.141235480464625, + "grad_norm": 0.12779858488775397, + "learning_rate": 2.915374955483177e-05, + "loss": 1.2271, + "step": 1487 + }, + { + "epoch": 3.143347412882788, + "grad_norm": 0.154259565458381, + "learning_rate": 2.9096921283230223e-05, + "loss": 1.2184, + "step": 1488 + }, + { + "epoch": 3.1454593453009503, + "grad_norm": 0.13235739243389294, + "learning_rate": 2.90401167748639e-05, + "loss": 1.222, + "step": 1489 + }, + { + "epoch": 3.147571277719113, + "grad_norm": 0.1395073446308979, + "learning_rate": 2.898333615353809e-05, + "loss": 1.2319, + "step": 1490 + }, + { + "epoch": 3.1496832101372756, + "grad_norm": 0.1356326606201067, + "learning_rate": 2.892657954300603e-05, + "loss": 1.2112, + "step": 1491 + }, + { + "epoch": 3.151795142555438, + "grad_norm": 0.13222705413064154, + "learning_rate": 2.8869847066968632e-05, + "loss": 1.2202, + "step": 1492 + }, + { + "epoch": 3.153907074973601, + "grad_norm": 0.14715382992802928, + "learning_rate": 2.8813138849074165e-05, + "loss": 1.2312, + "step": 1493 + }, + { + "epoch": 3.1560190073917633, + "grad_norm": 0.13687916140339668, + "learning_rate": 2.8756455012918088e-05, + "loss": 1.2205, + "step": 1494 + }, + { + "epoch": 3.158130939809926, + "grad_norm": 0.15677015416164256, + "learning_rate": 2.86997956820427e-05, + "loss": 1.2451, + "step": 1495 + }, + { + "epoch": 3.1602428722280886, + "grad_norm": 0.15146814232599243, + "learning_rate": 2.864316097993686e-05, + "loss": 1.2317, + "step": 1496 + }, + { + "epoch": 3.1623548046462515, + "grad_norm": 0.17092633213128666, + "learning_rate": 2.85865510300358e-05, + "loss": 1.223, + "step": 1497 + }, + { + "epoch": 3.164466737064414, + "grad_norm": 0.14321617441449858, + "learning_rate": 2.8529965955720746e-05, + "loss": 1.2228, + "step": 1498 + }, + { + "epoch": 3.1665786694825764, + "grad_norm": 0.16144570825794202, + "learning_rate": 2.8473405880318748e-05, + "loss": 1.2204, + "step": 1499 + }, + { + "epoch": 3.1686906019007393, + "grad_norm": 0.1570590183666449, + "learning_rate": 2.8416870927102382e-05, + "loss": 1.2289, + "step": 1500 + }, + { + "epoch": 3.1708025343189017, + "grad_norm": 0.1665362457759338, + "learning_rate": 2.8360361219289424e-05, + "loss": 1.2259, + "step": 1501 + }, + { + "epoch": 3.1729144667370646, + "grad_norm": 0.25460211186516574, + "learning_rate": 2.8303876880042675e-05, + "loss": 1.222, + "step": 1502 + }, + { + "epoch": 3.175026399155227, + "grad_norm": 0.17669844936788295, + "learning_rate": 2.8247418032469598e-05, + "loss": 1.2256, + "step": 1503 + }, + { + "epoch": 3.1771383315733894, + "grad_norm": 0.13547185801595119, + "learning_rate": 2.8190984799622146e-05, + "loss": 1.2407, + "step": 1504 + }, + { + "epoch": 3.1792502639915523, + "grad_norm": 0.1693538795298241, + "learning_rate": 2.8134577304496415e-05, + "loss": 1.2256, + "step": 1505 + }, + { + "epoch": 3.1813621964097147, + "grad_norm": 0.1915647534245182, + "learning_rate": 2.8078195670032397e-05, + "loss": 1.2241, + "step": 1506 + }, + { + "epoch": 3.1834741288278776, + "grad_norm": 0.13766099162342466, + "learning_rate": 2.8021840019113765e-05, + "loss": 1.2203, + "step": 1507 + }, + { + "epoch": 3.18558606124604, + "grad_norm": 0.17707242996339972, + "learning_rate": 2.7965510474567498e-05, + "loss": 1.2032, + "step": 1508 + }, + { + "epoch": 3.187697993664203, + "grad_norm": 0.20143143201356975, + "learning_rate": 2.7909207159163724e-05, + "loss": 1.2255, + "step": 1509 + }, + { + "epoch": 3.1898099260823654, + "grad_norm": 0.21084234777042263, + "learning_rate": 2.7852930195615413e-05, + "loss": 1.2123, + "step": 1510 + }, + { + "epoch": 3.191921858500528, + "grad_norm": 0.19224406132276375, + "learning_rate": 2.7796679706578053e-05, + "loss": 1.2143, + "step": 1511 + }, + { + "epoch": 3.1940337909186907, + "grad_norm": 0.16816931647691757, + "learning_rate": 2.774045581464946e-05, + "loss": 1.2362, + "step": 1512 + }, + { + "epoch": 3.196145723336853, + "grad_norm": 0.1967975532047474, + "learning_rate": 2.7684258642369486e-05, + "loss": 1.2259, + "step": 1513 + }, + { + "epoch": 3.198257655755016, + "grad_norm": 0.18344603595048667, + "learning_rate": 2.762808831221972e-05, + "loss": 1.2422, + "step": 1514 + }, + { + "epoch": 3.2003695881731784, + "grad_norm": 0.20762995948722196, + "learning_rate": 2.7571944946623307e-05, + "loss": 1.222, + "step": 1515 + }, + { + "epoch": 3.2024815205913413, + "grad_norm": 0.13471407580419545, + "learning_rate": 2.7515828667944542e-05, + "loss": 1.239, + "step": 1516 + }, + { + "epoch": 3.2045934530095037, + "grad_norm": 0.1871944627938325, + "learning_rate": 2.7459739598488765e-05, + "loss": 1.2183, + "step": 1517 + }, + { + "epoch": 3.206705385427666, + "grad_norm": 0.15467985398476952, + "learning_rate": 2.740367786050193e-05, + "loss": 1.2122, + "step": 1518 + }, + { + "epoch": 3.208817317845829, + "grad_norm": 0.16549218384857037, + "learning_rate": 2.734764357617052e-05, + "loss": 1.2332, + "step": 1519 + }, + { + "epoch": 3.2109292502639915, + "grad_norm": 0.509453412523133, + "learning_rate": 2.729163686762109e-05, + "loss": 1.2419, + "step": 1520 + }, + { + "epoch": 3.2130411826821543, + "grad_norm": 0.16526029964475913, + "learning_rate": 2.7235657856920136e-05, + "loss": 1.2284, + "step": 1521 + }, + { + "epoch": 3.2151531151003168, + "grad_norm": 0.14612715942156085, + "learning_rate": 2.7179706666073814e-05, + "loss": 1.2311, + "step": 1522 + }, + { + "epoch": 3.217265047518479, + "grad_norm": 0.46188149452036725, + "learning_rate": 2.712378341702758e-05, + "loss": 1.2369, + "step": 1523 + }, + { + "epoch": 3.219376979936642, + "grad_norm": 0.3540318622996265, + "learning_rate": 2.7067888231666065e-05, + "loss": 1.2549, + "step": 1524 + }, + { + "epoch": 3.2214889123548045, + "grad_norm": 0.1657522047115624, + "learning_rate": 2.7012021231812666e-05, + "loss": 1.2165, + "step": 1525 + }, + { + "epoch": 3.2236008447729674, + "grad_norm": 0.19342097023166802, + "learning_rate": 2.6956182539229392e-05, + "loss": 1.2183, + "step": 1526 + }, + { + "epoch": 3.22571277719113, + "grad_norm": 0.1900960321272287, + "learning_rate": 2.6900372275616563e-05, + "loss": 1.2313, + "step": 1527 + }, + { + "epoch": 3.2278247096092927, + "grad_norm": 0.20898273119307714, + "learning_rate": 2.6844590562612497e-05, + "loss": 1.24, + "step": 1528 + }, + { + "epoch": 3.229936642027455, + "grad_norm": 0.202541815683263, + "learning_rate": 2.678883752179333e-05, + "loss": 1.2194, + "step": 1529 + }, + { + "epoch": 3.2320485744456176, + "grad_norm": 0.19851280311750874, + "learning_rate": 2.6733113274672665e-05, + "loss": 1.2296, + "step": 1530 + }, + { + "epoch": 3.2341605068637804, + "grad_norm": 0.17418465404446826, + "learning_rate": 2.667741794270138e-05, + "loss": 1.231, + "step": 1531 + }, + { + "epoch": 3.236272439281943, + "grad_norm": 0.16677978470479468, + "learning_rate": 2.662175164726735e-05, + "loss": 1.229, + "step": 1532 + }, + { + "epoch": 3.2383843717001057, + "grad_norm": 0.16031760396735636, + "learning_rate": 2.6566114509695098e-05, + "loss": 1.2162, + "step": 1533 + }, + { + "epoch": 3.240496304118268, + "grad_norm": 0.16424572385530842, + "learning_rate": 2.651050665124568e-05, + "loss": 1.236, + "step": 1534 + }, + { + "epoch": 3.242608236536431, + "grad_norm": 0.18807891589950382, + "learning_rate": 2.6454928193116258e-05, + "loss": 1.22, + "step": 1535 + }, + { + "epoch": 3.2447201689545935, + "grad_norm": 0.17834711329252168, + "learning_rate": 2.639937925643997e-05, + "loss": 1.2235, + "step": 1536 + }, + { + "epoch": 3.246832101372756, + "grad_norm": 0.17161664017119935, + "learning_rate": 2.634385996228561e-05, + "loss": 1.225, + "step": 1537 + }, + { + "epoch": 3.248944033790919, + "grad_norm": 0.18822246793842412, + "learning_rate": 2.6288370431657333e-05, + "loss": 1.217, + "step": 1538 + }, + { + "epoch": 3.251055966209081, + "grad_norm": 0.2032516842075396, + "learning_rate": 2.6232910785494467e-05, + "loss": 1.241, + "step": 1539 + }, + { + "epoch": 3.253167898627244, + "grad_norm": 0.14519609553341564, + "learning_rate": 2.6177481144671157e-05, + "loss": 1.2206, + "step": 1540 + }, + { + "epoch": 3.2552798310454065, + "grad_norm": 0.21903818838845088, + "learning_rate": 2.6122081629996197e-05, + "loss": 1.2296, + "step": 1541 + }, + { + "epoch": 3.257391763463569, + "grad_norm": 0.1737596030638558, + "learning_rate": 2.6066712362212706e-05, + "loss": 1.2299, + "step": 1542 + }, + { + "epoch": 3.259503695881732, + "grad_norm": 0.17093871279976236, + "learning_rate": 2.601137346199786e-05, + "loss": 1.2108, + "step": 1543 + }, + { + "epoch": 3.2616156282998943, + "grad_norm": 0.23530703284895016, + "learning_rate": 2.5956065049962687e-05, + "loss": 1.2206, + "step": 1544 + }, + { + "epoch": 3.263727560718057, + "grad_norm": 0.13352147730015543, + "learning_rate": 2.5900787246651717e-05, + "loss": 1.2251, + "step": 1545 + }, + { + "epoch": 3.2658394931362196, + "grad_norm": 0.18521841417063514, + "learning_rate": 2.5845540172542815e-05, + "loss": 1.2405, + "step": 1546 + }, + { + "epoch": 3.2679514255543824, + "grad_norm": 0.133573091037659, + "learning_rate": 2.5790323948046867e-05, + "loss": 1.2281, + "step": 1547 + }, + { + "epoch": 3.270063357972545, + "grad_norm": 0.1608640466150519, + "learning_rate": 2.573513869350748e-05, + "loss": 1.2093, + "step": 1548 + }, + { + "epoch": 3.2721752903907073, + "grad_norm": 0.13893963499085307, + "learning_rate": 2.5679984529200813e-05, + "loss": 1.2301, + "step": 1549 + }, + { + "epoch": 3.27428722280887, + "grad_norm": 0.14469588718828708, + "learning_rate": 2.5624861575335223e-05, + "loss": 1.2307, + "step": 1550 + }, + { + "epoch": 3.2763991552270326, + "grad_norm": 0.13481475230411172, + "learning_rate": 2.5569769952051086e-05, + "loss": 1.221, + "step": 1551 + }, + { + "epoch": 3.2785110876451955, + "grad_norm": 0.14564521994663918, + "learning_rate": 2.5514709779420432e-05, + "loss": 1.2191, + "step": 1552 + }, + { + "epoch": 3.280623020063358, + "grad_norm": 0.13872830504871791, + "learning_rate": 2.5459681177446803e-05, + "loss": 1.235, + "step": 1553 + }, + { + "epoch": 3.282734952481521, + "grad_norm": 0.1256908553348283, + "learning_rate": 2.540468426606493e-05, + "loss": 1.2135, + "step": 1554 + }, + { + "epoch": 3.2848468848996832, + "grad_norm": 0.1431052162118805, + "learning_rate": 2.5349719165140428e-05, + "loss": 1.2387, + "step": 1555 + }, + { + "epoch": 3.2869588173178457, + "grad_norm": 0.18137247575356077, + "learning_rate": 2.5294785994469636e-05, + "loss": 1.226, + "step": 1556 + }, + { + "epoch": 3.2890707497360085, + "grad_norm": 0.16444207011857198, + "learning_rate": 2.523988487377924e-05, + "loss": 1.2166, + "step": 1557 + }, + { + "epoch": 3.291182682154171, + "grad_norm": 0.14994143333708496, + "learning_rate": 2.5185015922726127e-05, + "loss": 1.2302, + "step": 1558 + }, + { + "epoch": 3.293294614572334, + "grad_norm": 0.14642956578518454, + "learning_rate": 2.5130179260897062e-05, + "loss": 1.2134, + "step": 1559 + }, + { + "epoch": 3.2954065469904963, + "grad_norm": 0.12583268077797063, + "learning_rate": 2.50753750078084e-05, + "loss": 1.2326, + "step": 1560 + }, + { + "epoch": 3.2975184794086587, + "grad_norm": 0.14191402129411868, + "learning_rate": 2.5020603282905908e-05, + "loss": 1.2201, + "step": 1561 + }, + { + "epoch": 3.2996304118268216, + "grad_norm": 0.12181242866483355, + "learning_rate": 2.4965864205564413e-05, + "loss": 1.2224, + "step": 1562 + }, + { + "epoch": 3.301742344244984, + "grad_norm": 0.14900859910105255, + "learning_rate": 2.4911157895087622e-05, + "loss": 1.2211, + "step": 1563 + }, + { + "epoch": 3.303854276663147, + "grad_norm": 0.130078745467458, + "learning_rate": 2.485648447070783e-05, + "loss": 1.236, + "step": 1564 + }, + { + "epoch": 3.3059662090813093, + "grad_norm": 0.129839823372703, + "learning_rate": 2.4801844051585605e-05, + "loss": 1.2231, + "step": 1565 + }, + { + "epoch": 3.3080781414994718, + "grad_norm": 0.12898323778025028, + "learning_rate": 2.4747236756809655e-05, + "loss": 1.2286, + "step": 1566 + }, + { + "epoch": 3.3101900739176346, + "grad_norm": 0.15163584180370515, + "learning_rate": 2.4692662705396412e-05, + "loss": 1.2248, + "step": 1567 + }, + { + "epoch": 3.312302006335797, + "grad_norm": 0.1347669197890165, + "learning_rate": 2.4638122016289928e-05, + "loss": 1.2262, + "step": 1568 + }, + { + "epoch": 3.31441393875396, + "grad_norm": 0.1360667679805927, + "learning_rate": 2.458361480836151e-05, + "loss": 1.2244, + "step": 1569 + }, + { + "epoch": 3.3165258711721224, + "grad_norm": 0.14884216891374943, + "learning_rate": 2.4529141200409473e-05, + "loss": 1.2283, + "step": 1570 + }, + { + "epoch": 3.3186378035902853, + "grad_norm": 0.16274598804844226, + "learning_rate": 2.4474701311158953e-05, + "loss": 1.2139, + "step": 1571 + }, + { + "epoch": 3.3207497360084477, + "grad_norm": 0.14131412701070822, + "learning_rate": 2.4420295259261532e-05, + "loss": 1.2385, + "step": 1572 + }, + { + "epoch": 3.3228616684266106, + "grad_norm": 0.13086767804957528, + "learning_rate": 2.4365923163295087e-05, + "loss": 1.2217, + "step": 1573 + }, + { + "epoch": 3.324973600844773, + "grad_norm": 0.12418607720886564, + "learning_rate": 2.4311585141763495e-05, + "loss": 1.2272, + "step": 1574 + }, + { + "epoch": 3.3270855332629354, + "grad_norm": 0.12399226164200464, + "learning_rate": 2.4257281313096317e-05, + "loss": 1.2157, + "step": 1575 + }, + { + "epoch": 3.3291974656810983, + "grad_norm": 0.14232275033018205, + "learning_rate": 2.4203011795648658e-05, + "loss": 1.2241, + "step": 1576 + }, + { + "epoch": 3.3313093980992607, + "grad_norm": 0.1302213775748988, + "learning_rate": 2.4148776707700777e-05, + "loss": 1.2395, + "step": 1577 + }, + { + "epoch": 3.3334213305174236, + "grad_norm": 0.1272842046684516, + "learning_rate": 2.409457616745795e-05, + "loss": 1.2245, + "step": 1578 + }, + { + "epoch": 3.335533262935586, + "grad_norm": 0.13468133766351054, + "learning_rate": 2.4040410293050122e-05, + "loss": 1.2307, + "step": 1579 + }, + { + "epoch": 3.3376451953537485, + "grad_norm": 0.11423212658296052, + "learning_rate": 2.3986279202531673e-05, + "loss": 1.2491, + "step": 1580 + }, + { + "epoch": 3.3397571277719114, + "grad_norm": 0.12956957080201528, + "learning_rate": 2.393218301388123e-05, + "loss": 1.2189, + "step": 1581 + }, + { + "epoch": 3.341869060190074, + "grad_norm": 0.11670047309878293, + "learning_rate": 2.3878121845001276e-05, + "loss": 1.233, + "step": 1582 + }, + { + "epoch": 3.3439809926082367, + "grad_norm": 0.1324710881294976, + "learning_rate": 2.3824095813718024e-05, + "loss": 1.2383, + "step": 1583 + }, + { + "epoch": 3.346092925026399, + "grad_norm": 0.1274161547544752, + "learning_rate": 2.3770105037781097e-05, + "loss": 1.2352, + "step": 1584 + }, + { + "epoch": 3.3482048574445615, + "grad_norm": 0.11658395173065077, + "learning_rate": 2.3716149634863248e-05, + "loss": 1.2274, + "step": 1585 + }, + { + "epoch": 3.3503167898627244, + "grad_norm": 0.12068872266975779, + "learning_rate": 2.366222972256016e-05, + "loss": 1.2086, + "step": 1586 + }, + { + "epoch": 3.352428722280887, + "grad_norm": 0.12596509272693224, + "learning_rate": 2.360834541839016e-05, + "loss": 1.2154, + "step": 1587 + }, + { + "epoch": 3.3545406546990497, + "grad_norm": 0.12844433006768272, + "learning_rate": 2.355449683979398e-05, + "loss": 1.2134, + "step": 1588 + }, + { + "epoch": 3.356652587117212, + "grad_norm": 0.11234395288039106, + "learning_rate": 2.3500684104134437e-05, + "loss": 1.2186, + "step": 1589 + }, + { + "epoch": 3.358764519535375, + "grad_norm": 0.12527260698844067, + "learning_rate": 2.3446907328696292e-05, + "loss": 1.2315, + "step": 1590 + }, + { + "epoch": 3.3608764519535375, + "grad_norm": 0.12565152830262089, + "learning_rate": 2.3393166630685918e-05, + "loss": 1.2274, + "step": 1591 + }, + { + "epoch": 3.3629883843717003, + "grad_norm": 0.1191593383919846, + "learning_rate": 2.3339462127231015e-05, + "loss": 1.2258, + "step": 1592 + }, + { + "epoch": 3.3651003167898628, + "grad_norm": 0.11649017945544626, + "learning_rate": 2.3285793935380463e-05, + "loss": 1.2255, + "step": 1593 + }, + { + "epoch": 3.367212249208025, + "grad_norm": 0.13109401382320138, + "learning_rate": 2.323216217210391e-05, + "loss": 1.2331, + "step": 1594 + }, + { + "epoch": 3.369324181626188, + "grad_norm": 0.12106180707926388, + "learning_rate": 2.31785669542917e-05, + "loss": 1.2292, + "step": 1595 + }, + { + "epoch": 3.3714361140443505, + "grad_norm": 0.1726771458293426, + "learning_rate": 2.3125008398754483e-05, + "loss": 1.2288, + "step": 1596 + }, + { + "epoch": 3.3735480464625134, + "grad_norm": 0.12532317756312422, + "learning_rate": 2.3071486622223006e-05, + "loss": 1.2413, + "step": 1597 + }, + { + "epoch": 3.375659978880676, + "grad_norm": 0.16305563759143823, + "learning_rate": 2.3018001741347872e-05, + "loss": 1.2288, + "step": 1598 + }, + { + "epoch": 3.3777719112988382, + "grad_norm": 0.15758053844099723, + "learning_rate": 2.2964553872699216e-05, + "loss": 1.2326, + "step": 1599 + }, + { + "epoch": 3.379883843717001, + "grad_norm": 0.14185755051058216, + "learning_rate": 2.2911143132766586e-05, + "loss": 1.2264, + "step": 1600 + }, + { + "epoch": 3.3819957761351636, + "grad_norm": 0.1315108037384061, + "learning_rate": 2.2857769637958556e-05, + "loss": 1.2283, + "step": 1601 + }, + { + "epoch": 3.3841077085533264, + "grad_norm": 0.12461990628599313, + "learning_rate": 2.280443350460254e-05, + "loss": 1.2205, + "step": 1602 + }, + { + "epoch": 3.386219640971489, + "grad_norm": 0.15198885233099418, + "learning_rate": 2.2751134848944507e-05, + "loss": 1.2084, + "step": 1603 + }, + { + "epoch": 3.3883315733896513, + "grad_norm": 0.15301591288629432, + "learning_rate": 2.2697873787148766e-05, + "loss": 1.2227, + "step": 1604 + }, + { + "epoch": 3.390443505807814, + "grad_norm": 0.11975080213284, + "learning_rate": 2.264465043529768e-05, + "loss": 1.2263, + "step": 1605 + }, + { + "epoch": 3.3925554382259766, + "grad_norm": 0.12677738244393483, + "learning_rate": 2.2591464909391418e-05, + "loss": 1.2228, + "step": 1606 + }, + { + "epoch": 3.3946673706441395, + "grad_norm": 0.1267555123890361, + "learning_rate": 2.2538317325347717e-05, + "loss": 1.2257, + "step": 1607 + }, + { + "epoch": 3.396779303062302, + "grad_norm": 0.12435576741163262, + "learning_rate": 2.2485207799001607e-05, + "loss": 1.2155, + "step": 1608 + }, + { + "epoch": 3.398891235480465, + "grad_norm": 0.14791149138600518, + "learning_rate": 2.2432136446105192e-05, + "loss": 1.2291, + "step": 1609 + }, + { + "epoch": 3.401003167898627, + "grad_norm": 0.12908156086020206, + "learning_rate": 2.2379103382327357e-05, + "loss": 1.2309, + "step": 1610 + }, + { + "epoch": 3.40311510031679, + "grad_norm": 0.1532283609629337, + "learning_rate": 2.2326108723253544e-05, + "loss": 1.2309, + "step": 1611 + }, + { + "epoch": 3.4052270327349525, + "grad_norm": 0.13976224913794824, + "learning_rate": 2.227315258438549e-05, + "loss": 1.2339, + "step": 1612 + }, + { + "epoch": 3.407338965153115, + "grad_norm": 0.13475491450042854, + "learning_rate": 2.2220235081140986e-05, + "loss": 1.231, + "step": 1613 + }, + { + "epoch": 3.409450897571278, + "grad_norm": 0.12856909895484428, + "learning_rate": 2.2167356328853603e-05, + "loss": 1.2263, + "step": 1614 + }, + { + "epoch": 3.4115628299894403, + "grad_norm": 0.12471588506909757, + "learning_rate": 2.2114516442772443e-05, + "loss": 1.2242, + "step": 1615 + }, + { + "epoch": 3.413674762407603, + "grad_norm": 0.12555747671326783, + "learning_rate": 2.206171553806198e-05, + "loss": 1.2174, + "step": 1616 + }, + { + "epoch": 3.4157866948257656, + "grad_norm": 0.11771233174090952, + "learning_rate": 2.2008953729801585e-05, + "loss": 1.237, + "step": 1617 + }, + { + "epoch": 3.417898627243928, + "grad_norm": 0.13798936199034723, + "learning_rate": 2.195623113298557e-05, + "loss": 1.2221, + "step": 1618 + }, + { + "epoch": 3.420010559662091, + "grad_norm": 0.11718272974965659, + "learning_rate": 2.1903547862522666e-05, + "loss": 1.2281, + "step": 1619 + }, + { + "epoch": 3.4221224920802533, + "grad_norm": 0.12264014688793987, + "learning_rate": 2.185090403323596e-05, + "loss": 1.2105, + "step": 1620 + }, + { + "epoch": 3.424234424498416, + "grad_norm": 0.12528862472233465, + "learning_rate": 2.179829975986255e-05, + "loss": 1.2061, + "step": 1621 + }, + { + "epoch": 3.4263463569165786, + "grad_norm": 0.1216798849447191, + "learning_rate": 2.1745735157053315e-05, + "loss": 1.2304, + "step": 1622 + }, + { + "epoch": 3.428458289334741, + "grad_norm": 0.1348360957256745, + "learning_rate": 2.1693210339372744e-05, + "loss": 1.2215, + "step": 1623 + }, + { + "epoch": 3.430570221752904, + "grad_norm": 0.1185418357126947, + "learning_rate": 2.1640725421298487e-05, + "loss": 1.2118, + "step": 1624 + }, + { + "epoch": 3.4326821541710664, + "grad_norm": 0.13453095789070163, + "learning_rate": 2.158828051722137e-05, + "loss": 1.2409, + "step": 1625 + }, + { + "epoch": 3.4347940865892292, + "grad_norm": 0.13901354530650795, + "learning_rate": 2.1535875741444876e-05, + "loss": 1.2215, + "step": 1626 + }, + { + "epoch": 3.4369060190073917, + "grad_norm": 0.12975335328072118, + "learning_rate": 2.1483511208185142e-05, + "loss": 1.2317, + "step": 1627 + }, + { + "epoch": 3.4390179514255546, + "grad_norm": 0.12570469839183482, + "learning_rate": 2.1431187031570548e-05, + "loss": 1.2104, + "step": 1628 + }, + { + "epoch": 3.441129883843717, + "grad_norm": 0.12767911696230194, + "learning_rate": 2.1378903325641475e-05, + "loss": 1.2155, + "step": 1629 + }, + { + "epoch": 3.44324181626188, + "grad_norm": 0.13385145857716088, + "learning_rate": 2.1326660204350196e-05, + "loss": 1.2236, + "step": 1630 + }, + { + "epoch": 3.4453537486800423, + "grad_norm": 0.12430165924418088, + "learning_rate": 2.1274457781560418e-05, + "loss": 1.2313, + "step": 1631 + }, + { + "epoch": 3.4474656810982047, + "grad_norm": 0.13216596042746342, + "learning_rate": 2.1222296171047252e-05, + "loss": 1.2022, + "step": 1632 + }, + { + "epoch": 3.4495776135163676, + "grad_norm": 0.14225873326248206, + "learning_rate": 2.1170175486496786e-05, + "loss": 1.2082, + "step": 1633 + }, + { + "epoch": 3.45168954593453, + "grad_norm": 0.1254356400937591, + "learning_rate": 2.111809584150594e-05, + "loss": 1.2213, + "step": 1634 + }, + { + "epoch": 3.453801478352693, + "grad_norm": 0.1417834219005022, + "learning_rate": 2.106605734958219e-05, + "loss": 1.2186, + "step": 1635 + }, + { + "epoch": 3.4559134107708553, + "grad_norm": 0.12302274065839111, + "learning_rate": 2.1014060124143273e-05, + "loss": 1.2275, + "step": 1636 + }, + { + "epoch": 3.4580253431890178, + "grad_norm": 0.18191067401868513, + "learning_rate": 2.096210427851706e-05, + "loss": 1.2183, + "step": 1637 + }, + { + "epoch": 3.4601372756071807, + "grad_norm": 0.14175203024295552, + "learning_rate": 2.091018992594119e-05, + "loss": 1.2207, + "step": 1638 + }, + { + "epoch": 3.462249208025343, + "grad_norm": 0.14989186343781136, + "learning_rate": 2.0858317179562873e-05, + "loss": 1.2277, + "step": 1639 + }, + { + "epoch": 3.464361140443506, + "grad_norm": 0.16176584860059842, + "learning_rate": 2.0806486152438642e-05, + "loss": 1.2234, + "step": 1640 + }, + { + "epoch": 3.4664730728616684, + "grad_norm": 0.1289476104170934, + "learning_rate": 2.0754696957534106e-05, + "loss": 1.2261, + "step": 1641 + }, + { + "epoch": 3.468585005279831, + "grad_norm": 0.15087590973398504, + "learning_rate": 2.070294970772369e-05, + "loss": 1.2341, + "step": 1642 + }, + { + "epoch": 3.4706969376979937, + "grad_norm": 0.11279073270553841, + "learning_rate": 2.065124451579041e-05, + "loss": 1.2348, + "step": 1643 + }, + { + "epoch": 3.472808870116156, + "grad_norm": 0.20614296715169722, + "learning_rate": 2.059958149442562e-05, + "loss": 1.2244, + "step": 1644 + }, + { + "epoch": 3.474920802534319, + "grad_norm": 0.1427731836433151, + "learning_rate": 2.054796075622875e-05, + "loss": 1.2155, + "step": 1645 + }, + { + "epoch": 3.4770327349524814, + "grad_norm": 0.1576125658082226, + "learning_rate": 2.0496382413707083e-05, + "loss": 1.2349, + "step": 1646 + }, + { + "epoch": 3.4791446673706443, + "grad_norm": 0.12025801859353596, + "learning_rate": 2.0444846579275503e-05, + "loss": 1.2407, + "step": 1647 + }, + { + "epoch": 3.4812565997888067, + "grad_norm": 0.18852215945661235, + "learning_rate": 2.0393353365256238e-05, + "loss": 1.2165, + "step": 1648 + }, + { + "epoch": 3.4833685322069696, + "grad_norm": 0.31075445410857544, + "learning_rate": 2.0341902883878628e-05, + "loss": 1.2363, + "step": 1649 + }, + { + "epoch": 3.485480464625132, + "grad_norm": 0.1500929617829973, + "learning_rate": 2.0290495247278893e-05, + "loss": 1.2266, + "step": 1650 + }, + { + "epoch": 3.4875923970432945, + "grad_norm": 0.14479004832447318, + "learning_rate": 2.0239130567499846e-05, + "loss": 1.2191, + "step": 1651 + }, + { + "epoch": 3.4897043294614574, + "grad_norm": 0.1348278666773476, + "learning_rate": 2.01878089564907e-05, + "loss": 1.2141, + "step": 1652 + }, + { + "epoch": 3.49181626187962, + "grad_norm": 0.139086851993737, + "learning_rate": 2.0136530526106785e-05, + "loss": 1.2179, + "step": 1653 + }, + { + "epoch": 3.4939281942977827, + "grad_norm": 0.12929723388967354, + "learning_rate": 2.00852953881093e-05, + "loss": 1.2259, + "step": 1654 + }, + { + "epoch": 3.496040126715945, + "grad_norm": 0.1906117721552204, + "learning_rate": 2.0034103654165168e-05, + "loss": 1.2186, + "step": 1655 + }, + { + "epoch": 3.4981520591341075, + "grad_norm": 0.13676432883849363, + "learning_rate": 1.998295543584661e-05, + "loss": 1.2344, + "step": 1656 + }, + { + "epoch": 3.5002639915522704, + "grad_norm": 0.17073154565860252, + "learning_rate": 1.993185084463106e-05, + "loss": 1.2343, + "step": 1657 + }, + { + "epoch": 3.502375923970433, + "grad_norm": 0.12278309400565542, + "learning_rate": 1.9880789991900865e-05, + "loss": 1.2195, + "step": 1658 + }, + { + "epoch": 3.5044878563885957, + "grad_norm": 0.14138649735709846, + "learning_rate": 1.982977298894302e-05, + "loss": 1.228, + "step": 1659 + }, + { + "epoch": 3.506599788806758, + "grad_norm": 0.13613799103837432, + "learning_rate": 1.9778799946949006e-05, + "loss": 1.2274, + "step": 1660 + }, + { + "epoch": 3.5087117212249206, + "grad_norm": 0.12853932942426505, + "learning_rate": 1.9727870977014404e-05, + "loss": 1.2106, + "step": 1661 + }, + { + "epoch": 3.5108236536430835, + "grad_norm": 0.18286333995091564, + "learning_rate": 1.9676986190138835e-05, + "loss": 1.2392, + "step": 1662 + }, + { + "epoch": 3.512935586061246, + "grad_norm": 0.17167200554437498, + "learning_rate": 1.9626145697225523e-05, + "loss": 1.2345, + "step": 1663 + }, + { + "epoch": 3.5150475184794088, + "grad_norm": 0.15167393455428188, + "learning_rate": 1.9575349609081246e-05, + "loss": 1.2211, + "step": 1664 + }, + { + "epoch": 3.517159450897571, + "grad_norm": 0.18808025373268855, + "learning_rate": 1.9524598036415973e-05, + "loss": 1.2347, + "step": 1665 + }, + { + "epoch": 3.5192713833157336, + "grad_norm": 0.11517921616987158, + "learning_rate": 1.9473891089842585e-05, + "loss": 1.2109, + "step": 1666 + }, + { + "epoch": 3.5213833157338965, + "grad_norm": 0.15689483479245486, + "learning_rate": 1.9423228879876827e-05, + "loss": 1.2137, + "step": 1667 + }, + { + "epoch": 3.5234952481520594, + "grad_norm": 0.127468399929587, + "learning_rate": 1.93726115169368e-05, + "loss": 1.2182, + "step": 1668 + }, + { + "epoch": 3.525607180570222, + "grad_norm": 0.14765567060421253, + "learning_rate": 1.9322039111342978e-05, + "loss": 1.2182, + "step": 1669 + }, + { + "epoch": 3.5277191129883843, + "grad_norm": 0.12448201957569102, + "learning_rate": 1.9271511773317794e-05, + "loss": 1.2323, + "step": 1670 + }, + { + "epoch": 3.529831045406547, + "grad_norm": 0.12009474541088946, + "learning_rate": 1.922102961298546e-05, + "loss": 1.2194, + "step": 1671 + }, + { + "epoch": 3.5319429778247096, + "grad_norm": 0.11329023498100176, + "learning_rate": 1.917059274037175e-05, + "loss": 1.2202, + "step": 1672 + }, + { + "epoch": 3.5340549102428724, + "grad_norm": 0.11685452262057687, + "learning_rate": 1.9120201265403666e-05, + "loss": 1.2338, + "step": 1673 + }, + { + "epoch": 3.536166842661035, + "grad_norm": 0.11091083218031819, + "learning_rate": 1.9069855297909353e-05, + "loss": 1.2297, + "step": 1674 + }, + { + "epoch": 3.5382787750791973, + "grad_norm": 0.1310325499520254, + "learning_rate": 1.9019554947617728e-05, + "loss": 1.2241, + "step": 1675 + }, + { + "epoch": 3.54039070749736, + "grad_norm": 0.11648464323785482, + "learning_rate": 1.896930032415828e-05, + "loss": 1.2361, + "step": 1676 + }, + { + "epoch": 3.5425026399155226, + "grad_norm": 0.12978731501229013, + "learning_rate": 1.891909153706085e-05, + "loss": 1.2095, + "step": 1677 + }, + { + "epoch": 3.5446145723336855, + "grad_norm": 0.10410408796704705, + "learning_rate": 1.8868928695755375e-05, + "loss": 1.2263, + "step": 1678 + }, + { + "epoch": 3.546726504751848, + "grad_norm": 0.1274098342358375, + "learning_rate": 1.8818811909571657e-05, + "loss": 1.214, + "step": 1679 + }, + { + "epoch": 3.5488384371700104, + "grad_norm": 0.11368326528379058, + "learning_rate": 1.8768741287739106e-05, + "loss": 1.246, + "step": 1680 + }, + { + "epoch": 3.5509503695881732, + "grad_norm": 0.11354814518644038, + "learning_rate": 1.8718716939386543e-05, + "loss": 1.2237, + "step": 1681 + }, + { + "epoch": 3.5530623020063357, + "grad_norm": 0.12466160948448603, + "learning_rate": 1.8668738973541913e-05, + "loss": 1.2187, + "step": 1682 + }, + { + "epoch": 3.5551742344244985, + "grad_norm": 0.10958240366943, + "learning_rate": 1.8618807499132075e-05, + "loss": 1.226, + "step": 1683 + }, + { + "epoch": 3.557286166842661, + "grad_norm": 0.14221037898291733, + "learning_rate": 1.8568922624982567e-05, + "loss": 1.2114, + "step": 1684 + }, + { + "epoch": 3.5593980992608234, + "grad_norm": 0.1415836779483271, + "learning_rate": 1.8519084459817365e-05, + "loss": 1.2214, + "step": 1685 + }, + { + "epoch": 3.5615100316789863, + "grad_norm": 0.12151308394197158, + "learning_rate": 1.846929311225863e-05, + "loss": 1.2249, + "step": 1686 + }, + { + "epoch": 3.563621964097149, + "grad_norm": 0.1785012103960912, + "learning_rate": 1.841954869082649e-05, + "loss": 1.2261, + "step": 1687 + }, + { + "epoch": 3.5657338965153116, + "grad_norm": 0.13426605618855145, + "learning_rate": 1.83698513039388e-05, + "loss": 1.2463, + "step": 1688 + }, + { + "epoch": 3.567845828933474, + "grad_norm": 0.1266079659017047, + "learning_rate": 1.8320201059910902e-05, + "loss": 1.225, + "step": 1689 + }, + { + "epoch": 3.569957761351637, + "grad_norm": 0.11738416372689545, + "learning_rate": 1.8270598066955392e-05, + "loss": 1.2303, + "step": 1690 + }, + { + "epoch": 3.5720696937697993, + "grad_norm": 0.11977839410225624, + "learning_rate": 1.822104243318186e-05, + "loss": 1.2232, + "step": 1691 + }, + { + "epoch": 3.574181626187962, + "grad_norm": 0.11577215211395513, + "learning_rate": 1.8171534266596742e-05, + "loss": 1.2228, + "step": 1692 + }, + { + "epoch": 3.5762935586061246, + "grad_norm": 0.11550886310223042, + "learning_rate": 1.8122073675102935e-05, + "loss": 1.2273, + "step": 1693 + }, + { + "epoch": 3.578405491024287, + "grad_norm": 0.11232443064976404, + "learning_rate": 1.8072660766499695e-05, + "loss": 1.2205, + "step": 1694 + }, + { + "epoch": 3.58051742344245, + "grad_norm": 0.13775536110743047, + "learning_rate": 1.8023295648482343e-05, + "loss": 1.2258, + "step": 1695 + }, + { + "epoch": 3.5826293558606124, + "grad_norm": 0.1321798502812584, + "learning_rate": 1.797397842864203e-05, + "loss": 1.2351, + "step": 1696 + }, + { + "epoch": 3.5847412882787753, + "grad_norm": 0.12439267339468671, + "learning_rate": 1.7924709214465574e-05, + "loss": 1.2226, + "step": 1697 + }, + { + "epoch": 3.5868532206969377, + "grad_norm": 0.14469249161174313, + "learning_rate": 1.7875488113335044e-05, + "loss": 1.2264, + "step": 1698 + }, + { + "epoch": 3.5889651531151, + "grad_norm": 0.12292766290698051, + "learning_rate": 1.782631523252778e-05, + "loss": 1.2254, + "step": 1699 + }, + { + "epoch": 3.591077085533263, + "grad_norm": 0.11272519224028925, + "learning_rate": 1.7777190679215923e-05, + "loss": 1.2227, + "step": 1700 + }, + { + "epoch": 3.5931890179514254, + "grad_norm": 0.11160956458153243, + "learning_rate": 1.7728114560466325e-05, + "loss": 1.2346, + "step": 1701 + }, + { + "epoch": 3.5953009503695883, + "grad_norm": 0.12359109358639414, + "learning_rate": 1.7679086983240314e-05, + "loss": 1.2291, + "step": 1702 + }, + { + "epoch": 3.5974128827877507, + "grad_norm": 0.10507148379563606, + "learning_rate": 1.763010805439333e-05, + "loss": 1.2174, + "step": 1703 + }, + { + "epoch": 3.599524815205913, + "grad_norm": 0.1320977180186451, + "learning_rate": 1.7581177880674894e-05, + "loss": 1.211, + "step": 1704 + }, + { + "epoch": 3.601636747624076, + "grad_norm": 0.13214756182185777, + "learning_rate": 1.753229656872815e-05, + "loss": 1.2048, + "step": 1705 + }, + { + "epoch": 3.603748680042239, + "grad_norm": 0.21152231266368549, + "learning_rate": 1.748346422508985e-05, + "loss": 1.2296, + "step": 1706 + }, + { + "epoch": 3.6058606124604013, + "grad_norm": 0.1759329682332239, + "learning_rate": 1.743468095618996e-05, + "loss": 1.2124, + "step": 1707 + }, + { + "epoch": 3.607972544878564, + "grad_norm": 0.15901554442482888, + "learning_rate": 1.7385946868351498e-05, + "loss": 1.2261, + "step": 1708 + }, + { + "epoch": 3.6100844772967267, + "grad_norm": 0.13979190277263234, + "learning_rate": 1.733726206779032e-05, + "loss": 1.226, + "step": 1709 + }, + { + "epoch": 3.612196409714889, + "grad_norm": 0.15153194889159077, + "learning_rate": 1.7288626660614792e-05, + "loss": 1.2257, + "step": 1710 + }, + { + "epoch": 3.614308342133052, + "grad_norm": 0.13787893348198105, + "learning_rate": 1.7240040752825716e-05, + "loss": 1.2527, + "step": 1711 + }, + { + "epoch": 3.6164202745512144, + "grad_norm": 0.1367399288874534, + "learning_rate": 1.7191504450315953e-05, + "loss": 1.2365, + "step": 1712 + }, + { + "epoch": 3.618532206969377, + "grad_norm": 0.11637969828585162, + "learning_rate": 1.714301785887026e-05, + "loss": 1.2117, + "step": 1713 + }, + { + "epoch": 3.6206441393875397, + "grad_norm": 0.11225380315986809, + "learning_rate": 1.7094581084165058e-05, + "loss": 1.2244, + "step": 1714 + }, + { + "epoch": 3.622756071805702, + "grad_norm": 0.18494938901502286, + "learning_rate": 1.7046194231768187e-05, + "loss": 1.1992, + "step": 1715 + }, + { + "epoch": 3.624868004223865, + "grad_norm": 0.12469567096071683, + "learning_rate": 1.699785740713868e-05, + "loss": 1.2134, + "step": 1716 + }, + { + "epoch": 3.6269799366420274, + "grad_norm": 0.27226011515649823, + "learning_rate": 1.6949570715626532e-05, + "loss": 1.2418, + "step": 1717 + }, + { + "epoch": 3.62909186906019, + "grad_norm": 0.11176280651105308, + "learning_rate": 1.6901334262472488e-05, + "loss": 1.2298, + "step": 1718 + }, + { + "epoch": 3.6312038014783528, + "grad_norm": 0.2222672149062154, + "learning_rate": 1.6853148152807774e-05, + "loss": 1.218, + "step": 1719 + }, + { + "epoch": 3.633315733896515, + "grad_norm": 0.11776776652423927, + "learning_rate": 1.6805012491653917e-05, + "loss": 1.2317, + "step": 1720 + }, + { + "epoch": 3.635427666314678, + "grad_norm": 0.17513402894855798, + "learning_rate": 1.6756927383922473e-05, + "loss": 1.2286, + "step": 1721 + }, + { + "epoch": 3.6375395987328405, + "grad_norm": 0.11312524580636318, + "learning_rate": 1.670889293441483e-05, + "loss": 1.2205, + "step": 1722 + }, + { + "epoch": 3.639651531151003, + "grad_norm": 0.17973252160302508, + "learning_rate": 1.666090924782196e-05, + "loss": 1.2304, + "step": 1723 + }, + { + "epoch": 3.641763463569166, + "grad_norm": 0.1124333149228933, + "learning_rate": 1.66129764287242e-05, + "loss": 1.201, + "step": 1724 + }, + { + "epoch": 3.6438753959873287, + "grad_norm": 0.1390623035283563, + "learning_rate": 1.6565094581591018e-05, + "loss": 1.2382, + "step": 1725 + }, + { + "epoch": 3.645987328405491, + "grad_norm": 0.11592351742514051, + "learning_rate": 1.6517263810780785e-05, + "loss": 1.2141, + "step": 1726 + }, + { + "epoch": 3.6480992608236535, + "grad_norm": 0.13433090540740272, + "learning_rate": 1.6469484220540567e-05, + "loss": 1.23, + "step": 1727 + }, + { + "epoch": 3.6502111932418164, + "grad_norm": 0.14106734117094666, + "learning_rate": 1.642175591500586e-05, + "loss": 1.2241, + "step": 1728 + }, + { + "epoch": 3.652323125659979, + "grad_norm": 0.11034787653998582, + "learning_rate": 1.6374078998200427e-05, + "loss": 1.1982, + "step": 1729 + }, + { + "epoch": 3.6544350580781417, + "grad_norm": 1.2801929067789304, + "learning_rate": 1.632645357403596e-05, + "loss": 1.2346, + "step": 1730 + }, + { + "epoch": 3.656546990496304, + "grad_norm": 0.13633261576874528, + "learning_rate": 1.627887974631199e-05, + "loss": 1.2212, + "step": 1731 + }, + { + "epoch": 3.6586589229144666, + "grad_norm": 0.14625053379228856, + "learning_rate": 1.623135761871555e-05, + "loss": 1.2291, + "step": 1732 + }, + { + "epoch": 3.6607708553326295, + "grad_norm": 0.13181542754867015, + "learning_rate": 1.6183887294820995e-05, + "loss": 1.2198, + "step": 1733 + }, + { + "epoch": 3.662882787750792, + "grad_norm": 0.16445304636555066, + "learning_rate": 1.6136468878089843e-05, + "loss": 1.2195, + "step": 1734 + }, + { + "epoch": 3.664994720168955, + "grad_norm": 0.12044632574177874, + "learning_rate": 1.6089102471870366e-05, + "loss": 1.2301, + "step": 1735 + }, + { + "epoch": 3.667106652587117, + "grad_norm": 0.12814779025477496, + "learning_rate": 1.6041788179397584e-05, + "loss": 1.2347, + "step": 1736 + }, + { + "epoch": 3.6692185850052796, + "grad_norm": 0.1423845489530368, + "learning_rate": 1.5994526103792854e-05, + "loss": 1.2143, + "step": 1737 + }, + { + "epoch": 3.6713305174234425, + "grad_norm": 0.13480463043856625, + "learning_rate": 1.5947316348063764e-05, + "loss": 1.2254, + "step": 1738 + }, + { + "epoch": 3.673442449841605, + "grad_norm": 0.1208417589663892, + "learning_rate": 1.590015901510391e-05, + "loss": 1.2272, + "step": 1739 + }, + { + "epoch": 3.675554382259768, + "grad_norm": 0.12212272998517927, + "learning_rate": 1.5853054207692533e-05, + "loss": 1.2351, + "step": 1740 + }, + { + "epoch": 3.6776663146779303, + "grad_norm": 0.1164306152076413, + "learning_rate": 1.580600202849451e-05, + "loss": 1.2083, + "step": 1741 + }, + { + "epoch": 3.6797782470960927, + "grad_norm": 0.39851597882233836, + "learning_rate": 1.5759002580059907e-05, + "loss": 1.2287, + "step": 1742 + }, + { + "epoch": 3.6818901795142556, + "grad_norm": 0.11974666855699577, + "learning_rate": 1.5712055964823953e-05, + "loss": 1.2184, + "step": 1743 + }, + { + "epoch": 3.684002111932418, + "grad_norm": 0.12424072397223379, + "learning_rate": 1.5665162285106675e-05, + "loss": 1.2115, + "step": 1744 + }, + { + "epoch": 3.686114044350581, + "grad_norm": 0.1475245037477598, + "learning_rate": 1.561832164311274e-05, + "loss": 1.2429, + "step": 1745 + }, + { + "epoch": 3.6882259767687433, + "grad_norm": 0.12633022861987916, + "learning_rate": 1.5571534140931234e-05, + "loss": 1.23, + "step": 1746 + }, + { + "epoch": 3.690337909186906, + "grad_norm": 0.14562889882854824, + "learning_rate": 1.5524799880535372e-05, + "loss": 1.2331, + "step": 1747 + }, + { + "epoch": 3.6924498416050686, + "grad_norm": 0.11168492862058636, + "learning_rate": 1.5478118963782415e-05, + "loss": 1.2229, + "step": 1748 + }, + { + "epoch": 3.6945617740232315, + "grad_norm": 0.12697756532876417, + "learning_rate": 1.5431491492413288e-05, + "loss": 1.2038, + "step": 1749 + }, + { + "epoch": 3.696673706441394, + "grad_norm": 0.14852286780794552, + "learning_rate": 1.5384917568052467e-05, + "loss": 1.2283, + "step": 1750 + }, + { + "epoch": 3.6987856388595564, + "grad_norm": 0.14163778028524296, + "learning_rate": 1.5338397292207716e-05, + "loss": 1.2261, + "step": 1751 + }, + { + "epoch": 3.7008975712777192, + "grad_norm": 0.14656727173213085, + "learning_rate": 1.5291930766269864e-05, + "loss": 1.2161, + "step": 1752 + }, + { + "epoch": 3.7030095036958817, + "grad_norm": 0.12882197141583326, + "learning_rate": 1.52455180915126e-05, + "loss": 1.232, + "step": 1753 + }, + { + "epoch": 3.7051214361140445, + "grad_norm": 0.13104922482858164, + "learning_rate": 1.5199159369092247e-05, + "loss": 1.2275, + "step": 1754 + }, + { + "epoch": 3.707233368532207, + "grad_norm": 0.12562693047511275, + "learning_rate": 1.515285470004753e-05, + "loss": 1.2228, + "step": 1755 + }, + { + "epoch": 3.7093453009503694, + "grad_norm": 0.10933263341517407, + "learning_rate": 1.5106604185299372e-05, + "loss": 1.2255, + "step": 1756 + }, + { + "epoch": 3.7114572333685323, + "grad_norm": 0.6256315506357768, + "learning_rate": 1.5060407925650662e-05, + "loss": 1.2151, + "step": 1757 + }, + { + "epoch": 3.7135691657866947, + "grad_norm": 0.12457693280313233, + "learning_rate": 1.5014266021786048e-05, + "loss": 1.2234, + "step": 1758 + }, + { + "epoch": 3.7156810982048576, + "grad_norm": 0.15772963866343465, + "learning_rate": 1.4968178574271699e-05, + "loss": 1.2268, + "step": 1759 + }, + { + "epoch": 3.71779303062302, + "grad_norm": 0.1342166200356949, + "learning_rate": 1.4922145683555104e-05, + "loss": 1.2212, + "step": 1760 + }, + { + "epoch": 3.7199049630411825, + "grad_norm": 0.13046225172993142, + "learning_rate": 1.4876167449964842e-05, + "loss": 1.2369, + "step": 1761 + }, + { + "epoch": 3.7220168954593453, + "grad_norm": 0.11153260394167246, + "learning_rate": 1.483024397371037e-05, + "loss": 1.2273, + "step": 1762 + }, + { + "epoch": 3.7241288278775078, + "grad_norm": 0.12283813467124632, + "learning_rate": 1.47843753548818e-05, + "loss": 1.2257, + "step": 1763 + }, + { + "epoch": 3.7262407602956706, + "grad_norm": 0.11530918900175359, + "learning_rate": 1.4738561693449685e-05, + "loss": 1.2136, + "step": 1764 + }, + { + "epoch": 3.728352692713833, + "grad_norm": 0.12047718011185134, + "learning_rate": 1.4692803089264773e-05, + "loss": 1.2319, + "step": 1765 + }, + { + "epoch": 3.7304646251319955, + "grad_norm": 0.12309359384096176, + "learning_rate": 1.4647099642057887e-05, + "loss": 1.2224, + "step": 1766 + }, + { + "epoch": 3.7325765575501584, + "grad_norm": 0.11179659623958234, + "learning_rate": 1.4601451451439542e-05, + "loss": 1.2333, + "step": 1767 + }, + { + "epoch": 3.7346884899683213, + "grad_norm": 0.11513477659250682, + "learning_rate": 1.455585861689988e-05, + "loss": 1.2183, + "step": 1768 + }, + { + "epoch": 3.7368004223864837, + "grad_norm": 0.14898541580956406, + "learning_rate": 1.4510321237808377e-05, + "loss": 1.2283, + "step": 1769 + }, + { + "epoch": 3.738912354804646, + "grad_norm": 0.1225602641976818, + "learning_rate": 1.4464839413413638e-05, + "loss": 1.2208, + "step": 1770 + }, + { + "epoch": 3.741024287222809, + "grad_norm": 0.1240646238811233, + "learning_rate": 1.4419413242843235e-05, + "loss": 1.2256, + "step": 1771 + }, + { + "epoch": 3.7431362196409714, + "grad_norm": 0.12644863719884386, + "learning_rate": 1.4374042825103342e-05, + "loss": 1.2309, + "step": 1772 + }, + { + "epoch": 3.7452481520591343, + "grad_norm": 0.12420918038632946, + "learning_rate": 1.4328728259078748e-05, + "loss": 1.2263, + "step": 1773 + }, + { + "epoch": 3.7473600844772967, + "grad_norm": 0.13315519648146373, + "learning_rate": 1.4283469643532396e-05, + "loss": 1.2205, + "step": 1774 + }, + { + "epoch": 3.749472016895459, + "grad_norm": 0.25902183796146694, + "learning_rate": 1.4238267077105338e-05, + "loss": 1.2322, + "step": 1775 + }, + { + "epoch": 3.751583949313622, + "grad_norm": 0.1476564982833996, + "learning_rate": 1.4193120658316506e-05, + "loss": 1.2338, + "step": 1776 + }, + { + "epoch": 3.7536958817317845, + "grad_norm": 0.14580732078749295, + "learning_rate": 1.4148030485562362e-05, + "loss": 1.2312, + "step": 1777 + }, + { + "epoch": 3.7558078141499474, + "grad_norm": 0.1687337855237315, + "learning_rate": 1.4102996657116888e-05, + "loss": 1.2276, + "step": 1778 + }, + { + "epoch": 3.75791974656811, + "grad_norm": 0.1452893296457429, + "learning_rate": 1.405801927113116e-05, + "loss": 1.2305, + "step": 1779 + }, + { + "epoch": 3.760031678986272, + "grad_norm": 0.14622757444244833, + "learning_rate": 1.4013098425633325e-05, + "loss": 1.2349, + "step": 1780 + }, + { + "epoch": 3.762143611404435, + "grad_norm": 0.13796192757337403, + "learning_rate": 1.3968234218528252e-05, + "loss": 1.2185, + "step": 1781 + }, + { + "epoch": 3.7642555438225975, + "grad_norm": 0.11812932541349103, + "learning_rate": 1.3923426747597377e-05, + "loss": 1.222, + "step": 1782 + }, + { + "epoch": 3.7663674762407604, + "grad_norm": 0.14528927146713022, + "learning_rate": 1.3878676110498495e-05, + "loss": 1.223, + "step": 1783 + }, + { + "epoch": 3.768479408658923, + "grad_norm": 0.162574054064049, + "learning_rate": 1.3833982404765477e-05, + "loss": 1.21, + "step": 1784 + }, + { + "epoch": 3.7705913410770853, + "grad_norm": 0.11696939258112565, + "learning_rate": 1.3789345727808208e-05, + "loss": 1.2292, + "step": 1785 + }, + { + "epoch": 3.772703273495248, + "grad_norm": 0.11498385755451364, + "learning_rate": 1.3744766176912165e-05, + "loss": 1.2234, + "step": 1786 + }, + { + "epoch": 3.774815205913411, + "grad_norm": 0.11278219867468722, + "learning_rate": 1.3700243849238425e-05, + "loss": 1.236, + "step": 1787 + }, + { + "epoch": 3.7769271383315735, + "grad_norm": 0.13177902865453528, + "learning_rate": 1.365577884182328e-05, + "loss": 1.2286, + "step": 1788 + }, + { + "epoch": 3.779039070749736, + "grad_norm": 0.11312549185475144, + "learning_rate": 1.3611371251578117e-05, + "loss": 1.2177, + "step": 1789 + }, + { + "epoch": 3.7811510031678988, + "grad_norm": 0.1131064983366654, + "learning_rate": 1.3567021175289172e-05, + "loss": 1.2192, + "step": 1790 + }, + { + "epoch": 3.783262935586061, + "grad_norm": 0.11061885786865942, + "learning_rate": 1.352272870961734e-05, + "loss": 1.2282, + "step": 1791 + }, + { + "epoch": 3.785374868004224, + "grad_norm": 0.137101605638394, + "learning_rate": 1.3478493951097949e-05, + "loss": 1.2188, + "step": 1792 + }, + { + "epoch": 3.7874868004223865, + "grad_norm": 0.14944455935323167, + "learning_rate": 1.3434316996140555e-05, + "loss": 1.2154, + "step": 1793 + }, + { + "epoch": 3.789598732840549, + "grad_norm": 0.15922120561523048, + "learning_rate": 1.3390197941028725e-05, + "loss": 1.2411, + "step": 1794 + }, + { + "epoch": 3.791710665258712, + "grad_norm": 0.1444536733440371, + "learning_rate": 1.3346136881919845e-05, + "loss": 1.2288, + "step": 1795 + }, + { + "epoch": 3.7938225976768742, + "grad_norm": 0.12953734925880717, + "learning_rate": 1.330213391484489e-05, + "loss": 1.2222, + "step": 1796 + }, + { + "epoch": 3.795934530095037, + "grad_norm": 0.12022684641421882, + "learning_rate": 1.325818913570823e-05, + "loss": 1.2375, + "step": 1797 + }, + { + "epoch": 3.7980464625131996, + "grad_norm": 0.13240973822969507, + "learning_rate": 1.3214302640287406e-05, + "loss": 1.2443, + "step": 1798 + }, + { + "epoch": 3.800158394931362, + "grad_norm": 0.12522118897523832, + "learning_rate": 1.3170474524232937e-05, + "loss": 1.2191, + "step": 1799 + }, + { + "epoch": 3.802270327349525, + "grad_norm": 0.13315363676785721, + "learning_rate": 1.3126704883068104e-05, + "loss": 1.2269, + "step": 1800 + }, + { + "epoch": 3.8043822597676873, + "grad_norm": 0.13902121489439948, + "learning_rate": 1.3082993812188738e-05, + "loss": 1.2187, + "step": 1801 + }, + { + "epoch": 3.80649419218585, + "grad_norm": 0.1172454648130066, + "learning_rate": 1.3039341406863004e-05, + "loss": 1.2173, + "step": 1802 + }, + { + "epoch": 3.8086061246040126, + "grad_norm": 0.14556478998858832, + "learning_rate": 1.2995747762231261e-05, + "loss": 1.211, + "step": 1803 + }, + { + "epoch": 3.810718057022175, + "grad_norm": 0.13697179130847892, + "learning_rate": 1.295221297330572e-05, + "loss": 1.2252, + "step": 1804 + }, + { + "epoch": 3.812829989440338, + "grad_norm": 0.14674491033800083, + "learning_rate": 1.2908737134970367e-05, + "loss": 1.2216, + "step": 1805 + }, + { + "epoch": 3.814941921858501, + "grad_norm": 0.09641349899662074, + "learning_rate": 1.2865320341980687e-05, + "loss": 1.2315, + "step": 1806 + }, + { + "epoch": 3.817053854276663, + "grad_norm": 0.1168094527744162, + "learning_rate": 1.2821962688963473e-05, + "loss": 1.2229, + "step": 1807 + }, + { + "epoch": 3.8191657866948256, + "grad_norm": 0.10691427542396115, + "learning_rate": 1.2778664270416674e-05, + "loss": 1.2049, + "step": 1808 + }, + { + "epoch": 3.8212777191129885, + "grad_norm": 0.14616766818057925, + "learning_rate": 1.273542518070904e-05, + "loss": 1.2286, + "step": 1809 + }, + { + "epoch": 3.823389651531151, + "grad_norm": 0.20432544339130432, + "learning_rate": 1.2692245514080113e-05, + "loss": 1.2337, + "step": 1810 + }, + { + "epoch": 3.825501583949314, + "grad_norm": 0.11467354460194057, + "learning_rate": 1.2649125364639851e-05, + "loss": 1.2315, + "step": 1811 + }, + { + "epoch": 3.8276135163674763, + "grad_norm": 0.11706145039746803, + "learning_rate": 1.2606064826368512e-05, + "loss": 1.2156, + "step": 1812 + }, + { + "epoch": 3.8297254487856387, + "grad_norm": 0.133346261549998, + "learning_rate": 1.2563063993116482e-05, + "loss": 1.2262, + "step": 1813 + }, + { + "epoch": 3.8318373812038016, + "grad_norm": 0.11761019065261703, + "learning_rate": 1.2520122958603933e-05, + "loss": 1.2254, + "step": 1814 + }, + { + "epoch": 3.833949313621964, + "grad_norm": 0.12687974722050688, + "learning_rate": 1.2477241816420796e-05, + "loss": 1.2152, + "step": 1815 + }, + { + "epoch": 3.836061246040127, + "grad_norm": 0.11008667723743693, + "learning_rate": 1.2434420660026376e-05, + "loss": 1.2362, + "step": 1816 + }, + { + "epoch": 3.8381731784582893, + "grad_norm": 0.11293482363266455, + "learning_rate": 1.2391659582749332e-05, + "loss": 1.2335, + "step": 1817 + }, + { + "epoch": 3.8402851108764517, + "grad_norm": 0.14034570720183345, + "learning_rate": 1.2348958677787292e-05, + "loss": 1.2145, + "step": 1818 + }, + { + "epoch": 3.8423970432946146, + "grad_norm": 0.12509848741870538, + "learning_rate": 1.230631803820681e-05, + "loss": 1.2246, + "step": 1819 + }, + { + "epoch": 3.844508975712777, + "grad_norm": 0.1313171123765944, + "learning_rate": 1.2263737756943069e-05, + "loss": 1.2327, + "step": 1820 + }, + { + "epoch": 3.84662090813094, + "grad_norm": 2.4938226564037613, + "learning_rate": 1.2221217926799653e-05, + "loss": 1.2344, + "step": 1821 + }, + { + "epoch": 3.8487328405491024, + "grad_norm": 0.15438298129503883, + "learning_rate": 1.2178758640448489e-05, + "loss": 1.2291, + "step": 1822 + }, + { + "epoch": 3.850844772967265, + "grad_norm": 0.09933606201985347, + "learning_rate": 1.2136359990429444e-05, + "loss": 1.2164, + "step": 1823 + }, + { + "epoch": 3.8529567053854277, + "grad_norm": 0.12259646420137935, + "learning_rate": 1.2094022069150313e-05, + "loss": 1.2238, + "step": 1824 + }, + { + "epoch": 3.8550686378035905, + "grad_norm": 0.13137050302866068, + "learning_rate": 1.205174496888649e-05, + "loss": 1.2292, + "step": 1825 + }, + { + "epoch": 3.857180570221753, + "grad_norm": 0.1468083681645044, + "learning_rate": 1.2009528781780815e-05, + "loss": 1.2184, + "step": 1826 + }, + { + "epoch": 3.8592925026399154, + "grad_norm": 0.11260662628603564, + "learning_rate": 1.1967373599843391e-05, + "loss": 1.2185, + "step": 1827 + }, + { + "epoch": 3.8614044350580783, + "grad_norm": 0.12554006002149273, + "learning_rate": 1.1925279514951295e-05, + "loss": 1.221, + "step": 1828 + }, + { + "epoch": 3.8635163674762407, + "grad_norm": 0.11345629662058655, + "learning_rate": 1.1883246618848533e-05, + "loss": 1.2277, + "step": 1829 + }, + { + "epoch": 3.8656282998944036, + "grad_norm": 0.11936917395870872, + "learning_rate": 1.1841275003145695e-05, + "loss": 1.2192, + "step": 1830 + }, + { + "epoch": 3.867740232312566, + "grad_norm": 0.14610900571979674, + "learning_rate": 1.1799364759319817e-05, + "loss": 1.2284, + "step": 1831 + }, + { + "epoch": 3.8698521647307285, + "grad_norm": 0.1111815306488436, + "learning_rate": 1.1757515978714178e-05, + "loss": 1.2246, + "step": 1832 + }, + { + "epoch": 3.8719640971488913, + "grad_norm": 0.12201635045747661, + "learning_rate": 1.1715728752538103e-05, + "loss": 1.2239, + "step": 1833 + }, + { + "epoch": 3.8740760295670538, + "grad_norm": 0.09692480422248992, + "learning_rate": 1.167400317186676e-05, + "loss": 1.2151, + "step": 1834 + }, + { + "epoch": 3.8761879619852166, + "grad_norm": 0.14784058657538943, + "learning_rate": 1.1632339327640949e-05, + "loss": 1.2132, + "step": 1835 + }, + { + "epoch": 3.878299894403379, + "grad_norm": 0.1292822103839259, + "learning_rate": 1.1590737310666928e-05, + "loss": 1.2197, + "step": 1836 + }, + { + "epoch": 3.8804118268215415, + "grad_norm": 0.12744458274918927, + "learning_rate": 1.1549197211616203e-05, + "loss": 1.236, + "step": 1837 + }, + { + "epoch": 3.8825237592397044, + "grad_norm": 0.14276297337070334, + "learning_rate": 1.1507719121025316e-05, + "loss": 1.2344, + "step": 1838 + }, + { + "epoch": 3.884635691657867, + "grad_norm": 0.11450951140665319, + "learning_rate": 1.1466303129295663e-05, + "loss": 1.2237, + "step": 1839 + }, + { + "epoch": 3.8867476240760297, + "grad_norm": 0.14171153892275165, + "learning_rate": 1.1424949326693344e-05, + "loss": 1.2262, + "step": 1840 + }, + { + "epoch": 3.888859556494192, + "grad_norm": 0.12060730113708514, + "learning_rate": 1.1383657803348837e-05, + "loss": 1.2366, + "step": 1841 + }, + { + "epoch": 3.8909714889123546, + "grad_norm": 0.11710243194399327, + "learning_rate": 1.1342428649256934e-05, + "loss": 1.2169, + "step": 1842 + }, + { + "epoch": 3.8930834213305174, + "grad_norm": 0.1316171713149923, + "learning_rate": 1.1301261954276481e-05, + "loss": 1.2135, + "step": 1843 + }, + { + "epoch": 3.8951953537486803, + "grad_norm": 0.12127429907249662, + "learning_rate": 1.126015780813019e-05, + "loss": 1.2165, + "step": 1844 + }, + { + "epoch": 3.8973072861668427, + "grad_norm": 0.14907812448892813, + "learning_rate": 1.1219116300404486e-05, + "loss": 1.2078, + "step": 1845 + }, + { + "epoch": 3.899419218585005, + "grad_norm": 0.11951007311530615, + "learning_rate": 1.1178137520549197e-05, + "loss": 1.2263, + "step": 1846 + }, + { + "epoch": 3.901531151003168, + "grad_norm": 0.13568670261160964, + "learning_rate": 1.1137221557877522e-05, + "loss": 1.2245, + "step": 1847 + }, + { + "epoch": 3.9036430834213305, + "grad_norm": 0.1570379810106032, + "learning_rate": 1.1096368501565676e-05, + "loss": 1.2296, + "step": 1848 + }, + { + "epoch": 3.9057550158394934, + "grad_norm": 0.1353985500062486, + "learning_rate": 1.1055578440652802e-05, + "loss": 1.2211, + "step": 1849 + }, + { + "epoch": 3.907866948257656, + "grad_norm": 0.16254710351397286, + "learning_rate": 1.101485146404078e-05, + "loss": 1.2206, + "step": 1850 + }, + { + "epoch": 3.9099788806758182, + "grad_norm": 0.11621394304776594, + "learning_rate": 1.0974187660493917e-05, + "loss": 1.2331, + "step": 1851 + }, + { + "epoch": 3.912090813093981, + "grad_norm": 0.13652073142880686, + "learning_rate": 1.0933587118638927e-05, + "loss": 1.2233, + "step": 1852 + }, + { + "epoch": 3.9142027455121435, + "grad_norm": 0.15494837577158965, + "learning_rate": 1.0893049926964553e-05, + "loss": 1.2104, + "step": 1853 + }, + { + "epoch": 3.9163146779303064, + "grad_norm": 0.11726654865745481, + "learning_rate": 1.0852576173821552e-05, + "loss": 1.2267, + "step": 1854 + }, + { + "epoch": 3.918426610348469, + "grad_norm": 0.16710284957095667, + "learning_rate": 1.0812165947422332e-05, + "loss": 1.218, + "step": 1855 + }, + { + "epoch": 3.9205385427666313, + "grad_norm": 0.12683184207753298, + "learning_rate": 1.0771819335840931e-05, + "loss": 1.2263, + "step": 1856 + }, + { + "epoch": 3.922650475184794, + "grad_norm": 0.14945956171660615, + "learning_rate": 1.0731536427012696e-05, + "loss": 1.2224, + "step": 1857 + }, + { + "epoch": 3.9247624076029566, + "grad_norm": 0.14217017936826068, + "learning_rate": 1.069131730873409e-05, + "loss": 1.2196, + "step": 1858 + }, + { + "epoch": 3.9268743400211195, + "grad_norm": 0.11275620536046815, + "learning_rate": 1.0651162068662649e-05, + "loss": 1.2272, + "step": 1859 + }, + { + "epoch": 3.928986272439282, + "grad_norm": 0.1416872525616463, + "learning_rate": 1.0611070794316571e-05, + "loss": 1.2192, + "step": 1860 + }, + { + "epoch": 3.9310982048574443, + "grad_norm": 0.10875968303914332, + "learning_rate": 1.0571043573074737e-05, + "loss": 1.2183, + "step": 1861 + }, + { + "epoch": 3.933210137275607, + "grad_norm": 0.14144094905456378, + "learning_rate": 1.0531080492176372e-05, + "loss": 1.2073, + "step": 1862 + }, + { + "epoch": 3.93532206969377, + "grad_norm": 0.12975472004096641, + "learning_rate": 1.0491181638720915e-05, + "loss": 1.2148, + "step": 1863 + }, + { + "epoch": 3.9374340021119325, + "grad_norm": 0.1179062707848006, + "learning_rate": 1.0451347099667846e-05, + "loss": 1.2198, + "step": 1864 + }, + { + "epoch": 3.939545934530095, + "grad_norm": 0.1186424709051365, + "learning_rate": 1.0411576961836411e-05, + "loss": 1.2224, + "step": 1865 + }, + { + "epoch": 3.941657866948258, + "grad_norm": 0.09773054035950467, + "learning_rate": 1.0371871311905575e-05, + "loss": 1.2252, + "step": 1866 + }, + { + "epoch": 3.9437697993664202, + "grad_norm": 0.11538397290204694, + "learning_rate": 1.0332230236413693e-05, + "loss": 1.2036, + "step": 1867 + }, + { + "epoch": 3.945881731784583, + "grad_norm": 0.11516128715709993, + "learning_rate": 1.0292653821758404e-05, + "loss": 1.2265, + "step": 1868 + }, + { + "epoch": 3.9479936642027456, + "grad_norm": 0.10740845525305084, + "learning_rate": 1.0253142154196415e-05, + "loss": 1.2363, + "step": 1869 + }, + { + "epoch": 3.950105596620908, + "grad_norm": 0.12402353409696182, + "learning_rate": 1.0213695319843313e-05, + "loss": 1.2182, + "step": 1870 + }, + { + "epoch": 3.952217529039071, + "grad_norm": 0.10923671599616863, + "learning_rate": 1.0174313404673378e-05, + "loss": 1.2231, + "step": 1871 + }, + { + "epoch": 3.9543294614572333, + "grad_norm": 0.17723771141038344, + "learning_rate": 1.0134996494519411e-05, + "loss": 1.2204, + "step": 1872 + }, + { + "epoch": 3.956441393875396, + "grad_norm": 0.12841976917720593, + "learning_rate": 1.0095744675072527e-05, + "loss": 1.2044, + "step": 1873 + }, + { + "epoch": 3.9585533262935586, + "grad_norm": 0.13894375821339738, + "learning_rate": 1.0056558031881978e-05, + "loss": 1.2305, + "step": 1874 + }, + { + "epoch": 3.960665258711721, + "grad_norm": 0.13021924275365993, + "learning_rate": 1.0017436650354963e-05, + "loss": 1.2216, + "step": 1875 + }, + { + "epoch": 3.962777191129884, + "grad_norm": 0.10795388315408286, + "learning_rate": 9.97838061575644e-06, + "loss": 1.2241, + "step": 1876 + }, + { + "epoch": 3.9648891235480463, + "grad_norm": 0.17585323183748416, + "learning_rate": 9.939390013208951e-06, + "loss": 1.2129, + "step": 1877 + }, + { + "epoch": 3.967001055966209, + "grad_norm": 0.10266024206939811, + "learning_rate": 9.900464927692428e-06, + "loss": 1.2196, + "step": 1878 + }, + { + "epoch": 3.9691129883843717, + "grad_norm": 0.13146544167464178, + "learning_rate": 9.861605444043998e-06, + "loss": 1.2455, + "step": 1879 + }, + { + "epoch": 3.971224920802534, + "grad_norm": 0.11585206365958056, + "learning_rate": 9.822811646957828e-06, + "loss": 1.2041, + "step": 1880 + }, + { + "epoch": 3.973336853220697, + "grad_norm": 0.09804860049431775, + "learning_rate": 9.784083620984885e-06, + "loss": 1.2247, + "step": 1881 + }, + { + "epoch": 3.9754487856388594, + "grad_norm": 0.11979818415176877, + "learning_rate": 9.745421450532863e-06, + "loss": 1.2182, + "step": 1882 + }, + { + "epoch": 3.9775607180570223, + "grad_norm": 0.12287647666934917, + "learning_rate": 9.706825219865816e-06, + "loss": 1.2165, + "step": 1883 + }, + { + "epoch": 3.9796726504751847, + "grad_norm": 0.12218026691004391, + "learning_rate": 9.668295013104196e-06, + "loss": 1.2288, + "step": 1884 + }, + { + "epoch": 3.9817845828933476, + "grad_norm": 0.12036241735881349, + "learning_rate": 9.62983091422446e-06, + "loss": 1.2302, + "step": 1885 + }, + { + "epoch": 3.98389651531151, + "grad_norm": 0.10540892897298898, + "learning_rate": 9.591433007059038e-06, + "loss": 1.2093, + "step": 1886 + }, + { + "epoch": 3.986008447729673, + "grad_norm": 0.11056777907715418, + "learning_rate": 9.553101375296082e-06, + "loss": 1.2347, + "step": 1887 + }, + { + "epoch": 3.9881203801478353, + "grad_norm": 0.10054026190584293, + "learning_rate": 9.514836102479283e-06, + "loss": 1.2188, + "step": 1888 + }, + { + "epoch": 3.9902323125659978, + "grad_norm": 0.13844060704039668, + "learning_rate": 9.476637272007748e-06, + "loss": 1.2291, + "step": 1889 + }, + { + "epoch": 3.9923442449841606, + "grad_norm": 0.12651796639245633, + "learning_rate": 9.438504967135703e-06, + "loss": 1.2262, + "step": 1890 + }, + { + "epoch": 3.994456177402323, + "grad_norm": 0.10743833929963244, + "learning_rate": 9.400439270972454e-06, + "loss": 1.2321, + "step": 1891 + }, + { + "epoch": 3.996568109820486, + "grad_norm": 0.10654730138147309, + "learning_rate": 9.362440266482062e-06, + "loss": 1.2319, + "step": 1892 + }, + { + "epoch": 3.9986800422386484, + "grad_norm": 0.12517526665138262, + "learning_rate": 9.324508036483304e-06, + "loss": 1.2049, + "step": 1893 + }, + { + "epoch": 4.001055966209082, + "grad_norm": 0.10958861018984471, + "learning_rate": 9.286642663649386e-06, + "loss": 1.2207, + "step": 1894 + }, + { + "epoch": 4.003167898627244, + "grad_norm": 0.11413855609702508, + "learning_rate": 9.248844230507767e-06, + "loss": 1.2219, + "step": 1895 + }, + { + "epoch": 4.0052798310454065, + "grad_norm": 0.11256477985331834, + "learning_rate": 9.211112819440099e-06, + "loss": 1.1986, + "step": 1896 + }, + { + "epoch": 4.007391763463569, + "grad_norm": 0.12920129870598107, + "learning_rate": 9.173448512681849e-06, + "loss": 1.2118, + "step": 1897 + }, + { + "epoch": 4.009503695881731, + "grad_norm": 0.0972772977645975, + "learning_rate": 9.135851392322328e-06, + "loss": 1.209, + "step": 1898 + }, + { + "epoch": 4.011615628299895, + "grad_norm": 0.1525734838735639, + "learning_rate": 9.098321540304366e-06, + "loss": 1.2141, + "step": 1899 + }, + { + "epoch": 4.013727560718057, + "grad_norm": 0.1270574289703496, + "learning_rate": 9.060859038424187e-06, + "loss": 1.2186, + "step": 1900 + }, + { + "epoch": 4.01583949313622, + "grad_norm": 0.13092409718643924, + "learning_rate": 9.023463968331238e-06, + "loss": 1.1952, + "step": 1901 + }, + { + "epoch": 4.017951425554382, + "grad_norm": 0.12110964127402522, + "learning_rate": 8.98613641152796e-06, + "loss": 1.2003, + "step": 1902 + }, + { + "epoch": 4.020063357972544, + "grad_norm": 0.10520500997736362, + "learning_rate": 8.948876449369704e-06, + "loss": 1.2186, + "step": 1903 + }, + { + "epoch": 4.022175290390708, + "grad_norm": 0.13834489970261835, + "learning_rate": 8.911684163064472e-06, + "loss": 1.2191, + "step": 1904 + }, + { + "epoch": 4.02428722280887, + "grad_norm": 0.14412522548225012, + "learning_rate": 8.874559633672755e-06, + "loss": 1.207, + "step": 1905 + }, + { + "epoch": 4.026399155227033, + "grad_norm": 0.13084882957345403, + "learning_rate": 8.83750294210739e-06, + "loss": 1.2124, + "step": 1906 + }, + { + "epoch": 4.028511087645195, + "grad_norm": 0.13377046608084106, + "learning_rate": 8.800514169133345e-06, + "loss": 1.2197, + "step": 1907 + }, + { + "epoch": 4.030623020063358, + "grad_norm": 0.11523592626582022, + "learning_rate": 8.763593395367573e-06, + "loss": 1.2074, + "step": 1908 + }, + { + "epoch": 4.032734952481521, + "grad_norm": 0.12387265241655684, + "learning_rate": 8.72674070127881e-06, + "loss": 1.2086, + "step": 1909 + }, + { + "epoch": 4.034846884899683, + "grad_norm": 0.12518447037638142, + "learning_rate": 8.689956167187424e-06, + "loss": 1.2179, + "step": 1910 + }, + { + "epoch": 4.036958817317846, + "grad_norm": 0.12482338976515951, + "learning_rate": 8.65323987326522e-06, + "loss": 1.1932, + "step": 1911 + }, + { + "epoch": 4.039070749736008, + "grad_norm": 0.12287255188998382, + "learning_rate": 8.61659189953528e-06, + "loss": 1.2031, + "step": 1912 + }, + { + "epoch": 4.041182682154171, + "grad_norm": 0.13511681912077375, + "learning_rate": 8.580012325871775e-06, + "loss": 1.2007, + "step": 1913 + }, + { + "epoch": 4.043294614572334, + "grad_norm": 0.10830632080028853, + "learning_rate": 8.543501231999802e-06, + "loss": 1.1821, + "step": 1914 + }, + { + "epoch": 4.045406546990496, + "grad_norm": 0.11776726520175188, + "learning_rate": 8.507058697495213e-06, + "loss": 1.2149, + "step": 1915 + }, + { + "epoch": 4.047518479408659, + "grad_norm": 0.12213004826860518, + "learning_rate": 8.470684801784425e-06, + "loss": 1.195, + "step": 1916 + }, + { + "epoch": 4.049630411826821, + "grad_norm": 0.11840461024922444, + "learning_rate": 8.434379624144261e-06, + "loss": 1.2145, + "step": 1917 + }, + { + "epoch": 4.0517423442449845, + "grad_norm": 0.11561930316988513, + "learning_rate": 8.398143243701775e-06, + "loss": 1.2164, + "step": 1918 + }, + { + "epoch": 4.053854276663147, + "grad_norm": 0.12397658831475697, + "learning_rate": 8.36197573943407e-06, + "loss": 1.226, + "step": 1919 + }, + { + "epoch": 4.055966209081309, + "grad_norm": 0.1546974233174775, + "learning_rate": 8.325877190168134e-06, + "loss": 1.2197, + "step": 1920 + }, + { + "epoch": 4.058078141499472, + "grad_norm": 0.1125016149349428, + "learning_rate": 8.289847674580702e-06, + "loss": 1.2122, + "step": 1921 + }, + { + "epoch": 4.060190073917634, + "grad_norm": 0.16352591608664385, + "learning_rate": 8.25388727119799e-06, + "loss": 1.2107, + "step": 1922 + }, + { + "epoch": 4.0623020063357975, + "grad_norm": 0.13335266234499343, + "learning_rate": 8.21799605839563e-06, + "loss": 1.1986, + "step": 1923 + }, + { + "epoch": 4.06441393875396, + "grad_norm": 0.12206248963538087, + "learning_rate": 8.182174114398428e-06, + "loss": 1.2009, + "step": 1924 + }, + { + "epoch": 4.066525871172122, + "grad_norm": 0.14853663353340854, + "learning_rate": 8.146421517280227e-06, + "loss": 1.2175, + "step": 1925 + }, + { + "epoch": 4.068637803590285, + "grad_norm": 0.11622708069857082, + "learning_rate": 8.110738344963764e-06, + "loss": 1.2126, + "step": 1926 + }, + { + "epoch": 4.070749736008448, + "grad_norm": 0.13704111761794935, + "learning_rate": 8.075124675220385e-06, + "loss": 1.1991, + "step": 1927 + }, + { + "epoch": 4.072861668426611, + "grad_norm": 0.09973373955846523, + "learning_rate": 8.039580585670047e-06, + "loss": 1.1962, + "step": 1928 + }, + { + "epoch": 4.074973600844773, + "grad_norm": 0.1495591942178279, + "learning_rate": 8.004106153780968e-06, + "loss": 1.2014, + "step": 1929 + }, + { + "epoch": 4.077085533262935, + "grad_norm": 0.11260028145701888, + "learning_rate": 7.968701456869622e-06, + "loss": 1.2028, + "step": 1930 + }, + { + "epoch": 4.079197465681098, + "grad_norm": 0.1292655891385571, + "learning_rate": 7.933366572100469e-06, + "loss": 1.2035, + "step": 1931 + }, + { + "epoch": 4.081309398099261, + "grad_norm": 0.12365299366866774, + "learning_rate": 7.898101576485775e-06, + "loss": 1.2059, + "step": 1932 + }, + { + "epoch": 4.083421330517424, + "grad_norm": 0.10637871036516586, + "learning_rate": 7.86290654688556e-06, + "loss": 1.2047, + "step": 1933 + }, + { + "epoch": 4.085533262935586, + "grad_norm": 0.1687156272461975, + "learning_rate": 7.82778156000727e-06, + "loss": 1.2093, + "step": 1934 + }, + { + "epoch": 4.0876451953537485, + "grad_norm": 0.11286371911916185, + "learning_rate": 7.792726692405765e-06, + "loss": 1.2194, + "step": 1935 + }, + { + "epoch": 4.089757127771911, + "grad_norm": 0.11394363961646653, + "learning_rate": 7.75774202048304e-06, + "loss": 1.219, + "step": 1936 + }, + { + "epoch": 4.091869060190074, + "grad_norm": 0.11641596069842468, + "learning_rate": 7.72282762048811e-06, + "loss": 1.2121, + "step": 1937 + }, + { + "epoch": 4.093980992608237, + "grad_norm": 0.11114784051562351, + "learning_rate": 7.687983568516832e-06, + "loss": 1.204, + "step": 1938 + }, + { + "epoch": 4.096092925026399, + "grad_norm": 0.1459453245307813, + "learning_rate": 7.653209940511713e-06, + "loss": 1.2074, + "step": 1939 + }, + { + "epoch": 4.0982048574445615, + "grad_norm": 0.09855016745525019, + "learning_rate": 7.618506812261821e-06, + "loss": 1.2009, + "step": 1940 + }, + { + "epoch": 4.100316789862724, + "grad_norm": 0.10281546132003505, + "learning_rate": 7.583874259402546e-06, + "loss": 1.2155, + "step": 1941 + }, + { + "epoch": 4.102428722280887, + "grad_norm": 0.10643739079113741, + "learning_rate": 7.549312357415451e-06, + "loss": 1.207, + "step": 1942 + }, + { + "epoch": 4.10454065469905, + "grad_norm": 0.11271411492116025, + "learning_rate": 7.514821181628118e-06, + "loss": 1.2143, + "step": 1943 + }, + { + "epoch": 4.106652587117212, + "grad_norm": 0.10867153863307573, + "learning_rate": 7.480400807213994e-06, + "loss": 1.2201, + "step": 1944 + }, + { + "epoch": 4.108764519535375, + "grad_norm": 0.09697980804783232, + "learning_rate": 7.446051309192204e-06, + "loss": 1.2075, + "step": 1945 + }, + { + "epoch": 4.110876451953538, + "grad_norm": 0.09901842616782294, + "learning_rate": 7.4117727624273984e-06, + "loss": 1.1894, + "step": 1946 + }, + { + "epoch": 4.1129883843717, + "grad_norm": 0.09822999761656238, + "learning_rate": 7.3775652416295936e-06, + "loss": 1.2272, + "step": 1947 + }, + { + "epoch": 4.115100316789863, + "grad_norm": 0.09563186687350364, + "learning_rate": 7.3434288213540065e-06, + "loss": 1.209, + "step": 1948 + }, + { + "epoch": 4.117212249208025, + "grad_norm": 0.0964187003549706, + "learning_rate": 7.309363576000881e-06, + "loss": 1.228, + "step": 1949 + }, + { + "epoch": 4.119324181626188, + "grad_norm": 0.09458559758176459, + "learning_rate": 7.275369579815348e-06, + "loss": 1.2094, + "step": 1950 + }, + { + "epoch": 4.121436114044351, + "grad_norm": 0.09471895334391972, + "learning_rate": 7.241446906887239e-06, + "loss": 1.2206, + "step": 1951 + }, + { + "epoch": 4.123548046462513, + "grad_norm": 0.22593562738668418, + "learning_rate": 7.207595631150947e-06, + "loss": 1.2181, + "step": 1952 + }, + { + "epoch": 4.125659978880676, + "grad_norm": 0.2611437008249475, + "learning_rate": 7.173815826385246e-06, + "loss": 1.2085, + "step": 1953 + }, + { + "epoch": 4.127771911298838, + "grad_norm": 0.10151993291911131, + "learning_rate": 7.140107566213146e-06, + "loss": 1.2244, + "step": 1954 + }, + { + "epoch": 4.129883843717001, + "grad_norm": 0.12036527695977842, + "learning_rate": 7.106470924101722e-06, + "loss": 1.2047, + "step": 1955 + }, + { + "epoch": 4.131995776135164, + "grad_norm": 0.09083718298179554, + "learning_rate": 7.072905973361961e-06, + "loss": 1.2046, + "step": 1956 + }, + { + "epoch": 4.134107708553326, + "grad_norm": 0.09829821611209384, + "learning_rate": 7.039412787148587e-06, + "loss": 1.2131, + "step": 1957 + }, + { + "epoch": 4.136219640971489, + "grad_norm": 0.09799349973145999, + "learning_rate": 7.005991438459947e-06, + "loss": 1.2197, + "step": 1958 + }, + { + "epoch": 4.138331573389651, + "grad_norm": 0.09290597943145522, + "learning_rate": 6.972642000137772e-06, + "loss": 1.2172, + "step": 1959 + }, + { + "epoch": 4.140443505807814, + "grad_norm": 0.10608390889505304, + "learning_rate": 6.939364544867095e-06, + "loss": 1.1925, + "step": 1960 + }, + { + "epoch": 4.142555438225977, + "grad_norm": 0.10819847892591801, + "learning_rate": 6.90615914517605e-06, + "loss": 1.2113, + "step": 1961 + }, + { + "epoch": 4.1446673706441395, + "grad_norm": 0.10289456699831477, + "learning_rate": 6.873025873435719e-06, + "loss": 1.2009, + "step": 1962 + }, + { + "epoch": 4.146779303062302, + "grad_norm": 0.1702864863134587, + "learning_rate": 6.839964801860031e-06, + "loss": 1.2136, + "step": 1963 + }, + { + "epoch": 4.148891235480464, + "grad_norm": 0.15682514627664665, + "learning_rate": 6.80697600250547e-06, + "loss": 1.2035, + "step": 1964 + }, + { + "epoch": 4.151003167898628, + "grad_norm": 0.10211290667827794, + "learning_rate": 6.774059547271088e-06, + "loss": 1.2067, + "step": 1965 + }, + { + "epoch": 4.15311510031679, + "grad_norm": 0.10574089434803488, + "learning_rate": 6.7412155078981865e-06, + "loss": 1.1933, + "step": 1966 + }, + { + "epoch": 4.1552270327349525, + "grad_norm": 0.10386045029196173, + "learning_rate": 6.708443955970305e-06, + "loss": 1.2151, + "step": 1967 + }, + { + "epoch": 4.157338965153115, + "grad_norm": 0.1206604331133922, + "learning_rate": 6.6757449629129666e-06, + "loss": 1.2146, + "step": 1968 + }, + { + "epoch": 4.159450897571277, + "grad_norm": 0.13576892979258437, + "learning_rate": 6.643118599993519e-06, + "loss": 1.2195, + "step": 1969 + }, + { + "epoch": 4.161562829989441, + "grad_norm": 0.0979271871896594, + "learning_rate": 6.610564938321089e-06, + "loss": 1.2114, + "step": 1970 + }, + { + "epoch": 4.163674762407603, + "grad_norm": 0.1297339272583601, + "learning_rate": 6.578084048846265e-06, + "loss": 1.2124, + "step": 1971 + }, + { + "epoch": 4.165786694825766, + "grad_norm": 0.12068763941528697, + "learning_rate": 6.5456760023611035e-06, + "loss": 1.2216, + "step": 1972 + }, + { + "epoch": 4.167898627243928, + "grad_norm": 0.0973961137216155, + "learning_rate": 6.513340869498859e-06, + "loss": 1.2118, + "step": 1973 + }, + { + "epoch": 4.17001055966209, + "grad_norm": 0.13136841104520358, + "learning_rate": 6.481078720733873e-06, + "loss": 1.215, + "step": 1974 + }, + { + "epoch": 4.172122492080254, + "grad_norm": 0.1093991275432658, + "learning_rate": 6.448889626381447e-06, + "loss": 1.2065, + "step": 1975 + }, + { + "epoch": 4.174234424498416, + "grad_norm": 0.09967310843084899, + "learning_rate": 6.416773656597599e-06, + "loss": 1.2109, + "step": 1976 + }, + { + "epoch": 4.176346356916579, + "grad_norm": 0.098279976539171, + "learning_rate": 6.384730881379049e-06, + "loss": 1.2169, + "step": 1977 + }, + { + "epoch": 4.178458289334741, + "grad_norm": 0.10877589270569686, + "learning_rate": 6.352761370562936e-06, + "loss": 1.2107, + "step": 1978 + }, + { + "epoch": 4.1805702217529035, + "grad_norm": 0.09636879130028331, + "learning_rate": 6.320865193826744e-06, + "loss": 1.2006, + "step": 1979 + }, + { + "epoch": 4.182682154171067, + "grad_norm": 0.14954175530631017, + "learning_rate": 6.289042420688107e-06, + "loss": 1.2011, + "step": 1980 + }, + { + "epoch": 4.184794086589229, + "grad_norm": 0.11032372934892025, + "learning_rate": 6.2572931205046925e-06, + "loss": 1.2133, + "step": 1981 + }, + { + "epoch": 4.186906019007392, + "grad_norm": 0.1275635568056174, + "learning_rate": 6.225617362474032e-06, + "loss": 1.2143, + "step": 1982 + }, + { + "epoch": 4.189017951425554, + "grad_norm": 0.12885072033641987, + "learning_rate": 6.19401521563336e-06, + "loss": 1.2072, + "step": 1983 + }, + { + "epoch": 4.191129883843717, + "grad_norm": 0.10463644075151039, + "learning_rate": 6.1624867488594865e-06, + "loss": 1.2093, + "step": 1984 + }, + { + "epoch": 4.19324181626188, + "grad_norm": 0.09433643149971153, + "learning_rate": 6.1310320308686354e-06, + "loss": 1.1987, + "step": 1985 + }, + { + "epoch": 4.195353748680042, + "grad_norm": 0.14778110493420393, + "learning_rate": 6.099651130216289e-06, + "loss": 1.1997, + "step": 1986 + }, + { + "epoch": 4.197465681098205, + "grad_norm": 0.10622777911355209, + "learning_rate": 6.068344115297055e-06, + "loss": 1.2024, + "step": 1987 + }, + { + "epoch": 4.199577613516367, + "grad_norm": 0.09486589994350125, + "learning_rate": 6.037111054344493e-06, + "loss": 1.2071, + "step": 1988 + }, + { + "epoch": 4.2016895459345305, + "grad_norm": 0.11058554328154406, + "learning_rate": 6.005952015430994e-06, + "loss": 1.2291, + "step": 1989 + }, + { + "epoch": 4.203801478352693, + "grad_norm": 0.10188545610220635, + "learning_rate": 5.974867066467607e-06, + "loss": 1.2164, + "step": 1990 + }, + { + "epoch": 4.205913410770855, + "grad_norm": 0.09491793951212535, + "learning_rate": 5.943856275203912e-06, + "loss": 1.2033, + "step": 1991 + }, + { + "epoch": 4.208025343189018, + "grad_norm": 0.11849716428873132, + "learning_rate": 5.912919709227858e-06, + "loss": 1.224, + "step": 1992 + }, + { + "epoch": 4.21013727560718, + "grad_norm": 0.11249462935701642, + "learning_rate": 5.88205743596562e-06, + "loss": 1.2189, + "step": 1993 + }, + { + "epoch": 4.2122492080253435, + "grad_norm": 0.09904070957577137, + "learning_rate": 5.8512695226814376e-06, + "loss": 1.2268, + "step": 1994 + }, + { + "epoch": 4.214361140443506, + "grad_norm": 0.09365109078698292, + "learning_rate": 5.820556036477532e-06, + "loss": 1.1995, + "step": 1995 + }, + { + "epoch": 4.216473072861668, + "grad_norm": 0.12455811119384046, + "learning_rate": 5.789917044293845e-06, + "loss": 1.2094, + "step": 1996 + }, + { + "epoch": 4.218585005279831, + "grad_norm": 0.09046510872617762, + "learning_rate": 5.759352612908001e-06, + "loss": 1.2172, + "step": 1997 + }, + { + "epoch": 4.220696937697993, + "grad_norm": 0.1039126878829509, + "learning_rate": 5.7288628089351075e-06, + "loss": 1.2055, + "step": 1998 + }, + { + "epoch": 4.222808870116157, + "grad_norm": 0.10544397020259026, + "learning_rate": 5.698447698827614e-06, + "loss": 1.2216, + "step": 1999 + }, + { + "epoch": 4.224920802534319, + "grad_norm": 0.09096450328064688, + "learning_rate": 5.668107348875214e-06, + "loss": 1.2104, + "step": 2000 + }, + { + "epoch": 4.227032734952481, + "grad_norm": 0.1092285903010717, + "learning_rate": 5.637841825204588e-06, + "loss": 1.1982, + "step": 2001 + }, + { + "epoch": 4.229144667370644, + "grad_norm": 0.09340024084073767, + "learning_rate": 5.607651193779422e-06, + "loss": 1.2019, + "step": 2002 + }, + { + "epoch": 4.231256599788807, + "grad_norm": 0.10158846477134463, + "learning_rate": 5.577535520400088e-06, + "loss": 1.2106, + "step": 2003 + }, + { + "epoch": 4.23336853220697, + "grad_norm": 0.09771418411357628, + "learning_rate": 5.547494870703642e-06, + "loss": 1.2162, + "step": 2004 + }, + { + "epoch": 4.235480464625132, + "grad_norm": 0.10415213031634217, + "learning_rate": 5.517529310163627e-06, + "loss": 1.2301, + "step": 2005 + }, + { + "epoch": 4.2375923970432945, + "grad_norm": 0.10893382175181676, + "learning_rate": 5.487638904089885e-06, + "loss": 1.218, + "step": 2006 + }, + { + "epoch": 4.239704329461457, + "grad_norm": 0.09807971629461211, + "learning_rate": 5.457823717628516e-06, + "loss": 1.191, + "step": 2007 + }, + { + "epoch": 4.24181626187962, + "grad_norm": 0.0948526959136119, + "learning_rate": 5.428083815761627e-06, + "loss": 1.2078, + "step": 2008 + }, + { + "epoch": 4.243928194297783, + "grad_norm": 0.09310366554303758, + "learning_rate": 5.3984192633072815e-06, + "loss": 1.206, + "step": 2009 + }, + { + "epoch": 4.246040126715945, + "grad_norm": 0.09559205582193137, + "learning_rate": 5.368830124919302e-06, + "loss": 1.2145, + "step": 2010 + }, + { + "epoch": 4.2481520591341075, + "grad_norm": 0.09001474422563172, + "learning_rate": 5.339316465087145e-06, + "loss": 1.2121, + "step": 2011 + }, + { + "epoch": 4.25026399155227, + "grad_norm": 0.0959056831707372, + "learning_rate": 5.30987834813578e-06, + "loss": 1.2071, + "step": 2012 + }, + { + "epoch": 4.252375923970433, + "grad_norm": 0.09508016478244032, + "learning_rate": 5.280515838225477e-06, + "loss": 1.2199, + "step": 2013 + }, + { + "epoch": 4.254487856388596, + "grad_norm": 0.10976122512034585, + "learning_rate": 5.251228999351799e-06, + "loss": 1.2141, + "step": 2014 + }, + { + "epoch": 4.256599788806758, + "grad_norm": 0.1116452913041032, + "learning_rate": 5.222017895345324e-06, + "loss": 1.2142, + "step": 2015 + }, + { + "epoch": 4.258711721224921, + "grad_norm": 0.10236307520049791, + "learning_rate": 5.192882589871597e-06, + "loss": 1.2133, + "step": 2016 + }, + { + "epoch": 4.260823653643083, + "grad_norm": 0.11498479026984806, + "learning_rate": 5.1638231464309444e-06, + "loss": 1.2077, + "step": 2017 + }, + { + "epoch": 4.262935586061246, + "grad_norm": 0.08904135801393495, + "learning_rate": 5.134839628358359e-06, + "loss": 1.2058, + "step": 2018 + }, + { + "epoch": 4.265047518479409, + "grad_norm": 0.09881847497198958, + "learning_rate": 5.1059320988233515e-06, + "loss": 1.2112, + "step": 2019 + }, + { + "epoch": 4.267159450897571, + "grad_norm": 0.09883539292243808, + "learning_rate": 5.077100620829814e-06, + "loss": 1.1979, + "step": 2020 + }, + { + "epoch": 4.269271383315734, + "grad_norm": 0.14170730530204634, + "learning_rate": 5.048345257215892e-06, + "loss": 1.2121, + "step": 2021 + }, + { + "epoch": 4.271383315733896, + "grad_norm": 0.12149357031556532, + "learning_rate": 5.019666070653833e-06, + "loss": 1.2239, + "step": 2022 + }, + { + "epoch": 4.273495248152059, + "grad_norm": 0.11048529195893288, + "learning_rate": 4.991063123649853e-06, + "loss": 1.2166, + "step": 2023 + }, + { + "epoch": 4.275607180570222, + "grad_norm": 0.10492183080934375, + "learning_rate": 4.96253647854402e-06, + "loss": 1.2017, + "step": 2024 + }, + { + "epoch": 4.277719112988384, + "grad_norm": 0.1270273755385006, + "learning_rate": 4.934086197510088e-06, + "loss": 1.2223, + "step": 2025 + }, + { + "epoch": 4.279831045406547, + "grad_norm": 0.10689052999339844, + "learning_rate": 4.905712342555378e-06, + "loss": 1.2203, + "step": 2026 + }, + { + "epoch": 4.28194297782471, + "grad_norm": 0.10450768161872781, + "learning_rate": 4.877414975520638e-06, + "loss": 1.2144, + "step": 2027 + }, + { + "epoch": 4.284054910242872, + "grad_norm": 0.1744067489658228, + "learning_rate": 4.849194158079926e-06, + "loss": 1.2007, + "step": 2028 + }, + { + "epoch": 4.286166842661035, + "grad_norm": 0.10619316607797957, + "learning_rate": 4.821049951740442e-06, + "loss": 1.2238, + "step": 2029 + }, + { + "epoch": 4.288278775079197, + "grad_norm": 0.10504384079809824, + "learning_rate": 4.79298241784242e-06, + "loss": 1.213, + "step": 2030 + }, + { + "epoch": 4.29039070749736, + "grad_norm": 0.09930296132113825, + "learning_rate": 4.764991617558976e-06, + "loss": 1.2014, + "step": 2031 + }, + { + "epoch": 4.292502639915523, + "grad_norm": 0.09797647465471879, + "learning_rate": 4.737077611896026e-06, + "loss": 1.2095, + "step": 2032 + }, + { + "epoch": 4.2946145723336855, + "grad_norm": 0.0971297443876462, + "learning_rate": 4.7092404616920554e-06, + "loss": 1.2186, + "step": 2033 + }, + { + "epoch": 4.296726504751848, + "grad_norm": 0.11078493821219554, + "learning_rate": 4.6814802276180735e-06, + "loss": 1.1956, + "step": 2034 + }, + { + "epoch": 4.29883843717001, + "grad_norm": 0.10752544962168878, + "learning_rate": 4.6537969701774534e-06, + "loss": 1.2105, + "step": 2035 + }, + { + "epoch": 4.300950369588173, + "grad_norm": 0.10446127156444286, + "learning_rate": 4.626190749705779e-06, + "loss": 1.2199, + "step": 2036 + }, + { + "epoch": 4.303062302006336, + "grad_norm": 0.1215584090703717, + "learning_rate": 4.598661626370771e-06, + "loss": 1.2002, + "step": 2037 + }, + { + "epoch": 4.3051742344244985, + "grad_norm": 0.09641420887334538, + "learning_rate": 4.5712096601720555e-06, + "loss": 1.2022, + "step": 2038 + }, + { + "epoch": 4.307286166842661, + "grad_norm": 0.1195405032017642, + "learning_rate": 4.543834910941165e-06, + "loss": 1.1988, + "step": 2039 + }, + { + "epoch": 4.309398099260823, + "grad_norm": 0.09622617280008877, + "learning_rate": 4.516537438341284e-06, + "loss": 1.2159, + "step": 2040 + }, + { + "epoch": 4.311510031678987, + "grad_norm": 0.0946166949646773, + "learning_rate": 4.489317301867182e-06, + "loss": 1.2055, + "step": 2041 + }, + { + "epoch": 4.313621964097149, + "grad_norm": 0.10823644214288149, + "learning_rate": 4.462174560845114e-06, + "loss": 1.2135, + "step": 2042 + }, + { + "epoch": 4.315733896515312, + "grad_norm": 0.10659601415483864, + "learning_rate": 4.4351092744325895e-06, + "loss": 1.2092, + "step": 2043 + }, + { + "epoch": 4.317845828933474, + "grad_norm": 0.08974989168982314, + "learning_rate": 4.408121501618374e-06, + "loss": 1.2093, + "step": 2044 + }, + { + "epoch": 4.3199577613516364, + "grad_norm": 0.1581601608600536, + "learning_rate": 4.3812113012222164e-06, + "loss": 1.2244, + "step": 2045 + }, + { + "epoch": 4.3220696937698, + "grad_norm": 0.09868578894908858, + "learning_rate": 4.3543787318948685e-06, + "loss": 1.1881, + "step": 2046 + }, + { + "epoch": 4.324181626187962, + "grad_norm": 0.09157493646880059, + "learning_rate": 4.3276238521178456e-06, + "loss": 1.2138, + "step": 2047 + }, + { + "epoch": 4.326293558606125, + "grad_norm": 0.09211157109428125, + "learning_rate": 4.30094672020334e-06, + "loss": 1.2021, + "step": 2048 + }, + { + "epoch": 4.328405491024287, + "grad_norm": 0.11106579790769947, + "learning_rate": 4.274347394294118e-06, + "loss": 1.2042, + "step": 2049 + }, + { + "epoch": 4.3305174234424495, + "grad_norm": 0.10054290903564914, + "learning_rate": 4.247825932363321e-06, + "loss": 1.2026, + "step": 2050 + }, + { + "epoch": 4.332629355860613, + "grad_norm": 0.09836267450867371, + "learning_rate": 4.221382392214444e-06, + "loss": 1.1795, + "step": 2051 + }, + { + "epoch": 4.334741288278775, + "grad_norm": 0.09428895946264715, + "learning_rate": 4.1950168314810955e-06, + "loss": 1.2155, + "step": 2052 + }, + { + "epoch": 4.336853220696938, + "grad_norm": 0.10796000823516028, + "learning_rate": 4.168729307626977e-06, + "loss": 1.2216, + "step": 2053 + }, + { + "epoch": 4.3389651531151, + "grad_norm": 0.09656304422248639, + "learning_rate": 4.1425198779456764e-06, + "loss": 1.2045, + "step": 2054 + }, + { + "epoch": 4.3410770855332625, + "grad_norm": 0.0923167859783252, + "learning_rate": 4.116388599560593e-06, + "loss": 1.2128, + "step": 2055 + }, + { + "epoch": 4.343189017951426, + "grad_norm": 0.09777208163894098, + "learning_rate": 4.090335529424776e-06, + "loss": 1.2049, + "step": 2056 + }, + { + "epoch": 4.345300950369588, + "grad_norm": 0.10127917182246446, + "learning_rate": 4.064360724320846e-06, + "loss": 1.2047, + "step": 2057 + }, + { + "epoch": 4.347412882787751, + "grad_norm": 0.09706113443108196, + "learning_rate": 4.038464240860825e-06, + "loss": 1.2172, + "step": 2058 + }, + { + "epoch": 4.349524815205913, + "grad_norm": 0.09632981822686659, + "learning_rate": 4.012646135486038e-06, + "loss": 1.2002, + "step": 2059 + }, + { + "epoch": 4.351636747624076, + "grad_norm": 0.10680981793083227, + "learning_rate": 3.986906464466991e-06, + "loss": 1.2018, + "step": 2060 + }, + { + "epoch": 4.353748680042239, + "grad_norm": 0.1440540068721411, + "learning_rate": 3.961245283903239e-06, + "loss": 1.2076, + "step": 2061 + }, + { + "epoch": 4.355860612460401, + "grad_norm": 0.10217153584503677, + "learning_rate": 3.9356626497232575e-06, + "loss": 1.2137, + "step": 2062 + }, + { + "epoch": 4.357972544878564, + "grad_norm": 0.10549816180034295, + "learning_rate": 3.910158617684352e-06, + "loss": 1.2123, + "step": 2063 + }, + { + "epoch": 4.360084477296726, + "grad_norm": 0.09944708713937835, + "learning_rate": 3.884733243372494e-06, + "loss": 1.2072, + "step": 2064 + }, + { + "epoch": 4.3621964097148895, + "grad_norm": 0.10475087327087804, + "learning_rate": 3.859386582202231e-06, + "loss": 1.1838, + "step": 2065 + }, + { + "epoch": 4.364308342133052, + "grad_norm": 0.10107596753982377, + "learning_rate": 3.834118689416553e-06, + "loss": 1.1896, + "step": 2066 + }, + { + "epoch": 4.366420274551214, + "grad_norm": 0.1060320752414899, + "learning_rate": 3.8089296200867742e-06, + "loss": 1.2195, + "step": 2067 + }, + { + "epoch": 4.368532206969377, + "grad_norm": 0.11794262106055552, + "learning_rate": 3.7838194291124074e-06, + "loss": 1.2235, + "step": 2068 + }, + { + "epoch": 4.370644139387539, + "grad_norm": 0.14074239889302334, + "learning_rate": 3.7587881712210793e-06, + "loss": 1.2041, + "step": 2069 + }, + { + "epoch": 4.372756071805703, + "grad_norm": 0.10105987229120071, + "learning_rate": 3.733835900968341e-06, + "loss": 1.2126, + "step": 2070 + }, + { + "epoch": 4.374868004223865, + "grad_norm": 0.10638272084330884, + "learning_rate": 3.708962672737606e-06, + "loss": 1.2005, + "step": 2071 + }, + { + "epoch": 4.3769799366420274, + "grad_norm": 0.12011630078698769, + "learning_rate": 3.684168540740025e-06, + "loss": 1.2021, + "step": 2072 + }, + { + "epoch": 4.37909186906019, + "grad_norm": 0.10103513231034478, + "learning_rate": 3.6594535590143454e-06, + "loss": 1.2052, + "step": 2073 + }, + { + "epoch": 4.381203801478352, + "grad_norm": 0.10728197745333677, + "learning_rate": 3.634817781426834e-06, + "loss": 1.209, + "step": 2074 + }, + { + "epoch": 4.383315733896516, + "grad_norm": 0.11399698400664125, + "learning_rate": 3.610261261671082e-06, + "loss": 1.2091, + "step": 2075 + }, + { + "epoch": 4.385427666314678, + "grad_norm": 0.0943576271209081, + "learning_rate": 3.5857840532679975e-06, + "loss": 1.217, + "step": 2076 + }, + { + "epoch": 4.3875395987328405, + "grad_norm": 0.09043908751280652, + "learning_rate": 3.5613862095655827e-06, + "loss": 1.2186, + "step": 2077 + }, + { + "epoch": 4.389651531151003, + "grad_norm": 0.09530151466446464, + "learning_rate": 3.5370677837388746e-06, + "loss": 1.212, + "step": 2078 + }, + { + "epoch": 4.391763463569166, + "grad_norm": 0.09928560718921353, + "learning_rate": 3.5128288287898583e-06, + "loss": 1.1999, + "step": 2079 + }, + { + "epoch": 4.393875395987329, + "grad_norm": 0.08946118673982409, + "learning_rate": 3.4886693975472443e-06, + "loss": 1.2133, + "step": 2080 + }, + { + "epoch": 4.395987328405491, + "grad_norm": 0.11495600109669246, + "learning_rate": 3.4645895426664856e-06, + "loss": 1.1999, + "step": 2081 + }, + { + "epoch": 4.3980992608236535, + "grad_norm": 0.09638022159541147, + "learning_rate": 3.440589316629548e-06, + "loss": 1.1919, + "step": 2082 + }, + { + "epoch": 4.400211193241816, + "grad_norm": 0.09875698694156333, + "learning_rate": 3.4166687717449e-06, + "loss": 1.2068, + "step": 2083 + }, + { + "epoch": 4.402323125659979, + "grad_norm": 0.1327619754483469, + "learning_rate": 3.392827960147278e-06, + "loss": 1.2081, + "step": 2084 + }, + { + "epoch": 4.404435058078142, + "grad_norm": 0.09296090746583822, + "learning_rate": 3.3690669337977e-06, + "loss": 1.209, + "step": 2085 + }, + { + "epoch": 4.406546990496304, + "grad_norm": 0.09361060659147219, + "learning_rate": 3.345385744483274e-06, + "loss": 1.2178, + "step": 2086 + }, + { + "epoch": 4.408658922914467, + "grad_norm": 0.1034188909655148, + "learning_rate": 3.321784443817073e-06, + "loss": 1.22, + "step": 2087 + }, + { + "epoch": 4.410770855332629, + "grad_norm": 0.14670254929267273, + "learning_rate": 3.2982630832381067e-06, + "loss": 1.2128, + "step": 2088 + }, + { + "epoch": 4.412882787750792, + "grad_norm": 0.09499894493970609, + "learning_rate": 3.2748217140111005e-06, + "loss": 1.2167, + "step": 2089 + }, + { + "epoch": 4.414994720168955, + "grad_norm": 0.10990650972195812, + "learning_rate": 3.251460387226484e-06, + "loss": 1.2129, + "step": 2090 + }, + { + "epoch": 4.417106652587117, + "grad_norm": 0.11063907924648461, + "learning_rate": 3.2281791538002085e-06, + "loss": 1.2087, + "step": 2091 + }, + { + "epoch": 4.41921858500528, + "grad_norm": 0.10189197375894349, + "learning_rate": 3.2049780644736717e-06, + "loss": 1.2094, + "step": 2092 + }, + { + "epoch": 4.421330517423442, + "grad_norm": 0.10697937332770391, + "learning_rate": 3.181857169813598e-06, + "loss": 1.222, + "step": 2093 + }, + { + "epoch": 4.423442449841605, + "grad_norm": 0.1071355669622226, + "learning_rate": 3.1588165202119094e-06, + "loss": 1.209, + "step": 2094 + }, + { + "epoch": 4.425554382259768, + "grad_norm": 0.09591587765853932, + "learning_rate": 3.1358561658856623e-06, + "loss": 1.205, + "step": 2095 + }, + { + "epoch": 4.42766631467793, + "grad_norm": 0.09444527394111425, + "learning_rate": 3.1129761568768947e-06, + "loss": 1.2229, + "step": 2096 + }, + { + "epoch": 4.429778247096093, + "grad_norm": 0.13732644791308993, + "learning_rate": 3.0901765430525345e-06, + "loss": 1.2076, + "step": 2097 + }, + { + "epoch": 4.431890179514255, + "grad_norm": 0.12117754205195758, + "learning_rate": 3.06745737410429e-06, + "loss": 1.2151, + "step": 2098 + }, + { + "epoch": 4.434002111932418, + "grad_norm": 0.13253847983736342, + "learning_rate": 3.0448186995485307e-06, + "loss": 1.2271, + "step": 2099 + }, + { + "epoch": 4.436114044350581, + "grad_norm": 0.10061585342640611, + "learning_rate": 3.0222605687262053e-06, + "loss": 1.2107, + "step": 2100 + }, + { + "epoch": 4.438225976768743, + "grad_norm": 0.10445141968447787, + "learning_rate": 2.999783030802701e-06, + "loss": 1.1852, + "step": 2101 + }, + { + "epoch": 4.440337909186906, + "grad_norm": 0.09105069328103839, + "learning_rate": 2.9773861347677633e-06, + "loss": 1.2118, + "step": 2102 + }, + { + "epoch": 4.442449841605068, + "grad_norm": 0.14064141975078143, + "learning_rate": 2.955069929435377e-06, + "loss": 1.2105, + "step": 2103 + }, + { + "epoch": 4.4445617740232315, + "grad_norm": 0.09075393378286566, + "learning_rate": 2.9328344634436567e-06, + "loss": 1.2144, + "step": 2104 + }, + { + "epoch": 4.446673706441394, + "grad_norm": 0.09249227321137649, + "learning_rate": 2.9106797852547487e-06, + "loss": 1.2025, + "step": 2105 + }, + { + "epoch": 4.448785638859556, + "grad_norm": 0.12202885690697399, + "learning_rate": 2.888605943154743e-06, + "loss": 1.2245, + "step": 2106 + }, + { + "epoch": 4.450897571277719, + "grad_norm": 0.11102111949280138, + "learning_rate": 2.8666129852535028e-06, + "loss": 1.1804, + "step": 2107 + }, + { + "epoch": 4.453009503695882, + "grad_norm": 0.10070385108834266, + "learning_rate": 2.8447009594846454e-06, + "loss": 1.2159, + "step": 2108 + }, + { + "epoch": 4.4551214361140445, + "grad_norm": 0.09037386810732238, + "learning_rate": 2.822869913605373e-06, + "loss": 1.2084, + "step": 2109 + }, + { + "epoch": 4.457233368532207, + "grad_norm": 0.11867375610466756, + "learning_rate": 2.8011198951963935e-06, + "loss": 1.2069, + "step": 2110 + }, + { + "epoch": 4.459345300950369, + "grad_norm": 0.08910417732124928, + "learning_rate": 2.7794509516618507e-06, + "loss": 1.198, + "step": 2111 + }, + { + "epoch": 4.461457233368532, + "grad_norm": 0.08507150579873418, + "learning_rate": 2.7578631302291213e-06, + "loss": 1.2046, + "step": 2112 + }, + { + "epoch": 4.463569165786695, + "grad_norm": 0.10283387800116833, + "learning_rate": 2.7363564779488448e-06, + "loss": 1.2018, + "step": 2113 + }, + { + "epoch": 4.465681098204858, + "grad_norm": 0.09243665316916745, + "learning_rate": 2.7149310416946995e-06, + "loss": 1.2026, + "step": 2114 + }, + { + "epoch": 4.46779303062302, + "grad_norm": 0.08357811009552821, + "learning_rate": 2.6935868681633805e-06, + "loss": 1.2077, + "step": 2115 + }, + { + "epoch": 4.4699049630411825, + "grad_norm": 0.13118843262958663, + "learning_rate": 2.6723240038744757e-06, + "loss": 1.1916, + "step": 2116 + }, + { + "epoch": 4.472016895459345, + "grad_norm": 0.09289745526751504, + "learning_rate": 2.6511424951703248e-06, + "loss": 1.2061, + "step": 2117 + }, + { + "epoch": 4.474128827877508, + "grad_norm": 0.0936473999796065, + "learning_rate": 2.630042388216012e-06, + "loss": 1.2231, + "step": 2118 + }, + { + "epoch": 4.476240760295671, + "grad_norm": 0.08933200607797251, + "learning_rate": 2.609023728999138e-06, + "loss": 1.2068, + "step": 2119 + }, + { + "epoch": 4.478352692713833, + "grad_norm": 0.09293058520481882, + "learning_rate": 2.58808656332985e-06, + "loss": 1.2127, + "step": 2120 + }, + { + "epoch": 4.4804646251319955, + "grad_norm": 0.13708999482164239, + "learning_rate": 2.5672309368406325e-06, + "loss": 1.2182, + "step": 2121 + }, + { + "epoch": 4.482576557550159, + "grad_norm": 0.08132112569027296, + "learning_rate": 2.5464568949862888e-06, + "loss": 1.2214, + "step": 2122 + }, + { + "epoch": 4.484688489968321, + "grad_norm": 0.08579107163227982, + "learning_rate": 2.5257644830437976e-06, + "loss": 1.1901, + "step": 2123 + }, + { + "epoch": 4.486800422386484, + "grad_norm": 0.08993213171719701, + "learning_rate": 2.5051537461122034e-06, + "loss": 1.2175, + "step": 2124 + }, + { + "epoch": 4.488912354804646, + "grad_norm": 0.08565452408971524, + "learning_rate": 2.4846247291125903e-06, + "loss": 1.2153, + "step": 2125 + }, + { + "epoch": 4.4910242872228086, + "grad_norm": 0.08526964508490718, + "learning_rate": 2.4641774767878746e-06, + "loss": 1.2105, + "step": 2126 + }, + { + "epoch": 4.493136219640972, + "grad_norm": 0.08607021325651502, + "learning_rate": 2.4438120337028173e-06, + "loss": 1.1984, + "step": 2127 + }, + { + "epoch": 4.495248152059134, + "grad_norm": 0.08630186334543269, + "learning_rate": 2.4235284442438502e-06, + "loss": 1.2015, + "step": 2128 + }, + { + "epoch": 4.497360084477297, + "grad_norm": 0.13605263393042208, + "learning_rate": 2.403326752619006e-06, + "loss": 1.2099, + "step": 2129 + }, + { + "epoch": 4.499472016895459, + "grad_norm": 0.11225177514461938, + "learning_rate": 2.38320700285783e-06, + "loss": 1.2202, + "step": 2130 + }, + { + "epoch": 4.501583949313622, + "grad_norm": 0.09188164107509064, + "learning_rate": 2.363169238811258e-06, + "loss": 1.2162, + "step": 2131 + }, + { + "epoch": 4.503695881731785, + "grad_norm": 0.08607814711862438, + "learning_rate": 2.343213504151569e-06, + "loss": 1.2007, + "step": 2132 + }, + { + "epoch": 4.505807814149947, + "grad_norm": 0.0959638242283176, + "learning_rate": 2.3233398423722343e-06, + "loss": 1.1957, + "step": 2133 + }, + { + "epoch": 4.50791974656811, + "grad_norm": 0.09617888154227387, + "learning_rate": 2.303548296787854e-06, + "loss": 1.2038, + "step": 2134 + }, + { + "epoch": 4.510031678986272, + "grad_norm": 0.08878768144644715, + "learning_rate": 2.2838389105340576e-06, + "loss": 1.2137, + "step": 2135 + }, + { + "epoch": 4.512143611404435, + "grad_norm": 0.13586146643413324, + "learning_rate": 2.264211726567402e-06, + "loss": 1.2007, + "step": 2136 + }, + { + "epoch": 4.514255543822598, + "grad_norm": 0.09576026668911643, + "learning_rate": 2.244666787665297e-06, + "loss": 1.2176, + "step": 2137 + }, + { + "epoch": 4.51636747624076, + "grad_norm": 0.09276881045447767, + "learning_rate": 2.2252041364258937e-06, + "loss": 1.1981, + "step": 2138 + }, + { + "epoch": 4.518479408658923, + "grad_norm": 0.09736295250996978, + "learning_rate": 2.2058238152679887e-06, + "loss": 1.2175, + "step": 2139 + }, + { + "epoch": 4.520591341077085, + "grad_norm": 0.09089032220374489, + "learning_rate": 2.1865258664309506e-06, + "loss": 1.2124, + "step": 2140 + }, + { + "epoch": 4.522703273495248, + "grad_norm": 0.09026013029989147, + "learning_rate": 2.1673103319746146e-06, + "loss": 1.2141, + "step": 2141 + }, + { + "epoch": 4.524815205913411, + "grad_norm": 0.12687822196776008, + "learning_rate": 2.1481772537791958e-06, + "loss": 1.2024, + "step": 2142 + }, + { + "epoch": 4.5269271383315735, + "grad_norm": 0.0883731046888192, + "learning_rate": 2.129126673545199e-06, + "loss": 1.2286, + "step": 2143 + }, + { + "epoch": 4.529039070749736, + "grad_norm": 0.10790334311643893, + "learning_rate": 2.1101586327933086e-06, + "loss": 1.2024, + "step": 2144 + }, + { + "epoch": 4.531151003167898, + "grad_norm": 0.08621422264417435, + "learning_rate": 2.0912731728643364e-06, + "loss": 1.1994, + "step": 2145 + }, + { + "epoch": 4.533262935586061, + "grad_norm": 0.09093505491515089, + "learning_rate": 2.0724703349190945e-06, + "loss": 1.2208, + "step": 2146 + }, + { + "epoch": 4.535374868004224, + "grad_norm": 0.09109375493676079, + "learning_rate": 2.0537501599383214e-06, + "loss": 1.1948, + "step": 2147 + }, + { + "epoch": 4.5374868004223865, + "grad_norm": 0.11383334208392784, + "learning_rate": 2.0351126887226157e-06, + "loss": 1.2156, + "step": 2148 + }, + { + "epoch": 4.539598732840549, + "grad_norm": 0.10862646936906352, + "learning_rate": 2.016557961892276e-06, + "loss": 1.2111, + "step": 2149 + }, + { + "epoch": 4.541710665258711, + "grad_norm": 0.11702941334101759, + "learning_rate": 1.9980860198873087e-06, + "loss": 1.2259, + "step": 2150 + }, + { + "epoch": 4.543822597676875, + "grad_norm": 0.09508269080608694, + "learning_rate": 1.9796969029672564e-06, + "loss": 1.2117, + "step": 2151 + }, + { + "epoch": 4.545934530095037, + "grad_norm": 0.10605448654599944, + "learning_rate": 1.9613906512111615e-06, + "loss": 1.2009, + "step": 2152 + }, + { + "epoch": 4.5480464625131996, + "grad_norm": 0.10396034790256396, + "learning_rate": 1.9431673045174595e-06, + "loss": 1.2153, + "step": 2153 + }, + { + "epoch": 4.550158394931362, + "grad_norm": 0.08922341627146647, + "learning_rate": 1.925026902603886e-06, + "loss": 1.1879, + "step": 2154 + }, + { + "epoch": 4.552270327349525, + "grad_norm": 0.1290988162709901, + "learning_rate": 1.9069694850074239e-06, + "loss": 1.2076, + "step": 2155 + }, + { + "epoch": 4.554382259767688, + "grad_norm": 0.1337130916040129, + "learning_rate": 1.888995091084147e-06, + "loss": 1.2163, + "step": 2156 + }, + { + "epoch": 4.55649419218585, + "grad_norm": 0.11177869306917829, + "learning_rate": 1.8711037600092342e-06, + "loss": 1.2322, + "step": 2157 + }, + { + "epoch": 4.558606124604013, + "grad_norm": 0.11313598096898439, + "learning_rate": 1.853295530776773e-06, + "loss": 1.2054, + "step": 2158 + }, + { + "epoch": 4.560718057022175, + "grad_norm": 0.09189355130239621, + "learning_rate": 1.8355704421997788e-06, + "loss": 1.2172, + "step": 2159 + }, + { + "epoch": 4.562829989440338, + "grad_norm": 0.10139657172235596, + "learning_rate": 1.8179285329100382e-06, + "loss": 1.2047, + "step": 2160 + }, + { + "epoch": 4.564941921858501, + "grad_norm": 0.09134029249264383, + "learning_rate": 1.8003698413580429e-06, + "loss": 1.2176, + "step": 2161 + }, + { + "epoch": 4.567053854276663, + "grad_norm": 0.14817489193326192, + "learning_rate": 1.782894405812936e-06, + "loss": 1.2103, + "step": 2162 + }, + { + "epoch": 4.569165786694826, + "grad_norm": 0.1292389907114446, + "learning_rate": 1.7655022643623664e-06, + "loss": 1.2042, + "step": 2163 + }, + { + "epoch": 4.571277719112988, + "grad_norm": 0.126623556831046, + "learning_rate": 1.7481934549124923e-06, + "loss": 1.1925, + "step": 2164 + }, + { + "epoch": 4.573389651531151, + "grad_norm": 0.09195761705852724, + "learning_rate": 1.7309680151878128e-06, + "loss": 1.2031, + "step": 2165 + }, + { + "epoch": 4.575501583949314, + "grad_norm": 0.11890917262072524, + "learning_rate": 1.7138259827311322e-06, + "loss": 1.203, + "step": 2166 + }, + { + "epoch": 4.577613516367476, + "grad_norm": 0.10402374284960854, + "learning_rate": 1.6967673949034802e-06, + "loss": 1.2196, + "step": 2167 + }, + { + "epoch": 4.579725448785639, + "grad_norm": 0.10639773647663905, + "learning_rate": 1.6797922888839924e-06, + "loss": 1.2022, + "step": 2168 + }, + { + "epoch": 4.581837381203801, + "grad_norm": 0.09514669277169176, + "learning_rate": 1.6629007016698918e-06, + "loss": 1.1982, + "step": 2169 + }, + { + "epoch": 4.5839493136219644, + "grad_norm": 0.09508526488035633, + "learning_rate": 1.6460926700763468e-06, + "loss": 1.2078, + "step": 2170 + }, + { + "epoch": 4.586061246040127, + "grad_norm": 0.12316313804032254, + "learning_rate": 1.6293682307364278e-06, + "loss": 1.2111, + "step": 2171 + }, + { + "epoch": 4.588173178458289, + "grad_norm": 0.12344894636601081, + "learning_rate": 1.612727420101008e-06, + "loss": 1.231, + "step": 2172 + }, + { + "epoch": 4.590285110876452, + "grad_norm": 0.09584103639625328, + "learning_rate": 1.5961702744386974e-06, + "loss": 1.1916, + "step": 2173 + }, + { + "epoch": 4.592397043294614, + "grad_norm": 0.09195073059560466, + "learning_rate": 1.579696829835755e-06, + "loss": 1.2044, + "step": 2174 + }, + { + "epoch": 4.5945089757127775, + "grad_norm": 0.09355266074713252, + "learning_rate": 1.5633071221960205e-06, + "loss": 1.2181, + "step": 2175 + }, + { + "epoch": 4.59662090813094, + "grad_norm": 0.12362745187642117, + "learning_rate": 1.547001187240822e-06, + "loss": 1.2137, + "step": 2176 + }, + { + "epoch": 4.598732840549102, + "grad_norm": 0.11430813905727792, + "learning_rate": 1.5307790605089045e-06, + "loss": 1.198, + "step": 2177 + }, + { + "epoch": 4.600844772967265, + "grad_norm": 0.1112607247234472, + "learning_rate": 1.5146407773563642e-06, + "loss": 1.2046, + "step": 2178 + }, + { + "epoch": 4.602956705385427, + "grad_norm": 0.1031180586195969, + "learning_rate": 1.4985863729565453e-06, + "loss": 1.2075, + "step": 2179 + }, + { + "epoch": 4.6050686378035905, + "grad_norm": 0.10354687779623509, + "learning_rate": 1.482615882299987e-06, + "loss": 1.2148, + "step": 2180 + }, + { + "epoch": 4.607180570221753, + "grad_norm": 0.09777664493698075, + "learning_rate": 1.4667293401943394e-06, + "loss": 1.2076, + "step": 2181 + }, + { + "epoch": 4.609292502639915, + "grad_norm": 0.09932060669425943, + "learning_rate": 1.4509267812642792e-06, + "loss": 1.2159, + "step": 2182 + }, + { + "epoch": 4.611404435058078, + "grad_norm": 0.12971408544301208, + "learning_rate": 1.4352082399514556e-06, + "loss": 1.2065, + "step": 2183 + }, + { + "epoch": 4.61351636747624, + "grad_norm": 0.10413897746838435, + "learning_rate": 1.4195737505143936e-06, + "loss": 1.2128, + "step": 2184 + }, + { + "epoch": 4.615628299894404, + "grad_norm": 0.10451686888299859, + "learning_rate": 1.404023347028418e-06, + "loss": 1.192, + "step": 2185 + }, + { + "epoch": 4.617740232312566, + "grad_norm": 0.1213914255222211, + "learning_rate": 1.3885570633856004e-06, + "loss": 1.2261, + "step": 2186 + }, + { + "epoch": 4.6198521647307285, + "grad_norm": 0.11291487196042374, + "learning_rate": 1.373174933294683e-06, + "loss": 1.2034, + "step": 2187 + }, + { + "epoch": 4.621964097148891, + "grad_norm": 0.11164248252588581, + "learning_rate": 1.3578769902809686e-06, + "loss": 1.2161, + "step": 2188 + }, + { + "epoch": 4.624076029567054, + "grad_norm": 0.09664972438180003, + "learning_rate": 1.3426632676862971e-06, + "loss": 1.2225, + "step": 2189 + }, + { + "epoch": 4.626187961985217, + "grad_norm": 0.12802583720396385, + "learning_rate": 1.3275337986689407e-06, + "loss": 1.2016, + "step": 2190 + }, + { + "epoch": 4.628299894403379, + "grad_norm": 0.12441005417209298, + "learning_rate": 1.3124886162035311e-06, + "loss": 1.1958, + "step": 2191 + }, + { + "epoch": 4.6304118268215415, + "grad_norm": 0.09584020755554955, + "learning_rate": 1.2975277530810338e-06, + "loss": 1.215, + "step": 2192 + }, + { + "epoch": 4.632523759239704, + "grad_norm": 0.09841844409889043, + "learning_rate": 1.2826512419085924e-06, + "loss": 1.2156, + "step": 2193 + }, + { + "epoch": 4.634635691657867, + "grad_norm": 0.10037983957711769, + "learning_rate": 1.2678591151095466e-06, + "loss": 1.198, + "step": 2194 + }, + { + "epoch": 4.63674762407603, + "grad_norm": 0.09493993430886719, + "learning_rate": 1.2531514049232852e-06, + "loss": 1.2106, + "step": 2195 + }, + { + "epoch": 4.638859556494192, + "grad_norm": 0.10639493154967405, + "learning_rate": 1.2385281434052422e-06, + "loss": 1.2091, + "step": 2196 + }, + { + "epoch": 4.640971488912355, + "grad_norm": 0.11923872249215337, + "learning_rate": 1.2239893624267852e-06, + "loss": 1.2099, + "step": 2197 + }, + { + "epoch": 4.643083421330518, + "grad_norm": 0.1346608820891257, + "learning_rate": 1.2095350936751405e-06, + "loss": 1.2106, + "step": 2198 + }, + { + "epoch": 4.64519535374868, + "grad_norm": 0.11665408086946699, + "learning_rate": 1.1951653686533705e-06, + "loss": 1.2066, + "step": 2199 + }, + { + "epoch": 4.647307286166843, + "grad_norm": 0.08687106789799998, + "learning_rate": 1.1808802186802403e-06, + "loss": 1.189, + "step": 2200 + }, + { + "epoch": 4.649419218585005, + "grad_norm": 0.10165053203634293, + "learning_rate": 1.1666796748902142e-06, + "loss": 1.2089, + "step": 2201 + }, + { + "epoch": 4.651531151003168, + "grad_norm": 0.09358695097732984, + "learning_rate": 1.1525637682333434e-06, + "loss": 1.2231, + "step": 2202 + }, + { + "epoch": 4.653643083421331, + "grad_norm": 0.1056314055585163, + "learning_rate": 1.1385325294752091e-06, + "loss": 1.2173, + "step": 2203 + }, + { + "epoch": 4.655755015839493, + "grad_norm": 0.12605275641727728, + "learning_rate": 1.124585989196878e-06, + "loss": 1.1873, + "step": 2204 + }, + { + "epoch": 4.657866948257656, + "grad_norm": 0.11459142305490089, + "learning_rate": 1.1107241777947775e-06, + "loss": 1.1969, + "step": 2205 + }, + { + "epoch": 4.659978880675818, + "grad_norm": 0.13736164206395282, + "learning_rate": 1.0969471254807185e-06, + "loss": 1.2227, + "step": 2206 + }, + { + "epoch": 4.662090813093981, + "grad_norm": 0.09870133584094315, + "learning_rate": 1.0832548622817397e-06, + "loss": 1.2154, + "step": 2207 + }, + { + "epoch": 4.664202745512144, + "grad_norm": 0.10841773487237055, + "learning_rate": 1.0696474180401074e-06, + "loss": 1.2117, + "step": 2208 + }, + { + "epoch": 4.666314677930306, + "grad_norm": 0.1018958398203731, + "learning_rate": 1.0561248224132093e-06, + "loss": 1.2012, + "step": 2209 + }, + { + "epoch": 4.668426610348469, + "grad_norm": 0.08938063846074239, + "learning_rate": 1.0426871048735188e-06, + "loss": 1.2137, + "step": 2210 + }, + { + "epoch": 4.670538542766631, + "grad_norm": 0.0946766616732523, + "learning_rate": 1.0293342947085017e-06, + "loss": 1.2145, + "step": 2211 + }, + { + "epoch": 4.672650475184794, + "grad_norm": 0.08532160079755323, + "learning_rate": 1.0160664210205805e-06, + "loss": 1.2117, + "step": 2212 + }, + { + "epoch": 4.674762407602957, + "grad_norm": 0.09156852605188437, + "learning_rate": 1.0028835127270553e-06, + "loss": 1.2215, + "step": 2213 + }, + { + "epoch": 4.6768743400211195, + "grad_norm": 0.12201055313400687, + "learning_rate": 9.897855985600492e-07, + "loss": 1.2139, + "step": 2214 + }, + { + "epoch": 4.678986272439282, + "grad_norm": 0.1267203842856114, + "learning_rate": 9.767727070664335e-07, + "loss": 1.2005, + "step": 2215 + }, + { + "epoch": 4.681098204857444, + "grad_norm": 0.12696612806098634, + "learning_rate": 9.638448666077704e-07, + "loss": 1.2151, + "step": 2216 + }, + { + "epoch": 4.683210137275607, + "grad_norm": 0.10971855320210214, + "learning_rate": 9.510021053602681e-07, + "loss": 1.2116, + "step": 2217 + }, + { + "epoch": 4.68532206969377, + "grad_norm": 0.10540886748160795, + "learning_rate": 9.382444513146871e-07, + "loss": 1.2021, + "step": 2218 + }, + { + "epoch": 4.6874340021119325, + "grad_norm": 0.09799963716472415, + "learning_rate": 9.255719322763101e-07, + "loss": 1.2148, + "step": 2219 + }, + { + "epoch": 4.689545934530095, + "grad_norm": 0.09434920402109645, + "learning_rate": 9.129845758648614e-07, + "loss": 1.2092, + "step": 2220 + }, + { + "epoch": 4.691657866948257, + "grad_norm": 0.10144782539881184, + "learning_rate": 9.004824095144582e-07, + "loss": 1.2018, + "step": 2221 + }, + { + "epoch": 4.69376979936642, + "grad_norm": 0.10040419041479653, + "learning_rate": 8.880654604735395e-07, + "loss": 1.2037, + "step": 2222 + }, + { + "epoch": 4.695881731784583, + "grad_norm": 0.08783641892869687, + "learning_rate": 8.757337558048173e-07, + "loss": 1.219, + "step": 2223 + }, + { + "epoch": 4.697993664202746, + "grad_norm": 0.11965961976051069, + "learning_rate": 8.634873223852236e-07, + "loss": 1.2077, + "step": 2224 + }, + { + "epoch": 4.700105596620908, + "grad_norm": 0.15469064238389935, + "learning_rate": 8.513261869058209e-07, + "loss": 1.2126, + "step": 2225 + }, + { + "epoch": 4.70221752903907, + "grad_norm": 0.11663849847370679, + "learning_rate": 8.392503758717807e-07, + "loss": 1.2206, + "step": 2226 + }, + { + "epoch": 4.704329461457234, + "grad_norm": 0.08625495923597536, + "learning_rate": 8.272599156023075e-07, + "loss": 1.2118, + "step": 2227 + }, + { + "epoch": 4.706441393875396, + "grad_norm": 0.09400948522810887, + "learning_rate": 8.153548322305726e-07, + "loss": 1.1908, + "step": 2228 + }, + { + "epoch": 4.708553326293559, + "grad_norm": 0.09927522445511867, + "learning_rate": 8.035351517036915e-07, + "loss": 1.201, + "step": 2229 + }, + { + "epoch": 4.710665258711721, + "grad_norm": 0.09117244282215597, + "learning_rate": 7.918008997826221e-07, + "loss": 1.2093, + "step": 2230 + }, + { + "epoch": 4.7127771911298835, + "grad_norm": 0.10749630689104366, + "learning_rate": 7.801521020421465e-07, + "loss": 1.2091, + "step": 2231 + }, + { + "epoch": 4.714889123548047, + "grad_norm": 0.09999762543702777, + "learning_rate": 7.685887838707828e-07, + "loss": 1.1995, + "step": 2232 + }, + { + "epoch": 4.717001055966209, + "grad_norm": 0.09042653786234948, + "learning_rate": 7.571109704707624e-07, + "loss": 1.2185, + "step": 2233 + }, + { + "epoch": 4.719112988384372, + "grad_norm": 0.11652674366545727, + "learning_rate": 7.457186868579591e-07, + "loss": 1.2086, + "step": 2234 + }, + { + "epoch": 4.721224920802534, + "grad_norm": 0.11887141357830999, + "learning_rate": 7.344119578618181e-07, + "loss": 1.2065, + "step": 2235 + }, + { + "epoch": 4.723336853220697, + "grad_norm": 0.08552898064893985, + "learning_rate": 7.231908081253425e-07, + "loss": 1.2084, + "step": 2236 + }, + { + "epoch": 4.72544878563886, + "grad_norm": 0.08516413162626064, + "learning_rate": 7.120552621049826e-07, + "loss": 1.2079, + "step": 2237 + }, + { + "epoch": 4.727560718057022, + "grad_norm": 0.0930760655841454, + "learning_rate": 7.010053440706576e-07, + "loss": 1.2181, + "step": 2238 + }, + { + "epoch": 4.729672650475185, + "grad_norm": 0.08926349357537679, + "learning_rate": 6.900410781056233e-07, + "loss": 1.2046, + "step": 2239 + }, + { + "epoch": 4.731784582893347, + "grad_norm": 0.12141975865155319, + "learning_rate": 6.79162488106484e-07, + "loss": 1.2075, + "step": 2240 + }, + { + "epoch": 4.7338965153115105, + "grad_norm": 0.10631511236611117, + "learning_rate": 6.683695977830962e-07, + "loss": 1.2128, + "step": 2241 + }, + { + "epoch": 4.736008447729673, + "grad_norm": 0.0940250606367877, + "learning_rate": 6.576624306585322e-07, + "loss": 1.2147, + "step": 2242 + }, + { + "epoch": 4.738120380147835, + "grad_norm": 0.09757261356112051, + "learning_rate": 6.470410100690494e-07, + "loss": 1.1999, + "step": 2243 + }, + { + "epoch": 4.740232312565998, + "grad_norm": 0.09086783202747167, + "learning_rate": 6.365053591640058e-07, + "loss": 1.202, + "step": 2244 + }, + { + "epoch": 4.74234424498416, + "grad_norm": 0.09141902486730022, + "learning_rate": 6.260555009058289e-07, + "loss": 1.2329, + "step": 2245 + }, + { + "epoch": 4.7444561774023235, + "grad_norm": 0.08902685831438867, + "learning_rate": 6.15691458069958e-07, + "loss": 1.2201, + "step": 2246 + }, + { + "epoch": 4.746568109820486, + "grad_norm": 0.08444122752194314, + "learning_rate": 6.054132532448087e-07, + "loss": 1.2013, + "step": 2247 + }, + { + "epoch": 4.748680042238648, + "grad_norm": 0.13336734931326827, + "learning_rate": 5.952209088316974e-07, + "loss": 1.215, + "step": 2248 + }, + { + "epoch": 4.750791974656811, + "grad_norm": 0.12212151485488357, + "learning_rate": 5.851144470448144e-07, + "loss": 1.2037, + "step": 2249 + }, + { + "epoch": 4.752903907074973, + "grad_norm": 0.0898997630138174, + "learning_rate": 5.750938899111802e-07, + "loss": 1.1904, + "step": 2250 + }, + { + "epoch": 4.7550158394931366, + "grad_norm": 0.10474090591298117, + "learning_rate": 5.651592592705646e-07, + "loss": 1.2018, + "step": 2251 + }, + { + "epoch": 4.757127771911299, + "grad_norm": 0.1058911059829169, + "learning_rate": 5.55310576775483e-07, + "loss": 1.2141, + "step": 2252 + }, + { + "epoch": 4.759239704329461, + "grad_norm": 0.11136485930468229, + "learning_rate": 5.455478638911071e-07, + "loss": 1.2144, + "step": 2253 + }, + { + "epoch": 4.761351636747624, + "grad_norm": 0.2903068660891, + "learning_rate": 5.358711418952523e-07, + "loss": 1.2238, + "step": 2254 + }, + { + "epoch": 4.763463569165786, + "grad_norm": 0.09760369884743833, + "learning_rate": 5.2628043187831e-07, + "loss": 1.2131, + "step": 2255 + }, + { + "epoch": 4.76557550158395, + "grad_norm": 0.09899283589262856, + "learning_rate": 5.167757547432129e-07, + "loss": 1.196, + "step": 2256 + }, + { + "epoch": 4.767687434002112, + "grad_norm": 0.12511465473828828, + "learning_rate": 5.073571312053816e-07, + "loss": 1.2045, + "step": 2257 + }, + { + "epoch": 4.7697993664202745, + "grad_norm": 0.08097219428124978, + "learning_rate": 4.980245817926887e-07, + "loss": 1.2144, + "step": 2258 + }, + { + "epoch": 4.771911298838437, + "grad_norm": 0.08504354250700975, + "learning_rate": 4.887781268453973e-07, + "loss": 1.2168, + "step": 2259 + }, + { + "epoch": 4.774023231256599, + "grad_norm": 0.08824447273479105, + "learning_rate": 4.796177865161378e-07, + "loss": 1.2112, + "step": 2260 + }, + { + "epoch": 4.776135163674763, + "grad_norm": 0.09575374787105131, + "learning_rate": 4.7054358076985553e-07, + "loss": 1.2061, + "step": 2261 + }, + { + "epoch": 4.778247096092925, + "grad_norm": 0.1268290058109381, + "learning_rate": 4.61555529383757e-07, + "loss": 1.2171, + "step": 2262 + }, + { + "epoch": 4.7803590285110875, + "grad_norm": 0.11262215401851912, + "learning_rate": 4.5265365194727906e-07, + "loss": 1.233, + "step": 2263 + }, + { + "epoch": 4.78247096092925, + "grad_norm": 0.08575807137779157, + "learning_rate": 4.4383796786204414e-07, + "loss": 1.2153, + "step": 2264 + }, + { + "epoch": 4.784582893347413, + "grad_norm": 0.09280671543117167, + "learning_rate": 4.3510849634181176e-07, + "loss": 1.2236, + "step": 2265 + }, + { + "epoch": 4.786694825765576, + "grad_norm": 0.08651536349214572, + "learning_rate": 4.2646525641245605e-07, + "loss": 1.1974, + "step": 2266 + }, + { + "epoch": 4.788806758183738, + "grad_norm": 0.09769865429967384, + "learning_rate": 4.1790826691189055e-07, + "loss": 1.1931, + "step": 2267 + }, + { + "epoch": 4.790918690601901, + "grad_norm": 0.09683721266346838, + "learning_rate": 4.094375464900546e-07, + "loss": 1.2266, + "step": 2268 + }, + { + "epoch": 4.793030623020063, + "grad_norm": 0.09781453004833127, + "learning_rate": 4.010531136088691e-07, + "loss": 1.1998, + "step": 2269 + }, + { + "epoch": 4.795142555438226, + "grad_norm": 0.08963633268836847, + "learning_rate": 3.9275498654217425e-07, + "loss": 1.2165, + "step": 2270 + }, + { + "epoch": 4.797254487856389, + "grad_norm": 0.087273571031155, + "learning_rate": 3.845431833757385e-07, + "loss": 1.2005, + "step": 2271 + }, + { + "epoch": 4.799366420274551, + "grad_norm": 0.11544386309234086, + "learning_rate": 3.7641772200714745e-07, + "loss": 1.214, + "step": 2272 + }, + { + "epoch": 4.801478352692714, + "grad_norm": 0.10934567169158294, + "learning_rate": 3.683786201458439e-07, + "loss": 1.2217, + "step": 2273 + }, + { + "epoch": 4.803590285110877, + "grad_norm": 0.08709564893636469, + "learning_rate": 3.6042589531301683e-07, + "loss": 1.2053, + "step": 2274 + }, + { + "epoch": 4.805702217529039, + "grad_norm": 0.09756272111531798, + "learning_rate": 3.525595648416191e-07, + "loss": 1.2057, + "step": 2275 + }, + { + "epoch": 4.807814149947202, + "grad_norm": 0.08915272128200992, + "learning_rate": 3.447796458763009e-07, + "loss": 1.2049, + "step": 2276 + }, + { + "epoch": 4.809926082365364, + "grad_norm": 0.09202144070740916, + "learning_rate": 3.370861553733784e-07, + "loss": 1.2019, + "step": 2277 + }, + { + "epoch": 4.812038014783527, + "grad_norm": 0.10867732134740843, + "learning_rate": 3.294791101007944e-07, + "loss": 1.2022, + "step": 2278 + }, + { + "epoch": 4.81414994720169, + "grad_norm": 0.11356956232248583, + "learning_rate": 3.2195852663808644e-07, + "loss": 1.1983, + "step": 2279 + }, + { + "epoch": 4.816261879619852, + "grad_norm": 0.11426181244547283, + "learning_rate": 3.1452442137634763e-07, + "loss": 1.2151, + "step": 2280 + }, + { + "epoch": 4.818373812038015, + "grad_norm": 0.09277220654317034, + "learning_rate": 3.0717681051819935e-07, + "loss": 1.2007, + "step": 2281 + }, + { + "epoch": 4.820485744456177, + "grad_norm": 0.09829176673082482, + "learning_rate": 2.999157100777339e-07, + "loss": 1.2208, + "step": 2282 + }, + { + "epoch": 4.82259767687434, + "grad_norm": 0.09249924527885615, + "learning_rate": 2.927411358805099e-07, + "loss": 1.2107, + "step": 2283 + }, + { + "epoch": 4.824709609292503, + "grad_norm": 0.08250622599113858, + "learning_rate": 2.8565310356349465e-07, + "loss": 1.2122, + "step": 2284 + }, + { + "epoch": 4.8268215417106655, + "grad_norm": 0.0860611455796367, + "learning_rate": 2.786516285750374e-07, + "loss": 1.2158, + "step": 2285 + }, + { + "epoch": 4.828933474128828, + "grad_norm": 0.10886081208542991, + "learning_rate": 2.7173672617483825e-07, + "loss": 1.1874, + "step": 2286 + }, + { + "epoch": 4.83104540654699, + "grad_norm": 0.11156209374443159, + "learning_rate": 2.649084114339084e-07, + "loss": 1.2071, + "step": 2287 + }, + { + "epoch": 4.833157338965153, + "grad_norm": 0.09989087332336705, + "learning_rate": 2.5816669923454774e-07, + "loss": 1.23, + "step": 2288 + }, + { + "epoch": 4.835269271383316, + "grad_norm": 0.10403612652556571, + "learning_rate": 2.5151160427029584e-07, + "loss": 1.2027, + "step": 2289 + }, + { + "epoch": 4.8373812038014785, + "grad_norm": 0.08486689211399558, + "learning_rate": 2.4494314104592796e-07, + "loss": 1.1944, + "step": 2290 + }, + { + "epoch": 4.839493136219641, + "grad_norm": 0.08814265872325379, + "learning_rate": 2.3846132387738363e-07, + "loss": 1.2119, + "step": 2291 + }, + { + "epoch": 4.841605068637803, + "grad_norm": 0.09752654576402599, + "learning_rate": 2.3206616689177564e-07, + "loss": 1.1818, + "step": 2292 + }, + { + "epoch": 4.843717001055966, + "grad_norm": 0.10360504761897507, + "learning_rate": 2.2575768402733234e-07, + "loss": 1.2224, + "step": 2293 + }, + { + "epoch": 4.845828933474129, + "grad_norm": 0.0950536408709072, + "learning_rate": 2.195358890333754e-07, + "loss": 1.202, + "step": 2294 + }, + { + "epoch": 4.847940865892292, + "grad_norm": 0.09234544388512557, + "learning_rate": 2.134007954702977e-07, + "loss": 1.2153, + "step": 2295 + }, + { + "epoch": 4.850052798310454, + "grad_norm": 0.10868417116474952, + "learning_rate": 2.0735241670951867e-07, + "loss": 1.2071, + "step": 2296 + }, + { + "epoch": 4.852164730728616, + "grad_norm": 0.10892549591123751, + "learning_rate": 2.0139076593346242e-07, + "loss": 1.2144, + "step": 2297 + }, + { + "epoch": 4.854276663146779, + "grad_norm": 0.0859230918840511, + "learning_rate": 1.955158561355397e-07, + "loss": 1.2034, + "step": 2298 + }, + { + "epoch": 4.856388595564942, + "grad_norm": 0.08755383138574804, + "learning_rate": 1.8972770012010367e-07, + "loss": 1.2003, + "step": 2299 + }, + { + "epoch": 4.858500527983105, + "grad_norm": 0.11532312218222748, + "learning_rate": 1.840263105024187e-07, + "loss": 1.2043, + "step": 2300 + }, + { + "epoch": 4.860612460401267, + "grad_norm": 0.08729263988677757, + "learning_rate": 1.7841169970866047e-07, + "loss": 1.1999, + "step": 2301 + }, + { + "epoch": 4.8627243928194295, + "grad_norm": 0.09049484366671302, + "learning_rate": 1.728838799758492e-07, + "loss": 1.2127, + "step": 2302 + }, + { + "epoch": 4.864836325237592, + "grad_norm": 0.10330640439937368, + "learning_rate": 1.6744286335186766e-07, + "loss": 1.2034, + "step": 2303 + }, + { + "epoch": 4.866948257655755, + "grad_norm": 0.09170918180161815, + "learning_rate": 1.6208866169538983e-07, + "loss": 1.2219, + "step": 2304 + }, + { + "epoch": 4.869060190073918, + "grad_norm": 0.11459976041980874, + "learning_rate": 1.5682128667589002e-07, + "loss": 1.2147, + "step": 2305 + }, + { + "epoch": 4.87117212249208, + "grad_norm": 0.12116690669951602, + "learning_rate": 1.5164074977360278e-07, + "loss": 1.2179, + "step": 2306 + }, + { + "epoch": 4.8732840549102425, + "grad_norm": 0.24462955075721146, + "learning_rate": 1.4654706227948734e-07, + "loss": 1.2132, + "step": 2307 + }, + { + "epoch": 4.875395987328406, + "grad_norm": 0.11598235163761247, + "learning_rate": 1.4154023529523663e-07, + "loss": 1.2226, + "step": 2308 + }, + { + "epoch": 4.877507919746568, + "grad_norm": 0.08733678126580055, + "learning_rate": 1.3662027973320614e-07, + "loss": 1.2129, + "step": 2309 + }, + { + "epoch": 4.879619852164731, + "grad_norm": 0.08441366872437057, + "learning_rate": 1.3178720631643604e-07, + "loss": 1.2057, + "step": 2310 + }, + { + "epoch": 4.881731784582893, + "grad_norm": 0.08818166406690325, + "learning_rate": 1.2704102557859366e-07, + "loss": 1.2132, + "step": 2311 + }, + { + "epoch": 4.8838437170010565, + "grad_norm": 0.08835246720599585, + "learning_rate": 1.223817478639777e-07, + "loss": 1.2139, + "step": 2312 + }, + { + "epoch": 4.885955649419219, + "grad_norm": 0.08380635965212241, + "learning_rate": 1.1780938332746517e-07, + "loss": 1.202, + "step": 2313 + }, + { + "epoch": 4.888067581837381, + "grad_norm": 0.08659485338982816, + "learning_rate": 1.1332394193451557e-07, + "loss": 1.2165, + "step": 2314 + }, + { + "epoch": 4.890179514255544, + "grad_norm": 0.12065631041825903, + "learning_rate": 1.0892543346114004e-07, + "loss": 1.2054, + "step": 2315 + }, + { + "epoch": 4.892291446673706, + "grad_norm": 0.12350325580471763, + "learning_rate": 1.0461386749387458e-07, + "loss": 1.2159, + "step": 2316 + }, + { + "epoch": 4.8944033790918695, + "grad_norm": 0.12028290346641379, + "learning_rate": 1.0038925342977123e-07, + "loss": 1.2244, + "step": 2317 + }, + { + "epoch": 4.896515311510032, + "grad_norm": 0.0851784752164222, + "learning_rate": 9.62516004763625e-08, + "loss": 1.2063, + "step": 2318 + }, + { + "epoch": 4.898627243928194, + "grad_norm": 0.09649411905983231, + "learning_rate": 9.220091765165695e-08, + "loss": 1.2244, + "step": 2319 + }, + { + "epoch": 4.900739176346357, + "grad_norm": 0.08440375415425161, + "learning_rate": 8.82372137841081e-08, + "loss": 1.2024, + "step": 2320 + }, + { + "epoch": 4.902851108764519, + "grad_norm": 0.10059705701953012, + "learning_rate": 8.436049751260111e-08, + "loss": 1.1905, + "step": 2321 + }, + { + "epoch": 4.904963041182683, + "grad_norm": 0.09055704742437022, + "learning_rate": 8.057077728642615e-08, + "loss": 1.2109, + "step": 2322 + }, + { + "epoch": 4.907074973600845, + "grad_norm": 0.08857750681770395, + "learning_rate": 7.686806136527391e-08, + "loss": 1.2132, + "step": 2323 + }, + { + "epoch": 4.909186906019007, + "grad_norm": 0.08287296357069605, + "learning_rate": 7.325235781920459e-08, + "loss": 1.1973, + "step": 2324 + }, + { + "epoch": 4.91129883843717, + "grad_norm": 0.08140662864727649, + "learning_rate": 6.972367452863005e-08, + "loss": 1.2032, + "step": 2325 + }, + { + "epoch": 4.913410770855332, + "grad_norm": 0.11207406770297139, + "learning_rate": 6.628201918431831e-08, + "loss": 1.2093, + "step": 2326 + }, + { + "epoch": 4.915522703273496, + "grad_norm": 0.1085036363780518, + "learning_rate": 6.292739928733582e-08, + "loss": 1.1958, + "step": 2327 + }, + { + "epoch": 4.917634635691658, + "grad_norm": 0.11960054045970155, + "learning_rate": 5.965982214908294e-08, + "loss": 1.2162, + "step": 2328 + }, + { + "epoch": 4.9197465681098205, + "grad_norm": 0.10456261704653298, + "learning_rate": 5.647929489122739e-08, + "loss": 1.219, + "step": 2329 + }, + { + "epoch": 4.921858500527983, + "grad_norm": 0.09255670965737642, + "learning_rate": 5.3385824445721935e-08, + "loss": 1.2172, + "step": 2330 + }, + { + "epoch": 4.923970432946145, + "grad_norm": 0.08616666611432097, + "learning_rate": 5.037941755478226e-08, + "loss": 1.2091, + "step": 2331 + }, + { + "epoch": 4.926082365364309, + "grad_norm": 0.0866674224049509, + "learning_rate": 4.746008077086028e-08, + "loss": 1.201, + "step": 2332 + }, + { + "epoch": 4.928194297782471, + "grad_norm": 0.0945483659539817, + "learning_rate": 4.4627820456648594e-08, + "loss": 1.1992, + "step": 2333 + }, + { + "epoch": 4.9303062302006335, + "grad_norm": 0.09039613135688769, + "learning_rate": 4.188264278504939e-08, + "loss": 1.1954, + "step": 2334 + }, + { + "epoch": 4.932418162618796, + "grad_norm": 0.08619368207482583, + "learning_rate": 3.922455373917444e-08, + "loss": 1.2054, + "step": 2335 + }, + { + "epoch": 4.934530095036958, + "grad_norm": 0.08659123490413242, + "learning_rate": 3.6653559112318495e-08, + "loss": 1.207, + "step": 2336 + }, + { + "epoch": 4.936642027455122, + "grad_norm": 0.08487823104793037, + "learning_rate": 3.416966450795922e-08, + "loss": 1.1928, + "step": 2337 + }, + { + "epoch": 4.938753959873284, + "grad_norm": 0.07872394928400044, + "learning_rate": 3.1772875339739495e-08, + "loss": 1.2096, + "step": 2338 + }, + { + "epoch": 4.940865892291447, + "grad_norm": 0.09153556623514689, + "learning_rate": 2.9463196831458486e-08, + "loss": 1.1982, + "step": 2339 + }, + { + "epoch": 4.942977824709609, + "grad_norm": 0.10394435984337934, + "learning_rate": 2.72406340170539e-08, + "loss": 1.1972, + "step": 2340 + }, + { + "epoch": 4.945089757127771, + "grad_norm": 0.12333915443310656, + "learning_rate": 2.5105191740597557e-08, + "loss": 1.2045, + "step": 2341 + }, + { + "epoch": 4.947201689545935, + "grad_norm": 0.11848643531436136, + "learning_rate": 2.3056874656277596e-08, + "loss": 1.2076, + "step": 2342 + }, + { + "epoch": 4.949313621964097, + "grad_norm": 0.10755415290757411, + "learning_rate": 2.10956872283985e-08, + "loss": 1.2121, + "step": 2343 + }, + { + "epoch": 4.95142555438226, + "grad_norm": 0.10873411914761923, + "learning_rate": 1.922163373135888e-08, + "loss": 1.2313, + "step": 2344 + }, + { + "epoch": 4.953537486800422, + "grad_norm": 0.10723776306690906, + "learning_rate": 1.7434718249664807e-08, + "loss": 1.2248, + "step": 2345 + }, + { + "epoch": 4.955649419218585, + "grad_norm": 0.08061065295287793, + "learning_rate": 1.5734944677885388e-08, + "loss": 1.2073, + "step": 2346 + }, + { + "epoch": 4.957761351636748, + "grad_norm": 0.0835699468274866, + "learning_rate": 1.412231672067499e-08, + "loss": 1.2059, + "step": 2347 + }, + { + "epoch": 4.95987328405491, + "grad_norm": 0.08619296762372283, + "learning_rate": 1.2596837892755453e-08, + "loss": 1.2251, + "step": 2348 + }, + { + "epoch": 4.961985216473073, + "grad_norm": 0.08515607934013258, + "learning_rate": 1.1158511518902793e-08, + "loss": 1.2072, + "step": 2349 + }, + { + "epoch": 4.964097148891235, + "grad_norm": 0.08679936823130462, + "learning_rate": 9.807340733947179e-09, + "loss": 1.2077, + "step": 2350 + }, + { + "epoch": 4.966209081309398, + "grad_norm": 0.08717174754060683, + "learning_rate": 8.543328482759627e-09, + "loss": 1.2255, + "step": 2351 + }, + { + "epoch": 4.968321013727561, + "grad_norm": 0.08354454462515311, + "learning_rate": 7.366477520251991e-09, + "loss": 1.2137, + "step": 2352 + }, + { + "epoch": 4.970432946145723, + "grad_norm": 0.0844431967974954, + "learning_rate": 6.276790411372524e-09, + "loss": 1.2207, + "step": 2353 + }, + { + "epoch": 4.972544878563886, + "grad_norm": 0.08168340565640805, + "learning_rate": 5.274269531088116e-09, + "loss": 1.211, + "step": 2354 + }, + { + "epoch": 4.974656810982049, + "grad_norm": 0.10184029641742327, + "learning_rate": 4.358917064388734e-09, + "loss": 1.2114, + "step": 2355 + }, + { + "epoch": 4.9767687434002115, + "grad_norm": 0.0837451878197697, + "learning_rate": 3.530735006291863e-09, + "loss": 1.1913, + "step": 2356 + }, + { + "epoch": 4.978880675818374, + "grad_norm": 0.08471798054573139, + "learning_rate": 2.7897251618069777e-09, + "loss": 1.1976, + "step": 2357 + }, + { + "epoch": 4.980992608236536, + "grad_norm": 0.08840583583803327, + "learning_rate": 2.1358891459710707e-09, + "loss": 1.2049, + "step": 2358 + }, + { + "epoch": 4.983104540654699, + "grad_norm": 0.08583363886375359, + "learning_rate": 1.5692283838131262e-09, + "loss": 1.2043, + "step": 2359 + }, + { + "epoch": 4.985216473072862, + "grad_norm": 0.08998320526132073, + "learning_rate": 1.0897441103763228e-09, + "loss": 1.2166, + "step": 2360 + }, + { + "epoch": 4.9873284054910245, + "grad_norm": 0.10243379533008468, + "learning_rate": 6.974373706869486e-10, + "loss": 1.2099, + "step": 2361 + }, + { + "epoch": 4.989440337909187, + "grad_norm": 0.08681527069163769, + "learning_rate": 3.923090197810453e-10, + "loss": 1.1971, + "step": 2362 + }, + { + "epoch": 4.991552270327349, + "grad_norm": 0.09193518045024805, + "learning_rate": 1.7435972268664559e-10, + "loss": 1.2077, + "step": 2363 + }, + { + "epoch": 4.993664202745512, + "grad_norm": 0.09027871293113858, + "learning_rate": 4.3589954423772784e-11, + "loss": 1.1872, + "step": 2364 + }, + { + "epoch": 4.995776135163675, + "grad_norm": 0.08310033529765323, + "learning_rate": 0.0, + "loss": 1.2252, + "step": 2365 + }, + { + "epoch": 4.995776135163675, + "step": 2365, + "total_flos": 5.680390465230838e+19, + "train_loss": 1.2719324281805413, + "train_runtime": 115420.1008, + "train_samples_per_second": 10.5, + "train_steps_per_second": 0.02 + } + ], + "logging_steps": 1, + "max_steps": 2365, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.680390465230838e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}