diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6891 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 4282, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0011676786548341896, + "grad_norm": 2.834435044274876, + "learning_rate": 1.1627906976744186e-06, + "loss": 1.1275, + "mean_token_accuracy": 0.7145514488220215, + "step": 5 + }, + { + "epoch": 0.002335357309668379, + "grad_norm": 2.379978268884163, + "learning_rate": 2.325581395348837e-06, + "loss": 1.1118, + "mean_token_accuracy": 0.7183676719665527, + "step": 10 + }, + { + "epoch": 0.003503035964502569, + "grad_norm": 1.7880142996198274, + "learning_rate": 3.488372093023256e-06, + "loss": 1.0824, + "mean_token_accuracy": 0.7222420334815979, + "step": 15 + }, + { + "epoch": 0.004670714619336758, + "grad_norm": 1.436920791294609, + "learning_rate": 4.651162790697674e-06, + "loss": 1.0159, + "mean_token_accuracy": 0.7317505478858948, + "step": 20 + }, + { + "epoch": 0.005838393274170948, + "grad_norm": 0.8898514868579989, + "learning_rate": 5.8139534883720935e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7411599397659302, + "step": 25 + }, + { + "epoch": 0.007006071929005138, + "grad_norm": 0.9215782187302078, + "learning_rate": 6.976744186046512e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7415176510810852, + "step": 30 + }, + { + "epoch": 0.008173750583839328, + "grad_norm": 0.7526398720937829, + "learning_rate": 8.139534883720931e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.7485355615615845, + "step": 35 + }, + { + "epoch": 0.009341429238673517, + "grad_norm": 0.6798256778267421, + "learning_rate": 9.302325581395349e-06, + "loss": 0.8932, + "mean_token_accuracy": 0.7499055147171021, + "step": 40 + }, + { + "epoch": 0.010509107893507707, + "grad_norm": 0.639003387303369, + "learning_rate": 1.0465116279069768e-05, + "loss": 0.8718, + "mean_token_accuracy": 0.754253089427948, + "step": 45 + }, + { + "epoch": 0.011676786548341896, + "grad_norm": 0.600206765612218, + "learning_rate": 1.1627906976744187e-05, + "loss": 0.8591, + "mean_token_accuracy": 0.7566048502922058, + "step": 50 + }, + { + "epoch": 0.012844465203176086, + "grad_norm": 0.5941698118090386, + "learning_rate": 1.2790697674418606e-05, + "loss": 0.8489, + "mean_token_accuracy": 0.7596176505088806, + "step": 55 + }, + { + "epoch": 0.014012143858010275, + "grad_norm": 0.5576117273804396, + "learning_rate": 1.3953488372093024e-05, + "loss": 0.836, + "mean_token_accuracy": 0.7622167348861695, + "step": 60 + }, + { + "epoch": 0.015179822512844466, + "grad_norm": 0.5803269046021847, + "learning_rate": 1.5116279069767441e-05, + "loss": 0.8278, + "mean_token_accuracy": 0.7640398144721985, + "step": 65 + }, + { + "epoch": 0.016347501167678656, + "grad_norm": 0.6207517481816065, + "learning_rate": 1.6279069767441862e-05, + "loss": 0.8217, + "mean_token_accuracy": 0.764385211467743, + "step": 70 + }, + { + "epoch": 0.017515179822512845, + "grad_norm": 0.6156865313111355, + "learning_rate": 1.744186046511628e-05, + "loss": 0.81, + "mean_token_accuracy": 0.767782461643219, + "step": 75 + }, + { + "epoch": 0.018682858477347034, + "grad_norm": 0.6259615332834731, + "learning_rate": 1.8604651162790697e-05, + "loss": 0.8154, + "mean_token_accuracy": 0.7665858030319214, + "step": 80 + }, + { + "epoch": 0.019850537132181222, + "grad_norm": 0.5709616517741386, + "learning_rate": 1.9767441860465116e-05, + "loss": 0.804, + "mean_token_accuracy": 0.7685602903366089, + "step": 85 + }, + { + "epoch": 0.021018215787015414, + "grad_norm": 0.5967433985562557, + "learning_rate": 2.0930232558139536e-05, + "loss": 0.8284, + "mean_token_accuracy": 0.7630936026573181, + "step": 90 + }, + { + "epoch": 0.022185894441849603, + "grad_norm": 0.6913013676610581, + "learning_rate": 2.2093023255813955e-05, + "loss": 0.8117, + "mean_token_accuracy": 0.7667965531349182, + "step": 95 + }, + { + "epoch": 0.023353573096683792, + "grad_norm": 0.6181608339151498, + "learning_rate": 2.3255813953488374e-05, + "loss": 0.7886, + "mean_token_accuracy": 0.7722474217414856, + "step": 100 + }, + { + "epoch": 0.02452125175151798, + "grad_norm": 0.6358092710241181, + "learning_rate": 2.441860465116279e-05, + "loss": 0.8135, + "mean_token_accuracy": 0.7663555383682251, + "step": 105 + }, + { + "epoch": 0.025688930406352173, + "grad_norm": 0.611586922387333, + "learning_rate": 2.5581395348837212e-05, + "loss": 0.787, + "mean_token_accuracy": 0.7728439092636108, + "step": 110 + }, + { + "epoch": 0.02685660906118636, + "grad_norm": 0.5854030201826317, + "learning_rate": 2.674418604651163e-05, + "loss": 0.798, + "mean_token_accuracy": 0.7691318392753601, + "step": 115 + }, + { + "epoch": 0.02802428771602055, + "grad_norm": 0.6124505325375188, + "learning_rate": 2.7906976744186048e-05, + "loss": 0.7785, + "mean_token_accuracy": 0.7739496827125549, + "step": 120 + }, + { + "epoch": 0.029191966370854742, + "grad_norm": 0.6741670015168594, + "learning_rate": 2.9069767441860467e-05, + "loss": 0.8066, + "mean_token_accuracy": 0.7667750120162964, + "step": 125 + }, + { + "epoch": 0.03035964502568893, + "grad_norm": 0.7213611745036309, + "learning_rate": 3.0232558139534883e-05, + "loss": 0.8087, + "mean_token_accuracy": 0.7660662293434143, + "step": 130 + }, + { + "epoch": 0.03152732368052312, + "grad_norm": 0.6462301442485545, + "learning_rate": 3.13953488372093e-05, + "loss": 0.79, + "mean_token_accuracy": 0.7716695904731751, + "step": 135 + }, + { + "epoch": 0.03269500233535731, + "grad_norm": 0.7200856075155071, + "learning_rate": 3.2558139534883724e-05, + "loss": 0.7874, + "mean_token_accuracy": 0.7721680879592896, + "step": 140 + }, + { + "epoch": 0.0338626809901915, + "grad_norm": 0.653098157465906, + "learning_rate": 3.372093023255814e-05, + "loss": 0.8035, + "mean_token_accuracy": 0.7668708443641663, + "step": 145 + }, + { + "epoch": 0.03503035964502569, + "grad_norm": 0.6564669867965367, + "learning_rate": 3.488372093023256e-05, + "loss": 0.781, + "mean_token_accuracy": 0.7731781840324402, + "step": 150 + }, + { + "epoch": 0.03619803829985988, + "grad_norm": 0.716934400421906, + "learning_rate": 3.604651162790698e-05, + "loss": 0.794, + "mean_token_accuracy": 0.7690209746360779, + "step": 155 + }, + { + "epoch": 0.03736571695469407, + "grad_norm": 0.6844706080683889, + "learning_rate": 3.7209302325581394e-05, + "loss": 0.7816, + "mean_token_accuracy": 0.7729620218276978, + "step": 160 + }, + { + "epoch": 0.03853339560952826, + "grad_norm": 0.7045578876190589, + "learning_rate": 3.837209302325582e-05, + "loss": 0.7827, + "mean_token_accuracy": 0.7726161241531372, + "step": 165 + }, + { + "epoch": 0.039701074264362445, + "grad_norm": 0.6766171407600107, + "learning_rate": 3.953488372093023e-05, + "loss": 0.7892, + "mean_token_accuracy": 0.7705170154571533, + "step": 170 + }, + { + "epoch": 0.04086875291919664, + "grad_norm": 0.7371951510285583, + "learning_rate": 4.0697674418604655e-05, + "loss": 0.7763, + "mean_token_accuracy": 0.7734784007072448, + "step": 175 + }, + { + "epoch": 0.04203643157403083, + "grad_norm": 0.6100530022358429, + "learning_rate": 4.186046511627907e-05, + "loss": 0.7893, + "mean_token_accuracy": 0.7704616069793702, + "step": 180 + }, + { + "epoch": 0.043204110228865014, + "grad_norm": 0.6275364687160232, + "learning_rate": 4.302325581395349e-05, + "loss": 0.7853, + "mean_token_accuracy": 0.7720511078834533, + "step": 185 + }, + { + "epoch": 0.044371788883699206, + "grad_norm": 0.692327582566576, + "learning_rate": 4.418604651162791e-05, + "loss": 0.7611, + "mean_token_accuracy": 0.7766433358192444, + "step": 190 + }, + { + "epoch": 0.0455394675385334, + "grad_norm": 0.6133073681625562, + "learning_rate": 4.5348837209302326e-05, + "loss": 0.7825, + "mean_token_accuracy": 0.7726439476013184, + "step": 195 + }, + { + "epoch": 0.046707146193367584, + "grad_norm": 0.590175861731042, + "learning_rate": 4.651162790697675e-05, + "loss": 0.784, + "mean_token_accuracy": 0.7719995737075805, + "step": 200 + }, + { + "epoch": 0.047874824848201776, + "grad_norm": 0.6364643760793014, + "learning_rate": 4.7674418604651164e-05, + "loss": 0.7896, + "mean_token_accuracy": 0.7697775363922119, + "step": 205 + }, + { + "epoch": 0.04904250350303596, + "grad_norm": 0.6106582081588859, + "learning_rate": 4.883720930232558e-05, + "loss": 0.7668, + "mean_token_accuracy": 0.776114547252655, + "step": 210 + }, + { + "epoch": 0.050210182157870153, + "grad_norm": 0.6270003956771206, + "learning_rate": 5e-05, + "loss": 0.7833, + "mean_token_accuracy": 0.7716834068298339, + "step": 215 + }, + { + "epoch": 0.051377860812704346, + "grad_norm": 0.7360280961349297, + "learning_rate": 4.9999832180125564e-05, + "loss": 0.7774, + "mean_token_accuracy": 0.7732850790023804, + "step": 220 + }, + { + "epoch": 0.05254553946753853, + "grad_norm": 0.7353478083649229, + "learning_rate": 4.999932872300567e-05, + "loss": 0.7802, + "mean_token_accuracy": 0.7725408196449279, + "step": 225 + }, + { + "epoch": 0.05371321812237272, + "grad_norm": 0.6344157021307477, + "learning_rate": 4.9998489636150545e-05, + "loss": 0.7859, + "mean_token_accuracy": 0.7699117660522461, + "step": 230 + }, + { + "epoch": 0.054880896777206915, + "grad_norm": 0.7889478422297246, + "learning_rate": 4.999731493207714e-05, + "loss": 0.7686, + "mean_token_accuracy": 0.7756441354751586, + "step": 235 + }, + { + "epoch": 0.0560485754320411, + "grad_norm": 0.6637268168217192, + "learning_rate": 4.999580462830887e-05, + "loss": 0.7789, + "mean_token_accuracy": 0.772291648387909, + "step": 240 + }, + { + "epoch": 0.05721625408687529, + "grad_norm": 0.7011123056433453, + "learning_rate": 4.999395874737543e-05, + "loss": 0.7733, + "mean_token_accuracy": 0.7734087228775024, + "step": 245 + }, + { + "epoch": 0.058383932741709485, + "grad_norm": 0.5521943639172412, + "learning_rate": 4.9991777316812435e-05, + "loss": 0.7519, + "mean_token_accuracy": 0.7793730735778809, + "step": 250 + }, + { + "epoch": 0.05955161139654367, + "grad_norm": 0.6422059991104521, + "learning_rate": 4.998926036916096e-05, + "loss": 0.7801, + "mean_token_accuracy": 0.7724045991897583, + "step": 255 + }, + { + "epoch": 0.06071929005137786, + "grad_norm": 0.6278282605028872, + "learning_rate": 4.9986407941967145e-05, + "loss": 0.767, + "mean_token_accuracy": 0.7750486612319947, + "step": 260 + }, + { + "epoch": 0.06188696870621205, + "grad_norm": 0.6342738916218161, + "learning_rate": 4.998322007778156e-05, + "loss": 0.7727, + "mean_token_accuracy": 0.7736498475074768, + "step": 265 + }, + { + "epoch": 0.06305464736104624, + "grad_norm": 0.6201069276831243, + "learning_rate": 4.9979696824158613e-05, + "loss": 0.7737, + "mean_token_accuracy": 0.7746302485466003, + "step": 270 + }, + { + "epoch": 0.06422232601588043, + "grad_norm": 0.5583443600253161, + "learning_rate": 4.997583823365579e-05, + "loss": 0.7716, + "mean_token_accuracy": 0.7740055322647095, + "step": 275 + }, + { + "epoch": 0.06539000467071462, + "grad_norm": 0.5651120729666149, + "learning_rate": 4.997164436383294e-05, + "loss": 0.7684, + "mean_token_accuracy": 0.7752025604248047, + "step": 280 + }, + { + "epoch": 0.0665576833255488, + "grad_norm": 0.6779084902542014, + "learning_rate": 4.996711527725137e-05, + "loss": 0.7401, + "mean_token_accuracy": 0.7824846148490906, + "step": 285 + }, + { + "epoch": 0.067725361980383, + "grad_norm": 0.5508336290031768, + "learning_rate": 4.9962251041472936e-05, + "loss": 0.7569, + "mean_token_accuracy": 0.7779935479164124, + "step": 290 + }, + { + "epoch": 0.06889304063521719, + "grad_norm": 0.578899793990126, + "learning_rate": 4.9957051729058994e-05, + "loss": 0.7698, + "mean_token_accuracy": 0.7752918124198913, + "step": 295 + }, + { + "epoch": 0.07006071929005138, + "grad_norm": 0.5523380204960521, + "learning_rate": 4.9951517417569365e-05, + "loss": 0.7698, + "mean_token_accuracy": 0.773857319355011, + "step": 300 + }, + { + "epoch": 0.07122839794488557, + "grad_norm": 0.591829362319793, + "learning_rate": 4.994564818956116e-05, + "loss": 0.7734, + "mean_token_accuracy": 0.7735624313354492, + "step": 305 + }, + { + "epoch": 0.07239607659971976, + "grad_norm": 0.6267401802622065, + "learning_rate": 4.993944413258755e-05, + "loss": 0.766, + "mean_token_accuracy": 0.7749941229820252, + "step": 310 + }, + { + "epoch": 0.07356375525455394, + "grad_norm": 0.5165219803325648, + "learning_rate": 4.993290533919644e-05, + "loss": 0.7486, + "mean_token_accuracy": 0.7796483039855957, + "step": 315 + }, + { + "epoch": 0.07473143390938813, + "grad_norm": 0.578072281108687, + "learning_rate": 4.9926031906929114e-05, + "loss": 0.7666, + "mean_token_accuracy": 0.7752916693687439, + "step": 320 + }, + { + "epoch": 0.07589911256422233, + "grad_norm": 0.5293575565087292, + "learning_rate": 4.9918823938318796e-05, + "loss": 0.7625, + "mean_token_accuracy": 0.7761589884757996, + "step": 325 + }, + { + "epoch": 0.07706679121905652, + "grad_norm": 0.6252879783919262, + "learning_rate": 4.991128154088906e-05, + "loss": 0.7679, + "mean_token_accuracy": 0.7747756361961364, + "step": 330 + }, + { + "epoch": 0.07823446987389071, + "grad_norm": 0.5504762240468136, + "learning_rate": 4.990340482715228e-05, + "loss": 0.7506, + "mean_token_accuracy": 0.7791156768798828, + "step": 335 + }, + { + "epoch": 0.07940214852872489, + "grad_norm": 0.6694871923654819, + "learning_rate": 4.989519391460794e-05, + "loss": 0.7776, + "mean_token_accuracy": 0.772446084022522, + "step": 340 + }, + { + "epoch": 0.08056982718355908, + "grad_norm": 0.5408339323253566, + "learning_rate": 4.988664892574086e-05, + "loss": 0.7475, + "mean_token_accuracy": 0.7793663501739502, + "step": 345 + }, + { + "epoch": 0.08173750583839327, + "grad_norm": 0.5899899142051376, + "learning_rate": 4.987776998801939e-05, + "loss": 0.7652, + "mean_token_accuracy": 0.7747410535812378, + "step": 350 + }, + { + "epoch": 0.08290518449322747, + "grad_norm": 0.5826260618753661, + "learning_rate": 4.986855723389351e-05, + "loss": 0.7584, + "mean_token_accuracy": 0.7774671077728271, + "step": 355 + }, + { + "epoch": 0.08407286314806166, + "grad_norm": 0.5853778690736158, + "learning_rate": 4.9859010800792855e-05, + "loss": 0.7708, + "mean_token_accuracy": 0.7740145683288574, + "step": 360 + }, + { + "epoch": 0.08524054180289584, + "grad_norm": 0.5725709877855831, + "learning_rate": 4.984913083112462e-05, + "loss": 0.7588, + "mean_token_accuracy": 0.7769133090972901, + "step": 365 + }, + { + "epoch": 0.08640822045773003, + "grad_norm": 0.5576105732385842, + "learning_rate": 4.9838917472271495e-05, + "loss": 0.7578, + "mean_token_accuracy": 0.7774189114570618, + "step": 370 + }, + { + "epoch": 0.08757589911256422, + "grad_norm": 0.5135243486762037, + "learning_rate": 4.982837087658947e-05, + "loss": 0.7482, + "mean_token_accuracy": 0.7795826315879821, + "step": 375 + }, + { + "epoch": 0.08874357776739841, + "grad_norm": 0.5103543957545502, + "learning_rate": 4.981749120140547e-05, + "loss": 0.7591, + "mean_token_accuracy": 0.7763033866882324, + "step": 380 + }, + { + "epoch": 0.0899112564222326, + "grad_norm": 0.5184798576708707, + "learning_rate": 4.980627860901516e-05, + "loss": 0.7472, + "mean_token_accuracy": 0.7796693801879883, + "step": 385 + }, + { + "epoch": 0.0910789350770668, + "grad_norm": 0.5357829908587407, + "learning_rate": 4.9794733266680364e-05, + "loss": 0.7605, + "mean_token_accuracy": 0.7767962455749512, + "step": 390 + }, + { + "epoch": 0.09224661373190098, + "grad_norm": 0.4869135099165371, + "learning_rate": 4.978285534662669e-05, + "loss": 0.7637, + "mean_token_accuracy": 0.7756361842155457, + "step": 395 + }, + { + "epoch": 0.09341429238673517, + "grad_norm": 0.5719107969204461, + "learning_rate": 4.977064502604089e-05, + "loss": 0.7302, + "mean_token_accuracy": 0.7849988102912903, + "step": 400 + }, + { + "epoch": 0.09458197104156936, + "grad_norm": 0.5343080270231678, + "learning_rate": 4.975810248706824e-05, + "loss": 0.7592, + "mean_token_accuracy": 0.7763170838356018, + "step": 405 + }, + { + "epoch": 0.09574964969640355, + "grad_norm": 0.4878670452679305, + "learning_rate": 4.974522791680985e-05, + "loss": 0.7432, + "mean_token_accuracy": 0.7811870098114013, + "step": 410 + }, + { + "epoch": 0.09691732835123774, + "grad_norm": 0.5721244216925044, + "learning_rate": 4.9732021507319814e-05, + "loss": 0.7742, + "mean_token_accuracy": 0.7725372076034546, + "step": 415 + }, + { + "epoch": 0.09808500700607192, + "grad_norm": 0.5597107270324383, + "learning_rate": 4.971848345560243e-05, + "loss": 0.7783, + "mean_token_accuracy": 0.7717612981796265, + "step": 420 + }, + { + "epoch": 0.09925268566090611, + "grad_norm": 0.5705976574793036, + "learning_rate": 4.970461396360914e-05, + "loss": 0.7514, + "mean_token_accuracy": 0.7784589052200317, + "step": 425 + }, + { + "epoch": 0.10042036431574031, + "grad_norm": 0.5512507668394148, + "learning_rate": 4.969041323823565e-05, + "loss": 0.7456, + "mean_token_accuracy": 0.7799262046813965, + "step": 430 + }, + { + "epoch": 0.1015880429705745, + "grad_norm": 0.5306621697405588, + "learning_rate": 4.9675881491318735e-05, + "loss": 0.7454, + "mean_token_accuracy": 0.7800767183303833, + "step": 435 + }, + { + "epoch": 0.10275572162540869, + "grad_norm": 0.49116432972703566, + "learning_rate": 4.966101893963317e-05, + "loss": 0.7407, + "mean_token_accuracy": 0.7816652059555054, + "step": 440 + }, + { + "epoch": 0.10392340028024288, + "grad_norm": 0.6100216589920942, + "learning_rate": 4.9645825804888416e-05, + "loss": 0.7614, + "mean_token_accuracy": 0.7764947652816773, + "step": 445 + }, + { + "epoch": 0.10509107893507706, + "grad_norm": 0.5218544527135991, + "learning_rate": 4.9630302313725354e-05, + "loss": 0.7448, + "mean_token_accuracy": 0.7801677823066712, + "step": 450 + }, + { + "epoch": 0.10625875758991125, + "grad_norm": 0.5378079186101065, + "learning_rate": 4.96144486977129e-05, + "loss": 0.7489, + "mean_token_accuracy": 0.7782549619674682, + "step": 455 + }, + { + "epoch": 0.10742643624474545, + "grad_norm": 0.5150836296709289, + "learning_rate": 4.959826519334456e-05, + "loss": 0.7471, + "mean_token_accuracy": 0.7787457346916199, + "step": 460 + }, + { + "epoch": 0.10859411489957964, + "grad_norm": 0.5707700573587294, + "learning_rate": 4.958175204203488e-05, + "loss": 0.7425, + "mean_token_accuracy": 0.7812817573547364, + "step": 465 + }, + { + "epoch": 0.10976179355441383, + "grad_norm": 0.5278308225763797, + "learning_rate": 4.9564909490115864e-05, + "loss": 0.7419, + "mean_token_accuracy": 0.7815093040466309, + "step": 470 + }, + { + "epoch": 0.11092947220924801, + "grad_norm": 0.5068167481586519, + "learning_rate": 4.9547737788833274e-05, + "loss": 0.7463, + "mean_token_accuracy": 0.7798628807067871, + "step": 475 + }, + { + "epoch": 0.1120971508640822, + "grad_norm": 0.4966047752528451, + "learning_rate": 4.953023719434292e-05, + "loss": 0.7408, + "mean_token_accuracy": 0.7800005555152894, + "step": 480 + }, + { + "epoch": 0.1132648295189164, + "grad_norm": 0.4763621654693867, + "learning_rate": 4.95124079677068e-05, + "loss": 0.7412, + "mean_token_accuracy": 0.7807362318038941, + "step": 485 + }, + { + "epoch": 0.11443250817375059, + "grad_norm": 0.5180497902228367, + "learning_rate": 4.9494250374889235e-05, + "loss": 0.7361, + "mean_token_accuracy": 0.7820371627807617, + "step": 490 + }, + { + "epoch": 0.11560018682858478, + "grad_norm": 0.5206124521786272, + "learning_rate": 4.947576468675289e-05, + "loss": 0.7489, + "mean_token_accuracy": 0.7787517547607422, + "step": 495 + }, + { + "epoch": 0.11676786548341897, + "grad_norm": 0.516455677060152, + "learning_rate": 4.9456951179054725e-05, + "loss": 0.7325, + "mean_token_accuracy": 0.7826202750205994, + "step": 500 + }, + { + "epoch": 0.11793554413825315, + "grad_norm": 0.5180527540768113, + "learning_rate": 4.94378101324419e-05, + "loss": 0.7466, + "mean_token_accuracy": 0.7795100331306457, + "step": 505 + }, + { + "epoch": 0.11910322279308734, + "grad_norm": 0.49805865297739094, + "learning_rate": 4.9418341832447575e-05, + "loss": 0.7482, + "mean_token_accuracy": 0.778470253944397, + "step": 510 + }, + { + "epoch": 0.12027090144792153, + "grad_norm": 0.47838294733363296, + "learning_rate": 4.939854656948665e-05, + "loss": 0.752, + "mean_token_accuracy": 0.7778467774391175, + "step": 515 + }, + { + "epoch": 0.12143858010275572, + "grad_norm": 0.5136310633692487, + "learning_rate": 4.937842463885143e-05, + "loss": 0.7426, + "mean_token_accuracy": 0.7804072141647339, + "step": 520 + }, + { + "epoch": 0.12260625875758992, + "grad_norm": 0.56199422307423, + "learning_rate": 4.935797634070726e-05, + "loss": 0.7417, + "mean_token_accuracy": 0.7800330877304077, + "step": 525 + }, + { + "epoch": 0.1237739374124241, + "grad_norm": 0.498563487534454, + "learning_rate": 4.933720198008798e-05, + "loss": 0.7498, + "mean_token_accuracy": 0.77762371301651, + "step": 530 + }, + { + "epoch": 0.12494161606725829, + "grad_norm": 0.5494917516779005, + "learning_rate": 4.9316101866891414e-05, + "loss": 0.7505, + "mean_token_accuracy": 0.7779279947280884, + "step": 535 + }, + { + "epoch": 0.12610929472209248, + "grad_norm": 0.528910650118832, + "learning_rate": 4.9294676315874756e-05, + "loss": 0.7435, + "mean_token_accuracy": 0.7792040467262268, + "step": 540 + }, + { + "epoch": 0.12727697337692667, + "grad_norm": 0.5096686677289526, + "learning_rate": 4.927292564664985e-05, + "loss": 0.7385, + "mean_token_accuracy": 0.782158350944519, + "step": 545 + }, + { + "epoch": 0.12844465203176086, + "grad_norm": 0.5141516961830445, + "learning_rate": 4.925085018367844e-05, + "loss": 0.7291, + "mean_token_accuracy": 0.7843806624412537, + "step": 550 + }, + { + "epoch": 0.12961233068659506, + "grad_norm": 0.48712982541984445, + "learning_rate": 4.922845025626732e-05, + "loss": 0.7476, + "mean_token_accuracy": 0.7781603097915649, + "step": 555 + }, + { + "epoch": 0.13078000934142925, + "grad_norm": 0.5180023610822088, + "learning_rate": 4.9205726198563415e-05, + "loss": 0.7379, + "mean_token_accuracy": 0.7818755507469177, + "step": 560 + }, + { + "epoch": 0.13194768799626344, + "grad_norm": 0.5495723266400895, + "learning_rate": 4.918267834954882e-05, + "loss": 0.7426, + "mean_token_accuracy": 0.7793578743934632, + "step": 565 + }, + { + "epoch": 0.1331153666510976, + "grad_norm": 0.4625322581825189, + "learning_rate": 4.915930705303572e-05, + "loss": 0.7457, + "mean_token_accuracy": 0.7794965147972107, + "step": 570 + }, + { + "epoch": 0.1342830453059318, + "grad_norm": 0.5031515487278571, + "learning_rate": 4.913561265766129e-05, + "loss": 0.7389, + "mean_token_accuracy": 0.7821717500686646, + "step": 575 + }, + { + "epoch": 0.135450723960766, + "grad_norm": 0.4367872878702878, + "learning_rate": 4.911159551688244e-05, + "loss": 0.7292, + "mean_token_accuracy": 0.7840291142463685, + "step": 580 + }, + { + "epoch": 0.13661840261560018, + "grad_norm": 0.5457997992975028, + "learning_rate": 4.908725598897061e-05, + "loss": 0.74, + "mean_token_accuracy": 0.7806668400764465, + "step": 585 + }, + { + "epoch": 0.13778608127043437, + "grad_norm": 0.49013496011684654, + "learning_rate": 4.906259443700638e-05, + "loss": 0.7326, + "mean_token_accuracy": 0.7835927963256836, + "step": 590 + }, + { + "epoch": 0.13895375992526857, + "grad_norm": 0.5031833854909199, + "learning_rate": 4.9037611228874045e-05, + "loss": 0.7274, + "mean_token_accuracy": 0.7842515587806702, + "step": 595 + }, + { + "epoch": 0.14012143858010276, + "grad_norm": 0.5021747765152961, + "learning_rate": 4.90123067372562e-05, + "loss": 0.7281, + "mean_token_accuracy": 0.7844620704650879, + "step": 600 + }, + { + "epoch": 0.14128911723493695, + "grad_norm": 0.5565460238732902, + "learning_rate": 4.8986681339628077e-05, + "loss": 0.7191, + "mean_token_accuracy": 0.7857699632644654, + "step": 605 + }, + { + "epoch": 0.14245679588977114, + "grad_norm": 0.4930108125024874, + "learning_rate": 4.8960735418252004e-05, + "loss": 0.7321, + "mean_token_accuracy": 0.7825563907623291, + "step": 610 + }, + { + "epoch": 0.14362447454460534, + "grad_norm": 0.5336502591931652, + "learning_rate": 4.893446936017162e-05, + "loss": 0.7428, + "mean_token_accuracy": 0.7788402199745178, + "step": 615 + }, + { + "epoch": 0.14479215319943953, + "grad_norm": 0.49619307792734996, + "learning_rate": 4.890788355720621e-05, + "loss": 0.734, + "mean_token_accuracy": 0.7823409914970398, + "step": 620 + }, + { + "epoch": 0.1459598318542737, + "grad_norm": 0.5075260629040811, + "learning_rate": 4.888097840594475e-05, + "loss": 0.7342, + "mean_token_accuracy": 0.7819887876510621, + "step": 625 + }, + { + "epoch": 0.14712751050910788, + "grad_norm": 0.5347494097761175, + "learning_rate": 4.8853754307740043e-05, + "loss": 0.7301, + "mean_token_accuracy": 0.7819931507110596, + "step": 630 + }, + { + "epoch": 0.14829518916394208, + "grad_norm": 0.5214153664035984, + "learning_rate": 4.8826211668702744e-05, + "loss": 0.7301, + "mean_token_accuracy": 0.7836357355117798, + "step": 635 + }, + { + "epoch": 0.14946286781877627, + "grad_norm": 0.5164850795664806, + "learning_rate": 4.879835089969526e-05, + "loss": 0.7396, + "mean_token_accuracy": 0.7807266354560852, + "step": 640 + }, + { + "epoch": 0.15063054647361046, + "grad_norm": 0.49031953759854213, + "learning_rate": 4.877017241632567e-05, + "loss": 0.7316, + "mean_token_accuracy": 0.7826692700386048, + "step": 645 + }, + { + "epoch": 0.15179822512844465, + "grad_norm": 0.47972549490113653, + "learning_rate": 4.874167663894148e-05, + "loss": 0.7275, + "mean_token_accuracy": 0.7837109208106995, + "step": 650 + }, + { + "epoch": 0.15296590378327884, + "grad_norm": 0.573852158613571, + "learning_rate": 4.871286399262338e-05, + "loss": 0.734, + "mean_token_accuracy": 0.7821164131164551, + "step": 655 + }, + { + "epoch": 0.15413358243811304, + "grad_norm": 0.48670270200334753, + "learning_rate": 4.868373490717891e-05, + "loss": 0.7134, + "mean_token_accuracy": 0.7876108169555665, + "step": 660 + }, + { + "epoch": 0.15530126109294723, + "grad_norm": 0.4807181062706159, + "learning_rate": 4.8654289817136014e-05, + "loss": 0.7356, + "mean_token_accuracy": 0.7812622427940369, + "step": 665 + }, + { + "epoch": 0.15646893974778142, + "grad_norm": 0.4740244960354614, + "learning_rate": 4.8624529161736585e-05, + "loss": 0.7391, + "mean_token_accuracy": 0.7798499464988708, + "step": 670 + }, + { + "epoch": 0.1576366184026156, + "grad_norm": 0.4842016834630077, + "learning_rate": 4.859445338492991e-05, + "loss": 0.7183, + "mean_token_accuracy": 0.7859867334365844, + "step": 675 + }, + { + "epoch": 0.15880429705744978, + "grad_norm": 0.5293878295868812, + "learning_rate": 4.856406293536604e-05, + "loss": 0.7203, + "mean_token_accuracy": 0.7852997183799744, + "step": 680 + }, + { + "epoch": 0.15997197571228397, + "grad_norm": 0.4971189738204096, + "learning_rate": 4.8533358266389114e-05, + "loss": 0.7459, + "mean_token_accuracy": 0.7780242919921875, + "step": 685 + }, + { + "epoch": 0.16113965436711816, + "grad_norm": 0.4961153642234319, + "learning_rate": 4.8502339836030557e-05, + "loss": 0.7282, + "mean_token_accuracy": 0.783974289894104, + "step": 690 + }, + { + "epoch": 0.16230733302195235, + "grad_norm": 0.4611733997421793, + "learning_rate": 4.847100810700228e-05, + "loss": 0.7285, + "mean_token_accuracy": 0.7842178583145142, + "step": 695 + }, + { + "epoch": 0.16347501167678655, + "grad_norm": 0.4573905057941084, + "learning_rate": 4.843936354668981e-05, + "loss": 0.735, + "mean_token_accuracy": 0.7812911033630371, + "step": 700 + }, + { + "epoch": 0.16464269033162074, + "grad_norm": 0.4913157024753009, + "learning_rate": 4.8407406627145223e-05, + "loss": 0.7084, + "mean_token_accuracy": 0.7885575413703918, + "step": 705 + }, + { + "epoch": 0.16581036898645493, + "grad_norm": 0.44416957183936584, + "learning_rate": 4.837513782508018e-05, + "loss": 0.7233, + "mean_token_accuracy": 0.7844390273094177, + "step": 710 + }, + { + "epoch": 0.16697804764128912, + "grad_norm": 0.4907006068133848, + "learning_rate": 4.834255762185882e-05, + "loss": 0.7195, + "mean_token_accuracy": 0.7859719276428223, + "step": 715 + }, + { + "epoch": 0.16814572629612332, + "grad_norm": 0.43435459153568495, + "learning_rate": 4.830966650349051e-05, + "loss": 0.7254, + "mean_token_accuracy": 0.7844242691993714, + "step": 720 + }, + { + "epoch": 0.1693134049509575, + "grad_norm": 0.49536581158333626, + "learning_rate": 4.827646496062267e-05, + "loss": 0.7418, + "mean_token_accuracy": 0.7795446991920472, + "step": 725 + }, + { + "epoch": 0.17048108360579167, + "grad_norm": 0.4783676455304318, + "learning_rate": 4.8242953488533405e-05, + "loss": 0.7359, + "mean_token_accuracy": 0.7814501404762269, + "step": 730 + }, + { + "epoch": 0.17164876226062586, + "grad_norm": 0.47099005338218264, + "learning_rate": 4.820913258712415e-05, + "loss": 0.725, + "mean_token_accuracy": 0.7842321276664734, + "step": 735 + }, + { + "epoch": 0.17281644091546006, + "grad_norm": 0.4584620233922422, + "learning_rate": 4.817500276091218e-05, + "loss": 0.7312, + "mean_token_accuracy": 0.7828685283660889, + "step": 740 + }, + { + "epoch": 0.17398411957029425, + "grad_norm": 0.477169638319274, + "learning_rate": 4.8140564519023104e-05, + "loss": 0.736, + "mean_token_accuracy": 0.7808755159378051, + "step": 745 + }, + { + "epoch": 0.17515179822512844, + "grad_norm": 0.48092623315321326, + "learning_rate": 4.810581837518329e-05, + "loss": 0.7217, + "mean_token_accuracy": 0.7843928694725036, + "step": 750 + }, + { + "epoch": 0.17631947687996263, + "grad_norm": 0.46568894961249274, + "learning_rate": 4.807076484771214e-05, + "loss": 0.7259, + "mean_token_accuracy": 0.7835266947746277, + "step": 755 + }, + { + "epoch": 0.17748715553479683, + "grad_norm": 0.5051059313810501, + "learning_rate": 4.803540445951443e-05, + "loss": 0.7318, + "mean_token_accuracy": 0.782058572769165, + "step": 760 + }, + { + "epoch": 0.17865483418963102, + "grad_norm": 0.47776634420399566, + "learning_rate": 4.7999737738072454e-05, + "loss": 0.719, + "mean_token_accuracy": 0.7861660599708558, + "step": 765 + }, + { + "epoch": 0.1798225128444652, + "grad_norm": 0.4908298429728617, + "learning_rate": 4.796376521543818e-05, + "loss": 0.7385, + "mean_token_accuracy": 0.7802499651908874, + "step": 770 + }, + { + "epoch": 0.1809901914992994, + "grad_norm": 0.48275463030869936, + "learning_rate": 4.792748742822534e-05, + "loss": 0.7399, + "mean_token_accuracy": 0.7795346736907959, + "step": 775 + }, + { + "epoch": 0.1821578701541336, + "grad_norm": 0.4585236677961296, + "learning_rate": 4.789090491760136e-05, + "loss": 0.7217, + "mean_token_accuracy": 0.7852041959762573, + "step": 780 + }, + { + "epoch": 0.18332554880896776, + "grad_norm": 0.4431192599862763, + "learning_rate": 4.785401822927933e-05, + "loss": 0.7301, + "mean_token_accuracy": 0.7829492092132568, + "step": 785 + }, + { + "epoch": 0.18449322746380195, + "grad_norm": 0.4597057682029413, + "learning_rate": 4.781682791350988e-05, + "loss": 0.7228, + "mean_token_accuracy": 0.7847963213920593, + "step": 790 + }, + { + "epoch": 0.18566090611863614, + "grad_norm": 0.4662418312052468, + "learning_rate": 4.777933452507292e-05, + "loss": 0.7199, + "mean_token_accuracy": 0.7857567667961121, + "step": 795 + }, + { + "epoch": 0.18682858477347034, + "grad_norm": 0.4776885429005188, + "learning_rate": 4.774153862326941e-05, + "loss": 0.7206, + "mean_token_accuracy": 0.784937572479248, + "step": 800 + }, + { + "epoch": 0.18799626342830453, + "grad_norm": 0.5045109101462393, + "learning_rate": 4.770344077191298e-05, + "loss": 0.727, + "mean_token_accuracy": 0.7833572149276733, + "step": 805 + }, + { + "epoch": 0.18916394208313872, + "grad_norm": 0.4319869115453285, + "learning_rate": 4.7665041539321575e-05, + "loss": 0.7008, + "mean_token_accuracy": 0.7904294490814209, + "step": 810 + }, + { + "epoch": 0.1903316207379729, + "grad_norm": 0.5090096266239746, + "learning_rate": 4.762634149830891e-05, + "loss": 0.7219, + "mean_token_accuracy": 0.7848439335823059, + "step": 815 + }, + { + "epoch": 0.1914992993928071, + "grad_norm": 0.486136383514688, + "learning_rate": 4.758734122617596e-05, + "loss": 0.7325, + "mean_token_accuracy": 0.7821864604949951, + "step": 820 + }, + { + "epoch": 0.1926669780476413, + "grad_norm": 0.4509251265129464, + "learning_rate": 4.7548041304702354e-05, + "loss": 0.7143, + "mean_token_accuracy": 0.7870524287223816, + "step": 825 + }, + { + "epoch": 0.1938346567024755, + "grad_norm": 0.46074813369964984, + "learning_rate": 4.750844232013767e-05, + "loss": 0.7223, + "mean_token_accuracy": 0.7847394943237305, + "step": 830 + }, + { + "epoch": 0.19500233535730968, + "grad_norm": 0.4867697283152236, + "learning_rate": 4.746854486319274e-05, + "loss": 0.7148, + "mean_token_accuracy": 0.7859659194946289, + "step": 835 + }, + { + "epoch": 0.19617001401214385, + "grad_norm": 0.47078564341422224, + "learning_rate": 4.742834952903077e-05, + "loss": 0.7213, + "mean_token_accuracy": 0.7853442192077636, + "step": 840 + }, + { + "epoch": 0.19733769266697804, + "grad_norm": 0.4316603395170734, + "learning_rate": 4.738785691725851e-05, + "loss": 0.719, + "mean_token_accuracy": 0.7852090835571289, + "step": 845 + }, + { + "epoch": 0.19850537132181223, + "grad_norm": 0.4528316312681997, + "learning_rate": 4.73470676319173e-05, + "loss": 0.7321, + "mean_token_accuracy": 0.7820116639137268, + "step": 850 + }, + { + "epoch": 0.19967304997664642, + "grad_norm": 0.4435887447758274, + "learning_rate": 4.7305982281474044e-05, + "loss": 0.7252, + "mean_token_accuracy": 0.7840889692306519, + "step": 855 + }, + { + "epoch": 0.20084072863148061, + "grad_norm": 0.43699846639780626, + "learning_rate": 4.726460147881215e-05, + "loss": 0.7258, + "mean_token_accuracy": 0.7833877682685852, + "step": 860 + }, + { + "epoch": 0.2020084072863148, + "grad_norm": 0.4396712062437621, + "learning_rate": 4.7222925841222396e-05, + "loss": 0.7322, + "mean_token_accuracy": 0.7813938021659851, + "step": 865 + }, + { + "epoch": 0.203176085941149, + "grad_norm": 0.4841604056782495, + "learning_rate": 4.7180955990393685e-05, + "loss": 0.6978, + "mean_token_accuracy": 0.7909688353538513, + "step": 870 + }, + { + "epoch": 0.2043437645959832, + "grad_norm": 0.48414776026269296, + "learning_rate": 4.71386925524038e-05, + "loss": 0.7263, + "mean_token_accuracy": 0.7835749387741089, + "step": 875 + }, + { + "epoch": 0.20551144325081738, + "grad_norm": 0.4912644937458995, + "learning_rate": 4.709613615771008e-05, + "loss": 0.7179, + "mean_token_accuracy": 0.7850035667419434, + "step": 880 + }, + { + "epoch": 0.20667912190565157, + "grad_norm": 0.502985778770973, + "learning_rate": 4.705328744113994e-05, + "loss": 0.7122, + "mean_token_accuracy": 0.7873599052429199, + "step": 885 + }, + { + "epoch": 0.20784680056048577, + "grad_norm": 0.42720426244029885, + "learning_rate": 4.701014704188153e-05, + "loss": 0.7103, + "mean_token_accuracy": 0.7881195187568665, + "step": 890 + }, + { + "epoch": 0.20901447921531993, + "grad_norm": 0.4408382277852761, + "learning_rate": 4.696671560347405e-05, + "loss": 0.7169, + "mean_token_accuracy": 0.7855958819389344, + "step": 895 + }, + { + "epoch": 0.21018215787015412, + "grad_norm": 0.43583459875467395, + "learning_rate": 4.692299377379829e-05, + "loss": 0.7162, + "mean_token_accuracy": 0.7864755153656006, + "step": 900 + }, + { + "epoch": 0.21134983652498832, + "grad_norm": 0.4432758900239226, + "learning_rate": 4.687898220506684e-05, + "loss": 0.7188, + "mean_token_accuracy": 0.7851768493652344, + "step": 905 + }, + { + "epoch": 0.2125175151798225, + "grad_norm": 0.46485060237852943, + "learning_rate": 4.683468155381447e-05, + "loss": 0.7129, + "mean_token_accuracy": 0.7869641304016113, + "step": 910 + }, + { + "epoch": 0.2136851938346567, + "grad_norm": 0.49320979564054895, + "learning_rate": 4.679009248088827e-05, + "loss": 0.7197, + "mean_token_accuracy": 0.7856220841407776, + "step": 915 + }, + { + "epoch": 0.2148528724894909, + "grad_norm": 0.4982063158575857, + "learning_rate": 4.674521565143778e-05, + "loss": 0.7099, + "mean_token_accuracy": 0.7877394437789917, + "step": 920 + }, + { + "epoch": 0.21602055114432508, + "grad_norm": 0.43795135467944624, + "learning_rate": 4.670005173490515e-05, + "loss": 0.7293, + "mean_token_accuracy": 0.7827953219413757, + "step": 925 + }, + { + "epoch": 0.21718822979915928, + "grad_norm": 0.45518239517083336, + "learning_rate": 4.6654601405015067e-05, + "loss": 0.6916, + "mean_token_accuracy": 0.7928288698196411, + "step": 930 + }, + { + "epoch": 0.21835590845399347, + "grad_norm": 0.44145431612948177, + "learning_rate": 4.6608865339764734e-05, + "loss": 0.7241, + "mean_token_accuracy": 0.7845181703567505, + "step": 935 + }, + { + "epoch": 0.21952358710882766, + "grad_norm": 0.4795540122905968, + "learning_rate": 4.656284422141379e-05, + "loss": 0.7166, + "mean_token_accuracy": 0.7859855771064759, + "step": 940 + }, + { + "epoch": 0.22069126576366185, + "grad_norm": 0.44443006857088085, + "learning_rate": 4.6516538736474065e-05, + "loss": 0.7261, + "mean_token_accuracy": 0.7834364414215088, + "step": 945 + }, + { + "epoch": 0.22185894441849602, + "grad_norm": 0.4458965816154048, + "learning_rate": 4.64699495756994e-05, + "loss": 0.7223, + "mean_token_accuracy": 0.7847238898277282, + "step": 950 + }, + { + "epoch": 0.2230266230733302, + "grad_norm": 0.4468347193201661, + "learning_rate": 4.6423077434075325e-05, + "loss": 0.6971, + "mean_token_accuracy": 0.7907997846603394, + "step": 955 + }, + { + "epoch": 0.2241943017281644, + "grad_norm": 0.4567634433598694, + "learning_rate": 4.637592301080868e-05, + "loss": 0.7073, + "mean_token_accuracy": 0.7883715748786926, + "step": 960 + }, + { + "epoch": 0.2253619803829986, + "grad_norm": 0.4335252780054766, + "learning_rate": 4.632848700931717e-05, + "loss": 0.7145, + "mean_token_accuracy": 0.7857046246528625, + "step": 965 + }, + { + "epoch": 0.2265296590378328, + "grad_norm": 0.44892314598834515, + "learning_rate": 4.6280770137218935e-05, + "loss": 0.7189, + "mean_token_accuracy": 0.7853599071502686, + "step": 970 + }, + { + "epoch": 0.22769733769266698, + "grad_norm": 0.4413985435576338, + "learning_rate": 4.623277310632191e-05, + "loss": 0.7119, + "mean_token_accuracy": 0.787575876712799, + "step": 975 + }, + { + "epoch": 0.22886501634750117, + "grad_norm": 0.4323189509583375, + "learning_rate": 4.618449663261327e-05, + "loss": 0.7148, + "mean_token_accuracy": 0.7864131808280945, + "step": 980 + }, + { + "epoch": 0.23003269500233536, + "grad_norm": 0.45017973068272576, + "learning_rate": 4.613594143624874e-05, + "loss": 0.7274, + "mean_token_accuracy": 0.782983911037445, + "step": 985 + }, + { + "epoch": 0.23120037365716956, + "grad_norm": 0.43714942530139217, + "learning_rate": 4.6087108241541816e-05, + "loss": 0.7155, + "mean_token_accuracy": 0.7863085150718689, + "step": 990 + }, + { + "epoch": 0.23236805231200375, + "grad_norm": 0.45451058238634834, + "learning_rate": 4.6037997776953006e-05, + "loss": 0.7252, + "mean_token_accuracy": 0.7834529042243957, + "step": 995 + }, + { + "epoch": 0.23353573096683794, + "grad_norm": 0.44583243685048174, + "learning_rate": 4.5988610775078924e-05, + "loss": 0.7155, + "mean_token_accuracy": 0.7857861399650574, + "step": 1000 + }, + { + "epoch": 0.2347034096216721, + "grad_norm": 0.49619720347902463, + "learning_rate": 4.5938947972641384e-05, + "loss": 0.7118, + "mean_token_accuracy": 0.7873388051986694, + "step": 1005 + }, + { + "epoch": 0.2358710882765063, + "grad_norm": 0.46073385899853847, + "learning_rate": 4.588901011047643e-05, + "loss": 0.7041, + "mean_token_accuracy": 0.7886918306350708, + "step": 1010 + }, + { + "epoch": 0.2370387669313405, + "grad_norm": 0.42849201016736405, + "learning_rate": 4.583879793352321e-05, + "loss": 0.7019, + "mean_token_accuracy": 0.7896603226661683, + "step": 1015 + }, + { + "epoch": 0.23820644558617468, + "grad_norm": 0.4117242188194485, + "learning_rate": 4.578831219081297e-05, + "loss": 0.7155, + "mean_token_accuracy": 0.7855068564414978, + "step": 1020 + }, + { + "epoch": 0.23937412424100887, + "grad_norm": 0.4444046018137099, + "learning_rate": 4.573755363545779e-05, + "loss": 0.7088, + "mean_token_accuracy": 0.7877750992774963, + "step": 1025 + }, + { + "epoch": 0.24054180289584307, + "grad_norm": 0.45378761057819667, + "learning_rate": 4.5686523024639396e-05, + "loss": 0.7036, + "mean_token_accuracy": 0.7887048363685608, + "step": 1030 + }, + { + "epoch": 0.24170948155067726, + "grad_norm": 0.4410046842474515, + "learning_rate": 4.5635221119597844e-05, + "loss": 0.6996, + "mean_token_accuracy": 0.7904541969299317, + "step": 1035 + }, + { + "epoch": 0.24287716020551145, + "grad_norm": 0.5161595021602942, + "learning_rate": 4.558364868562019e-05, + "loss": 0.7051, + "mean_token_accuracy": 0.7887148141860962, + "step": 1040 + }, + { + "epoch": 0.24404483886034564, + "grad_norm": 0.42583609149302426, + "learning_rate": 4.5531806492029025e-05, + "loss": 0.7108, + "mean_token_accuracy": 0.7868101596832275, + "step": 1045 + }, + { + "epoch": 0.24521251751517983, + "grad_norm": 0.43741558979257184, + "learning_rate": 4.5479695312171076e-05, + "loss": 0.7084, + "mean_token_accuracy": 0.7882131457328796, + "step": 1050 + }, + { + "epoch": 0.246380196170014, + "grad_norm": 0.4444587088939259, + "learning_rate": 4.54273159234056e-05, + "loss": 0.7171, + "mean_token_accuracy": 0.7851068854331971, + "step": 1055 + }, + { + "epoch": 0.2475478748248482, + "grad_norm": 0.4527642681216855, + "learning_rate": 4.53746691070928e-05, + "loss": 0.7054, + "mean_token_accuracy": 0.7889351010322571, + "step": 1060 + }, + { + "epoch": 0.24871555347968238, + "grad_norm": 0.469111394987926, + "learning_rate": 4.532175564858221e-05, + "loss": 0.7168, + "mean_token_accuracy": 0.7852754116058349, + "step": 1065 + }, + { + "epoch": 0.24988323213451658, + "grad_norm": 0.44575914809190365, + "learning_rate": 4.526857633720093e-05, + "loss": 0.7153, + "mean_token_accuracy": 0.7849599242210388, + "step": 1070 + }, + { + "epoch": 0.25105091078935077, + "grad_norm": 0.4539882897584748, + "learning_rate": 4.521513196624189e-05, + "loss": 0.7041, + "mean_token_accuracy": 0.7891679286956788, + "step": 1075 + }, + { + "epoch": 0.25221858944418496, + "grad_norm": 0.440991057796004, + "learning_rate": 4.516142333295198e-05, + "loss": 0.7266, + "mean_token_accuracy": 0.7828721523284912, + "step": 1080 + }, + { + "epoch": 0.25338626809901915, + "grad_norm": 0.4619769454215481, + "learning_rate": 4.510745123852019e-05, + "loss": 0.7117, + "mean_token_accuracy": 0.7872965574264527, + "step": 1085 + }, + { + "epoch": 0.25455394675385334, + "grad_norm": 0.45926309101218876, + "learning_rate": 4.505321648806563e-05, + "loss": 0.7142, + "mean_token_accuracy": 0.785740876197815, + "step": 1090 + }, + { + "epoch": 0.25572162540868754, + "grad_norm": 0.4496676481586561, + "learning_rate": 4.499871989062557e-05, + "loss": 0.7288, + "mean_token_accuracy": 0.7817098140716553, + "step": 1095 + }, + { + "epoch": 0.25688930406352173, + "grad_norm": 0.41412785456381945, + "learning_rate": 4.4943962259143284e-05, + "loss": 0.7151, + "mean_token_accuracy": 0.785811471939087, + "step": 1100 + }, + { + "epoch": 0.2580569827183559, + "grad_norm": 0.4547014384212491, + "learning_rate": 4.488894441045602e-05, + "loss": 0.7083, + "mean_token_accuracy": 0.7874979853630066, + "step": 1105 + }, + { + "epoch": 0.2592246613731901, + "grad_norm": 0.4364026761955352, + "learning_rate": 4.483366716528275e-05, + "loss": 0.7153, + "mean_token_accuracy": 0.7856666803359985, + "step": 1110 + }, + { + "epoch": 0.2603923400280243, + "grad_norm": 0.4377059739770065, + "learning_rate": 4.477813134821195e-05, + "loss": 0.7037, + "mean_token_accuracy": 0.7891816020011901, + "step": 1115 + }, + { + "epoch": 0.2615600186828585, + "grad_norm": 0.4831722921558223, + "learning_rate": 4.472233778768929e-05, + "loss": 0.7234, + "mean_token_accuracy": 0.7838827013969422, + "step": 1120 + }, + { + "epoch": 0.2627276973376927, + "grad_norm": 0.428334400055855, + "learning_rate": 4.4666287316005286e-05, + "loss": 0.7164, + "mean_token_accuracy": 0.7852272629737854, + "step": 1125 + }, + { + "epoch": 0.2638953759925269, + "grad_norm": 0.4826970078971994, + "learning_rate": 4.4609980769282885e-05, + "loss": 0.7085, + "mean_token_accuracy": 0.7876496911048889, + "step": 1130 + }, + { + "epoch": 0.2650630546473611, + "grad_norm": 0.44086484245836477, + "learning_rate": 4.455341898746498e-05, + "loss": 0.7054, + "mean_token_accuracy": 0.7883308887481689, + "step": 1135 + }, + { + "epoch": 0.2662307333021952, + "grad_norm": 0.4880543047258986, + "learning_rate": 4.4496602814301916e-05, + "loss": 0.7056, + "mean_token_accuracy": 0.7887270092964173, + "step": 1140 + }, + { + "epoch": 0.2673984119570294, + "grad_norm": 0.4616159176521573, + "learning_rate": 4.443953309733882e-05, + "loss": 0.703, + "mean_token_accuracy": 0.7891542792320252, + "step": 1145 + }, + { + "epoch": 0.2685660906118636, + "grad_norm": 0.4427788392219619, + "learning_rate": 4.438221068790307e-05, + "loss": 0.6964, + "mean_token_accuracy": 0.7907665014266968, + "step": 1150 + }, + { + "epoch": 0.2697337692666978, + "grad_norm": 0.4376612396660607, + "learning_rate": 4.43246364410915e-05, + "loss": 0.7182, + "mean_token_accuracy": 0.7839595317840576, + "step": 1155 + }, + { + "epoch": 0.270901447921532, + "grad_norm": 0.4233818394600401, + "learning_rate": 4.42668112157577e-05, + "loss": 0.7139, + "mean_token_accuracy": 0.7864643573760987, + "step": 1160 + }, + { + "epoch": 0.27206912657636617, + "grad_norm": 0.4345545922147828, + "learning_rate": 4.4208735874499195e-05, + "loss": 0.7132, + "mean_token_accuracy": 0.786436665058136, + "step": 1165 + }, + { + "epoch": 0.27323680523120036, + "grad_norm": 0.43169356687900057, + "learning_rate": 4.415041128364456e-05, + "loss": 0.7063, + "mean_token_accuracy": 0.7881024360656739, + "step": 1170 + }, + { + "epoch": 0.27440448388603456, + "grad_norm": 0.4447028072401058, + "learning_rate": 4.409183831324051e-05, + "loss": 0.7154, + "mean_token_accuracy": 0.7854717016220093, + "step": 1175 + }, + { + "epoch": 0.27557216254086875, + "grad_norm": 0.42471507783469004, + "learning_rate": 4.40330178370389e-05, + "loss": 0.7073, + "mean_token_accuracy": 0.7885934591293335, + "step": 1180 + }, + { + "epoch": 0.27673984119570294, + "grad_norm": 0.45082272188542066, + "learning_rate": 4.3973950732483734e-05, + "loss": 0.7086, + "mean_token_accuracy": 0.7870532512664795, + "step": 1185 + }, + { + "epoch": 0.27790751985053713, + "grad_norm": 0.4404465666188941, + "learning_rate": 4.391463788069804e-05, + "loss": 0.7012, + "mean_token_accuracy": 0.7885097980499267, + "step": 1190 + }, + { + "epoch": 0.2790751985053713, + "grad_norm": 0.4406524426250642, + "learning_rate": 4.385508016647074e-05, + "loss": 0.7081, + "mean_token_accuracy": 0.7865199685096741, + "step": 1195 + }, + { + "epoch": 0.2802428771602055, + "grad_norm": 0.44839479317312364, + "learning_rate": 4.379527847824343e-05, + "loss": 0.7046, + "mean_token_accuracy": 0.7878746509552002, + "step": 1200 + }, + { + "epoch": 0.2814105558150397, + "grad_norm": 0.43744246766113964, + "learning_rate": 4.373523370809719e-05, + "loss": 0.7107, + "mean_token_accuracy": 0.7875268459320068, + "step": 1205 + }, + { + "epoch": 0.2825782344698739, + "grad_norm": 0.43883040317858174, + "learning_rate": 4.367494675173916e-05, + "loss": 0.7101, + "mean_token_accuracy": 0.7864572763442993, + "step": 1210 + }, + { + "epoch": 0.2837459131247081, + "grad_norm": 0.40569418372162835, + "learning_rate": 4.361441850848933e-05, + "loss": 0.7065, + "mean_token_accuracy": 0.7879815340042114, + "step": 1215 + }, + { + "epoch": 0.2849135917795423, + "grad_norm": 0.439300765453792, + "learning_rate": 4.3553649881266987e-05, + "loss": 0.7049, + "mean_token_accuracy": 0.7881851673126221, + "step": 1220 + }, + { + "epoch": 0.2860812704343765, + "grad_norm": 0.422025337814165, + "learning_rate": 4.349264177657731e-05, + "loss": 0.688, + "mean_token_accuracy": 0.7934986114501953, + "step": 1225 + }, + { + "epoch": 0.28724894908921067, + "grad_norm": 0.47587779096405475, + "learning_rate": 4.343139510449788e-05, + "loss": 0.7029, + "mean_token_accuracy": 0.7886928200721741, + "step": 1230 + }, + { + "epoch": 0.28841662774404486, + "grad_norm": 0.4544237748792879, + "learning_rate": 4.3369910778665015e-05, + "loss": 0.7035, + "mean_token_accuracy": 0.788232421875, + "step": 1235 + }, + { + "epoch": 0.28958430639887905, + "grad_norm": 0.4782767918479427, + "learning_rate": 4.330818971626022e-05, + "loss": 0.7101, + "mean_token_accuracy": 0.7862432956695556, + "step": 1240 + }, + { + "epoch": 0.2907519850537132, + "grad_norm": 0.44258635457091783, + "learning_rate": 4.324623283799646e-05, + "loss": 0.709, + "mean_token_accuracy": 0.7871970534324646, + "step": 1245 + }, + { + "epoch": 0.2919196637085474, + "grad_norm": 0.4486338099740648, + "learning_rate": 4.3184041068104455e-05, + "loss": 0.7037, + "mean_token_accuracy": 0.7884302258491516, + "step": 1250 + }, + { + "epoch": 0.2930873423633816, + "grad_norm": 0.44580101101035563, + "learning_rate": 4.312161533431887e-05, + "loss": 0.705, + "mean_token_accuracy": 0.7882564902305603, + "step": 1255 + }, + { + "epoch": 0.29425502101821577, + "grad_norm": 0.4404302507045317, + "learning_rate": 4.3058956567864496e-05, + "loss": 0.7089, + "mean_token_accuracy": 0.7877544641494751, + "step": 1260 + }, + { + "epoch": 0.29542269967304996, + "grad_norm": 0.4384928660732881, + "learning_rate": 4.299606570344233e-05, + "loss": 0.6975, + "mean_token_accuracy": 0.789632523059845, + "step": 1265 + }, + { + "epoch": 0.29659037832788415, + "grad_norm": 0.42098417723453946, + "learning_rate": 4.293294367921565e-05, + "loss": 0.7034, + "mean_token_accuracy": 0.78822261095047, + "step": 1270 + }, + { + "epoch": 0.29775805698271834, + "grad_norm": 0.41472374373337156, + "learning_rate": 4.2869591436796055e-05, + "loss": 0.7019, + "mean_token_accuracy": 0.7883977055549621, + "step": 1275 + }, + { + "epoch": 0.29892573563755254, + "grad_norm": 0.43065241006128546, + "learning_rate": 4.2806009921229335e-05, + "loss": 0.7191, + "mean_token_accuracy": 0.7884038209915161, + "step": 1280 + }, + { + "epoch": 0.30009341429238673, + "grad_norm": 0.4175425007225463, + "learning_rate": 4.274220008098145e-05, + "loss": 0.7132, + "mean_token_accuracy": 0.7859068274497986, + "step": 1285 + }, + { + "epoch": 0.3012610929472209, + "grad_norm": 0.4307805070923456, + "learning_rate": 4.2678162867924334e-05, + "loss": 0.7134, + "mean_token_accuracy": 0.7868623852729797, + "step": 1290 + }, + { + "epoch": 0.3024287716020551, + "grad_norm": 0.444114017113492, + "learning_rate": 4.261389923732173e-05, + "loss": 0.6912, + "mean_token_accuracy": 0.7915731787681579, + "step": 1295 + }, + { + "epoch": 0.3035964502568893, + "grad_norm": 0.4189085840596454, + "learning_rate": 4.254941014781493e-05, + "loss": 0.698, + "mean_token_accuracy": 0.7907318711280823, + "step": 1300 + }, + { + "epoch": 0.3047641289117235, + "grad_norm": 0.45185026533343686, + "learning_rate": 4.248469656140845e-05, + "loss": 0.7134, + "mean_token_accuracy": 0.7854733467102051, + "step": 1305 + }, + { + "epoch": 0.3059318075665577, + "grad_norm": 0.42777780266147564, + "learning_rate": 4.2419759443455695e-05, + "loss": 0.7103, + "mean_token_accuracy": 0.7865085482597352, + "step": 1310 + }, + { + "epoch": 0.3070994862213919, + "grad_norm": 0.39460548581947386, + "learning_rate": 4.23545997626446e-05, + "loss": 0.6825, + "mean_token_accuracy": 0.7942874431610107, + "step": 1315 + }, + { + "epoch": 0.3082671648762261, + "grad_norm": 0.4175375501200804, + "learning_rate": 4.22892184909831e-05, + "loss": 0.6868, + "mean_token_accuracy": 0.7929208397865295, + "step": 1320 + }, + { + "epoch": 0.30943484353106027, + "grad_norm": 0.4191016587696918, + "learning_rate": 4.222361660378469e-05, + "loss": 0.7064, + "mean_token_accuracy": 0.7886691808700561, + "step": 1325 + }, + { + "epoch": 0.31060252218589446, + "grad_norm": 0.43966394833844286, + "learning_rate": 4.2157795079653865e-05, + "loss": 0.7012, + "mean_token_accuracy": 0.7888782143592834, + "step": 1330 + }, + { + "epoch": 0.31177020084072865, + "grad_norm": 0.41447760478443535, + "learning_rate": 4.20917549004715e-05, + "loss": 0.7172, + "mean_token_accuracy": 0.78470778465271, + "step": 1335 + }, + { + "epoch": 0.31293787949556284, + "grad_norm": 0.44061995622008426, + "learning_rate": 4.202549705138022e-05, + "loss": 0.7042, + "mean_token_accuracy": 0.7879290461540223, + "step": 1340 + }, + { + "epoch": 0.31410555815039704, + "grad_norm": 0.399187005465073, + "learning_rate": 4.195902252076971e-05, + "loss": 0.6994, + "mean_token_accuracy": 0.789470088481903, + "step": 1345 + }, + { + "epoch": 0.3152732368052312, + "grad_norm": 0.41977370884132514, + "learning_rate": 4.189233230026196e-05, + "loss": 0.6966, + "mean_token_accuracy": 0.7900195121765137, + "step": 1350 + }, + { + "epoch": 0.31644091546006536, + "grad_norm": 0.40072152067413735, + "learning_rate": 4.1825427384696474e-05, + "loss": 0.6869, + "mean_token_accuracy": 0.7925947666168213, + "step": 1355 + }, + { + "epoch": 0.31760859411489956, + "grad_norm": 0.44100264357135505, + "learning_rate": 4.175830877211544e-05, + "loss": 0.685, + "mean_token_accuracy": 0.79369136095047, + "step": 1360 + }, + { + "epoch": 0.31877627276973375, + "grad_norm": 0.4482241815543107, + "learning_rate": 4.169097746374881e-05, + "loss": 0.6943, + "mean_token_accuracy": 0.7907788753509521, + "step": 1365 + }, + { + "epoch": 0.31994395142456794, + "grad_norm": 0.45473535257281034, + "learning_rate": 4.1623434463999416e-05, + "loss": 0.6924, + "mean_token_accuracy": 0.7912651419639587, + "step": 1370 + }, + { + "epoch": 0.32111163007940213, + "grad_norm": 0.42481144295921136, + "learning_rate": 4.155568078042794e-05, + "loss": 0.6787, + "mean_token_accuracy": 0.7944711685180664, + "step": 1375 + }, + { + "epoch": 0.3222793087342363, + "grad_norm": 0.43711667628284634, + "learning_rate": 4.148771742373792e-05, + "loss": 0.7027, + "mean_token_accuracy": 0.7891393303871155, + "step": 1380 + }, + { + "epoch": 0.3234469873890705, + "grad_norm": 0.48033690732285167, + "learning_rate": 4.1419545407760626e-05, + "loss": 0.6966, + "mean_token_accuracy": 0.7903271079063415, + "step": 1385 + }, + { + "epoch": 0.3246146660439047, + "grad_norm": 0.41868798949995434, + "learning_rate": 4.135116574943999e-05, + "loss": 0.6791, + "mean_token_accuracy": 0.79523845911026, + "step": 1390 + }, + { + "epoch": 0.3257823446987389, + "grad_norm": 0.4180376549142789, + "learning_rate": 4.128257946881741e-05, + "loss": 0.6936, + "mean_token_accuracy": 0.7911462187767029, + "step": 1395 + }, + { + "epoch": 0.3269500233535731, + "grad_norm": 0.43547115836725975, + "learning_rate": 4.121378758901652e-05, + "loss": 0.6918, + "mean_token_accuracy": 0.7913682222366333, + "step": 1400 + }, + { + "epoch": 0.3281177020084073, + "grad_norm": 0.40149028637904866, + "learning_rate": 4.1144791136227965e-05, + "loss": 0.6772, + "mean_token_accuracy": 0.7953076958656311, + "step": 1405 + }, + { + "epoch": 0.3292853806632415, + "grad_norm": 0.4085755305586855, + "learning_rate": 4.1075591139694055e-05, + "loss": 0.6992, + "mean_token_accuracy": 0.7897636413574218, + "step": 1410 + }, + { + "epoch": 0.33045305931807567, + "grad_norm": 0.4093937182811985, + "learning_rate": 4.1006188631693436e-05, + "loss": 0.704, + "mean_token_accuracy": 0.7888221025466919, + "step": 1415 + }, + { + "epoch": 0.33162073797290986, + "grad_norm": 0.4048619205886811, + "learning_rate": 4.093658464752568e-05, + "loss": 0.6954, + "mean_token_accuracy": 0.7900570034980774, + "step": 1420 + }, + { + "epoch": 0.33278841662774405, + "grad_norm": 0.40294027080852507, + "learning_rate": 4.086678022549583e-05, + "loss": 0.696, + "mean_token_accuracy": 0.790727686882019, + "step": 1425 + }, + { + "epoch": 0.33395609528257825, + "grad_norm": 0.41265771484471453, + "learning_rate": 4.079677640689896e-05, + "loss": 0.6793, + "mean_token_accuracy": 0.7941466689109802, + "step": 1430 + }, + { + "epoch": 0.33512377393741244, + "grad_norm": 0.41823498225621625, + "learning_rate": 4.072657423600457e-05, + "loss": 0.7122, + "mean_token_accuracy": 0.7856095314025879, + "step": 1435 + }, + { + "epoch": 0.33629145259224663, + "grad_norm": 0.4453822627819668, + "learning_rate": 4.065617476004106e-05, + "loss": 0.7039, + "mean_token_accuracy": 0.7885820269584656, + "step": 1440 + }, + { + "epoch": 0.3374591312470808, + "grad_norm": 0.4270695836104636, + "learning_rate": 4.0585579029180105e-05, + "loss": 0.6987, + "mean_token_accuracy": 0.790066933631897, + "step": 1445 + }, + { + "epoch": 0.338626809901915, + "grad_norm": 0.405743394136002, + "learning_rate": 4.051478809652096e-05, + "loss": 0.6995, + "mean_token_accuracy": 0.7893929600715637, + "step": 1450 + }, + { + "epoch": 0.3397944885567492, + "grad_norm": 0.4158884606745251, + "learning_rate": 4.0443803018074764e-05, + "loss": 0.6917, + "mean_token_accuracy": 0.7915486335754395, + "step": 1455 + }, + { + "epoch": 0.34096216721158334, + "grad_norm": 0.41153256418505635, + "learning_rate": 4.037262485274882e-05, + "loss": 0.6969, + "mean_token_accuracy": 0.7899181485176087, + "step": 1460 + }, + { + "epoch": 0.34212984586641754, + "grad_norm": 0.40577869170909975, + "learning_rate": 4.0301254662330746e-05, + "loss": 0.6953, + "mean_token_accuracy": 0.7905377507209778, + "step": 1465 + }, + { + "epoch": 0.34329752452125173, + "grad_norm": 0.43799178063711397, + "learning_rate": 4.022969351147265e-05, + "loss": 0.6937, + "mean_token_accuracy": 0.7911334872245789, + "step": 1470 + }, + { + "epoch": 0.3444652031760859, + "grad_norm": 0.39792996813120934, + "learning_rate": 4.0157942467675295e-05, + "loss": 0.6994, + "mean_token_accuracy": 0.789932906627655, + "step": 1475 + }, + { + "epoch": 0.3456328818309201, + "grad_norm": 0.4126862786094769, + "learning_rate": 4.008600260127212e-05, + "loss": 0.6938, + "mean_token_accuracy": 0.7911051034927368, + "step": 1480 + }, + { + "epoch": 0.3468005604857543, + "grad_norm": 0.42592729690432185, + "learning_rate": 4.001387498541327e-05, + "loss": 0.6837, + "mean_token_accuracy": 0.7948046326637268, + "step": 1485 + }, + { + "epoch": 0.3479682391405885, + "grad_norm": 0.41016053735417785, + "learning_rate": 3.994156069604963e-05, + "loss": 0.7044, + "mean_token_accuracy": 0.787828552722931, + "step": 1490 + }, + { + "epoch": 0.3491359177954227, + "grad_norm": 0.3821462363666685, + "learning_rate": 3.986906081191677e-05, + "loss": 0.6854, + "mean_token_accuracy": 0.7931524038314819, + "step": 1495 + }, + { + "epoch": 0.3503035964502569, + "grad_norm": 0.39463613397755326, + "learning_rate": 3.979637641451881e-05, + "loss": 0.6834, + "mean_token_accuracy": 0.7938231825828552, + "step": 1500 + }, + { + "epoch": 0.3514712751050911, + "grad_norm": 0.3999864270838378, + "learning_rate": 3.972350858811232e-05, + "loss": 0.6854, + "mean_token_accuracy": 0.7928770065307618, + "step": 1505 + }, + { + "epoch": 0.35263895375992527, + "grad_norm": 0.41667087675846004, + "learning_rate": 3.965045841969014e-05, + "loss": 0.6896, + "mean_token_accuracy": 0.7916455507278443, + "step": 1510 + }, + { + "epoch": 0.35380663241475946, + "grad_norm": 0.4055050434303604, + "learning_rate": 3.957722699896517e-05, + "loss": 0.6961, + "mean_token_accuracy": 0.7902446627616883, + "step": 1515 + }, + { + "epoch": 0.35497431106959365, + "grad_norm": 0.42281823120551243, + "learning_rate": 3.9503815418354106e-05, + "loss": 0.6986, + "mean_token_accuracy": 0.789761197566986, + "step": 1520 + }, + { + "epoch": 0.35614198972442784, + "grad_norm": 0.3964627615389493, + "learning_rate": 3.9430224772961156e-05, + "loss": 0.6861, + "mean_token_accuracy": 0.7927127242088318, + "step": 1525 + }, + { + "epoch": 0.35730966837926204, + "grad_norm": 0.4060784763255284, + "learning_rate": 3.935645616056167e-05, + "loss": 0.6962, + "mean_token_accuracy": 0.7902538776397705, + "step": 1530 + }, + { + "epoch": 0.3584773470340962, + "grad_norm": 0.40815525312633766, + "learning_rate": 3.928251068158582e-05, + "loss": 0.6817, + "mean_token_accuracy": 0.7942526340484619, + "step": 1535 + }, + { + "epoch": 0.3596450256889304, + "grad_norm": 0.40955558615241283, + "learning_rate": 3.9208389439102136e-05, + "loss": 0.6976, + "mean_token_accuracy": 0.7898761391639709, + "step": 1540 + }, + { + "epoch": 0.3608127043437646, + "grad_norm": 0.39755542463533255, + "learning_rate": 3.913409353880107e-05, + "loss": 0.679, + "mean_token_accuracy": 0.7947714328765869, + "step": 1545 + }, + { + "epoch": 0.3619803829985988, + "grad_norm": 0.3955943941014702, + "learning_rate": 3.9059624088978494e-05, + "loss": 0.6884, + "mean_token_accuracy": 0.7926104426383972, + "step": 1550 + }, + { + "epoch": 0.363148061653433, + "grad_norm": 0.4079534941897669, + "learning_rate": 3.8984982200519205e-05, + "loss": 0.6852, + "mean_token_accuracy": 0.7938459873199463, + "step": 1555 + }, + { + "epoch": 0.3643157403082672, + "grad_norm": 0.422222144244619, + "learning_rate": 3.891016898688027e-05, + "loss": 0.6728, + "mean_token_accuracy": 0.7968662142753601, + "step": 1560 + }, + { + "epoch": 0.3654834189631014, + "grad_norm": 0.3974446737115297, + "learning_rate": 3.8835185564074526e-05, + "loss": 0.6804, + "mean_token_accuracy": 0.7951172947883606, + "step": 1565 + }, + { + "epoch": 0.3666510976179355, + "grad_norm": 0.45009359356113454, + "learning_rate": 3.8760033050653836e-05, + "loss": 0.6869, + "mean_token_accuracy": 0.7919694304466247, + "step": 1570 + }, + { + "epoch": 0.3678187762727697, + "grad_norm": 0.415776055684977, + "learning_rate": 3.868471256769246e-05, + "loss": 0.6977, + "mean_token_accuracy": 0.7906696677207947, + "step": 1575 + }, + { + "epoch": 0.3689864549276039, + "grad_norm": 0.41799146991908137, + "learning_rate": 3.86092252387703e-05, + "loss": 0.7176, + "mean_token_accuracy": 0.7845171213150024, + "step": 1580 + }, + { + "epoch": 0.3701541335824381, + "grad_norm": 0.4042330598199903, + "learning_rate": 3.853357218995617e-05, + "loss": 0.6804, + "mean_token_accuracy": 0.7948396801948547, + "step": 1585 + }, + { + "epoch": 0.3713218122372723, + "grad_norm": 0.44651798009857396, + "learning_rate": 3.845775454979097e-05, + "loss": 0.688, + "mean_token_accuracy": 0.7919300556182861, + "step": 1590 + }, + { + "epoch": 0.3724894908921065, + "grad_norm": 0.40863835927817727, + "learning_rate": 3.838177344927087e-05, + "loss": 0.6929, + "mean_token_accuracy": 0.7914644598960876, + "step": 1595 + }, + { + "epoch": 0.37365716954694067, + "grad_norm": 0.4135722453777411, + "learning_rate": 3.8305630021830414e-05, + "loss": 0.6976, + "mean_token_accuracy": 0.7901840806007385, + "step": 1600 + }, + { + "epoch": 0.37482484820177486, + "grad_norm": 0.39878487254841516, + "learning_rate": 3.822932540332565e-05, + "loss": 0.6867, + "mean_token_accuracy": 0.7924752593040466, + "step": 1605 + }, + { + "epoch": 0.37599252685660906, + "grad_norm": 0.4270183665264642, + "learning_rate": 3.815286073201715e-05, + "loss": 0.6787, + "mean_token_accuracy": 0.7952769160270691, + "step": 1610 + }, + { + "epoch": 0.37716020551144325, + "grad_norm": 0.41198732423180434, + "learning_rate": 3.807623714855306e-05, + "loss": 0.697, + "mean_token_accuracy": 0.7908246994018555, + "step": 1615 + }, + { + "epoch": 0.37832788416627744, + "grad_norm": 0.39974854593415393, + "learning_rate": 3.799945579595204e-05, + "loss": 0.6757, + "mean_token_accuracy": 0.7958306431770324, + "step": 1620 + }, + { + "epoch": 0.37949556282111163, + "grad_norm": 0.4132324453951051, + "learning_rate": 3.792251781958629e-05, + "loss": 0.6874, + "mean_token_accuracy": 0.7922473192214966, + "step": 1625 + }, + { + "epoch": 0.3806632414759458, + "grad_norm": 0.39202900774430555, + "learning_rate": 3.7845424367164375e-05, + "loss": 0.6756, + "mean_token_accuracy": 0.7950840830802918, + "step": 1630 + }, + { + "epoch": 0.38183092013078, + "grad_norm": 0.41314551597463683, + "learning_rate": 3.776817658871415e-05, + "loss": 0.686, + "mean_token_accuracy": 0.7925597429275513, + "step": 1635 + }, + { + "epoch": 0.3829985987856142, + "grad_norm": 0.4030136426892179, + "learning_rate": 3.769077563656564e-05, + "loss": 0.6812, + "mean_token_accuracy": 0.7936363101005555, + "step": 1640 + }, + { + "epoch": 0.3841662774404484, + "grad_norm": 0.42860403219009824, + "learning_rate": 3.761322266533378e-05, + "loss": 0.6842, + "mean_token_accuracy": 0.793194854259491, + "step": 1645 + }, + { + "epoch": 0.3853339560952826, + "grad_norm": 0.4000562038879463, + "learning_rate": 3.7535518831901215e-05, + "loss": 0.6924, + "mean_token_accuracy": 0.7905576705932618, + "step": 1650 + }, + { + "epoch": 0.3865016347501168, + "grad_norm": 0.41944219063853394, + "learning_rate": 3.745766529540107e-05, + "loss": 0.692, + "mean_token_accuracy": 0.7906318783760071, + "step": 1655 + }, + { + "epoch": 0.387669313404951, + "grad_norm": 0.4646711920924628, + "learning_rate": 3.737966321719963e-05, + "loss": 0.6845, + "mean_token_accuracy": 0.79284508228302, + "step": 1660 + }, + { + "epoch": 0.38883699205978517, + "grad_norm": 0.4275709921637854, + "learning_rate": 3.730151376087901e-05, + "loss": 0.6992, + "mean_token_accuracy": 0.788605010509491, + "step": 1665 + }, + { + "epoch": 0.39000467071461936, + "grad_norm": 0.42375137549729824, + "learning_rate": 3.7223218092219854e-05, + "loss": 0.6977, + "mean_token_accuracy": 0.7888615489006042, + "step": 1670 + }, + { + "epoch": 0.39117234936945355, + "grad_norm": 0.38559358617477313, + "learning_rate": 3.714477737918386e-05, + "loss": 0.694, + "mean_token_accuracy": 0.7908984899520874, + "step": 1675 + }, + { + "epoch": 0.3923400280242877, + "grad_norm": 0.41940017607146757, + "learning_rate": 3.706619279189642e-05, + "loss": 0.6833, + "mean_token_accuracy": 0.7926470756530761, + "step": 1680 + }, + { + "epoch": 0.3935077066791219, + "grad_norm": 0.40250611845594586, + "learning_rate": 3.698746550262914e-05, + "loss": 0.6957, + "mean_token_accuracy": 0.7898336172103881, + "step": 1685 + }, + { + "epoch": 0.3946753853339561, + "grad_norm": 0.43574884134226355, + "learning_rate": 3.690859668578237e-05, + "loss": 0.6924, + "mean_token_accuracy": 0.7914280891418457, + "step": 1690 + }, + { + "epoch": 0.39584306398879027, + "grad_norm": 0.38255322551705834, + "learning_rate": 3.6829587517867624e-05, + "loss": 0.6826, + "mean_token_accuracy": 0.7941157579421997, + "step": 1695 + }, + { + "epoch": 0.39701074264362446, + "grad_norm": 0.43206487998392523, + "learning_rate": 3.6750439177490157e-05, + "loss": 0.6789, + "mean_token_accuracy": 0.7942695617675781, + "step": 1700 + }, + { + "epoch": 0.39817842129845865, + "grad_norm": 0.4111698754410712, + "learning_rate": 3.6671152845331235e-05, + "loss": 0.6775, + "mean_token_accuracy": 0.7941833376884461, + "step": 1705 + }, + { + "epoch": 0.39934609995329284, + "grad_norm": 0.38687483691755237, + "learning_rate": 3.6591729704130625e-05, + "loss": 0.6912, + "mean_token_accuracy": 0.791841197013855, + "step": 1710 + }, + { + "epoch": 0.40051377860812704, + "grad_norm": 0.3962240155238275, + "learning_rate": 3.6512170938668926e-05, + "loss": 0.6654, + "mean_token_accuracy": 0.7994776248931885, + "step": 1715 + }, + { + "epoch": 0.40168145726296123, + "grad_norm": 0.41596152074136994, + "learning_rate": 3.643247773574985e-05, + "loss": 0.6805, + "mean_token_accuracy": 0.7940452218055725, + "step": 1720 + }, + { + "epoch": 0.4028491359177954, + "grad_norm": 0.39752836907482675, + "learning_rate": 3.63526512841826e-05, + "loss": 0.6855, + "mean_token_accuracy": 0.7924312829971314, + "step": 1725 + }, + { + "epoch": 0.4040168145726296, + "grad_norm": 0.43193754482691554, + "learning_rate": 3.627269277476406e-05, + "loss": 0.6922, + "mean_token_accuracy": 0.7911014437675477, + "step": 1730 + }, + { + "epoch": 0.4051844932274638, + "grad_norm": 0.4201633139908271, + "learning_rate": 3.619260340026108e-05, + "loss": 0.6887, + "mean_token_accuracy": 0.791678786277771, + "step": 1735 + }, + { + "epoch": 0.406352171882298, + "grad_norm": 0.4300484944319388, + "learning_rate": 3.6112384355392656e-05, + "loss": 0.6885, + "mean_token_accuracy": 0.7914774298667908, + "step": 1740 + }, + { + "epoch": 0.4075198505371322, + "grad_norm": 0.41243262421692184, + "learning_rate": 3.603203683681214e-05, + "loss": 0.6929, + "mean_token_accuracy": 0.79034343957901, + "step": 1745 + }, + { + "epoch": 0.4086875291919664, + "grad_norm": 0.416302895998379, + "learning_rate": 3.595156204308934e-05, + "loss": 0.6855, + "mean_token_accuracy": 0.7930089354515075, + "step": 1750 + }, + { + "epoch": 0.4098552078468006, + "grad_norm": 0.40119087043829565, + "learning_rate": 3.587096117469269e-05, + "loss": 0.6754, + "mean_token_accuracy": 0.7958263874053955, + "step": 1755 + }, + { + "epoch": 0.41102288650163477, + "grad_norm": 0.3896628206511767, + "learning_rate": 3.579023543397129e-05, + "loss": 0.6678, + "mean_token_accuracy": 0.7986201286315918, + "step": 1760 + }, + { + "epoch": 0.41219056515646896, + "grad_norm": 0.4063001104261983, + "learning_rate": 3.570938602513705e-05, + "loss": 0.6795, + "mean_token_accuracy": 0.7944958925247192, + "step": 1765 + }, + { + "epoch": 0.41335824381130315, + "grad_norm": 0.41089339721209406, + "learning_rate": 3.562841415424662e-05, + "loss": 0.6977, + "mean_token_accuracy": 0.7886811137199402, + "step": 1770 + }, + { + "epoch": 0.41452592246613734, + "grad_norm": 0.3868432258485096, + "learning_rate": 3.554732102918351e-05, + "loss": 0.6863, + "mean_token_accuracy": 0.7919796586036683, + "step": 1775 + }, + { + "epoch": 0.41569360112097153, + "grad_norm": 0.37119853439597744, + "learning_rate": 3.5466107859639984e-05, + "loss": 0.6758, + "mean_token_accuracy": 0.7950326204299927, + "step": 1780 + }, + { + "epoch": 0.41686127977580567, + "grad_norm": 0.3939503508757233, + "learning_rate": 3.538477585709905e-05, + "loss": 0.6866, + "mean_token_accuracy": 0.7924443244934082, + "step": 1785 + }, + { + "epoch": 0.41802895843063986, + "grad_norm": 0.41991821007514973, + "learning_rate": 3.53033262348164e-05, + "loss": 0.6719, + "mean_token_accuracy": 0.7966994285583496, + "step": 1790 + }, + { + "epoch": 0.41919663708547406, + "grad_norm": 0.4120411412605125, + "learning_rate": 3.5221760207802294e-05, + "loss": 0.6892, + "mean_token_accuracy": 0.7919999957084656, + "step": 1795 + }, + { + "epoch": 0.42036431574030825, + "grad_norm": 0.39510010822123987, + "learning_rate": 3.5140078992803434e-05, + "loss": 0.6805, + "mean_token_accuracy": 0.7937294363975524, + "step": 1800 + }, + { + "epoch": 0.42153199439514244, + "grad_norm": 0.4149842898085742, + "learning_rate": 3.505828380828481e-05, + "loss": 0.6728, + "mean_token_accuracy": 0.7967824578285218, + "step": 1805 + }, + { + "epoch": 0.42269967304997663, + "grad_norm": 0.4181905866060584, + "learning_rate": 3.497637587441155e-05, + "loss": 0.6792, + "mean_token_accuracy": 0.7939387559890747, + "step": 1810 + }, + { + "epoch": 0.4238673517048108, + "grad_norm": 0.3985035611058136, + "learning_rate": 3.4894356413030675e-05, + "loss": 0.6845, + "mean_token_accuracy": 0.7926811814308167, + "step": 1815 + }, + { + "epoch": 0.425035030359645, + "grad_norm": 0.4153099489898849, + "learning_rate": 3.481222664765295e-05, + "loss": 0.6839, + "mean_token_accuracy": 0.7927857637405396, + "step": 1820 + }, + { + "epoch": 0.4262027090144792, + "grad_norm": 0.40739651617921374, + "learning_rate": 3.4729987803434514e-05, + "loss": 0.6934, + "mean_token_accuracy": 0.7898040890693665, + "step": 1825 + }, + { + "epoch": 0.4273703876693134, + "grad_norm": 0.46301143185621646, + "learning_rate": 3.464764110715871e-05, + "loss": 0.681, + "mean_token_accuracy": 0.79350346326828, + "step": 1830 + }, + { + "epoch": 0.4285380663241476, + "grad_norm": 0.4318784168557727, + "learning_rate": 3.456518778721775e-05, + "loss": 0.689, + "mean_token_accuracy": 0.7910579919815064, + "step": 1835 + }, + { + "epoch": 0.4297057449789818, + "grad_norm": 0.41027536080349863, + "learning_rate": 3.448262907359435e-05, + "loss": 0.679, + "mean_token_accuracy": 0.7946364521980286, + "step": 1840 + }, + { + "epoch": 0.430873423633816, + "grad_norm": 0.4050182004231391, + "learning_rate": 3.439996619784343e-05, + "loss": 0.6832, + "mean_token_accuracy": 0.7934371113777161, + "step": 1845 + }, + { + "epoch": 0.43204110228865017, + "grad_norm": 0.40170693940144653, + "learning_rate": 3.431720039307378e-05, + "loss": 0.6869, + "mean_token_accuracy": 0.7916916489601136, + "step": 1850 + }, + { + "epoch": 0.43320878094348436, + "grad_norm": 0.3931388373746608, + "learning_rate": 3.423433289392956e-05, + "loss": 0.6707, + "mean_token_accuracy": 0.7967695236206055, + "step": 1855 + }, + { + "epoch": 0.43437645959831855, + "grad_norm": 0.40483020784318546, + "learning_rate": 3.4151364936571934e-05, + "loss": 0.6858, + "mean_token_accuracy": 0.7928181648254394, + "step": 1860 + }, + { + "epoch": 0.43554413825315275, + "grad_norm": 0.36478863069964784, + "learning_rate": 3.4068297758660664e-05, + "loss": 0.6721, + "mean_token_accuracy": 0.7965542912483216, + "step": 1865 + }, + { + "epoch": 0.43671181690798694, + "grad_norm": 0.3839526719783043, + "learning_rate": 3.398513259933561e-05, + "loss": 0.686, + "mean_token_accuracy": 0.7919784545898437, + "step": 1870 + }, + { + "epoch": 0.43787949556282113, + "grad_norm": 0.4091620918073207, + "learning_rate": 3.3901870699198226e-05, + "loss": 0.6822, + "mean_token_accuracy": 0.793584406375885, + "step": 1875 + }, + { + "epoch": 0.4390471742176553, + "grad_norm": 0.3883080060204217, + "learning_rate": 3.381851330029314e-05, + "loss": 0.6563, + "mean_token_accuracy": 0.8008251786231995, + "step": 1880 + }, + { + "epoch": 0.4402148528724895, + "grad_norm": 0.39320719705252183, + "learning_rate": 3.373506164608948e-05, + "loss": 0.6923, + "mean_token_accuracy": 0.790829861164093, + "step": 1885 + }, + { + "epoch": 0.4413825315273237, + "grad_norm": 0.40294298303367143, + "learning_rate": 3.365151698146249e-05, + "loss": 0.6772, + "mean_token_accuracy": 0.7951627731323242, + "step": 1890 + }, + { + "epoch": 0.44255021018215784, + "grad_norm": 0.403729531665388, + "learning_rate": 3.3567880552674835e-05, + "loss": 0.6889, + "mean_token_accuracy": 0.7918317317962646, + "step": 1895 + }, + { + "epoch": 0.44371788883699204, + "grad_norm": 0.3819022516226057, + "learning_rate": 3.3484153607358075e-05, + "loss": 0.6757, + "mean_token_accuracy": 0.7951253414154053, + "step": 1900 + }, + { + "epoch": 0.44488556749182623, + "grad_norm": 0.39870138376832936, + "learning_rate": 3.340033739449403e-05, + "loss": 0.6794, + "mean_token_accuracy": 0.794717812538147, + "step": 1905 + }, + { + "epoch": 0.4460532461466604, + "grad_norm": 0.39523818540209393, + "learning_rate": 3.331643316439614e-05, + "loss": 0.6828, + "mean_token_accuracy": 0.793049693107605, + "step": 1910 + }, + { + "epoch": 0.4472209248014946, + "grad_norm": 0.38417691183881175, + "learning_rate": 3.323244216869084e-05, + "loss": 0.6881, + "mean_token_accuracy": 0.7920605659484863, + "step": 1915 + }, + { + "epoch": 0.4483886034563288, + "grad_norm": 0.40792374210755894, + "learning_rate": 3.314836566029888e-05, + "loss": 0.6748, + "mean_token_accuracy": 0.7949099779129029, + "step": 1920 + }, + { + "epoch": 0.449556282111163, + "grad_norm": 0.3925942956702455, + "learning_rate": 3.306420489341662e-05, + "loss": 0.6822, + "mean_token_accuracy": 0.7930144429206848, + "step": 1925 + }, + { + "epoch": 0.4507239607659972, + "grad_norm": 0.39212589972373063, + "learning_rate": 3.2979961123497315e-05, + "loss": 0.684, + "mean_token_accuracy": 0.7924970984458923, + "step": 1930 + }, + { + "epoch": 0.4518916394208314, + "grad_norm": 0.4255518983413449, + "learning_rate": 3.289563560723246e-05, + "loss": 0.6758, + "mean_token_accuracy": 0.7956908702850342, + "step": 1935 + }, + { + "epoch": 0.4530593180756656, + "grad_norm": 0.41723618211659086, + "learning_rate": 3.281122960253292e-05, + "loss": 0.6761, + "mean_token_accuracy": 0.7948375463485717, + "step": 1940 + }, + { + "epoch": 0.45422699673049977, + "grad_norm": 0.38780026642120474, + "learning_rate": 3.272674436851026e-05, + "loss": 0.6708, + "mean_token_accuracy": 0.7973459005355835, + "step": 1945 + }, + { + "epoch": 0.45539467538533396, + "grad_norm": 0.3875382727295917, + "learning_rate": 3.264218116545795e-05, + "loss": 0.6804, + "mean_token_accuracy": 0.7931478142738342, + "step": 1950 + }, + { + "epoch": 0.45656235404016815, + "grad_norm": 0.3905338176443901, + "learning_rate": 3.255754125483253e-05, + "loss": 0.6806, + "mean_token_accuracy": 0.7934584140777587, + "step": 1955 + }, + { + "epoch": 0.45773003269500234, + "grad_norm": 0.3844167831040847, + "learning_rate": 3.24728258992348e-05, + "loss": 0.6624, + "mean_token_accuracy": 0.7986980676651001, + "step": 1960 + }, + { + "epoch": 0.45889771134983653, + "grad_norm": 0.386804296525295, + "learning_rate": 3.238803636239103e-05, + "loss": 0.6798, + "mean_token_accuracy": 0.7941224694252014, + "step": 1965 + }, + { + "epoch": 0.4600653900046707, + "grad_norm": 0.40361626122726907, + "learning_rate": 3.2303173909134054e-05, + "loss": 0.6828, + "mean_token_accuracy": 0.7928823709487915, + "step": 1970 + }, + { + "epoch": 0.4612330686595049, + "grad_norm": 0.40687751319649823, + "learning_rate": 3.221823980538442e-05, + "loss": 0.6782, + "mean_token_accuracy": 0.7942167758941651, + "step": 1975 + }, + { + "epoch": 0.4624007473143391, + "grad_norm": 0.39126623981232456, + "learning_rate": 3.213323531813153e-05, + "loss": 0.6835, + "mean_token_accuracy": 0.7925355076789856, + "step": 1980 + }, + { + "epoch": 0.4635684259691733, + "grad_norm": 0.4106657734198936, + "learning_rate": 3.204816171541469e-05, + "loss": 0.6756, + "mean_token_accuracy": 0.7947211265563965, + "step": 1985 + }, + { + "epoch": 0.4647361046240075, + "grad_norm": 0.4095714694072843, + "learning_rate": 3.196302026630425e-05, + "loss": 0.6702, + "mean_token_accuracy": 0.7964576601982116, + "step": 1990 + }, + { + "epoch": 0.4659037832788417, + "grad_norm": 0.3953706043561268, + "learning_rate": 3.1877812240882646e-05, + "loss": 0.6757, + "mean_token_accuracy": 0.7942575693130494, + "step": 1995 + }, + { + "epoch": 0.4670714619336759, + "grad_norm": 0.3851763681421473, + "learning_rate": 3.179253891022542e-05, + "loss": 0.6806, + "mean_token_accuracy": 0.7938367128372192, + "step": 2000 + }, + { + "epoch": 0.46823914058851, + "grad_norm": 0.399948747547719, + "learning_rate": 3.170720154638234e-05, + "loss": 0.6807, + "mean_token_accuracy": 0.7936875104904175, + "step": 2005 + }, + { + "epoch": 0.4694068192433442, + "grad_norm": 0.40869044111431857, + "learning_rate": 3.1621801422358355e-05, + "loss": 0.6709, + "mean_token_accuracy": 0.7963006258010864, + "step": 2010 + }, + { + "epoch": 0.4705744978981784, + "grad_norm": 0.38149322201340513, + "learning_rate": 3.153633981209462e-05, + "loss": 0.669, + "mean_token_accuracy": 0.7964886784553528, + "step": 2015 + }, + { + "epoch": 0.4717421765530126, + "grad_norm": 0.39952941092922073, + "learning_rate": 3.1450817990449497e-05, + "loss": 0.6722, + "mean_token_accuracy": 0.7965138196945191, + "step": 2020 + }, + { + "epoch": 0.4729098552078468, + "grad_norm": 0.4243690821658399, + "learning_rate": 3.136523723317958e-05, + "loss": 0.6801, + "mean_token_accuracy": 0.7944878339767456, + "step": 2025 + }, + { + "epoch": 0.474077533862681, + "grad_norm": 0.39143938961989644, + "learning_rate": 3.127959881692058e-05, + "loss": 0.6769, + "mean_token_accuracy": 0.7950151681900024, + "step": 2030 + }, + { + "epoch": 0.47524521251751517, + "grad_norm": 0.39047631233184493, + "learning_rate": 3.119390401916834e-05, + "loss": 0.6634, + "mean_token_accuracy": 0.7983637809753418, + "step": 2035 + }, + { + "epoch": 0.47641289117234936, + "grad_norm": 0.38472832603584556, + "learning_rate": 3.1108154118259774e-05, + "loss": 0.659, + "mean_token_accuracy": 0.799714720249176, + "step": 2040 + }, + { + "epoch": 0.47758056982718355, + "grad_norm": 0.3947996686846603, + "learning_rate": 3.1022350393353786e-05, + "loss": 0.6819, + "mean_token_accuracy": 0.7930071473121643, + "step": 2045 + }, + { + "epoch": 0.47874824848201775, + "grad_norm": 0.41107093084136964, + "learning_rate": 3.093649412441217e-05, + "loss": 0.6888, + "mean_token_accuracy": 0.7907634019851685, + "step": 2050 + }, + { + "epoch": 0.47991592713685194, + "grad_norm": 0.3856486431842656, + "learning_rate": 3.0850586592180555e-05, + "loss": 0.679, + "mean_token_accuracy": 0.7932346105575562, + "step": 2055 + }, + { + "epoch": 0.48108360579168613, + "grad_norm": 0.3796935841092975, + "learning_rate": 3.076462907816928e-05, + "loss": 0.6794, + "mean_token_accuracy": 0.7937552332878113, + "step": 2060 + }, + { + "epoch": 0.4822512844465203, + "grad_norm": 0.383083167059971, + "learning_rate": 3.067862286463427e-05, + "loss": 0.6768, + "mean_token_accuracy": 0.7947343349456787, + "step": 2065 + }, + { + "epoch": 0.4834189631013545, + "grad_norm": 0.39583888033165066, + "learning_rate": 3.059256923455793e-05, + "loss": 0.6717, + "mean_token_accuracy": 0.7962270617485047, + "step": 2070 + }, + { + "epoch": 0.4845866417561887, + "grad_norm": 0.4060106716959265, + "learning_rate": 3.0506469471629976e-05, + "loss": 0.6746, + "mean_token_accuracy": 0.7952371597290039, + "step": 2075 + }, + { + "epoch": 0.4857543204110229, + "grad_norm": 0.3930580617528944, + "learning_rate": 3.0420324860228304e-05, + "loss": 0.6659, + "mean_token_accuracy": 0.7979523539543152, + "step": 2080 + }, + { + "epoch": 0.4869219990658571, + "grad_norm": 0.36749953201149543, + "learning_rate": 3.0334136685399857e-05, + "loss": 0.6748, + "mean_token_accuracy": 0.795464026927948, + "step": 2085 + }, + { + "epoch": 0.4880896777206913, + "grad_norm": 0.3965044081975742, + "learning_rate": 3.0247906232841384e-05, + "loss": 0.6549, + "mean_token_accuracy": 0.8003806233406067, + "step": 2090 + }, + { + "epoch": 0.4892573563755255, + "grad_norm": 0.38482924893571335, + "learning_rate": 3.016163478888034e-05, + "loss": 0.6698, + "mean_token_accuracy": 0.7958544492721558, + "step": 2095 + }, + { + "epoch": 0.49042503503035967, + "grad_norm": 0.38777411362743164, + "learning_rate": 3.0075323640455638e-05, + "loss": 0.6825, + "mean_token_accuracy": 0.7926191449165344, + "step": 2100 + }, + { + "epoch": 0.49159271368519386, + "grad_norm": 0.3858133988390046, + "learning_rate": 2.998897407509851e-05, + "loss": 0.6799, + "mean_token_accuracy": 0.7943918347358704, + "step": 2105 + }, + { + "epoch": 0.492760392340028, + "grad_norm": 0.39200123798109965, + "learning_rate": 2.9902587380913212e-05, + "loss": 0.6557, + "mean_token_accuracy": 0.8000911831855774, + "step": 2110 + }, + { + "epoch": 0.4939280709948622, + "grad_norm": 0.3773511934480941, + "learning_rate": 2.9816164846557903e-05, + "loss": 0.6828, + "mean_token_accuracy": 0.792800772190094, + "step": 2115 + }, + { + "epoch": 0.4950957496496964, + "grad_norm": 0.4083064297009315, + "learning_rate": 2.9729707761225383e-05, + "loss": 0.6739, + "mean_token_accuracy": 0.795106828212738, + "step": 2120 + }, + { + "epoch": 0.4962634283045306, + "grad_norm": 0.39969207812571533, + "learning_rate": 2.964321741462383e-05, + "loss": 0.6782, + "mean_token_accuracy": 0.794288718700409, + "step": 2125 + }, + { + "epoch": 0.49743110695936477, + "grad_norm": 0.3811595920632708, + "learning_rate": 2.9556695096957626e-05, + "loss": 0.6857, + "mean_token_accuracy": 0.791758918762207, + "step": 2130 + }, + { + "epoch": 0.49859878561419896, + "grad_norm": 0.40129475927775393, + "learning_rate": 2.9470142098908043e-05, + "loss": 0.6685, + "mean_token_accuracy": 0.7979156017303467, + "step": 2135 + }, + { + "epoch": 0.49976646426903315, + "grad_norm": 0.39114028446445537, + "learning_rate": 2.9383559711614045e-05, + "loss": 0.6582, + "mean_token_accuracy": 0.7996680259704589, + "step": 2140 + }, + { + "epoch": 0.5009341429238674, + "grad_norm": 0.38594327816555923, + "learning_rate": 2.9296949226653004e-05, + "loss": 0.6507, + "mean_token_accuracy": 0.8020126342773437, + "step": 2145 + }, + { + "epoch": 0.5021018215787015, + "grad_norm": 0.397852143933047, + "learning_rate": 2.9210311936021416e-05, + "loss": 0.6894, + "mean_token_accuracy": 0.790645432472229, + "step": 2150 + }, + { + "epoch": 0.5032695002335358, + "grad_norm": 0.3950490777176235, + "learning_rate": 2.9123649132115667e-05, + "loss": 0.6676, + "mean_token_accuracy": 0.797437596321106, + "step": 2155 + }, + { + "epoch": 0.5044371788883699, + "grad_norm": 0.39043202295885177, + "learning_rate": 2.9036962107712724e-05, + "loss": 0.6737, + "mean_token_accuracy": 0.795278263092041, + "step": 2160 + }, + { + "epoch": 0.5056048575432041, + "grad_norm": 0.3827269978768823, + "learning_rate": 2.8950252155950843e-05, + "loss": 0.6722, + "mean_token_accuracy": 0.7963788628578186, + "step": 2165 + }, + { + "epoch": 0.5067725361980383, + "grad_norm": 0.4025260843407993, + "learning_rate": 2.886352057031032e-05, + "loss": 0.6651, + "mean_token_accuracy": 0.7969090104103088, + "step": 2170 + }, + { + "epoch": 0.5079402148528724, + "grad_norm": 0.4011236833901522, + "learning_rate": 2.877676864459417e-05, + "loss": 0.6765, + "mean_token_accuracy": 0.7953397870063782, + "step": 2175 + }, + { + "epoch": 0.5091078935077067, + "grad_norm": 0.4435658167566233, + "learning_rate": 2.8689997672908804e-05, + "loss": 0.6655, + "mean_token_accuracy": 0.7974289059638977, + "step": 2180 + }, + { + "epoch": 0.5102755721625408, + "grad_norm": 0.3859021507998351, + "learning_rate": 2.8603208949644768e-05, + "loss": 0.6585, + "mean_token_accuracy": 0.7993681788444519, + "step": 2185 + }, + { + "epoch": 0.5114432508173751, + "grad_norm": 0.39154850392258655, + "learning_rate": 2.85164037694574e-05, + "loss": 0.6767, + "mean_token_accuracy": 0.7944085955619812, + "step": 2190 + }, + { + "epoch": 0.5126109294722092, + "grad_norm": 0.39936708782686586, + "learning_rate": 2.842958342724756e-05, + "loss": 0.6646, + "mean_token_accuracy": 0.7973796606063843, + "step": 2195 + }, + { + "epoch": 0.5137786081270435, + "grad_norm": 0.4162031994500093, + "learning_rate": 2.8342749218142232e-05, + "loss": 0.6681, + "mean_token_accuracy": 0.7967635989189148, + "step": 2200 + }, + { + "epoch": 0.5149462867818776, + "grad_norm": 0.4095664949401664, + "learning_rate": 2.8255902437475323e-05, + "loss": 0.6644, + "mean_token_accuracy": 0.7977075338363647, + "step": 2205 + }, + { + "epoch": 0.5161139654367118, + "grad_norm": 0.39041227393490224, + "learning_rate": 2.8169044380768216e-05, + "loss": 0.6846, + "mean_token_accuracy": 0.792940354347229, + "step": 2210 + }, + { + "epoch": 0.517281644091546, + "grad_norm": 0.3714465361577785, + "learning_rate": 2.808217634371053e-05, + "loss": 0.6582, + "mean_token_accuracy": 0.7992855072021484, + "step": 2215 + }, + { + "epoch": 0.5184493227463802, + "grad_norm": 0.3879507772590561, + "learning_rate": 2.7995299622140773e-05, + "loss": 0.665, + "mean_token_accuracy": 0.7973877191543579, + "step": 2220 + }, + { + "epoch": 0.5196170014012144, + "grad_norm": 0.37711112009818293, + "learning_rate": 2.7908415512026974e-05, + "loss": 0.6589, + "mean_token_accuracy": 0.7992047548294068, + "step": 2225 + }, + { + "epoch": 0.5207846800560486, + "grad_norm": 0.40098132802056774, + "learning_rate": 2.7821525309447403e-05, + "loss": 0.6607, + "mean_token_accuracy": 0.7991437911987305, + "step": 2230 + }, + { + "epoch": 0.5219523587108827, + "grad_norm": 0.37342214108859606, + "learning_rate": 2.7734630310571202e-05, + "loss": 0.6787, + "mean_token_accuracy": 0.7935737013816834, + "step": 2235 + }, + { + "epoch": 0.523120037365717, + "grad_norm": 0.3898292474235159, + "learning_rate": 2.764773181163907e-05, + "loss": 0.6675, + "mean_token_accuracy": 0.7977463841438294, + "step": 2240 + }, + { + "epoch": 0.5242877160205511, + "grad_norm": 0.38119821420345606, + "learning_rate": 2.7560831108943907e-05, + "loss": 0.6642, + "mean_token_accuracy": 0.7989084362983704, + "step": 2245 + }, + { + "epoch": 0.5254553946753854, + "grad_norm": 0.3787798191096755, + "learning_rate": 2.7473929498811496e-05, + "loss": 0.6679, + "mean_token_accuracy": 0.7965149402618408, + "step": 2250 + }, + { + "epoch": 0.5266230733302195, + "grad_norm": 0.39416611023257603, + "learning_rate": 2.738702827758115e-05, + "loss": 0.6774, + "mean_token_accuracy": 0.7939783453941345, + "step": 2255 + }, + { + "epoch": 0.5277907519850538, + "grad_norm": 0.39461726302066324, + "learning_rate": 2.7300128741586385e-05, + "loss": 0.6791, + "mean_token_accuracy": 0.7939274907112122, + "step": 2260 + }, + { + "epoch": 0.5289584306398879, + "grad_norm": 0.3986676742440667, + "learning_rate": 2.7213232187135575e-05, + "loss": 0.6635, + "mean_token_accuracy": 0.7978583693504333, + "step": 2265 + }, + { + "epoch": 0.5301261092947221, + "grad_norm": 0.3956173352915575, + "learning_rate": 2.7126339910492617e-05, + "loss": 0.6851, + "mean_token_accuracy": 0.7923621296882629, + "step": 2270 + }, + { + "epoch": 0.5312937879495563, + "grad_norm": 0.3903817778008115, + "learning_rate": 2.7039453207857595e-05, + "loss": 0.6801, + "mean_token_accuracy": 0.7939092755317688, + "step": 2275 + }, + { + "epoch": 0.5324614666043904, + "grad_norm": 0.3965730802392834, + "learning_rate": 2.695257337534744e-05, + "loss": 0.6804, + "mean_token_accuracy": 0.7934110164642334, + "step": 2280 + }, + { + "epoch": 0.5336291452592247, + "grad_norm": 0.3727904957482611, + "learning_rate": 2.686570170897662e-05, + "loss": 0.6806, + "mean_token_accuracy": 0.7929062724113465, + "step": 2285 + }, + { + "epoch": 0.5347968239140588, + "grad_norm": 0.37598048199616924, + "learning_rate": 2.677883950463776e-05, + "loss": 0.6547, + "mean_token_accuracy": 0.8006412625312805, + "step": 2290 + }, + { + "epoch": 0.535964502568893, + "grad_norm": 0.38833889105342206, + "learning_rate": 2.6691988058082345e-05, + "loss": 0.6857, + "mean_token_accuracy": 0.793193006515503, + "step": 2295 + }, + { + "epoch": 0.5371321812237272, + "grad_norm": 1.2510963610307277, + "learning_rate": 2.660514866490139e-05, + "loss": 0.6706, + "mean_token_accuracy": 0.7959818720817566, + "step": 2300 + }, + { + "epoch": 0.5382998598785614, + "grad_norm": 0.3989608785946315, + "learning_rate": 2.6518322620506098e-05, + "loss": 0.6681, + "mean_token_accuracy": 0.7970048904418945, + "step": 2305 + }, + { + "epoch": 0.5394675385333956, + "grad_norm": 0.3802985904821147, + "learning_rate": 2.643151122010854e-05, + "loss": 0.6622, + "mean_token_accuracy": 0.7991369962692261, + "step": 2310 + }, + { + "epoch": 0.5406352171882298, + "grad_norm": 0.395664874813112, + "learning_rate": 2.6344715758702356e-05, + "loss": 0.6648, + "mean_token_accuracy": 0.7975702285766602, + "step": 2315 + }, + { + "epoch": 0.541802895843064, + "grad_norm": 0.3942394369251023, + "learning_rate": 2.6257937531043392e-05, + "loss": 0.68, + "mean_token_accuracy": 0.7932106733322144, + "step": 2320 + }, + { + "epoch": 0.5429705744978982, + "grad_norm": 0.4012425579239663, + "learning_rate": 2.6171177831630444e-05, + "loss": 0.6871, + "mean_token_accuracy": 0.790957224369049, + "step": 2325 + }, + { + "epoch": 0.5441382531527323, + "grad_norm": 0.39564436103262335, + "learning_rate": 2.6084437954685887e-05, + "loss": 0.6513, + "mean_token_accuracy": 0.8017109155654907, + "step": 2330 + }, + { + "epoch": 0.5453059318075666, + "grad_norm": 0.40101367370104424, + "learning_rate": 2.5997719194136416e-05, + "loss": 0.6767, + "mean_token_accuracy": 0.7942777752876282, + "step": 2335 + }, + { + "epoch": 0.5464736104624007, + "grad_norm": 0.406854581261549, + "learning_rate": 2.591102284359371e-05, + "loss": 0.6543, + "mean_token_accuracy": 0.8002031445503235, + "step": 2340 + }, + { + "epoch": 0.547641289117235, + "grad_norm": 0.37819038401172, + "learning_rate": 2.582435019633518e-05, + "loss": 0.6612, + "mean_token_accuracy": 0.7984280347824096, + "step": 2345 + }, + { + "epoch": 0.5488089677720691, + "grad_norm": 0.40850144931576354, + "learning_rate": 2.573770254528462e-05, + "loss": 0.6687, + "mean_token_accuracy": 0.7967644572257996, + "step": 2350 + }, + { + "epoch": 0.5499766464269034, + "grad_norm": 0.39904878699893803, + "learning_rate": 2.5651081182992954e-05, + "loss": 0.6673, + "mean_token_accuracy": 0.7971315860748291, + "step": 2355 + }, + { + "epoch": 0.5511443250817375, + "grad_norm": 0.37993363491806803, + "learning_rate": 2.5564487401618958e-05, + "loss": 0.6674, + "mean_token_accuracy": 0.7966079831123352, + "step": 2360 + }, + { + "epoch": 0.5523120037365717, + "grad_norm": 0.40002006305564713, + "learning_rate": 2.5477922492909955e-05, + "loss": 0.6829, + "mean_token_accuracy": 0.79194655418396, + "step": 2365 + }, + { + "epoch": 0.5534796823914059, + "grad_norm": 0.40169074685134154, + "learning_rate": 2.5391387748182573e-05, + "loss": 0.6699, + "mean_token_accuracy": 0.7962153315544128, + "step": 2370 + }, + { + "epoch": 0.5546473610462401, + "grad_norm": 0.3806873663637588, + "learning_rate": 2.5304884458303495e-05, + "loss": 0.6643, + "mean_token_accuracy": 0.7983691930770874, + "step": 2375 + }, + { + "epoch": 0.5558150397010743, + "grad_norm": 0.3804813077314878, + "learning_rate": 2.5218413913670162e-05, + "loss": 0.6626, + "mean_token_accuracy": 0.7981161832809448, + "step": 2380 + }, + { + "epoch": 0.5569827183559084, + "grad_norm": 0.36718976979679036, + "learning_rate": 2.5131977404191542e-05, + "loss": 0.6569, + "mean_token_accuracy": 0.8000335693359375, + "step": 2385 + }, + { + "epoch": 0.5581503970107426, + "grad_norm": 0.3924482308299873, + "learning_rate": 2.5045576219268902e-05, + "loss": 0.6714, + "mean_token_accuracy": 0.7956712245941162, + "step": 2390 + }, + { + "epoch": 0.5593180756655768, + "grad_norm": 0.37508195238381914, + "learning_rate": 2.4959211647776537e-05, + "loss": 0.6525, + "mean_token_accuracy": 0.8008920073509216, + "step": 2395 + }, + { + "epoch": 0.560485754320411, + "grad_norm": 0.3825752189214151, + "learning_rate": 2.4872884978042595e-05, + "loss": 0.6658, + "mean_token_accuracy": 0.797624123096466, + "step": 2400 + }, + { + "epoch": 0.5616534329752452, + "grad_norm": 0.3856473155983679, + "learning_rate": 2.478659749782982e-05, + "loss": 0.6608, + "mean_token_accuracy": 0.7988722562789917, + "step": 2405 + }, + { + "epoch": 0.5628211116300794, + "grad_norm": 0.38253770959765526, + "learning_rate": 2.4700350494316354e-05, + "loss": 0.6688, + "mean_token_accuracy": 0.7962144017219543, + "step": 2410 + }, + { + "epoch": 0.5639887902849136, + "grad_norm": 0.3977597706057122, + "learning_rate": 2.4614145254076537e-05, + "loss": 0.6689, + "mean_token_accuracy": 0.7961449980735779, + "step": 2415 + }, + { + "epoch": 0.5651564689397478, + "grad_norm": 0.3820336543662093, + "learning_rate": 2.4527983063061704e-05, + "loss": 0.6734, + "mean_token_accuracy": 0.795576560497284, + "step": 2420 + }, + { + "epoch": 0.5663241475945819, + "grad_norm": 0.3819158177587632, + "learning_rate": 2.444186520658102e-05, + "loss": 0.6568, + "mean_token_accuracy": 0.7998837471008301, + "step": 2425 + }, + { + "epoch": 0.5674918262494162, + "grad_norm": 0.4109662034267226, + "learning_rate": 2.435579296928229e-05, + "loss": 0.6579, + "mean_token_accuracy": 0.7989148139953614, + "step": 2430 + }, + { + "epoch": 0.5686595049042503, + "grad_norm": 0.3821385731988524, + "learning_rate": 2.426976763513282e-05, + "loss": 0.662, + "mean_token_accuracy": 0.7981701970100403, + "step": 2435 + }, + { + "epoch": 0.5698271835590846, + "grad_norm": 0.377188757646439, + "learning_rate": 2.4183790487400237e-05, + "loss": 0.659, + "mean_token_accuracy": 0.7995378375053406, + "step": 2440 + }, + { + "epoch": 0.5709948622139187, + "grad_norm": 0.3843093931372197, + "learning_rate": 2.4097862808633343e-05, + "loss": 0.6595, + "mean_token_accuracy": 0.7992360472679139, + "step": 2445 + }, + { + "epoch": 0.572162540868753, + "grad_norm": 0.3863088292570427, + "learning_rate": 2.4011985880643e-05, + "loss": 0.6684, + "mean_token_accuracy": 0.7961150288581849, + "step": 2450 + }, + { + "epoch": 0.5733302195235871, + "grad_norm": 0.39198377236251924, + "learning_rate": 2.3926160984483022e-05, + "loss": 0.647, + "mean_token_accuracy": 0.8012050151824951, + "step": 2455 + }, + { + "epoch": 0.5744978981784213, + "grad_norm": 0.3999029068861176, + "learning_rate": 2.3840389400431017e-05, + "loss": 0.6782, + "mean_token_accuracy": 0.7940584063529968, + "step": 2460 + }, + { + "epoch": 0.5756655768332555, + "grad_norm": 0.3760255349698814, + "learning_rate": 2.375467240796937e-05, + "loss": 0.6474, + "mean_token_accuracy": 0.8021298289299011, + "step": 2465 + }, + { + "epoch": 0.5768332554880897, + "grad_norm": 0.38563041351905636, + "learning_rate": 2.3669011285766046e-05, + "loss": 0.6593, + "mean_token_accuracy": 0.7983677506446838, + "step": 2470 + }, + { + "epoch": 0.5780009341429239, + "grad_norm": 0.3694181522631338, + "learning_rate": 2.358340731165561e-05, + "loss": 0.6684, + "mean_token_accuracy": 0.7960250616073609, + "step": 2475 + }, + { + "epoch": 0.5791686127977581, + "grad_norm": 0.38148162472047337, + "learning_rate": 2.349786176262013e-05, + "loss": 0.6694, + "mean_token_accuracy": 0.7958851099014282, + "step": 2480 + }, + { + "epoch": 0.5803362914525922, + "grad_norm": 0.42312819257640844, + "learning_rate": 2.3412375914770112e-05, + "loss": 0.6534, + "mean_token_accuracy": 0.8013283371925354, + "step": 2485 + }, + { + "epoch": 0.5815039701074264, + "grad_norm": 0.4255679109629224, + "learning_rate": 2.3326951043325486e-05, + "loss": 0.6661, + "mean_token_accuracy": 0.7969426035881042, + "step": 2490 + }, + { + "epoch": 0.5826716487622606, + "grad_norm": 0.38998485908353636, + "learning_rate": 2.3241588422596587e-05, + "loss": 0.6612, + "mean_token_accuracy": 0.798487389087677, + "step": 2495 + }, + { + "epoch": 0.5838393274170948, + "grad_norm": 0.3918336155573359, + "learning_rate": 2.3156289325965143e-05, + "loss": 0.6558, + "mean_token_accuracy": 0.8002569437026977, + "step": 2500 + }, + { + "epoch": 0.585007006071929, + "grad_norm": 0.38057538484406456, + "learning_rate": 2.307105502586524e-05, + "loss": 0.6573, + "mean_token_accuracy": 0.7994654297828674, + "step": 2505 + }, + { + "epoch": 0.5861746847267632, + "grad_norm": 0.38590081420703304, + "learning_rate": 2.2985886793764405e-05, + "loss": 0.6743, + "mean_token_accuracy": 0.7945105552673339, + "step": 2510 + }, + { + "epoch": 0.5873423633815974, + "grad_norm": 0.38044374951170873, + "learning_rate": 2.2900785900144588e-05, + "loss": 0.6501, + "mean_token_accuracy": 0.8019159197807312, + "step": 2515 + }, + { + "epoch": 0.5885100420364315, + "grad_norm": 0.3987228535022481, + "learning_rate": 2.2815753614483237e-05, + "loss": 0.6476, + "mean_token_accuracy": 0.8025233149528503, + "step": 2520 + }, + { + "epoch": 0.5896777206912658, + "grad_norm": 0.3853477717970389, + "learning_rate": 2.2730791205234353e-05, + "loss": 0.6575, + "mean_token_accuracy": 0.7994131326675415, + "step": 2525 + }, + { + "epoch": 0.5908453993460999, + "grad_norm": 0.411383400475898, + "learning_rate": 2.2645899939809574e-05, + "loss": 0.6614, + "mean_token_accuracy": 0.7975597500801086, + "step": 2530 + }, + { + "epoch": 0.5920130780009342, + "grad_norm": 0.39844474574732414, + "learning_rate": 2.2561081084559243e-05, + "loss": 0.6774, + "mean_token_accuracy": 0.7939473390579224, + "step": 2535 + }, + { + "epoch": 0.5931807566557683, + "grad_norm": 0.3767945599452822, + "learning_rate": 2.247633590475356e-05, + "loss": 0.6559, + "mean_token_accuracy": 0.7998726487159729, + "step": 2540 + }, + { + "epoch": 0.5943484353106026, + "grad_norm": 0.3861122127684768, + "learning_rate": 2.2391665664563672e-05, + "loss": 0.6825, + "mean_token_accuracy": 0.7923056721687317, + "step": 2545 + }, + { + "epoch": 0.5955161139654367, + "grad_norm": 0.4102012417651354, + "learning_rate": 2.2307071627042823e-05, + "loss": 0.6653, + "mean_token_accuracy": 0.7968276858329773, + "step": 2550 + }, + { + "epoch": 0.5966837926202709, + "grad_norm": 0.3782186608819109, + "learning_rate": 2.2222555054107536e-05, + "loss": 0.6512, + "mean_token_accuracy": 0.8002317428588868, + "step": 2555 + }, + { + "epoch": 0.5978514712751051, + "grad_norm": 0.3811522448736815, + "learning_rate": 2.213811720651876e-05, + "loss": 0.65, + "mean_token_accuracy": 0.801992905139923, + "step": 2560 + }, + { + "epoch": 0.5990191499299393, + "grad_norm": 0.3826157781929433, + "learning_rate": 2.205375934386306e-05, + "loss": 0.6711, + "mean_token_accuracy": 0.7949237942695617, + "step": 2565 + }, + { + "epoch": 0.6001868285847735, + "grad_norm": 0.3676084119802985, + "learning_rate": 2.1969482724533858e-05, + "loss": 0.6655, + "mean_token_accuracy": 0.797294819355011, + "step": 2570 + }, + { + "epoch": 0.6013545072396077, + "grad_norm": 0.3758364748202847, + "learning_rate": 2.188528860571263e-05, + "loss": 0.6597, + "mean_token_accuracy": 0.7983564853668212, + "step": 2575 + }, + { + "epoch": 0.6025221858944418, + "grad_norm": 0.3731297463142096, + "learning_rate": 2.1801178243350174e-05, + "loss": 0.6717, + "mean_token_accuracy": 0.7961166858673095, + "step": 2580 + }, + { + "epoch": 0.6036898645492761, + "grad_norm": 0.3827597543625971, + "learning_rate": 2.171715289214786e-05, + "loss": 0.6624, + "mean_token_accuracy": 0.7986134648323059, + "step": 2585 + }, + { + "epoch": 0.6048575432041102, + "grad_norm": 0.3997867634894463, + "learning_rate": 2.1633213805538945e-05, + "loss": 0.6635, + "mean_token_accuracy": 0.7980273962020874, + "step": 2590 + }, + { + "epoch": 0.6060252218589445, + "grad_norm": 0.39712129201169716, + "learning_rate": 2.15493622356698e-05, + "loss": 0.6711, + "mean_token_accuracy": 0.7957921743392944, + "step": 2595 + }, + { + "epoch": 0.6071929005137786, + "grad_norm": 0.36529555469851793, + "learning_rate": 2.1465599433381318e-05, + "loss": 0.6555, + "mean_token_accuracy": 0.7996231198310852, + "step": 2600 + }, + { + "epoch": 0.6083605791686127, + "grad_norm": 0.3723833719585349, + "learning_rate": 2.1381926648190198e-05, + "loss": 0.6487, + "mean_token_accuracy": 0.802177345752716, + "step": 2605 + }, + { + "epoch": 0.609528257823447, + "grad_norm": 0.3645089856463834, + "learning_rate": 2.1298345128270352e-05, + "loss": 0.6546, + "mean_token_accuracy": 0.8001800179481506, + "step": 2610 + }, + { + "epoch": 0.6106959364782811, + "grad_norm": 0.40851910258690566, + "learning_rate": 2.1214856120434223e-05, + "loss": 0.6674, + "mean_token_accuracy": 0.7959051609039307, + "step": 2615 + }, + { + "epoch": 0.6118636151331154, + "grad_norm": 0.40282257001176175, + "learning_rate": 2.113146087011425e-05, + "loss": 0.6634, + "mean_token_accuracy": 0.7979746460914612, + "step": 2620 + }, + { + "epoch": 0.6130312937879495, + "grad_norm": 0.3748773248577665, + "learning_rate": 2.1048160621344244e-05, + "loss": 0.6598, + "mean_token_accuracy": 0.7981600522994995, + "step": 2625 + }, + { + "epoch": 0.6141989724427838, + "grad_norm": 0.38571072254446964, + "learning_rate": 2.0964956616740857e-05, + "loss": 0.6539, + "mean_token_accuracy": 0.7994945645332336, + "step": 2630 + }, + { + "epoch": 0.6153666510976179, + "grad_norm": 0.4036395873311437, + "learning_rate": 2.0881850097485012e-05, + "loss": 0.6669, + "mean_token_accuracy": 0.7969999432563781, + "step": 2635 + }, + { + "epoch": 0.6165343297524521, + "grad_norm": 0.43304546379063247, + "learning_rate": 2.079884230330346e-05, + "loss": 0.6589, + "mean_token_accuracy": 0.7992241263389588, + "step": 2640 + }, + { + "epoch": 0.6177020084072863, + "grad_norm": 0.37772771955693135, + "learning_rate": 2.07159344724502e-05, + "loss": 0.647, + "mean_token_accuracy": 0.8021301746368408, + "step": 2645 + }, + { + "epoch": 0.6188696870621205, + "grad_norm": 0.4008087173427264, + "learning_rate": 2.063312784168805e-05, + "loss": 0.6702, + "mean_token_accuracy": 0.7959200382232666, + "step": 2650 + }, + { + "epoch": 0.6200373657169547, + "grad_norm": 0.387080371632132, + "learning_rate": 2.0550423646270206e-05, + "loss": 0.6631, + "mean_token_accuracy": 0.7976379990577698, + "step": 2655 + }, + { + "epoch": 0.6212050443717889, + "grad_norm": 0.38458477316560813, + "learning_rate": 2.0467823119921787e-05, + "loss": 0.6686, + "mean_token_accuracy": 0.7961486577987671, + "step": 2660 + }, + { + "epoch": 0.622372723026623, + "grad_norm": 0.3852310732662969, + "learning_rate": 2.038532749482146e-05, + "loss": 0.663, + "mean_token_accuracy": 0.7982524037361145, + "step": 2665 + }, + { + "epoch": 0.6235404016814573, + "grad_norm": 0.37947550796375445, + "learning_rate": 2.030293800158305e-05, + "loss": 0.6419, + "mean_token_accuracy": 0.8028964877128602, + "step": 2670 + }, + { + "epoch": 0.6247080803362914, + "grad_norm": 0.38193179452668885, + "learning_rate": 2.0220655869237165e-05, + "loss": 0.6513, + "mean_token_accuracy": 0.8001656055450439, + "step": 2675 + }, + { + "epoch": 0.6258757589911257, + "grad_norm": 0.391899409271097, + "learning_rate": 2.0138482325212877e-05, + "loss": 0.6569, + "mean_token_accuracy": 0.7988639235496521, + "step": 2680 + }, + { + "epoch": 0.6270434376459598, + "grad_norm": 0.38735951808618496, + "learning_rate": 2.0056418595319427e-05, + "loss": 0.6549, + "mean_token_accuracy": 0.8003427267074585, + "step": 2685 + }, + { + "epoch": 0.6282111163007941, + "grad_norm": 0.3819003636798128, + "learning_rate": 1.9974465903727906e-05, + "loss": 0.6567, + "mean_token_accuracy": 0.7997593641281128, + "step": 2690 + }, + { + "epoch": 0.6293787949556282, + "grad_norm": 0.3869898494196985, + "learning_rate": 1.9892625472953008e-05, + "loss": 0.654, + "mean_token_accuracy": 0.8004847049713135, + "step": 2695 + }, + { + "epoch": 0.6305464736104625, + "grad_norm": 0.3778592848937282, + "learning_rate": 1.9810898523834826e-05, + "loss": 0.6553, + "mean_token_accuracy": 0.7997747421264648, + "step": 2700 + }, + { + "epoch": 0.6317141522652966, + "grad_norm": 0.3837523681375621, + "learning_rate": 1.972928627552058e-05, + "loss": 0.6534, + "mean_token_accuracy": 0.8005034685134887, + "step": 2705 + }, + { + "epoch": 0.6328818309201307, + "grad_norm": 0.37512050860108664, + "learning_rate": 1.9647789945446475e-05, + "loss": 0.6535, + "mean_token_accuracy": 0.7993323326110839, + "step": 2710 + }, + { + "epoch": 0.634049509574965, + "grad_norm": 0.3835897668056868, + "learning_rate": 1.9566410749319515e-05, + "loss": 0.6514, + "mean_token_accuracy": 0.8006104469299317, + "step": 2715 + }, + { + "epoch": 0.6352171882297991, + "grad_norm": 0.3695002272856091, + "learning_rate": 1.948514990109939e-05, + "loss": 0.6684, + "mean_token_accuracy": 0.7959200143814087, + "step": 2720 + }, + { + "epoch": 0.6363848668846334, + "grad_norm": 0.37960960442256886, + "learning_rate": 1.9404008612980347e-05, + "loss": 0.6463, + "mean_token_accuracy": 0.801915454864502, + "step": 2725 + }, + { + "epoch": 0.6375525455394675, + "grad_norm": 0.368485075069303, + "learning_rate": 1.9322988095373133e-05, + "loss": 0.6551, + "mean_token_accuracy": 0.8000059247016906, + "step": 2730 + }, + { + "epoch": 0.6387202241943017, + "grad_norm": 0.3786842019624189, + "learning_rate": 1.9242089556886917e-05, + "loss": 0.6612, + "mean_token_accuracy": 0.798745584487915, + "step": 2735 + }, + { + "epoch": 0.6398879028491359, + "grad_norm": 0.3695349977894561, + "learning_rate": 1.9161314204311242e-05, + "loss": 0.6593, + "mean_token_accuracy": 0.7988736033439636, + "step": 2740 + }, + { + "epoch": 0.6410555815039701, + "grad_norm": 0.3807857604718515, + "learning_rate": 1.908066324259808e-05, + "loss": 0.6588, + "mean_token_accuracy": 0.7984455585479736, + "step": 2745 + }, + { + "epoch": 0.6422232601588043, + "grad_norm": 0.3911666646790534, + "learning_rate": 1.9000137874843803e-05, + "loss": 0.657, + "mean_token_accuracy": 0.7995599150657654, + "step": 2750 + }, + { + "epoch": 0.6433909388136385, + "grad_norm": 0.39213958972667373, + "learning_rate": 1.8919739302271267e-05, + "loss": 0.6455, + "mean_token_accuracy": 0.8023225426673889, + "step": 2755 + }, + { + "epoch": 0.6445586174684726, + "grad_norm": 0.3832691800155978, + "learning_rate": 1.883946872421189e-05, + "loss": 0.6524, + "mean_token_accuracy": 0.8006999731063843, + "step": 2760 + }, + { + "epoch": 0.6457262961233069, + "grad_norm": 0.3795347775431126, + "learning_rate": 1.8759327338087735e-05, + "loss": 0.6618, + "mean_token_accuracy": 0.7985408186912537, + "step": 2765 + }, + { + "epoch": 0.646893974778141, + "grad_norm": 0.3746052441311914, + "learning_rate": 1.8679316339393677e-05, + "loss": 0.6507, + "mean_token_accuracy": 0.8010080099105835, + "step": 2770 + }, + { + "epoch": 0.6480616534329753, + "grad_norm": 0.36868261829288146, + "learning_rate": 1.859943692167957e-05, + "loss": 0.6549, + "mean_token_accuracy": 0.7999223947525025, + "step": 2775 + }, + { + "epoch": 0.6492293320878094, + "grad_norm": 0.3684599305009906, + "learning_rate": 1.8519690276532416e-05, + "loss": 0.6577, + "mean_token_accuracy": 0.7998315095901489, + "step": 2780 + }, + { + "epoch": 0.6503970107426437, + "grad_norm": 0.3953716444604819, + "learning_rate": 1.8440077593558602e-05, + "loss": 0.6574, + "mean_token_accuracy": 0.7995441913604736, + "step": 2785 + }, + { + "epoch": 0.6515646893974778, + "grad_norm": 0.3783877229144065, + "learning_rate": 1.8360600060366197e-05, + "loss": 0.6668, + "mean_token_accuracy": 0.7966958522796631, + "step": 2790 + }, + { + "epoch": 0.652732368052312, + "grad_norm": 0.39004659409960457, + "learning_rate": 1.828125886254715e-05, + "loss": 0.6587, + "mean_token_accuracy": 0.7985253810882569, + "step": 2795 + }, + { + "epoch": 0.6539000467071462, + "grad_norm": 0.36768492114055695, + "learning_rate": 1.8202055183659677e-05, + "loss": 0.6552, + "mean_token_accuracy": 0.8008711099624634, + "step": 2800 + }, + { + "epoch": 0.6550677253619804, + "grad_norm": 0.37698809062259425, + "learning_rate": 1.8122990205210572e-05, + "loss": 0.6652, + "mean_token_accuracy": 0.7970464348793029, + "step": 2805 + }, + { + "epoch": 0.6562354040168146, + "grad_norm": 0.3926446830311605, + "learning_rate": 1.8044065106637585e-05, + "loss": 0.6569, + "mean_token_accuracy": 0.7995175361633301, + "step": 2810 + }, + { + "epoch": 0.6574030826716487, + "grad_norm": 0.3884566523960821, + "learning_rate": 1.7965281065291838e-05, + "loss": 0.6531, + "mean_token_accuracy": 0.799417507648468, + "step": 2815 + }, + { + "epoch": 0.658570761326483, + "grad_norm": 0.37748897348406885, + "learning_rate": 1.7886639256420267e-05, + "loss": 0.6416, + "mean_token_accuracy": 0.8030337929725647, + "step": 2820 + }, + { + "epoch": 0.6597384399813171, + "grad_norm": 0.3629083444690745, + "learning_rate": 1.780814085314807e-05, + "loss": 0.6634, + "mean_token_accuracy": 0.7977676868438721, + "step": 2825 + }, + { + "epoch": 0.6609061186361513, + "grad_norm": 0.38944957970094385, + "learning_rate": 1.772978702646122e-05, + "loss": 0.6556, + "mean_token_accuracy": 0.8002457976341247, + "step": 2830 + }, + { + "epoch": 0.6620737972909855, + "grad_norm": 0.3807367680127489, + "learning_rate": 1.7651578945188984e-05, + "loss": 0.6581, + "mean_token_accuracy": 0.7991437435150146, + "step": 2835 + }, + { + "epoch": 0.6632414759458197, + "grad_norm": 0.3950466878040155, + "learning_rate": 1.7573517775986514e-05, + "loss": 0.6533, + "mean_token_accuracy": 0.799791157245636, + "step": 2840 + }, + { + "epoch": 0.6644091546006539, + "grad_norm": 0.38426184641565697, + "learning_rate": 1.7495604683317406e-05, + "loss": 0.6605, + "mean_token_accuracy": 0.7978045463562011, + "step": 2845 + }, + { + "epoch": 0.6655768332554881, + "grad_norm": 0.39252101919934435, + "learning_rate": 1.7417840829436387e-05, + "loss": 0.6537, + "mean_token_accuracy": 0.7998825311660767, + "step": 2850 + }, + { + "epoch": 0.6667445119103222, + "grad_norm": 0.37326307854831325, + "learning_rate": 1.7340227374371914e-05, + "loss": 0.6639, + "mean_token_accuracy": 0.7980046033859253, + "step": 2855 + }, + { + "epoch": 0.6679121905651565, + "grad_norm": 0.390977049996798, + "learning_rate": 1.726276547590889e-05, + "loss": 0.665, + "mean_token_accuracy": 0.7969791054725647, + "step": 2860 + }, + { + "epoch": 0.6690798692199906, + "grad_norm": 0.39534841393244546, + "learning_rate": 1.7185456289571416e-05, + "loss": 0.6504, + "mean_token_accuracy": 0.8005916237831116, + "step": 2865 + }, + { + "epoch": 0.6702475478748249, + "grad_norm": 0.41290321984245815, + "learning_rate": 1.7108300968605537e-05, + "loss": 0.6554, + "mean_token_accuracy": 0.800058650970459, + "step": 2870 + }, + { + "epoch": 0.671415226529659, + "grad_norm": 0.37052508407741974, + "learning_rate": 1.7031300663962023e-05, + "loss": 0.6497, + "mean_token_accuracy": 0.8015452980995178, + "step": 2875 + }, + { + "epoch": 0.6725829051844933, + "grad_norm": 0.3891123318682196, + "learning_rate": 1.6954456524279233e-05, + "loss": 0.6645, + "mean_token_accuracy": 0.796503484249115, + "step": 2880 + }, + { + "epoch": 0.6737505838393274, + "grad_norm": 0.37051747281712377, + "learning_rate": 1.6877769695865953e-05, + "loss": 0.6471, + "mean_token_accuracy": 0.8017045021057129, + "step": 2885 + }, + { + "epoch": 0.6749182624941616, + "grad_norm": 0.3836989948070399, + "learning_rate": 1.680124132268431e-05, + "loss": 0.6491, + "mean_token_accuracy": 0.8009914517402649, + "step": 2890 + }, + { + "epoch": 0.6760859411489958, + "grad_norm": 0.37526435417909715, + "learning_rate": 1.6724872546332697e-05, + "loss": 0.6511, + "mean_token_accuracy": 0.8005789041519165, + "step": 2895 + }, + { + "epoch": 0.67725361980383, + "grad_norm": 0.36719200431237475, + "learning_rate": 1.6648664506028762e-05, + "loss": 0.6395, + "mean_token_accuracy": 0.8038304686546326, + "step": 2900 + }, + { + "epoch": 0.6784212984586642, + "grad_norm": 0.37255525027645936, + "learning_rate": 1.657261833859238e-05, + "loss": 0.6561, + "mean_token_accuracy": 0.7990904331207276, + "step": 2905 + }, + { + "epoch": 0.6795889771134984, + "grad_norm": 0.3841368338189396, + "learning_rate": 1.6496735178428754e-05, + "loss": 0.6502, + "mean_token_accuracy": 0.8011838793754578, + "step": 2910 + }, + { + "epoch": 0.6807566557683326, + "grad_norm": 0.393134880719221, + "learning_rate": 1.642101615751142e-05, + "loss": 0.6531, + "mean_token_accuracy": 0.8012020349502563, + "step": 2915 + }, + { + "epoch": 0.6819243344231667, + "grad_norm": 0.37450951503194646, + "learning_rate": 1.6345462405365402e-05, + "loss": 0.6501, + "mean_token_accuracy": 0.801072645187378, + "step": 2920 + }, + { + "epoch": 0.6830920130780009, + "grad_norm": 0.374188211422769, + "learning_rate": 1.627007504905037e-05, + "loss": 0.6439, + "mean_token_accuracy": 0.8032167911529541, + "step": 2925 + }, + { + "epoch": 0.6842596917328351, + "grad_norm": 0.37831161846784017, + "learning_rate": 1.6194855213143807e-05, + "loss": 0.6484, + "mean_token_accuracy": 0.8009993553161621, + "step": 2930 + }, + { + "epoch": 0.6854273703876693, + "grad_norm": 0.37196751054185123, + "learning_rate": 1.611980401972422e-05, + "loss": 0.6423, + "mean_token_accuracy": 0.802980613708496, + "step": 2935 + }, + { + "epoch": 0.6865950490425035, + "grad_norm": 0.40775639369569106, + "learning_rate": 1.604492258835447e-05, + "loss": 0.6614, + "mean_token_accuracy": 0.7978637337684631, + "step": 2940 + }, + { + "epoch": 0.6877627276973377, + "grad_norm": 0.36662494351255903, + "learning_rate": 1.5970212036064973e-05, + "loss": 0.658, + "mean_token_accuracy": 0.7992728471755981, + "step": 2945 + }, + { + "epoch": 0.6889304063521718, + "grad_norm": 0.38496978007716887, + "learning_rate": 1.589567347733712e-05, + "loss": 0.658, + "mean_token_accuracy": 0.7988088250160217, + "step": 2950 + }, + { + "epoch": 0.6900980850070061, + "grad_norm": 0.3845461880390263, + "learning_rate": 1.5821308024086594e-05, + "loss": 0.6493, + "mean_token_accuracy": 0.8010260343551636, + "step": 2955 + }, + { + "epoch": 0.6912657636618402, + "grad_norm": 0.3867346422525274, + "learning_rate": 1.5747116785646827e-05, + "loss": 0.6407, + "mean_token_accuracy": 0.8042088747024536, + "step": 2960 + }, + { + "epoch": 0.6924334423166745, + "grad_norm": 0.3892914528793639, + "learning_rate": 1.5673100868752414e-05, + "loss": 0.6575, + "mean_token_accuracy": 0.7988802671432496, + "step": 2965 + }, + { + "epoch": 0.6936011209715086, + "grad_norm": 0.371495262139535, + "learning_rate": 1.559926137752265e-05, + "loss": 0.6611, + "mean_token_accuracy": 0.7979819178581238, + "step": 2970 + }, + { + "epoch": 0.6947687996263429, + "grad_norm": 0.38018365696870615, + "learning_rate": 1.5525599413445008e-05, + "loss": 0.6525, + "mean_token_accuracy": 0.8005258440971375, + "step": 2975 + }, + { + "epoch": 0.695936478281177, + "grad_norm": 0.39328281912684904, + "learning_rate": 1.5452116075358737e-05, + "loss": 0.6619, + "mean_token_accuracy": 0.7984437584877014, + "step": 2980 + }, + { + "epoch": 0.6971041569360112, + "grad_norm": 0.38422158941222695, + "learning_rate": 1.5378812459438465e-05, + "loss": 0.6649, + "mean_token_accuracy": 0.7973509311676026, + "step": 2985 + }, + { + "epoch": 0.6982718355908454, + "grad_norm": 0.3738884686657843, + "learning_rate": 1.530568965917785e-05, + "loss": 0.6539, + "mean_token_accuracy": 0.800170624256134, + "step": 2990 + }, + { + "epoch": 0.6994395142456796, + "grad_norm": 0.3813266689749819, + "learning_rate": 1.5232748765373252e-05, + "loss": 0.6533, + "mean_token_accuracy": 0.7995292544364929, + "step": 2995 + }, + { + "epoch": 0.7006071929005138, + "grad_norm": 0.38271935337315144, + "learning_rate": 1.5159990866107499e-05, + "loss": 0.6402, + "mean_token_accuracy": 0.8041568160057068, + "step": 3000 + }, + { + "epoch": 0.701774871555348, + "grad_norm": 0.36947396800353793, + "learning_rate": 1.5087417046733602e-05, + "loss": 0.6535, + "mean_token_accuracy": 0.7999855399131774, + "step": 3005 + }, + { + "epoch": 0.7029425502101821, + "grad_norm": 0.3835716556912879, + "learning_rate": 1.5015028389858609e-05, + "loss": 0.6494, + "mean_token_accuracy": 0.8019704461097718, + "step": 3010 + }, + { + "epoch": 0.7041102288650164, + "grad_norm": 0.38170465735432796, + "learning_rate": 1.494282597532744e-05, + "loss": 0.6565, + "mean_token_accuracy": 0.7988895773887634, + "step": 3015 + }, + { + "epoch": 0.7052779075198505, + "grad_norm": 0.37624531281159473, + "learning_rate": 1.487081088020677e-05, + "loss": 0.671, + "mean_token_accuracy": 0.795062518119812, + "step": 3020 + }, + { + "epoch": 0.7064455861746848, + "grad_norm": 0.39019695842445207, + "learning_rate": 1.4798984178768971e-05, + "loss": 0.6483, + "mean_token_accuracy": 0.8010831713676453, + "step": 3025 + }, + { + "epoch": 0.7076132648295189, + "grad_norm": 0.3748104690543524, + "learning_rate": 1.4727346942476101e-05, + "loss": 0.6426, + "mean_token_accuracy": 0.8030072450637817, + "step": 3030 + }, + { + "epoch": 0.708780943484353, + "grad_norm": 0.37640934124295516, + "learning_rate": 1.4655900239963888e-05, + "loss": 0.6531, + "mean_token_accuracy": 0.7999082922935485, + "step": 3035 + }, + { + "epoch": 0.7099486221391873, + "grad_norm": 0.3685628273977279, + "learning_rate": 1.4584645137025805e-05, + "loss": 0.6421, + "mean_token_accuracy": 0.8034268379211426, + "step": 3040 + }, + { + "epoch": 0.7111163007940214, + "grad_norm": 0.3697296331271997, + "learning_rate": 1.4513582696597186e-05, + "loss": 0.6493, + "mean_token_accuracy": 0.8020463228225708, + "step": 3045 + }, + { + "epoch": 0.7122839794488557, + "grad_norm": 0.37052119376997816, + "learning_rate": 1.4442713978739347e-05, + "loss": 0.6645, + "mean_token_accuracy": 0.7975929260253907, + "step": 3050 + }, + { + "epoch": 0.7134516581036898, + "grad_norm": 0.37857325346561405, + "learning_rate": 1.4372040040623776e-05, + "loss": 0.6558, + "mean_token_accuracy": 0.7994943141937256, + "step": 3055 + }, + { + "epoch": 0.7146193367585241, + "grad_norm": 0.3833794512344288, + "learning_rate": 1.4301561936516395e-05, + "loss": 0.6504, + "mean_token_accuracy": 0.8009833574295044, + "step": 3060 + }, + { + "epoch": 0.7157870154133582, + "grad_norm": 0.38110314318687005, + "learning_rate": 1.4231280717761778e-05, + "loss": 0.6525, + "mean_token_accuracy": 0.7999861717224122, + "step": 3065 + }, + { + "epoch": 0.7169546940681925, + "grad_norm": 0.3735254091900343, + "learning_rate": 1.416119743276751e-05, + "loss": 0.6435, + "mean_token_accuracy": 0.802791154384613, + "step": 3070 + }, + { + "epoch": 0.7181223727230266, + "grad_norm": 0.37162809243948136, + "learning_rate": 1.4091313126988529e-05, + "loss": 0.6423, + "mean_token_accuracy": 0.8036653518676757, + "step": 3075 + }, + { + "epoch": 0.7192900513778608, + "grad_norm": 0.39057705983112834, + "learning_rate": 1.4021628842911544e-05, + "loss": 0.6521, + "mean_token_accuracy": 0.8002265572547913, + "step": 3080 + }, + { + "epoch": 0.720457730032695, + "grad_norm": 0.3808493448345631, + "learning_rate": 1.3952145620039464e-05, + "loss": 0.6574, + "mean_token_accuracy": 0.7988303661346435, + "step": 3085 + }, + { + "epoch": 0.7216254086875292, + "grad_norm": 0.37524483042858947, + "learning_rate": 1.3882864494875918e-05, + "loss": 0.6497, + "mean_token_accuracy": 0.8013214588165283, + "step": 3090 + }, + { + "epoch": 0.7227930873423634, + "grad_norm": 0.3719467728492709, + "learning_rate": 1.3813786500909767e-05, + "loss": 0.6468, + "mean_token_accuracy": 0.8023304224014283, + "step": 3095 + }, + { + "epoch": 0.7239607659971976, + "grad_norm": 0.38004620956274954, + "learning_rate": 1.3744912668599703e-05, + "loss": 0.6522, + "mean_token_accuracy": 0.8002031087875366, + "step": 3100 + }, + { + "epoch": 0.7251284446520317, + "grad_norm": 0.3643447159709163, + "learning_rate": 1.3676244025358867e-05, + "loss": 0.646, + "mean_token_accuracy": 0.8021530151367188, + "step": 3105 + }, + { + "epoch": 0.726296123306866, + "grad_norm": 0.4135902488768389, + "learning_rate": 1.3607781595539531e-05, + "loss": 0.6506, + "mean_token_accuracy": 0.8008216857910156, + "step": 3110 + }, + { + "epoch": 0.7274638019617001, + "grad_norm": 0.3952338503414328, + "learning_rate": 1.3539526400417835e-05, + "loss": 0.6476, + "mean_token_accuracy": 0.8010394215583801, + "step": 3115 + }, + { + "epoch": 0.7286314806165344, + "grad_norm": 0.37572483236167004, + "learning_rate": 1.3471479458178499e-05, + "loss": 0.6335, + "mean_token_accuracy": 0.8057471871376037, + "step": 3120 + }, + { + "epoch": 0.7297991592713685, + "grad_norm": 0.3909882723756494, + "learning_rate": 1.3403641783899687e-05, + "loss": 0.6291, + "mean_token_accuracy": 0.8065895080566406, + "step": 3125 + }, + { + "epoch": 0.7309668379262028, + "grad_norm": 0.3866004464421546, + "learning_rate": 1.3336014389537843e-05, + "loss": 0.6543, + "mean_token_accuracy": 0.8001023054122924, + "step": 3130 + }, + { + "epoch": 0.7321345165810369, + "grad_norm": 0.37635945397107956, + "learning_rate": 1.3268598283912598e-05, + "loss": 0.6466, + "mean_token_accuracy": 0.801537036895752, + "step": 3135 + }, + { + "epoch": 0.733302195235871, + "grad_norm": 0.38122541277008287, + "learning_rate": 1.3201394472691702e-05, + "loss": 0.6599, + "mean_token_accuracy": 0.7979891896247864, + "step": 3140 + }, + { + "epoch": 0.7344698738907053, + "grad_norm": 0.3781128505362795, + "learning_rate": 1.313440395837609e-05, + "loss": 0.6569, + "mean_token_accuracy": 0.8005757093429565, + "step": 3145 + }, + { + "epoch": 0.7356375525455394, + "grad_norm": 0.3999916030894223, + "learning_rate": 1.3067627740284821e-05, + "loss": 0.6476, + "mean_token_accuracy": 0.8019585251808167, + "step": 3150 + }, + { + "epoch": 0.7368052312003737, + "grad_norm": 0.37316370820848577, + "learning_rate": 1.3001066814540269e-05, + "loss": 0.6465, + "mean_token_accuracy": 0.8019713521003723, + "step": 3155 + }, + { + "epoch": 0.7379729098552078, + "grad_norm": 0.3787332241459694, + "learning_rate": 1.2934722174053196e-05, + "loss": 0.6549, + "mean_token_accuracy": 0.7989971518516541, + "step": 3160 + }, + { + "epoch": 0.739140588510042, + "grad_norm": 0.3962890582248418, + "learning_rate": 1.2868594808507984e-05, + "loss": 0.6577, + "mean_token_accuracy": 0.7986351370811462, + "step": 3165 + }, + { + "epoch": 0.7403082671648762, + "grad_norm": 0.3923927050579654, + "learning_rate": 1.280268570434784e-05, + "loss": 0.6471, + "mean_token_accuracy": 0.8014679551124573, + "step": 3170 + }, + { + "epoch": 0.7414759458197104, + "grad_norm": 0.37821196328370077, + "learning_rate": 1.2736995844760122e-05, + "loss": 0.6428, + "mean_token_accuracy": 0.8029387712478637, + "step": 3175 + }, + { + "epoch": 0.7426436244745446, + "grad_norm": 0.41062307616179666, + "learning_rate": 1.2671526209661622e-05, + "loss": 0.6578, + "mean_token_accuracy": 0.7991038799285889, + "step": 3180 + }, + { + "epoch": 0.7438113031293788, + "grad_norm": 0.3800852357343656, + "learning_rate": 1.2606277775683973e-05, + "loss": 0.6358, + "mean_token_accuracy": 0.8052282333374023, + "step": 3185 + }, + { + "epoch": 0.744978981784213, + "grad_norm": 0.3774500810775569, + "learning_rate": 1.2541251516159097e-05, + "loss": 0.6513, + "mean_token_accuracy": 0.8008481979370117, + "step": 3190 + }, + { + "epoch": 0.7461466604390472, + "grad_norm": 0.36700371808035454, + "learning_rate": 1.2476448401104647e-05, + "loss": 0.6549, + "mean_token_accuracy": 0.7990518927574157, + "step": 3195 + }, + { + "epoch": 0.7473143390938813, + "grad_norm": 0.35906624732320686, + "learning_rate": 1.241186939720956e-05, + "loss": 0.6309, + "mean_token_accuracy": 0.806613278388977, + "step": 3200 + }, + { + "epoch": 0.7484820177487156, + "grad_norm": 0.3723888916003376, + "learning_rate": 1.2347515467819666e-05, + "loss": 0.6537, + "mean_token_accuracy": 0.8000619173049927, + "step": 3205 + }, + { + "epoch": 0.7496496964035497, + "grad_norm": 0.37431462575701124, + "learning_rate": 1.2283387572923248e-05, + "loss": 0.6511, + "mean_token_accuracy": 0.801169490814209, + "step": 3210 + }, + { + "epoch": 0.750817375058384, + "grad_norm": 0.3754316963979503, + "learning_rate": 1.2219486669136774e-05, + "loss": 0.654, + "mean_token_accuracy": 0.7997126698493957, + "step": 3215 + }, + { + "epoch": 0.7519850537132181, + "grad_norm": 0.3881362845027334, + "learning_rate": 1.2155813709690621e-05, + "loss": 0.642, + "mean_token_accuracy": 0.803035032749176, + "step": 3220 + }, + { + "epoch": 0.7531527323680524, + "grad_norm": 0.38156990956682096, + "learning_rate": 1.209236964441483e-05, + "loss": 0.6554, + "mean_token_accuracy": 0.799255621433258, + "step": 3225 + }, + { + "epoch": 0.7543204110228865, + "grad_norm": 0.36622017998230666, + "learning_rate": 1.2029155419724961e-05, + "loss": 0.6414, + "mean_token_accuracy": 0.8037328243255615, + "step": 3230 + }, + { + "epoch": 0.7554880896777207, + "grad_norm": 0.3867277460574047, + "learning_rate": 1.1966171978607984e-05, + "loss": 0.6433, + "mean_token_accuracy": 0.8028404474258423, + "step": 3235 + }, + { + "epoch": 0.7566557683325549, + "grad_norm": 0.39363675240792806, + "learning_rate": 1.190342026060817e-05, + "loss": 0.6411, + "mean_token_accuracy": 0.8030229091644288, + "step": 3240 + }, + { + "epoch": 0.757823446987389, + "grad_norm": 0.38930846612423925, + "learning_rate": 1.1840901201813114e-05, + "loss": 0.6679, + "mean_token_accuracy": 0.7969488501548767, + "step": 3245 + }, + { + "epoch": 0.7589911256422233, + "grad_norm": 0.3710714882449297, + "learning_rate": 1.1778615734839764e-05, + "loss": 0.631, + "mean_token_accuracy": 0.806585431098938, + "step": 3250 + }, + { + "epoch": 0.7601588042970574, + "grad_norm": 0.3832102538168167, + "learning_rate": 1.1716564788820492e-05, + "loss": 0.6385, + "mean_token_accuracy": 0.8041316270828247, + "step": 3255 + }, + { + "epoch": 0.7613264829518916, + "grad_norm": 0.3872972070773182, + "learning_rate": 1.1654749289389243e-05, + "loss": 0.6477, + "mean_token_accuracy": 0.8018147468566894, + "step": 3260 + }, + { + "epoch": 0.7624941616067258, + "grad_norm": 0.37700846567526525, + "learning_rate": 1.1593170158667754e-05, + "loss": 0.645, + "mean_token_accuracy": 0.802414882183075, + "step": 3265 + }, + { + "epoch": 0.76366184026156, + "grad_norm": 0.3639232524686639, + "learning_rate": 1.1531828315251753e-05, + "loss": 0.6472, + "mean_token_accuracy": 0.8010790586471558, + "step": 3270 + }, + { + "epoch": 0.7648295189163942, + "grad_norm": 0.3735030345315405, + "learning_rate": 1.1470724674197274e-05, + "loss": 0.6443, + "mean_token_accuracy": 0.8015389323234559, + "step": 3275 + }, + { + "epoch": 0.7659971975712284, + "grad_norm": 0.39691294367220126, + "learning_rate": 1.1409860147007021e-05, + "loss": 0.6395, + "mean_token_accuracy": 0.8040521144866943, + "step": 3280 + }, + { + "epoch": 0.7671648762260626, + "grad_norm": 0.36836054780777194, + "learning_rate": 1.1349235641616755e-05, + "loss": 0.6322, + "mean_token_accuracy": 0.8058985590934753, + "step": 3285 + }, + { + "epoch": 0.7683325548808968, + "grad_norm": 0.37537381453379987, + "learning_rate": 1.128885206238175e-05, + "loss": 0.6558, + "mean_token_accuracy": 0.7990662813186645, + "step": 3290 + }, + { + "epoch": 0.7695002335357309, + "grad_norm": 0.3757994875727749, + "learning_rate": 1.1228710310063317e-05, + "loss": 0.6315, + "mean_token_accuracy": 0.8068357229232788, + "step": 3295 + }, + { + "epoch": 0.7706679121905652, + "grad_norm": 0.38085875647517503, + "learning_rate": 1.116881128181535e-05, + "loss": 0.6441, + "mean_token_accuracy": 0.8024171948432922, + "step": 3300 + }, + { + "epoch": 0.7718355908453993, + "grad_norm": 0.3757840113169571, + "learning_rate": 1.1109155871170949e-05, + "loss": 0.6471, + "mean_token_accuracy": 0.8019705295562745, + "step": 3305 + }, + { + "epoch": 0.7730032695002336, + "grad_norm": 0.3790920839791116, + "learning_rate": 1.1049744968029097e-05, + "loss": 0.6396, + "mean_token_accuracy": 0.8037992238998413, + "step": 3310 + }, + { + "epoch": 0.7741709481550677, + "grad_norm": 0.3970928268871951, + "learning_rate": 1.0990579458641372e-05, + "loss": 0.6448, + "mean_token_accuracy": 0.8028359889984131, + "step": 3315 + }, + { + "epoch": 0.775338626809902, + "grad_norm": 0.36327974752116304, + "learning_rate": 1.0931660225598736e-05, + "loss": 0.6373, + "mean_token_accuracy": 0.804655647277832, + "step": 3320 + }, + { + "epoch": 0.7765063054647361, + "grad_norm": 0.3759518240279743, + "learning_rate": 1.0872988147818384e-05, + "loss": 0.6484, + "mean_token_accuracy": 0.8016178965568542, + "step": 3325 + }, + { + "epoch": 0.7776739841195703, + "grad_norm": 0.4141729619166308, + "learning_rate": 1.0814564100530595e-05, + "loss": 0.6622, + "mean_token_accuracy": 0.7982792854309082, + "step": 3330 + }, + { + "epoch": 0.7788416627744045, + "grad_norm": 0.3753700414446427, + "learning_rate": 1.0756388955265704e-05, + "loss": 0.6388, + "mean_token_accuracy": 0.8043179392814637, + "step": 3335 + }, + { + "epoch": 0.7800093414292387, + "grad_norm": 0.39343671409059494, + "learning_rate": 1.0698463579841098e-05, + "loss": 0.6341, + "mean_token_accuracy": 0.8044108986854553, + "step": 3340 + }, + { + "epoch": 0.7811770200840729, + "grad_norm": 0.386770571343313, + "learning_rate": 1.0640788838348261e-05, + "loss": 0.6487, + "mean_token_accuracy": 0.8013948798179626, + "step": 3345 + }, + { + "epoch": 0.7823446987389071, + "grad_norm": 0.36690191491533564, + "learning_rate": 1.0583365591139891e-05, + "loss": 0.6511, + "mean_token_accuracy": 0.8005888104438782, + "step": 3350 + }, + { + "epoch": 0.7835123773937412, + "grad_norm": 0.37294410315201554, + "learning_rate": 1.0526194694817074e-05, + "loss": 0.6545, + "mean_token_accuracy": 0.7997806549072266, + "step": 3355 + }, + { + "epoch": 0.7846800560485754, + "grad_norm": 0.36545686487921497, + "learning_rate": 1.0469277002216487e-05, + "loss": 0.6381, + "mean_token_accuracy": 0.8042653918266296, + "step": 3360 + }, + { + "epoch": 0.7858477347034096, + "grad_norm": 0.37362176800963576, + "learning_rate": 1.0412613362397686e-05, + "loss": 0.6475, + "mean_token_accuracy": 0.8019968271255493, + "step": 3365 + }, + { + "epoch": 0.7870154133582438, + "grad_norm": 0.36435345585483275, + "learning_rate": 1.0356204620630441e-05, + "loss": 0.6313, + "mean_token_accuracy": 0.8065681219100952, + "step": 3370 + }, + { + "epoch": 0.788183092013078, + "grad_norm": 0.380770219797811, + "learning_rate": 1.0300051618382132e-05, + "loss": 0.634, + "mean_token_accuracy": 0.8051761150360107, + "step": 3375 + }, + { + "epoch": 0.7893507706679121, + "grad_norm": 0.37536515641948165, + "learning_rate": 1.0244155193305176e-05, + "loss": 0.6331, + "mean_token_accuracy": 0.8054030537605286, + "step": 3380 + }, + { + "epoch": 0.7905184493227464, + "grad_norm": 0.3729316306337742, + "learning_rate": 1.0188516179224561e-05, + "loss": 0.6358, + "mean_token_accuracy": 0.8045217156410217, + "step": 3385 + }, + { + "epoch": 0.7916861279775805, + "grad_norm": 0.3650068190492238, + "learning_rate": 1.013313540612538e-05, + "loss": 0.6503, + "mean_token_accuracy": 0.8011725425720215, + "step": 3390 + }, + { + "epoch": 0.7928538066324148, + "grad_norm": 0.386596564725545, + "learning_rate": 1.0078013700140471e-05, + "loss": 0.6381, + "mean_token_accuracy": 0.8042178750038147, + "step": 3395 + }, + { + "epoch": 0.7940214852872489, + "grad_norm": 0.3579931576799502, + "learning_rate": 1.002315188353808e-05, + "loss": 0.6296, + "mean_token_accuracy": 0.8062708616256714, + "step": 3400 + }, + { + "epoch": 0.7951891639420832, + "grad_norm": 0.38219625329638357, + "learning_rate": 9.968550774709601e-06, + "loss": 0.637, + "mean_token_accuracy": 0.8040671586990357, + "step": 3405 + }, + { + "epoch": 0.7963568425969173, + "grad_norm": 0.38924147726730784, + "learning_rate": 9.914211188157359e-06, + "loss": 0.6593, + "mean_token_accuracy": 0.7979804873466492, + "step": 3410 + }, + { + "epoch": 0.7975245212517516, + "grad_norm": 0.36447930847495597, + "learning_rate": 9.860133934482485e-06, + "loss": 0.6468, + "mean_token_accuracy": 0.802125895023346, + "step": 3415 + }, + { + "epoch": 0.7986921999065857, + "grad_norm": 0.3890641112501792, + "learning_rate": 9.806319820372787e-06, + "loss": 0.6511, + "mean_token_accuracy": 0.8002747178077698, + "step": 3420 + }, + { + "epoch": 0.7998598785614199, + "grad_norm": 0.37357026309425223, + "learning_rate": 9.752769648590742e-06, + "loss": 0.6208, + "mean_token_accuracy": 0.808570384979248, + "step": 3425 + }, + { + "epoch": 0.8010275572162541, + "grad_norm": 0.3863922920951192, + "learning_rate": 9.699484217961519e-06, + "loss": 0.6507, + "mean_token_accuracy": 0.8002237915992737, + "step": 3430 + }, + { + "epoch": 0.8021952358710883, + "grad_norm": 0.37929675638526267, + "learning_rate": 9.64646432336105e-06, + "loss": 0.6609, + "mean_token_accuracy": 0.7986759424209595, + "step": 3435 + }, + { + "epoch": 0.8033629145259225, + "grad_norm": 0.3694243788571857, + "learning_rate": 9.593710755704181e-06, + "loss": 0.6467, + "mean_token_accuracy": 0.8015845060348511, + "step": 3440 + }, + { + "epoch": 0.8045305931807567, + "grad_norm": 0.3765015175531483, + "learning_rate": 9.541224301932885e-06, + "loss": 0.6452, + "mean_token_accuracy": 0.8022978782653809, + "step": 3445 + }, + { + "epoch": 0.8056982718355908, + "grad_norm": 0.38757633441236145, + "learning_rate": 9.489005745004504e-06, + "loss": 0.6418, + "mean_token_accuracy": 0.8029710173606872, + "step": 3450 + }, + { + "epoch": 0.8068659504904251, + "grad_norm": 0.3646449178736856, + "learning_rate": 9.437055863880063e-06, + "loss": 0.6486, + "mean_token_accuracy": 0.8004884123802185, + "step": 3455 + }, + { + "epoch": 0.8080336291452592, + "grad_norm": 0.3764930850054609, + "learning_rate": 9.385375433512685e-06, + "loss": 0.6375, + "mean_token_accuracy": 0.8039047360420227, + "step": 3460 + }, + { + "epoch": 0.8092013078000934, + "grad_norm": 0.3757955547628867, + "learning_rate": 9.333965224835997e-06, + "loss": 0.648, + "mean_token_accuracy": 0.8016306638717652, + "step": 3465 + }, + { + "epoch": 0.8103689864549276, + "grad_norm": 0.39001728508159117, + "learning_rate": 9.282826004752642e-06, + "loss": 0.6482, + "mean_token_accuracy": 0.8011831641197205, + "step": 3470 + }, + { + "epoch": 0.8115366651097617, + "grad_norm": 0.3828537633405285, + "learning_rate": 9.231958536122847e-06, + "loss": 0.6557, + "mean_token_accuracy": 0.7992441654205322, + "step": 3475 + }, + { + "epoch": 0.812704343764596, + "grad_norm": 0.36331299973655, + "learning_rate": 9.18136357775303e-06, + "loss": 0.6451, + "mean_token_accuracy": 0.802211606502533, + "step": 3480 + }, + { + "epoch": 0.8138720224194301, + "grad_norm": 0.3719016603774465, + "learning_rate": 9.131041884384481e-06, + "loss": 0.6374, + "mean_token_accuracy": 0.8047228932380677, + "step": 3485 + }, + { + "epoch": 0.8150397010742644, + "grad_norm": 0.3662701059034752, + "learning_rate": 9.080994206682111e-06, + "loss": 0.6341, + "mean_token_accuracy": 0.8053487658500671, + "step": 3490 + }, + { + "epoch": 0.8162073797290985, + "grad_norm": 0.3721470006106764, + "learning_rate": 9.03122129122326e-06, + "loss": 0.6513, + "mean_token_accuracy": 0.8008878350257873, + "step": 3495 + }, + { + "epoch": 0.8173750583839328, + "grad_norm": 0.37253995769908066, + "learning_rate": 8.981723880486534e-06, + "loss": 0.6492, + "mean_token_accuracy": 0.8009424090385437, + "step": 3500 + }, + { + "epoch": 0.8185427370387669, + "grad_norm": 0.3860225348281834, + "learning_rate": 8.932502712840763e-06, + "loss": 0.6551, + "mean_token_accuracy": 0.7989884138107299, + "step": 3505 + }, + { + "epoch": 0.8197104156936011, + "grad_norm": 0.36696522185179786, + "learning_rate": 8.883558522533959e-06, + "loss": 0.6308, + "mean_token_accuracy": 0.806327474117279, + "step": 3510 + }, + { + "epoch": 0.8208780943484353, + "grad_norm": 0.3908441797079317, + "learning_rate": 8.834892039682378e-06, + "loss": 0.6327, + "mean_token_accuracy": 0.8054356098175048, + "step": 3515 + }, + { + "epoch": 0.8220457730032695, + "grad_norm": 0.3814337990128635, + "learning_rate": 8.786503990259623e-06, + "loss": 0.6451, + "mean_token_accuracy": 0.8018156170845032, + "step": 3520 + }, + { + "epoch": 0.8232134516581037, + "grad_norm": 0.37904435543536347, + "learning_rate": 8.738395096085816e-06, + "loss": 0.6465, + "mean_token_accuracy": 0.8014731526374816, + "step": 3525 + }, + { + "epoch": 0.8243811303129379, + "grad_norm": 0.3856401991646529, + "learning_rate": 8.690566074816828e-06, + "loss": 0.6438, + "mean_token_accuracy": 0.8027620553970337, + "step": 3530 + }, + { + "epoch": 0.825548808967772, + "grad_norm": 0.36635278676079086, + "learning_rate": 8.643017639933586e-06, + "loss": 0.629, + "mean_token_accuracy": 0.8067147254943847, + "step": 3535 + }, + { + "epoch": 0.8267164876226063, + "grad_norm": 0.37543669599458507, + "learning_rate": 8.59575050073141e-06, + "loss": 0.6394, + "mean_token_accuracy": 0.8037443876266479, + "step": 3540 + }, + { + "epoch": 0.8278841662774404, + "grad_norm": 0.36726959314986707, + "learning_rate": 8.54876536230944e-06, + "loss": 0.6427, + "mean_token_accuracy": 0.8025928378105164, + "step": 3545 + }, + { + "epoch": 0.8290518449322747, + "grad_norm": 0.38377795881288285, + "learning_rate": 8.502062925560127e-06, + "loss": 0.6372, + "mean_token_accuracy": 0.8046881675720214, + "step": 3550 + }, + { + "epoch": 0.8302195235871088, + "grad_norm": 0.3778962876964921, + "learning_rate": 8.455643887158756e-06, + "loss": 0.6422, + "mean_token_accuracy": 0.8028294682502747, + "step": 3555 + }, + { + "epoch": 0.8313872022419431, + "grad_norm": 0.374997830332416, + "learning_rate": 8.409508939553092e-06, + "loss": 0.64, + "mean_token_accuracy": 0.8032994627952575, + "step": 3560 + }, + { + "epoch": 0.8325548808967772, + "grad_norm": 0.37040850306398904, + "learning_rate": 8.36365877095301e-06, + "loss": 0.6415, + "mean_token_accuracy": 0.8030556082725525, + "step": 3565 + }, + { + "epoch": 0.8337225595516113, + "grad_norm": 0.379176099809407, + "learning_rate": 8.318094065320238e-06, + "loss": 0.6309, + "mean_token_accuracy": 0.8062988758087158, + "step": 3570 + }, + { + "epoch": 0.8348902382064456, + "grad_norm": 0.37815806377281297, + "learning_rate": 8.272815502358174e-06, + "loss": 0.6472, + "mean_token_accuracy": 0.8012060403823853, + "step": 3575 + }, + { + "epoch": 0.8360579168612797, + "grad_norm": 0.36652782356930486, + "learning_rate": 8.227823757501731e-06, + "loss": 0.6448, + "mean_token_accuracy": 0.8026718378067017, + "step": 3580 + }, + { + "epoch": 0.837225595516114, + "grad_norm": 0.3727210450315503, + "learning_rate": 8.183119501907264e-06, + "loss": 0.6446, + "mean_token_accuracy": 0.8021503329277039, + "step": 3585 + }, + { + "epoch": 0.8383932741709481, + "grad_norm": 0.36230794178325504, + "learning_rate": 8.138703402442545e-06, + "loss": 0.6437, + "mean_token_accuracy": 0.8018892645835877, + "step": 3590 + }, + { + "epoch": 0.8395609528257824, + "grad_norm": 0.3654173951506457, + "learning_rate": 8.094576121676852e-06, + "loss": 0.6302, + "mean_token_accuracy": 0.8056976795196533, + "step": 3595 + }, + { + "epoch": 0.8407286314806165, + "grad_norm": 0.36979812335964923, + "learning_rate": 8.050738317871041e-06, + "loss": 0.6208, + "mean_token_accuracy": 0.8089974641799926, + "step": 3600 + }, + { + "epoch": 0.8418963101354507, + "grad_norm": 0.3616280327443599, + "learning_rate": 8.007190644967759e-06, + "loss": 0.6322, + "mean_token_accuracy": 0.806039834022522, + "step": 3605 + }, + { + "epoch": 0.8430639887902849, + "grad_norm": 0.3765619379695166, + "learning_rate": 7.96393375258167e-06, + "loss": 0.6447, + "mean_token_accuracy": 0.8023491621017456, + "step": 3610 + }, + { + "epoch": 0.8442316674451191, + "grad_norm": 0.36226539024495746, + "learning_rate": 7.920968285989772e-06, + "loss": 0.6369, + "mean_token_accuracy": 0.8048917770385742, + "step": 3615 + }, + { + "epoch": 0.8453993460999533, + "grad_norm": 0.36002243334098544, + "learning_rate": 7.878294886121779e-06, + "loss": 0.6257, + "mean_token_accuracy": 0.80823233127594, + "step": 3620 + }, + { + "epoch": 0.8465670247547875, + "grad_norm": 0.37820924959930546, + "learning_rate": 7.835914189550541e-06, + "loss": 0.6404, + "mean_token_accuracy": 0.8032718658447265, + "step": 3625 + }, + { + "epoch": 0.8477347034096216, + "grad_norm": 0.38099613168773, + "learning_rate": 7.793826828482564e-06, + "loss": 0.6342, + "mean_token_accuracy": 0.805348539352417, + "step": 3630 + }, + { + "epoch": 0.8489023820644559, + "grad_norm": 0.3633287961608042, + "learning_rate": 7.75203343074857e-06, + "loss": 0.6383, + "mean_token_accuracy": 0.8038220524787902, + "step": 3635 + }, + { + "epoch": 0.85007006071929, + "grad_norm": 0.3661456121907081, + "learning_rate": 7.710534619794138e-06, + "loss": 0.6372, + "mean_token_accuracy": 0.8043089628219604, + "step": 3640 + }, + { + "epoch": 0.8512377393741243, + "grad_norm": 0.37298233390075336, + "learning_rate": 7.669331014670402e-06, + "loss": 0.6386, + "mean_token_accuracy": 0.8037281632423401, + "step": 3645 + }, + { + "epoch": 0.8524054180289584, + "grad_norm": 0.3729700605001267, + "learning_rate": 7.628423230024814e-06, + "loss": 0.6532, + "mean_token_accuracy": 0.7994333863258362, + "step": 3650 + }, + { + "epoch": 0.8535730966837927, + "grad_norm": 0.37271200666206983, + "learning_rate": 7.58781187609198e-06, + "loss": 0.6437, + "mean_token_accuracy": 0.8032780528068543, + "step": 3655 + }, + { + "epoch": 0.8547407753386268, + "grad_norm": 0.37538881969649895, + "learning_rate": 7.547497558684547e-06, + "loss": 0.6285, + "mean_token_accuracy": 0.8070550918579101, + "step": 3660 + }, + { + "epoch": 0.855908453993461, + "grad_norm": 0.37368000368806253, + "learning_rate": 7.507480879184176e-06, + "loss": 0.6414, + "mean_token_accuracy": 0.8029873251914978, + "step": 3665 + }, + { + "epoch": 0.8570761326482952, + "grad_norm": 0.3712739189392918, + "learning_rate": 7.467762434532566e-06, + "loss": 0.6495, + "mean_token_accuracy": 0.8007203578948975, + "step": 3670 + }, + { + "epoch": 0.8582438113031294, + "grad_norm": 0.38241388832620266, + "learning_rate": 7.4283428172225514e-06, + "loss": 0.6451, + "mean_token_accuracy": 0.80256108045578, + "step": 3675 + }, + { + "epoch": 0.8594114899579636, + "grad_norm": 0.38052713242424113, + "learning_rate": 7.389222615289269e-06, + "loss": 0.6479, + "mean_token_accuracy": 0.8012696385383606, + "step": 3680 + }, + { + "epoch": 0.8605791686127977, + "grad_norm": 0.3748188089246763, + "learning_rate": 7.350402412301374e-06, + "loss": 0.6334, + "mean_token_accuracy": 0.804817545413971, + "step": 3685 + }, + { + "epoch": 0.861746847267632, + "grad_norm": 0.3703002801836799, + "learning_rate": 7.311882787352342e-06, + "loss": 0.6385, + "mean_token_accuracy": 0.8044106364250183, + "step": 3690 + }, + { + "epoch": 0.8629145259224661, + "grad_norm": 0.3762623313652402, + "learning_rate": 7.273664315051821e-06, + "loss": 0.6377, + "mean_token_accuracy": 0.8041117310523986, + "step": 3695 + }, + { + "epoch": 0.8640822045773003, + "grad_norm": 0.36695128892108986, + "learning_rate": 7.2357475655170786e-06, + "loss": 0.6451, + "mean_token_accuracy": 0.8018723487854004, + "step": 3700 + }, + { + "epoch": 0.8652498832321345, + "grad_norm": 0.36497750177188926, + "learning_rate": 7.1981331043644795e-06, + "loss": 0.6389, + "mean_token_accuracy": 0.8037200570106506, + "step": 3705 + }, + { + "epoch": 0.8664175618869687, + "grad_norm": 0.3800828688931662, + "learning_rate": 7.160821492701069e-06, + "loss": 0.6353, + "mean_token_accuracy": 0.8043835043907166, + "step": 3710 + }, + { + "epoch": 0.8675852405418029, + "grad_norm": 0.3755628946279249, + "learning_rate": 7.123813287116172e-06, + "loss": 0.6554, + "mean_token_accuracy": 0.7997564673423767, + "step": 3715 + }, + { + "epoch": 0.8687529191966371, + "grad_norm": 0.38382883340596985, + "learning_rate": 7.087109039673117e-06, + "loss": 0.6493, + "mean_token_accuracy": 0.800423264503479, + "step": 3720 + }, + { + "epoch": 0.8699205978514712, + "grad_norm": 0.36484802539142697, + "learning_rate": 7.050709297900987e-06, + "loss": 0.6413, + "mean_token_accuracy": 0.8034875631332398, + "step": 3725 + }, + { + "epoch": 0.8710882765063055, + "grad_norm": 0.3706847592912565, + "learning_rate": 7.014614604786457e-06, + "loss": 0.6175, + "mean_token_accuracy": 0.8101207256317139, + "step": 3730 + }, + { + "epoch": 0.8722559551611396, + "grad_norm": 0.36988510840921385, + "learning_rate": 6.978825498765688e-06, + "loss": 0.6422, + "mean_token_accuracy": 0.8027829647064209, + "step": 3735 + }, + { + "epoch": 0.8734236338159739, + "grad_norm": 0.36999010845579794, + "learning_rate": 6.943342513716314e-06, + "loss": 0.6314, + "mean_token_accuracy": 0.8067967772483826, + "step": 3740 + }, + { + "epoch": 0.874591312470808, + "grad_norm": 0.36613982079028595, + "learning_rate": 6.908166178949452e-06, + "loss": 0.6424, + "mean_token_accuracy": 0.8028635621070862, + "step": 3745 + }, + { + "epoch": 0.8757589911256423, + "grad_norm": 0.3831455554557966, + "learning_rate": 6.873297019201818e-06, + "loss": 0.6377, + "mean_token_accuracy": 0.8038128256797791, + "step": 3750 + }, + { + "epoch": 0.8769266697804764, + "grad_norm": 0.3642254943490607, + "learning_rate": 6.838735554627903e-06, + "loss": 0.6455, + "mean_token_accuracy": 0.8011249899864197, + "step": 3755 + }, + { + "epoch": 0.8780943484353106, + "grad_norm": 0.38607218352693734, + "learning_rate": 6.8044823007922075e-06, + "loss": 0.6379, + "mean_token_accuracy": 0.803896176815033, + "step": 3760 + }, + { + "epoch": 0.8792620270901448, + "grad_norm": 0.3727422134819163, + "learning_rate": 6.770537768661553e-06, + "loss": 0.6481, + "mean_token_accuracy": 0.8011768341064454, + "step": 3765 + }, + { + "epoch": 0.880429705744979, + "grad_norm": 0.38906424020220626, + "learning_rate": 6.736902464597465e-06, + "loss": 0.6449, + "mean_token_accuracy": 0.8019062638282776, + "step": 3770 + }, + { + "epoch": 0.8815973843998132, + "grad_norm": 0.3701772880498804, + "learning_rate": 6.703576890348613e-06, + "loss": 0.6426, + "mean_token_accuracy": 0.8026488184928894, + "step": 3775 + }, + { + "epoch": 0.8827650630546474, + "grad_norm": 0.3603832805681772, + "learning_rate": 6.670561543043317e-06, + "loss": 0.6297, + "mean_token_accuracy": 0.805867052078247, + "step": 3780 + }, + { + "epoch": 0.8839327417094816, + "grad_norm": 0.37600099575905993, + "learning_rate": 6.637856915182155e-06, + "loss": 0.6367, + "mean_token_accuracy": 0.8045099973678589, + "step": 3785 + }, + { + "epoch": 0.8851004203643157, + "grad_norm": 0.3896885216460711, + "learning_rate": 6.605463494630597e-06, + "loss": 0.6497, + "mean_token_accuracy": 0.8006672382354736, + "step": 3790 + }, + { + "epoch": 0.8862680990191499, + "grad_norm": 0.37799934975789856, + "learning_rate": 6.573381764611733e-06, + "loss": 0.6375, + "mean_token_accuracy": 0.8044989466667175, + "step": 3795 + }, + { + "epoch": 0.8874357776739841, + "grad_norm": 0.357643357794127, + "learning_rate": 6.541612203699062e-06, + "loss": 0.6352, + "mean_token_accuracy": 0.8045275568962097, + "step": 3800 + }, + { + "epoch": 0.8886034563288183, + "grad_norm": 0.3571555166076395, + "learning_rate": 6.510155285809365e-06, + "loss": 0.6398, + "mean_token_accuracy": 0.8032607913017273, + "step": 3805 + }, + { + "epoch": 0.8897711349836525, + "grad_norm": 0.3763731935969846, + "learning_rate": 6.479011480195617e-06, + "loss": 0.6458, + "mean_token_accuracy": 0.802224051952362, + "step": 3810 + }, + { + "epoch": 0.8909388136384867, + "grad_norm": 0.387162775935002, + "learning_rate": 6.448181251439998e-06, + "loss": 0.645, + "mean_token_accuracy": 0.8017127037048339, + "step": 3815 + }, + { + "epoch": 0.8921064922933208, + "grad_norm": 0.36079349840916214, + "learning_rate": 6.41766505944697e-06, + "loss": 0.6351, + "mean_token_accuracy": 0.8051084995269775, + "step": 3820 + }, + { + "epoch": 0.8932741709481551, + "grad_norm": 0.36999272866184874, + "learning_rate": 6.387463359436392e-06, + "loss": 0.6336, + "mean_token_accuracy": 0.8055039405822754, + "step": 3825 + }, + { + "epoch": 0.8944418496029892, + "grad_norm": 0.3792944199181755, + "learning_rate": 6.35757660193676e-06, + "loss": 0.643, + "mean_token_accuracy": 0.802158772945404, + "step": 3830 + }, + { + "epoch": 0.8956095282578235, + "grad_norm": 0.3779820325881207, + "learning_rate": 6.328005232778463e-06, + "loss": 0.6487, + "mean_token_accuracy": 0.8011029839515686, + "step": 3835 + }, + { + "epoch": 0.8967772069126576, + "grad_norm": 0.373700416598451, + "learning_rate": 6.29874969308714e-06, + "loss": 0.6319, + "mean_token_accuracy": 0.8058335781097412, + "step": 3840 + }, + { + "epoch": 0.8979448855674919, + "grad_norm": 0.3748376814674246, + "learning_rate": 6.269810419277105e-06, + "loss": 0.6302, + "mean_token_accuracy": 0.80627361536026, + "step": 3845 + }, + { + "epoch": 0.899112564222326, + "grad_norm": 0.36850082218082947, + "learning_rate": 6.241187843044823e-06, + "loss": 0.6311, + "mean_token_accuracy": 0.8062416076660156, + "step": 3850 + }, + { + "epoch": 0.9002802428771602, + "grad_norm": 0.3744558197926556, + "learning_rate": 6.212882391362487e-06, + "loss": 0.6278, + "mean_token_accuracy": 0.8066682457923889, + "step": 3855 + }, + { + "epoch": 0.9014479215319944, + "grad_norm": 0.3832620103046848, + "learning_rate": 6.184894486471646e-06, + "loss": 0.6423, + "mean_token_accuracy": 0.8025842308998108, + "step": 3860 + }, + { + "epoch": 0.9026156001868286, + "grad_norm": 0.3730276977470794, + "learning_rate": 6.157224545876886e-06, + "loss": 0.6337, + "mean_token_accuracy": 0.8054579973220826, + "step": 3865 + }, + { + "epoch": 0.9037832788416628, + "grad_norm": 0.3799224257735744, + "learning_rate": 6.12987298233963e-06, + "loss": 0.6349, + "mean_token_accuracy": 0.8040842294692994, + "step": 3870 + }, + { + "epoch": 0.904950957496497, + "grad_norm": 0.36834211069241374, + "learning_rate": 6.10284020387196e-06, + "loss": 0.6435, + "mean_token_accuracy": 0.8027387380599975, + "step": 3875 + }, + { + "epoch": 0.9061186361513311, + "grad_norm": 0.3664509650764104, + "learning_rate": 6.076126613730543e-06, + "loss": 0.6445, + "mean_token_accuracy": 0.8023294568061828, + "step": 3880 + }, + { + "epoch": 0.9072863148061654, + "grad_norm": 0.36433416660199286, + "learning_rate": 6.04973261041061e-06, + "loss": 0.6418, + "mean_token_accuracy": 0.8024672150611878, + "step": 3885 + }, + { + "epoch": 0.9084539934609995, + "grad_norm": 0.37731084286507455, + "learning_rate": 6.023658587640011e-06, + "loss": 0.6423, + "mean_token_accuracy": 0.80344078540802, + "step": 3890 + }, + { + "epoch": 0.9096216721158337, + "grad_norm": 0.3768059918618886, + "learning_rate": 5.9979049343733434e-06, + "loss": 0.6297, + "mean_token_accuracy": 0.8058271408081055, + "step": 3895 + }, + { + "epoch": 0.9107893507706679, + "grad_norm": 0.3935168702915699, + "learning_rate": 5.9724720347861524e-06, + "loss": 0.6416, + "mean_token_accuracy": 0.80316321849823, + "step": 3900 + }, + { + "epoch": 0.911957029425502, + "grad_norm": 0.3655960012225911, + "learning_rate": 5.947360268269192e-06, + "loss": 0.6293, + "mean_token_accuracy": 0.8059329390525818, + "step": 3905 + }, + { + "epoch": 0.9131247080803363, + "grad_norm": 0.37523802617878566, + "learning_rate": 5.922570009422778e-06, + "loss": 0.6313, + "mean_token_accuracy": 0.806255555152893, + "step": 3910 + }, + { + "epoch": 0.9142923867351704, + "grad_norm": 0.36286046959444784, + "learning_rate": 5.89810162805118e-06, + "loss": 0.6406, + "mean_token_accuracy": 0.8033267617225647, + "step": 3915 + }, + { + "epoch": 0.9154600653900047, + "grad_norm": 0.3671343687737744, + "learning_rate": 5.873955489157137e-06, + "loss": 0.6446, + "mean_token_accuracy": 0.8018193006515503, + "step": 3920 + }, + { + "epoch": 0.9166277440448388, + "grad_norm": 0.3768650046504404, + "learning_rate": 5.850131952936376e-06, + "loss": 0.639, + "mean_token_accuracy": 0.8045102715492248, + "step": 3925 + }, + { + "epoch": 0.9177954226996731, + "grad_norm": 0.3630738871862932, + "learning_rate": 5.82663137477226e-06, + "loss": 0.6353, + "mean_token_accuracy": 0.8060416936874389, + "step": 3930 + }, + { + "epoch": 0.9189631013545072, + "grad_norm": 0.37611151869305925, + "learning_rate": 5.8034541052304915e-06, + "loss": 0.6446, + "mean_token_accuracy": 0.8017747282981873, + "step": 3935 + }, + { + "epoch": 0.9201307800093415, + "grad_norm": 0.3732824843743703, + "learning_rate": 5.780600490053861e-06, + "loss": 0.6475, + "mean_token_accuracy": 0.8012337446212768, + "step": 3940 + }, + { + "epoch": 0.9212984586641756, + "grad_norm": 0.368138461494472, + "learning_rate": 5.758070870157108e-06, + "loss": 0.6269, + "mean_token_accuracy": 0.8073325395584107, + "step": 3945 + }, + { + "epoch": 0.9224661373190098, + "grad_norm": 0.3673991903717714, + "learning_rate": 5.735865581621838e-06, + "loss": 0.6343, + "mean_token_accuracy": 0.8053783416748047, + "step": 3950 + }, + { + "epoch": 0.923633815973844, + "grad_norm": 0.3754305955828592, + "learning_rate": 5.71398495569149e-06, + "loss": 0.6328, + "mean_token_accuracy": 0.8050259113311767, + "step": 3955 + }, + { + "epoch": 0.9248014946286782, + "grad_norm": 0.3790096697891478, + "learning_rate": 5.6924293187664096e-06, + "loss": 0.6363, + "mean_token_accuracy": 0.8042787313461304, + "step": 3960 + }, + { + "epoch": 0.9259691732835124, + "grad_norm": 0.3687822558965693, + "learning_rate": 5.671198992398981e-06, + "loss": 0.6212, + "mean_token_accuracy": 0.8093876004219055, + "step": 3965 + }, + { + "epoch": 0.9271368519383466, + "grad_norm": 0.367977435187402, + "learning_rate": 5.650294293288818e-06, + "loss": 0.6248, + "mean_token_accuracy": 0.8074079036712647, + "step": 3970 + }, + { + "epoch": 0.9283045305931807, + "grad_norm": 0.36144579417766975, + "learning_rate": 5.62971553327805e-06, + "loss": 0.6311, + "mean_token_accuracy": 0.8058712005615234, + "step": 3975 + }, + { + "epoch": 0.929472209248015, + "grad_norm": 0.3727004717637723, + "learning_rate": 5.609463019346677e-06, + "loss": 0.6381, + "mean_token_accuracy": 0.804334819316864, + "step": 3980 + }, + { + "epoch": 0.9306398879028491, + "grad_norm": 0.38410456565566015, + "learning_rate": 5.589537053607972e-06, + "loss": 0.6414, + "mean_token_accuracy": 0.8031257033348084, + "step": 3985 + }, + { + "epoch": 0.9318075665576834, + "grad_norm": 0.37444484206064776, + "learning_rate": 5.569937933303985e-06, + "loss": 0.6315, + "mean_token_accuracy": 0.805377972126007, + "step": 3990 + }, + { + "epoch": 0.9329752452125175, + "grad_norm": 0.373584563127484, + "learning_rate": 5.5506659508011056e-06, + "loss": 0.6343, + "mean_token_accuracy": 0.8054758191108704, + "step": 3995 + }, + { + "epoch": 0.9341429238673518, + "grad_norm": 0.3803936358729911, + "learning_rate": 5.53172139358571e-06, + "loss": 0.636, + "mean_token_accuracy": 0.803295373916626, + "step": 4000 + }, + { + "epoch": 0.9353106025221859, + "grad_norm": 0.3797025505944808, + "learning_rate": 5.513104544259858e-06, + "loss": 0.6234, + "mean_token_accuracy": 0.8076358079910279, + "step": 4005 + }, + { + "epoch": 0.93647828117702, + "grad_norm": 0.3723825122401832, + "learning_rate": 5.494815680537088e-06, + "loss": 0.6259, + "mean_token_accuracy": 0.8067279100418091, + "step": 4010 + }, + { + "epoch": 0.9376459598318543, + "grad_norm": 0.3853707955125403, + "learning_rate": 5.476855075238277e-06, + "loss": 0.6301, + "mean_token_accuracy": 0.8056137323379516, + "step": 4015 + }, + { + "epoch": 0.9388136384866884, + "grad_norm": 0.37557308139550377, + "learning_rate": 5.459222996287555e-06, + "loss": 0.6364, + "mean_token_accuracy": 0.8041159510612488, + "step": 4020 + }, + { + "epoch": 0.9399813171415227, + "grad_norm": 0.3724739717041964, + "learning_rate": 5.44191970670833e-06, + "loss": 0.6519, + "mean_token_accuracy": 0.8010015487670898, + "step": 4025 + }, + { + "epoch": 0.9411489957963568, + "grad_norm": 0.3984054406510199, + "learning_rate": 5.424945464619341e-06, + "loss": 0.6467, + "mean_token_accuracy": 0.8018677711486817, + "step": 4030 + }, + { + "epoch": 0.942316674451191, + "grad_norm": 0.36558226451632236, + "learning_rate": 5.408300523230831e-06, + "loss": 0.6277, + "mean_token_accuracy": 0.806693160533905, + "step": 4035 + }, + { + "epoch": 0.9434843531060252, + "grad_norm": 0.3633878007892705, + "learning_rate": 5.391985130840752e-06, + "loss": 0.6373, + "mean_token_accuracy": 0.8046307444572449, + "step": 4040 + }, + { + "epoch": 0.9446520317608594, + "grad_norm": 0.36982879542298946, + "learning_rate": 5.375999530831069e-06, + "loss": 0.6389, + "mean_token_accuracy": 0.8034152865409852, + "step": 4045 + }, + { + "epoch": 0.9458197104156936, + "grad_norm": 0.3773307947830433, + "learning_rate": 5.3603439616641245e-06, + "loss": 0.624, + "mean_token_accuracy": 0.8086912035942078, + "step": 4050 + }, + { + "epoch": 0.9469873890705278, + "grad_norm": 0.377172907901949, + "learning_rate": 5.345018656879092e-06, + "loss": 0.6473, + "mean_token_accuracy": 0.8014029502868653, + "step": 4055 + }, + { + "epoch": 0.948155067725362, + "grad_norm": 0.379361655583902, + "learning_rate": 5.330023845088476e-06, + "loss": 0.6439, + "mean_token_accuracy": 0.8024410009384155, + "step": 4060 + }, + { + "epoch": 0.9493227463801962, + "grad_norm": 0.36892135295834716, + "learning_rate": 5.3153597499747185e-06, + "loss": 0.6339, + "mean_token_accuracy": 0.805402421951294, + "step": 4065 + }, + { + "epoch": 0.9504904250350303, + "grad_norm": 0.3694357527532295, + "learning_rate": 5.301026590286849e-06, + "loss": 0.6411, + "mean_token_accuracy": 0.802853775024414, + "step": 4070 + }, + { + "epoch": 0.9516581036898646, + "grad_norm": 0.37797464313705204, + "learning_rate": 5.2870245798372275e-06, + "loss": 0.6271, + "mean_token_accuracy": 0.8065261006355285, + "step": 4075 + }, + { + "epoch": 0.9528257823446987, + "grad_norm": 0.37255215818995174, + "learning_rate": 5.2733539274983565e-06, + "loss": 0.6503, + "mean_token_accuracy": 0.7999952077865601, + "step": 4080 + }, + { + "epoch": 0.953993460999533, + "grad_norm": 0.3874452304360724, + "learning_rate": 5.260014837199762e-06, + "loss": 0.6568, + "mean_token_accuracy": 0.798421573638916, + "step": 4085 + }, + { + "epoch": 0.9551611396543671, + "grad_norm": 0.37102670644027963, + "learning_rate": 5.247007507924949e-06, + "loss": 0.6313, + "mean_token_accuracy": 0.8060812592506409, + "step": 4090 + }, + { + "epoch": 0.9563288183092014, + "grad_norm": 0.37041599041148326, + "learning_rate": 5.234332133708441e-06, + "loss": 0.6405, + "mean_token_accuracy": 0.8028023600578308, + "step": 4095 + }, + { + "epoch": 0.9574964969640355, + "grad_norm": 0.37098887273239156, + "learning_rate": 5.221988903632876e-06, + "loss": 0.6281, + "mean_token_accuracy": 0.8064718604087829, + "step": 4100 + }, + { + "epoch": 0.9586641756188697, + "grad_norm": 0.37399941932512254, + "learning_rate": 5.209978001826198e-06, + "loss": 0.6419, + "mean_token_accuracy": 0.8026984453201294, + "step": 4105 + }, + { + "epoch": 0.9598318542737039, + "grad_norm": 0.37668193577405523, + "learning_rate": 5.198299607458896e-06, + "loss": 0.638, + "mean_token_accuracy": 0.8038488507270813, + "step": 4110 + }, + { + "epoch": 0.960999532928538, + "grad_norm": 0.38244863804166446, + "learning_rate": 5.1869538947413405e-06, + "loss": 0.639, + "mean_token_accuracy": 0.8033149838447571, + "step": 4115 + }, + { + "epoch": 0.9621672115833723, + "grad_norm": 0.3748972711675515, + "learning_rate": 5.175941032921182e-06, + "loss": 0.6376, + "mean_token_accuracy": 0.8039469122886658, + "step": 4120 + }, + { + "epoch": 0.9633348902382064, + "grad_norm": 0.3672781370142466, + "learning_rate": 5.165261186280833e-06, + "loss": 0.6388, + "mean_token_accuracy": 0.8036592364311218, + "step": 4125 + }, + { + "epoch": 0.9645025688930406, + "grad_norm": 0.3629863194210429, + "learning_rate": 5.154914514135008e-06, + "loss": 0.6252, + "mean_token_accuracy": 0.807152795791626, + "step": 4130 + }, + { + "epoch": 0.9656702475478748, + "grad_norm": 0.3699498874378647, + "learning_rate": 5.144901170828339e-06, + "loss": 0.6389, + "mean_token_accuracy": 0.8035337686538696, + "step": 4135 + }, + { + "epoch": 0.966837926202709, + "grad_norm": 0.3627680979998877, + "learning_rate": 5.1352213057331046e-06, + "loss": 0.6538, + "mean_token_accuracy": 0.7997589230537414, + "step": 4140 + }, + { + "epoch": 0.9680056048575432, + "grad_norm": 0.3763629882180278, + "learning_rate": 5.125875063246966e-06, + "loss": 0.6376, + "mean_token_accuracy": 0.8044492959976196, + "step": 4145 + }, + { + "epoch": 0.9691732835123774, + "grad_norm": 0.3668934010447104, + "learning_rate": 5.116862582790834e-06, + "loss": 0.6452, + "mean_token_accuracy": 0.8018334984779358, + "step": 4150 + }, + { + "epoch": 0.9703409621672116, + "grad_norm": 0.3569341969508895, + "learning_rate": 5.108183998806782e-06, + "loss": 0.6434, + "mean_token_accuracy": 0.8027917623519898, + "step": 4155 + }, + { + "epoch": 0.9715086408220458, + "grad_norm": 0.3695260165649441, + "learning_rate": 5.099839440756046e-06, + "loss": 0.6364, + "mean_token_accuracy": 0.8042210698127746, + "step": 4160 + }, + { + "epoch": 0.9726763194768799, + "grad_norm": 0.39208598492768826, + "learning_rate": 5.091829033117085e-06, + "loss": 0.6389, + "mean_token_accuracy": 0.8037813901901245, + "step": 4165 + }, + { + "epoch": 0.9738439981317142, + "grad_norm": 0.37092295719950835, + "learning_rate": 5.0841528953837325e-06, + "loss": 0.628, + "mean_token_accuracy": 0.8063337922096252, + "step": 4170 + }, + { + "epoch": 0.9750116767865483, + "grad_norm": 0.3861513870585751, + "learning_rate": 5.0768111420634066e-06, + "loss": 0.6353, + "mean_token_accuracy": 0.8039231657981872, + "step": 4175 + }, + { + "epoch": 0.9761793554413826, + "grad_norm": 0.3571086827868627, + "learning_rate": 5.069803882675404e-06, + "loss": 0.6347, + "mean_token_accuracy": 0.8044708371162415, + "step": 4180 + }, + { + "epoch": 0.9773470340962167, + "grad_norm": 0.3633237706562508, + "learning_rate": 5.0631312217492755e-06, + "loss": 0.6497, + "mean_token_accuracy": 0.8002571225166321, + "step": 4185 + }, + { + "epoch": 0.978514712751051, + "grad_norm": 0.3853903183864727, + "learning_rate": 5.056793258823254e-06, + "loss": 0.6422, + "mean_token_accuracy": 0.8029995203018189, + "step": 4190 + }, + { + "epoch": 0.9796823914058851, + "grad_norm": 0.37827820395600986, + "learning_rate": 5.050790088442769e-06, + "loss": 0.6306, + "mean_token_accuracy": 0.8053547859191894, + "step": 4195 + }, + { + "epoch": 0.9808500700607193, + "grad_norm": 0.36587793311732325, + "learning_rate": 5.0451218001590496e-06, + "loss": 0.6444, + "mean_token_accuracy": 0.8017143130302429, + "step": 4200 + }, + { + "epoch": 0.9820177487155535, + "grad_norm": 0.3783411005539477, + "learning_rate": 5.03978847852778e-06, + "loss": 0.6326, + "mean_token_accuracy": 0.8051274061203003, + "step": 4205 + }, + { + "epoch": 0.9831854273703877, + "grad_norm": 0.38627606052749275, + "learning_rate": 5.034790203107836e-06, + "loss": 0.6233, + "mean_token_accuracy": 0.8073434591293335, + "step": 4210 + }, + { + "epoch": 0.9843531060252219, + "grad_norm": 0.3679951875937361, + "learning_rate": 5.030127048460104e-06, + "loss": 0.6192, + "mean_token_accuracy": 0.8097437143325805, + "step": 4215 + }, + { + "epoch": 0.985520784680056, + "grad_norm": 0.3660053470368048, + "learning_rate": 5.025799084146362e-06, + "loss": 0.6411, + "mean_token_accuracy": 0.803262984752655, + "step": 4220 + }, + { + "epoch": 0.9866884633348902, + "grad_norm": 0.3658378287783763, + "learning_rate": 5.021806374728249e-06, + "loss": 0.6187, + "mean_token_accuracy": 0.8093530297279358, + "step": 4225 + }, + { + "epoch": 0.9878561419897244, + "grad_norm": 0.37332888668380787, + "learning_rate": 5.0181489797662965e-06, + "loss": 0.629, + "mean_token_accuracy": 0.8064879417419434, + "step": 4230 + }, + { + "epoch": 0.9890238206445586, + "grad_norm": 0.3702887396928239, + "learning_rate": 5.014826953819044e-06, + "loss": 0.6318, + "mean_token_accuracy": 0.8056162118911743, + "step": 4235 + }, + { + "epoch": 0.9901914992993928, + "grad_norm": 0.3681459402409494, + "learning_rate": 5.011840346442225e-06, + "loss": 0.6385, + "mean_token_accuracy": 0.8034667015075684, + "step": 4240 + }, + { + "epoch": 0.991359177954227, + "grad_norm": 0.3763288327057415, + "learning_rate": 5.009189202188022e-06, + "loss": 0.6428, + "mean_token_accuracy": 0.8022206544876098, + "step": 4245 + }, + { + "epoch": 0.9925268566090611, + "grad_norm": 0.3740354834329566, + "learning_rate": 5.006873560604408e-06, + "loss": 0.6336, + "mean_token_accuracy": 0.8054550647735595, + "step": 4250 + }, + { + "epoch": 0.9936945352638954, + "grad_norm": 0.3745952894525004, + "learning_rate": 5.004893456234555e-06, + "loss": 0.6347, + "mean_token_accuracy": 0.8045870423316955, + "step": 4255 + }, + { + "epoch": 0.9948622139187295, + "grad_norm": 0.3673579155169193, + "learning_rate": 5.003248918616317e-06, + "loss": 0.626, + "mean_token_accuracy": 0.8075182795524597, + "step": 4260 + }, + { + "epoch": 0.9960298925735638, + "grad_norm": 0.3714627800004543, + "learning_rate": 5.001939972281793e-06, + "loss": 0.6387, + "mean_token_accuracy": 0.8039301633834839, + "step": 4265 + }, + { + "epoch": 0.9971975712283979, + "grad_norm": 0.36308174848714936, + "learning_rate": 5.000966636756956e-06, + "loss": 0.6493, + "mean_token_accuracy": 0.8000084400177002, + "step": 4270 + }, + { + "epoch": 0.9983652498832322, + "grad_norm": 0.3647134837051614, + "learning_rate": 5.000328926561364e-06, + "loss": 0.6312, + "mean_token_accuracy": 0.8058404445648193, + "step": 4275 + }, + { + "epoch": 0.9995329285380663, + "grad_norm": 0.3694922596818811, + "learning_rate": 5.00002685120795e-06, + "loss": 0.6358, + "mean_token_accuracy": 0.8045821070671082, + "step": 4280 + }, + { + "epoch": 1.0, + "mean_token_accuracy": 0.8059653639793396, + "step": 4282, + "total_flos": 487937544290304.0, + "train_loss": 0.6862401177759541, + "train_runtime": 4946.1614, + "train_samples_per_second": 110.812, + "train_steps_per_second": 0.866 + } + ], + "logging_steps": 5, + "max_steps": 4282, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 487937544290304.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}