diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,2305 +1,6981 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.9998149861239592, + "epoch": 2.9996357012750456, "eval_steps": 100, - "global_step": 1351, + "global_step": 4116, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0037002775208140612, - "grad_norm": 2.8957433700561523, - "learning_rate": 6.5522123978507875e-06, - "loss": 1.2055, - "mean_token_accuracy": 0.6906560870025514, + "epoch": 0.0036429872495446266, + "grad_norm": 1.083729863166809, + "learning_rate": 5.346061023357346e-06, + "loss": 1.5553, + "mean_token_accuracy": 0.6280868343917929, "step": 5 }, { - "epoch": 0.0074005550416281225, - "grad_norm": 1.8598837852478027, - "learning_rate": 9.37409668112298e-06, - "loss": 1.1409, - "mean_token_accuracy": 0.698853009887776, + "epoch": 0.007285974499089253, + "grad_norm": 0.8888028264045715, + "learning_rate": 7.648484184147212e-06, + "loss": 1.5402, + "mean_token_accuracy": 0.632089643380557, "step": 10 }, { - "epoch": 0.011100832562442183, - "grad_norm": 1.537341594696045, - "learning_rate": 1.1024793168211608e-05, - "loss": 0.9966, - "mean_token_accuracy": 0.7270418325735069, + "epoch": 0.01092896174863388, + "grad_norm": 0.607347309589386, + "learning_rate": 8.995315394001158e-06, + "loss": 1.5004, + "mean_token_accuracy": 0.638629702002931, "step": 15 }, { - "epoch": 0.014801110083256245, - "grad_norm": 1.109471082687378, - "learning_rate": 1.219598096439517e-05, - "loss": 0.9761, - "mean_token_accuracy": 0.7272552272192254, + "epoch": 0.014571948998178506, + "grad_norm": 0.5297051668167114, + "learning_rate": 9.950907344937074e-06, + "loss": 1.4841, + "mean_token_accuracy": 0.6408677332681976, "step": 20 }, { - "epoch": 0.018501387604070305, - "grad_norm": 1.0324221849441528, - "learning_rate": 1.3104424795701575e-05, - "loss": 0.9551, - "mean_token_accuracy": 0.7313410652722199, + "epoch": 0.018214936247723135, + "grad_norm": 0.5317749381065369, + "learning_rate": 1.0692122046714693e-05, + "loss": 1.4407, + "mean_token_accuracy": 0.6413135075720566, "step": 25 }, { - "epoch": 0.022201665124884366, - "grad_norm": 0.8868595957756042, - "learning_rate": 1.3846677451483799e-05, - "loss": 0.9201, - "mean_token_accuracy": 0.7379118125275668, + "epoch": 0.02185792349726776, + "grad_norm": 0.4751906096935272, + "learning_rate": 1.1297738554791023e-05, + "loss": 1.4098, + "mean_token_accuracy": 0.6488728414197817, "step": 30 }, { - "epoch": 0.025901942645698426, - "grad_norm": 0.9547779560089111, - "learning_rate": 1.4474243129971968e-05, - "loss": 0.9167, - "mean_token_accuracy": 0.7379963784805907, + "epoch": 0.025500910746812388, + "grad_norm": 0.38953375816345215, + "learning_rate": 1.1809780016460197e-05, + "loss": 1.406, + "mean_token_accuracy": 0.647099413776258, "step": 35 }, { - "epoch": 0.02960222016651249, - "grad_norm": 1.0785847902297974, - "learning_rate": 1.501786524766736e-05, - "loss": 0.8836, - "mean_token_accuracy": 0.746341194692062, + "epoch": 0.029143897996357013, + "grad_norm": 0.39693206548690796, + "learning_rate": 1.2253330505726937e-05, + "loss": 1.3257, + "mean_token_accuracy": 0.6625793844650707, "step": 40 }, { - "epoch": 0.03330249768732655, - "grad_norm": 0.8535160422325134, - "learning_rate": 1.5497373938572426e-05, - "loss": 0.91, - "mean_token_accuracy": 0.7387234984522606, + "epoch": 0.03278688524590164, + "grad_norm": 0.3601134717464447, + "learning_rate": 1.2644569764644967e-05, + "loss": 1.3285, + "mean_token_accuracy": 0.661306179775281, "step": 45 }, { - "epoch": 0.03700277520814061, - "grad_norm": 0.9231573939323425, - "learning_rate": 1.592630907897377e-05, - "loss": 0.8773, - "mean_token_accuracy": 0.74560689911493, + "epoch": 0.03642987249544627, + "grad_norm": 0.36966654658317566, + "learning_rate": 1.2994545207504556e-05, + "loss": 1.2978, + "mean_token_accuracy": 0.6672416951636541, "step": 50 }, { - "epoch": 0.04070305272895467, - "grad_norm": 0.9026180505752563, - "learning_rate": 1.6314328111538255e-05, - "loss": 0.8634, - "mean_token_accuracy": 0.7512907472411612, + "epoch": 0.04007285974499089, + "grad_norm": 0.35046863555908203, + "learning_rate": 1.3311136505276628e-05, + "loss": 1.3147, + "mean_token_accuracy": 0.6625061064973132, "step": 55 }, { - "epoch": 0.04440333024976873, - "grad_norm": 1.0450937747955322, - "learning_rate": 1.6668561734755987e-05, - "loss": 0.8587, - "mean_token_accuracy": 0.7512712527442093, + "epoch": 0.04371584699453552, + "grad_norm": 0.32947227358818054, + "learning_rate": 1.3600161715580883e-05, + "loss": 1.2817, + "mean_token_accuracy": 0.6690461651196873, "step": 60 }, { - "epoch": 0.04810360777058279, - "grad_norm": 1.0268136262893677, - "learning_rate": 1.6994425079669313e-05, - "loss": 0.8858, - "mean_token_accuracy": 0.7436999012547456, + "epoch": 0.04735883424408015, + "grad_norm": 0.321172833442688, + "learning_rate": 1.3866039135512111e-05, + "loss": 1.262, + "mean_token_accuracy": 0.6715070835368833, "step": 65 }, { - "epoch": 0.05180388529139685, - "grad_norm": 1.0223065614700317, - "learning_rate": 1.7296127413244162e-05, - "loss": 0.8555, - "mean_token_accuracy": 0.7516197988052247, + "epoch": 0.051001821493624776, + "grad_norm": 0.35791364312171936, + "learning_rate": 1.411220317725006e-05, + "loss": 1.2613, + "mean_token_accuracy": 0.6724963361016121, "step": 70 }, { - "epoch": 0.05550416281221091, - "grad_norm": 0.9143498539924622, - "learning_rate": 1.7577005566062396e-05, - "loss": 0.914, - "mean_token_accuracy": 0.7367069906630946, + "epoch": 0.0546448087431694, + "grad_norm": 0.33518293499946594, + "learning_rate": 1.4341376417358504e-05, + "loss": 1.2837, + "mean_token_accuracy": 0.6674829018075231, "step": 75 }, { - "epoch": 0.05920444033302498, - "grad_norm": 0.9483162760734558, - "learning_rate": 1.783974953093955e-05, - "loss": 0.872, - "mean_token_accuracy": 0.7465734032042433, + "epoch": 0.058287795992714025, + "grad_norm": 0.3203108608722687, + "learning_rate": 1.45557536665168e-05, + "loss": 1.2284, + "mean_token_accuracy": 0.6794638495359062, "step": 80 }, { - "epoch": 0.06290471785383904, - "grad_norm": 1.0431548357009888, - "learning_rate": 1.808655954803422e-05, - "loss": 0.8958, - "mean_token_accuracy": 0.739948661671668, + "epoch": 0.061930783242258654, + "grad_norm": 0.3269725441932678, + "learning_rate": 1.4757130137920071e-05, + "loss": 1.2487, + "mean_token_accuracy": 0.6726276257938447, "step": 85 }, { - "epoch": 0.0666049953746531, - "grad_norm": 0.9597674012184143, - "learning_rate": 1.8319258221844618e-05, - "loss": 0.8504, - "mean_token_accuracy": 0.7529073109309405, + "epoch": 0.06557377049180328, + "grad_norm": 0.3066905736923218, + "learning_rate": 1.4946992925434831e-05, + "loss": 1.2204, + "mean_token_accuracy": 0.680077552515877, "step": 90 }, { - "epoch": 0.07030527289546716, - "grad_norm": 0.8986235857009888, - "learning_rate": 1.8539372284516762e-05, - "loss": 0.8689, - "mean_token_accuracy": 0.7483927127084614, + "epoch": 0.0692167577413479, + "grad_norm": 0.3220832943916321, + "learning_rate": 1.5126587715666353e-05, + "loss": 1.2368, + "mean_token_accuracy": 0.6759861993160723, "step": 95 }, { - "epoch": 0.07400555041628122, - "grad_norm": 1.116733431816101, - "learning_rate": 1.874819336224596e-05, - "loss": 0.8829, - "mean_token_accuracy": 0.7434675110209781, + "epoch": 0.07285974499089254, + "grad_norm": 0.3288683593273163, + "learning_rate": 1.5296968368294423e-05, + "loss": 1.2077, + "mean_token_accuracy": 0.6822423058133853, "step": 100 }, { - "epoch": 0.07400555041628122, - "eval_loss": 0.8832501769065857, - "eval_mean_token_accuracy": 0.7398939720176742, - "eval_runtime": 28.2477, - "eval_samples_per_second": 9.098, - "eval_steps_per_second": 1.168, + "epoch": 0.07285974499089254, + "eval_loss": 1.236223816871643, + "eval_mean_token_accuracy": 0.6733261218644397, + "eval_runtime": 14.5336, + "eval_samples_per_second": 18.027, + "eval_steps_per_second": 1.17, "step": 100 }, { - "epoch": 0.07770582793709528, - "grad_norm": 1.0423495769500732, - "learning_rate": 1.8946823900332786e-05, - "loss": 0.9029, - "mean_token_accuracy": 0.7380944747487156, + "epoch": 0.07650273224043716, + "grad_norm": 0.3233710527420044, + "learning_rate": 1.5459034387104006e-05, + "loss": 1.2171, + "mean_token_accuracy": 0.6799829018075232, "step": 105 }, { - "epoch": 0.08140610545790934, - "grad_norm": 0.8770654797554016, - "learning_rate": 1.9136212394810443e-05, - "loss": 0.8512, - "mean_token_accuracy": 0.752658273084104, + "epoch": 0.08014571948998178, + "grad_norm": 0.3219452500343323, + "learning_rate": 1.5613559666066493e-05, + "loss": 1.2284, + "mean_token_accuracy": 0.6770426233512457, "step": 110 }, { - "epoch": 0.0851063829787234, - "grad_norm": 0.9110291600227356, - "learning_rate": 1.9317180786056083e-05, - "loss": 0.8404, - "mean_token_accuracy": 0.7541489019069838, + "epoch": 0.08378870673952642, + "grad_norm": 0.3516216278076172, + "learning_rate": 1.5761214840250913e-05, + "loss": 1.2033, + "mean_token_accuracy": 0.680730947728383, "step": 115 }, { - "epoch": 0.08880666049953746, - "grad_norm": 1.0504673719406128, - "learning_rate": 1.949044601802818e-05, - "loss": 0.7908, - "mean_token_accuracy": 0.7690538308961964, + "epoch": 0.08743169398907104, + "grad_norm": 0.3209221661090851, + "learning_rate": 1.590258487637075e-05, + "loss": 1.2173, + "mean_token_accuracy": 0.6805741462560163, "step": 120 }, { - "epoch": 0.09250693802035152, - "grad_norm": 0.9688966870307922, - "learning_rate": 1.9656637193552367e-05, - "loss": 0.8283, - "mean_token_accuracy": 0.7583009069462628, + "epoch": 0.09107468123861566, + "grad_norm": 0.3234521746635437, + "learning_rate": 1.6038183070072043e-05, + "loss": 1.2223, + "mean_token_accuracy": 0.677242785901655, "step": 125 }, { - "epoch": 0.09620721554116558, - "grad_norm": 1.0260767936706543, - "learning_rate": 1.9816309362941505e-05, - "loss": 0.8764, - "mean_token_accuracy": 0.744777112966561, + "epoch": 0.0947176684881603, + "grad_norm": 0.3622635006904602, + "learning_rate": 1.6168462296301973e-05, + "loss": 1.202, + "mean_token_accuracy": 0.6806454567659991, "step": 130 }, { - "epoch": 0.09990749306197964, - "grad_norm": 0.9919949769973755, - "learning_rate": 1.996995470893325e-05, - "loss": 0.8696, - "mean_token_accuracy": 0.7461172694007374, + "epoch": 0.09836065573770492, + "grad_norm": 0.300037145614624, + "learning_rate": 1.6293824135288782e-05, + "loss": 1.1958, + "mean_token_accuracy": 0.6822697850512947, "step": 135 }, { - "epoch": 0.1036077705827937, - "grad_norm": 1.040398120880127, - "learning_rate": 2e-05, - "loss": 0.8734, - "mean_token_accuracy": 0.7447351537110054, + "epoch": 0.10200364298724955, + "grad_norm": 0.3131561875343323, + "learning_rate": 1.6414626338039924e-05, + "loss": 1.1895, + "mean_token_accuracy": 0.6848986321446018, "step": 140 }, { - "epoch": 0.10730804810360776, - "grad_norm": 0.8929896354675293, - "learning_rate": 2e-05, - "loss": 0.8464, - "mean_token_accuracy": 0.7520211094548359, + "epoch": 0.10564663023679417, + "grad_norm": 0.3196031451225281, + "learning_rate": 1.6531188981216057e-05, + "loss": 1.1976, + "mean_token_accuracy": 0.6813721299462628, "step": 145 }, { - "epoch": 0.11100832562442182, - "grad_norm": 1.0192770957946777, - "learning_rate": 2e-05, - "loss": 0.8769, - "mean_token_accuracy": 0.744968839209957, + "epoch": 0.1092896174863388, + "grad_norm": 0.3315783143043518, + "learning_rate": 1.6643799578148368e-05, + "loss": 1.1576, + "mean_token_accuracy": 0.692782120175867, "step": 150 }, { - "epoch": 0.1147086031452359, - "grad_norm": 1.0098509788513184, - "learning_rate": 2e-05, - "loss": 0.8323, - "mean_token_accuracy": 0.755749875133377, + "epoch": 0.11293260473588343, + "grad_norm": 0.3324851989746094, + "learning_rate": 1.6752717351491778e-05, + "loss": 1.2008, + "mean_token_accuracy": 0.6811614557889596, "step": 155 }, { - "epoch": 0.11840888066604996, - "grad_norm": 1.056443691253662, - "learning_rate": 2e-05, - "loss": 0.8581, - "mean_token_accuracy": 0.7490507941249067, + "epoch": 0.11657559198542805, + "grad_norm": 0.3203977048397064, + "learning_rate": 1.6858176827306663e-05, + "loss": 1.1705, + "mean_token_accuracy": 0.6861290913531997, "step": 160 }, { - "epoch": 0.12210915818686402, - "grad_norm": 0.876708447933197, - "learning_rate": 2e-05, - "loss": 0.8488, - "mean_token_accuracy": 0.7515548475709357, + "epoch": 0.12021857923497267, + "grad_norm": 0.31534770131111145, + "learning_rate": 1.6960390875920437e-05, + "loss": 1.1766, + "mean_token_accuracy": 0.6852009037616023, "step": 165 }, { - "epoch": 0.12580943570767808, - "grad_norm": 0.9257837533950806, - "learning_rate": 2e-05, - "loss": 0.858, - "mean_token_accuracy": 0.7489278577453964, + "epoch": 0.12386156648451731, + "grad_norm": 0.3208962380886078, + "learning_rate": 1.7059553298709933e-05, + "loss": 1.1762, + "mean_token_accuracy": 0.689121275036639, "step": 170 }, { - "epoch": 0.12950971322849214, - "grad_norm": 1.0198274850845337, - "learning_rate": 2e-05, - "loss": 0.8361, - "mean_token_accuracy": 0.7551833607296417, + "epoch": 0.12750455373406194, + "grad_norm": 0.3643946945667267, + "learning_rate": 1.7155841039817543e-05, + "loss": 1.1845, + "mean_token_accuracy": 0.6833231558378114, "step": 175 }, { - "epoch": 0.1332099907493062, - "grad_norm": 1.0134333372116089, - "learning_rate": 2e-05, - "loss": 0.8212, - "mean_token_accuracy": 0.7584818009818037, + "epoch": 0.13114754098360656, + "grad_norm": 0.32823869585990906, + "learning_rate": 1.7249416086224696e-05, + "loss": 1.196, + "mean_token_accuracy": 0.6801325109916952, "step": 180 }, { - "epoch": 0.13691026827012026, - "grad_norm": 1.0174723863601685, - "learning_rate": 2e-05, - "loss": 0.8628, - "mean_token_accuracy": 0.7476798968375172, + "epoch": 0.13479052823315119, + "grad_norm": 0.33277738094329834, + "learning_rate": 1.7340427107436143e-05, + "loss": 1.1585, + "mean_token_accuracy": 0.6900555691255496, "step": 185 }, { - "epoch": 0.14061054579093432, - "grad_norm": 0.9869109988212585, - "learning_rate": 2e-05, - "loss": 0.8443, - "mean_token_accuracy": 0.7523148595161331, + "epoch": 0.1384335154826958, + "grad_norm": 0.33548158407211304, + "learning_rate": 1.7429010876456215e-05, + "loss": 1.1943, + "mean_token_accuracy": 0.6806485100146555, "step": 190 }, { - "epoch": 0.14431082331174838, - "grad_norm": 0.9373570084571838, - "learning_rate": 2e-05, - "loss": 0.8582, - "mean_token_accuracy": 0.7481680463659037, + "epoch": 0.14207650273224043, + "grad_norm": 0.35972943902015686, + "learning_rate": 1.7515293506155924e-05, + "loss": 1.1669, + "mean_token_accuracy": 0.6856069858329262, "step": 195 }, { - "epoch": 0.14801110083256244, - "grad_norm": 0.9523370862007141, - "learning_rate": 2e-05, - "loss": 0.839, - "mean_token_accuracy": 0.7536938574339362, + "epoch": 0.14571948998178508, + "grad_norm": 0.32267147302627563, + "learning_rate": 1.7599391529084285e-05, + "loss": 1.1692, + "mean_token_accuracy": 0.6881839276990719, "step": 200 }, { - "epoch": 0.14801110083256244, - "eval_loss": 0.8592866659164429, - "eval_mean_token_accuracy": 0.7450125681976898, - "eval_runtime": 28.1828, - "eval_samples_per_second": 9.119, - "eval_steps_per_second": 1.171, + "epoch": 0.14571948998178508, + "eval_loss": 1.1753935813903809, + "eval_mean_token_accuracy": 0.6837428585889604, + "eval_runtime": 14.4124, + "eval_samples_per_second": 18.179, + "eval_steps_per_second": 1.18, "step": 200 }, { - "epoch": 0.1517113783533765, - "grad_norm": 1.0168991088867188, - "learning_rate": 2e-05, - "loss": 0.8189, - "mean_token_accuracy": 0.7584566375595763, + "epoch": 0.1493624772313297, + "grad_norm": 0.3298497200012207, + "learning_rate": 1.7681412843926186e-05, + "loss": 1.1636, + "mean_token_accuracy": 0.6878847093307279, "step": 205 }, { - "epoch": 0.15541165587419056, - "grad_norm": 0.9532285928726196, - "learning_rate": 2e-05, - "loss": 0.8488, - "mean_token_accuracy": 0.7494834024795103, + "epoch": 0.15300546448087432, + "grad_norm": 0.30288785696029663, + "learning_rate": 1.776145754789387e-05, + "loss": 1.1562, + "mean_token_accuracy": 0.6889502931118711, "step": 210 }, { - "epoch": 0.15911193339500462, - "grad_norm": 0.8827746510505676, - "learning_rate": 2e-05, - "loss": 0.8187, - "mean_token_accuracy": 0.7595932905014217, + "epoch": 0.15664845173041894, + "grad_norm": 0.40150612592697144, + "learning_rate": 1.7839618671161183e-05, + "loss": 1.1843, + "mean_token_accuracy": 0.6828957010258916, "step": 215 }, { - "epoch": 0.16281221091581868, - "grad_norm": 0.8878557682037354, - "learning_rate": 2e-05, - "loss": 0.8276, - "mean_token_accuracy": 0.7566971009274142, + "epoch": 0.16029143897996356, + "grad_norm": 0.3242761194705963, + "learning_rate": 1.7915982826856355e-05, + "loss": 1.1429, + "mean_token_accuracy": 0.692953102100635, "step": 220 }, { - "epoch": 0.16651248843663274, - "grad_norm": 0.8196055293083191, - "learning_rate": 2e-05, - "loss": 0.8143, - "mean_token_accuracy": 0.7594526820177664, + "epoch": 0.16393442622950818, + "grad_norm": 0.3417595326900482, + "learning_rate": 1.7990630788002315e-05, + "loss": 1.1259, + "mean_token_accuracy": 0.6991542501221301, "step": 225 }, { - "epoch": 0.1702127659574468, - "grad_norm": 0.8733744025230408, - "learning_rate": 2e-05, - "loss": 0.8433, - "mean_token_accuracy": 0.7518748700823068, + "epoch": 0.16757741347905283, + "grad_norm": 0.3361223638057709, + "learning_rate": 1.8063638001040778e-05, + "loss": 1.1684, + "mean_token_accuracy": 0.685579506595017, "step": 230 }, { - "epoch": 0.17391304347826086, - "grad_norm": 0.851927638053894, - "learning_rate": 2e-05, - "loss": 0.8615, - "mean_token_accuracy": 0.7447105825612464, + "epoch": 0.17122040072859745, + "grad_norm": 0.33286741375923157, + "learning_rate": 1.813507504412511e-05, + "loss": 1.1698, + "mean_token_accuracy": 0.6841627992183683, "step": 235 }, { - "epoch": 0.17761332099907493, - "grad_norm": 0.9009714126586914, - "learning_rate": 2e-05, - "loss": 0.8498, - "mean_token_accuracy": 0.7484718849440476, + "epoch": 0.17486338797814208, + "grad_norm": 0.34265580773353577, + "learning_rate": 1.820500803716061e-05, + "loss": 1.1747, + "mean_token_accuracy": 0.6862054225696141, "step": 240 }, { - "epoch": 0.18131359851988899, - "grad_norm": 0.9394606351852417, - "learning_rate": 2e-05, - "loss": 0.8294, - "mean_token_accuracy": 0.7557432874020297, + "epoch": 0.1785063752276867, + "grad_norm": 0.34530794620513916, + "learning_rate": 1.8273499009563047e-05, + "loss": 1.1496, + "mean_token_accuracy": 0.6905929408891059, "step": 245 }, { - "epoch": 0.18501387604070305, - "grad_norm": 0.9246826171875, - "learning_rate": 2e-05, - "loss": 0.8004, - "mean_token_accuracy": 0.7644653627399571, + "epoch": 0.18214936247723132, + "grad_norm": 0.378530889749527, + "learning_rate": 1.8340606230861904e-05, + "loss": 1.1569, + "mean_token_accuracy": 0.6879213483146068, "step": 250 }, { - "epoch": 0.1887141535615171, - "grad_norm": 0.8407145142555237, - "learning_rate": 2e-05, - "loss": 0.8377, - "mean_token_accuracy": 0.7536744187275825, + "epoch": 0.18579234972677597, + "grad_norm": 0.3267212212085724, + "learning_rate": 1.840638450856388e-05, + "loss": 1.1486, + "mean_token_accuracy": 0.6902121564708253, "step": 255 }, { - "epoch": 0.19241443108233117, - "grad_norm": 0.8852797746658325, - "learning_rate": 2e-05, - "loss": 0.8338, - "mean_token_accuracy": 0.7559093371950457, + "epoch": 0.1894353369763206, + "grad_norm": 0.31661802530288696, + "learning_rate": 1.847088545709184e-05, + "loss": 1.1427, + "mean_token_accuracy": 0.6917562286272594, "step": 260 }, { - "epoch": 0.19611470860314523, - "grad_norm": 1.0910946130752563, - "learning_rate": 2e-05, - "loss": 0.8022, - "mean_token_accuracy": 0.7614075804389667, + "epoch": 0.1930783242258652, + "grad_norm": 0.30997058749198914, + "learning_rate": 1.853415774110566e-05, + "loss": 1.1549, + "mean_token_accuracy": 0.6902479237909136, "step": 265 }, { - "epoch": 0.1998149861239593, - "grad_norm": 0.9022583961486816, - "learning_rate": 2e-05, - "loss": 0.8226, - "mean_token_accuracy": 0.7575589018517326, + "epoch": 0.19672131147540983, + "grad_norm": 0.30749374628067017, + "learning_rate": 1.8596247296078644e-05, + "loss": 1.133, + "mean_token_accuracy": 0.6957437713727406, "step": 270 }, { - "epoch": 0.20351526364477335, - "grad_norm": 0.8302850723266602, - "learning_rate": 2e-05, - "loss": 0.8237, - "mean_token_accuracy": 0.757602458948465, + "epoch": 0.20036429872495445, + "grad_norm": 0.300138384103775, + "learning_rate": 1.8657197528633974e-05, + "loss": 1.1249, + "mean_token_accuracy": 0.6944736199316071, "step": 275 }, { - "epoch": 0.2072155411655874, - "grad_norm": 0.9338579773902893, - "learning_rate": 2e-05, - "loss": 0.8081, - "mean_token_accuracy": 0.7613954278187921, + "epoch": 0.2040072859744991, + "grad_norm": 0.3103027939796448, + "learning_rate": 1.8717049498829786e-05, + "loss": 1.1429, + "mean_token_accuracy": 0.6907700293111871, "step": 280 }, { - "epoch": 0.21091581868640147, - "grad_norm": 0.9413399696350098, - "learning_rate": 2e-05, - "loss": 0.8522, - "mean_token_accuracy": 0.7486690000063428, + "epoch": 0.20765027322404372, + "grad_norm": 0.3321939706802368, + "learning_rate": 1.8775842086310163e-05, + "loss": 1.1427, + "mean_token_accuracy": 0.6926813629702003, "step": 285 }, { - "epoch": 0.21461609620721553, - "grad_norm": 0.850651741027832, - "learning_rate": 2e-05, - "loss": 0.8085, - "mean_token_accuracy": 0.7597455023324446, + "epoch": 0.21129326047358835, + "grad_norm": 0.3095480501651764, + "learning_rate": 1.8833612142005922e-05, + "loss": 1.1425, + "mean_token_accuracy": 0.6899548119198827, "step": 290 }, { - "epoch": 0.2183163737280296, - "grad_norm": 0.8325985074043274, - "learning_rate": 2e-05, - "loss": 0.8015, - "mean_token_accuracy": 0.7625035134700381, + "epoch": 0.21493624772313297, + "grad_norm": 0.2930939197540283, + "learning_rate": 1.889039462686756e-05, + "loss": 1.125, + "mean_token_accuracy": 0.6964429653150953, "step": 295 }, { - "epoch": 0.22201665124884365, - "grad_norm": 0.8833345770835876, - "learning_rate": 2e-05, - "loss": 0.8442, - "mean_token_accuracy": 0.7495757818615388, + "epoch": 0.2185792349726776, + "grad_norm": 0.3320595324039459, + "learning_rate": 1.8946222738938233e-05, + "loss": 1.1666, + "mean_token_accuracy": 0.6868649242794332, "step": 300 }, { - "epoch": 0.22201665124884365, - "eval_loss": 0.8446890115737915, - "eval_mean_token_accuracy": 0.7479223023166885, - "eval_runtime": 28.2235, - "eval_samples_per_second": 9.106, - "eval_steps_per_second": 1.169, + "epoch": 0.2185792349726776, + "eval_loss": 1.1416388750076294, + "eval_mean_token_accuracy": 0.6895034030538906, + "eval_runtime": 14.4924, + "eval_samples_per_second": 18.078, + "eval_steps_per_second": 1.173, "step": 300 }, { - "epoch": 0.22571692876965774, - "grad_norm": 1.0072492361068726, - "learning_rate": 2e-05, - "loss": 0.8126, - "mean_token_accuracy": 0.7591269186975097, + "epoch": 0.2222222222222222, + "grad_norm": 0.31145739555358887, + "learning_rate": 1.9001128029923346e-05, + "loss": 1.1466, + "mean_token_accuracy": 0.6887243527112848, "step": 305 }, { - "epoch": 0.2294172062904718, - "grad_norm": 0.9031651616096497, - "learning_rate": 2e-05, - "loss": 0.8281, - "mean_token_accuracy": 0.7545658190081542, + "epoch": 0.22586520947176686, + "grad_norm": 0.314193993806839, + "learning_rate": 1.905514051228164e-05, + "loss": 1.1185, + "mean_token_accuracy": 0.6978230337078649, "step": 310 }, { - "epoch": 0.23311748381128586, - "grad_norm": 0.8532033562660217, - "learning_rate": 2e-05, - "loss": 0.8178, - "mean_token_accuracy": 0.7587916660812325, + "epoch": 0.22950819672131148, + "grad_norm": 0.34623581171035767, + "learning_rate": 1.910828875774782e-05, + "loss": 1.1448, + "mean_token_accuracy": 0.6900097703957011, "step": 315 }, { - "epoch": 0.23681776133209992, - "grad_norm": 0.9483544826507568, - "learning_rate": 2e-05, - "loss": 0.8451, - "mean_token_accuracy": 0.7510137117833572, + "epoch": 0.2331511839708561, + "grad_norm": 0.3344809114933014, + "learning_rate": 1.9160599988096528e-05, + "loss": 1.1514, + "mean_token_accuracy": 0.6889502931118712, "step": 320 }, { - "epoch": 0.24051803885291398, - "grad_norm": 0.9190371036529541, - "learning_rate": 2e-05, - "loss": 0.8236, - "mean_token_accuracy": 0.7560184304098765, + "epoch": 0.23679417122040072, + "grad_norm": 0.32068851590156555, + "learning_rate": 1.9212100158869457e-05, + "loss": 1.1423, + "mean_token_accuracy": 0.6901746458231559, "step": 325 }, { - "epoch": 0.24421831637372804, - "grad_norm": 0.8880885243415833, - "learning_rate": 2e-05, - "loss": 0.8116, - "mean_token_accuracy": 0.7590534226852792, + "epoch": 0.24043715846994534, + "grad_norm": 0.3246494233608246, + "learning_rate": 1.92628140367103e-05, + "loss": 1.0916, + "mean_token_accuracy": 0.7032120175867121, "step": 330 }, { - "epoch": 0.2479185938945421, - "grad_norm": 0.9330101609230042, - "learning_rate": 2e-05, - "loss": 0.8302, - "mean_token_accuracy": 0.7547550656313512, + "epoch": 0.24408014571949, + "grad_norm": 0.3234722912311554, + "learning_rate": 1.9312765270884355e-05, + "loss": 1.149, + "mean_token_accuracy": 0.687756472887152, "step": 335 }, { - "epoch": 0.25161887141535616, - "grad_norm": 0.8674152493476868, - "learning_rate": 2e-05, - "loss": 0.8351, - "mean_token_accuracy": 0.7541465255919348, + "epoch": 0.24772313296903462, + "grad_norm": 0.3160967528820038, + "learning_rate": 1.9361976459499798e-05, + "loss": 1.0939, + "mean_token_accuracy": 0.7012060332193453, "step": 340 }, { - "epoch": 0.2553191489361702, - "grad_norm": 0.9083060026168823, - "learning_rate": 2e-05, - "loss": 0.8345, - "mean_token_accuracy": 0.7528752633371677, + "epoch": 0.25136612021857924, + "grad_norm": 0.322691947221756, + "learning_rate": 1.9410469210894726e-05, + "loss": 1.1172, + "mean_token_accuracy": 0.6959697117733267, "step": 345 }, { - "epoch": 0.2590194264569843, - "grad_norm": 0.9060890078544617, - "learning_rate": 2e-05, - "loss": 0.8216, - "mean_token_accuracy": 0.7575168041307812, + "epoch": 0.2550091074681239, + "grad_norm": 0.31874555349349976, + "learning_rate": 1.9458264200607405e-05, + "loss": 1.1018, + "mean_token_accuracy": 0.7011755007327797, "step": 350 }, { - "epoch": 0.2627197039777983, - "grad_norm": 0.8555945158004761, - "learning_rate": 2e-05, - "loss": 0.783, - "mean_token_accuracy": 0.7672426588048891, + "epoch": 0.2586520947176685, + "grad_norm": 0.3494158387184143, + "learning_rate": 1.9505381224305674e-05, + "loss": 1.1373, + "mean_token_accuracy": 0.6918753053248655, "step": 355 }, { - "epoch": 0.2664199814986124, - "grad_norm": 0.8146703839302063, - "learning_rate": 2e-05, - "loss": 0.8284, - "mean_token_accuracy": 0.7543931949875237, + "epoch": 0.26229508196721313, + "grad_norm": 0.3095668852329254, + "learning_rate": 1.9551839247014558e-05, + "loss": 1.1233, + "mean_token_accuracy": 0.6946201758671228, "step": 360 }, { - "epoch": 0.27012025901942643, - "grad_norm": 0.9369416832923889, - "learning_rate": 2e-05, - "loss": 0.8016, - "mean_token_accuracy": 0.7621930585328915, + "epoch": 0.2659380692167577, + "grad_norm": 0.33371731638908386, + "learning_rate": 1.9597656448948597e-05, + "loss": 1.1395, + "mean_token_accuracy": 0.6902143380556913, "step": 365 }, { - "epoch": 0.2738205365402405, - "grad_norm": 0.7886361479759216, - "learning_rate": 2e-05, - "loss": 0.8076, - "mean_token_accuracy": 0.7606381698886774, + "epoch": 0.26958105646630237, + "grad_norm": 0.3272486925125122, + "learning_rate": 1.9642850268226008e-05, + "loss": 1.1334, + "mean_token_accuracy": 0.6928737176355642, "step": 370 }, { - "epoch": 0.27752081406105455, - "grad_norm": 0.9856371879577637, - "learning_rate": 2e-05, - "loss": 0.8025, - "mean_token_accuracy": 0.7619788416802726, + "epoch": 0.273224043715847, + "grad_norm": 0.34388625621795654, + "learning_rate": 1.9687437440715852e-05, + "loss": 1.1345, + "mean_token_accuracy": 0.6911791646311675, "step": 375 }, { - "epoch": 0.28122109158186864, - "grad_norm": 0.9179413318634033, - "learning_rate": 2e-05, - "loss": 0.8102, - "mean_token_accuracy": 0.7585128919069817, + "epoch": 0.2768670309653916, + "grad_norm": 0.2946622967720032, + "learning_rate": 1.973143403724608e-05, + "loss": 1.1011, + "mean_token_accuracy": 0.6998412310698583, "step": 380 }, { - "epoch": 0.2849213691026827, - "grad_norm": 0.945956289768219, - "learning_rate": 2e-05, - "loss": 0.7964, - "mean_token_accuracy": 0.7646142601688602, + "epoch": 0.28051001821493626, + "grad_norm": 0.3136511445045471, + "learning_rate": 1.9774855498379478e-05, + "loss": 1.1306, + "mean_token_accuracy": 0.6944949926722032, "step": 385 }, { - "epoch": 0.28862164662349676, - "grad_norm": 0.9356254935264587, - "learning_rate": 2e-05, - "loss": 0.8281, - "mean_token_accuracy": 0.7552207106091118, + "epoch": 0.28415300546448086, + "grad_norm": 0.3592908978462219, + "learning_rate": 1.9817716666945786e-05, + "loss": 1.0909, + "mean_token_accuracy": 0.7024273326819737, "step": 390 }, { - "epoch": 0.2923219241443108, - "grad_norm": 0.8899054527282715, - "learning_rate": 2e-05, - "loss": 0.8269, - "mean_token_accuracy": 0.7552909439013166, + "epoch": 0.2877959927140255, + "grad_norm": 0.34179943799972534, + "learning_rate": 1.9860031818501565e-05, + "loss": 1.1172, + "mean_token_accuracy": 0.6946171226184662, "step": 395 }, { - "epoch": 0.2960222016651249, - "grad_norm": 0.9562066793441772, - "learning_rate": 2e-05, - "loss": 0.8067, - "mean_token_accuracy": 0.7602417180096708, + "epoch": 0.29143897996357016, + "grad_norm": 0.3114219605922699, + "learning_rate": 1.9901814689874147e-05, + "loss": 1.0922, + "mean_token_accuracy": 0.7006564484611626, "step": 400 }, { - "epoch": 0.2960222016651249, - "eval_loss": 0.8349066376686096, - "eval_mean_token_accuracy": 0.7500406710334881, - "eval_runtime": 28.1787, - "eval_samples_per_second": 9.12, - "eval_steps_per_second": 1.171, + "epoch": 0.29143897996357016, + "eval_loss": 1.1195729970932007, + "eval_mean_token_accuracy": 0.6934797960919029, + "eval_runtime": 14.5378, + "eval_samples_per_second": 18.022, + "eval_steps_per_second": 1.169, "step": 400 }, { - "epoch": 0.299722479185939, - "grad_norm": 1.0249305963516235, - "learning_rate": 2e-05, - "loss": 0.8002, - "mean_token_accuracy": 0.7625656294392676, + "epoch": 0.29508196721311475, + "grad_norm": 0.32596296072006226, + "learning_rate": 1.994307850593259e-05, + "loss": 1.1116, + "mean_token_accuracy": 0.6975177088422082, "step": 405 }, { - "epoch": 0.303422756706753, - "grad_norm": 0.8880975842475891, - "learning_rate": 2e-05, - "loss": 0.8045, - "mean_token_accuracy": 0.7602248209978305, + "epoch": 0.2987249544626594, + "grad_norm": 0.33011898398399353, + "learning_rate": 1.9983836004716048e-05, + "loss": 1.1279, + "mean_token_accuracy": 0.6937744259892525, "step": 410 }, { - "epoch": 0.3071230342275671, - "grad_norm": 0.8439946174621582, + "epoch": 0.302367941712204, + "grad_norm": 0.3652317523956299, "learning_rate": 2e-05, - "loss": 0.8164, - "mean_token_accuracy": 0.75832583094361, + "loss": 1.1157, + "mean_token_accuracy": 0.6966597459697119, "step": 415 }, { - "epoch": 0.3108233117483811, - "grad_norm": 0.9006332159042358, + "epoch": 0.30601092896174864, + "grad_norm": 0.31457507610321045, "learning_rate": 2e-05, - "loss": 0.8231, - "mean_token_accuracy": 0.7574048409584961, + "loss": 1.1197, + "mean_token_accuracy": 0.6949621397166584, "step": 420 }, { - "epoch": 0.3145235892691952, - "grad_norm": 0.8789312243461609, + "epoch": 0.30965391621129323, + "grad_norm": 0.31319767236709595, "learning_rate": 2e-05, - "loss": 0.8076, - "mean_token_accuracy": 0.7590419888879876, + "loss": 1.0992, + "mean_token_accuracy": 0.7012732046897898, "step": 425 }, { - "epoch": 0.31822386679000925, - "grad_norm": 0.8401655554771423, + "epoch": 0.3132969034608379, + "grad_norm": 0.2934488356113434, "learning_rate": 2e-05, - "loss": 0.8405, - "mean_token_accuracy": 0.7504104069436911, + "loss": 1.0945, + "mean_token_accuracy": 0.6996916218856863, "step": 430 }, { - "epoch": 0.32192414431082333, - "grad_norm": 0.8797054290771484, + "epoch": 0.31693989071038253, + "grad_norm": 0.33269914984703064, "learning_rate": 2e-05, - "loss": 0.8261, - "mean_token_accuracy": 0.7548758435645662, + "loss": 1.1113, + "mean_token_accuracy": 0.6966261602344895, "step": 435 }, { - "epoch": 0.32562442183163737, - "grad_norm": 0.8514414429664612, + "epoch": 0.3205828779599271, + "grad_norm": 0.33153873682022095, "learning_rate": 2e-05, - "loss": 0.7757, - "mean_token_accuracy": 0.7683710895600167, + "loss": 1.1204, + "mean_token_accuracy": 0.6963574743527113, "step": 440 }, { - "epoch": 0.32932469935245146, - "grad_norm": 0.8692080974578857, + "epoch": 0.3242258652094718, + "grad_norm": 0.32638055086135864, "learning_rate": 2e-05, - "loss": 0.8089, - "mean_token_accuracy": 0.7593312312489597, + "loss": 1.0837, + "mean_token_accuracy": 0.7034817894672104, "step": 445 }, { - "epoch": 0.3330249768732655, - "grad_norm": 0.8913412690162659, + "epoch": 0.32786885245901637, + "grad_norm": 0.3373214602470398, "learning_rate": 2e-05, - "loss": 0.8234, - "mean_token_accuracy": 0.7565642409607426, + "loss": 1.1176, + "mean_token_accuracy": 0.6957162921348317, "step": 450 }, { - "epoch": 0.3367252543940796, - "grad_norm": 0.8350377678871155, + "epoch": 0.331511839708561, + "grad_norm": 0.35421860218048096, "learning_rate": 2e-05, - "loss": 0.7822, - "mean_token_accuracy": 0.7671309817518462, + "loss": 1.0935, + "mean_token_accuracy": 0.7031082071323889, "step": 455 }, { - "epoch": 0.3404255319148936, - "grad_norm": 0.8672026991844177, + "epoch": 0.33515482695810567, + "grad_norm": 0.32108989357948303, "learning_rate": 2e-05, - "loss": 0.8021, - "mean_token_accuracy": 0.7619013855929817, + "loss": 1.0774, + "mean_token_accuracy": 0.7054744748412312, "step": 460 }, { - "epoch": 0.3441258094357077, - "grad_norm": 0.8754116892814636, + "epoch": 0.33879781420765026, + "grad_norm": 0.3276057839393616, "learning_rate": 2e-05, - "loss": 0.8281, - "mean_token_accuracy": 0.7542948754676929, + "loss": 1.0878, + "mean_token_accuracy": 0.7024872572898195, "step": 465 }, { - "epoch": 0.34782608695652173, - "grad_norm": 0.8823506236076355, + "epoch": 0.3424408014571949, + "grad_norm": 0.3203645348548889, "learning_rate": 2e-05, - "loss": 0.8224, - "mean_token_accuracy": 0.7557379489990506, + "loss": 1.1107, + "mean_token_accuracy": 0.6970383488031264, "step": 470 }, { - "epoch": 0.3515263644773358, - "grad_norm": 0.8378307819366455, + "epoch": 0.3460837887067395, + "grad_norm": 0.3142698109149933, "learning_rate": 2e-05, - "loss": 0.8235, - "mean_token_accuracy": 0.7557773294176604, + "loss": 1.0818, + "mean_token_accuracy": 0.7031570591108939, "step": 475 }, { - "epoch": 0.35522664199814985, - "grad_norm": 0.764640748500824, + "epoch": 0.34972677595628415, + "grad_norm": 0.36916568875312805, "learning_rate": 2e-05, - "loss": 0.7811, - "mean_token_accuracy": 0.7674092078356458, + "loss": 1.1035, + "mean_token_accuracy": 0.6999328285295553, "step": 480 }, { - "epoch": 0.35892691951896394, - "grad_norm": 0.8759243488311768, + "epoch": 0.3533697632058288, + "grad_norm": 0.33288297057151794, "learning_rate": 2e-05, - "loss": 0.8143, - "mean_token_accuracy": 0.7586287599334172, + "loss": 1.1184, + "mean_token_accuracy": 0.6936950415241816, "step": 485 }, { - "epoch": 0.36262719703977797, - "grad_norm": 0.8526579141616821, + "epoch": 0.3570127504553734, + "grad_norm": 0.2986063063144684, "learning_rate": 2e-05, - "loss": 0.7947, - "mean_token_accuracy": 0.7634890650281875, + "loss": 1.07, + "mean_token_accuracy": 0.7059324621397166, "step": 490 }, { - "epoch": 0.36632747456059206, - "grad_norm": 0.8783523440361023, + "epoch": 0.36065573770491804, + "grad_norm": 0.33390265703201294, "learning_rate": 2e-05, - "loss": 0.8113, - "mean_token_accuracy": 0.758303520901475, + "loss": 1.0834, + "mean_token_accuracy": 0.7028731069858329, "step": 495 }, { - "epoch": 0.3700277520814061, - "grad_norm": 0.8912361264228821, + "epoch": 0.36429872495446264, + "grad_norm": 0.3243837058544159, "learning_rate": 2e-05, - "loss": 0.8004, - "mean_token_accuracy": 0.7601969218920052, + "loss": 1.0766, + "mean_token_accuracy": 0.7079109672691744, "step": 500 }, { - "epoch": 0.3700277520814061, - "eval_loss": 0.8273892402648926, - "eval_mean_token_accuracy": 0.7517986458893432, - "eval_runtime": 28.2564, - "eval_samples_per_second": 9.095, - "eval_steps_per_second": 1.168, + "epoch": 0.36429872495446264, + "eval_loss": 1.1029341220855713, + "eval_mean_token_accuracy": 0.6966890054732253, + "eval_runtime": 14.5663, + "eval_samples_per_second": 17.987, + "eval_steps_per_second": 1.167, "step": 500 }, { - "epoch": 0.3737280296022202, - "grad_norm": 0.8783790469169617, + "epoch": 0.3679417122040073, + "grad_norm": 0.3292768597602844, "learning_rate": 2e-05, - "loss": 0.8166, - "mean_token_accuracy": 0.7571132549186148, + "loss": 1.0895, + "mean_token_accuracy": 0.7004579872984855, "step": 505 }, { - "epoch": 0.3774283071230342, - "grad_norm": 0.8384273648262024, + "epoch": 0.37158469945355194, + "grad_norm": 0.34203407168388367, "learning_rate": 2e-05, - "loss": 0.7976, - "mean_token_accuracy": 0.7625027165364132, + "loss": 1.0898, + "mean_token_accuracy": 0.7014441866145578, "step": 510 }, { - "epoch": 0.3811285846438483, - "grad_norm": 0.8020018339157104, + "epoch": 0.37522768670309653, + "grad_norm": 0.36646661162376404, "learning_rate": 2e-05, - "loss": 0.8005, - "mean_token_accuracy": 0.7613467266492346, + "loss": 1.1225, + "mean_token_accuracy": 0.6927393746946752, "step": 515 }, { - "epoch": 0.38482886216466233, - "grad_norm": 0.864335298538208, + "epoch": 0.3788706739526412, + "grad_norm": 0.32106146216392517, "learning_rate": 2e-05, - "loss": 0.8166, - "mean_token_accuracy": 0.7572026062300129, + "loss": 1.1075, + "mean_token_accuracy": 0.6962872496336103, "step": 520 }, { - "epoch": 0.3885291396854764, - "grad_norm": 0.8236207365989685, + "epoch": 0.3825136612021858, + "grad_norm": 0.3438628315925598, "learning_rate": 2e-05, - "loss": 0.8345, - "mean_token_accuracy": 0.7512810127712488, + "loss": 1.0885, + "mean_token_accuracy": 0.7010411577918904, "step": 525 }, { - "epoch": 0.39222941720629045, - "grad_norm": 0.922486424446106, + "epoch": 0.3861566484517304, + "grad_norm": 0.3221867084503174, "learning_rate": 2e-05, - "loss": 0.8128, - "mean_token_accuracy": 0.7574585249731419, + "loss": 1.0891, + "mean_token_accuracy": 0.701841108939912, "step": 530 }, { - "epoch": 0.39592969472710454, - "grad_norm": 0.8133776187896729, + "epoch": 0.38979963570127507, + "grad_norm": 0.3283216655254364, "learning_rate": 2e-05, - "loss": 0.8045, - "mean_token_accuracy": 0.7600427585053521, + "loss": 1.1008, + "mean_token_accuracy": 0.6979451636541281, "step": 535 }, { - "epoch": 0.3996299722479186, - "grad_norm": 0.9678435921669006, + "epoch": 0.39344262295081966, + "grad_norm": 0.3395269811153412, "learning_rate": 2e-05, - "loss": 0.8434, - "mean_token_accuracy": 0.748454559157412, + "loss": 1.089, + "mean_token_accuracy": 0.702310933274755, "step": 540 }, { - "epoch": 0.40333024976873266, - "grad_norm": 0.8359988927841187, + "epoch": 0.3970856102003643, + "grad_norm": 0.32621896266937256, "learning_rate": 2e-05, - "loss": 0.8218, - "mean_token_accuracy": 0.7548506933279708, + "loss": 1.0759, + "mean_token_accuracy": 0.7054286761113826, "step": 545 }, { - "epoch": 0.4070305272895467, - "grad_norm": 0.9044314622879028, + "epoch": 0.4007285974499089, + "grad_norm": 0.34217944741249084, "learning_rate": 2e-05, - "loss": 0.773, - "mean_token_accuracy": 0.7704903178347341, + "loss": 1.1018, + "mean_token_accuracy": 0.6971726917440155, "step": 550 }, { - "epoch": 0.4107308048103608, - "grad_norm": 0.8256170153617859, + "epoch": 0.40437158469945356, + "grad_norm": 0.3205217719078064, "learning_rate": 2e-05, - "loss": 0.7998, - "mean_token_accuracy": 0.7627984143152889, + "loss": 1.0865, + "mean_token_accuracy": 0.7010564240351734, "step": 555 }, { - "epoch": 0.4144310823311748, - "grad_norm": 0.8337195515632629, + "epoch": 0.4080145719489982, + "grad_norm": 0.3366459906101227, "learning_rate": 2e-05, - "loss": 0.7815, - "mean_token_accuracy": 0.7671558285449628, + "loss": 1.1069, + "mean_token_accuracy": 0.6947758915486076, "step": 560 }, { - "epoch": 0.4181313598519889, - "grad_norm": 0.824918270111084, + "epoch": 0.4116575591985428, + "grad_norm": 0.33357521891593933, "learning_rate": 2e-05, - "loss": 0.7931, - "mean_token_accuracy": 0.7636928037001625, + "loss": 1.0831, + "mean_token_accuracy": 0.7024853444064484, "step": 565 }, { - "epoch": 0.42183163737280294, - "grad_norm": 0.8702638149261475, + "epoch": 0.41530054644808745, + "grad_norm": 0.3021646738052368, "learning_rate": 2e-05, - "loss": 0.7799, - "mean_token_accuracy": 0.766696658125059, + "loss": 1.0934, + "mean_token_accuracy": 0.6990626526624328, "step": 570 }, { - "epoch": 0.425531914893617, - "grad_norm": 0.9347019195556641, + "epoch": 0.41894353369763204, + "grad_norm": 0.3405080735683441, "learning_rate": 2e-05, - "loss": 0.8027, - "mean_token_accuracy": 0.759642292609576, + "loss": 1.0948, + "mean_token_accuracy": 0.6999297752808988, "step": 575 }, { - "epoch": 0.42923219241443106, - "grad_norm": 0.8226842284202576, + "epoch": 0.4225865209471767, + "grad_norm": 0.35369783639907837, "learning_rate": 2e-05, - "loss": 0.8318, - "mean_token_accuracy": 0.7513268577092702, + "loss": 1.0664, + "mean_token_accuracy": 0.7070591108939912, "step": 580 }, { - "epoch": 0.43293246993524515, - "grad_norm": 0.8303760886192322, + "epoch": 0.4262295081967213, + "grad_norm": 0.32571864128112793, "learning_rate": 2e-05, - "loss": 0.7847, - "mean_token_accuracy": 0.7666960565561379, + "loss": 1.0829, + "mean_token_accuracy": 0.701379174799081, "step": 585 }, { - "epoch": 0.4366327474560592, - "grad_norm": 0.864280641078949, + "epoch": 0.42987249544626593, + "grad_norm": 0.3396914005279541, "learning_rate": 2e-05, - "loss": 0.7967, - "mean_token_accuracy": 0.7617017729986968, + "loss": 1.1002, + "mean_token_accuracy": 0.6994626282364436, "step": 590 }, { - "epoch": 0.44033302497687327, - "grad_norm": 0.8039905428886414, + "epoch": 0.4335154826958106, + "grad_norm": 0.3538070023059845, "learning_rate": 2e-05, - "loss": 0.7952, - "mean_token_accuracy": 0.7631796585579562, + "loss": 1.0985, + "mean_token_accuracy": 0.6981222520762091, "step": 595 }, { - "epoch": 0.4440333024976873, - "grad_norm": 0.8144698143005371, + "epoch": 0.4371584699453552, + "grad_norm": 0.31537488102912903, "learning_rate": 2e-05, - "loss": 0.7595, - "mean_token_accuracy": 0.7731878196083116, + "loss": 1.0523, + "mean_token_accuracy": 0.709605520273571, "step": 600 }, { - "epoch": 0.4440333024976873, - "eval_loss": 0.8222786784172058, - "eval_mean_token_accuracy": 0.7522364520321178, - "eval_runtime": 28.1935, - "eval_samples_per_second": 9.116, - "eval_steps_per_second": 1.17, + "epoch": 0.4371584699453552, + "eval_loss": 1.0888596773147583, + "eval_mean_token_accuracy": 0.6988977098839045, + "eval_runtime": 14.4857, + "eval_samples_per_second": 18.087, + "eval_steps_per_second": 1.174, "step": 600 }, { - "epoch": 0.4477335800185014, - "grad_norm": 0.898853600025177, + "epoch": 0.4408014571948998, + "grad_norm": 0.3270356059074402, "learning_rate": 2e-05, - "loss": 0.8311, - "mean_token_accuracy": 0.7522999408717911, + "loss": 1.0672, + "mean_token_accuracy": 0.7046531509526136, "step": 605 }, { - "epoch": 0.4514338575393155, - "grad_norm": 0.7734112739562988, + "epoch": 0.4444444444444444, + "grad_norm": 0.36020419001579285, "learning_rate": 2e-05, - "loss": 0.7892, - "mean_token_accuracy": 0.7639044411576632, + "loss": 1.0966, + "mean_token_accuracy": 0.6975146555935515, "step": 610 }, { - "epoch": 0.4551341350601295, - "grad_norm": 0.8656638860702515, + "epoch": 0.44808743169398907, + "grad_norm": 0.33137136697769165, "learning_rate": 2e-05, - "loss": 0.7942, - "mean_token_accuracy": 0.7623657197010026, + "loss": 1.076, + "mean_token_accuracy": 0.7033310942843185, "step": 615 }, { - "epoch": 0.4588344125809436, - "grad_norm": 0.9421690106391907, + "epoch": 0.4517304189435337, + "grad_norm": 0.2987717390060425, "learning_rate": 2e-05, - "loss": 0.7726, - "mean_token_accuracy": 0.7677914333370994, + "loss": 1.0588, + "mean_token_accuracy": 0.7078102100635075, "step": 620 }, { - "epoch": 0.46253469010175763, - "grad_norm": 0.810966968536377, + "epoch": 0.4553734061930783, + "grad_norm": 0.30083930492401123, "learning_rate": 2e-05, - "loss": 0.7738, - "mean_token_accuracy": 0.7682241466373756, + "loss": 1.0457, + "mean_token_accuracy": 0.7125610649731315, "step": 625 }, { - "epoch": 0.4662349676225717, - "grad_norm": 0.8764069080352783, + "epoch": 0.45901639344262296, + "grad_norm": 0.3193149268627167, "learning_rate": 2e-05, - "loss": 0.764, - "mean_token_accuracy": 0.7716749303779987, + "loss": 1.0729, + "mean_token_accuracy": 0.7045462872496338, "step": 630 }, { - "epoch": 0.46993524514338575, - "grad_norm": 0.8801226615905762, + "epoch": 0.46265938069216755, + "grad_norm": 0.3396466076374054, "learning_rate": 2e-05, - "loss": 0.7868, - "mean_token_accuracy": 0.7653086539883042, + "loss": 1.0784, + "mean_token_accuracy": 0.7025983146067416, "step": 635 }, { - "epoch": 0.47363552266419984, - "grad_norm": 0.789652943611145, + "epoch": 0.4663023679417122, + "grad_norm": 0.32033106684684753, "learning_rate": 2e-05, - "loss": 0.7996, - "mean_token_accuracy": 0.7604761749526181, + "loss": 1.0852, + "mean_token_accuracy": 0.7016365412799219, "step": 640 }, { - "epoch": 0.47733580018501387, - "grad_norm": 0.8484530448913574, + "epoch": 0.46994535519125685, + "grad_norm": 0.32467973232269287, "learning_rate": 2e-05, - "loss": 0.7966, - "mean_token_accuracy": 0.7630161460550556, + "loss": 1.0679, + "mean_token_accuracy": 0.7072850512945774, "step": 645 }, { - "epoch": 0.48103607770582796, - "grad_norm": 0.8077908754348755, + "epoch": 0.47358834244080145, + "grad_norm": 0.3388020396232605, "learning_rate": 2e-05, - "loss": 0.7931, - "mean_token_accuracy": 0.763731608290881, + "loss": 1.0266, + "mean_token_accuracy": 0.7156540058622374, "step": 650 }, { - "epoch": 0.484736355226642, - "grad_norm": 0.8435987830162048, + "epoch": 0.4772313296903461, + "grad_norm": 0.31007012724876404, "learning_rate": 2e-05, - "loss": 0.7695, - "mean_token_accuracy": 0.7693811651575391, + "loss": 1.0441, + "mean_token_accuracy": 0.7133701758671225, "step": 655 }, { - "epoch": 0.4884366327474561, - "grad_norm": 0.7886292934417725, + "epoch": 0.4808743169398907, + "grad_norm": 0.3379007577896118, "learning_rate": 2e-05, - "loss": 0.794, - "mean_token_accuracy": 0.7628652561375162, + "loss": 1.0474, + "mean_token_accuracy": 0.7104818026380066, "step": 660 }, { - "epoch": 0.4921369102682701, - "grad_norm": 0.9320831894874573, + "epoch": 0.48451730418943534, + "grad_norm": 0.33584991097450256, "learning_rate": 2e-05, - "loss": 0.8054, - "mean_token_accuracy": 0.7584387909891809, + "loss": 1.0816, + "mean_token_accuracy": 0.7025181547348566, "step": 665 }, { - "epoch": 0.4958371877890842, - "grad_norm": 0.8228521943092346, + "epoch": 0.48816029143898, + "grad_norm": 0.3317875266075134, "learning_rate": 2e-05, - "loss": 0.7931, - "mean_token_accuracy": 0.762219202960193, + "loss": 1.0785, + "mean_token_accuracy": 0.7022074987787005, "step": 670 }, { - "epoch": 0.49953746530989823, - "grad_norm": 0.9157373905181885, + "epoch": 0.4918032786885246, + "grad_norm": 0.31039467453956604, "learning_rate": 2e-05, - "loss": 0.7726, - "mean_token_accuracy": 0.766911525229679, + "loss": 1.0751, + "mean_token_accuracy": 0.7024639716658523, "step": 675 }, { - "epoch": 0.5032377428307123, - "grad_norm": 0.8757467865943909, + "epoch": 0.49544626593806923, + "grad_norm": 0.32393017411231995, "learning_rate": 2e-05, - "loss": 0.7925, - "mean_token_accuracy": 0.7622793847521553, + "loss": 1.0702, + "mean_token_accuracy": 0.706164509037616, "step": 680 }, { - "epoch": 0.5069380203515264, - "grad_norm": 0.9015477895736694, + "epoch": 0.4990892531876138, + "grad_norm": 0.3062119483947754, "learning_rate": 2e-05, - "loss": 0.808, - "mean_token_accuracy": 0.7594514579501054, + "loss": 1.0657, + "mean_token_accuracy": 0.708112481680508, "step": 685 }, { - "epoch": 0.5106382978723404, - "grad_norm": 0.8006423711776733, + "epoch": 0.5027322404371585, + "grad_norm": 0.30919599533081055, "learning_rate": 2e-05, - "loss": 0.7802, - "mean_token_accuracy": 0.7662706193752193, + "loss": 1.0531, + "mean_token_accuracy": 0.7101428920371275, "step": 690 }, { - "epoch": 0.5143385753931545, - "grad_norm": 0.9912610650062561, + "epoch": 0.5063752276867031, + "grad_norm": 0.3312112092971802, "learning_rate": 2e-05, - "loss": 0.7897, - "mean_token_accuracy": 0.761338202651713, + "loss": 1.0533, + "mean_token_accuracy": 0.709449804592086, "step": 695 }, { - "epoch": 0.5180388529139686, - "grad_norm": 0.8785173296928406, + "epoch": 0.5100182149362478, + "grad_norm": 0.3210844397544861, "learning_rate": 2e-05, - "loss": 0.7866, - "mean_token_accuracy": 0.7645523463170034, + "loss": 1.0521, + "mean_token_accuracy": 0.7106863702979971, "step": 700 }, { - "epoch": 0.5180388529139686, - "eval_loss": 0.816895604133606, - "eval_mean_token_accuracy": 0.7541631101443679, - "eval_runtime": 28.2109, - "eval_samples_per_second": 9.11, - "eval_steps_per_second": 1.17, + "epoch": 0.5100182149362478, + "eval_loss": 1.0767533779144287, + "eval_mean_token_accuracy": 0.7011067128365758, + "eval_runtime": 14.4944, + "eval_samples_per_second": 18.076, + "eval_steps_per_second": 1.173, "step": 700 }, { - "epoch": 0.5217391304347826, - "grad_norm": 0.9194395542144775, + "epoch": 0.5136612021857924, + "grad_norm": 0.3688066899776459, "learning_rate": 2e-05, - "loss": 0.7962, - "mean_token_accuracy": 0.7632879072410094, + "loss": 1.0859, + "mean_token_accuracy": 0.7002839521250611, "step": 705 }, { - "epoch": 0.5254394079555966, - "grad_norm": 0.8533481359481812, + "epoch": 0.517304189435337, + "grad_norm": 0.32076314091682434, "learning_rate": 2e-05, - "loss": 0.7945, - "mean_token_accuracy": 0.7621978633882525, + "loss": 1.0545, + "mean_token_accuracy": 0.7069553004396677, "step": 710 }, { - "epoch": 0.5291396854764108, - "grad_norm": 0.7783718705177307, + "epoch": 0.5209471766848816, + "grad_norm": 0.35336294770240784, "learning_rate": 2e-05, - "loss": 0.7771, - "mean_token_accuracy": 0.7678239959564587, + "loss": 1.0638, + "mean_token_accuracy": 0.7060545920859795, "step": 715 }, { - "epoch": 0.5328399629972248, - "grad_norm": 0.9274916052818298, + "epoch": 0.5245901639344263, + "grad_norm": 0.32573992013931274, "learning_rate": 2e-05, - "loss": 0.7996, - "mean_token_accuracy": 0.7604699077057977, + "loss": 1.0584, + "mean_token_accuracy": 0.7088574743527113, "step": 720 }, { - "epoch": 0.5365402405180388, - "grad_norm": 0.8770431280136108, + "epoch": 0.5282331511839709, + "grad_norm": 0.3330203890800476, "learning_rate": 2e-05, - "loss": 0.7845, - "mean_token_accuracy": 0.7654583712388715, + "loss": 1.0631, + "mean_token_accuracy": 0.7062377870053738, "step": 725 }, { - "epoch": 0.5402405180388529, - "grad_norm": 0.8453335762023926, + "epoch": 0.5318761384335154, + "grad_norm": 0.3228294849395752, "learning_rate": 2e-05, - "loss": 0.7938, - "mean_token_accuracy": 0.761018252526195, + "loss": 1.0201, + "mean_token_accuracy": 0.7158768930141669, "step": 730 }, { - "epoch": 0.543940795559667, - "grad_norm": 0.7490786910057068, + "epoch": 0.5355191256830601, + "grad_norm": 0.32984620332717896, "learning_rate": 2e-05, - "loss": 0.8167, - "mean_token_accuracy": 0.7549528979795714, + "loss": 1.0253, + "mean_token_accuracy": 0.715898265754763, "step": 735 }, { - "epoch": 0.547641073080481, - "grad_norm": 0.9425523281097412, + "epoch": 0.5391621129326047, + "grad_norm": 0.30142080783843994, "learning_rate": 2e-05, - "loss": 0.8059, - "mean_token_accuracy": 0.7600661300171085, + "loss": 1.0556, + "mean_token_accuracy": 0.7093948461162678, "step": 740 }, { - "epoch": 0.5513413506012951, - "grad_norm": 0.8289265036582947, + "epoch": 0.5428051001821493, + "grad_norm": 0.3037840723991394, "learning_rate": 2e-05, - "loss": 0.7797, - "mean_token_accuracy": 0.7658190325778043, + "loss": 1.0697, + "mean_token_accuracy": 0.7060240595994138, "step": 745 }, { - "epoch": 0.5550416281221091, - "grad_norm": 0.808990478515625, + "epoch": 0.546448087431694, + "grad_norm": 0.2929915487766266, "learning_rate": 2e-05, - "loss": 0.7919, - "mean_token_accuracy": 0.7639361658364197, + "loss": 1.0601, + "mean_token_accuracy": 0.7079384465070836, "step": 750 }, { - "epoch": 0.5587419056429233, - "grad_norm": 0.8714848756790161, + "epoch": 0.5500910746812386, + "grad_norm": 0.32633528113365173, "learning_rate": 2e-05, - "loss": 0.8133, - "mean_token_accuracy": 0.7559352930609992, + "loss": 1.0715, + "mean_token_accuracy": 0.7061064973131412, "step": 755 }, { - "epoch": 0.5624421831637373, - "grad_norm": 0.8202362060546875, + "epoch": 0.5537340619307832, + "grad_norm": 0.33284929394721985, "learning_rate": 2e-05, - "loss": 0.7967, - "mean_token_accuracy": 0.7620283378489665, + "loss": 1.0601, + "mean_token_accuracy": 0.7047539081582804, "step": 760 }, { - "epoch": 0.5661424606845513, - "grad_norm": 0.8363901376724243, + "epoch": 0.5573770491803278, + "grad_norm": 0.3333057165145874, "learning_rate": 2e-05, - "loss": 0.7856, - "mean_token_accuracy": 0.7647013079094134, + "loss": 1.0572, + "mean_token_accuracy": 0.7077789853207801, "step": 765 }, { - "epoch": 0.5698427382053654, - "grad_norm": 0.7312168478965759, + "epoch": 0.5610200364298725, + "grad_norm": 0.3196307122707367, "learning_rate": 2e-05, - "loss": 0.7807, - "mean_token_accuracy": 0.7657156813892808, + "loss": 1.0413, + "mean_token_accuracy": 0.7101215192965314, "step": 770 }, { - "epoch": 0.5735430157261795, - "grad_norm": 0.8106317520141602, + "epoch": 0.5646630236794171, + "grad_norm": 0.3234025239944458, "learning_rate": 2e-05, - "loss": 0.7833, - "mean_token_accuracy": 0.7646042951530488, + "loss": 1.0412, + "mean_token_accuracy": 0.7131656082071325, "step": 775 }, { - "epoch": 0.5772432932469935, - "grad_norm": 0.879208505153656, + "epoch": 0.5683060109289617, + "grad_norm": 0.3486537039279938, "learning_rate": 2e-05, - "loss": 0.7906, - "mean_token_accuracy": 0.7629727484373736, + "loss": 1.043, + "mean_token_accuracy": 0.7117122618466047, "step": 780 }, { - "epoch": 0.5809435707678076, - "grad_norm": 0.8280200958251953, + "epoch": 0.5719489981785064, + "grad_norm": 0.31476178765296936, "learning_rate": 2e-05, - "loss": 0.7931, - "mean_token_accuracy": 0.7624752387623053, + "loss": 1.0723, + "mean_token_accuracy": 0.7013709086468003, "step": 785 }, { - "epoch": 0.5846438482886216, - "grad_norm": 0.8161231279373169, + "epoch": 0.575591985428051, + "grad_norm": 0.3405304253101349, "learning_rate": 2e-05, - "loss": 0.7979, - "mean_token_accuracy": 0.7606783559652616, + "loss": 1.0733, + "mean_token_accuracy": 0.7017128724963362, "step": 790 }, { - "epoch": 0.5883441258094357, - "grad_norm": 0.8787475228309631, + "epoch": 0.5792349726775956, + "grad_norm": 0.3531171679496765, "learning_rate": 2e-05, - "loss": 0.8031, - "mean_token_accuracy": 0.7610423345421407, + "loss": 1.0622, + "mean_token_accuracy": 0.7063660234489495, "step": 795 }, { - "epoch": 0.5920444033302498, - "grad_norm": 0.8519960641860962, + "epoch": 0.5828779599271403, + "grad_norm": 0.33499255776405334, "learning_rate": 2e-05, - "loss": 0.8219, - "mean_token_accuracy": 0.7543098449152789, + "loss": 1.0585, + "mean_token_accuracy": 0.7079750854909623, "step": 800 }, { - "epoch": 0.5920444033302498, - "eval_loss": 0.8132702708244324, - "eval_mean_token_accuracy": 0.7547294497721886, - "eval_runtime": 28.2905, - "eval_samples_per_second": 9.084, - "eval_steps_per_second": 1.166, + "epoch": 0.5828779599271403, + "eval_loss": 1.0676288604736328, + "eval_mean_token_accuracy": 0.7027526775347543, + "eval_runtime": 14.4235, + "eval_samples_per_second": 18.165, + "eval_steps_per_second": 1.179, "step": 800 }, { - "epoch": 0.5957446808510638, - "grad_norm": 0.8904110193252563, + "epoch": 0.5865209471766849, + "grad_norm": 0.3178112506866455, "learning_rate": 2e-05, - "loss": 0.7317, - "mean_token_accuracy": 0.7804520508941676, + "loss": 1.0546, + "mean_token_accuracy": 0.7070072056668294, "step": 805 }, { - "epoch": 0.599444958371878, - "grad_norm": 0.8179667592048645, + "epoch": 0.5901639344262295, + "grad_norm": 0.34171855449676514, "learning_rate": 2e-05, - "loss": 0.7845, - "mean_token_accuracy": 0.7655715065591602, + "loss": 1.0459, + "mean_token_accuracy": 0.7116786761113827, "step": 810 }, { - "epoch": 0.603145235892692, - "grad_norm": 0.7966557145118713, + "epoch": 0.5938069216757741, + "grad_norm": 0.3327518701553345, "learning_rate": 2e-05, - "loss": 0.7778, - "mean_token_accuracy": 0.7667068888225074, + "loss": 1.0714, + "mean_token_accuracy": 0.7016884465070834, "step": 815 }, { - "epoch": 0.606845513413506, - "grad_norm": 0.8500820994377136, + "epoch": 0.5974499089253188, + "grad_norm": 0.33222025632858276, "learning_rate": 2e-05, - "loss": 0.7909, - "mean_token_accuracy": 0.7624461584090862, + "loss": 1.0528, + "mean_token_accuracy": 0.7067659990229603, "step": 820 }, { - "epoch": 0.61054579093432, - "grad_norm": 0.876806378364563, + "epoch": 0.6010928961748634, + "grad_norm": 0.3413512706756592, "learning_rate": 2e-05, - "loss": 0.7986, - "mean_token_accuracy": 0.7598412421319061, + "loss": 1.0643, + "mean_token_accuracy": 0.7061461895456767, "step": 825 }, { - "epoch": 0.6142460684551342, - "grad_norm": 0.8352357745170593, + "epoch": 0.604735883424408, + "grad_norm": 0.35720759630203247, "learning_rate": 2e-05, - "loss": 0.8045, - "mean_token_accuracy": 0.7595125698366062, + "loss": 1.0611, + "mean_token_accuracy": 0.7056851489985345, "step": 830 }, { - "epoch": 0.6179463459759482, - "grad_norm": 0.8609703779220581, + "epoch": 0.6083788706739527, + "grad_norm": 0.36705562472343445, "learning_rate": 2e-05, - "loss": 0.7653, - "mean_token_accuracy": 0.7705634200292611, + "loss": 1.0749, + "mean_token_accuracy": 0.702274670249145, "step": 835 }, { - "epoch": 0.6216466234967623, - "grad_norm": 0.8633068203926086, + "epoch": 0.6120218579234973, + "grad_norm": 0.3292873203754425, "learning_rate": 2e-05, - "loss": 0.7696, - "mean_token_accuracy": 0.7682237681411259, + "loss": 1.0509, + "mean_token_accuracy": 0.7101093063019052, "step": 840 }, { - "epoch": 0.6253469010175763, - "grad_norm": 0.8756479620933533, + "epoch": 0.6156648451730419, + "grad_norm": 0.3292001187801361, "learning_rate": 2e-05, - "loss": 0.7786, - "mean_token_accuracy": 0.7672769183942045, + "loss": 1.0535, + "mean_token_accuracy": 0.7094223253541768, "step": 845 }, { - "epoch": 0.6290471785383904, - "grad_norm": 0.792131781578064, + "epoch": 0.6193078324225865, + "grad_norm": 0.3075976073741913, "learning_rate": 2e-05, - "loss": 0.8053, - "mean_token_accuracy": 0.7596922873144693, + "loss": 1.0812, + "mean_token_accuracy": 0.7013525891548608, "step": 850 }, { - "epoch": 0.6327474560592045, - "grad_norm": 0.7844769954681396, + "epoch": 0.6229508196721312, + "grad_norm": 0.32240450382232666, "learning_rate": 2e-05, - "loss": 0.7427, - "mean_token_accuracy": 0.7763440537258867, + "loss": 1.0659, + "mean_token_accuracy": 0.7051569369809476, "step": 855 }, { - "epoch": 0.6364477335800185, - "grad_norm": 0.770456850528717, + "epoch": 0.6265938069216758, + "grad_norm": 0.3207255005836487, "learning_rate": 2e-05, - "loss": 0.8082, - "mean_token_accuracy": 0.7579604529340764, + "loss": 1.0361, + "mean_token_accuracy": 0.7137518319491939, "step": 860 }, { - "epoch": 0.6401480111008325, - "grad_norm": 0.8686452507972717, + "epoch": 0.6302367941712204, + "grad_norm": 0.3257087469100952, "learning_rate": 2e-05, - "loss": 0.7895, - "mean_token_accuracy": 0.7622386697563842, + "loss": 1.0576, + "mean_token_accuracy": 0.7068789692232537, "step": 865 }, { - "epoch": 0.6438482886216467, - "grad_norm": 0.7865906357765198, + "epoch": 0.6338797814207651, + "grad_norm": 0.33827245235443115, "learning_rate": 2e-05, - "loss": 0.7622, - "mean_token_accuracy": 0.7711946538689294, + "loss": 1.0799, + "mean_token_accuracy": 0.6992122618466047, "step": 870 }, { - "epoch": 0.6475485661424607, - "grad_norm": 0.8619543313980103, + "epoch": 0.6375227686703097, + "grad_norm": 0.3211885094642639, "learning_rate": 2e-05, - "loss": 0.7776, - "mean_token_accuracy": 0.7680711539071554, + "loss": 1.0222, + "mean_token_accuracy": 0.7154677576941866, "step": 875 }, { - "epoch": 0.6512488436632747, - "grad_norm": 0.8202876448631287, + "epoch": 0.6411657559198543, + "grad_norm": 0.328647643327713, "learning_rate": 2e-05, - "loss": 0.7633, - "mean_token_accuracy": 0.7725941263025453, + "loss": 1.039, + "mean_token_accuracy": 0.7112500139591076, "step": 880 }, { - "epoch": 0.6549491211840888, - "grad_norm": 0.7937086820602417, + "epoch": 0.644808743169399, + "grad_norm": 0.3323783576488495, "learning_rate": 2e-05, - "loss": 0.803, - "mean_token_accuracy": 0.7600367861507152, + "loss": 1.0648, + "mean_token_accuracy": 0.7059385686370299, "step": 885 }, { - "epoch": 0.6586493987049029, - "grad_norm": 0.889613687992096, + "epoch": 0.6484517304189436, + "grad_norm": 0.32891303300857544, "learning_rate": 2e-05, - "loss": 0.7807, - "mean_token_accuracy": 0.7660721298978383, + "loss": 1.0393, + "mean_token_accuracy": 0.710622252076209, "step": 890 }, { - "epoch": 0.6623496762257169, - "grad_norm": 0.9099228382110596, + "epoch": 0.6520947176684881, + "grad_norm": 0.3386060297489166, "learning_rate": 2e-05, - "loss": 0.7911, - "mean_token_accuracy": 0.7620904760689602, + "loss": 1.0493, + "mean_token_accuracy": 0.7073888617489008, "step": 895 }, { - "epoch": 0.666049953746531, - "grad_norm": 0.8228977918624878, + "epoch": 0.6557377049180327, + "grad_norm": 0.29806962609291077, "learning_rate": 2e-05, - "loss": 0.7695, - "mean_token_accuracy": 0.7692359508353532, + "loss": 1.0482, + "mean_token_accuracy": 0.7085857352222765, "step": 900 }, { - "epoch": 0.666049953746531, - "eval_loss": 0.8099245429039001, - "eval_mean_token_accuracy": 0.7551735807763004, - "eval_runtime": 28.1688, - "eval_samples_per_second": 9.124, - "eval_steps_per_second": 1.172, + "epoch": 0.6557377049180327, + "eval_loss": 1.059637427330017, + "eval_mean_token_accuracy": 0.7044808439790701, + "eval_runtime": 14.4418, + "eval_samples_per_second": 18.142, + "eval_steps_per_second": 1.177, "step": 900 }, { - "epoch": 0.669750231267345, - "grad_norm": 0.8004016280174255, + "epoch": 0.6593806921675774, + "grad_norm": 0.346737265586853, "learning_rate": 2e-05, - "loss": 0.7658, - "mean_token_accuracy": 0.7700296924370021, + "loss": 1.0432, + "mean_token_accuracy": 0.7125244259892527, "step": 905 }, { - "epoch": 0.6734505087881592, - "grad_norm": 0.8088281154632568, + "epoch": 0.663023679417122, + "grad_norm": 0.35346153378486633, "learning_rate": 2e-05, - "loss": 0.7449, - "mean_token_accuracy": 0.7756549136654808, + "loss": 1.0375, + "mean_token_accuracy": 0.7128877625793846, "step": 910 }, { - "epoch": 0.6771507863089732, - "grad_norm": 0.8305289149284363, + "epoch": 0.6666666666666666, + "grad_norm": 0.32934707403182983, "learning_rate": 2e-05, - "loss": 0.7748, - "mean_token_accuracy": 0.7683322851408529, + "loss": 1.0322, + "mean_token_accuracy": 0.7146983390327307, "step": 915 }, { - "epoch": 0.6808510638297872, - "grad_norm": 0.8163439631462097, + "epoch": 0.6703096539162113, + "grad_norm": 0.3805268704891205, "learning_rate": 2e-05, - "loss": 0.7852, - "mean_token_accuracy": 0.7640544732853833, + "loss": 1.0556, + "mean_token_accuracy": 0.7068484367366878, "step": 920 }, { - "epoch": 0.6845513413506013, - "grad_norm": 0.8671082854270935, + "epoch": 0.6739526411657559, + "grad_norm": 0.3154759407043457, "learning_rate": 2e-05, - "loss": 0.8049, - "mean_token_accuracy": 0.7582682713568054, + "loss": 1.0551, + "mean_token_accuracy": 0.7053065461651196, "step": 925 }, { - "epoch": 0.6882516188714154, - "grad_norm": 0.8410265445709229, + "epoch": 0.6775956284153005, + "grad_norm": 0.31786778569221497, "learning_rate": 2e-05, - "loss": 0.7935, - "mean_token_accuracy": 0.7615948024373346, + "loss": 1.0319, + "mean_token_accuracy": 0.7121061309233024, "step": 930 }, { - "epoch": 0.6919518963922294, - "grad_norm": 0.8609654307365417, + "epoch": 0.6812386156648452, + "grad_norm": 0.3243469297885895, "learning_rate": 2e-05, - "loss": 0.793, - "mean_token_accuracy": 0.7631270531301052, + "loss": 1.0384, + "mean_token_accuracy": 0.7118771372740597, "step": 935 }, { - "epoch": 0.6956521739130435, - "grad_norm": 0.829707145690918, + "epoch": 0.6848816029143898, + "grad_norm": 0.34407398104667664, "learning_rate": 2e-05, - "loss": 0.7669, - "mean_token_accuracy": 0.7696065254786408, + "loss": 1.0546, + "mean_token_accuracy": 0.7059202491450903, "step": 940 }, { - "epoch": 0.6993524514338575, - "grad_norm": 0.8150595426559448, + "epoch": 0.6885245901639344, + "grad_norm": 0.32175493240356445, "learning_rate": 2e-05, - "loss": 0.7984, - "mean_token_accuracy": 0.7615524716113254, + "loss": 1.0669, + "mean_token_accuracy": 0.7043325598436735, "step": 945 }, { - "epoch": 0.7030527289546716, - "grad_norm": 0.8580748438835144, + "epoch": 0.692167577413479, + "grad_norm": 0.3119860887527466, "learning_rate": 2e-05, - "loss": 0.7689, - "mean_token_accuracy": 0.7683636381982694, + "loss": 1.0457, + "mean_token_accuracy": 0.7077308255984368, "step": 950 }, { - "epoch": 0.7067530064754857, - "grad_norm": 0.774782121181488, + "epoch": 0.6958105646630237, + "grad_norm": 0.29809337854385376, "learning_rate": 2e-05, - "loss": 0.7895, - "mean_token_accuracy": 0.7641451735925326, + "loss": 1.0263, + "mean_token_accuracy": 0.7132633121641427, "step": 955 }, { - "epoch": 0.7104532839962997, - "grad_norm": 0.8466439843177795, + "epoch": 0.6994535519125683, + "grad_norm": 0.3371947109699249, "learning_rate": 2e-05, - "loss": 0.7651, - "mean_token_accuracy": 0.7699041692846329, + "loss": 1.0438, + "mean_token_accuracy": 0.7100593064398786, "step": 960 }, { - "epoch": 0.7141535615171137, - "grad_norm": 0.8498072028160095, + "epoch": 0.7030965391621129, + "grad_norm": 0.3244064748287201, "learning_rate": 2e-05, - "loss": 0.7708, - "mean_token_accuracy": 0.7683624308125098, + "loss": 1.0337, + "mean_token_accuracy": 0.7126587689301417, "step": 965 }, { - "epoch": 0.7178538390379279, - "grad_norm": 0.8351811170578003, + "epoch": 0.7067395264116576, + "grad_norm": 0.3447455167770386, "learning_rate": 2e-05, - "loss": 0.7811, - "mean_token_accuracy": 0.7650397336041128, + "loss": 1.0254, + "mean_token_accuracy": 0.7147410845139229, "step": 970 }, { - "epoch": 0.7215541165587419, - "grad_norm": 0.9042247533798218, + "epoch": 0.7103825136612022, + "grad_norm": 0.28738129138946533, "learning_rate": 2e-05, - "loss": 0.7654, - "mean_token_accuracy": 0.7692840039456211, + "loss": 1.0263, + "mean_token_accuracy": 0.7152876160234489, "step": 975 }, { - "epoch": 0.7252543940795559, - "grad_norm": 0.7749535441398621, + "epoch": 0.7140255009107468, + "grad_norm": 0.32235240936279297, "learning_rate": 2e-05, - "loss": 0.7597, - "mean_token_accuracy": 0.7701962410947761, + "loss": 1.0199, + "mean_token_accuracy": 0.7157211773326819, "step": 980 }, { - "epoch": 0.72895467160037, - "grad_norm": 0.8533946871757507, + "epoch": 0.7176684881602914, + "grad_norm": 0.35440897941589355, "learning_rate": 2e-05, - "loss": 0.7784, - "mean_token_accuracy": 0.7647845122361627, + "loss": 1.0319, + "mean_token_accuracy": 0.7125122129946264, "step": 985 }, { - "epoch": 0.7326549491211841, - "grad_norm": 0.8168575763702393, + "epoch": 0.7213114754098361, + "grad_norm": 0.37784770131111145, "learning_rate": 2e-05, - "loss": 0.8086, - "mean_token_accuracy": 0.757385044925169, + "loss": 1.0336, + "mean_token_accuracy": 0.7125488519785051, "step": 990 }, { - "epoch": 0.7363552266419982, - "grad_norm": 0.7941954135894775, + "epoch": 0.7249544626593807, + "grad_norm": 0.31454479694366455, "learning_rate": 2e-05, - "loss": 0.765, - "mean_token_accuracy": 0.7690882272352915, + "loss": 1.0533, + "mean_token_accuracy": 0.7099444308744505, "step": 995 }, { - "epoch": 0.7400555041628122, - "grad_norm": 0.8234696984291077, + "epoch": 0.7285974499089253, + "grad_norm": 0.3095937967300415, "learning_rate": 2e-05, - "loss": 0.7771, - "mean_token_accuracy": 0.7658752902385906, + "loss": 1.0418, + "mean_token_accuracy": 0.7107138495359062, "step": 1000 }, { - "epoch": 0.7400555041628122, - "eval_loss": 0.8039809465408325, - "eval_mean_token_accuracy": 0.7569865406645019, - "eval_runtime": 28.2336, - "eval_samples_per_second": 9.103, - "eval_steps_per_second": 1.169, + "epoch": 0.7285974499089253, + "eval_loss": 1.052242398262024, + "eval_mean_token_accuracy": 0.7063926113598887, + "eval_runtime": 14.4403, + "eval_samples_per_second": 18.144, + "eval_steps_per_second": 1.177, "step": 1000 }, { - "epoch": 0.7437557816836263, - "grad_norm": 0.8295962810516357, + "epoch": 0.73224043715847, + "grad_norm": 0.3146299421787262, "learning_rate": 2e-05, - "loss": 0.8089, - "mean_token_accuracy": 0.7569489958101997, + "loss": 1.0171, + "mean_token_accuracy": 0.7154250122129946, "step": 1005 }, { - "epoch": 0.7474560592044404, - "grad_norm": 0.7815067768096924, + "epoch": 0.7358834244080146, + "grad_norm": 0.3378625214099884, "learning_rate": 2e-05, - "loss": 0.7758, - "mean_token_accuracy": 0.7671153767210239, + "loss": 1.0309, + "mean_token_accuracy": 0.7123320713238885, "step": 1010 }, { - "epoch": 0.7511563367252544, - "grad_norm": 0.8126241564750671, + "epoch": 0.7395264116575592, + "grad_norm": 0.346305787563324, "learning_rate": 2e-05, - "loss": 0.7379, - "mean_token_accuracy": 0.7785501124249352, + "loss": 1.0303, + "mean_token_accuracy": 0.71228627259404, "step": 1015 }, { - "epoch": 0.7548566142460684, - "grad_norm": 0.8162810802459717, + "epoch": 0.7431693989071039, + "grad_norm": 0.3319399654865265, "learning_rate": 2e-05, - "loss": 0.8102, - "mean_token_accuracy": 0.7573220349482289, + "loss": 1.0508, + "mean_token_accuracy": 0.7065431118710308, "step": 1020 }, { - "epoch": 0.7585568917668826, - "grad_norm": 0.7573408484458923, + "epoch": 0.7468123861566485, + "grad_norm": 0.31447839736938477, "learning_rate": 2e-05, - "loss": 0.7662, - "mean_token_accuracy": 0.7698151470774073, + "loss": 1.0135, + "mean_token_accuracy": 0.7169516365412798, "step": 1025 }, { - "epoch": 0.7622571692876966, - "grad_norm": 0.7962117791175842, + "epoch": 0.7504553734061931, + "grad_norm": 0.3323952853679657, "learning_rate": 2e-05, - "loss": 0.7612, - "mean_token_accuracy": 0.7716137946938584, + "loss": 1.0683, + "mean_token_accuracy": 0.7038593063019053, "step": 1030 }, { - "epoch": 0.7659574468085106, - "grad_norm": 0.7962700724601746, + "epoch": 0.7540983606557377, + "grad_norm": 0.33427804708480835, "learning_rate": 2e-05, - "loss": 0.7647, - "mean_token_accuracy": 0.7699719865120008, + "loss": 1.0319, + "mean_token_accuracy": 0.7127320468978994, "step": 1035 }, { - "epoch": 0.7696577243293247, - "grad_norm": 0.8017210960388184, + "epoch": 0.7577413479052824, + "grad_norm": 0.33110910654067993, "learning_rate": 2e-05, - "loss": 0.7729, - "mean_token_accuracy": 0.7656573176756548, + "loss": 1.05, + "mean_token_accuracy": 0.7086345872007815, "step": 1040 }, { - "epoch": 0.7733580018501388, - "grad_norm": 0.8346449136734009, + "epoch": 0.761384335154827, + "grad_norm": 0.31014275550842285, "learning_rate": 2e-05, - "loss": 0.7636, - "mean_token_accuracy": 0.7693838985258703, + "loss": 1.0545, + "mean_token_accuracy": 0.7088544211040547, "step": 1045 }, { - "epoch": 0.7770582793709528, - "grad_norm": 0.8304547667503357, + "epoch": 0.7650273224043715, + "grad_norm": 0.34434184432029724, "learning_rate": 2e-05, - "loss": 0.7722, - "mean_token_accuracy": 0.7666433599313455, + "loss": 1.0181, + "mean_token_accuracy": 0.7153089887640449, "step": 1050 }, { - "epoch": 0.7807585568917669, - "grad_norm": 0.8001357913017273, + "epoch": 0.7686703096539163, + "grad_norm": 0.31507736444473267, "learning_rate": 2e-05, - "loss": 0.7786, - "mean_token_accuracy": 0.7662119403935679, + "loss": 1.0436, + "mean_token_accuracy": 0.7094620175867123, "step": 1055 }, { - "epoch": 0.7844588344125809, - "grad_norm": 0.8312594294548035, + "epoch": 0.7723132969034608, + "grad_norm": 0.3478749990463257, "learning_rate": 2e-05, - "loss": 0.7863, - "mean_token_accuracy": 0.7634551853341814, + "loss": 1.039, + "mean_token_accuracy": 0.7100054958475819, "step": 1060 }, { - "epoch": 0.788159111933395, - "grad_norm": 0.8669179677963257, + "epoch": 0.7759562841530054, + "grad_norm": 0.3535066246986389, "learning_rate": 2e-05, - "loss": 0.7696, - "mean_token_accuracy": 0.768203040960474, + "loss": 1.0351, + "mean_token_accuracy": 0.7126282364435758, "step": 1065 }, { - "epoch": 0.7918593894542091, - "grad_norm": 0.9093945026397705, + "epoch": 0.7795992714025501, + "grad_norm": 0.30896762013435364, "learning_rate": 2e-05, - "loss": 0.7506, - "mean_token_accuracy": 0.7740881651757804, + "loss": 1.056, + "mean_token_accuracy": 0.70635686370298, "step": 1070 }, { - "epoch": 0.7955596669750231, - "grad_norm": 0.9110556244850159, + "epoch": 0.7832422586520947, + "grad_norm": 0.3770598769187927, "learning_rate": 2e-05, - "loss": 0.7723, - "mean_token_accuracy": 0.768156867902089, + "loss": 1.034, + "mean_token_accuracy": 0.7107138495359061, "step": 1075 }, { - "epoch": 0.7992599444958371, - "grad_norm": 0.8719515204429626, + "epoch": 0.7868852459016393, + "grad_norm": 0.3074532449245453, "learning_rate": 2e-05, - "loss": 0.7487, - "mean_token_accuracy": 0.7738584815308094, + "loss": 1.0325, + "mean_token_accuracy": 0.7111871030776745, "step": 1080 }, { - "epoch": 0.8029602220166513, - "grad_norm": 0.855759859085083, + "epoch": 0.7905282331511839, + "grad_norm": 0.2974810004234314, "learning_rate": 2e-05, - "loss": 0.8085, - "mean_token_accuracy": 0.7569202538917256, + "loss": 1.041, + "mean_token_accuracy": 0.7103230337078652, "step": 1085 }, { - "epoch": 0.8066604995374653, - "grad_norm": 0.7778683304786682, + "epoch": 0.7941712204007286, + "grad_norm": 0.3332538306713104, "learning_rate": 2e-05, - "loss": 0.8039, - "mean_token_accuracy": 0.7584921428765562, + "loss": 1.0351, + "mean_token_accuracy": 0.7102680752320469, "step": 1090 }, { - "epoch": 0.8103607770582794, - "grad_norm": 0.7796655893325806, + "epoch": 0.7978142076502732, + "grad_norm": 0.3428950011730194, "learning_rate": 2e-05, - "loss": 0.7847, - "mean_token_accuracy": 0.7638983818899377, + "loss": 1.0277, + "mean_token_accuracy": 0.7125457987298486, "step": 1095 }, { - "epoch": 0.8140610545790934, - "grad_norm": 0.8250465989112854, + "epoch": 0.8014571948998178, + "grad_norm": 0.3009244203567505, "learning_rate": 2e-05, - "loss": 0.7639, - "mean_token_accuracy": 0.7687881349384388, + "loss": 1.0228, + "mean_token_accuracy": 0.7134342940889103, "step": 1100 }, { - "epoch": 0.8140610545790934, - "eval_loss": 0.800523042678833, - "eval_mean_token_accuracy": 0.7575268288806666, - "eval_runtime": 28.2503, - "eval_samples_per_second": 9.097, - "eval_steps_per_second": 1.168, + "epoch": 0.8014571948998178, + "eval_loss": 1.0443105697631836, + "eval_mean_token_accuracy": 0.7080775799721868, + "eval_runtime": 14.3394, + "eval_samples_per_second": 18.271, + "eval_steps_per_second": 1.186, "step": 1100 }, { - "epoch": 0.8177613320999075, - "grad_norm": 0.8301143646240234, + "epoch": 0.8051001821493625, + "grad_norm": 0.3108626902103424, "learning_rate": 2e-05, - "loss": 0.7764, - "mean_token_accuracy": 0.7657188242099898, + "loss": 1.0141, + "mean_token_accuracy": 0.717134831460674, "step": 1105 }, { - "epoch": 0.8214616096207216, - "grad_norm": 0.8132140040397644, + "epoch": 0.8087431693989071, + "grad_norm": 0.31533291935920715, "learning_rate": 2e-05, - "loss": 0.7783, - "mean_token_accuracy": 0.7655952486721203, + "loss": 1.0231, + "mean_token_accuracy": 0.7143407530199676, "step": 1110 }, { - "epoch": 0.8251618871415356, - "grad_norm": 0.8092007637023926, + "epoch": 0.8123861566484517, + "grad_norm": 0.33208322525024414, "learning_rate": 2e-05, - "loss": 0.7873, - "mean_token_accuracy": 0.7621894900380742, + "loss": 1.0412, + "mean_token_accuracy": 0.7096116267708843, "step": 1115 }, { - "epoch": 0.8288621646623496, - "grad_norm": 0.8420717120170593, + "epoch": 0.8160291438979964, + "grad_norm": 0.33526450395584106, "learning_rate": 2e-05, - "loss": 0.7788, - "mean_token_accuracy": 0.7661002615281637, + "loss": 1.0223, + "mean_token_accuracy": 0.7140693698094772, "step": 1120 }, { - "epoch": 0.8325624421831638, - "grad_norm": 0.819154679775238, + "epoch": 0.819672131147541, + "grad_norm": 0.33586207032203674, "learning_rate": 2e-05, - "loss": 0.7834, - "mean_token_accuracy": 0.7640260223222127, + "loss": 1.0417, + "mean_token_accuracy": 0.7102894479726429, "step": 1125 }, { - "epoch": 0.8362627197039778, - "grad_norm": 0.7833020091056824, + "epoch": 0.8233151183970856, + "grad_norm": 0.3324921429157257, "learning_rate": 2e-05, - "loss": 0.7673, - "mean_token_accuracy": 0.7671381666306092, + "loss": 1.0112, + "mean_token_accuracy": 0.7177057889594529, "step": 1130 }, { - "epoch": 0.8399629972247918, - "grad_norm": 0.8282708525657654, + "epoch": 0.8269581056466302, + "grad_norm": 0.3279407322406769, "learning_rate": 2e-05, - "loss": 0.7663, - "mean_token_accuracy": 0.76869465856567, + "loss": 1.0215, + "mean_token_accuracy": 0.7144968246213971, "step": 1135 }, { - "epoch": 0.8436632747456059, - "grad_norm": 0.8674660325050354, + "epoch": 0.8306010928961749, + "grad_norm": 0.3108167350292206, "learning_rate": 2e-05, - "loss": 0.8198, - "mean_token_accuracy": 0.7535738698791236, + "loss": 1.0458, + "mean_token_accuracy": 0.7076056424035174, "step": 1140 }, { - "epoch": 0.84736355226642, - "grad_norm": 0.8629141449928284, + "epoch": 0.8342440801457195, + "grad_norm": 0.3357519507408142, "learning_rate": 2e-05, - "loss": 0.8062, - "mean_token_accuracy": 0.7576205181138813, + "loss": 1.0416, + "mean_token_accuracy": 0.7098100879335613, "step": 1145 }, { - "epoch": 0.851063829787234, - "grad_norm": 0.8082600235939026, + "epoch": 0.8378870673952641, + "grad_norm": 0.3577389717102051, "learning_rate": 2e-05, - "loss": 0.7517, - "mean_token_accuracy": 0.7717635510287453, + "loss": 1.0267, + "mean_token_accuracy": 0.7120114802149485, "step": 1150 }, { - "epoch": 0.8547641073080481, - "grad_norm": 0.7617512941360474, + "epoch": 0.8415300546448088, + "grad_norm": 0.3234967887401581, "learning_rate": 2e-05, - "loss": 0.7643, - "mean_token_accuracy": 0.7704166399970936, + "loss": 1.0654, + "mean_token_accuracy": 0.7035234489496826, "step": 1155 }, { - "epoch": 0.8584643848288621, - "grad_norm": 0.7678114771842957, + "epoch": 0.8451730418943534, + "grad_norm": 0.328988254070282, "learning_rate": 2e-05, - "loss": 0.7729, - "mean_token_accuracy": 0.7668813717759277, + "loss": 1.0385, + "mean_token_accuracy": 0.7108115534929164, "step": 1160 }, { - "epoch": 0.8621646623496763, - "grad_norm": 0.7924060821533203, + "epoch": 0.848816029143898, + "grad_norm": 0.3636587858200073, "learning_rate": 2e-05, - "loss": 0.7521, - "mean_token_accuracy": 0.7735818962967718, + "loss": 1.0515, + "mean_token_accuracy": 0.7053462383976552, "step": 1165 }, { - "epoch": 0.8658649398704903, - "grad_norm": 0.881769061088562, + "epoch": 0.8524590163934426, + "grad_norm": 0.3206827938556671, "learning_rate": 2e-05, - "loss": 0.7859, - "mean_token_accuracy": 0.7643183082912849, + "loss": 1.0167, + "mean_token_accuracy": 0.7159135319980459, "step": 1170 }, { - "epoch": 0.8695652173913043, - "grad_norm": 0.7947380542755127, + "epoch": 0.8561020036429873, + "grad_norm": 0.3391059637069702, "learning_rate": 2e-05, - "loss": 0.7305, - "mean_token_accuracy": 0.7786316210966694, + "loss": 1.0397, + "mean_token_accuracy": 0.7123595505617978, "step": 1175 }, { - "epoch": 0.8732654949121184, - "grad_norm": 0.8072516918182373, + "epoch": 0.8597449908925319, + "grad_norm": 0.3314334750175476, "learning_rate": 2e-05, - "loss": 0.7676, - "mean_token_accuracy": 0.7692856950594738, + "loss": 1.0297, + "mean_token_accuracy": 0.714533463605276, "step": 1180 }, { - "epoch": 0.8769657724329325, - "grad_norm": 0.8792457580566406, + "epoch": 0.8633879781420765, + "grad_norm": 0.3360309600830078, "learning_rate": 2e-05, - "loss": 0.7984, - "mean_token_accuracy": 0.7623190074905516, + "loss": 1.0692, + "mean_token_accuracy": 0.7027845627747925, "step": 1185 }, { - "epoch": 0.8806660499537465, - "grad_norm": 0.8246925473213196, + "epoch": 0.8670309653916212, + "grad_norm": 0.3183639645576477, "learning_rate": 2e-05, - "loss": 0.7604, - "mean_token_accuracy": 0.7710242769603142, + "loss": 1.0175, + "mean_token_accuracy": 0.7152265510503177, "step": 1190 }, { - "epoch": 0.8843663274745606, - "grad_norm": 0.8517448902130127, + "epoch": 0.8706739526411658, + "grad_norm": 0.3319450914859772, "learning_rate": 2e-05, - "loss": 0.8127, - "mean_token_accuracy": 0.7556866127163658, + "loss": 1.0542, + "mean_token_accuracy": 0.7045035417684418, "step": 1195 }, { - "epoch": 0.8880666049953746, - "grad_norm": 0.8204156160354614, + "epoch": 0.8743169398907104, + "grad_norm": 0.32276779413223267, "learning_rate": 2e-05, - "loss": 0.7559, - "mean_token_accuracy": 0.7712058797842442, + "loss": 1.0273, + "mean_token_accuracy": 0.7115290669272106, "step": 1200 }, { - "epoch": 0.8880666049953746, - "eval_loss": 0.797502875328064, - "eval_mean_token_accuracy": 0.7579622750028506, - "eval_runtime": 28.4023, - "eval_samples_per_second": 9.049, - "eval_steps_per_second": 1.162, + "epoch": 0.8743169398907104, + "eval_loss": 1.0371540784835815, + "eval_mean_token_accuracy": 0.7092831164176581, + "eval_runtime": 14.3641, + "eval_samples_per_second": 18.24, + "eval_steps_per_second": 1.184, "step": 1200 }, { - "epoch": 0.8917668825161887, - "grad_norm": 0.8131943345069885, + "epoch": 0.8779599271402551, + "grad_norm": 0.3125724494457245, "learning_rate": 2e-05, - "loss": 0.7571, - "mean_token_accuracy": 0.7733840665520189, + "loss": 1.0164, + "mean_token_accuracy": 0.7162616023448949, "step": 1205 }, { - "epoch": 0.8954671600370028, - "grad_norm": 0.73580002784729, + "epoch": 0.8816029143897997, + "grad_norm": 0.33562660217285156, "learning_rate": 2e-05, - "loss": 0.7727, - "mean_token_accuracy": 0.7674258088788803, + "loss": 1.023, + "mean_token_accuracy": 0.714150905550501, "step": 1210 }, { - "epoch": 0.8991674375578168, - "grad_norm": 0.7590300440788269, + "epoch": 0.8852459016393442, + "grad_norm": 0.31345677375793457, "learning_rate": 2e-05, - "loss": 0.7637, - "mean_token_accuracy": 0.7691722572737629, + "loss": 1.0235, + "mean_token_accuracy": 0.7146097948216903, "step": 1215 }, { - "epoch": 0.902867715078631, - "grad_norm": 0.7864495515823364, + "epoch": 0.8888888888888888, + "grad_norm": 0.3195979595184326, "learning_rate": 2e-05, - "loss": 0.794, - "mean_token_accuracy": 0.7610983305690475, + "loss": 1.0359, + "mean_token_accuracy": 0.711648143624817, "step": 1220 }, { - "epoch": 0.906567992599445, - "grad_norm": 0.7783846259117126, + "epoch": 0.8925318761384335, + "grad_norm": 0.3194137513637543, "learning_rate": 2e-05, - "loss": 0.7698, - "mean_token_accuracy": 0.7677923415686079, + "loss": 1.0178, + "mean_token_accuracy": 0.7147838299951148, "step": 1225 }, { - "epoch": 0.910268270120259, - "grad_norm": 0.8439836502075195, + "epoch": 0.8961748633879781, + "grad_norm": 0.3297196328639984, "learning_rate": 2e-05, - "loss": 0.8021, - "mean_token_accuracy": 0.7596970516172707, + "loss": 1.0396, + "mean_token_accuracy": 0.7098375671714704, "step": 1230 }, { - "epoch": 0.913968547641073, - "grad_norm": 0.875133752822876, + "epoch": 0.8998178506375227, + "grad_norm": 0.32576802372932434, "learning_rate": 2e-05, - "loss": 0.7796, - "mean_token_accuracy": 0.7652020678902339, + "loss": 1.0309, + "mean_token_accuracy": 0.7101613812069504, "step": 1235 }, { - "epoch": 0.9176688251618872, - "grad_norm": 0.7731521725654602, + "epoch": 0.9034608378870674, + "grad_norm": 0.3285605311393738, "learning_rate": 2e-05, - "loss": 0.7595, - "mean_token_accuracy": 0.7703574952610036, + "loss": 1.0273, + "mean_token_accuracy": 0.7137640449438203, "step": 1240 }, { - "epoch": 0.9213691026827012, - "grad_norm": 0.7407891750335693, + "epoch": 0.907103825136612, + "grad_norm": 0.32822245359420776, "learning_rate": 2e-05, - "loss": 0.7665, - "mean_token_accuracy": 0.7682582461530055, + "loss": 1.0257, + "mean_token_accuracy": 0.7128450170981925, "step": 1245 }, { - "epoch": 0.9250693802035153, - "grad_norm": 0.790274977684021, + "epoch": 0.9107468123861566, + "grad_norm": 0.32141098380088806, "learning_rate": 2e-05, - "loss": 0.7489, - "mean_token_accuracy": 0.7736910322129121, + "loss": 1.0354, + "mean_token_accuracy": 0.711327552515877, "step": 1250 }, { - "epoch": 0.9287696577243293, - "grad_norm": 0.81461501121521, + "epoch": 0.9143897996357013, + "grad_norm": 0.3102847635746002, "learning_rate": 2e-05, - "loss": 0.7741, - "mean_token_accuracy": 0.7667225211713106, + "loss": 1.0393, + "mean_token_accuracy": 0.7078682217879825, "step": 1255 }, { - "epoch": 0.9324699352451434, - "grad_norm": 0.7998383045196533, + "epoch": 0.9180327868852459, + "grad_norm": 0.32655781507492065, "learning_rate": 2e-05, - "loss": 0.7807, - "mean_token_accuracy": 0.7625351777097331, + "loss": 1.0343, + "mean_token_accuracy": 0.7091322667318025, "step": 1260 }, { - "epoch": 0.9361702127659575, - "grad_norm": 0.8241930603981018, + "epoch": 0.9216757741347905, + "grad_norm": 0.33519840240478516, "learning_rate": 2e-05, - "loss": 0.8072, - "mean_token_accuracy": 0.7569770056683507, + "loss": 1.0103, + "mean_token_accuracy": 0.7167684416218856, "step": 1265 }, { - "epoch": 0.9398704902867715, - "grad_norm": 0.7618784308433533, + "epoch": 0.9253187613843351, + "grad_norm": 0.3166919946670532, "learning_rate": 2e-05, - "loss": 0.7772, - "mean_token_accuracy": 0.7643117414360313, + "loss": 1.0049, + "mean_token_accuracy": 0.7192385197850513, "step": 1270 }, { - "epoch": 0.9435707678075855, - "grad_norm": 0.7704834938049316, + "epoch": 0.9289617486338798, + "grad_norm": 0.31707146763801575, "learning_rate": 2e-05, - "loss": 0.7789, - "mean_token_accuracy": 0.7653304006081735, + "loss": 1.0088, + "mean_token_accuracy": 0.7189606741573035, "step": 1275 }, { - "epoch": 0.9472710453283997, - "grad_norm": 0.7337019443511963, + "epoch": 0.9326047358834244, + "grad_norm": 0.32707205414772034, "learning_rate": 2e-05, - "loss": 0.772, - "mean_token_accuracy": 0.7669845879245384, + "loss": 1.016, + "mean_token_accuracy": 0.7171409379579873, "step": 1280 }, { - "epoch": 0.9509713228492137, - "grad_norm": 0.8250733017921448, + "epoch": 0.936247723132969, + "grad_norm": 0.3101848065853119, "learning_rate": 2e-05, - "loss": 0.7785, - "mean_token_accuracy": 0.7657440636307564, + "loss": 1.0258, + "mean_token_accuracy": 0.711348925256473, "step": 1285 }, { - "epoch": 0.9546716003700277, - "grad_norm": 0.839878499507904, + "epoch": 0.9398907103825137, + "grad_norm": 0.36114785075187683, "learning_rate": 2e-05, - "loss": 0.8066, - "mean_token_accuracy": 0.7591835950929369, + "loss": 1.0088, + "mean_token_accuracy": 0.7187561064973134, "step": 1290 }, { - "epoch": 0.9583718778908418, - "grad_norm": 0.8156315088272095, + "epoch": 0.9435336976320583, + "grad_norm": 0.30461356043815613, "learning_rate": 2e-05, - "loss": 0.7704, - "mean_token_accuracy": 0.7668827112901107, + "loss": 1.0016, + "mean_token_accuracy": 0.718658402540303, "step": 1295 }, { - "epoch": 0.9620721554116559, - "grad_norm": 0.8538619875907898, + "epoch": 0.9471766848816029, + "grad_norm": 0.32494619488716125, "learning_rate": 2e-05, - "loss": 0.7589, - "mean_token_accuracy": 0.7709486799753794, + "loss": 1.0156, + "mean_token_accuracy": 0.7143166829506594, "step": 1300 }, { - "epoch": 0.9620721554116559, - "eval_loss": 0.7957349419593811, - "eval_mean_token_accuracy": 0.7585383150980717, - "eval_runtime": 28.3961, - "eval_samples_per_second": 9.051, - "eval_steps_per_second": 1.162, + "epoch": 0.9471766848816029, + "eval_loss": 1.0324426889419556, + "eval_mean_token_accuracy": 0.7099415543626529, + "eval_runtime": 14.3232, + "eval_samples_per_second": 18.292, + "eval_steps_per_second": 1.187, "step": 1300 }, { - "epoch": 0.96577243293247, - "grad_norm": 0.8585022687911987, + "epoch": 0.9508196721311475, + "grad_norm": 0.3495274484157562, "learning_rate": 2e-05, - "loss": 0.7955, - "mean_token_accuracy": 0.7603014430216614, + "loss": 1.0224, + "mean_token_accuracy": 0.71535784074255, "step": 1305 }, { - "epoch": 0.969472710453284, - "grad_norm": 0.8347756862640381, + "epoch": 0.9544626593806922, + "grad_norm": 0.34313705563545227, "learning_rate": 2e-05, - "loss": 0.8013, - "mean_token_accuracy": 0.7593268261742192, + "loss": 1.0457, + "mean_token_accuracy": 0.7087200781631655, "step": 1310 }, { - "epoch": 0.973172987974098, - "grad_norm": 0.7872015833854675, + "epoch": 0.9581056466302368, + "grad_norm": 0.3228403925895691, "learning_rate": 2e-05, - "loss": 0.7683, - "mean_token_accuracy": 0.7696177228132353, + "loss": 1.0365, + "mean_token_accuracy": 0.7079201270151441, "step": 1315 }, { - "epoch": 0.9768732654949122, - "grad_norm": 0.8233494162559509, + "epoch": 0.9617486338797814, + "grad_norm": 0.30886438488960266, "learning_rate": 2e-05, - "loss": 0.7706, - "mean_token_accuracy": 0.7678653349992682, + "loss": 0.9937, + "mean_token_accuracy": 0.7199041279921836, "step": 1320 }, { - "epoch": 0.9805735430157262, - "grad_norm": 0.8023855090141296, + "epoch": 0.9653916211293261, + "grad_norm": 0.3261902630329132, "learning_rate": 2e-05, - "loss": 0.7783, - "mean_token_accuracy": 0.7656580825758337, + "loss": 1.0278, + "mean_token_accuracy": 0.7134495603321935, "step": 1325 }, { - "epoch": 0.9842738205365402, - "grad_norm": 0.8429985642433167, + "epoch": 0.9690346083788707, + "grad_norm": 0.303875207901001, "learning_rate": 2e-05, - "loss": 0.7669, - "mean_token_accuracy": 0.7683449900233187, + "loss": 0.9981, + "mean_token_accuracy": 0.7204811919882755, "step": 1330 }, { - "epoch": 0.9879740980573543, - "grad_norm": 0.792027473449707, + "epoch": 0.9726775956284153, + "grad_norm": 0.33992454409599304, "learning_rate": 2e-05, - "loss": 0.767, - "mean_token_accuracy": 0.7692297252303795, + "loss": 1.028, + "mean_token_accuracy": 0.7103535661944307, "step": 1335 }, { - "epoch": 0.9916743755781684, - "grad_norm": 0.8096156120300293, + "epoch": 0.97632058287796, + "grad_norm": 0.35153236985206604, "learning_rate": 2e-05, - "loss": 0.7841, - "mean_token_accuracy": 0.7638673896787455, + "loss": 1.006, + "mean_token_accuracy": 0.7178431851489984, "step": 1340 }, { - "epoch": 0.9953746530989824, - "grad_norm": 0.9265767931938171, + "epoch": 0.9799635701275046, + "grad_norm": 0.33356398344039917, "learning_rate": 2e-05, - "loss": 0.771, - "mean_token_accuracy": 0.7675258668736931, + "loss": 1.0026, + "mean_token_accuracy": 0.720670493404983, "step": 1345 }, { - "epoch": 0.9990749306197965, - "grad_norm": 0.8164911866188049, + "epoch": 0.9836065573770492, + "grad_norm": 0.32960230112075806, "learning_rate": 2e-05, - "loss": 0.7838, - "mean_token_accuracy": 0.7645547380257296, + "loss": 1.0319, + "mean_token_accuracy": 0.7102375427454812, "step": 1350 }, { - "epoch": 0.9998149861239592, - "mean_token_accuracy": 0.7587965060316426, - "step": 1351, - "total_flos": 76959556042752.0, - "train_loss": 0.8068137678457665, - "train_runtime": 18078.9101, - "train_samples_per_second": 2.392, - "train_steps_per_second": 0.075 + "epoch": 0.9872495446265938, + "grad_norm": 0.3457731306552887, + "learning_rate": 2e-05, + "loss": 1.0324, + "mean_token_accuracy": 0.7103286771488085, + "step": 1355 + }, + { + "epoch": 0.9908925318761385, + "grad_norm": 0.34496790170669556, + "learning_rate": 2e-05, + "loss": 1.0129, + "mean_token_accuracy": 0.7149914509037616, + "step": 1360 + }, + { + "epoch": 0.994535519125683, + "grad_norm": 0.3273138105869293, + "learning_rate": 2e-05, + "loss": 1.0067, + "mean_token_accuracy": 0.7190278940364658, + "step": 1365 + }, + { + "epoch": 0.9981785063752276, + "grad_norm": 0.35900992155075073, + "learning_rate": 2e-05, + "loss": 1.0475, + "mean_token_accuracy": 0.7082559843673668, + "step": 1370 + }, + { + "epoch": 1.0021857923497268, + "grad_norm": 0.364685982465744, + "learning_rate": 2e-05, + "loss": 1.039, + "mean_token_accuracy": 0.720287528787773, + "step": 1375 + }, + { + "epoch": 1.0058287795992713, + "grad_norm": 0.3133888244628906, + "learning_rate": 2e-05, + "loss": 0.9677, + "mean_token_accuracy": 0.7262609916951638, + "step": 1380 + }, + { + "epoch": 1.009471766848816, + "grad_norm": 0.3458701968193054, + "learning_rate": 2e-05, + "loss": 0.9779, + "mean_token_accuracy": 0.7242458475818271, + "step": 1385 + }, + { + "epoch": 1.0131147540983607, + "grad_norm": 0.3019266426563263, + "learning_rate": 2e-05, + "loss": 0.9867, + "mean_token_accuracy": 0.7212231314118223, + "step": 1390 + }, + { + "epoch": 1.0167577413479052, + "grad_norm": 0.3397253751754761, + "learning_rate": 2e-05, + "loss": 0.9864, + "mean_token_accuracy": 0.7227711284807034, + "step": 1395 + }, + { + "epoch": 1.02040072859745, + "grad_norm": 0.3236973285675049, + "learning_rate": 2e-05, + "loss": 0.9789, + "mean_token_accuracy": 0.7222307034684905, + "step": 1400 + }, + { + "epoch": 1.02040072859745, + "eval_loss": 1.0278208255767822, + "eval_mean_token_accuracy": 0.7113203024833191, + "eval_runtime": 14.2654, + "eval_samples_per_second": 18.366, + "eval_steps_per_second": 1.192, + "step": 1400 + }, + { + "epoch": 1.0240437158469946, + "grad_norm": 0.3549124300479889, + "learning_rate": 2e-05, + "loss": 0.9882, + "mean_token_accuracy": 0.7225512945774304, + "step": 1405 + }, + { + "epoch": 1.027686703096539, + "grad_norm": 0.3246813416481018, + "learning_rate": 2e-05, + "loss": 1.0112, + "mean_token_accuracy": 0.7154952369320957, + "step": 1410 + }, + { + "epoch": 1.0313296903460838, + "grad_norm": 0.347151517868042, + "learning_rate": 2e-05, + "loss": 0.9723, + "mean_token_accuracy": 0.7266487542745483, + "step": 1415 + }, + { + "epoch": 1.0349726775956285, + "grad_norm": 0.35966718196868896, + "learning_rate": 2e-05, + "loss": 0.9863, + "mean_token_accuracy": 0.7234916951636542, + "step": 1420 + }, + { + "epoch": 1.038615664845173, + "grad_norm": 0.3249855935573578, + "learning_rate": 2e-05, + "loss": 0.9768, + "mean_token_accuracy": 0.7243191255495847, + "step": 1425 + }, + { + "epoch": 1.0422586520947177, + "grad_norm": 0.3137085437774658, + "learning_rate": 2e-05, + "loss": 0.9823, + "mean_token_accuracy": 0.721757449926722, + "step": 1430 + }, + { + "epoch": 1.0459016393442624, + "grad_norm": 0.3215838670730591, + "learning_rate": 2e-05, + "loss": 0.9939, + "mean_token_accuracy": 0.7202796775769419, + "step": 1435 + }, + { + "epoch": 1.0495446265938069, + "grad_norm": 0.3278406858444214, + "learning_rate": 2e-05, + "loss": 0.9775, + "mean_token_accuracy": 0.7260136785539814, + "step": 1440 + }, + { + "epoch": 1.0531876138433516, + "grad_norm": 0.32132425904273987, + "learning_rate": 2e-05, + "loss": 0.9815, + "mean_token_accuracy": 0.7215712017586711, + "step": 1445 + }, + { + "epoch": 1.0568306010928963, + "grad_norm": 0.29465213418006897, + "learning_rate": 2e-05, + "loss": 0.9981, + "mean_token_accuracy": 0.7183011724474841, + "step": 1450 + }, + { + "epoch": 1.0604735883424408, + "grad_norm": 0.32398778200149536, + "learning_rate": 2e-05, + "loss": 0.9833, + "mean_token_accuracy": 0.7217727161700049, + "step": 1455 + }, + { + "epoch": 1.0641165755919855, + "grad_norm": 0.3281712532043457, + "learning_rate": 2e-05, + "loss": 0.9946, + "mean_token_accuracy": 0.7185686983605112, + "step": 1460 + }, + { + "epoch": 1.06775956284153, + "grad_norm": 0.31330931186676025, + "learning_rate": 2e-05, + "loss": 0.967, + "mean_token_accuracy": 0.727317415730337, + "step": 1465 + }, + { + "epoch": 1.0714025500910747, + "grad_norm": 0.33476975560188293, + "learning_rate": 2e-05, + "loss": 0.952, + "mean_token_accuracy": 0.7321110161211529, + "step": 1470 + }, + { + "epoch": 1.0750455373406194, + "grad_norm": 0.3155185580253601, + "learning_rate": 2e-05, + "loss": 0.982, + "mean_token_accuracy": 0.7213910600879336, + "step": 1475 + }, + { + "epoch": 1.0786885245901638, + "grad_norm": 0.3037089407444, + "learning_rate": 2e-05, + "loss": 0.9817, + "mean_token_accuracy": 0.722349729518869, + "step": 1480 + }, + { + "epoch": 1.0823315118397085, + "grad_norm": 0.3598880171775818, + "learning_rate": 2e-05, + "loss": 1.0052, + "mean_token_accuracy": 0.7150433561309233, + "step": 1485 + }, + { + "epoch": 1.0859744990892533, + "grad_norm": 0.32113686203956604, + "learning_rate": 2e-05, + "loss": 0.9745, + "mean_token_accuracy": 0.7253543328356823, + "step": 1490 + }, + { + "epoch": 1.0896174863387977, + "grad_norm": 0.3267917335033417, + "learning_rate": 2e-05, + "loss": 0.9914, + "mean_token_accuracy": 0.7191438690766976, + "step": 1495 + }, + { + "epoch": 1.0932604735883424, + "grad_norm": 0.33139562606811523, + "learning_rate": 2e-05, + "loss": 0.9829, + "mean_token_accuracy": 0.7221024670249144, + "step": 1500 + }, + { + "epoch": 1.0932604735883424, + "eval_loss": 1.0242724418640137, + "eval_mean_token_accuracy": 0.7120056856343084, + "eval_runtime": 14.3696, + "eval_samples_per_second": 18.233, + "eval_steps_per_second": 1.183, + "step": 1500 + }, + { + "epoch": 1.0969034608378871, + "grad_norm": 0.3266630172729492, + "learning_rate": 2e-05, + "loss": 0.9739, + "mean_token_accuracy": 0.7254488275525159, + "step": 1505 + }, + { + "epoch": 1.1005464480874316, + "grad_norm": 0.3302668333053589, + "learning_rate": 2e-05, + "loss": 0.9751, + "mean_token_accuracy": 0.7234306301905229, + "step": 1510 + }, + { + "epoch": 1.1041894353369763, + "grad_norm": 0.32395848631858826, + "learning_rate": 2e-05, + "loss": 0.9685, + "mean_token_accuracy": 0.7260014655593551, + "step": 1515 + }, + { + "epoch": 1.107832422586521, + "grad_norm": 0.31805044412612915, + "learning_rate": 2e-05, + "loss": 0.992, + "mean_token_accuracy": 0.7183103321934539, + "step": 1520 + }, + { + "epoch": 1.1114754098360655, + "grad_norm": 0.3312923312187195, + "learning_rate": 2e-05, + "loss": 0.9656, + "mean_token_accuracy": 0.7273204689789936, + "step": 1525 + }, + { + "epoch": 1.1151183970856102, + "grad_norm": 0.31044256687164307, + "learning_rate": 2e-05, + "loss": 0.9764, + "mean_token_accuracy": 0.7244168295065949, + "step": 1530 + }, + { + "epoch": 1.118761384335155, + "grad_norm": 0.343591570854187, + "learning_rate": 2e-05, + "loss": 0.9865, + "mean_token_accuracy": 0.7209636052760137, + "step": 1535 + }, + { + "epoch": 1.1224043715846994, + "grad_norm": 0.3052913248538971, + "learning_rate": 2e-05, + "loss": 0.9773, + "mean_token_accuracy": 0.7230153883732291, + "step": 1540 + }, + { + "epoch": 1.1260473588342441, + "grad_norm": 0.32441943883895874, + "learning_rate": 2e-05, + "loss": 1.0037, + "mean_token_accuracy": 0.7171042989741085, + "step": 1545 + }, + { + "epoch": 1.1296903460837888, + "grad_norm": 0.33394140005111694, + "learning_rate": 2e-05, + "loss": 0.9877, + "mean_token_accuracy": 0.720826209086468, + "step": 1550 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 0.34548622369766235, + "learning_rate": 2e-05, + "loss": 0.989, + "mean_token_accuracy": 0.7204445530043968, + "step": 1555 + }, + { + "epoch": 1.136976320582878, + "grad_norm": 0.3255009949207306, + "learning_rate": 2e-05, + "loss": 0.9615, + "mean_token_accuracy": 0.7266609672691743, + "step": 1560 + }, + { + "epoch": 1.1406193078324225, + "grad_norm": 0.3292433023452759, + "learning_rate": 2e-05, + "loss": 1.0, + "mean_token_accuracy": 0.7187103077674646, + "step": 1565 + }, + { + "epoch": 1.1442622950819672, + "grad_norm": 0.3490438759326935, + "learning_rate": 2e-05, + "loss": 0.9932, + "mean_token_accuracy": 0.7185576453346362, + "step": 1570 + }, + { + "epoch": 1.147905282331512, + "grad_norm": 0.32841020822525024, + "learning_rate": 2e-05, + "loss": 0.9784, + "mean_token_accuracy": 0.7231283585735222, + "step": 1575 + }, + { + "epoch": 1.1515482695810564, + "grad_norm": 0.3233148753643036, + "learning_rate": 2e-05, + "loss": 0.9649, + "mean_token_accuracy": 0.7264472398632144, + "step": 1580 + }, + { + "epoch": 1.155191256830601, + "grad_norm": 0.29847216606140137, + "learning_rate": 2e-05, + "loss": 0.9563, + "mean_token_accuracy": 0.7308133854421105, + "step": 1585 + }, + { + "epoch": 1.1588342440801458, + "grad_norm": 0.31668856739997864, + "learning_rate": 2e-05, + "loss": 0.9666, + "mean_token_accuracy": 0.726267098192477, + "step": 1590 + }, + { + "epoch": 1.1624772313296903, + "grad_norm": 0.32061877846717834, + "learning_rate": 2e-05, + "loss": 0.9824, + "mean_token_accuracy": 0.7208720078163166, + "step": 1595 + }, + { + "epoch": 1.166120218579235, + "grad_norm": 0.3160786032676697, + "learning_rate": 2e-05, + "loss": 1.0117, + "mean_token_accuracy": 0.7149151196873472, + "step": 1600 + }, + { + "epoch": 1.166120218579235, + "eval_loss": 1.0199487209320068, + "eval_mean_token_accuracy": 0.7121113575576578, + "eval_runtime": 14.3112, + "eval_samples_per_second": 18.307, + "eval_steps_per_second": 1.188, + "step": 1600 + }, + { + "epoch": 1.1697632058287797, + "grad_norm": 0.32467761635780334, + "learning_rate": 2e-05, + "loss": 0.9945, + "mean_token_accuracy": 0.7184507816316562, + "step": 1605 + }, + { + "epoch": 1.1734061930783242, + "grad_norm": 0.31809383630752563, + "learning_rate": 2e-05, + "loss": 0.9654, + "mean_token_accuracy": 0.726557156814851, + "step": 1610 + }, + { + "epoch": 1.1770491803278689, + "grad_norm": 0.33797189593315125, + "learning_rate": 2e-05, + "loss": 0.9995, + "mean_token_accuracy": 0.7159257449926724, + "step": 1615 + }, + { + "epoch": 1.1806921675774136, + "grad_norm": 0.315861314535141, + "learning_rate": 2e-05, + "loss": 0.9621, + "mean_token_accuracy": 0.7287493893502688, + "step": 1620 + }, + { + "epoch": 1.184335154826958, + "grad_norm": 0.31979405879974365, + "learning_rate": 2e-05, + "loss": 0.9607, + "mean_token_accuracy": 0.7283768930141671, + "step": 1625 + }, + { + "epoch": 1.1879781420765028, + "grad_norm": 0.31935200095176697, + "learning_rate": 2e-05, + "loss": 0.9692, + "mean_token_accuracy": 0.7262060332193452, + "step": 1630 + }, + { + "epoch": 1.1916211293260472, + "grad_norm": 0.3271839916706085, + "learning_rate": 2e-05, + "loss": 0.9718, + "mean_token_accuracy": 0.7253847093307277, + "step": 1635 + }, + { + "epoch": 1.195264116575592, + "grad_norm": 0.30795371532440186, + "learning_rate": 2e-05, + "loss": 0.9508, + "mean_token_accuracy": 0.7300012212994627, + "step": 1640 + }, + { + "epoch": 1.1989071038251367, + "grad_norm": 0.3378467857837677, + "learning_rate": 2e-05, + "loss": 0.9933, + "mean_token_accuracy": 0.7200476306790425, + "step": 1645 + }, + { + "epoch": 1.2025500910746811, + "grad_norm": 0.3224976658821106, + "learning_rate": 2e-05, + "loss": 0.9886, + "mean_token_accuracy": 0.7199761428617866, + "step": 1650 + }, + { + "epoch": 1.2061930783242258, + "grad_norm": 0.30547860264778137, + "learning_rate": 2e-05, + "loss": 0.9856, + "mean_token_accuracy": 0.7202552515876893, + "step": 1655 + }, + { + "epoch": 1.2098360655737705, + "grad_norm": 0.346476674079895, + "learning_rate": 2e-05, + "loss": 0.9644, + "mean_token_accuracy": 0.7255495847581827, + "step": 1660 + }, + { + "epoch": 1.213479052823315, + "grad_norm": 0.3571411967277527, + "learning_rate": 2e-05, + "loss": 0.966, + "mean_token_accuracy": 0.7260930630190523, + "step": 1665 + }, + { + "epoch": 1.2171220400728597, + "grad_norm": 0.3297497034072876, + "learning_rate": 2e-05, + "loss": 0.9826, + "mean_token_accuracy": 0.7229054714215927, + "step": 1670 + }, + { + "epoch": 1.2207650273224044, + "grad_norm": 0.3237413167953491, + "learning_rate": 2e-05, + "loss": 0.9728, + "mean_token_accuracy": 0.7240382266731802, + "step": 1675 + }, + { + "epoch": 1.224408014571949, + "grad_norm": 0.29469963908195496, + "learning_rate": 2e-05, + "loss": 0.9388, + "mean_token_accuracy": 0.732352222765022, + "step": 1680 + }, + { + "epoch": 1.2280510018214936, + "grad_norm": 0.33955973386764526, + "learning_rate": 2e-05, + "loss": 0.9725, + "mean_token_accuracy": 0.722966536394724, + "step": 1685 + }, + { + "epoch": 1.2316939890710383, + "grad_norm": 0.29405203461647034, + "learning_rate": 2e-05, + "loss": 0.9681, + "mean_token_accuracy": 0.7255404250122128, + "step": 1690 + }, + { + "epoch": 1.2353369763205828, + "grad_norm": 0.3296028673648834, + "learning_rate": 2e-05, + "loss": 0.9769, + "mean_token_accuracy": 0.7248809233023937, + "step": 1695 + }, + { + "epoch": 1.2389799635701275, + "grad_norm": 0.3257172703742981, + "learning_rate": 2e-05, + "loss": 0.9819, + "mean_token_accuracy": 0.723152784562775, + "step": 1700 + }, + { + "epoch": 1.2389799635701275, + "eval_loss": 1.0157443284988403, + "eval_mean_token_accuracy": 0.7135474758957674, + "eval_runtime": 14.3005, + "eval_samples_per_second": 18.321, + "eval_steps_per_second": 1.189, + "step": 1700 + }, + { + "epoch": 1.2426229508196722, + "grad_norm": 0.30303245782852173, + "learning_rate": 2e-05, + "loss": 0.9754, + "mean_token_accuracy": 0.7243343917928676, + "step": 1705 + }, + { + "epoch": 1.2462659380692167, + "grad_norm": 0.29583922028541565, + "learning_rate": 2e-05, + "loss": 0.9914, + "mean_token_accuracy": 0.7174801125055479, + "step": 1710 + }, + { + "epoch": 1.2499089253187614, + "grad_norm": 0.3175574541091919, + "learning_rate": 2e-05, + "loss": 0.9695, + "mean_token_accuracy": 0.7243374450415242, + "step": 1715 + }, + { + "epoch": 1.2535519125683061, + "grad_norm": 0.32751041650772095, + "learning_rate": 2e-05, + "loss": 0.9658, + "mean_token_accuracy": 0.7266762335124572, + "step": 1720 + }, + { + "epoch": 1.2571948998178506, + "grad_norm": 0.3620862662792206, + "learning_rate": 2e-05, + "loss": 0.9437, + "mean_token_accuracy": 0.7318392769907182, + "step": 1725 + }, + { + "epoch": 1.2608378870673953, + "grad_norm": 0.31344443559646606, + "learning_rate": 2e-05, + "loss": 0.9519, + "mean_token_accuracy": 0.7299096238397655, + "step": 1730 + }, + { + "epoch": 1.26448087431694, + "grad_norm": 0.3178603947162628, + "learning_rate": 2e-05, + "loss": 0.9678, + "mean_token_accuracy": 0.7239557889594528, + "step": 1735 + }, + { + "epoch": 1.2681238615664845, + "grad_norm": 0.3062346279621124, + "learning_rate": 2e-05, + "loss": 0.9769, + "mean_token_accuracy": 0.7235344406448461, + "step": 1740 + }, + { + "epoch": 1.2717668488160292, + "grad_norm": 0.3273670971393585, + "learning_rate": 2e-05, + "loss": 0.9738, + "mean_token_accuracy": 0.7229634831460674, + "step": 1745 + }, + { + "epoch": 1.275409836065574, + "grad_norm": 0.3440561592578888, + "learning_rate": 2e-05, + "loss": 0.9708, + "mean_token_accuracy": 0.7243130190522715, + "step": 1750 + }, + { + "epoch": 1.2790528233151184, + "grad_norm": 0.3306814730167389, + "learning_rate": 2e-05, + "loss": 0.9763, + "mean_token_accuracy": 0.7213208353688324, + "step": 1755 + }, + { + "epoch": 1.282695810564663, + "grad_norm": 0.3273833990097046, + "learning_rate": 2e-05, + "loss": 0.9462, + "mean_token_accuracy": 0.7324743527112848, + "step": 1760 + }, + { + "epoch": 1.2863387978142076, + "grad_norm": 0.32791441679000854, + "learning_rate": 2e-05, + "loss": 0.9993, + "mean_token_accuracy": 0.7166310454323399, + "step": 1765 + }, + { + "epoch": 1.2899817850637523, + "grad_norm": 0.31501585245132446, + "learning_rate": 2e-05, + "loss": 0.9759, + "mean_token_accuracy": 0.7226764777723499, + "step": 1770 + }, + { + "epoch": 1.293624772313297, + "grad_norm": 0.3403507173061371, + "learning_rate": 2e-05, + "loss": 0.975, + "mean_token_accuracy": 0.7232352222765023, + "step": 1775 + }, + { + "epoch": 1.2972677595628415, + "grad_norm": 0.31439098715782166, + "learning_rate": 2e-05, + "loss": 0.9892, + "mean_token_accuracy": 0.719667688261515, + "step": 1780 + }, + { + "epoch": 1.3009107468123862, + "grad_norm": 0.32850295305252075, + "learning_rate": 2e-05, + "loss": 0.9702, + "mean_token_accuracy": 0.725277845627748, + "step": 1785 + }, + { + "epoch": 1.3045537340619306, + "grad_norm": 0.3256784975528717, + "learning_rate": 2e-05, + "loss": 0.9539, + "mean_token_accuracy": 0.7279036394723986, + "step": 1790 + }, + { + "epoch": 1.3081967213114754, + "grad_norm": 0.34349489212036133, + "learning_rate": 2e-05, + "loss": 0.9646, + "mean_token_accuracy": 0.7258457498778701, + "step": 1795 + }, + { + "epoch": 1.31183970856102, + "grad_norm": 0.3172832429409027, + "learning_rate": 2e-05, + "loss": 0.976, + "mean_token_accuracy": 0.7235344406448462, + "step": 1800 + }, + { + "epoch": 1.31183970856102, + "eval_loss": 1.0121958255767822, + "eval_mean_token_accuracy": 0.7141900481251602, + "eval_runtime": 14.3175, + "eval_samples_per_second": 18.299, + "eval_steps_per_second": 1.187, + "step": 1800 + }, + { + "epoch": 1.3154826958105645, + "grad_norm": 0.3244900405406952, + "learning_rate": 2e-05, + "loss": 0.9801, + "mean_token_accuracy": 0.7222307034684906, + "step": 1805 + }, + { + "epoch": 1.3191256830601092, + "grad_norm": 0.37057653069496155, + "learning_rate": 2e-05, + "loss": 0.972, + "mean_token_accuracy": 0.7246091841719591, + "step": 1810 + }, + { + "epoch": 1.322768670309654, + "grad_norm": 0.3139060437679291, + "learning_rate": 2e-05, + "loss": 0.9705, + "mean_token_accuracy": 0.7253602833414752, + "step": 1815 + }, + { + "epoch": 1.3264116575591984, + "grad_norm": 0.3245513439178467, + "learning_rate": 2e-05, + "loss": 0.9489, + "mean_token_accuracy": 0.7318087445041525, + "step": 1820 + }, + { + "epoch": 1.3300546448087431, + "grad_norm": 0.31191951036453247, + "learning_rate": 2e-05, + "loss": 0.9617, + "mean_token_accuracy": 0.7279738641914998, + "step": 1825 + }, + { + "epoch": 1.3336976320582878, + "grad_norm": 0.3181591331958771, + "learning_rate": 2e-05, + "loss": 0.9389, + "mean_token_accuracy": 0.7314179286761113, + "step": 1830 + }, + { + "epoch": 1.3373406193078323, + "grad_norm": 0.32525399327278137, + "learning_rate": 2e-05, + "loss": 1.0118, + "mean_token_accuracy": 0.71133365901319, + "step": 1835 + }, + { + "epoch": 1.340983606557377, + "grad_norm": 0.3335639238357544, + "learning_rate": 2e-05, + "loss": 0.9624, + "mean_token_accuracy": 0.72753724963361, + "step": 1840 + }, + { + "epoch": 1.3446265938069217, + "grad_norm": 0.36104097962379456, + "learning_rate": 2e-05, + "loss": 0.9731, + "mean_token_accuracy": 0.7239160967269173, + "step": 1845 + }, + { + "epoch": 1.3482695810564662, + "grad_norm": 0.3039902448654175, + "learning_rate": 2e-05, + "loss": 0.966, + "mean_token_accuracy": 0.7241908891060088, + "step": 1850 + }, + { + "epoch": 1.351912568306011, + "grad_norm": 0.30735480785369873, + "learning_rate": 2e-05, + "loss": 0.9614, + "mean_token_accuracy": 0.7256656082071324, + "step": 1855 + }, + { + "epoch": 1.3555555555555556, + "grad_norm": 0.31513282656669617, + "learning_rate": 2e-05, + "loss": 0.9668, + "mean_token_accuracy": 0.7249938935026867, + "step": 1860 + }, + { + "epoch": 1.3591985428051, + "grad_norm": 0.3390577733516693, + "learning_rate": 2e-05, + "loss": 0.9512, + "mean_token_accuracy": 0.7295585002442598, + "step": 1865 + }, + { + "epoch": 1.3628415300546448, + "grad_norm": 0.309792697429657, + "learning_rate": 2e-05, + "loss": 0.9693, + "mean_token_accuracy": 0.7264594528578409, + "step": 1870 + }, + { + "epoch": 1.3664845173041895, + "grad_norm": 0.3256780803203583, + "learning_rate": 2e-05, + "loss": 0.965, + "mean_token_accuracy": 0.7258304836345872, + "step": 1875 + }, + { + "epoch": 1.370127504553734, + "grad_norm": 0.3005531132221222, + "learning_rate": 2e-05, + "loss": 0.9479, + "mean_token_accuracy": 0.7296256717147045, + "step": 1880 + }, + { + "epoch": 1.3737704918032787, + "grad_norm": 0.32955485582351685, + "learning_rate": 2e-05, + "loss": 0.9692, + "mean_token_accuracy": 0.7237634342940888, + "step": 1885 + }, + { + "epoch": 1.3774134790528234, + "grad_norm": 0.3305559456348419, + "learning_rate": 2e-05, + "loss": 0.9599, + "mean_token_accuracy": 0.7268288959452859, + "step": 1890 + }, + { + "epoch": 1.381056466302368, + "grad_norm": 0.3320905268192291, + "learning_rate": 2e-05, + "loss": 0.9589, + "mean_token_accuracy": 0.7275158768930142, + "step": 1895 + }, + { + "epoch": 1.3846994535519126, + "grad_norm": 0.3323926329612732, + "learning_rate": 2e-05, + "loss": 0.9708, + "mean_token_accuracy": 0.7250183194919394, + "step": 1900 + }, + { + "epoch": 1.3846994535519126, + "eval_loss": 1.0094226598739624, + "eval_mean_token_accuracy": 0.7149347002206592, + "eval_runtime": 14.3499, + "eval_samples_per_second": 18.258, + "eval_steps_per_second": 1.185, + "step": 1900 + }, + { + "epoch": 1.3883424408014573, + "grad_norm": 0.33546602725982666, + "learning_rate": 2e-05, + "loss": 0.9688, + "mean_token_accuracy": 0.725732779677577, + "step": 1905 + }, + { + "epoch": 1.3919854280510018, + "grad_norm": 0.34871605038642883, + "learning_rate": 2e-05, + "loss": 0.9566, + "mean_token_accuracy": 0.7274456521739128, + "step": 1910 + }, + { + "epoch": 1.3956284153005465, + "grad_norm": 0.3340302109718323, + "learning_rate": 2e-05, + "loss": 0.9665, + "mean_token_accuracy": 0.7259342940889107, + "step": 1915 + }, + { + "epoch": 1.3992714025500912, + "grad_norm": 0.30206888914108276, + "learning_rate": 2e-05, + "loss": 0.9742, + "mean_token_accuracy": 0.7233756717147044, + "step": 1920 + }, + { + "epoch": 1.4029143897996357, + "grad_norm": 0.29673662781715393, + "learning_rate": 2e-05, + "loss": 0.9485, + "mean_token_accuracy": 0.731414875427455, + "step": 1925 + }, + { + "epoch": 1.4065573770491804, + "grad_norm": 0.3166307508945465, + "learning_rate": 2e-05, + "loss": 0.9447, + "mean_token_accuracy": 0.7307217879824133, + "step": 1930 + }, + { + "epoch": 1.410200364298725, + "grad_norm": 0.29243019223213196, + "learning_rate": 2e-05, + "loss": 0.9695, + "mean_token_accuracy": 0.7265571568148511, + "step": 1935 + }, + { + "epoch": 1.4138433515482696, + "grad_norm": 0.32718804478645325, + "learning_rate": 2e-05, + "loss": 0.9611, + "mean_token_accuracy": 0.7267220322423057, + "step": 1940 + }, + { + "epoch": 1.4174863387978143, + "grad_norm": 0.32046985626220703, + "learning_rate": 2e-05, + "loss": 0.9627, + "mean_token_accuracy": 0.7269021739130435, + "step": 1945 + }, + { + "epoch": 1.4211293260473588, + "grad_norm": 0.31391212344169617, + "learning_rate": 2e-05, + "loss": 0.9792, + "mean_token_accuracy": 0.7224474841231071, + "step": 1950 + }, + { + "epoch": 1.4247723132969035, + "grad_norm": 0.2963239848613739, + "learning_rate": 2e-05, + "loss": 0.9413, + "mean_token_accuracy": 0.7336468001954081, + "step": 1955 + }, + { + "epoch": 1.4284153005464482, + "grad_norm": 0.30880090594291687, + "learning_rate": 2e-05, + "loss": 0.9516, + "mean_token_accuracy": 0.7294607962872497, + "step": 1960 + }, + { + "epoch": 1.4320582877959926, + "grad_norm": 0.2999388873577118, + "learning_rate": 2e-05, + "loss": 0.9428, + "mean_token_accuracy": 0.7320010991695163, + "step": 1965 + }, + { + "epoch": 1.4357012750455374, + "grad_norm": 0.3056471049785614, + "learning_rate": 2e-05, + "loss": 0.9523, + "mean_token_accuracy": 0.7304927943331705, + "step": 1970 + }, + { + "epoch": 1.4393442622950818, + "grad_norm": 0.3117208182811737, + "learning_rate": 2e-05, + "loss": 0.9727, + "mean_token_accuracy": 0.7226520517830972, + "step": 1975 + }, + { + "epoch": 1.4429872495446265, + "grad_norm": 0.3220628499984741, + "learning_rate": 2e-05, + "loss": 0.9786, + "mean_token_accuracy": 0.723250488519785, + "step": 1980 + }, + { + "epoch": 1.4466302367941712, + "grad_norm": 0.3221818804740906, + "learning_rate": 2e-05, + "loss": 0.9713, + "mean_token_accuracy": 0.7243985100146556, + "step": 1985 + }, + { + "epoch": 1.4502732240437157, + "grad_norm": 0.30782631039619446, + "learning_rate": 2e-05, + "loss": 0.9525, + "mean_token_accuracy": 0.7300378602833414, + "step": 1990 + }, + { + "epoch": 1.4539162112932604, + "grad_norm": 0.32362139225006104, + "learning_rate": 2e-05, + "loss": 0.9706, + "mean_token_accuracy": 0.7234978016609672, + "step": 1995 + }, + { + "epoch": 1.4575591985428051, + "grad_norm": 0.31744489073753357, + "learning_rate": 2e-05, + "loss": 0.9497, + "mean_token_accuracy": 0.7307401074743527, + "step": 2000 + }, + { + "epoch": 1.4575591985428051, + "eval_loss": 1.0048604011535645, + "eval_mean_token_accuracy": 0.7154876811925385, + "eval_runtime": 14.3295, + "eval_samples_per_second": 18.284, + "eval_steps_per_second": 1.186, + "step": 2000 + }, + { + "epoch": 1.4612021857923496, + "grad_norm": 0.34834545850753784, + "learning_rate": 2e-05, + "loss": 0.9786, + "mean_token_accuracy": 0.7242458475818271, + "step": 2005 + }, + { + "epoch": 1.4648451730418943, + "grad_norm": 0.344738632440567, + "learning_rate": 2e-05, + "loss": 0.9607, + "mean_token_accuracy": 0.7274670249145089, + "step": 2010 + }, + { + "epoch": 1.468488160291439, + "grad_norm": 0.32923632860183716, + "learning_rate": 2e-05, + "loss": 0.9651, + "mean_token_accuracy": 0.7261327552515875, + "step": 2015 + }, + { + "epoch": 1.4721311475409835, + "grad_norm": 0.33209770917892456, + "learning_rate": 2e-05, + "loss": 0.9664, + "mean_token_accuracy": 0.7245175867122619, + "step": 2020 + }, + { + "epoch": 1.4757741347905282, + "grad_norm": 0.30448663234710693, + "learning_rate": 2e-05, + "loss": 0.961, + "mean_token_accuracy": 0.7256442354665364, + "step": 2025 + }, + { + "epoch": 1.479417122040073, + "grad_norm": 0.29810741543769836, + "learning_rate": 2e-05, + "loss": 0.9683, + "mean_token_accuracy": 0.7260777967757693, + "step": 2030 + }, + { + "epoch": 1.4830601092896174, + "grad_norm": 0.3135621249675751, + "learning_rate": 2e-05, + "loss": 0.9706, + "mean_token_accuracy": 0.7255373717635566, + "step": 2035 + }, + { + "epoch": 1.486703096539162, + "grad_norm": 0.32791656255722046, + "learning_rate": 2e-05, + "loss": 0.943, + "mean_token_accuracy": 0.7303187591597459, + "step": 2040 + }, + { + "epoch": 1.4903460837887068, + "grad_norm": 0.3219871520996094, + "learning_rate": 2e-05, + "loss": 0.9485, + "mean_token_accuracy": 0.7297630679042502, + "step": 2045 + }, + { + "epoch": 1.4939890710382513, + "grad_norm": 0.30337387323379517, + "learning_rate": 2e-05, + "loss": 0.9512, + "mean_token_accuracy": 0.7309752076209086, + "step": 2050 + }, + { + "epoch": 1.497632058287796, + "grad_norm": 0.3070669174194336, + "learning_rate": 2e-05, + "loss": 0.9529, + "mean_token_accuracy": 0.7285600879335614, + "step": 2055 + }, + { + "epoch": 1.5012750455373407, + "grad_norm": 0.3109239637851715, + "learning_rate": 2e-05, + "loss": 0.9473, + "mean_token_accuracy": 0.730019540791402, + "step": 2060 + }, + { + "epoch": 1.5049180327868852, + "grad_norm": 0.3539317846298218, + "learning_rate": 2e-05, + "loss": 0.9641, + "mean_token_accuracy": 0.7243801905227162, + "step": 2065 + }, + { + "epoch": 1.50856102003643, + "grad_norm": 0.33243849873542786, + "learning_rate": 2e-05, + "loss": 0.9736, + "mean_token_accuracy": 0.722575720566683, + "step": 2070 + }, + { + "epoch": 1.5122040072859746, + "grad_norm": 0.3301512897014618, + "learning_rate": 2e-05, + "loss": 0.9709, + "mean_token_accuracy": 0.7232603488129995, + "step": 2075 + }, + { + "epoch": 1.515846994535519, + "grad_norm": 0.34674686193466187, + "learning_rate": 2e-05, + "loss": 0.9957, + "mean_token_accuracy": 0.716304347826087, + "step": 2080 + }, + { + "epoch": 1.5194899817850638, + "grad_norm": 0.34269919991493225, + "learning_rate": 2e-05, + "loss": 0.9578, + "mean_token_accuracy": 0.7280013434294089, + "step": 2085 + }, + { + "epoch": 1.5231329690346085, + "grad_norm": 0.3565721809864044, + "learning_rate": 2e-05, + "loss": 0.9462, + "mean_token_accuracy": 0.7306362970200293, + "step": 2090 + }, + { + "epoch": 1.526775956284153, + "grad_norm": 0.32590150833129883, + "learning_rate": 2e-05, + "loss": 0.9778, + "mean_token_accuracy": 0.7213727405959941, + "step": 2095 + }, + { + "epoch": 1.5304189435336977, + "grad_norm": 0.3151698112487793, + "learning_rate": 2e-05, + "loss": 0.9566, + "mean_token_accuracy": 0.7274731314118223, + "step": 2100 + }, + { + "epoch": 1.5304189435336977, + "eval_loss": 1.0019382238388062, + "eval_mean_token_accuracy": 0.7163066606906044, + "eval_runtime": 14.3168, + "eval_samples_per_second": 18.3, + "eval_steps_per_second": 1.187, + "step": 2100 + }, + { + "epoch": 1.5340619307832424, + "grad_norm": 0.31888285279273987, + "learning_rate": 2e-05, + "loss": 0.9413, + "mean_token_accuracy": 0.7308591841719589, + "step": 2105 + }, + { + "epoch": 1.5377049180327869, + "grad_norm": 0.33658313751220703, + "learning_rate": 2e-05, + "loss": 0.9439, + "mean_token_accuracy": 0.7299126770884221, + "step": 2110 + }, + { + "epoch": 1.5413479052823316, + "grad_norm": 0.3298831880092621, + "learning_rate": 2e-05, + "loss": 0.9699, + "mean_token_accuracy": 0.7238916707376648, + "step": 2115 + }, + { + "epoch": 1.5449908925318763, + "grad_norm": 0.323344886302948, + "learning_rate": 2e-05, + "loss": 0.9464, + "mean_token_accuracy": 0.7299035173424525, + "step": 2120 + }, + { + "epoch": 1.5486338797814208, + "grad_norm": 0.3089609742164612, + "learning_rate": 2e-05, + "loss": 0.9593, + "mean_token_accuracy": 0.726954079140205, + "step": 2125 + }, + { + "epoch": 1.5522768670309652, + "grad_norm": 0.2949562072753906, + "learning_rate": 2e-05, + "loss": 0.9563, + "mean_token_accuracy": 0.7273143624816807, + "step": 2130 + }, + { + "epoch": 1.5559198542805102, + "grad_norm": 0.3209057152271271, + "learning_rate": 2e-05, + "loss": 0.9712, + "mean_token_accuracy": 0.7229115779189057, + "step": 2135 + }, + { + "epoch": 1.5595628415300546, + "grad_norm": 0.34897011518478394, + "learning_rate": 2e-05, + "loss": 0.9625, + "mean_token_accuracy": 0.7247007816316561, + "step": 2140 + }, + { + "epoch": 1.5632058287795991, + "grad_norm": 0.3226200044155121, + "learning_rate": 2e-05, + "loss": 0.9363, + "mean_token_accuracy": 0.7320926966292134, + "step": 2145 + }, + { + "epoch": 1.566848816029144, + "grad_norm": 0.3218037486076355, + "learning_rate": 2e-05, + "loss": 0.9749, + "mean_token_accuracy": 0.7230795065950171, + "step": 2150 + }, + { + "epoch": 1.5704918032786885, + "grad_norm": 0.309461772441864, + "learning_rate": 2e-05, + "loss": 0.9367, + "mean_token_accuracy": 0.7339643380556913, + "step": 2155 + }, + { + "epoch": 1.574134790528233, + "grad_norm": 0.35405707359313965, + "learning_rate": 2e-05, + "loss": 0.9549, + "mean_token_accuracy": 0.7288776257938446, + "step": 2160 + }, + { + "epoch": 1.5777777777777777, + "grad_norm": 0.3519957661628723, + "learning_rate": 2e-05, + "loss": 0.961, + "mean_token_accuracy": 0.7253358573522226, + "step": 2165 + }, + { + "epoch": 1.5814207650273224, + "grad_norm": 0.3647383451461792, + "learning_rate": 2e-05, + "loss": 0.9372, + "mean_token_accuracy": 0.7328651685393259, + "step": 2170 + }, + { + "epoch": 1.585063752276867, + "grad_norm": 0.3361128568649292, + "learning_rate": 2e-05, + "loss": 0.9645, + "mean_token_accuracy": 0.7258732291157791, + "step": 2175 + }, + { + "epoch": 1.5887067395264116, + "grad_norm": 0.3090117275714874, + "learning_rate": 2e-05, + "loss": 0.9645, + "mean_token_accuracy": 0.7250837591040762, + "step": 2180 + }, + { + "epoch": 1.5923497267759563, + "grad_norm": 0.3278610110282898, + "learning_rate": 2e-05, + "loss": 0.9489, + "mean_token_accuracy": 0.7295432340009771, + "step": 2185 + }, + { + "epoch": 1.5959927140255008, + "grad_norm": 0.3234911561012268, + "learning_rate": 2e-05, + "loss": 0.9484, + "mean_token_accuracy": 0.7298241328773816, + "step": 2190 + }, + { + "epoch": 1.5996357012750455, + "grad_norm": 0.3117011487483978, + "learning_rate": 2e-05, + "loss": 0.957, + "mean_token_accuracy": 0.7265479970688812, + "step": 2195 + }, + { + "epoch": 1.6032786885245902, + "grad_norm": 0.3146551549434662, + "learning_rate": 2e-05, + "loss": 0.9702, + "mean_token_accuracy": 0.7246213971665852, + "step": 2200 + }, + { + "epoch": 1.6032786885245902, + "eval_loss": 0.9981810450553894, + "eval_mean_token_accuracy": 0.7169864646888545, + "eval_runtime": 14.4002, + "eval_samples_per_second": 18.194, + "eval_steps_per_second": 1.181, + "step": 2200 + }, + { + "epoch": 1.6069216757741347, + "grad_norm": 0.3198413848876953, + "learning_rate": 2e-05, + "loss": 0.9378, + "mean_token_accuracy": 0.7318881289692233, + "step": 2205 + }, + { + "epoch": 1.6105646630236794, + "grad_norm": 0.3311745524406433, + "learning_rate": 2e-05, + "loss": 0.9751, + "mean_token_accuracy": 0.7230550806057646, + "step": 2210 + }, + { + "epoch": 1.614207650273224, + "grad_norm": 0.3199496269226074, + "learning_rate": 2e-05, + "loss": 0.9589, + "mean_token_accuracy": 0.7272288715192966, + "step": 2215 + }, + { + "epoch": 1.6178506375227686, + "grad_norm": 0.3042522668838501, + "learning_rate": 2e-05, + "loss": 0.9541, + "mean_token_accuracy": 0.7277082315583782, + "step": 2220 + }, + { + "epoch": 1.6214936247723133, + "grad_norm": 0.32280251383781433, + "learning_rate": 2e-05, + "loss": 0.9378, + "mean_token_accuracy": 0.7322545188080117, + "step": 2225 + }, + { + "epoch": 1.625136612021858, + "grad_norm": 0.3113802969455719, + "learning_rate": 2e-05, + "loss": 0.9678, + "mean_token_accuracy": 0.7245267464582316, + "step": 2230 + }, + { + "epoch": 1.6287795992714025, + "grad_norm": 0.32498499751091003, + "learning_rate": 2e-05, + "loss": 0.9649, + "mean_token_accuracy": 0.7253541768441623, + "step": 2235 + }, + { + "epoch": 1.6324225865209472, + "grad_norm": 0.3127484619617462, + "learning_rate": 2e-05, + "loss": 0.94, + "mean_token_accuracy": 0.7314881533952124, + "step": 2240 + }, + { + "epoch": 1.6360655737704919, + "grad_norm": 0.34295135736465454, + "learning_rate": 2e-05, + "loss": 0.98, + "mean_token_accuracy": 0.7219894968246214, + "step": 2245 + }, + { + "epoch": 1.6397085610200364, + "grad_norm": 0.32535672187805176, + "learning_rate": 2e-05, + "loss": 0.9454, + "mean_token_accuracy": 0.7287986041154254, + "step": 2250 + }, + { + "epoch": 1.643351548269581, + "grad_norm": 0.3259055018424988, + "learning_rate": 2e-05, + "loss": 0.9604, + "mean_token_accuracy": 0.7256808744504151, + "step": 2255 + }, + { + "epoch": 1.6469945355191258, + "grad_norm": 0.3303998112678528, + "learning_rate": 2e-05, + "loss": 0.9551, + "mean_token_accuracy": 0.7276705991907064, + "step": 2260 + }, + { + "epoch": 1.6506375227686703, + "grad_norm": 0.3288848400115967, + "learning_rate": 2e-05, + "loss": 0.9473, + "mean_token_accuracy": 0.7294455300439667, + "step": 2265 + }, + { + "epoch": 1.654280510018215, + "grad_norm": 0.3175695240497589, + "learning_rate": 2e-05, + "loss": 0.9757, + "mean_token_accuracy": 0.7240504396678065, + "step": 2270 + }, + { + "epoch": 1.6579234972677597, + "grad_norm": 0.3215230703353882, + "learning_rate": 2e-05, + "loss": 0.9631, + "mean_token_accuracy": 0.7257022471910114, + "step": 2275 + }, + { + "epoch": 1.6615664845173042, + "grad_norm": 0.31356170773506165, + "learning_rate": 2e-05, + "loss": 0.9771, + "mean_token_accuracy": 0.7222734489496825, + "step": 2280 + }, + { + "epoch": 1.6652094717668489, + "grad_norm": 0.31009095907211304, + "learning_rate": 2e-05, + "loss": 0.9339, + "mean_token_accuracy": 0.7347184904738644, + "step": 2285 + }, + { + "epoch": 1.6688524590163936, + "grad_norm": 0.3301193118095398, + "learning_rate": 2e-05, + "loss": 0.9575, + "mean_token_accuracy": 0.728318881289692, + "step": 2290 + }, + { + "epoch": 1.672495446265938, + "grad_norm": 0.31997668743133545, + "learning_rate": 2e-05, + "loss": 0.9648, + "mean_token_accuracy": 0.7259159745969711, + "step": 2295 + }, + { + "epoch": 1.6761384335154828, + "grad_norm": 0.32590940594673157, + "learning_rate": 2e-05, + "loss": 0.9509, + "mean_token_accuracy": 0.7274975574010748, + "step": 2300 + }, + { + "epoch": 1.6761384335154828, + "eval_loss": 0.9955868124961853, + "eval_mean_token_accuracy": 0.7172010781674456, + "eval_runtime": 14.3325, + "eval_samples_per_second": 18.28, + "eval_steps_per_second": 1.186, + "step": 2300 + }, + { + "epoch": 1.6797814207650275, + "grad_norm": 0.3190097510814667, + "learning_rate": 2e-05, + "loss": 0.9659, + "mean_token_accuracy": 0.7243710307767464, + "step": 2305 + }, + { + "epoch": 1.683424408014572, + "grad_norm": 0.3566085994243622, + "learning_rate": 2e-05, + "loss": 0.9797, + "mean_token_accuracy": 0.7204598192476794, + "step": 2310 + }, + { + "epoch": 1.6870673952641164, + "grad_norm": 0.3309180438518524, + "learning_rate": 2e-05, + "loss": 0.943, + "mean_token_accuracy": 0.7305782852955545, + "step": 2315 + }, + { + "epoch": 1.6907103825136613, + "grad_norm": 0.32704347372055054, + "learning_rate": 2e-05, + "loss": 0.9572, + "mean_token_accuracy": 0.7284349047386419, + "step": 2320 + }, + { + "epoch": 1.6943533697632058, + "grad_norm": 0.31231483817100525, + "learning_rate": 2e-05, + "loss": 0.9482, + "mean_token_accuracy": 0.7299615290669271, + "step": 2325 + }, + { + "epoch": 1.6979963570127503, + "grad_norm": 0.31142398715019226, + "learning_rate": 2e-05, + "loss": 0.9633, + "mean_token_accuracy": 0.7263281631656083, + "step": 2330 + }, + { + "epoch": 1.7016393442622952, + "grad_norm": 0.35823601484298706, + "learning_rate": 2e-05, + "loss": 0.9453, + "mean_token_accuracy": 0.7308439179286761, + "step": 2335 + }, + { + "epoch": 1.7052823315118397, + "grad_norm": 0.3349340260028839, + "learning_rate": 2e-05, + "loss": 0.9602, + "mean_token_accuracy": 0.7278425744992671, + "step": 2340 + }, + { + "epoch": 1.7089253187613842, + "grad_norm": 0.3404028117656708, + "learning_rate": 2e-05, + "loss": 0.9668, + "mean_token_accuracy": 0.7247404738641916, + "step": 2345 + }, + { + "epoch": 1.712568306010929, + "grad_norm": 0.3519185781478882, + "learning_rate": 2e-05, + "loss": 0.9325, + "mean_token_accuracy": 0.7344589643380556, + "step": 2350 + }, + { + "epoch": 1.7162112932604736, + "grad_norm": 0.33176666498184204, + "learning_rate": 2e-05, + "loss": 0.9474, + "mean_token_accuracy": 0.7299890083048365, + "step": 2355 + }, + { + "epoch": 1.719854280510018, + "grad_norm": 0.31135764718055725, + "learning_rate": 2e-05, + "loss": 0.9707, + "mean_token_accuracy": 0.7230459208597948, + "step": 2360 + }, + { + "epoch": 1.7234972677595628, + "grad_norm": 0.31730037927627563, + "learning_rate": 2e-05, + "loss": 0.9358, + "mean_token_accuracy": 0.7335399364924279, + "step": 2365 + }, + { + "epoch": 1.7271402550091075, + "grad_norm": 0.31346791982650757, + "learning_rate": 2e-05, + "loss": 0.9462, + "mean_token_accuracy": 0.7301294577430386, + "step": 2370 + }, + { + "epoch": 1.730783242258652, + "grad_norm": 0.30968010425567627, + "learning_rate": 2e-05, + "loss": 0.9683, + "mean_token_accuracy": 0.7245206399609183, + "step": 2375 + }, + { + "epoch": 1.7344262295081967, + "grad_norm": 0.3089336156845093, + "learning_rate": 2e-05, + "loss": 0.9386, + "mean_token_accuracy": 0.7323888617489008, + "step": 2380 + }, + { + "epoch": 1.7380692167577414, + "grad_norm": 0.3058148920536041, + "learning_rate": 2e-05, + "loss": 0.9559, + "mean_token_accuracy": 0.728135686370298, + "step": 2385 + }, + { + "epoch": 1.7417122040072859, + "grad_norm": 0.3387933075428009, + "learning_rate": 2e-05, + "loss": 0.9714, + "mean_token_accuracy": 0.7253419638495358, + "step": 2390 + }, + { + "epoch": 1.7453551912568306, + "grad_norm": 0.33751383423805237, + "learning_rate": 2e-05, + "loss": 0.9236, + "mean_token_accuracy": 0.7382297264289204, + "step": 2395 + }, + { + "epoch": 1.7489981785063753, + "grad_norm": 0.3331749439239502, + "learning_rate": 2e-05, + "loss": 0.9384, + "mean_token_accuracy": 0.7333597948216903, + "step": 2400 + }, + { + "epoch": 1.7489981785063753, + "eval_loss": 0.9930522441864014, + "eval_mean_token_accuracy": 0.717845759297467, + "eval_runtime": 14.2644, + "eval_samples_per_second": 18.367, + "eval_steps_per_second": 1.192, + "step": 2400 + }, + { + "epoch": 1.7526411657559198, + "grad_norm": 0.3001438081264496, + "learning_rate": 2e-05, + "loss": 0.9468, + "mean_token_accuracy": 0.7304439423546653, + "step": 2405 + }, + { + "epoch": 1.7562841530054645, + "grad_norm": 0.3136955201625824, + "learning_rate": 2e-05, + "loss": 0.9627, + "mean_token_accuracy": 0.725497679531021, + "step": 2410 + }, + { + "epoch": 1.7599271402550092, + "grad_norm": 0.33199629187583923, + "learning_rate": 2e-05, + "loss": 0.9689, + "mean_token_accuracy": 0.7236971195681857, + "step": 2415 + }, + { + "epoch": 1.7635701275045537, + "grad_norm": 0.34084373712539673, + "learning_rate": 2e-05, + "loss": 0.9479, + "mean_token_accuracy": 0.7301996824621397, + "step": 2420 + }, + { + "epoch": 1.7672131147540984, + "grad_norm": 0.3298395276069641, + "learning_rate": 2e-05, + "loss": 0.959, + "mean_token_accuracy": 0.7253541768441623, + "step": 2425 + }, + { + "epoch": 1.770856102003643, + "grad_norm": 0.3308035731315613, + "learning_rate": 2e-05, + "loss": 0.966, + "mean_token_accuracy": 0.7240626526624327, + "step": 2430 + }, + { + "epoch": 1.7744990892531876, + "grad_norm": 0.33812233805656433, + "learning_rate": 2e-05, + "loss": 0.9408, + "mean_token_accuracy": 0.7327674645823156, + "step": 2435 + }, + { + "epoch": 1.7781420765027323, + "grad_norm": 0.31434857845306396, + "learning_rate": 2e-05, + "loss": 0.9408, + "mean_token_accuracy": 0.7304866878358574, + "step": 2440 + }, + { + "epoch": 1.781785063752277, + "grad_norm": 0.33050814270973206, + "learning_rate": 2e-05, + "loss": 0.9496, + "mean_token_accuracy": 0.7288967664624881, + "step": 2445 + }, + { + "epoch": 1.7854280510018214, + "grad_norm": 0.3517625629901886, + "learning_rate": 2e-05, + "loss": 0.9708, + "mean_token_accuracy": 0.7233634587200781, + "step": 2450 + }, + { + "epoch": 1.7890710382513662, + "grad_norm": 0.39435386657714844, + "learning_rate": 2e-05, + "loss": 0.9768, + "mean_token_accuracy": 0.720649120664387, + "step": 2455 + }, + { + "epoch": 1.7927140255009109, + "grad_norm": 0.3447003662586212, + "learning_rate": 2e-05, + "loss": 0.9488, + "mean_token_accuracy": 0.7297325354176845, + "step": 2460 + }, + { + "epoch": 1.7963570127504553, + "grad_norm": 0.3322184681892395, + "learning_rate": 2e-05, + "loss": 0.9641, + "mean_token_accuracy": 0.7258030043966781, + "step": 2465 + }, + { + "epoch": 1.8, + "grad_norm": 0.31109198927879333, + "learning_rate": 2e-05, + "loss": 0.9415, + "mean_token_accuracy": 0.7302851734245237, + "step": 2470 + }, + { + "epoch": 1.8036429872495447, + "grad_norm": 0.32298213243484497, + "learning_rate": 2e-05, + "loss": 0.9645, + "mean_token_accuracy": 0.7249114557889594, + "step": 2475 + }, + { + "epoch": 1.8072859744990892, + "grad_norm": 0.31393465399742126, + "learning_rate": 2e-05, + "loss": 0.9373, + "mean_token_accuracy": 0.7328315828041037, + "step": 2480 + }, + { + "epoch": 1.8109289617486337, + "grad_norm": 0.343615859746933, + "learning_rate": 2e-05, + "loss": 0.9546, + "mean_token_accuracy": 0.7293600390815828, + "step": 2485 + }, + { + "epoch": 1.8145719489981786, + "grad_norm": 0.3566816747188568, + "learning_rate": 2e-05, + "loss": 0.9335, + "mean_token_accuracy": 0.7325537371763557, + "step": 2490 + }, + { + "epoch": 1.8182149362477231, + "grad_norm": 0.3791857957839966, + "learning_rate": 2e-05, + "loss": 0.9633, + "mean_token_accuracy": 0.7262063928835035, + "step": 2495 + }, + { + "epoch": 1.8218579234972676, + "grad_norm": 0.34828808903694153, + "learning_rate": 2e-05, + "loss": 0.9461, + "mean_token_accuracy": 0.7305782852955542, + "step": 2500 + }, + { + "epoch": 1.8218579234972676, + "eval_loss": 0.9900703430175781, + "eval_mean_token_accuracy": 0.7183870589182986, + "eval_runtime": 14.2978, + "eval_samples_per_second": 18.324, + "eval_steps_per_second": 1.189, + "step": 2500 + }, + { + "epoch": 1.8255009107468125, + "grad_norm": 0.34125182032585144, + "learning_rate": 2e-05, + "loss": 0.9546, + "mean_token_accuracy": 0.727842574499267, + "step": 2505 + }, + { + "epoch": 1.829143897996357, + "grad_norm": 0.31761088967323303, + "learning_rate": 2e-05, + "loss": 0.9251, + "mean_token_accuracy": 0.7357932340009771, + "step": 2510 + }, + { + "epoch": 1.8327868852459015, + "grad_norm": 0.3065294027328491, + "learning_rate": 2e-05, + "loss": 0.9495, + "mean_token_accuracy": 0.728977635288093, + "step": 2515 + }, + { + "epoch": 1.8364298724954464, + "grad_norm": 0.31213077902793884, + "learning_rate": 2e-05, + "loss": 0.9637, + "mean_token_accuracy": 0.7256961406936981, + "step": 2520 + }, + { + "epoch": 1.840072859744991, + "grad_norm": 0.35743096470832825, + "learning_rate": 2e-05, + "loss": 0.9703, + "mean_token_accuracy": 0.7234367366878359, + "step": 2525 + }, + { + "epoch": 1.8437158469945354, + "grad_norm": 0.3187105655670166, + "learning_rate": 2e-05, + "loss": 0.9289, + "mean_token_accuracy": 0.7341841719589643, + "step": 2530 + }, + { + "epoch": 1.84735883424408, + "grad_norm": 0.3167194426059723, + "learning_rate": 2e-05, + "loss": 0.9534, + "mean_token_accuracy": 0.7265876893014166, + "step": 2535 + }, + { + "epoch": 1.8510018214936248, + "grad_norm": 0.34710872173309326, + "learning_rate": 2e-05, + "loss": 0.9519, + "mean_token_accuracy": 0.727842574499267, + "step": 2540 + }, + { + "epoch": 1.8546448087431693, + "grad_norm": 0.3225826919078827, + "learning_rate": 2e-05, + "loss": 0.9613, + "mean_token_accuracy": 0.725058011724475, + "step": 2545 + }, + { + "epoch": 1.858287795992714, + "grad_norm": 0.31684067845344543, + "learning_rate": 2e-05, + "loss": 0.9578, + "mean_token_accuracy": 0.7259953590620422, + "step": 2550 + }, + { + "epoch": 1.8619307832422587, + "grad_norm": 0.3433721959590912, + "learning_rate": 2e-05, + "loss": 0.9457, + "mean_token_accuracy": 0.7303157059110894, + "step": 2555 + }, + { + "epoch": 1.8655737704918032, + "grad_norm": 0.3335418403148651, + "learning_rate": 2e-05, + "loss": 0.9461, + "mean_token_accuracy": 0.7287860283341476, + "step": 2560 + }, + { + "epoch": 1.8692167577413479, + "grad_norm": 0.34308183193206787, + "learning_rate": 2e-05, + "loss": 0.9524, + "mean_token_accuracy": 0.7277173913043478, + "step": 2565 + }, + { + "epoch": 1.8728597449908926, + "grad_norm": 0.3250623047351837, + "learning_rate": 2e-05, + "loss": 0.9246, + "mean_token_accuracy": 0.7354726428920371, + "step": 2570 + }, + { + "epoch": 1.876502732240437, + "grad_norm": 0.3206217586994171, + "learning_rate": 2e-05, + "loss": 0.9416, + "mean_token_accuracy": 0.730801172447484, + "step": 2575 + }, + { + "epoch": 1.8801457194899818, + "grad_norm": 0.33022913336753845, + "learning_rate": 2e-05, + "loss": 0.9636, + "mean_token_accuracy": 0.723464215925745, + "step": 2580 + }, + { + "epoch": 1.8837887067395265, + "grad_norm": 0.35024046897888184, + "learning_rate": 2e-05, + "loss": 0.9481, + "mean_token_accuracy": 0.7290852467024915, + "step": 2585 + }, + { + "epoch": 1.887431693989071, + "grad_norm": 0.3380562365055084, + "learning_rate": 2e-05, + "loss": 0.9613, + "mean_token_accuracy": 0.7261419149975574, + "step": 2590 + }, + { + "epoch": 1.8910746812386157, + "grad_norm": 0.34045740962028503, + "learning_rate": 2e-05, + "loss": 0.9627, + "mean_token_accuracy": 0.7240901319003419, + "step": 2595 + }, + { + "epoch": 1.8947176684881604, + "grad_norm": 0.3662465810775757, + "learning_rate": 2e-05, + "loss": 0.9463, + "mean_token_accuracy": 0.7302302149487054, + "step": 2600 + }, + { + "epoch": 1.8947176684881604, + "eval_loss": 0.9878339767456055, + "eval_mean_token_accuracy": 0.7185170775381476, + "eval_runtime": 14.2544, + "eval_samples_per_second": 18.38, + "eval_steps_per_second": 1.193, + "step": 2600 + }, + { + "epoch": 1.8983606557377048, + "grad_norm": 0.3937079906463623, + "learning_rate": 2e-05, + "loss": 0.9448, + "mean_token_accuracy": 0.7303340254030288, + "step": 2605 + }, + { + "epoch": 1.9020036429872496, + "grad_norm": 0.3231543004512787, + "learning_rate": 2e-05, + "loss": 0.9596, + "mean_token_accuracy": 0.725903761602345, + "step": 2610 + }, + { + "epoch": 1.9056466302367943, + "grad_norm": 0.3279549181461334, + "learning_rate": 2e-05, + "loss": 0.9598, + "mean_token_accuracy": 0.7254793600390816, + "step": 2615 + }, + { + "epoch": 1.9092896174863387, + "grad_norm": 0.300364226102829, + "learning_rate": 2e-05, + "loss": 0.9611, + "mean_token_accuracy": 0.7248473375671716, + "step": 2620 + }, + { + "epoch": 1.9129326047358834, + "grad_norm": 0.32046106457710266, + "learning_rate": 2e-05, + "loss": 0.9604, + "mean_token_accuracy": 0.72521372740596, + "step": 2625 + }, + { + "epoch": 1.9165755919854282, + "grad_norm": 0.30070799589157104, + "learning_rate": 2e-05, + "loss": 0.9282, + "mean_token_accuracy": 0.7357260625305325, + "step": 2630 + }, + { + "epoch": 1.9202185792349726, + "grad_norm": 0.3148041069507599, + "learning_rate": 2e-05, + "loss": 0.9495, + "mean_token_accuracy": 0.7294027148905409, + "step": 2635 + }, + { + "epoch": 1.9238615664845173, + "grad_norm": 0.30489620566368103, + "learning_rate": 2e-05, + "loss": 0.9302, + "mean_token_accuracy": 0.7349658036150464, + "step": 2640 + }, + { + "epoch": 1.927504553734062, + "grad_norm": 0.31464579701423645, + "learning_rate": 2e-05, + "loss": 0.9469, + "mean_token_accuracy": 0.7290333414753298, + "step": 2645 + }, + { + "epoch": 1.9311475409836065, + "grad_norm": 0.34819769859313965, + "learning_rate": 2e-05, + "loss": 0.95, + "mean_token_accuracy": 0.727873106985833, + "step": 2650 + }, + { + "epoch": 1.9347905282331512, + "grad_norm": 0.30317771434783936, + "learning_rate": 2e-05, + "loss": 0.931, + "mean_token_accuracy": 0.7324041279921838, + "step": 2655 + }, + { + "epoch": 1.938433515482696, + "grad_norm": 0.33199000358581543, + "learning_rate": 2e-05, + "loss": 0.9447, + "mean_token_accuracy": 0.7300042745481192, + "step": 2660 + }, + { + "epoch": 1.9420765027322404, + "grad_norm": 0.32186630368232727, + "learning_rate": 2e-05, + "loss": 0.9223, + "mean_token_accuracy": 0.7355062286272593, + "step": 2665 + }, + { + "epoch": 1.945719489981785, + "grad_norm": 0.3189879357814789, + "learning_rate": 2e-05, + "loss": 0.9214, + "mean_token_accuracy": 0.7362176355642402, + "step": 2670 + }, + { + "epoch": 1.9493624772313298, + "grad_norm": 0.3124241232872009, + "learning_rate": 2e-05, + "loss": 0.952, + "mean_token_accuracy": 0.7294241573033708, + "step": 2675 + }, + { + "epoch": 1.9530054644808743, + "grad_norm": 0.33368435502052307, + "learning_rate": 2e-05, + "loss": 0.9541, + "mean_token_accuracy": 0.726944973067412, + "step": 2680 + }, + { + "epoch": 1.9566484517304188, + "grad_norm": 0.31824326515197754, + "learning_rate": 2e-05, + "loss": 0.9595, + "mean_token_accuracy": 0.7259007083536885, + "step": 2685 + }, + { + "epoch": 1.9602914389799637, + "grad_norm": 0.33006536960601807, + "learning_rate": 2e-05, + "loss": 0.9401, + "mean_token_accuracy": 0.7317018808011726, + "step": 2690 + }, + { + "epoch": 1.9639344262295082, + "grad_norm": 0.32560908794403076, + "learning_rate": 2e-05, + "loss": 0.9497, + "mean_token_accuracy": 0.7289753297508549, + "step": 2695 + }, + { + "epoch": 1.9675774134790527, + "grad_norm": 0.32261034846305847, + "learning_rate": 2e-05, + "loss": 0.9432, + "mean_token_accuracy": 0.7300744992672203, + "step": 2700 + }, + { + "epoch": 1.9675774134790527, + "eval_loss": 0.9857168197631836, + "eval_mean_token_accuracy": 0.719198261994567, + "eval_runtime": 14.2692, + "eval_samples_per_second": 18.361, + "eval_steps_per_second": 1.191, + "step": 2700 + }, + { + "epoch": 1.9712204007285976, + "grad_norm": 0.3109206259250641, + "learning_rate": 2e-05, + "loss": 0.9401, + "mean_token_accuracy": 0.7307981191988278, + "step": 2705 + }, + { + "epoch": 1.974863387978142, + "grad_norm": 0.3254808783531189, + "learning_rate": 2e-05, + "loss": 0.9255, + "mean_token_accuracy": 0.7365412799218367, + "step": 2710 + }, + { + "epoch": 1.9785063752276866, + "grad_norm": 0.348579078912735, + "learning_rate": 2e-05, + "loss": 0.9367, + "mean_token_accuracy": 0.7322361993160723, + "step": 2715 + }, + { + "epoch": 1.9821493624772313, + "grad_norm": 0.2932336628437042, + "learning_rate": 2e-05, + "loss": 0.9373, + "mean_token_accuracy": 0.7305111138251098, + "step": 2720 + }, + { + "epoch": 1.985792349726776, + "grad_norm": 0.33148205280303955, + "learning_rate": 2e-05, + "loss": 0.9334, + "mean_token_accuracy": 0.7327063996091842, + "step": 2725 + }, + { + "epoch": 1.9894353369763205, + "grad_norm": 0.3459117114543915, + "learning_rate": 2e-05, + "loss": 0.9458, + "mean_token_accuracy": 0.7299727053900504, + "step": 2730 + }, + { + "epoch": 1.9930783242258652, + "grad_norm": 0.3014063835144043, + "learning_rate": 2e-05, + "loss": 0.9354, + "mean_token_accuracy": 0.7322545188080116, + "step": 2735 + }, + { + "epoch": 1.9967213114754099, + "grad_norm": 0.3319181501865387, + "learning_rate": 2e-05, + "loss": 0.9575, + "mean_token_accuracy": 0.7258640693698095, + "step": 2740 + }, + { + "epoch": 2.0007285974499087, + "grad_norm": 0.34205400943756104, + "learning_rate": 2e-05, + "loss": 0.9841, + "mean_token_accuracy": 0.7305871542559377, + "step": 2745 + }, + { + "epoch": 2.0043715846994536, + "grad_norm": 0.36871999502182007, + "learning_rate": 2e-05, + "loss": 0.9157, + "mean_token_accuracy": 0.7358542989741086, + "step": 2750 + }, + { + "epoch": 2.008014571948998, + "grad_norm": 0.32711997628211975, + "learning_rate": 2e-05, + "loss": 0.8998, + "mean_token_accuracy": 0.7395395701025891, + "step": 2755 + }, + { + "epoch": 2.0116575591985426, + "grad_norm": 0.31132447719573975, + "learning_rate": 2e-05, + "loss": 0.9142, + "mean_token_accuracy": 0.7366878358573523, + "step": 2760 + }, + { + "epoch": 2.0153005464480875, + "grad_norm": 0.3128187656402588, + "learning_rate": 2e-05, + "loss": 0.9041, + "mean_token_accuracy": 0.7384342940889106, + "step": 2765 + }, + { + "epoch": 2.018943533697632, + "grad_norm": 0.3222801983356476, + "learning_rate": 2e-05, + "loss": 0.9163, + "mean_token_accuracy": 0.735851245725452, + "step": 2770 + }, + { + "epoch": 2.0225865209471765, + "grad_norm": 0.31613680720329285, + "learning_rate": 2e-05, + "loss": 0.8652, + "mean_token_accuracy": 0.7491267708842207, + "step": 2775 + }, + { + "epoch": 2.0262295081967214, + "grad_norm": 0.3172297179698944, + "learning_rate": 2e-05, + "loss": 0.8856, + "mean_token_accuracy": 0.7436980947728384, + "step": 2780 + }, + { + "epoch": 2.029872495446266, + "grad_norm": 0.31364375352859497, + "learning_rate": 2e-05, + "loss": 0.8942, + "mean_token_accuracy": 0.7424645823155838, + "step": 2785 + }, + { + "epoch": 2.0335154826958104, + "grad_norm": 0.30311858654022217, + "learning_rate": 2e-05, + "loss": 0.8925, + "mean_token_accuracy": 0.740699804592086, + "step": 2790 + }, + { + "epoch": 2.0371584699453553, + "grad_norm": 0.3112095892429352, + "learning_rate": 2e-05, + "loss": 0.8659, + "mean_token_accuracy": 0.7491664631167564, + "step": 2795 + }, + { + "epoch": 2.0408014571949, + "grad_norm": 0.3168626129627228, + "learning_rate": 2e-05, + "loss": 0.9142, + "mean_token_accuracy": 0.7359581094284315, + "step": 2800 + }, + { + "epoch": 2.0408014571949, + "eval_loss": 0.9864921569824219, + "eval_mean_token_accuracy": 0.7194168831008534, + "eval_runtime": 14.3174, + "eval_samples_per_second": 18.299, + "eval_steps_per_second": 1.187, + "step": 2800 + }, + { + "epoch": 2.0444444444444443, + "grad_norm": 0.31390464305877686, + "learning_rate": 2e-05, + "loss": 0.9135, + "mean_token_accuracy": 0.7361962628236444, + "step": 2805 + }, + { + "epoch": 2.048087431693989, + "grad_norm": 0.3033863306045532, + "learning_rate": 2e-05, + "loss": 0.8846, + "mean_token_accuracy": 0.7426691499755742, + "step": 2810 + }, + { + "epoch": 2.0517304189435337, + "grad_norm": 0.31870099902153015, + "learning_rate": 2e-05, + "loss": 0.8978, + "mean_token_accuracy": 0.7406356863702981, + "step": 2815 + }, + { + "epoch": 2.055373406193078, + "grad_norm": 0.33986830711364746, + "learning_rate": 2e-05, + "loss": 0.9034, + "mean_token_accuracy": 0.73845261358085, + "step": 2820 + }, + { + "epoch": 2.059016393442623, + "grad_norm": 0.31481096148490906, + "learning_rate": 2e-05, + "loss": 0.9133, + "mean_token_accuracy": 0.7372191011235955, + "step": 2825 + }, + { + "epoch": 2.0626593806921676, + "grad_norm": 0.3070632517337799, + "learning_rate": 2e-05, + "loss": 0.9003, + "mean_token_accuracy": 0.7388953346360528, + "step": 2830 + }, + { + "epoch": 2.066302367941712, + "grad_norm": 0.31041911244392395, + "learning_rate": 2e-05, + "loss": 0.9196, + "mean_token_accuracy": 0.734300195407914, + "step": 2835 + }, + { + "epoch": 2.069945355191257, + "grad_norm": 0.35703134536743164, + "learning_rate": 2e-05, + "loss": 0.9083, + "mean_token_accuracy": 0.7365473864191501, + "step": 2840 + }, + { + "epoch": 2.0735883424408015, + "grad_norm": 0.3322874903678894, + "learning_rate": 2e-05, + "loss": 0.8989, + "mean_token_accuracy": 0.7415150219833901, + "step": 2845 + }, + { + "epoch": 2.077231329690346, + "grad_norm": 0.3197586238384247, + "learning_rate": 2e-05, + "loss": 0.914, + "mean_token_accuracy": 0.7365657059110894, + "step": 2850 + }, + { + "epoch": 2.080874316939891, + "grad_norm": 0.3333277404308319, + "learning_rate": 2e-05, + "loss": 0.8899, + "mean_token_accuracy": 0.7442049340498289, + "step": 2855 + }, + { + "epoch": 2.0845173041894354, + "grad_norm": 0.3269352912902832, + "learning_rate": 2e-05, + "loss": 0.9002, + "mean_token_accuracy": 0.7409226917440156, + "step": 2860 + }, + { + "epoch": 2.08816029143898, + "grad_norm": 0.35753223299980164, + "learning_rate": 2e-05, + "loss": 0.8883, + "mean_token_accuracy": 0.7434691011235955, + "step": 2865 + }, + { + "epoch": 2.091803278688525, + "grad_norm": 0.33091840147972107, + "learning_rate": 2e-05, + "loss": 0.9363, + "mean_token_accuracy": 0.7311492427943331, + "step": 2870 + }, + { + "epoch": 2.0954462659380693, + "grad_norm": 0.3381136953830719, + "learning_rate": 2e-05, + "loss": 0.9195, + "mean_token_accuracy": 0.7348192476795311, + "step": 2875 + }, + { + "epoch": 2.0990892531876137, + "grad_norm": 0.311617374420166, + "learning_rate": 2e-05, + "loss": 0.895, + "mean_token_accuracy": 0.740876893014167, + "step": 2880 + }, + { + "epoch": 2.1027322404371587, + "grad_norm": 0.30609360337257385, + "learning_rate": 2e-05, + "loss": 0.8873, + "mean_token_accuracy": 0.7443514899853445, + "step": 2885 + }, + { + "epoch": 2.106375227686703, + "grad_norm": 0.39882877469062805, + "learning_rate": 2e-05, + "loss": 0.8927, + "mean_token_accuracy": 0.7430172203224231, + "step": 2890 + }, + { + "epoch": 2.1100182149362476, + "grad_norm": 0.35035115480422974, + "learning_rate": 2e-05, + "loss": 0.909, + "mean_token_accuracy": 0.7358333581223833, + "step": 2895 + }, + { + "epoch": 2.1136612021857926, + "grad_norm": 0.32378336787223816, + "learning_rate": 2e-05, + "loss": 0.9167, + "mean_token_accuracy": 0.7358481924767953, + "step": 2900 + }, + { + "epoch": 2.1136612021857926, + "eval_loss": 0.9846732020378113, + "eval_mean_token_accuracy": 0.7199527994124922, + "eval_runtime": 14.3227, + "eval_samples_per_second": 18.293, + "eval_steps_per_second": 1.187, + "step": 2900 + }, + { + "epoch": 2.117304189435337, + "grad_norm": 0.3474068343639374, + "learning_rate": 2e-05, + "loss": 0.8941, + "mean_token_accuracy": 0.7412768685881779, + "step": 2905 + }, + { + "epoch": 2.1209471766848815, + "grad_norm": 0.3359163999557495, + "learning_rate": 2e-05, + "loss": 0.9217, + "mean_token_accuracy": 0.7339460185637519, + "step": 2910 + }, + { + "epoch": 2.1245901639344265, + "grad_norm": 0.33082684874534607, + "learning_rate": 2e-05, + "loss": 0.879, + "mean_token_accuracy": 0.7462780898876404, + "step": 2915 + }, + { + "epoch": 2.128233151183971, + "grad_norm": 0.3506236672401428, + "learning_rate": 2e-05, + "loss": 0.8999, + "mean_token_accuracy": 0.7397227650219833, + "step": 2920 + }, + { + "epoch": 2.1318761384335154, + "grad_norm": 0.3336649239063263, + "learning_rate": 2e-05, + "loss": 0.8984, + "mean_token_accuracy": 0.7394846116267708, + "step": 2925 + }, + { + "epoch": 2.13551912568306, + "grad_norm": 0.3224109709262848, + "learning_rate": 2e-05, + "loss": 0.9222, + "mean_token_accuracy": 0.7358970444553004, + "step": 2930 + }, + { + "epoch": 2.139162112932605, + "grad_norm": 0.3256509602069855, + "learning_rate": 2e-05, + "loss": 0.8949, + "mean_token_accuracy": 0.7407028578407425, + "step": 2935 + }, + { + "epoch": 2.1428051001821493, + "grad_norm": 0.3140164911746979, + "learning_rate": 2e-05, + "loss": 0.9041, + "mean_token_accuracy": 0.7382724719101122, + "step": 2940 + }, + { + "epoch": 2.146448087431694, + "grad_norm": 0.3872627019882202, + "learning_rate": 2e-05, + "loss": 0.9106, + "mean_token_accuracy": 0.736156570591109, + "step": 2945 + }, + { + "epoch": 2.1500910746812387, + "grad_norm": 0.332190603017807, + "learning_rate": 2e-05, + "loss": 0.9018, + "mean_token_accuracy": 0.7400036638983879, + "step": 2950 + }, + { + "epoch": 2.153734061930783, + "grad_norm": 0.3359198570251465, + "learning_rate": 2e-05, + "loss": 0.9245, + "mean_token_accuracy": 0.7332101856375183, + "step": 2955 + }, + { + "epoch": 2.1573770491803277, + "grad_norm": 0.33812880516052246, + "learning_rate": 2e-05, + "loss": 0.888, + "mean_token_accuracy": 0.7440675378602833, + "step": 2960 + }, + { + "epoch": 2.1610200364298726, + "grad_norm": 0.34355029463768005, + "learning_rate": 2e-05, + "loss": 0.9231, + "mean_token_accuracy": 0.7337109184171959, + "step": 2965 + }, + { + "epoch": 2.164663023679417, + "grad_norm": 0.3436281979084015, + "learning_rate": 2e-05, + "loss": 0.8767, + "mean_token_accuracy": 0.7456704934049829, + "step": 2970 + }, + { + "epoch": 2.1683060109289616, + "grad_norm": 0.321349561214447, + "learning_rate": 2e-05, + "loss": 0.8905, + "mean_token_accuracy": 0.7416035661944308, + "step": 2975 + }, + { + "epoch": 2.1719489981785065, + "grad_norm": 0.33591070771217346, + "learning_rate": 2e-05, + "loss": 0.9287, + "mean_token_accuracy": 0.7313660234489496, + "step": 2980 + }, + { + "epoch": 2.175591985428051, + "grad_norm": 0.3216773271560669, + "learning_rate": 2e-05, + "loss": 0.893, + "mean_token_accuracy": 0.7406540058622374, + "step": 2985 + }, + { + "epoch": 2.1792349726775955, + "grad_norm": 0.3402141332626343, + "learning_rate": 2e-05, + "loss": 0.9236, + "mean_token_accuracy": 0.7342177576941865, + "step": 2990 + }, + { + "epoch": 2.1828779599271404, + "grad_norm": 0.3489401936531067, + "learning_rate": 2e-05, + "loss": 0.9054, + "mean_token_accuracy": 0.7382938446507085, + "step": 2995 + }, + { + "epoch": 2.186520947176685, + "grad_norm": 0.32912924885749817, + "learning_rate": 2e-05, + "loss": 0.8798, + "mean_token_accuracy": 0.7457101856375183, + "step": 3000 + }, + { + "epoch": 2.186520947176685, + "eval_loss": 0.982496440410614, + "eval_mean_token_accuracy": 0.7202835672208421, + "eval_runtime": 14.2889, + "eval_samples_per_second": 18.336, + "eval_steps_per_second": 1.19, + "step": 3000 + }, + { + "epoch": 2.1901639344262294, + "grad_norm": 0.3348352909088135, + "learning_rate": 2e-05, + "loss": 0.9096, + "mean_token_accuracy": 0.7373412310698583, + "step": 3005 + }, + { + "epoch": 2.1938069216757743, + "grad_norm": 0.32950860261917114, + "learning_rate": 2e-05, + "loss": 0.8937, + "mean_token_accuracy": 0.7408616267708842, + "step": 3010 + }, + { + "epoch": 2.1974499089253188, + "grad_norm": 0.2998691201210022, + "learning_rate": 2e-05, + "loss": 0.8911, + "mean_token_accuracy": 0.7423943575964826, + "step": 3015 + }, + { + "epoch": 2.2010928961748633, + "grad_norm": 0.32780349254608154, + "learning_rate": 2e-05, + "loss": 0.9153, + "mean_token_accuracy": 0.735530654616512, + "step": 3020 + }, + { + "epoch": 2.204735883424408, + "grad_norm": 0.31766557693481445, + "learning_rate": 2e-05, + "loss": 0.9022, + "mean_token_accuracy": 0.7394968246213971, + "step": 3025 + }, + { + "epoch": 2.2083788706739527, + "grad_norm": 0.32309776544570923, + "learning_rate": 2e-05, + "loss": 0.8915, + "mean_token_accuracy": 0.7415302882266731, + "step": 3030 + }, + { + "epoch": 2.212021857923497, + "grad_norm": 0.319327712059021, + "learning_rate": 2e-05, + "loss": 0.9192, + "mean_token_accuracy": 0.734413165608207, + "step": 3035 + }, + { + "epoch": 2.215664845173042, + "grad_norm": 0.30570122599601746, + "learning_rate": 2e-05, + "loss": 0.8889, + "mean_token_accuracy": 0.7435271128480703, + "step": 3040 + }, + { + "epoch": 2.2193078324225866, + "grad_norm": 0.30299875140190125, + "learning_rate": 2e-05, + "loss": 0.9105, + "mean_token_accuracy": 0.7357810210063506, + "step": 3045 + }, + { + "epoch": 2.222950819672131, + "grad_norm": 0.3085125982761383, + "learning_rate": 2e-05, + "loss": 0.9112, + "mean_token_accuracy": 0.7353413531998043, + "step": 3050 + }, + { + "epoch": 2.226593806921676, + "grad_norm": 0.32464104890823364, + "learning_rate": 2e-05, + "loss": 0.8885, + "mean_token_accuracy": 0.7432745613269548, + "step": 3055 + }, + { + "epoch": 2.2302367941712204, + "grad_norm": 0.309283047914505, + "learning_rate": 2e-05, + "loss": 0.8881, + "mean_token_accuracy": 0.7437042012701514, + "step": 3060 + }, + { + "epoch": 2.233879781420765, + "grad_norm": 0.3193177282810211, + "learning_rate": 2e-05, + "loss": 0.9189, + "mean_token_accuracy": 0.7337139716658526, + "step": 3065 + }, + { + "epoch": 2.23752276867031, + "grad_norm": 0.31948578357696533, + "learning_rate": 2e-05, + "loss": 0.9111, + "mean_token_accuracy": 0.7373259648265754, + "step": 3070 + }, + { + "epoch": 2.2411657559198543, + "grad_norm": 0.3639989495277405, + "learning_rate": 2e-05, + "loss": 0.9098, + "mean_token_accuracy": 0.7367061553492917, + "step": 3075 + }, + { + "epoch": 2.244808743169399, + "grad_norm": 0.31007903814315796, + "learning_rate": 2e-05, + "loss": 0.9083, + "mean_token_accuracy": 0.737045065950171, + "step": 3080 + }, + { + "epoch": 2.2484517304189433, + "grad_norm": 0.34219416975975037, + "learning_rate": 2e-05, + "loss": 0.9011, + "mean_token_accuracy": 0.737329018075232, + "step": 3085 + }, + { + "epoch": 2.2520947176684882, + "grad_norm": 0.3155016005039215, + "learning_rate": 2e-05, + "loss": 0.886, + "mean_token_accuracy": 0.7428920371275037, + "step": 3090 + }, + { + "epoch": 2.2557377049180327, + "grad_norm": 0.34369996190071106, + "learning_rate": 2e-05, + "loss": 0.8909, + "mean_token_accuracy": 0.7413593063019053, + "step": 3095 + }, + { + "epoch": 2.2593806921675776, + "grad_norm": 0.3238091468811035, + "learning_rate": 2e-05, + "loss": 0.9196, + "mean_token_accuracy": 0.7337597703957011, + "step": 3100 + }, + { + "epoch": 2.2593806921675776, + "eval_loss": 0.9811844229698181, + "eval_mean_token_accuracy": 0.7203799509057488, + "eval_runtime": 14.3438, + "eval_samples_per_second": 18.266, + "eval_steps_per_second": 1.185, + "step": 3100 + }, + { + "epoch": 2.263023679417122, + "grad_norm": 0.34170106053352356, + "learning_rate": 2e-05, + "loss": 0.9115, + "mean_token_accuracy": 0.736837445041524, + "step": 3105 + }, + { + "epoch": 2.2666666666666666, + "grad_norm": 0.34322506189346313, + "learning_rate": 2e-05, + "loss": 0.9017, + "mean_token_accuracy": 0.738431240840254, + "step": 3110 + }, + { + "epoch": 2.270309653916211, + "grad_norm": 0.3346925973892212, + "learning_rate": 2e-05, + "loss": 0.89, + "mean_token_accuracy": 0.7419211040547142, + "step": 3115 + }, + { + "epoch": 2.273952641165756, + "grad_norm": 0.30860498547554016, + "learning_rate": 2e-05, + "loss": 0.8928, + "mean_token_accuracy": 0.7410234489496824, + "step": 3120 + }, + { + "epoch": 2.2775956284153005, + "grad_norm": 0.34893983602523804, + "learning_rate": 2e-05, + "loss": 0.8988, + "mean_token_accuracy": 0.7408524670249145, + "step": 3125 + }, + { + "epoch": 2.281238615664845, + "grad_norm": 0.33216506242752075, + "learning_rate": 2e-05, + "loss": 0.8947, + "mean_token_accuracy": 0.7412585490962382, + "step": 3130 + }, + { + "epoch": 2.28488160291439, + "grad_norm": 0.34543153643608093, + "learning_rate": 2e-05, + "loss": 0.9253, + "mean_token_accuracy": 0.7330850024425989, + "step": 3135 + }, + { + "epoch": 2.2885245901639344, + "grad_norm": 0.31388479471206665, + "learning_rate": 2e-05, + "loss": 0.9089, + "mean_token_accuracy": 0.7361382510991696, + "step": 3140 + }, + { + "epoch": 2.292167577413479, + "grad_norm": 0.3121238350868225, + "learning_rate": 2e-05, + "loss": 0.9108, + "mean_token_accuracy": 0.7372282608695653, + "step": 3145 + }, + { + "epoch": 2.295810564663024, + "grad_norm": 0.33222267031669617, + "learning_rate": 2e-05, + "loss": 0.9163, + "mean_token_accuracy": 0.7346329995114803, + "step": 3150 + }, + { + "epoch": 2.2994535519125683, + "grad_norm": 0.3138783872127533, + "learning_rate": 2e-05, + "loss": 0.8981, + "mean_token_accuracy": 0.7387548851978505, + "step": 3155 + }, + { + "epoch": 2.3030965391621128, + "grad_norm": 0.3232060372829437, + "learning_rate": 2e-05, + "loss": 0.8933, + "mean_token_accuracy": 0.7417134831460673, + "step": 3160 + }, + { + "epoch": 2.3067395264116577, + "grad_norm": 0.3669489920139313, + "learning_rate": 2e-05, + "loss": 0.9146, + "mean_token_accuracy": 0.7371519296531508, + "step": 3165 + }, + { + "epoch": 2.310382513661202, + "grad_norm": 0.31355297565460205, + "learning_rate": 2e-05, + "loss": 0.9092, + "mean_token_accuracy": 0.7390754763067905, + "step": 3170 + }, + { + "epoch": 2.3140255009107467, + "grad_norm": 0.3025984764099121, + "learning_rate": 2e-05, + "loss": 0.8852, + "mean_token_accuracy": 0.7443026380068392, + "step": 3175 + }, + { + "epoch": 2.3176684881602916, + "grad_norm": 0.3665474057197571, + "learning_rate": 2e-05, + "loss": 0.9014, + "mean_token_accuracy": 0.7385075720566683, + "step": 3180 + }, + { + "epoch": 2.321311475409836, + "grad_norm": 0.33963456749916077, + "learning_rate": 2e-05, + "loss": 0.8944, + "mean_token_accuracy": 0.7402723497801661, + "step": 3185 + }, + { + "epoch": 2.3249544626593805, + "grad_norm": 0.3448450565338135, + "learning_rate": 2e-05, + "loss": 0.9042, + "mean_token_accuracy": 0.7382358329262335, + "step": 3190 + }, + { + "epoch": 2.3285974499089255, + "grad_norm": 0.32416313886642456, + "learning_rate": 2e-05, + "loss": 0.9044, + "mean_token_accuracy": 0.7392951526599021, + "step": 3195 + }, + { + "epoch": 2.33224043715847, + "grad_norm": 0.32076016068458557, + "learning_rate": 2e-05, + "loss": 0.899, + "mean_token_accuracy": 0.7407059110893991, + "step": 3200 + }, + { + "epoch": 2.33224043715847, + "eval_loss": 0.9800810813903809, + "eval_mean_token_accuracy": 0.7205252333805446, + "eval_runtime": 14.3063, + "eval_samples_per_second": 18.314, + "eval_steps_per_second": 1.188, + "step": 3200 + }, + { + "epoch": 2.3358834244080144, + "grad_norm": 0.31426194310188293, + "learning_rate": 2e-05, + "loss": 0.9071, + "mean_token_accuracy": 0.7373351245725452, + "step": 3205 + }, + { + "epoch": 2.3395264116575594, + "grad_norm": 0.3155065178871155, + "learning_rate": 2e-05, + "loss": 0.9108, + "mean_token_accuracy": 0.7373076453346361, + "step": 3210 + }, + { + "epoch": 2.343169398907104, + "grad_norm": 0.32219287753105164, + "learning_rate": 2e-05, + "loss": 0.9271, + "mean_token_accuracy": 0.7313629702002931, + "step": 3215 + }, + { + "epoch": 2.3468123861566483, + "grad_norm": 0.3496605455875397, + "learning_rate": 2e-05, + "loss": 0.8893, + "mean_token_accuracy": 0.7418984967469944, + "step": 3220 + }, + { + "epoch": 2.3504553734061933, + "grad_norm": 0.31775274872779846, + "learning_rate": 2e-05, + "loss": 0.9079, + "mean_token_accuracy": 0.7362817537860283, + "step": 3225 + }, + { + "epoch": 2.3540983606557377, + "grad_norm": 0.3471783399581909, + "learning_rate": 2e-05, + "loss": 0.8733, + "mean_token_accuracy": 0.7464673913043477, + "step": 3230 + }, + { + "epoch": 2.3577413479052822, + "grad_norm": 0.3281533420085907, + "learning_rate": 2e-05, + "loss": 0.8887, + "mean_token_accuracy": 0.7437988519785051, + "step": 3235 + }, + { + "epoch": 2.361384335154827, + "grad_norm": 0.31612518429756165, + "learning_rate": 2e-05, + "loss": 0.9116, + "mean_token_accuracy": 0.7362176355642404, + "step": 3240 + }, + { + "epoch": 2.3650273224043716, + "grad_norm": 0.3653080463409424, + "learning_rate": 2e-05, + "loss": 0.9029, + "mean_token_accuracy": 0.7385961162677088, + "step": 3245 + }, + { + "epoch": 2.368670309653916, + "grad_norm": 0.33496755361557007, + "learning_rate": 2e-05, + "loss": 0.9137, + "mean_token_accuracy": 0.7352253297508549, + "step": 3250 + }, + { + "epoch": 2.372313296903461, + "grad_norm": 0.3244491219520569, + "learning_rate": 2e-05, + "loss": 0.9206, + "mean_token_accuracy": 0.734910845139228, + "step": 3255 + }, + { + "epoch": 2.3759562841530055, + "grad_norm": 0.31501951813697815, + "learning_rate": 2e-05, + "loss": 0.8909, + "mean_token_accuracy": 0.7428553981436248, + "step": 3260 + }, + { + "epoch": 2.37959927140255, + "grad_norm": 0.3064330518245697, + "learning_rate": 2e-05, + "loss": 0.9008, + "mean_token_accuracy": 0.7384281875915975, + "step": 3265 + }, + { + "epoch": 2.3832422586520945, + "grad_norm": 0.31080517172813416, + "learning_rate": 2e-05, + "loss": 0.8998, + "mean_token_accuracy": 0.7391243282852955, + "step": 3270 + }, + { + "epoch": 2.3868852459016394, + "grad_norm": 0.3312147855758667, + "learning_rate": 2e-05, + "loss": 0.8757, + "mean_token_accuracy": 0.7458524138337579, + "step": 3275 + }, + { + "epoch": 2.390528233151184, + "grad_norm": 0.3071138858795166, + "learning_rate": 2e-05, + "loss": 0.908, + "mean_token_accuracy": 0.7380068392769907, + "step": 3280 + }, + { + "epoch": 2.394171220400729, + "grad_norm": 0.3064194619655609, + "learning_rate": 2e-05, + "loss": 0.8614, + "mean_token_accuracy": 0.7494107230092817, + "step": 3285 + }, + { + "epoch": 2.3978142076502733, + "grad_norm": 0.30981016159057617, + "learning_rate": 2e-05, + "loss": 0.8708, + "mean_token_accuracy": 0.7473253541768441, + "step": 3290 + }, + { + "epoch": 2.401457194899818, + "grad_norm": 0.30898359417915344, + "learning_rate": 2e-05, + "loss": 0.8938, + "mean_token_accuracy": 0.7413928920371275, + "step": 3295 + }, + { + "epoch": 2.4051001821493623, + "grad_norm": 0.3338389992713928, + "learning_rate": 2e-05, + "loss": 0.8809, + "mean_token_accuracy": 0.74310881778212, + "step": 3300 + }, + { + "epoch": 2.4051001821493623, + "eval_loss": 0.9772781729698181, + "eval_mean_token_accuracy": 0.7212848507889992, + "eval_runtime": 14.2945, + "eval_samples_per_second": 18.329, + "eval_steps_per_second": 1.189, + "step": 3300 + }, + { + "epoch": 2.408743169398907, + "grad_norm": 0.33642616868019104, + "learning_rate": 2e-05, + "loss": 0.8856, + "mean_token_accuracy": 0.743920981924768, + "step": 3305 + }, + { + "epoch": 2.4123861566484517, + "grad_norm": 0.33066093921661377, + "learning_rate": 2e-05, + "loss": 0.8851, + "mean_token_accuracy": 0.7444217147044455, + "step": 3310 + }, + { + "epoch": 2.416029143897996, + "grad_norm": 0.3612671196460724, + "learning_rate": 2e-05, + "loss": 0.9075, + "mean_token_accuracy": 0.7383610161211529, + "step": 3315 + }, + { + "epoch": 2.419672131147541, + "grad_norm": 0.3145372271537781, + "learning_rate": 2e-05, + "loss": 0.9059, + "mean_token_accuracy": 0.7378511235955056, + "step": 3320 + }, + { + "epoch": 2.4233151183970856, + "grad_norm": 0.31871461868286133, + "learning_rate": 2e-05, + "loss": 0.9106, + "mean_token_accuracy": 0.7359000977039571, + "step": 3325 + }, + { + "epoch": 2.42695810564663, + "grad_norm": 0.3099087178707123, + "learning_rate": 2e-05, + "loss": 0.9293, + "mean_token_accuracy": 0.7312988519785052, + "step": 3330 + }, + { + "epoch": 2.430601092896175, + "grad_norm": 0.32242926955223083, + "learning_rate": 2e-05, + "loss": 0.9075, + "mean_token_accuracy": 0.737692354665364, + "step": 3335 + }, + { + "epoch": 2.4342440801457195, + "grad_norm": 0.3814336955547333, + "learning_rate": 2e-05, + "loss": 0.911, + "mean_token_accuracy": 0.7348680996580361, + "step": 3340 + }, + { + "epoch": 2.437887067395264, + "grad_norm": 0.332454115152359, + "learning_rate": 2e-05, + "loss": 0.8897, + "mean_token_accuracy": 0.7413684660478751, + "step": 3345 + }, + { + "epoch": 2.441530054644809, + "grad_norm": 0.3682723045349121, + "learning_rate": 2e-05, + "loss": 0.929, + "mean_token_accuracy": 0.7310484855886664, + "step": 3350 + }, + { + "epoch": 2.4451730418943534, + "grad_norm": 0.34098002314567566, + "learning_rate": 2e-05, + "loss": 0.8978, + "mean_token_accuracy": 0.7393344319677048, + "step": 3355 + }, + { + "epoch": 2.448816029143898, + "grad_norm": 0.3256863057613373, + "learning_rate": 2e-05, + "loss": 0.9005, + "mean_token_accuracy": 0.739646433805569, + "step": 3360 + }, + { + "epoch": 2.4524590163934428, + "grad_norm": 0.3080803155899048, + "learning_rate": 2e-05, + "loss": 0.9018, + "mean_token_accuracy": 0.7381930874450416, + "step": 3365 + }, + { + "epoch": 2.4561020036429873, + "grad_norm": 0.3441084623336792, + "learning_rate": 2e-05, + "loss": 0.8865, + "mean_token_accuracy": 0.7436248168050805, + "step": 3370 + }, + { + "epoch": 2.4597449908925317, + "grad_norm": 0.30771470069885254, + "learning_rate": 2e-05, + "loss": 0.8927, + "mean_token_accuracy": 0.7419821690278456, + "step": 3375 + }, + { + "epoch": 2.4633879781420767, + "grad_norm": 0.36730024218559265, + "learning_rate": 2e-05, + "loss": 0.9168, + "mean_token_accuracy": 0.736474364845075, + "step": 3380 + }, + { + "epoch": 2.467030965391621, + "grad_norm": 0.33307135105133057, + "learning_rate": 2e-05, + "loss": 0.9013, + "mean_token_accuracy": 0.7384861993160723, + "step": 3385 + }, + { + "epoch": 2.4706739526411656, + "grad_norm": 0.3176257014274597, + "learning_rate": 2e-05, + "loss": 0.8822, + "mean_token_accuracy": 0.7452796775769418, + "step": 3390 + }, + { + "epoch": 2.4743169398907106, + "grad_norm": 0.3291166126728058, + "learning_rate": 2e-05, + "loss": 0.9069, + "mean_token_accuracy": 0.7390083048363458, + "step": 3395 + }, + { + "epoch": 2.477959927140255, + "grad_norm": 0.3459428548812866, + "learning_rate": 2e-05, + "loss": 0.9279, + "mean_token_accuracy": 0.7312042012701515, + "step": 3400 + }, + { + "epoch": 2.477959927140255, + "eval_loss": 0.9747435450553894, + "eval_mean_token_accuracy": 0.7213943106131382, + "eval_runtime": 14.2613, + "eval_samples_per_second": 18.371, + "eval_steps_per_second": 1.192, + "step": 3400 + }, + { + "epoch": 2.4816029143897995, + "grad_norm": 0.30401602387428284, + "learning_rate": 2e-05, + "loss": 0.8715, + "mean_token_accuracy": 0.746794088910601, + "step": 3405 + }, + { + "epoch": 2.4852459016393444, + "grad_norm": 0.320230096578598, + "learning_rate": 2e-05, + "loss": 0.8973, + "mean_token_accuracy": 0.7395816801558677, + "step": 3410 + }, + { + "epoch": 2.488888888888889, + "grad_norm": 0.32111015915870667, + "learning_rate": 2e-05, + "loss": 0.8763, + "mean_token_accuracy": 0.7474658036150463, + "step": 3415 + }, + { + "epoch": 2.4925318761384334, + "grad_norm": 0.31403398513793945, + "learning_rate": 2e-05, + "loss": 0.8985, + "mean_token_accuracy": 0.7403731069858328, + "step": 3420 + }, + { + "epoch": 2.496174863387978, + "grad_norm": 0.32882779836654663, + "learning_rate": 2e-05, + "loss": 0.8912, + "mean_token_accuracy": 0.7424951148021495, + "step": 3425 + }, + { + "epoch": 2.499817850637523, + "grad_norm": 0.3540889620780945, + "learning_rate": 2e-05, + "loss": 0.8965, + "mean_token_accuracy": 0.7381808744504154, + "step": 3430 + }, + { + "epoch": 2.5034608378870673, + "grad_norm": 0.31427791714668274, + "learning_rate": 2e-05, + "loss": 0.888, + "mean_token_accuracy": 0.7435668050806058, + "step": 3435 + }, + { + "epoch": 2.5071038251366122, + "grad_norm": 0.32461288571357727, + "learning_rate": 2e-05, + "loss": 0.9023, + "mean_token_accuracy": 0.7397227650219834, + "step": 3440 + }, + { + "epoch": 2.5107468123861567, + "grad_norm": 0.31609442830085754, + "learning_rate": 2e-05, + "loss": 0.9039, + "mean_token_accuracy": 0.7388678553981436, + "step": 3445 + }, + { + "epoch": 2.514389799635701, + "grad_norm": 0.34672296047210693, + "learning_rate": 2e-05, + "loss": 0.8981, + "mean_token_accuracy": 0.7388814035812061, + "step": 3450 + }, + { + "epoch": 2.5180327868852457, + "grad_norm": 0.363460510969162, + "learning_rate": 2e-05, + "loss": 0.8908, + "mean_token_accuracy": 0.7427638006839276, + "step": 3455 + }, + { + "epoch": 2.5216757741347906, + "grad_norm": 0.32321321964263916, + "learning_rate": 2e-05, + "loss": 0.8851, + "mean_token_accuracy": 0.7424738649440817, + "step": 3460 + }, + { + "epoch": 2.525318761384335, + "grad_norm": 0.30429279804229736, + "learning_rate": 2e-05, + "loss": 0.9026, + "mean_token_accuracy": 0.7386907669760625, + "step": 3465 + }, + { + "epoch": 2.52896174863388, + "grad_norm": 0.3118142783641815, + "learning_rate": 2e-05, + "loss": 0.8841, + "mean_token_accuracy": 0.7432004152418175, + "step": 3470 + }, + { + "epoch": 2.5326047358834245, + "grad_norm": 0.34448346495628357, + "learning_rate": 2e-05, + "loss": 0.9038, + "mean_token_accuracy": 0.7381159228060409, + "step": 3475 + }, + { + "epoch": 2.536247723132969, + "grad_norm": 0.31645679473876953, + "learning_rate": 2e-05, + "loss": 0.8834, + "mean_token_accuracy": 0.7446140693698095, + "step": 3480 + }, + { + "epoch": 2.5398907103825135, + "grad_norm": 0.3462752401828766, + "learning_rate": 2e-05, + "loss": 0.9091, + "mean_token_accuracy": 0.7369626282364435, + "step": 3485 + }, + { + "epoch": 2.5435336976320584, + "grad_norm": 0.3447904586791992, + "learning_rate": 2e-05, + "loss": 0.8708, + "mean_token_accuracy": 0.7476581582804104, + "step": 3490 + }, + { + "epoch": 2.547176684881603, + "grad_norm": 0.34256285429000854, + "learning_rate": 2e-05, + "loss": 0.8867, + "mean_token_accuracy": 0.7426874694675135, + "step": 3495 + }, + { + "epoch": 2.550819672131148, + "grad_norm": 0.33325037360191345, + "learning_rate": 2e-05, + "loss": 0.9018, + "mean_token_accuracy": 0.7395792623351245, + "step": 3500 + }, + { + "epoch": 2.550819672131148, + "eval_loss": 0.973312258720398, + "eval_mean_token_accuracy": 0.7216909538776792, + "eval_runtime": 14.34, + "eval_samples_per_second": 18.271, + "eval_steps_per_second": 1.185, + "step": 3500 + }, + { + "epoch": 2.5544626593806923, + "grad_norm": 0.3150700032711029, + "learning_rate": 2e-05, + "loss": 0.865, + "mean_token_accuracy": 0.7495542256961406, + "step": 3505 + }, + { + "epoch": 2.5581056466302368, + "grad_norm": 0.31783822178840637, + "learning_rate": 2e-05, + "loss": 0.9029, + "mean_token_accuracy": 0.739710552027357, + "step": 3510 + }, + { + "epoch": 2.5617486338797812, + "grad_norm": 0.2993064224720001, + "learning_rate": 2e-05, + "loss": 0.8922, + "mean_token_accuracy": 0.7411700048851977, + "step": 3515 + }, + { + "epoch": 2.565391621129326, + "grad_norm": 0.3268680274486542, + "learning_rate": 2e-05, + "loss": 0.9183, + "mean_token_accuracy": 0.7340380082411782, + "step": 3520 + }, + { + "epoch": 2.5690346083788707, + "grad_norm": 0.313725084066391, + "learning_rate": 2e-05, + "loss": 0.9055, + "mean_token_accuracy": 0.7373168050806058, + "step": 3525 + }, + { + "epoch": 2.572677595628415, + "grad_norm": 0.31111860275268555, + "learning_rate": 2e-05, + "loss": 0.8665, + "mean_token_accuracy": 0.7484214704445529, + "step": 3530 + }, + { + "epoch": 2.57632058287796, + "grad_norm": 0.36401307582855225, + "learning_rate": 2e-05, + "loss": 0.9009, + "mean_token_accuracy": 0.7393563751831947, + "step": 3535 + }, + { + "epoch": 2.5799635701275045, + "grad_norm": 0.32261475920677185, + "learning_rate": 2e-05, + "loss": 0.904, + "mean_token_accuracy": 0.7369870542256962, + "step": 3540 + }, + { + "epoch": 2.583606557377049, + "grad_norm": 0.3348728120326996, + "learning_rate": 2e-05, + "loss": 0.8999, + "mean_token_accuracy": 0.7393777479237909, + "step": 3545 + }, + { + "epoch": 2.587249544626594, + "grad_norm": 0.3013302683830261, + "learning_rate": 2e-05, + "loss": 0.8864, + "mean_token_accuracy": 0.7442293600390817, + "step": 3550 + }, + { + "epoch": 2.5908925318761384, + "grad_norm": 0.3235645890235901, + "learning_rate": 2e-05, + "loss": 0.8804, + "mean_token_accuracy": 0.7435545920859795, + "step": 3555 + }, + { + "epoch": 2.594535519125683, + "grad_norm": 0.30001094937324524, + "learning_rate": 2e-05, + "loss": 0.8755, + "mean_token_accuracy": 0.7467116511968734, + "step": 3560 + }, + { + "epoch": 2.598178506375228, + "grad_norm": 0.3245663642883301, + "learning_rate": 2e-05, + "loss": 0.9012, + "mean_token_accuracy": 0.7391304347826086, + "step": 3565 + }, + { + "epoch": 2.6018214936247723, + "grad_norm": 0.30933406949043274, + "learning_rate": 2e-05, + "loss": 0.8758, + "mean_token_accuracy": 0.7474138983878847, + "step": 3570 + }, + { + "epoch": 2.605464480874317, + "grad_norm": 0.33006611466407776, + "learning_rate": 2e-05, + "loss": 0.9045, + "mean_token_accuracy": 0.7370572789447973, + "step": 3575 + }, + { + "epoch": 2.6091074681238613, + "grad_norm": 0.3191071152687073, + "learning_rate": 2e-05, + "loss": 0.8865, + "mean_token_accuracy": 0.7418264533463605, + "step": 3580 + }, + { + "epoch": 2.612750455373406, + "grad_norm": 0.3415972888469696, + "learning_rate": 2e-05, + "loss": 0.8861, + "mean_token_accuracy": 0.7413806790425013, + "step": 3585 + }, + { + "epoch": 2.6163934426229507, + "grad_norm": 0.3029073178768158, + "learning_rate": 2e-05, + "loss": 0.8917, + "mean_token_accuracy": 0.7411974841231072, + "step": 3590 + }, + { + "epoch": 2.6200364298724956, + "grad_norm": 0.325960636138916, + "learning_rate": 2e-05, + "loss": 0.9157, + "mean_token_accuracy": 0.7347215437225207, + "step": 3595 + }, + { + "epoch": 2.62367941712204, + "grad_norm": 0.3525567054748535, + "learning_rate": 2e-05, + "loss": 0.8991, + "mean_token_accuracy": 0.7409623839765509, + "step": 3600 + }, + { + "epoch": 2.62367941712204, + "eval_loss": 0.9719704389572144, + "eval_mean_token_accuracy": 0.7224160455018883, + "eval_runtime": 14.2805, + "eval_samples_per_second": 18.347, + "eval_steps_per_second": 1.19, + "step": 3600 + }, + { + "epoch": 2.6273224043715846, + "grad_norm": 0.3182205557823181, + "learning_rate": 2e-05, + "loss": 0.8875, + "mean_token_accuracy": 0.7440828041035662, + "step": 3605 + }, + { + "epoch": 2.630965391621129, + "grad_norm": 0.3188170790672302, + "learning_rate": 2e-05, + "loss": 0.8778, + "mean_token_accuracy": 0.7443850757205668, + "step": 3610 + }, + { + "epoch": 2.634608378870674, + "grad_norm": 0.320301353931427, + "learning_rate": 2e-05, + "loss": 0.8714, + "mean_token_accuracy": 0.7470810942843185, + "step": 3615 + }, + { + "epoch": 2.6382513661202185, + "grad_norm": 0.3600040078163147, + "learning_rate": 2e-05, + "loss": 0.9075, + "mean_token_accuracy": 0.7374511480214949, + "step": 3620 + }, + { + "epoch": 2.6418943533697634, + "grad_norm": 0.34417158365249634, + "learning_rate": 2e-05, + "loss": 0.8938, + "mean_token_accuracy": 0.743273693209575, + "step": 3625 + }, + { + "epoch": 2.645537340619308, + "grad_norm": 0.3184971213340759, + "learning_rate": 2e-05, + "loss": 0.884, + "mean_token_accuracy": 0.7432584269662921, + "step": 3630 + }, + { + "epoch": 2.6491803278688524, + "grad_norm": 0.34727537631988525, + "learning_rate": 2e-05, + "loss": 0.9198, + "mean_token_accuracy": 0.7346726917440157, + "step": 3635 + }, + { + "epoch": 2.652823315118397, + "grad_norm": 0.3686891496181488, + "learning_rate": 2e-05, + "loss": 0.8614, + "mean_token_accuracy": 0.7494320957498779, + "step": 3640 + }, + { + "epoch": 2.656466302367942, + "grad_norm": 0.3379385769367218, + "learning_rate": 2e-05, + "loss": 0.8768, + "mean_token_accuracy": 0.7454476062530533, + "step": 3645 + }, + { + "epoch": 2.6601092896174863, + "grad_norm": 0.3417653441429138, + "learning_rate": 2e-05, + "loss": 0.8937, + "mean_token_accuracy": 0.7402234978016609, + "step": 3650 + }, + { + "epoch": 2.663752276867031, + "grad_norm": 0.32817432284355164, + "learning_rate": 2e-05, + "loss": 0.8892, + "mean_token_accuracy": 0.7415791402051782, + "step": 3655 + }, + { + "epoch": 2.6673952641165757, + "grad_norm": 0.3513006865978241, + "learning_rate": 2e-05, + "loss": 0.8658, + "mean_token_accuracy": 0.7484397899364924, + "step": 3660 + }, + { + "epoch": 2.67103825136612, + "grad_norm": 0.36159011721611023, + "learning_rate": 2e-05, + "loss": 0.8869, + "mean_token_accuracy": 0.7432492672203225, + "step": 3665 + }, + { + "epoch": 2.6746812386156646, + "grad_norm": 0.3413192927837372, + "learning_rate": 2e-05, + "loss": 0.8909, + "mean_token_accuracy": 0.7409226917440156, + "step": 3670 + }, + { + "epoch": 2.6783242258652096, + "grad_norm": 0.30612418055534363, + "learning_rate": 2e-05, + "loss": 0.8743, + "mean_token_accuracy": 0.744739252564729, + "step": 3675 + }, + { + "epoch": 2.681967213114754, + "grad_norm": 0.3212524354457855, + "learning_rate": 2e-05, + "loss": 0.865, + "mean_token_accuracy": 0.7500091597459695, + "step": 3680 + }, + { + "epoch": 2.685610200364299, + "grad_norm": 0.3098811209201813, + "learning_rate": 2e-05, + "loss": 0.9011, + "mean_token_accuracy": 0.7383976551050317, + "step": 3685 + }, + { + "epoch": 2.6892531876138435, + "grad_norm": 0.32596051692962646, + "learning_rate": 2e-05, + "loss": 0.9077, + "mean_token_accuracy": 0.7350451880801172, + "step": 3690 + }, + { + "epoch": 2.692896174863388, + "grad_norm": 0.333390474319458, + "learning_rate": 2e-05, + "loss": 0.8947, + "mean_token_accuracy": 0.7402296042989742, + "step": 3695 + }, + { + "epoch": 2.6965391621129324, + "grad_norm": 0.3289414048194885, + "learning_rate": 2e-05, + "loss": 0.9052, + "mean_token_accuracy": 0.7383640693698095, + "step": 3700 + }, + { + "epoch": 2.6965391621129324, + "eval_loss": 0.970956563949585, + "eval_mean_token_accuracy": 0.722475216524625, + "eval_runtime": 14.3324, + "eval_samples_per_second": 18.28, + "eval_steps_per_second": 1.186, + "step": 3700 + }, + { + "epoch": 2.7001821493624774, + "grad_norm": 0.30651167035102844, + "learning_rate": 2e-05, + "loss": 0.885, + "mean_token_accuracy": 0.7432095749877871, + "step": 3705 + }, + { + "epoch": 2.703825136612022, + "grad_norm": 0.30320581793785095, + "learning_rate": 2e-05, + "loss": 0.8982, + "mean_token_accuracy": 0.7390021983390329, + "step": 3710 + }, + { + "epoch": 2.7074681238615663, + "grad_norm": 0.30784809589385986, + "learning_rate": 2e-05, + "loss": 0.9066, + "mean_token_accuracy": 0.7373412310698583, + "step": 3715 + }, + { + "epoch": 2.7111111111111112, + "grad_norm": 0.34543514251708984, + "learning_rate": 2e-05, + "loss": 0.9057, + "mean_token_accuracy": 0.7384190278456277, + "step": 3720 + }, + { + "epoch": 2.7147540983606557, + "grad_norm": 0.30098143219947815, + "learning_rate": 2e-05, + "loss": 0.877, + "mean_token_accuracy": 0.7448186370297997, + "step": 3725 + }, + { + "epoch": 2.7183970856102, + "grad_norm": 0.3111075460910797, + "learning_rate": 2e-05, + "loss": 0.8778, + "mean_token_accuracy": 0.7456033219345384, + "step": 3730 + }, + { + "epoch": 2.722040072859745, + "grad_norm": 0.316897988319397, + "learning_rate": 2e-05, + "loss": 0.8799, + "mean_token_accuracy": 0.7437561064973132, + "step": 3735 + }, + { + "epoch": 2.7256830601092896, + "grad_norm": 0.3311839699745178, + "learning_rate": 2e-05, + "loss": 0.8764, + "mean_token_accuracy": 0.7459161556975303, + "step": 3740 + }, + { + "epoch": 2.729326047358834, + "grad_norm": 0.31449511647224426, + "learning_rate": 2e-05, + "loss": 0.9195, + "mean_token_accuracy": 0.7339704445530045, + "step": 3745 + }, + { + "epoch": 2.732969034608379, + "grad_norm": 0.3076689541339874, + "learning_rate": 2e-05, + "loss": 0.9106, + "mean_token_accuracy": 0.7375, + "step": 3750 + }, + { + "epoch": 2.7366120218579235, + "grad_norm": 0.32780900597572327, + "learning_rate": 2e-05, + "loss": 0.8937, + "mean_token_accuracy": 0.7403578407425501, + "step": 3755 + }, + { + "epoch": 2.740255009107468, + "grad_norm": 0.33124321699142456, + "learning_rate": 2e-05, + "loss": 0.8945, + "mean_token_accuracy": 0.7399700781631655, + "step": 3760 + }, + { + "epoch": 2.7438979963570125, + "grad_norm": 0.3387695848941803, + "learning_rate": 2e-05, + "loss": 0.8765, + "mean_token_accuracy": 0.7466475329750856, + "step": 3765 + }, + { + "epoch": 2.7475409836065574, + "grad_norm": 0.3448026478290558, + "learning_rate": 2e-05, + "loss": 0.8911, + "mean_token_accuracy": 0.7415677923927882, + "step": 3770 + }, + { + "epoch": 2.751183970856102, + "grad_norm": 0.3187689781188965, + "learning_rate": 2e-05, + "loss": 0.8754, + "mean_token_accuracy": 0.745847581827064, + "step": 3775 + }, + { + "epoch": 2.754826958105647, + "grad_norm": 0.31393617391586304, + "learning_rate": 2e-05, + "loss": 0.9051, + "mean_token_accuracy": 0.7382816316560822, + "step": 3780 + }, + { + "epoch": 2.7584699453551913, + "grad_norm": 0.3261832594871521, + "learning_rate": 2e-05, + "loss": 0.8941, + "mean_token_accuracy": 0.7418172936003908, + "step": 3785 + }, + { + "epoch": 2.762112932604736, + "grad_norm": 0.30824825167655945, + "learning_rate": 2e-05, + "loss": 0.8949, + "mean_token_accuracy": 0.7418473941582479, + "step": 3790 + }, + { + "epoch": 2.7657559198542803, + "grad_norm": 0.3116508722305298, + "learning_rate": 2e-05, + "loss": 0.8786, + "mean_token_accuracy": 0.7441438690766975, + "step": 3795 + }, + { + "epoch": 2.769398907103825, + "grad_norm": 0.365998238325119, + "learning_rate": 2e-05, + "loss": 0.8915, + "mean_token_accuracy": 0.741508915486077, + "step": 3800 + }, + { + "epoch": 2.769398907103825, + "eval_loss": 0.9683325290679932, + "eval_mean_token_accuracy": 0.7231605948989424, + "eval_runtime": 14.2659, + "eval_samples_per_second": 18.366, + "eval_steps_per_second": 1.192, + "step": 3800 + }, + { + "epoch": 2.7730418943533697, + "grad_norm": 0.31595027446746826, + "learning_rate": 2e-05, + "loss": 0.9011, + "mean_token_accuracy": 0.7383213238886175, + "step": 3805 + }, + { + "epoch": 2.7766848816029146, + "grad_norm": 0.3480280041694641, + "learning_rate": 2e-05, + "loss": 0.8807, + "mean_token_accuracy": 0.744873595505618, + "step": 3810 + }, + { + "epoch": 2.780327868852459, + "grad_norm": 0.3077247440814972, + "learning_rate": 2e-05, + "loss": 0.8804, + "mean_token_accuracy": 0.7448308500244261, + "step": 3815 + }, + { + "epoch": 2.7839708561020036, + "grad_norm": 0.31197667121887207, + "learning_rate": 2e-05, + "loss": 0.8889, + "mean_token_accuracy": 0.7428950903761603, + "step": 3820 + }, + { + "epoch": 2.787613843351548, + "grad_norm": 0.30954769253730774, + "learning_rate": 2e-05, + "loss": 0.8974, + "mean_token_accuracy": 0.739206765999023, + "step": 3825 + }, + { + "epoch": 2.791256830601093, + "grad_norm": 0.30114755034446716, + "learning_rate": 2e-05, + "loss": 0.9126, + "mean_token_accuracy": 0.73676111382511, + "step": 3830 + }, + { + "epoch": 2.7948998178506375, + "grad_norm": 0.3321165442466736, + "learning_rate": 2e-05, + "loss": 0.8907, + "mean_token_accuracy": 0.7411974841231069, + "step": 3835 + }, + { + "epoch": 2.7985428051001824, + "grad_norm": 0.3215268552303314, + "learning_rate": 2e-05, + "loss": 0.8832, + "mean_token_accuracy": 0.7422920089136892, + "step": 3840 + }, + { + "epoch": 2.802185792349727, + "grad_norm": 0.3101475238800049, + "learning_rate": 2e-05, + "loss": 0.8845, + "mean_token_accuracy": 0.7431729360039082, + "step": 3845 + }, + { + "epoch": 2.8058287795992713, + "grad_norm": 0.30512532591819763, + "learning_rate": 2e-05, + "loss": 0.8797, + "mean_token_accuracy": 0.7461132144601857, + "step": 3850 + }, + { + "epoch": 2.809471766848816, + "grad_norm": 0.30885010957717896, + "learning_rate": 2e-05, + "loss": 0.8835, + "mean_token_accuracy": 0.7433317049340498, + "step": 3855 + }, + { + "epoch": 2.8131147540983608, + "grad_norm": 0.31740885972976685, + "learning_rate": 2e-05, + "loss": 0.9041, + "mean_token_accuracy": 0.7379488275525158, + "step": 3860 + }, + { + "epoch": 2.8167577413479052, + "grad_norm": 0.3246329426765442, + "learning_rate": 2e-05, + "loss": 0.8886, + "mean_token_accuracy": 0.7418539325842698, + "step": 3865 + }, + { + "epoch": 2.82040072859745, + "grad_norm": 0.3088524341583252, + "learning_rate": 2e-05, + "loss": 0.891, + "mean_token_accuracy": 0.7422966536394723, + "step": 3870 + }, + { + "epoch": 2.8240437158469947, + "grad_norm": 0.33513322472572327, + "learning_rate": 2e-05, + "loss": 0.9228, + "mean_token_accuracy": 0.732694186614558, + "step": 3875 + }, + { + "epoch": 2.827686703096539, + "grad_norm": 0.33118847012519836, + "learning_rate": 2e-05, + "loss": 0.8659, + "mean_token_accuracy": 0.7481375183194922, + "step": 3880 + }, + { + "epoch": 2.8313296903460836, + "grad_norm": 0.3234773576259613, + "learning_rate": 2e-05, + "loss": 0.9047, + "mean_token_accuracy": 0.7398235222276502, + "step": 3885 + }, + { + "epoch": 2.8349726775956285, + "grad_norm": 0.3078250586986542, + "learning_rate": 2e-05, + "loss": 0.8619, + "mean_token_accuracy": 0.7485436003908157, + "step": 3890 + }, + { + "epoch": 2.838615664845173, + "grad_norm": 0.3281988203525543, + "learning_rate": 2e-05, + "loss": 0.8847, + "mean_token_accuracy": 0.7424554225696142, + "step": 3895 + }, + { + "epoch": 2.8422586520947175, + "grad_norm": 0.3436889946460724, + "learning_rate": 2e-05, + "loss": 0.9215, + "mean_token_accuracy": 0.7336040547142159, + "step": 3900 + }, + { + "epoch": 2.8422586520947175, + "eval_loss": 0.9669012427330017, + "eval_mean_token_accuracy": 0.7233897867800676, + "eval_runtime": 14.2753, + "eval_samples_per_second": 18.353, + "eval_steps_per_second": 1.191, + "step": 3900 + }, + { + "epoch": 2.8459016393442624, + "grad_norm": 0.32411956787109375, + "learning_rate": 2e-05, + "loss": 0.9043, + "mean_token_accuracy": 0.7381014899853444, + "step": 3905 + }, + { + "epoch": 2.849544626593807, + "grad_norm": 0.293822318315506, + "learning_rate": 2e-05, + "loss": 0.8795, + "mean_token_accuracy": 0.745911700048852, + "step": 3910 + }, + { + "epoch": 2.8531876138433514, + "grad_norm": 0.31667500734329224, + "learning_rate": 2e-05, + "loss": 0.8954, + "mean_token_accuracy": 0.7399792379091353, + "step": 3915 + }, + { + "epoch": 2.8568306010928963, + "grad_norm": 0.29437676072120667, + "learning_rate": 2e-05, + "loss": 0.8796, + "mean_token_accuracy": 0.7442171470444553, + "step": 3920 + }, + { + "epoch": 2.860473588342441, + "grad_norm": 0.3104299306869507, + "learning_rate": 2e-05, + "loss": 0.8805, + "mean_token_accuracy": 0.7429195163654126, + "step": 3925 + }, + { + "epoch": 2.8641165755919853, + "grad_norm": 0.3084963858127594, + "learning_rate": 2e-05, + "loss": 0.8997, + "mean_token_accuracy": 0.7388525891548607, + "step": 3930 + }, + { + "epoch": 2.86775956284153, + "grad_norm": 0.3696286678314209, + "learning_rate": 2e-05, + "loss": 0.8896, + "mean_token_accuracy": 0.7407944553004396, + "step": 3935 + }, + { + "epoch": 2.8714025500910747, + "grad_norm": 0.30959826707839966, + "learning_rate": 2e-05, + "loss": 0.8934, + "mean_token_accuracy": 0.7408810524700733, + "step": 3940 + }, + { + "epoch": 2.875045537340619, + "grad_norm": 0.3243521451950073, + "learning_rate": 2e-05, + "loss": 0.8602, + "mean_token_accuracy": 0.7506778212017586, + "step": 3945 + }, + { + "epoch": 2.8786885245901637, + "grad_norm": 0.33037546277046204, + "learning_rate": 2e-05, + "loss": 0.8975, + "mean_token_accuracy": 0.7393472154372251, + "step": 3950 + }, + { + "epoch": 2.8823315118397086, + "grad_norm": 0.3091663718223572, + "learning_rate": 2e-05, + "loss": 0.8705, + "mean_token_accuracy": 0.7468124084025402, + "step": 3955 + }, + { + "epoch": 2.885974499089253, + "grad_norm": 0.32397231459617615, + "learning_rate": 2e-05, + "loss": 0.896, + "mean_token_accuracy": 0.7402326575476307, + "step": 3960 + }, + { + "epoch": 2.889617486338798, + "grad_norm": 0.34801802039146423, + "learning_rate": 2e-05, + "loss": 0.8859, + "mean_token_accuracy": 0.7428798241328776, + "step": 3965 + }, + { + "epoch": 2.8932604735883425, + "grad_norm": 0.33170145750045776, + "learning_rate": 2e-05, + "loss": 0.8748, + "mean_token_accuracy": 0.7472826086956522, + "step": 3970 + }, + { + "epoch": 2.896903460837887, + "grad_norm": 0.3254745602607727, + "learning_rate": 2e-05, + "loss": 0.8793, + "mean_token_accuracy": 0.7446232291157793, + "step": 3975 + }, + { + "epoch": 2.9005464480874315, + "grad_norm": 0.3225265145301819, + "learning_rate": 2e-05, + "loss": 0.8896, + "mean_token_accuracy": 0.7418783585735221, + "step": 3980 + }, + { + "epoch": 2.9041894353369764, + "grad_norm": 0.3272905945777893, + "learning_rate": 2e-05, + "loss": 0.8889, + "mean_token_accuracy": 0.7421134587200783, + "step": 3985 + }, + { + "epoch": 2.907832422586521, + "grad_norm": 0.34882068634033203, + "learning_rate": 2e-05, + "loss": 0.8992, + "mean_token_accuracy": 0.7385655837811432, + "step": 3990 + }, + { + "epoch": 2.911475409836066, + "grad_norm": 0.3148573040962219, + "learning_rate": 2e-05, + "loss": 0.8773, + "mean_token_accuracy": 0.7454720322423058, + "step": 3995 + }, + { + "epoch": 2.9151183970856103, + "grad_norm": 0.31475773453712463, + "learning_rate": 2e-05, + "loss": 0.9036, + "mean_token_accuracy": 0.7386785539814363, + "step": 4000 + }, + { + "epoch": 2.9151183970856103, + "eval_loss": 0.9656786918640137, + "eval_mean_token_accuracy": 0.723362347188535, + "eval_runtime": 14.2833, + "eval_samples_per_second": 18.343, + "eval_steps_per_second": 1.19, + "step": 4000 + }, + { + "epoch": 2.9187613843351548, + "grad_norm": 0.301011323928833, + "learning_rate": 2e-05, + "loss": 0.8715, + "mean_token_accuracy": 0.7476062530532486, + "step": 4005 + }, + { + "epoch": 2.9224043715846992, + "grad_norm": 0.3263828158378601, + "learning_rate": 2e-05, + "loss": 0.8848, + "mean_token_accuracy": 0.7429073033707867, + "step": 4010 + }, + { + "epoch": 2.926047358834244, + "grad_norm": 0.36500218510627747, + "learning_rate": 2e-05, + "loss": 0.8937, + "mean_token_accuracy": 0.7400250366389841, + "step": 4015 + }, + { + "epoch": 2.9296903460837886, + "grad_norm": 0.32235583662986755, + "learning_rate": 2e-05, + "loss": 0.8906, + "mean_token_accuracy": 0.7414417440156326, + "step": 4020 + }, + { + "epoch": 2.9333333333333336, + "grad_norm": 0.32114923000335693, + "learning_rate": 2e-05, + "loss": 0.8742, + "mean_token_accuracy": 0.7465345627747924, + "step": 4025 + }, + { + "epoch": 2.936976320582878, + "grad_norm": 0.30394190549850464, + "learning_rate": 2e-05, + "loss": 0.8815, + "mean_token_accuracy": 0.742946995603322, + "step": 4030 + }, + { + "epoch": 2.9406193078324225, + "grad_norm": 0.32967275381088257, + "learning_rate": 2e-05, + "loss": 0.8831, + "mean_token_accuracy": 0.7421653639472399, + "step": 4035 + }, + { + "epoch": 2.944262295081967, + "grad_norm": 0.312377005815506, + "learning_rate": 2e-05, + "loss": 0.8916, + "mean_token_accuracy": 0.7408097215437226, + "step": 4040 + }, + { + "epoch": 2.947905282331512, + "grad_norm": 0.3285759389400482, + "learning_rate": 2e-05, + "loss": 0.8649, + "mean_token_accuracy": 0.7494778944797267, + "step": 4045 + }, + { + "epoch": 2.9515482695810564, + "grad_norm": 0.320372611284256, + "learning_rate": 2e-05, + "loss": 0.8772, + "mean_token_accuracy": 0.7447942110405471, + "step": 4050 + }, + { + "epoch": 2.9551912568306014, + "grad_norm": 0.33457741141319275, + "learning_rate": 2e-05, + "loss": 0.8985, + "mean_token_accuracy": 0.7386907669760626, + "step": 4055 + }, + { + "epoch": 2.958834244080146, + "grad_norm": 0.3132217526435852, + "learning_rate": 2e-05, + "loss": 0.9028, + "mean_token_accuracy": 0.7372679531021007, + "step": 4060 + }, + { + "epoch": 2.9624772313296903, + "grad_norm": 0.3045112192630768, + "learning_rate": 2e-05, + "loss": 0.9002, + "mean_token_accuracy": 0.7387335124572545, + "step": 4065 + }, + { + "epoch": 2.966120218579235, + "grad_norm": 0.3133973479270935, + "learning_rate": 2e-05, + "loss": 0.8899, + "mean_token_accuracy": 0.7421562042012703, + "step": 4070 + }, + { + "epoch": 2.9697632058287797, + "grad_norm": 0.3029925227165222, + "learning_rate": 2e-05, + "loss": 0.8778, + "mean_token_accuracy": 0.7461406936980948, + "step": 4075 + }, + { + "epoch": 2.973406193078324, + "grad_norm": 0.35844218730926514, + "learning_rate": 2e-05, + "loss": 0.8842, + "mean_token_accuracy": 0.7436828285295553, + "step": 4080 + }, + { + "epoch": 2.9770491803278687, + "grad_norm": 0.32251685857772827, + "learning_rate": 2e-05, + "loss": 0.9023, + "mean_token_accuracy": 0.7384923058133854, + "step": 4085 + }, + { + "epoch": 2.9806921675774136, + "grad_norm": 0.31728076934814453, + "learning_rate": 2e-05, + "loss": 0.8827, + "mean_token_accuracy": 0.7427424279433319, + "step": 4090 + }, + { + "epoch": 2.984335154826958, + "grad_norm": 0.3193369507789612, + "learning_rate": 2e-05, + "loss": 0.8938, + "mean_token_accuracy": 0.7414539570102588, + "step": 4095 + }, + { + "epoch": 2.9879781420765026, + "grad_norm": 0.330759733915329, + "learning_rate": 2e-05, + "loss": 0.8971, + "mean_token_accuracy": 0.7398968001954079, + "step": 4100 + }, + { + "epoch": 2.9879781420765026, + "eval_loss": 0.9635317325592041, + "eval_mean_token_accuracy": 0.723665181019814, + "eval_runtime": 14.2498, + "eval_samples_per_second": 18.386, + "eval_steps_per_second": 1.193, + "step": 4100 + }, + { + "epoch": 2.9916211293260475, + "grad_norm": 0.31818100810050964, + "learning_rate": 2e-05, + "loss": 0.8821, + "mean_token_accuracy": 0.7423058133854422, + "step": 4105 + }, + { + "epoch": 2.995264116575592, + "grad_norm": 0.3011719882488251, + "learning_rate": 2e-05, + "loss": 0.8524, + "mean_token_accuracy": 0.7518441621885686, + "step": 4110 + }, + { + "epoch": 2.9989071038251365, + "grad_norm": 0.3335364758968353, + "learning_rate": 2e-05, + "loss": 0.9245, + "mean_token_accuracy": 0.7316560820713238, + "step": 4115 + }, + { + "epoch": 2.9996357012750456, + "mean_token_accuracy": 0.7347490229604299, + "step": 4116, + "total_flos": 0.0, + "train_loss": 0.984842965855427, + "train_runtime": 39952.7795, + "train_samples_per_second": 3.298, + "train_steps_per_second": 0.103 } ], "logging_steps": 5, - "max_steps": 1351, + "max_steps": 4116, "num_input_tokens_seen": 0, - "num_train_epochs": 1, + "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { @@ -2313,8 +6989,8 @@ "attributes": {} } }, - "total_flos": 76959556042752.0, - "train_batch_size": 4, + "total_flos": 0.0, + "train_batch_size": 2, "trial_name": null, "trial_params": null }