diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,6981 +1,2337 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 2.9996357012750456, + "epoch": 1.9981785063752278, "eval_steps": 100, - "global_step": 4116, + "global_step": 1372, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0036429872495446266, - "grad_norm": 1.083729863166809, - "learning_rate": 5.346061023357346e-06, - "loss": 1.5553, - "mean_token_accuracy": 0.6280868343917929, + "epoch": 0.007285974499089253, + "grad_norm": 1.0096142292022705, + "learning_rate": 6.532799061198535e-06, + "loss": 1.4658, + "mean_token_accuracy": 0.643508793356131, "step": 5 }, { - "epoch": 0.007285974499089253, - "grad_norm": 0.8888028264045715, - "learning_rate": 7.648484184147212e-06, - "loss": 1.5402, - "mean_token_accuracy": 0.632089643380557, + "epoch": 0.014571948998178506, + "grad_norm": 0.7324991226196289, + "learning_rate": 9.346322475460614e-06, + "loss": 1.4392, + "mean_token_accuracy": 0.6500748045920858, "step": 10 }, { - "epoch": 0.01092896174863388, - "grad_norm": 0.607347309589386, - "learning_rate": 8.995315394001158e-06, - "loss": 1.5004, - "mean_token_accuracy": 0.638629702002931, + "epoch": 0.02185792349726776, + "grad_norm": 0.4901340901851654, + "learning_rate": 1.0992128167704883e-05, + "loss": 1.4079, + "mean_token_accuracy": 0.6516214780158487, "step": 15 }, { - "epoch": 0.014571948998178506, - "grad_norm": 0.5297051668167114, - "learning_rate": 9.950907344937074e-06, - "loss": 1.4841, - "mean_token_accuracy": 0.6408677332681976, + "epoch": 0.029143897996357013, + "grad_norm": 0.5083264708518982, + "learning_rate": 1.215984588972269e-05, + "loss": 1.3607, + "mean_token_accuracy": 0.658668172936004, "step": 20 }, { - "epoch": 0.018214936247723135, - "grad_norm": 0.5317749381065369, - "learning_rate": 1.0692122046714693e-05, - "loss": 1.4407, - "mean_token_accuracy": 0.6413135075720566, + "epoch": 0.03642987249544627, + "grad_norm": 0.40738093852996826, + "learning_rate": 1.306559812239707e-05, + "loss": 1.2944, + "mean_token_accuracy": 0.6690186858817784, "step": 25 }, { - "epoch": 0.02185792349726776, - "grad_norm": 0.4751906096935272, - "learning_rate": 1.1297738554791023e-05, - "loss": 1.4098, - "mean_token_accuracy": 0.6488728414197817, + "epoch": 0.04371584699453552, + "grad_norm": 0.3954770267009735, + "learning_rate": 1.3805651581966963e-05, + "loss": 1.2738, + "mean_token_accuracy": 0.670574316072301, "step": 30 }, { - "epoch": 0.025500910746812388, - "grad_norm": 0.38953375816345215, - "learning_rate": 1.1809780016460197e-05, - "loss": 1.406, - "mean_token_accuracy": 0.647099413776258, + "epoch": 0.051001821493624776, + "grad_norm": 0.3460943102836609, + "learning_rate": 1.4431357866551495e-05, + "loss": 1.2281, + "mean_token_accuracy": 0.6792226428920372, "step": 35 }, { - "epoch": 0.029143897996357013, - "grad_norm": 0.39693206548690796, - "learning_rate": 1.2253330505726937e-05, - "loss": 1.3257, - "mean_token_accuracy": 0.6625793844650707, + "epoch": 0.058287795992714025, + "grad_norm": 0.3018144369125366, + "learning_rate": 1.4973369303984771e-05, + "loss": 1.2132, + "mean_token_accuracy": 0.6817354665363946, "step": 40 }, { - "epoch": 0.03278688524590164, - "grad_norm": 0.3601134717464447, - "learning_rate": 1.2644569764644967e-05, - "loss": 1.3285, - "mean_token_accuracy": 0.661306179775281, + "epoch": 0.06557377049180328, + "grad_norm": 0.26513800024986267, + "learning_rate": 1.545145727421123e-05, + "loss": 1.1888, + "mean_token_accuracy": 0.6858985710796286, "step": 45 }, { - "epoch": 0.03642987249544627, - "grad_norm": 0.36966654658317566, - "learning_rate": 1.2994545207504556e-05, - "loss": 1.2978, - "mean_token_accuracy": 0.6672416951636541, + "epoch": 0.07285974499089254, + "grad_norm": 0.24335457384586334, + "learning_rate": 1.5879121536659146e-05, + "loss": 1.1716, + "mean_token_accuracy": 0.6894952979970689, "step": 50 }, { - "epoch": 0.04007285974499089, - "grad_norm": 0.35046863555908203, - "learning_rate": 1.3311136505276628e-05, - "loss": 1.3147, - "mean_token_accuracy": 0.6625061064973132, + "epoch": 0.08014571948998178, + "grad_norm": 0.24771854281425476, + "learning_rate": 1.626599092027313e-05, + "loss": 1.1691, + "mean_token_accuracy": 0.688681607230093, "step": 55 }, { - "epoch": 0.04371584699453552, - "grad_norm": 0.32947227358818054, - "learning_rate": 1.3600161715580883e-05, - "loss": 1.2817, - "mean_token_accuracy": 0.6690461651196873, + "epoch": 0.08743169398907104, + "grad_norm": 0.25512585043907166, + "learning_rate": 1.661917499622904e-05, + "loss": 1.1528, + "mean_token_accuracy": 0.6918442926674404, "step": 60 }, { - "epoch": 0.04735883424408015, - "grad_norm": 0.321172833442688, - "learning_rate": 1.3866039135512111e-05, - "loss": 1.262, - "mean_token_accuracy": 0.6715070835368833, + "epoch": 0.0947176684881603, + "grad_norm": 0.24238301813602448, + "learning_rate": 1.694407285125386e-05, + "loss": 1.1547, + "mean_token_accuracy": 0.6903882936378324, "step": 65 }, { - "epoch": 0.051001821493624776, - "grad_norm": 0.35791364312171936, - "learning_rate": 1.411220317725006e-05, - "loss": 1.2613, - "mean_token_accuracy": 0.6724963361016121, + "epoch": 0.10200364298724955, + "grad_norm": 0.25091490149497986, + "learning_rate": 1.7244881280813573e-05, + "loss": 1.1366, + "mean_token_accuracy": 0.6941881411822179, "step": 70 }, { - "epoch": 0.0546448087431694, - "grad_norm": 0.33518293499946594, - "learning_rate": 1.4341376417358504e-05, - "loss": 1.2837, - "mean_token_accuracy": 0.6674829018075231, + "epoch": 0.1092896174863388, + "grad_norm": 0.235865980386734, + "learning_rate": 1.7524927228903416e-05, + "loss": 1.1182, + "mean_token_accuracy": 0.6983726184660476, "step": 75 }, { - "epoch": 0.058287795992714025, - "grad_norm": 0.3203108608722687, - "learning_rate": 1.45557536665168e-05, - "loss": 1.2284, - "mean_token_accuracy": 0.6794638495359062, + "epoch": 0.11657559198542805, + "grad_norm": 0.2224230170249939, + "learning_rate": 1.7786892718246845e-05, + "loss": 1.1276, + "mean_token_accuracy": 0.6956857596482655, "step": 80 }, { - "epoch": 0.061930783242258654, - "grad_norm": 0.3269725441932678, - "learning_rate": 1.4757130137920071e-05, - "loss": 1.2487, - "mean_token_accuracy": 0.6726276257938447, + "epoch": 0.12386156648451731, + "grad_norm": 0.24439820647239685, + "learning_rate": 1.8032971469982566e-05, + "loss": 1.1179, + "mean_token_accuracy": 0.6984550561797753, "step": 85 }, { - "epoch": 0.06557377049180328, - "grad_norm": 0.3066905736923218, - "learning_rate": 1.4946992925434831e-05, - "loss": 1.2204, - "mean_token_accuracy": 0.680077552515877, + "epoch": 0.13114754098360656, + "grad_norm": 0.22702665627002716, + "learning_rate": 1.826498068847331e-05, + "loss": 1.1293, + "mean_token_accuracy": 0.6938858695652173, "step": 90 }, { - "epoch": 0.0692167577413479, - "grad_norm": 0.3220832943916321, - "learning_rate": 1.5126587715666353e-05, - "loss": 1.2368, - "mean_token_accuracy": 0.6759861993160723, + "epoch": 0.1384335154826958, + "grad_norm": 0.24332156777381897, + "learning_rate": 1.8484442582360153e-05, + "loss": 1.117, + "mean_token_accuracy": 0.6968719467513433, "step": 95 }, { - "epoch": 0.07285974499089254, - "grad_norm": 0.3288683593273163, - "learning_rate": 1.5296968368294423e-05, - "loss": 1.2077, - "mean_token_accuracy": 0.6822423058133853, + "epoch": 0.14571948998178508, + "grad_norm": 0.240131676197052, + "learning_rate": 1.8692644950921228e-05, + "loss": 1.1094, + "mean_token_accuracy": 0.6984871152906692, "step": 100 }, { - "epoch": 0.07285974499089254, - "eval_loss": 1.236223816871643, - "eval_mean_token_accuracy": 0.6733261218644397, - "eval_runtime": 14.5336, - "eval_samples_per_second": 18.027, - "eval_steps_per_second": 1.17, + "epoch": 0.14571948998178508, + "eval_loss": 1.1161140203475952, + "eval_mean_token_accuracy": 0.6946953157815736, + "eval_runtime": 45.3066, + "eval_samples_per_second": 5.783, + "eval_steps_per_second": 0.375, "step": 100 }, { - "epoch": 0.07650273224043716, - "grad_norm": 0.3233710527420044, - "learning_rate": 1.5459034387104006e-05, - "loss": 1.2171, - "mean_token_accuracy": 0.6799829018075232, + "epoch": 0.15300546448087432, + "grad_norm": 0.24882008135318756, + "learning_rate": 1.8890686973057843e-05, + "loss": 1.1004, + "mean_token_accuracy": 0.700412188568637, "step": 105 }, { - "epoch": 0.08014571948998178, - "grad_norm": 0.3219452500343323, - "learning_rate": 1.5613559666066493e-05, - "loss": 1.2284, - "mean_token_accuracy": 0.6770426233512457, + "epoch": 0.16029143897996356, + "grad_norm": 0.2562393546104431, + "learning_rate": 1.9079514334535213e-05, + "loss": 1.1024, + "mean_token_accuracy": 0.6994366756228629, "step": 110 }, { - "epoch": 0.08378870673952642, - "grad_norm": 0.3516216278076172, - "learning_rate": 1.5761214840250913e-05, - "loss": 1.2033, - "mean_token_accuracy": 0.680730947728383, + "epoch": 0.16757741347905283, + "grad_norm": 0.256538987159729, + "learning_rate": 1.9259946540430108e-05, + "loss": 1.0851, + "mean_token_accuracy": 0.7041478382999509, "step": 115 }, { - "epoch": 0.08743169398907104, - "grad_norm": 0.3209221661090851, - "learning_rate": 1.590258487637075e-05, - "loss": 1.2173, - "mean_token_accuracy": 0.6805741462560163, + "epoch": 0.17486338797814208, + "grad_norm": 0.2501155734062195, + "learning_rate": 1.943269841049112e-05, + "loss": 1.1104, + "mean_token_accuracy": 0.697314667806546, "step": 120 }, { - "epoch": 0.09107468123861566, - "grad_norm": 0.3234521746635437, - "learning_rate": 1.6038183070072043e-05, - "loss": 1.2223, - "mean_token_accuracy": 0.677242785901655, + "epoch": 0.18214936247723132, + "grad_norm": 0.23555661737918854, + "learning_rate": 1.9598397183595605e-05, + "loss": 1.0897, + "mean_token_accuracy": 0.7015464704445529, "step": 125 }, { - "epoch": 0.0947176684881603, - "grad_norm": 0.3622635006904602, - "learning_rate": 1.6168462296301973e-05, - "loss": 1.202, - "mean_token_accuracy": 0.6806454567659991, + "epoch": 0.1894353369763206, + "grad_norm": 0.27323609590530396, + "learning_rate": 1.9757596265515943e-05, + "loss": 1.0836, + "mean_token_accuracy": 0.7026937829876132, "step": 130 }, { - "epoch": 0.09836065573770492, - "grad_norm": 0.300037145614624, - "learning_rate": 1.6293824135288782e-05, - "loss": 1.1958, - "mean_token_accuracy": 0.6822697850512947, + "epoch": 0.19672131147540983, + "grad_norm": 0.28936222195625305, + "learning_rate": 1.9910786380717584e-05, + "loss": 1.08, + "mean_token_accuracy": 0.7054042501221296, "step": 135 }, { - "epoch": 0.10200364298724955, - "grad_norm": 0.3131561875343323, - "learning_rate": 1.6414626338039924e-05, - "loss": 1.1895, - "mean_token_accuracy": 0.6848986321446018, + "epoch": 0.2040072859744991, + "grad_norm": 0.2519981265068054, + "learning_rate": 2e-05, + "loss": 1.0728, + "mean_token_accuracy": 0.705013434294089, "step": 140 }, { - "epoch": 0.10564663023679417, - "grad_norm": 0.3196031451225281, - "learning_rate": 1.6531188981216057e-05, - "loss": 1.1976, - "mean_token_accuracy": 0.6813721299462628, + "epoch": 0.21129326047358835, + "grad_norm": 0.24323837459087372, + "learning_rate": 2e-05, + "loss": 1.0795, + "mean_token_accuracy": 0.7039799096238396, "step": 145 }, { - "epoch": 0.1092896174863388, - "grad_norm": 0.3315783143043518, - "learning_rate": 1.6643799578148368e-05, - "loss": 1.1576, - "mean_token_accuracy": 0.692782120175867, + "epoch": 0.2185792349726776, + "grad_norm": 0.2754611074924469, + "learning_rate": 2e-05, + "loss": 1.0832, + "mean_token_accuracy": 0.7035051294577429, "step": 150 }, { - "epoch": 0.11293260473588343, - "grad_norm": 0.3324851989746094, - "learning_rate": 1.6752717351491778e-05, - "loss": 1.2008, - "mean_token_accuracy": 0.6811614557889596, + "epoch": 0.22586520947176686, + "grad_norm": 0.24538645148277283, + "learning_rate": 2e-05, + "loss": 1.0684, + "mean_token_accuracy": 0.7052928065461651, "step": 155 }, { - "epoch": 0.11657559198542805, - "grad_norm": 0.3203977048397064, - "learning_rate": 1.6858176827306663e-05, - "loss": 1.1705, - "mean_token_accuracy": 0.6861290913531997, + "epoch": 0.2331511839708561, + "grad_norm": 0.2513323724269867, + "learning_rate": 2e-05, + "loss": 1.0832, + "mean_token_accuracy": 0.7013220566682951, "step": 160 }, { - "epoch": 0.12021857923497267, - "grad_norm": 0.31534770131111145, - "learning_rate": 1.6960390875920437e-05, - "loss": 1.1766, - "mean_token_accuracy": 0.6852009037616023, + "epoch": 0.24043715846994534, + "grad_norm": 0.24028229713439941, + "learning_rate": 2e-05, + "loss": 1.0543, + "mean_token_accuracy": 0.7086483268197361, "step": 165 }, { - "epoch": 0.12386156648451731, - "grad_norm": 0.3208962380886078, - "learning_rate": 1.7059553298709933e-05, - "loss": 1.1762, - "mean_token_accuracy": 0.689121275036639, + "epoch": 0.24772313296903462, + "grad_norm": 0.2462315559387207, + "learning_rate": 2e-05, + "loss": 1.0591, + "mean_token_accuracy": 0.7064622007816315, "step": 170 }, { - "epoch": 0.12750455373406194, - "grad_norm": 0.3643946945667267, - "learning_rate": 1.7155841039817543e-05, - "loss": 1.1845, - "mean_token_accuracy": 0.6833231558378114, + "epoch": 0.2550091074681239, + "grad_norm": 0.22473278641700745, + "learning_rate": 2e-05, + "loss": 1.0463, + "mean_token_accuracy": 0.7112435881778212, "step": 175 }, { - "epoch": 0.13114754098360656, - "grad_norm": 0.32823869585990906, - "learning_rate": 1.7249416086224696e-05, - "loss": 1.196, - "mean_token_accuracy": 0.6801325109916952, + "epoch": 0.26229508196721313, + "grad_norm": 0.23117414116859436, + "learning_rate": 2e-05, + "loss": 1.0669, + "mean_token_accuracy": 0.7054256228627258, "step": 180 }, { - "epoch": 0.13479052823315119, - "grad_norm": 0.33277738094329834, - "learning_rate": 1.7340427107436143e-05, - "loss": 1.1585, - "mean_token_accuracy": 0.6900555691255496, + "epoch": 0.26958105646630237, + "grad_norm": 0.23212246596813202, + "learning_rate": 2e-05, + "loss": 1.0721, + "mean_token_accuracy": 0.703644052271617, "step": 185 }, { - "epoch": 0.1384335154826958, - "grad_norm": 0.33548158407211304, - "learning_rate": 1.7429010876456215e-05, - "loss": 1.1943, - "mean_token_accuracy": 0.6806485100146555, + "epoch": 0.2768670309653916, + "grad_norm": 0.22645309567451477, + "learning_rate": 2e-05, + "loss": 1.0542, + "mean_token_accuracy": 0.7082223986321445, "step": 190 }, { - "epoch": 0.14207650273224043, - "grad_norm": 0.35972943902015686, - "learning_rate": 1.7515293506155924e-05, - "loss": 1.1669, - "mean_token_accuracy": 0.6856069858329262, + "epoch": 0.28415300546448086, + "grad_norm": 0.24246527254581451, + "learning_rate": 2e-05, + "loss": 1.0473, + "mean_token_accuracy": 0.7109749022960431, "step": 195 }, { - "epoch": 0.14571948998178508, - "grad_norm": 0.32267147302627563, - "learning_rate": 1.7599391529084285e-05, - "loss": 1.1692, - "mean_token_accuracy": 0.6881839276990719, + "epoch": 0.29143897996357016, + "grad_norm": 0.26072046160697937, + "learning_rate": 2e-05, + "loss": 1.0423, + "mean_token_accuracy": 0.7102268563751831, "step": 200 }, { - "epoch": 0.14571948998178508, - "eval_loss": 1.1753935813903809, - "eval_mean_token_accuracy": 0.6837428585889604, - "eval_runtime": 14.4124, - "eval_samples_per_second": 18.179, - "eval_steps_per_second": 1.18, + "epoch": 0.29143897996357016, + "eval_loss": 1.0579973459243774, + "eval_mean_token_accuracy": 0.7055148573028509, + "eval_runtime": 40.5396, + "eval_samples_per_second": 6.463, + "eval_steps_per_second": 0.419, "step": 200 }, { - "epoch": 0.1493624772313297, - "grad_norm": 0.3298497200012207, - "learning_rate": 1.7681412843926186e-05, - "loss": 1.1636, - "mean_token_accuracy": 0.6878847093307279, + "epoch": 0.2987249544626594, + "grad_norm": 0.2392524629831314, + "learning_rate": 2e-05, + "loss": 1.056, + "mean_token_accuracy": 0.7082315583781142, "step": 205 }, { - "epoch": 0.15300546448087432, - "grad_norm": 0.30288785696029663, - "learning_rate": 1.776145754789387e-05, - "loss": 1.1562, - "mean_token_accuracy": 0.6889502931118711, + "epoch": 0.30601092896174864, + "grad_norm": 0.24252192676067352, + "learning_rate": 2e-05, + "loss": 1.055, + "mean_token_accuracy": 0.7082407181240838, "step": 210 }, { - "epoch": 0.15664845173041894, - "grad_norm": 0.40150612592697144, - "learning_rate": 1.7839618671161183e-05, - "loss": 1.1843, - "mean_token_accuracy": 0.6828957010258916, + "epoch": 0.3132969034608379, + "grad_norm": 0.2512829303741455, + "learning_rate": 2e-05, + "loss": 1.0353, + "mean_token_accuracy": 0.7127122007816316, "step": 215 }, { - "epoch": 0.16029143897996356, - "grad_norm": 0.3242761194705963, - "learning_rate": 1.7915982826856355e-05, - "loss": 1.1429, - "mean_token_accuracy": 0.692953102100635, + "epoch": 0.3205828779599271, + "grad_norm": 0.2263517677783966, + "learning_rate": 2e-05, + "loss": 1.0528, + "mean_token_accuracy": 0.7089963971665854, "step": 220 }, { - "epoch": 0.16393442622950818, - "grad_norm": 0.3417595326900482, - "learning_rate": 1.7990630788002315e-05, - "loss": 1.1259, - "mean_token_accuracy": 0.6991542501221301, + "epoch": 0.32786885245901637, + "grad_norm": 0.25571829080581665, + "learning_rate": 2e-05, + "loss": 1.0373, + "mean_token_accuracy": 0.7127348798847855, "step": 225 }, { - "epoch": 0.16757741347905283, - "grad_norm": 0.3361223638057709, - "learning_rate": 1.8063638001040778e-05, - "loss": 1.1684, - "mean_token_accuracy": 0.685579506595017, + "epoch": 0.33515482695810567, + "grad_norm": 0.2718060314655304, + "learning_rate": 2e-05, + "loss": 1.0231, + "mean_token_accuracy": 0.7161425256472889, "step": 230 }, { - "epoch": 0.17122040072859745, - "grad_norm": 0.33286741375923157, - "learning_rate": 1.813507504412511e-05, - "loss": 1.1698, - "mean_token_accuracy": 0.6841627992183683, + "epoch": 0.3424408014571949, + "grad_norm": 0.24943749606609344, + "learning_rate": 2e-05, + "loss": 1.0364, + "mean_token_accuracy": 0.7126031834774758, "step": 235 }, { - "epoch": 0.17486338797814208, - "grad_norm": 0.34265580773353577, - "learning_rate": 1.820500803716061e-05, - "loss": 1.1747, - "mean_token_accuracy": 0.6862054225696141, + "epoch": 0.34972677595628415, + "grad_norm": 0.25987544655799866, + "learning_rate": 2e-05, + "loss": 1.0302, + "mean_token_accuracy": 0.7140556301905225, "step": 240 }, { - "epoch": 0.1785063752276867, - "grad_norm": 0.34530794620513916, - "learning_rate": 1.8273499009563047e-05, - "loss": 1.1496, - "mean_token_accuracy": 0.6905929408891059, + "epoch": 0.3570127504553734, + "grad_norm": 0.2467297464609146, + "learning_rate": 2e-05, + "loss": 1.0313, + "mean_token_accuracy": 0.7123977161700048, "step": 245 }, { - "epoch": 0.18214936247723132, - "grad_norm": 0.378530889749527, - "learning_rate": 1.8340606230861904e-05, - "loss": 1.1569, - "mean_token_accuracy": 0.6879213483146068, + "epoch": 0.36429872495446264, + "grad_norm": 0.25790703296661377, + "learning_rate": 2e-05, + "loss": 1.0158, + "mean_token_accuracy": 0.7178706643869076, "step": 250 }, { - "epoch": 0.18579234972677597, - "grad_norm": 0.3267212212085724, - "learning_rate": 1.840638450856388e-05, - "loss": 1.1486, - "mean_token_accuracy": 0.6902121564708253, + "epoch": 0.37158469945355194, + "grad_norm": 0.2591741383075714, + "learning_rate": 2e-05, + "loss": 1.0258, + "mean_token_accuracy": 0.7137533585735221, "step": 255 }, { - "epoch": 0.1894353369763206, - "grad_norm": 0.31661802530288696, - "learning_rate": 1.847088545709184e-05, - "loss": 1.1427, - "mean_token_accuracy": 0.6917562286272594, + "epoch": 0.3788706739526412, + "grad_norm": 0.2623481750488281, + "learning_rate": 2e-05, + "loss": 1.0525, + "mean_token_accuracy": 0.7065553248656569, "step": 260 }, { - "epoch": 0.1930783242258652, - "grad_norm": 0.30997058749198914, - "learning_rate": 1.853415774110566e-05, - "loss": 1.1549, - "mean_token_accuracy": 0.6902479237909136, + "epoch": 0.3861566484517304, + "grad_norm": 0.22918273508548737, + "learning_rate": 2e-05, + "loss": 1.027, + "mean_token_accuracy": 0.7136006961406938, "step": 265 }, { - "epoch": 0.19672131147540983, - "grad_norm": 0.30749374628067017, - "learning_rate": 1.8596247296078644e-05, - "loss": 1.133, - "mean_token_accuracy": 0.6957437713727406, + "epoch": 0.39344262295081966, + "grad_norm": 0.24080874025821686, + "learning_rate": 2e-05, + "loss": 1.0324, + "mean_token_accuracy": 0.7127405883580941, "step": 270 }, { - "epoch": 0.20036429872495445, - "grad_norm": 0.300138384103775, - "learning_rate": 1.8657197528633974e-05, - "loss": 1.1249, - "mean_token_accuracy": 0.6944736199316071, + "epoch": 0.4007285974499089, + "grad_norm": 0.2676204741001129, + "learning_rate": 2e-05, + "loss": 1.0265, + "mean_token_accuracy": 0.7136129091353199, "step": 275 }, { - "epoch": 0.2040072859744991, - "grad_norm": 0.3103027939796448, - "learning_rate": 1.8717049498829786e-05, - "loss": 1.1429, - "mean_token_accuracy": 0.6907700293111871, + "epoch": 0.4080145719489982, + "grad_norm": 0.24465428292751312, + "learning_rate": 2e-05, + "loss": 1.0349, + "mean_token_accuracy": 0.7100161822178797, "step": 280 }, { - "epoch": 0.20765027322404372, - "grad_norm": 0.3321939706802368, - "learning_rate": 1.8775842086310163e-05, - "loss": 1.1427, - "mean_token_accuracy": 0.6926813629702003, + "epoch": 0.41530054644808745, + "grad_norm": 0.24205349385738373, + "learning_rate": 2e-05, + "loss": 1.0254, + "mean_token_accuracy": 0.7133365901319003, "step": 285 }, { - "epoch": 0.21129326047358835, - "grad_norm": 0.3095480501651764, - "learning_rate": 1.8833612142005922e-05, - "loss": 1.1425, - "mean_token_accuracy": 0.6899548119198827, + "epoch": 0.4225865209471767, + "grad_norm": 0.2451147884130478, + "learning_rate": 2e-05, + "loss": 1.0189, + "mean_token_accuracy": 0.715379213483146, "step": 290 }, { - "epoch": 0.21493624772313297, - "grad_norm": 0.2930939197540283, - "learning_rate": 1.889039462686756e-05, - "loss": 1.125, - "mean_token_accuracy": 0.6964429653150953, + "epoch": 0.42987249544626593, + "grad_norm": 0.24288122355937958, + "learning_rate": 2e-05, + "loss": 1.0282, + "mean_token_accuracy": 0.7133766909700059, "step": 295 }, { - "epoch": 0.2185792349726776, - "grad_norm": 0.3320595324039459, - "learning_rate": 1.8946222738938233e-05, - "loss": 1.1666, - "mean_token_accuracy": 0.6868649242794332, + "epoch": 0.4371584699453552, + "grad_norm": 0.23199407756328583, + "learning_rate": 2e-05, + "loss": 1.012, + "mean_token_accuracy": 0.7167684416218857, "step": 300 }, { - "epoch": 0.2185792349726776, - "eval_loss": 1.1416388750076294, - "eval_mean_token_accuracy": 0.6895034030538906, - "eval_runtime": 14.4924, - "eval_samples_per_second": 18.078, - "eval_steps_per_second": 1.173, + "epoch": 0.4371584699453552, + "eval_loss": 1.0286855697631836, + "eval_mean_token_accuracy": 0.7104119275712862, + "eval_runtime": 37.7776, + "eval_samples_per_second": 6.935, + "eval_steps_per_second": 0.45, "step": 300 }, { - "epoch": 0.2222222222222222, - "grad_norm": 0.31145739555358887, - "learning_rate": 1.9001128029923346e-05, - "loss": 1.1466, - "mean_token_accuracy": 0.6887243527112848, + "epoch": 0.4444444444444444, + "grad_norm": 0.24106916785240173, + "learning_rate": 2e-05, + "loss": 1.0193, + "mean_token_accuracy": 0.714025097703957, "step": 305 }, { - "epoch": 0.22586520947176686, - "grad_norm": 0.314193993806839, - "learning_rate": 1.905514051228164e-05, - "loss": 1.1185, - "mean_token_accuracy": 0.6978230337078649, + "epoch": 0.4517304189435337, + "grad_norm": 0.2508867084980011, + "learning_rate": 2e-05, + "loss": 1.0078, + "mean_token_accuracy": 0.7173775647288717, "step": 310 }, { - "epoch": 0.22950819672131148, - "grad_norm": 0.34623581171035767, - "learning_rate": 1.910828875774782e-05, - "loss": 1.1448, - "mean_token_accuracy": 0.6900097703957011, + "epoch": 0.45901639344262296, + "grad_norm": 0.2557254731655121, + "learning_rate": 2e-05, + "loss": 1.0002, + "mean_token_accuracy": 0.7212154982901808, "step": 315 }, { - "epoch": 0.2331511839708561, - "grad_norm": 0.3344809114933014, - "learning_rate": 1.9160599988096528e-05, - "loss": 1.1514, - "mean_token_accuracy": 0.6889502931118712, + "epoch": 0.4663023679417122, + "grad_norm": 0.2555619180202484, + "learning_rate": 2e-05, + "loss": 1.0212, + "mean_token_accuracy": 0.7146265876893015, "step": 320 }, { - "epoch": 0.23679417122040072, - "grad_norm": 0.32068851590156555, - "learning_rate": 1.9212100158869457e-05, - "loss": 1.1423, - "mean_token_accuracy": 0.6901746458231559, + "epoch": 0.47358834244080145, + "grad_norm": 0.2347564995288849, + "learning_rate": 2e-05, + "loss": 0.9874, + "mean_token_accuracy": 0.7231909501709821, "step": 325 }, { - "epoch": 0.24043715846994534, - "grad_norm": 0.3246494233608246, - "learning_rate": 1.92628140367103e-05, - "loss": 1.0916, - "mean_token_accuracy": 0.7032120175867121, + "epoch": 0.4808743169398907, + "grad_norm": 0.24380792677402496, + "learning_rate": 2e-05, + "loss": 0.9829, + "mean_token_accuracy": 0.7247633732291158, "step": 330 }, { - "epoch": 0.24408014571949, - "grad_norm": 0.3234722912311554, - "learning_rate": 1.9312765270884355e-05, - "loss": 1.149, - "mean_token_accuracy": 0.687756472887152, + "epoch": 0.48816029143898, + "grad_norm": 0.24516567587852478, + "learning_rate": 2e-05, + "loss": 1.0198, + "mean_token_accuracy": 0.7143673417060667, "step": 335 }, { - "epoch": 0.24772313296903462, - "grad_norm": 0.3160967528820038, - "learning_rate": 1.9361976459499798e-05, - "loss": 1.0939, - "mean_token_accuracy": 0.7012060332193453, + "epoch": 0.49544626593806923, + "grad_norm": 0.23901765048503876, + "learning_rate": 2e-05, + "loss": 1.0099, + "mean_token_accuracy": 0.7166050928187591, "step": 340 }, { - "epoch": 0.25136612021857924, - "grad_norm": 0.322691947221756, - "learning_rate": 1.9410469210894726e-05, - "loss": 1.1172, - "mean_token_accuracy": 0.6959697117733267, + "epoch": 0.5027322404371585, + "grad_norm": 0.252539724111557, + "learning_rate": 2e-05, + "loss": 0.9999, + "mean_token_accuracy": 0.7207071323888619, "step": 345 }, { - "epoch": 0.2550091074681239, - "grad_norm": 0.31874555349349976, - "learning_rate": 1.9458264200607405e-05, - "loss": 1.1018, - "mean_token_accuracy": 0.7011755007327797, + "epoch": 0.5100182149362478, + "grad_norm": 0.2206733673810959, + "learning_rate": 2e-05, + "loss": 0.9919, + "mean_token_accuracy": 0.7224993893502688, "step": 350 }, { - "epoch": 0.2586520947176685, - "grad_norm": 0.3494158387184143, - "learning_rate": 1.9505381224305674e-05, - "loss": 1.1373, - "mean_token_accuracy": 0.6918753053248655, + "epoch": 0.517304189435337, + "grad_norm": 0.21977460384368896, + "learning_rate": 2e-05, + "loss": 1.0083, + "mean_token_accuracy": 0.716026502198339, "step": 355 }, { - "epoch": 0.26229508196721313, - "grad_norm": 0.3095668852329254, - "learning_rate": 1.9551839247014558e-05, - "loss": 1.1233, - "mean_token_accuracy": 0.6946201758671228, + "epoch": 0.5245901639344263, + "grad_norm": 0.25190576910972595, + "learning_rate": 2e-05, + "loss": 1.0001, + "mean_token_accuracy": 0.7192278334147535, "step": 360 }, { - "epoch": 0.2659380692167577, - "grad_norm": 0.33371731638908386, - "learning_rate": 1.9597656448948597e-05, - "loss": 1.1395, - "mean_token_accuracy": 0.6902143380556913, + "epoch": 0.5318761384335154, + "grad_norm": 0.2302297204732895, + "learning_rate": 2e-05, + "loss": 0.9807, + "mean_token_accuracy": 0.7235634465070835, "step": 365 }, { - "epoch": 0.26958105646630237, - "grad_norm": 0.3272486925125122, - "learning_rate": 1.9642850268226008e-05, - "loss": 1.1334, - "mean_token_accuracy": 0.6928737176355642, + "epoch": 0.5391621129326047, + "grad_norm": 0.2296873778104782, + "learning_rate": 2e-05, + "loss": 0.979, + "mean_token_accuracy": 0.7253984489496823, "step": 370 }, { - "epoch": 0.273224043715847, - "grad_norm": 0.34388625621795654, - "learning_rate": 1.9687437440715852e-05, - "loss": 1.1345, - "mean_token_accuracy": 0.6911791646311675, + "epoch": 0.546448087431694, + "grad_norm": 0.22952046990394592, + "learning_rate": 2e-05, + "loss": 1.0024, + "mean_token_accuracy": 0.7198125305324866, "step": 375 }, { - "epoch": 0.2768670309653916, - "grad_norm": 0.2946622967720032, - "learning_rate": 1.973143403724608e-05, - "loss": 1.1011, - "mean_token_accuracy": 0.6998412310698583, + "epoch": 0.5537340619307832, + "grad_norm": 0.26434800028800964, + "learning_rate": 2e-05, + "loss": 1.0029, + "mean_token_accuracy": 0.7184065095261356, "step": 380 }, { - "epoch": 0.28051001821493626, - "grad_norm": 0.3136511445045471, - "learning_rate": 1.9774855498379478e-05, - "loss": 1.1306, - "mean_token_accuracy": 0.6944949926722032, + "epoch": 0.5610200364298725, + "grad_norm": 0.24686159193515778, + "learning_rate": 2e-05, + "loss": 0.9886, + "mean_token_accuracy": 0.7215569017124912, "step": 385 }, { - "epoch": 0.28415300546448086, - "grad_norm": 0.3592908978462219, - "learning_rate": 1.9817716666945786e-05, - "loss": 1.0909, - "mean_token_accuracy": 0.7024273326819737, + "epoch": 0.5683060109289617, + "grad_norm": 0.2560524344444275, + "learning_rate": 2e-05, + "loss": 0.9816, + "mean_token_accuracy": 0.7243008060576454, "step": 390 }, { - "epoch": 0.2877959927140255, - "grad_norm": 0.34179943799972534, - "learning_rate": 1.9860031818501565e-05, - "loss": 1.1172, - "mean_token_accuracy": 0.6946171226184662, + "epoch": 0.575591985428051, + "grad_norm": 0.23923024535179138, + "learning_rate": 2e-05, + "loss": 1.0118, + "mean_token_accuracy": 0.7144159135319982, "step": 395 }, { - "epoch": 0.29143897996357016, - "grad_norm": 0.3114219605922699, - "learning_rate": 1.9901814689874147e-05, - "loss": 1.0922, - "mean_token_accuracy": 0.7006564484611626, + "epoch": 0.5828779599271403, + "grad_norm": 0.2485678642988205, + "learning_rate": 2e-05, + "loss": 1.0, + "mean_token_accuracy": 0.7188690766976062, "step": 400 }, { - "epoch": 0.29143897996357016, - "eval_loss": 1.1195729970932007, - "eval_mean_token_accuracy": 0.6934797960919029, - "eval_runtime": 14.5378, - "eval_samples_per_second": 18.022, - "eval_steps_per_second": 1.169, + "epoch": 0.5828779599271403, + "eval_loss": 1.008617639541626, + "eval_mean_token_accuracy": 0.714862565310479, + "eval_runtime": 38.6648, + "eval_samples_per_second": 6.776, + "eval_steps_per_second": 0.44, "step": 400 }, { - "epoch": 0.29508196721311475, - "grad_norm": 0.32596296072006226, - "learning_rate": 1.994307850593259e-05, - "loss": 1.1116, - "mean_token_accuracy": 0.6975177088422082, + "epoch": 0.5901639344262295, + "grad_norm": 0.2388627827167511, + "learning_rate": 2e-05, + "loss": 0.9903, + "mean_token_accuracy": 0.7215254030288225, "step": 405 }, { - "epoch": 0.2987249544626594, - "grad_norm": 0.33011898398399353, - "learning_rate": 1.9983836004716048e-05, - "loss": 1.1279, - "mean_token_accuracy": 0.6937744259892525, + "epoch": 0.5974499089253188, + "grad_norm": 0.23851706087589264, + "learning_rate": 2e-05, + "loss": 1.002, + "mean_token_accuracy": 0.7170279677576941, "step": 410 }, { - "epoch": 0.302367941712204, - "grad_norm": 0.3652317523956299, + "epoch": 0.604735883424408, + "grad_norm": 0.25729191303253174, "learning_rate": 2e-05, - "loss": 1.1157, - "mean_token_accuracy": 0.6966597459697119, + "loss": 1.0014, + "mean_token_accuracy": 0.7183484978016609, "step": 415 }, { - "epoch": 0.30601092896174864, - "grad_norm": 0.31457507610321045, + "epoch": 0.6120218579234973, + "grad_norm": 0.24574624001979828, "learning_rate": 2e-05, - "loss": 1.1197, - "mean_token_accuracy": 0.6949621397166584, + "loss": 1.0013, + "mean_token_accuracy": 0.7191163898387888, "step": 420 }, { - "epoch": 0.30965391621129323, - "grad_norm": 0.31319767236709595, + "epoch": 0.6193078324225865, + "grad_norm": 0.24090902507305145, "learning_rate": 2e-05, - "loss": 1.0992, - "mean_token_accuracy": 0.7012732046897898, + "loss": 1.0069, + "mean_token_accuracy": 0.7172478016609672, "step": 425 }, { - "epoch": 0.3132969034608379, - "grad_norm": 0.2934488356113434, + "epoch": 0.6265938069216758, + "grad_norm": 0.24890194833278656, "learning_rate": 2e-05, - "loss": 1.0945, - "mean_token_accuracy": 0.6996916218856863, + "loss": 0.9897, + "mean_token_accuracy": 0.7221360527601368, "step": 430 }, { - "epoch": 0.31693989071038253, - "grad_norm": 0.33269914984703064, + "epoch": 0.6338797814207651, + "grad_norm": 0.27910518646240234, "learning_rate": 2e-05, - "loss": 1.1113, - "mean_token_accuracy": 0.6966261602344895, + "loss": 1.0073, + "mean_token_accuracy": 0.7160036028334147, "step": 435 }, { - "epoch": 0.3205828779599271, - "grad_norm": 0.33153873682022095, + "epoch": 0.6411657559198543, + "grad_norm": 0.24221768975257874, "learning_rate": 2e-05, - "loss": 1.1204, - "mean_token_accuracy": 0.6963574743527113, + "loss": 0.9711, + "mean_token_accuracy": 0.7259183995837477, "step": 440 }, { - "epoch": 0.3242258652094718, - "grad_norm": 0.32638055086135864, + "epoch": 0.6484517304189436, + "grad_norm": 0.23564772307872772, "learning_rate": 2e-05, - "loss": 1.0837, - "mean_token_accuracy": 0.7034817894672104, + "loss": 0.9914, + "mean_token_accuracy": 0.7203071568148511, "step": 445 }, { - "epoch": 0.32786885245901637, - "grad_norm": 0.3373214602470398, + "epoch": 0.6557377049180327, + "grad_norm": 0.23250643908977509, "learning_rate": 2e-05, - "loss": 1.1176, - "mean_token_accuracy": 0.6957162921348317, + "loss": 0.9863, + "mean_token_accuracy": 0.7207727772349781, "step": 450 }, { - "epoch": 0.331511839708561, - "grad_norm": 0.35421860218048096, + "epoch": 0.663023679417122, + "grad_norm": 0.22369219362735748, "learning_rate": 2e-05, - "loss": 1.0935, - "mean_token_accuracy": 0.7031082071323889, + "loss": 0.9786, + "mean_token_accuracy": 0.7255831704934049, "step": 455 }, { - "epoch": 0.33515482695810567, - "grad_norm": 0.32108989357948303, + "epoch": 0.6703096539162113, + "grad_norm": 0.2398165464401245, "learning_rate": 2e-05, - "loss": 1.0774, - "mean_token_accuracy": 0.7054744748412312, + "loss": 0.9824, + "mean_token_accuracy": 0.723346665852467, "step": 460 }, { - "epoch": 0.33879781420765026, - "grad_norm": 0.3276057839393616, + "epoch": 0.6775956284153005, + "grad_norm": 0.2463538646697998, "learning_rate": 2e-05, - "loss": 1.0878, - "mean_token_accuracy": 0.7024872572898195, + "loss": 0.9823, + "mean_token_accuracy": 0.7212032852955546, "step": 465 }, { - "epoch": 0.3424408014571949, - "grad_norm": 0.3203645348548889, + "epoch": 0.6848816029143898, + "grad_norm": 0.25979697704315186, "learning_rate": 2e-05, - "loss": 1.1107, - "mean_token_accuracy": 0.6970383488031264, + "loss": 0.9844, + "mean_token_accuracy": 0.7220978871519298, "step": 470 }, { - "epoch": 0.3460837887067395, - "grad_norm": 0.3142698109149933, + "epoch": 0.692167577413479, + "grad_norm": 0.25015923380851746, "learning_rate": 2e-05, - "loss": 1.0818, - "mean_token_accuracy": 0.7031570591108939, + "loss": 0.9966, + "mean_token_accuracy": 0.717882877381534, "step": 475 }, { - "epoch": 0.34972677595628415, - "grad_norm": 0.36916568875312805, + "epoch": 0.6994535519125683, + "grad_norm": 0.2696310877799988, "learning_rate": 2e-05, - "loss": 1.1035, - "mean_token_accuracy": 0.6999328285295553, + "loss": 0.9742, + "mean_token_accuracy": 0.7241342766156815, "step": 480 }, { - "epoch": 0.3533697632058288, - "grad_norm": 0.33288297057151794, + "epoch": 0.7067395264116576, + "grad_norm": 0.23021022975444794, "learning_rate": 2e-05, - "loss": 1.1184, - "mean_token_accuracy": 0.6936950415241816, + "loss": 0.9685, + "mean_token_accuracy": 0.7260197850512945, "step": 485 }, { - "epoch": 0.3570127504553734, - "grad_norm": 0.2986063063144684, + "epoch": 0.7140255009107468, + "grad_norm": 0.25251683592796326, "learning_rate": 2e-05, - "loss": 1.07, - "mean_token_accuracy": 0.7059324621397166, + "loss": 0.9619, + "mean_token_accuracy": 0.7277463971665853, "step": 490 }, { - "epoch": 0.36065573770491804, - "grad_norm": 0.33390265703201294, + "epoch": 0.7213114754098361, + "grad_norm": 0.23041123151779175, "learning_rate": 2e-05, - "loss": 1.0834, - "mean_token_accuracy": 0.7028731069858329, + "loss": 0.9733, + "mean_token_accuracy": 0.7242901196873474, "step": 495 }, { - "epoch": 0.36429872495446264, - "grad_norm": 0.3243837058544159, + "epoch": 0.7285974499089253, + "grad_norm": 0.2576392889022827, "learning_rate": 2e-05, - "loss": 1.0766, - "mean_token_accuracy": 0.7079109672691744, + "loss": 0.9852, + "mean_token_accuracy": 0.7223543600390816, "step": 500 }, { - "epoch": 0.36429872495446264, - "eval_loss": 1.1029341220855713, - "eval_mean_token_accuracy": 0.6966890054732253, - "eval_runtime": 14.5663, - "eval_samples_per_second": 17.987, - "eval_steps_per_second": 1.167, + "epoch": 0.7285974499089253, + "eval_loss": 0.9932907819747925, + "eval_mean_token_accuracy": 0.7173129502108766, + "eval_runtime": 37.0409, + "eval_samples_per_second": 7.073, + "eval_steps_per_second": 0.459, "step": 500 }, { - "epoch": 0.3679417122040073, - "grad_norm": 0.3292768597602844, + "epoch": 0.7358834244080146, + "grad_norm": 0.2792870104312897, "learning_rate": 2e-05, - "loss": 1.0895, - "mean_token_accuracy": 0.7004579872984855, + "loss": 0.9639, + "mean_token_accuracy": 0.7265312042012704, "step": 505 }, { - "epoch": 0.37158469945355194, - "grad_norm": 0.34203407168388367, + "epoch": 0.7431693989071039, + "grad_norm": 0.2572779059410095, "learning_rate": 2e-05, - "loss": 1.0898, - "mean_token_accuracy": 0.7014441866145578, + "loss": 0.9807, + "mean_token_accuracy": 0.7220719345383488, "step": 510 }, { - "epoch": 0.37522768670309653, - "grad_norm": 0.36646661162376404, + "epoch": 0.7504553734061931, + "grad_norm": 0.23189429938793182, "learning_rate": 2e-05, - "loss": 1.1225, - "mean_token_accuracy": 0.6927393746946752, + "loss": 0.982, + "mean_token_accuracy": 0.7221726917440157, "step": 515 }, { - "epoch": 0.3788706739526412, - "grad_norm": 0.32106146216392517, + "epoch": 0.7577413479052824, + "grad_norm": 0.23592406511306763, "learning_rate": 2e-05, - "loss": 1.1075, - "mean_token_accuracy": 0.6962872496336103, + "loss": 0.9817, + "mean_token_accuracy": 0.7224322178798241, "step": 520 }, { - "epoch": 0.3825136612021858, - "grad_norm": 0.3438628315925598, + "epoch": 0.7650273224043715, + "grad_norm": 0.24176359176635742, "learning_rate": 2e-05, - "loss": 1.0885, - "mean_token_accuracy": 0.7010411577918904, + "loss": 0.9749, + "mean_token_accuracy": 0.7247175744992674, "step": 525 }, { - "epoch": 0.3861566484517304, - "grad_norm": 0.3221867084503174, + "epoch": 0.7723132969034608, + "grad_norm": 0.2435123324394226, "learning_rate": 2e-05, - "loss": 1.0891, - "mean_token_accuracy": 0.701841108939912, + "loss": 0.9819, + "mean_token_accuracy": 0.7221314728871521, "step": 530 }, { - "epoch": 0.38979963570127507, - "grad_norm": 0.3283216655254364, + "epoch": 0.7795992714025501, + "grad_norm": 0.22645749151706696, "learning_rate": 2e-05, - "loss": 1.1008, - "mean_token_accuracy": 0.6979451636541281, + "loss": 0.9847, + "mean_token_accuracy": 0.7216826453346362, "step": 535 }, { - "epoch": 0.39344262295081966, - "grad_norm": 0.3395269811153412, + "epoch": 0.7868852459016393, + "grad_norm": 0.2561565339565277, "learning_rate": 2e-05, - "loss": 1.089, - "mean_token_accuracy": 0.702310933274755, + "loss": 0.9747, + "mean_token_accuracy": 0.723244382022472, "step": 540 }, { - "epoch": 0.3970856102003643, - "grad_norm": 0.32621896266937256, + "epoch": 0.7941712204007286, + "grad_norm": 0.238953098654747, "learning_rate": 2e-05, - "loss": 1.0759, - "mean_token_accuracy": 0.7054286761113826, + "loss": 0.978, + "mean_token_accuracy": 0.722381839276991, "step": 545 }, { - "epoch": 0.4007285974499089, - "grad_norm": 0.34217944741249084, + "epoch": 0.8014571948998178, + "grad_norm": 0.27305781841278076, "learning_rate": 2e-05, - "loss": 1.1018, - "mean_token_accuracy": 0.6971726917440155, + "loss": 0.9671, + "mean_token_accuracy": 0.725273265754763, "step": 550 }, { - "epoch": 0.40437158469945356, - "grad_norm": 0.3205217719078064, + "epoch": 0.8087431693989071, + "grad_norm": 0.23001207411289215, "learning_rate": 2e-05, - "loss": 1.0865, - "mean_token_accuracy": 0.7010564240351734, + "loss": 0.9588, + "mean_token_accuracy": 0.7276065295012101, "step": 555 }, { - "epoch": 0.4080145719489982, - "grad_norm": 0.3366459906101227, + "epoch": 0.8160291438979964, + "grad_norm": 0.24449627101421356, "learning_rate": 2e-05, - "loss": 1.1069, - "mean_token_accuracy": 0.6947758915486076, + "loss": 0.9728, + "mean_token_accuracy": 0.7239069369809478, "step": 560 }, { - "epoch": 0.4116575591985428, - "grad_norm": 0.33357521891593933, + "epoch": 0.8233151183970856, + "grad_norm": 0.23095275461673737, "learning_rate": 2e-05, - "loss": 1.0831, - "mean_token_accuracy": 0.7024853444064484, + "loss": 0.967, + "mean_token_accuracy": 0.7263602222765023, "step": 565 }, { - "epoch": 0.41530054644808745, - "grad_norm": 0.3021646738052368, + "epoch": 0.8306010928961749, + "grad_norm": 0.2598857879638672, "learning_rate": 2e-05, - "loss": 1.0934, - "mean_token_accuracy": 0.6990626526624328, + "loss": 0.976, + "mean_token_accuracy": 0.722152845627748, "step": 570 }, { - "epoch": 0.41894353369763204, - "grad_norm": 0.3405080735683441, + "epoch": 0.8378870673952641, + "grad_norm": 0.2525656223297119, "learning_rate": 2e-05, - "loss": 1.0948, - "mean_token_accuracy": 0.6999297752808988, + "loss": 0.9756, + "mean_token_accuracy": 0.7226230459208598, "step": 575 }, { - "epoch": 0.4225865209471767, - "grad_norm": 0.35369783639907837, + "epoch": 0.8451730418943534, + "grad_norm": 0.23842041194438934, "learning_rate": 2e-05, - "loss": 1.0664, - "mean_token_accuracy": 0.7070591108939912, + "loss": 0.9929, + "mean_token_accuracy": 0.7186629824132877, "step": 580 }, { - "epoch": 0.4262295081967213, - "grad_norm": 0.32571864128112793, + "epoch": 0.8524590163934426, + "grad_norm": 0.26655957102775574, "learning_rate": 2e-05, - "loss": 1.0829, - "mean_token_accuracy": 0.701379174799081, + "loss": 0.9732, + "mean_token_accuracy": 0.7229268441621884, "step": 585 }, { - "epoch": 0.42987249544626593, - "grad_norm": 0.3396914005279541, + "epoch": 0.8597449908925319, + "grad_norm": 0.2641935348510742, "learning_rate": 2e-05, - "loss": 1.1002, - "mean_token_accuracy": 0.6994626282364436, + "loss": 0.9758, + "mean_token_accuracy": 0.7253282242305814, "step": 590 }, { - "epoch": 0.4335154826958106, - "grad_norm": 0.3538070023059845, + "epoch": 0.8670309653916212, + "grad_norm": 0.24463647603988647, "learning_rate": 2e-05, - "loss": 1.0985, - "mean_token_accuracy": 0.6981222520762091, + "loss": 0.983, + "mean_token_accuracy": 0.7212307645334635, "step": 595 }, { - "epoch": 0.4371584699453552, - "grad_norm": 0.31537488102912903, + "epoch": 0.8743169398907104, + "grad_norm": 0.24827370047569275, "learning_rate": 2e-05, - "loss": 1.0523, - "mean_token_accuracy": 0.709605520273571, + "loss": 0.9807, + "mean_token_accuracy": 0.7212093917928677, "step": 600 }, { - "epoch": 0.4371584699453552, - "eval_loss": 1.0888596773147583, - "eval_mean_token_accuracy": 0.6988977098839045, - "eval_runtime": 14.4857, - "eval_samples_per_second": 18.087, - "eval_steps_per_second": 1.174, + "epoch": 0.8743169398907104, + "eval_loss": 0.979246199131012, + "eval_mean_token_accuracy": 0.7202548355395697, + "eval_runtime": 38.2386, + "eval_samples_per_second": 6.852, + "eval_steps_per_second": 0.445, "step": 600 }, { - "epoch": 0.4408014571948998, - "grad_norm": 0.3270356059074402, + "epoch": 0.8816029143897997, + "grad_norm": 0.23297545313835144, "learning_rate": 2e-05, - "loss": 1.0672, - "mean_token_accuracy": 0.7046531509526136, + "loss": 0.9609, + "mean_token_accuracy": 0.7272982922789468, "step": 605 }, { - "epoch": 0.4444444444444444, - "grad_norm": 0.36020419001579285, + "epoch": 0.8888888888888888, + "grad_norm": 0.23551233112812042, "learning_rate": 2e-05, - "loss": 1.0966, - "mean_token_accuracy": 0.6975146555935515, + "loss": 0.9715, + "mean_token_accuracy": 0.7250076331216416, "step": 610 }, { - "epoch": 0.44808743169398907, - "grad_norm": 0.33137136697769165, + "epoch": 0.8961748633879781, + "grad_norm": 0.24120616912841797, "learning_rate": 2e-05, - "loss": 1.076, - "mean_token_accuracy": 0.7033310942843185, + "loss": 0.9707, + "mean_token_accuracy": 0.7239664753297508, "step": 615 }, { - "epoch": 0.4517304189435337, - "grad_norm": 0.2987717390060425, + "epoch": 0.9034608378870674, + "grad_norm": 0.2541744112968445, "learning_rate": 2e-05, - "loss": 1.0588, - "mean_token_accuracy": 0.7078102100635075, + "loss": 0.9695, + "mean_token_accuracy": 0.7240719724584961, "step": 620 }, { - "epoch": 0.4553734061930783, - "grad_norm": 0.30083930492401123, + "epoch": 0.9107468123861566, + "grad_norm": 0.2749602198600769, "learning_rate": 2e-05, - "loss": 1.0457, - "mean_token_accuracy": 0.7125610649731315, + "loss": 0.9714, + "mean_token_accuracy": 0.7238626648754275, "step": 625 }, { - "epoch": 0.45901639344262296, - "grad_norm": 0.3193149268627167, + "epoch": 0.9180327868852459, + "grad_norm": 0.2778976559638977, "learning_rate": 2e-05, - "loss": 1.0729, - "mean_token_accuracy": 0.7045462872496338, + "loss": 0.9765, + "mean_token_accuracy": 0.7213330483634588, "step": 630 }, { - "epoch": 0.46265938069216755, - "grad_norm": 0.3396466076374054, + "epoch": 0.9253187613843351, + "grad_norm": 0.24223344027996063, "learning_rate": 2e-05, - "loss": 1.0784, - "mean_token_accuracy": 0.7025983146067416, + "loss": 0.9472, + "mean_token_accuracy": 0.7306683561309237, "step": 635 }, { - "epoch": 0.4663023679417122, - "grad_norm": 0.32033106684684753, + "epoch": 0.9326047358834244, + "grad_norm": 0.2598780691623688, "learning_rate": 2e-05, - "loss": 1.0852, - "mean_token_accuracy": 0.7016365412799219, + "loss": 0.9544, + "mean_token_accuracy": 0.7294760625305325, "step": 640 }, { - "epoch": 0.46994535519125685, - "grad_norm": 0.32467973232269287, + "epoch": 0.9398907103825137, + "grad_norm": 0.2586725950241089, "learning_rate": 2e-05, - "loss": 1.0679, - "mean_token_accuracy": 0.7072850512945774, + "loss": 0.9571, + "mean_token_accuracy": 0.7278074621397166, "step": 645 }, { - "epoch": 0.47358834244080145, - "grad_norm": 0.3388020396232605, + "epoch": 0.9471766848816029, + "grad_norm": 0.23097127676010132, "learning_rate": 2e-05, - "loss": 1.0266, - "mean_token_accuracy": 0.7156540058622374, + "loss": 0.951, + "mean_token_accuracy": 0.7284211651196874, "step": 650 }, { - "epoch": 0.4772313296903461, - "grad_norm": 0.31007012724876404, + "epoch": 0.9544626593806922, + "grad_norm": 0.242562934756279, "learning_rate": 2e-05, - "loss": 1.0441, - "mean_token_accuracy": 0.7133701758671225, + "loss": 0.9743, + "mean_token_accuracy": 0.7239756350757205, "step": 655 }, { - "epoch": 0.4808743169398907, - "grad_norm": 0.3379007577896118, - "learning_rate": 2e-05, - "loss": 1.0474, - "mean_token_accuracy": 0.7104818026380066, - "step": 660 - }, - { - "epoch": 0.48451730418943534, - "grad_norm": 0.33584991097450256, - "learning_rate": 2e-05, - "loss": 1.0816, - "mean_token_accuracy": 0.7025181547348566, - "step": 665 - }, - { - "epoch": 0.48816029143898, - "grad_norm": 0.3317875266075134, - "learning_rate": 2e-05, - "loss": 1.0785, - "mean_token_accuracy": 0.7022074987787005, - "step": 670 - }, - { - "epoch": 0.4918032786885246, - "grad_norm": 0.31039467453956604, - "learning_rate": 2e-05, - "loss": 1.0751, - "mean_token_accuracy": 0.7024639716658523, - "step": 675 - }, - { - "epoch": 0.49544626593806923, - "grad_norm": 0.32393017411231995, - "learning_rate": 2e-05, - "loss": 1.0702, - "mean_token_accuracy": 0.706164509037616, - "step": 680 - }, - { - "epoch": 0.4990892531876138, - "grad_norm": 0.3062119483947754, - "learning_rate": 2e-05, - "loss": 1.0657, - "mean_token_accuracy": 0.708112481680508, - "step": 685 - }, - { - "epoch": 0.5027322404371585, - "grad_norm": 0.30919599533081055, - "learning_rate": 2e-05, - "loss": 1.0531, - "mean_token_accuracy": 0.7101428920371275, - "step": 690 - }, - { - "epoch": 0.5063752276867031, - "grad_norm": 0.3312112092971802, - "learning_rate": 2e-05, - "loss": 1.0533, - "mean_token_accuracy": 0.709449804592086, - "step": 695 - }, - { - "epoch": 0.5100182149362478, - "grad_norm": 0.3210844397544861, - "learning_rate": 2e-05, - "loss": 1.0521, - "mean_token_accuracy": 0.7106863702979971, - "step": 700 - }, - { - "epoch": 0.5100182149362478, - "eval_loss": 1.0767533779144287, - "eval_mean_token_accuracy": 0.7011067128365758, - "eval_runtime": 14.4944, - "eval_samples_per_second": 18.076, - "eval_steps_per_second": 1.173, - "step": 700 - }, - { - "epoch": 0.5136612021857924, - "grad_norm": 0.3688066899776459, - "learning_rate": 2e-05, - "loss": 1.0859, - "mean_token_accuracy": 0.7002839521250611, - "step": 705 - }, - { - "epoch": 0.517304189435337, - "grad_norm": 0.32076314091682434, - "learning_rate": 2e-05, - "loss": 1.0545, - "mean_token_accuracy": 0.7069553004396677, - "step": 710 - }, - { - "epoch": 0.5209471766848816, - "grad_norm": 0.35336294770240784, - "learning_rate": 2e-05, - "loss": 1.0638, - "mean_token_accuracy": 0.7060545920859795, - "step": 715 - }, - { - "epoch": 0.5245901639344263, - "grad_norm": 0.32573992013931274, - "learning_rate": 2e-05, - "loss": 1.0584, - "mean_token_accuracy": 0.7088574743527113, - "step": 720 - }, - { - "epoch": 0.5282331511839709, - "grad_norm": 0.3330203890800476, - "learning_rate": 2e-05, - "loss": 1.0631, - "mean_token_accuracy": 0.7062377870053738, - "step": 725 - }, - { - "epoch": 0.5318761384335154, - "grad_norm": 0.3228294849395752, - "learning_rate": 2e-05, - "loss": 1.0201, - "mean_token_accuracy": 0.7158768930141669, - "step": 730 - }, - { - "epoch": 0.5355191256830601, - "grad_norm": 0.32984620332717896, - "learning_rate": 2e-05, - "loss": 1.0253, - "mean_token_accuracy": 0.715898265754763, - "step": 735 - }, - { - "epoch": 0.5391621129326047, - "grad_norm": 0.30142080783843994, - "learning_rate": 2e-05, - "loss": 1.0556, - "mean_token_accuracy": 0.7093948461162678, - "step": 740 - }, - { - "epoch": 0.5428051001821493, - "grad_norm": 0.3037840723991394, - "learning_rate": 2e-05, - "loss": 1.0697, - "mean_token_accuracy": 0.7060240595994138, - "step": 745 - }, - { - "epoch": 0.546448087431694, - "grad_norm": 0.2929915487766266, - "learning_rate": 2e-05, - "loss": 1.0601, - "mean_token_accuracy": 0.7079384465070836, - "step": 750 - }, - { - "epoch": 0.5500910746812386, - "grad_norm": 0.32633528113365173, - "learning_rate": 2e-05, - "loss": 1.0715, - "mean_token_accuracy": 0.7061064973131412, - "step": 755 - }, - { - "epoch": 0.5537340619307832, - "grad_norm": 0.33284929394721985, - "learning_rate": 2e-05, - "loss": 1.0601, - "mean_token_accuracy": 0.7047539081582804, - "step": 760 - }, - { - "epoch": 0.5573770491803278, - "grad_norm": 0.3333057165145874, - "learning_rate": 2e-05, - "loss": 1.0572, - "mean_token_accuracy": 0.7077789853207801, - "step": 765 - }, - { - "epoch": 0.5610200364298725, - "grad_norm": 0.3196307122707367, - "learning_rate": 2e-05, - "loss": 1.0413, - "mean_token_accuracy": 0.7101215192965314, - "step": 770 - }, - { - "epoch": 0.5646630236794171, - "grad_norm": 0.3234025239944458, - "learning_rate": 2e-05, - "loss": 1.0412, - "mean_token_accuracy": 0.7131656082071325, - "step": 775 - }, - { - "epoch": 0.5683060109289617, - "grad_norm": 0.3486537039279938, - "learning_rate": 2e-05, - "loss": 1.043, - "mean_token_accuracy": 0.7117122618466047, - "step": 780 - }, - { - "epoch": 0.5719489981785064, - "grad_norm": 0.31476178765296936, - "learning_rate": 2e-05, - "loss": 1.0723, - "mean_token_accuracy": 0.7013709086468003, - "step": 785 - }, - { - "epoch": 0.575591985428051, - "grad_norm": 0.3405304253101349, - "learning_rate": 2e-05, - "loss": 1.0733, - "mean_token_accuracy": 0.7017128724963362, - "step": 790 - }, - { - "epoch": 0.5792349726775956, - "grad_norm": 0.3531171679496765, - "learning_rate": 2e-05, - "loss": 1.0622, - "mean_token_accuracy": 0.7063660234489495, - "step": 795 - }, - { - "epoch": 0.5828779599271403, - "grad_norm": 0.33499255776405334, - "learning_rate": 2e-05, - "loss": 1.0585, - "mean_token_accuracy": 0.7079750854909623, - "step": 800 - }, - { - "epoch": 0.5828779599271403, - "eval_loss": 1.0676288604736328, - "eval_mean_token_accuracy": 0.7027526775347543, - "eval_runtime": 14.4235, - "eval_samples_per_second": 18.165, - "eval_steps_per_second": 1.179, - "step": 800 - }, - { - "epoch": 0.5865209471766849, - "grad_norm": 0.3178112506866455, - "learning_rate": 2e-05, - "loss": 1.0546, - "mean_token_accuracy": 0.7070072056668294, - "step": 805 - }, - { - "epoch": 0.5901639344262295, - "grad_norm": 0.34171855449676514, - "learning_rate": 2e-05, - "loss": 1.0459, - "mean_token_accuracy": 0.7116786761113827, - "step": 810 - }, - { - "epoch": 0.5938069216757741, - "grad_norm": 0.3327518701553345, - "learning_rate": 2e-05, - "loss": 1.0714, - "mean_token_accuracy": 0.7016884465070834, - "step": 815 - }, - { - "epoch": 0.5974499089253188, - "grad_norm": 0.33222025632858276, - "learning_rate": 2e-05, - "loss": 1.0528, - "mean_token_accuracy": 0.7067659990229603, - "step": 820 - }, - { - "epoch": 0.6010928961748634, - "grad_norm": 0.3413512706756592, - "learning_rate": 2e-05, - "loss": 1.0643, - "mean_token_accuracy": 0.7061461895456767, - "step": 825 - }, - { - "epoch": 0.604735883424408, - "grad_norm": 0.35720759630203247, - "learning_rate": 2e-05, - "loss": 1.0611, - "mean_token_accuracy": 0.7056851489985345, - "step": 830 - }, - { - "epoch": 0.6083788706739527, - "grad_norm": 0.36705562472343445, - "learning_rate": 2e-05, - "loss": 1.0749, - "mean_token_accuracy": 0.702274670249145, - "step": 835 - }, - { - "epoch": 0.6120218579234973, - "grad_norm": 0.3292873203754425, - "learning_rate": 2e-05, - "loss": 1.0509, - "mean_token_accuracy": 0.7101093063019052, - "step": 840 - }, - { - "epoch": 0.6156648451730419, - "grad_norm": 0.3292001187801361, - "learning_rate": 2e-05, - "loss": 1.0535, - "mean_token_accuracy": 0.7094223253541768, - "step": 845 - }, - { - "epoch": 0.6193078324225865, - "grad_norm": 0.3075976073741913, - "learning_rate": 2e-05, - "loss": 1.0812, - "mean_token_accuracy": 0.7013525891548608, - "step": 850 - }, - { - "epoch": 0.6229508196721312, - "grad_norm": 0.32240450382232666, - "learning_rate": 2e-05, - "loss": 1.0659, - "mean_token_accuracy": 0.7051569369809476, - "step": 855 - }, - { - "epoch": 0.6265938069216758, - "grad_norm": 0.3207255005836487, - "learning_rate": 2e-05, - "loss": 1.0361, - "mean_token_accuracy": 0.7137518319491939, - "step": 860 - }, - { - "epoch": 0.6302367941712204, - "grad_norm": 0.3257087469100952, - "learning_rate": 2e-05, - "loss": 1.0576, - "mean_token_accuracy": 0.7068789692232537, - "step": 865 - }, - { - "epoch": 0.6338797814207651, - "grad_norm": 0.33827245235443115, - "learning_rate": 2e-05, - "loss": 1.0799, - "mean_token_accuracy": 0.6992122618466047, - "step": 870 - }, - { - "epoch": 0.6375227686703097, - "grad_norm": 0.3211885094642639, - "learning_rate": 2e-05, - "loss": 1.0222, - "mean_token_accuracy": 0.7154677576941866, - "step": 875 - }, - { - "epoch": 0.6411657559198543, - "grad_norm": 0.328647643327713, - "learning_rate": 2e-05, - "loss": 1.039, - "mean_token_accuracy": 0.7112500139591076, - "step": 880 - }, - { - "epoch": 0.644808743169399, - "grad_norm": 0.3323783576488495, - "learning_rate": 2e-05, - "loss": 1.0648, - "mean_token_accuracy": 0.7059385686370299, - "step": 885 - }, - { - "epoch": 0.6484517304189436, - "grad_norm": 0.32891303300857544, - "learning_rate": 2e-05, - "loss": 1.0393, - "mean_token_accuracy": 0.710622252076209, - "step": 890 - }, - { - "epoch": 0.6520947176684881, - "grad_norm": 0.3386060297489166, - "learning_rate": 2e-05, - "loss": 1.0493, - "mean_token_accuracy": 0.7073888617489008, - "step": 895 - }, - { - "epoch": 0.6557377049180327, - "grad_norm": 0.29806962609291077, - "learning_rate": 2e-05, - "loss": 1.0482, - "mean_token_accuracy": 0.7085857352222765, - "step": 900 - }, - { - "epoch": 0.6557377049180327, - "eval_loss": 1.059637427330017, - "eval_mean_token_accuracy": 0.7044808439790701, - "eval_runtime": 14.4418, - "eval_samples_per_second": 18.142, - "eval_steps_per_second": 1.177, - "step": 900 - }, - { - "epoch": 0.6593806921675774, - "grad_norm": 0.346737265586853, - "learning_rate": 2e-05, - "loss": 1.0432, - "mean_token_accuracy": 0.7125244259892527, - "step": 905 - }, - { - "epoch": 0.663023679417122, - "grad_norm": 0.35346153378486633, - "learning_rate": 2e-05, - "loss": 1.0375, - "mean_token_accuracy": 0.7128877625793846, - "step": 910 - }, - { - "epoch": 0.6666666666666666, - "grad_norm": 0.32934707403182983, - "learning_rate": 2e-05, - "loss": 1.0322, - "mean_token_accuracy": 0.7146983390327307, - "step": 915 - }, - { - "epoch": 0.6703096539162113, - "grad_norm": 0.3805268704891205, - "learning_rate": 2e-05, - "loss": 1.0556, - "mean_token_accuracy": 0.7068484367366878, - "step": 920 - }, - { - "epoch": 0.6739526411657559, - "grad_norm": 0.3154759407043457, - "learning_rate": 2e-05, - "loss": 1.0551, - "mean_token_accuracy": 0.7053065461651196, - "step": 925 - }, - { - "epoch": 0.6775956284153005, - "grad_norm": 0.31786778569221497, - "learning_rate": 2e-05, - "loss": 1.0319, - "mean_token_accuracy": 0.7121061309233024, - "step": 930 - }, - { - "epoch": 0.6812386156648452, - "grad_norm": 0.3243469297885895, - "learning_rate": 2e-05, - "loss": 1.0384, - "mean_token_accuracy": 0.7118771372740597, - "step": 935 - }, - { - "epoch": 0.6848816029143898, - "grad_norm": 0.34407398104667664, - "learning_rate": 2e-05, - "loss": 1.0546, - "mean_token_accuracy": 0.7059202491450903, - "step": 940 - }, - { - "epoch": 0.6885245901639344, - "grad_norm": 0.32175493240356445, - "learning_rate": 2e-05, - "loss": 1.0669, - "mean_token_accuracy": 0.7043325598436735, - "step": 945 - }, - { - "epoch": 0.692167577413479, - "grad_norm": 0.3119860887527466, - "learning_rate": 2e-05, - "loss": 1.0457, - "mean_token_accuracy": 0.7077308255984368, - "step": 950 - }, - { - "epoch": 0.6958105646630237, - "grad_norm": 0.29809337854385376, - "learning_rate": 2e-05, - "loss": 1.0263, - "mean_token_accuracy": 0.7132633121641427, - "step": 955 - }, - { - "epoch": 0.6994535519125683, - "grad_norm": 0.3371947109699249, - "learning_rate": 2e-05, - "loss": 1.0438, - "mean_token_accuracy": 0.7100593064398786, - "step": 960 - }, - { - "epoch": 0.7030965391621129, - "grad_norm": 0.3244064748287201, - "learning_rate": 2e-05, - "loss": 1.0337, - "mean_token_accuracy": 0.7126587689301417, - "step": 965 - }, - { - "epoch": 0.7067395264116576, - "grad_norm": 0.3447455167770386, - "learning_rate": 2e-05, - "loss": 1.0254, - "mean_token_accuracy": 0.7147410845139229, - "step": 970 - }, - { - "epoch": 0.7103825136612022, - "grad_norm": 0.28738129138946533, - "learning_rate": 2e-05, - "loss": 1.0263, - "mean_token_accuracy": 0.7152876160234489, - "step": 975 - }, - { - "epoch": 0.7140255009107468, - "grad_norm": 0.32235240936279297, - "learning_rate": 2e-05, - "loss": 1.0199, - "mean_token_accuracy": 0.7157211773326819, - "step": 980 - }, - { - "epoch": 0.7176684881602914, - "grad_norm": 0.35440897941589355, - "learning_rate": 2e-05, - "loss": 1.0319, - "mean_token_accuracy": 0.7125122129946264, - "step": 985 - }, - { - "epoch": 0.7213114754098361, - "grad_norm": 0.37784770131111145, - "learning_rate": 2e-05, - "loss": 1.0336, - "mean_token_accuracy": 0.7125488519785051, - "step": 990 - }, - { - "epoch": 0.7249544626593807, - "grad_norm": 0.31454479694366455, - "learning_rate": 2e-05, - "loss": 1.0533, - "mean_token_accuracy": 0.7099444308744505, - "step": 995 - }, - { - "epoch": 0.7285974499089253, - "grad_norm": 0.3095937967300415, - "learning_rate": 2e-05, - "loss": 1.0418, - "mean_token_accuracy": 0.7107138495359062, - "step": 1000 - }, - { - "epoch": 0.7285974499089253, - "eval_loss": 1.052242398262024, - "eval_mean_token_accuracy": 0.7063926113598887, - "eval_runtime": 14.4403, - "eval_samples_per_second": 18.144, - "eval_steps_per_second": 1.177, - "step": 1000 - }, - { - "epoch": 0.73224043715847, - "grad_norm": 0.3146299421787262, - "learning_rate": 2e-05, - "loss": 1.0171, - "mean_token_accuracy": 0.7154250122129946, - "step": 1005 - }, - { - "epoch": 0.7358834244080146, - "grad_norm": 0.3378625214099884, - "learning_rate": 2e-05, - "loss": 1.0309, - "mean_token_accuracy": 0.7123320713238885, - "step": 1010 - }, - { - "epoch": 0.7395264116575592, - "grad_norm": 0.346305787563324, - "learning_rate": 2e-05, - "loss": 1.0303, - "mean_token_accuracy": 0.71228627259404, - "step": 1015 - }, - { - "epoch": 0.7431693989071039, - "grad_norm": 0.3319399654865265, - "learning_rate": 2e-05, - "loss": 1.0508, - "mean_token_accuracy": 0.7065431118710308, - "step": 1020 - }, - { - "epoch": 0.7468123861566485, - "grad_norm": 0.31447839736938477, - "learning_rate": 2e-05, - "loss": 1.0135, - "mean_token_accuracy": 0.7169516365412798, - "step": 1025 - }, - { - "epoch": 0.7504553734061931, - "grad_norm": 0.3323952853679657, - "learning_rate": 2e-05, - "loss": 1.0683, - "mean_token_accuracy": 0.7038593063019053, - "step": 1030 - }, - { - "epoch": 0.7540983606557377, - "grad_norm": 0.33427804708480835, - "learning_rate": 2e-05, - "loss": 1.0319, - "mean_token_accuracy": 0.7127320468978994, - "step": 1035 - }, - { - "epoch": 0.7577413479052824, - "grad_norm": 0.33110910654067993, - "learning_rate": 2e-05, - "loss": 1.05, - "mean_token_accuracy": 0.7086345872007815, - "step": 1040 - }, - { - "epoch": 0.761384335154827, - "grad_norm": 0.31014275550842285, - "learning_rate": 2e-05, - "loss": 1.0545, - "mean_token_accuracy": 0.7088544211040547, - "step": 1045 - }, - { - "epoch": 0.7650273224043715, - "grad_norm": 0.34434184432029724, - "learning_rate": 2e-05, - "loss": 1.0181, - "mean_token_accuracy": 0.7153089887640449, - "step": 1050 - }, - { - "epoch": 0.7686703096539163, - "grad_norm": 0.31507736444473267, - "learning_rate": 2e-05, - "loss": 1.0436, - "mean_token_accuracy": 0.7094620175867123, - "step": 1055 - }, - { - "epoch": 0.7723132969034608, - "grad_norm": 0.3478749990463257, - "learning_rate": 2e-05, - "loss": 1.039, - "mean_token_accuracy": 0.7100054958475819, - "step": 1060 - }, - { - "epoch": 0.7759562841530054, - "grad_norm": 0.3535066246986389, - "learning_rate": 2e-05, - "loss": 1.0351, - "mean_token_accuracy": 0.7126282364435758, - "step": 1065 - }, - { - "epoch": 0.7795992714025501, - "grad_norm": 0.30896762013435364, - "learning_rate": 2e-05, - "loss": 1.056, - "mean_token_accuracy": 0.70635686370298, - "step": 1070 - }, - { - "epoch": 0.7832422586520947, - "grad_norm": 0.3770598769187927, - "learning_rate": 2e-05, - "loss": 1.034, - "mean_token_accuracy": 0.7107138495359061, - "step": 1075 - }, - { - "epoch": 0.7868852459016393, - "grad_norm": 0.3074532449245453, - "learning_rate": 2e-05, - "loss": 1.0325, - "mean_token_accuracy": 0.7111871030776745, - "step": 1080 - }, - { - "epoch": 0.7905282331511839, - "grad_norm": 0.2974810004234314, - "learning_rate": 2e-05, - "loss": 1.041, - "mean_token_accuracy": 0.7103230337078652, - "step": 1085 - }, - { - "epoch": 0.7941712204007286, - "grad_norm": 0.3332538306713104, - "learning_rate": 2e-05, - "loss": 1.0351, - "mean_token_accuracy": 0.7102680752320469, - "step": 1090 - }, - { - "epoch": 0.7978142076502732, - "grad_norm": 0.3428950011730194, - "learning_rate": 2e-05, - "loss": 1.0277, - "mean_token_accuracy": 0.7125457987298486, - "step": 1095 - }, - { - "epoch": 0.8014571948998178, - "grad_norm": 0.3009244203567505, - "learning_rate": 2e-05, - "loss": 1.0228, - "mean_token_accuracy": 0.7134342940889103, - "step": 1100 - }, - { - "epoch": 0.8014571948998178, - "eval_loss": 1.0443105697631836, - "eval_mean_token_accuracy": 0.7080775799721868, - "eval_runtime": 14.3394, - "eval_samples_per_second": 18.271, - "eval_steps_per_second": 1.186, - "step": 1100 - }, - { - "epoch": 0.8051001821493625, - "grad_norm": 0.3108626902103424, - "learning_rate": 2e-05, - "loss": 1.0141, - "mean_token_accuracy": 0.717134831460674, - "step": 1105 - }, - { - "epoch": 0.8087431693989071, - "grad_norm": 0.31533291935920715, - "learning_rate": 2e-05, - "loss": 1.0231, - "mean_token_accuracy": 0.7143407530199676, - "step": 1110 - }, - { - "epoch": 0.8123861566484517, - "grad_norm": 0.33208322525024414, - "learning_rate": 2e-05, - "loss": 1.0412, - "mean_token_accuracy": 0.7096116267708843, - "step": 1115 - }, - { - "epoch": 0.8160291438979964, - "grad_norm": 0.33526450395584106, - "learning_rate": 2e-05, - "loss": 1.0223, - "mean_token_accuracy": 0.7140693698094772, - "step": 1120 - }, - { - "epoch": 0.819672131147541, - "grad_norm": 0.33586207032203674, - "learning_rate": 2e-05, - "loss": 1.0417, - "mean_token_accuracy": 0.7102894479726429, - "step": 1125 - }, - { - "epoch": 0.8233151183970856, - "grad_norm": 0.3324921429157257, - "learning_rate": 2e-05, - "loss": 1.0112, - "mean_token_accuracy": 0.7177057889594529, - "step": 1130 - }, - { - "epoch": 0.8269581056466302, - "grad_norm": 0.3279407322406769, - "learning_rate": 2e-05, - "loss": 1.0215, - "mean_token_accuracy": 0.7144968246213971, - "step": 1135 - }, - { - "epoch": 0.8306010928961749, - "grad_norm": 0.3108167350292206, - "learning_rate": 2e-05, - "loss": 1.0458, - "mean_token_accuracy": 0.7076056424035174, - "step": 1140 - }, - { - "epoch": 0.8342440801457195, - "grad_norm": 0.3357519507408142, - "learning_rate": 2e-05, - "loss": 1.0416, - "mean_token_accuracy": 0.7098100879335613, - "step": 1145 - }, - { - "epoch": 0.8378870673952641, - "grad_norm": 0.3577389717102051, - "learning_rate": 2e-05, - "loss": 1.0267, - "mean_token_accuracy": 0.7120114802149485, - "step": 1150 - }, - { - "epoch": 0.8415300546448088, - "grad_norm": 0.3234967887401581, - "learning_rate": 2e-05, - "loss": 1.0654, - "mean_token_accuracy": 0.7035234489496826, - "step": 1155 - }, - { - "epoch": 0.8451730418943534, - "grad_norm": 0.328988254070282, - "learning_rate": 2e-05, - "loss": 1.0385, - "mean_token_accuracy": 0.7108115534929164, - "step": 1160 - }, - { - "epoch": 0.848816029143898, - "grad_norm": 0.3636587858200073, - "learning_rate": 2e-05, - "loss": 1.0515, - "mean_token_accuracy": 0.7053462383976552, - "step": 1165 - }, - { - "epoch": 0.8524590163934426, - "grad_norm": 0.3206827938556671, - "learning_rate": 2e-05, - "loss": 1.0167, - "mean_token_accuracy": 0.7159135319980459, - "step": 1170 - }, - { - "epoch": 0.8561020036429873, - "grad_norm": 0.3391059637069702, - "learning_rate": 2e-05, - "loss": 1.0397, - "mean_token_accuracy": 0.7123595505617978, - "step": 1175 - }, - { - "epoch": 0.8597449908925319, - "grad_norm": 0.3314334750175476, - "learning_rate": 2e-05, - "loss": 1.0297, - "mean_token_accuracy": 0.714533463605276, - "step": 1180 - }, - { - "epoch": 0.8633879781420765, - "grad_norm": 0.3360309600830078, - "learning_rate": 2e-05, - "loss": 1.0692, - "mean_token_accuracy": 0.7027845627747925, - "step": 1185 - }, - { - "epoch": 0.8670309653916212, - "grad_norm": 0.3183639645576477, - "learning_rate": 2e-05, - "loss": 1.0175, - "mean_token_accuracy": 0.7152265510503177, - "step": 1190 - }, - { - "epoch": 0.8706739526411658, - "grad_norm": 0.3319450914859772, - "learning_rate": 2e-05, - "loss": 1.0542, - "mean_token_accuracy": 0.7045035417684418, - "step": 1195 - }, - { - "epoch": 0.8743169398907104, - "grad_norm": 0.32276779413223267, - "learning_rate": 2e-05, - "loss": 1.0273, - "mean_token_accuracy": 0.7115290669272106, - "step": 1200 - }, - { - "epoch": 0.8743169398907104, - "eval_loss": 1.0371540784835815, - "eval_mean_token_accuracy": 0.7092831164176581, - "eval_runtime": 14.3641, - "eval_samples_per_second": 18.24, - "eval_steps_per_second": 1.184, - "step": 1200 - }, - { - "epoch": 0.8779599271402551, - "grad_norm": 0.3125724494457245, - "learning_rate": 2e-05, - "loss": 1.0164, - "mean_token_accuracy": 0.7162616023448949, - "step": 1205 - }, - { - "epoch": 0.8816029143897997, - "grad_norm": 0.33562660217285156, - "learning_rate": 2e-05, - "loss": 1.023, - "mean_token_accuracy": 0.714150905550501, - "step": 1210 - }, - { - "epoch": 0.8852459016393442, - "grad_norm": 0.31345677375793457, - "learning_rate": 2e-05, - "loss": 1.0235, - "mean_token_accuracy": 0.7146097948216903, - "step": 1215 - }, - { - "epoch": 0.8888888888888888, - "grad_norm": 0.3195979595184326, - "learning_rate": 2e-05, - "loss": 1.0359, - "mean_token_accuracy": 0.711648143624817, - "step": 1220 - }, - { - "epoch": 0.8925318761384335, - "grad_norm": 0.3194137513637543, - "learning_rate": 2e-05, - "loss": 1.0178, - "mean_token_accuracy": 0.7147838299951148, - "step": 1225 - }, - { - "epoch": 0.8961748633879781, - "grad_norm": 0.3297196328639984, - "learning_rate": 2e-05, - "loss": 1.0396, - "mean_token_accuracy": 0.7098375671714704, - "step": 1230 - }, - { - "epoch": 0.8998178506375227, - "grad_norm": 0.32576802372932434, - "learning_rate": 2e-05, - "loss": 1.0309, - "mean_token_accuracy": 0.7101613812069504, - "step": 1235 - }, - { - "epoch": 0.9034608378870674, - "grad_norm": 0.3285605311393738, - "learning_rate": 2e-05, - "loss": 1.0273, - "mean_token_accuracy": 0.7137640449438203, - "step": 1240 - }, - { - "epoch": 0.907103825136612, - "grad_norm": 0.32822245359420776, - "learning_rate": 2e-05, - "loss": 1.0257, - "mean_token_accuracy": 0.7128450170981925, - "step": 1245 - }, - { - "epoch": 0.9107468123861566, - "grad_norm": 0.32141098380088806, - "learning_rate": 2e-05, - "loss": 1.0354, - "mean_token_accuracy": 0.711327552515877, - "step": 1250 - }, - { - "epoch": 0.9143897996357013, - "grad_norm": 0.3102847635746002, - "learning_rate": 2e-05, - "loss": 1.0393, - "mean_token_accuracy": 0.7078682217879825, - "step": 1255 - }, - { - "epoch": 0.9180327868852459, - "grad_norm": 0.32655781507492065, - "learning_rate": 2e-05, - "loss": 1.0343, - "mean_token_accuracy": 0.7091322667318025, - "step": 1260 - }, - { - "epoch": 0.9216757741347905, - "grad_norm": 0.33519840240478516, - "learning_rate": 2e-05, - "loss": 1.0103, - "mean_token_accuracy": 0.7167684416218856, - "step": 1265 - }, - { - "epoch": 0.9253187613843351, - "grad_norm": 0.3166919946670532, - "learning_rate": 2e-05, - "loss": 1.0049, - "mean_token_accuracy": 0.7192385197850513, - "step": 1270 - }, - { - "epoch": 0.9289617486338798, - "grad_norm": 0.31707146763801575, - "learning_rate": 2e-05, - "loss": 1.0088, - "mean_token_accuracy": 0.7189606741573035, - "step": 1275 - }, - { - "epoch": 0.9326047358834244, - "grad_norm": 0.32707205414772034, - "learning_rate": 2e-05, - "loss": 1.016, - "mean_token_accuracy": 0.7171409379579873, - "step": 1280 - }, - { - "epoch": 0.936247723132969, - "grad_norm": 0.3101848065853119, - "learning_rate": 2e-05, - "loss": 1.0258, - "mean_token_accuracy": 0.711348925256473, - "step": 1285 - }, - { - "epoch": 0.9398907103825137, - "grad_norm": 0.36114785075187683, - "learning_rate": 2e-05, - "loss": 1.0088, - "mean_token_accuracy": 0.7187561064973134, - "step": 1290 - }, - { - "epoch": 0.9435336976320583, - "grad_norm": 0.30461356043815613, - "learning_rate": 2e-05, - "loss": 1.0016, - "mean_token_accuracy": 0.718658402540303, - "step": 1295 - }, - { - "epoch": 0.9471766848816029, - "grad_norm": 0.32494619488716125, - "learning_rate": 2e-05, - "loss": 1.0156, - "mean_token_accuracy": 0.7143166829506594, - "step": 1300 - }, - { - "epoch": 0.9471766848816029, - "eval_loss": 1.0324426889419556, - "eval_mean_token_accuracy": 0.7099415543626529, - "eval_runtime": 14.3232, - "eval_samples_per_second": 18.292, - "eval_steps_per_second": 1.187, - "step": 1300 - }, - { - "epoch": 0.9508196721311475, - "grad_norm": 0.3495274484157562, - "learning_rate": 2e-05, - "loss": 1.0224, - "mean_token_accuracy": 0.71535784074255, - "step": 1305 - }, - { - "epoch": 0.9544626593806922, - "grad_norm": 0.34313705563545227, - "learning_rate": 2e-05, - "loss": 1.0457, - "mean_token_accuracy": 0.7087200781631655, - "step": 1310 - }, - { - "epoch": 0.9581056466302368, - "grad_norm": 0.3228403925895691, - "learning_rate": 2e-05, - "loss": 1.0365, - "mean_token_accuracy": 0.7079201270151441, - "step": 1315 - }, - { - "epoch": 0.9617486338797814, - "grad_norm": 0.30886438488960266, - "learning_rate": 2e-05, - "loss": 0.9937, - "mean_token_accuracy": 0.7199041279921836, - "step": 1320 - }, - { - "epoch": 0.9653916211293261, - "grad_norm": 0.3261902630329132, - "learning_rate": 2e-05, - "loss": 1.0278, - "mean_token_accuracy": 0.7134495603321935, - "step": 1325 - }, - { - "epoch": 0.9690346083788707, - "grad_norm": 0.303875207901001, - "learning_rate": 2e-05, - "loss": 0.9981, - "mean_token_accuracy": 0.7204811919882755, - "step": 1330 - }, - { - "epoch": 0.9726775956284153, - "grad_norm": 0.33992454409599304, - "learning_rate": 2e-05, - "loss": 1.028, - "mean_token_accuracy": 0.7103535661944307, - "step": 1335 - }, - { - "epoch": 0.97632058287796, - "grad_norm": 0.35153236985206604, - "learning_rate": 2e-05, - "loss": 1.006, - "mean_token_accuracy": 0.7178431851489984, - "step": 1340 - }, - { - "epoch": 0.9799635701275046, - "grad_norm": 0.33356398344039917, - "learning_rate": 2e-05, - "loss": 1.0026, - "mean_token_accuracy": 0.720670493404983, - "step": 1345 - }, - { - "epoch": 0.9836065573770492, - "grad_norm": 0.32960230112075806, - "learning_rate": 2e-05, - "loss": 1.0319, - "mean_token_accuracy": 0.7102375427454812, - "step": 1350 - }, - { - "epoch": 0.9872495446265938, - "grad_norm": 0.3457731306552887, - "learning_rate": 2e-05, - "loss": 1.0324, - "mean_token_accuracy": 0.7103286771488085, - "step": 1355 - }, - { - "epoch": 0.9908925318761385, - "grad_norm": 0.34496790170669556, - "learning_rate": 2e-05, - "loss": 1.0129, - "mean_token_accuracy": 0.7149914509037616, - "step": 1360 - }, - { - "epoch": 0.994535519125683, - "grad_norm": 0.3273138105869293, - "learning_rate": 2e-05, - "loss": 1.0067, - "mean_token_accuracy": 0.7190278940364658, - "step": 1365 - }, - { - "epoch": 0.9981785063752276, - "grad_norm": 0.35900992155075073, - "learning_rate": 2e-05, - "loss": 1.0475, - "mean_token_accuracy": 0.7082559843673668, - "step": 1370 - }, - { - "epoch": 1.0021857923497268, - "grad_norm": 0.364685982465744, - "learning_rate": 2e-05, - "loss": 1.039, - "mean_token_accuracy": 0.720287528787773, - "step": 1375 - }, - { - "epoch": 1.0058287795992713, - "grad_norm": 0.3133888244628906, - "learning_rate": 2e-05, - "loss": 0.9677, - "mean_token_accuracy": 0.7262609916951638, - "step": 1380 - }, - { - "epoch": 1.009471766848816, - "grad_norm": 0.3458701968193054, - "learning_rate": 2e-05, - "loss": 0.9779, - "mean_token_accuracy": 0.7242458475818271, - "step": 1385 - }, - { - "epoch": 1.0131147540983607, - "grad_norm": 0.3019266426563263, - "learning_rate": 2e-05, - "loss": 0.9867, - "mean_token_accuracy": 0.7212231314118223, - "step": 1390 - }, - { - "epoch": 1.0167577413479052, - "grad_norm": 0.3397253751754761, - "learning_rate": 2e-05, - "loss": 0.9864, - "mean_token_accuracy": 0.7227711284807034, - "step": 1395 - }, - { - "epoch": 1.02040072859745, - "grad_norm": 0.3236973285675049, - "learning_rate": 2e-05, - "loss": 0.9789, - "mean_token_accuracy": 0.7222307034684905, - "step": 1400 - }, - { - "epoch": 1.02040072859745, - "eval_loss": 1.0278208255767822, - "eval_mean_token_accuracy": 0.7113203024833191, - "eval_runtime": 14.2654, - "eval_samples_per_second": 18.366, - "eval_steps_per_second": 1.192, - "step": 1400 - }, - { - "epoch": 1.0240437158469946, - "grad_norm": 0.3549124300479889, - "learning_rate": 2e-05, - "loss": 0.9882, - "mean_token_accuracy": 0.7225512945774304, - "step": 1405 - }, - { - "epoch": 1.027686703096539, - "grad_norm": 0.3246813416481018, - "learning_rate": 2e-05, - "loss": 1.0112, - "mean_token_accuracy": 0.7154952369320957, - "step": 1410 - }, - { - "epoch": 1.0313296903460838, - "grad_norm": 0.347151517868042, - "learning_rate": 2e-05, - "loss": 0.9723, - "mean_token_accuracy": 0.7266487542745483, - "step": 1415 - }, - { - "epoch": 1.0349726775956285, - "grad_norm": 0.35966718196868896, - "learning_rate": 2e-05, - "loss": 0.9863, - "mean_token_accuracy": 0.7234916951636542, - "step": 1420 - }, - { - "epoch": 1.038615664845173, - "grad_norm": 0.3249855935573578, - "learning_rate": 2e-05, - "loss": 0.9768, - "mean_token_accuracy": 0.7243191255495847, - "step": 1425 - }, - { - "epoch": 1.0422586520947177, - "grad_norm": 0.3137085437774658, - "learning_rate": 2e-05, - "loss": 0.9823, - "mean_token_accuracy": 0.721757449926722, - "step": 1430 - }, - { - "epoch": 1.0459016393442624, - "grad_norm": 0.3215838670730591, - "learning_rate": 2e-05, - "loss": 0.9939, - "mean_token_accuracy": 0.7202796775769419, - "step": 1435 - }, - { - "epoch": 1.0495446265938069, - "grad_norm": 0.3278406858444214, - "learning_rate": 2e-05, - "loss": 0.9775, - "mean_token_accuracy": 0.7260136785539814, - "step": 1440 - }, - { - "epoch": 1.0531876138433516, - "grad_norm": 0.32132425904273987, - "learning_rate": 2e-05, - "loss": 0.9815, - "mean_token_accuracy": 0.7215712017586711, - "step": 1445 - }, - { - "epoch": 1.0568306010928963, - "grad_norm": 0.29465213418006897, - "learning_rate": 2e-05, - "loss": 0.9981, - "mean_token_accuracy": 0.7183011724474841, - "step": 1450 - }, - { - "epoch": 1.0604735883424408, - "grad_norm": 0.32398778200149536, - "learning_rate": 2e-05, - "loss": 0.9833, - "mean_token_accuracy": 0.7217727161700049, - "step": 1455 - }, - { - "epoch": 1.0641165755919855, - "grad_norm": 0.3281712532043457, - "learning_rate": 2e-05, - "loss": 0.9946, - "mean_token_accuracy": 0.7185686983605112, - "step": 1460 - }, - { - "epoch": 1.06775956284153, - "grad_norm": 0.31330931186676025, - "learning_rate": 2e-05, - "loss": 0.967, - "mean_token_accuracy": 0.727317415730337, - "step": 1465 - }, - { - "epoch": 1.0714025500910747, - "grad_norm": 0.33476975560188293, - "learning_rate": 2e-05, - "loss": 0.952, - "mean_token_accuracy": 0.7321110161211529, - "step": 1470 - }, - { - "epoch": 1.0750455373406194, - "grad_norm": 0.3155185580253601, - "learning_rate": 2e-05, - "loss": 0.982, - "mean_token_accuracy": 0.7213910600879336, - "step": 1475 - }, - { - "epoch": 1.0786885245901638, - "grad_norm": 0.3037089407444, - "learning_rate": 2e-05, - "loss": 0.9817, - "mean_token_accuracy": 0.722349729518869, - "step": 1480 - }, - { - "epoch": 1.0823315118397085, - "grad_norm": 0.3598880171775818, - "learning_rate": 2e-05, - "loss": 1.0052, - "mean_token_accuracy": 0.7150433561309233, - "step": 1485 - }, - { - "epoch": 1.0859744990892533, - "grad_norm": 0.32113686203956604, - "learning_rate": 2e-05, - "loss": 0.9745, - "mean_token_accuracy": 0.7253543328356823, - "step": 1490 - }, - { - "epoch": 1.0896174863387977, - "grad_norm": 0.3267917335033417, - "learning_rate": 2e-05, - "loss": 0.9914, - "mean_token_accuracy": 0.7191438690766976, - "step": 1495 - }, - { - "epoch": 1.0932604735883424, - "grad_norm": 0.33139562606811523, - "learning_rate": 2e-05, - "loss": 0.9829, - "mean_token_accuracy": 0.7221024670249144, - "step": 1500 - }, - { - "epoch": 1.0932604735883424, - "eval_loss": 1.0242724418640137, - "eval_mean_token_accuracy": 0.7120056856343084, - "eval_runtime": 14.3696, - "eval_samples_per_second": 18.233, - "eval_steps_per_second": 1.183, - "step": 1500 - }, - { - "epoch": 1.0969034608378871, - "grad_norm": 0.3266630172729492, - "learning_rate": 2e-05, - "loss": 0.9739, - "mean_token_accuracy": 0.7254488275525159, - "step": 1505 - }, - { - "epoch": 1.1005464480874316, - "grad_norm": 0.3302668333053589, - "learning_rate": 2e-05, - "loss": 0.9751, - "mean_token_accuracy": 0.7234306301905229, - "step": 1510 - }, - { - "epoch": 1.1041894353369763, - "grad_norm": 0.32395848631858826, - "learning_rate": 2e-05, - "loss": 0.9685, - "mean_token_accuracy": 0.7260014655593551, - "step": 1515 - }, - { - "epoch": 1.107832422586521, - "grad_norm": 0.31805044412612915, - "learning_rate": 2e-05, - "loss": 0.992, - "mean_token_accuracy": 0.7183103321934539, - "step": 1520 - }, - { - "epoch": 1.1114754098360655, - "grad_norm": 0.3312923312187195, - "learning_rate": 2e-05, - "loss": 0.9656, - "mean_token_accuracy": 0.7273204689789936, - "step": 1525 - }, - { - "epoch": 1.1151183970856102, - "grad_norm": 0.31044256687164307, - "learning_rate": 2e-05, - "loss": 0.9764, - "mean_token_accuracy": 0.7244168295065949, - "step": 1530 - }, - { - "epoch": 1.118761384335155, - "grad_norm": 0.343591570854187, - "learning_rate": 2e-05, - "loss": 0.9865, - "mean_token_accuracy": 0.7209636052760137, - "step": 1535 - }, - { - "epoch": 1.1224043715846994, - "grad_norm": 0.3052913248538971, - "learning_rate": 2e-05, - "loss": 0.9773, - "mean_token_accuracy": 0.7230153883732291, - "step": 1540 - }, - { - "epoch": 1.1260473588342441, - "grad_norm": 0.32441943883895874, - "learning_rate": 2e-05, - "loss": 1.0037, - "mean_token_accuracy": 0.7171042989741085, - "step": 1545 - }, - { - "epoch": 1.1296903460837888, - "grad_norm": 0.33394140005111694, - "learning_rate": 2e-05, - "loss": 0.9877, - "mean_token_accuracy": 0.720826209086468, - "step": 1550 - }, - { - "epoch": 1.1333333333333333, - "grad_norm": 0.34548622369766235, - "learning_rate": 2e-05, - "loss": 0.989, - "mean_token_accuracy": 0.7204445530043968, - "step": 1555 - }, - { - "epoch": 1.136976320582878, - "grad_norm": 0.3255009949207306, - "learning_rate": 2e-05, - "loss": 0.9615, - "mean_token_accuracy": 0.7266609672691743, - "step": 1560 - }, - { - "epoch": 1.1406193078324225, - "grad_norm": 0.3292433023452759, - "learning_rate": 2e-05, - "loss": 1.0, - "mean_token_accuracy": 0.7187103077674646, - "step": 1565 - }, - { - "epoch": 1.1442622950819672, - "grad_norm": 0.3490438759326935, - "learning_rate": 2e-05, - "loss": 0.9932, - "mean_token_accuracy": 0.7185576453346362, - "step": 1570 - }, - { - "epoch": 1.147905282331512, - "grad_norm": 0.32841020822525024, - "learning_rate": 2e-05, - "loss": 0.9784, - "mean_token_accuracy": 0.7231283585735222, - "step": 1575 - }, - { - "epoch": 1.1515482695810564, - "grad_norm": 0.3233148753643036, - "learning_rate": 2e-05, - "loss": 0.9649, - "mean_token_accuracy": 0.7264472398632144, - "step": 1580 - }, - { - "epoch": 1.155191256830601, - "grad_norm": 0.29847216606140137, - "learning_rate": 2e-05, - "loss": 0.9563, - "mean_token_accuracy": 0.7308133854421105, - "step": 1585 - }, - { - "epoch": 1.1588342440801458, - "grad_norm": 0.31668856739997864, - "learning_rate": 2e-05, - "loss": 0.9666, - "mean_token_accuracy": 0.726267098192477, - "step": 1590 - }, - { - "epoch": 1.1624772313296903, - "grad_norm": 0.32061877846717834, - "learning_rate": 2e-05, - "loss": 0.9824, - "mean_token_accuracy": 0.7208720078163166, - "step": 1595 - }, - { - "epoch": 1.166120218579235, - "grad_norm": 0.3160786032676697, - "learning_rate": 2e-05, - "loss": 1.0117, - "mean_token_accuracy": 0.7149151196873472, - "step": 1600 - }, - { - "epoch": 1.166120218579235, - "eval_loss": 1.0199487209320068, - "eval_mean_token_accuracy": 0.7121113575576578, - "eval_runtime": 14.3112, - "eval_samples_per_second": 18.307, - "eval_steps_per_second": 1.188, - "step": 1600 - }, - { - "epoch": 1.1697632058287797, - "grad_norm": 0.32467761635780334, - "learning_rate": 2e-05, - "loss": 0.9945, - "mean_token_accuracy": 0.7184507816316562, - "step": 1605 - }, - { - "epoch": 1.1734061930783242, - "grad_norm": 0.31809383630752563, - "learning_rate": 2e-05, - "loss": 0.9654, - "mean_token_accuracy": 0.726557156814851, - "step": 1610 - }, - { - "epoch": 1.1770491803278689, - "grad_norm": 0.33797189593315125, - "learning_rate": 2e-05, - "loss": 0.9995, - "mean_token_accuracy": 0.7159257449926724, - "step": 1615 - }, - { - "epoch": 1.1806921675774136, - "grad_norm": 0.315861314535141, - "learning_rate": 2e-05, - "loss": 0.9621, - "mean_token_accuracy": 0.7287493893502688, - "step": 1620 - }, - { - "epoch": 1.184335154826958, - "grad_norm": 0.31979405879974365, - "learning_rate": 2e-05, - "loss": 0.9607, - "mean_token_accuracy": 0.7283768930141671, - "step": 1625 - }, - { - "epoch": 1.1879781420765028, - "grad_norm": 0.31935200095176697, - "learning_rate": 2e-05, - "loss": 0.9692, - "mean_token_accuracy": 0.7262060332193452, - "step": 1630 - }, - { - "epoch": 1.1916211293260472, - "grad_norm": 0.3271839916706085, - "learning_rate": 2e-05, - "loss": 0.9718, - "mean_token_accuracy": 0.7253847093307277, - "step": 1635 - }, - { - "epoch": 1.195264116575592, - "grad_norm": 0.30795371532440186, - "learning_rate": 2e-05, - "loss": 0.9508, - "mean_token_accuracy": 0.7300012212994627, - "step": 1640 - }, - { - "epoch": 1.1989071038251367, - "grad_norm": 0.3378467857837677, - "learning_rate": 2e-05, - "loss": 0.9933, - "mean_token_accuracy": 0.7200476306790425, - "step": 1645 - }, - { - "epoch": 1.2025500910746811, - "grad_norm": 0.3224976658821106, - "learning_rate": 2e-05, - "loss": 0.9886, - "mean_token_accuracy": 0.7199761428617866, - "step": 1650 - }, - { - "epoch": 1.2061930783242258, - "grad_norm": 0.30547860264778137, - "learning_rate": 2e-05, - "loss": 0.9856, - "mean_token_accuracy": 0.7202552515876893, - "step": 1655 - }, - { - "epoch": 1.2098360655737705, - "grad_norm": 0.346476674079895, - "learning_rate": 2e-05, - "loss": 0.9644, - "mean_token_accuracy": 0.7255495847581827, - "step": 1660 - }, - { - "epoch": 1.213479052823315, - "grad_norm": 0.3571411967277527, - "learning_rate": 2e-05, - "loss": 0.966, - "mean_token_accuracy": 0.7260930630190523, - "step": 1665 - }, - { - "epoch": 1.2171220400728597, - "grad_norm": 0.3297497034072876, - "learning_rate": 2e-05, - "loss": 0.9826, - "mean_token_accuracy": 0.7229054714215927, - "step": 1670 - }, - { - "epoch": 1.2207650273224044, - "grad_norm": 0.3237413167953491, - "learning_rate": 2e-05, - "loss": 0.9728, - "mean_token_accuracy": 0.7240382266731802, - "step": 1675 - }, - { - "epoch": 1.224408014571949, - "grad_norm": 0.29469963908195496, - "learning_rate": 2e-05, - "loss": 0.9388, - "mean_token_accuracy": 0.732352222765022, - "step": 1680 - }, - { - "epoch": 1.2280510018214936, - "grad_norm": 0.33955973386764526, - "learning_rate": 2e-05, - "loss": 0.9725, - "mean_token_accuracy": 0.722966536394724, - "step": 1685 - }, - { - "epoch": 1.2316939890710383, - "grad_norm": 0.29405203461647034, - "learning_rate": 2e-05, - "loss": 0.9681, - "mean_token_accuracy": 0.7255404250122128, - "step": 1690 - }, - { - "epoch": 1.2353369763205828, - "grad_norm": 0.3296028673648834, - "learning_rate": 2e-05, - "loss": 0.9769, - "mean_token_accuracy": 0.7248809233023937, - "step": 1695 - }, - { - "epoch": 1.2389799635701275, - "grad_norm": 0.3257172703742981, - "learning_rate": 2e-05, - "loss": 0.9819, - "mean_token_accuracy": 0.723152784562775, - "step": 1700 - }, - { - "epoch": 1.2389799635701275, - "eval_loss": 1.0157443284988403, - "eval_mean_token_accuracy": 0.7135474758957674, - "eval_runtime": 14.3005, - "eval_samples_per_second": 18.321, - "eval_steps_per_second": 1.189, - "step": 1700 - }, - { - "epoch": 1.2426229508196722, - "grad_norm": 0.30303245782852173, - "learning_rate": 2e-05, - "loss": 0.9754, - "mean_token_accuracy": 0.7243343917928676, - "step": 1705 - }, - { - "epoch": 1.2462659380692167, - "grad_norm": 0.29583922028541565, - "learning_rate": 2e-05, - "loss": 0.9914, - "mean_token_accuracy": 0.7174801125055479, - "step": 1710 - }, - { - "epoch": 1.2499089253187614, - "grad_norm": 0.3175574541091919, - "learning_rate": 2e-05, - "loss": 0.9695, - "mean_token_accuracy": 0.7243374450415242, - "step": 1715 - }, - { - "epoch": 1.2535519125683061, - "grad_norm": 0.32751041650772095, - "learning_rate": 2e-05, - "loss": 0.9658, - "mean_token_accuracy": 0.7266762335124572, - "step": 1720 - }, - { - "epoch": 1.2571948998178506, - "grad_norm": 0.3620862662792206, - "learning_rate": 2e-05, - "loss": 0.9437, - "mean_token_accuracy": 0.7318392769907182, - "step": 1725 - }, - { - "epoch": 1.2608378870673953, - "grad_norm": 0.31344443559646606, - "learning_rate": 2e-05, - "loss": 0.9519, - "mean_token_accuracy": 0.7299096238397655, - "step": 1730 - }, - { - "epoch": 1.26448087431694, - "grad_norm": 0.3178603947162628, - "learning_rate": 2e-05, - "loss": 0.9678, - "mean_token_accuracy": 0.7239557889594528, - "step": 1735 - }, - { - "epoch": 1.2681238615664845, - "grad_norm": 0.3062346279621124, - "learning_rate": 2e-05, - "loss": 0.9769, - "mean_token_accuracy": 0.7235344406448461, - "step": 1740 - }, - { - "epoch": 1.2717668488160292, - "grad_norm": 0.3273670971393585, - "learning_rate": 2e-05, - "loss": 0.9738, - "mean_token_accuracy": 0.7229634831460674, - "step": 1745 - }, - { - "epoch": 1.275409836065574, - "grad_norm": 0.3440561592578888, - "learning_rate": 2e-05, - "loss": 0.9708, - "mean_token_accuracy": 0.7243130190522715, - "step": 1750 - }, - { - "epoch": 1.2790528233151184, - "grad_norm": 0.3306814730167389, - "learning_rate": 2e-05, - "loss": 0.9763, - "mean_token_accuracy": 0.7213208353688324, - "step": 1755 - }, - { - "epoch": 1.282695810564663, - "grad_norm": 0.3273833990097046, - "learning_rate": 2e-05, - "loss": 0.9462, - "mean_token_accuracy": 0.7324743527112848, - "step": 1760 - }, - { - "epoch": 1.2863387978142076, - "grad_norm": 0.32791441679000854, - "learning_rate": 2e-05, - "loss": 0.9993, - "mean_token_accuracy": 0.7166310454323399, - "step": 1765 - }, - { - "epoch": 1.2899817850637523, - "grad_norm": 0.31501585245132446, - "learning_rate": 2e-05, - "loss": 0.9759, - "mean_token_accuracy": 0.7226764777723499, - "step": 1770 - }, - { - "epoch": 1.293624772313297, - "grad_norm": 0.3403507173061371, - "learning_rate": 2e-05, - "loss": 0.975, - "mean_token_accuracy": 0.7232352222765023, - "step": 1775 - }, - { - "epoch": 1.2972677595628415, - "grad_norm": 0.31439098715782166, - "learning_rate": 2e-05, - "loss": 0.9892, - "mean_token_accuracy": 0.719667688261515, - "step": 1780 - }, - { - "epoch": 1.3009107468123862, - "grad_norm": 0.32850295305252075, - "learning_rate": 2e-05, - "loss": 0.9702, - "mean_token_accuracy": 0.725277845627748, - "step": 1785 - }, - { - "epoch": 1.3045537340619306, - "grad_norm": 0.3256784975528717, - "learning_rate": 2e-05, - "loss": 0.9539, - "mean_token_accuracy": 0.7279036394723986, - "step": 1790 - }, - { - "epoch": 1.3081967213114754, - "grad_norm": 0.34349489212036133, - "learning_rate": 2e-05, - "loss": 0.9646, - "mean_token_accuracy": 0.7258457498778701, - "step": 1795 - }, - { - "epoch": 1.31183970856102, - "grad_norm": 0.3172832429409027, - "learning_rate": 2e-05, - "loss": 0.976, - "mean_token_accuracy": 0.7235344406448462, - "step": 1800 - }, - { - "epoch": 1.31183970856102, - "eval_loss": 1.0121958255767822, - "eval_mean_token_accuracy": 0.7141900481251602, - "eval_runtime": 14.3175, - "eval_samples_per_second": 18.299, - "eval_steps_per_second": 1.187, - "step": 1800 - }, - { - "epoch": 1.3154826958105645, - "grad_norm": 0.3244900405406952, - "learning_rate": 2e-05, - "loss": 0.9801, - "mean_token_accuracy": 0.7222307034684906, - "step": 1805 - }, - { - "epoch": 1.3191256830601092, - "grad_norm": 0.37057653069496155, - "learning_rate": 2e-05, - "loss": 0.972, - "mean_token_accuracy": 0.7246091841719591, - "step": 1810 - }, - { - "epoch": 1.322768670309654, - "grad_norm": 0.3139060437679291, - "learning_rate": 2e-05, - "loss": 0.9705, - "mean_token_accuracy": 0.7253602833414752, - "step": 1815 - }, - { - "epoch": 1.3264116575591984, - "grad_norm": 0.3245513439178467, - "learning_rate": 2e-05, - "loss": 0.9489, - "mean_token_accuracy": 0.7318087445041525, - "step": 1820 - }, - { - "epoch": 1.3300546448087431, - "grad_norm": 0.31191951036453247, - "learning_rate": 2e-05, - "loss": 0.9617, - "mean_token_accuracy": 0.7279738641914998, - "step": 1825 - }, - { - "epoch": 1.3336976320582878, - "grad_norm": 0.3181591331958771, - "learning_rate": 2e-05, - "loss": 0.9389, - "mean_token_accuracy": 0.7314179286761113, - "step": 1830 - }, - { - "epoch": 1.3373406193078323, - "grad_norm": 0.32525399327278137, - "learning_rate": 2e-05, - "loss": 1.0118, - "mean_token_accuracy": 0.71133365901319, - "step": 1835 - }, - { - "epoch": 1.340983606557377, - "grad_norm": 0.3335639238357544, - "learning_rate": 2e-05, - "loss": 0.9624, - "mean_token_accuracy": 0.72753724963361, - "step": 1840 - }, - { - "epoch": 1.3446265938069217, - "grad_norm": 0.36104097962379456, - "learning_rate": 2e-05, - "loss": 0.9731, - "mean_token_accuracy": 0.7239160967269173, - "step": 1845 - }, - { - "epoch": 1.3482695810564662, - "grad_norm": 0.3039902448654175, - "learning_rate": 2e-05, - "loss": 0.966, - "mean_token_accuracy": 0.7241908891060088, - "step": 1850 - }, - { - "epoch": 1.351912568306011, - "grad_norm": 0.30735480785369873, - "learning_rate": 2e-05, - "loss": 0.9614, - "mean_token_accuracy": 0.7256656082071324, - "step": 1855 - }, - { - "epoch": 1.3555555555555556, - "grad_norm": 0.31513282656669617, - "learning_rate": 2e-05, - "loss": 0.9668, - "mean_token_accuracy": 0.7249938935026867, - "step": 1860 - }, - { - "epoch": 1.3591985428051, - "grad_norm": 0.3390577733516693, - "learning_rate": 2e-05, - "loss": 0.9512, - "mean_token_accuracy": 0.7295585002442598, - "step": 1865 - }, - { - "epoch": 1.3628415300546448, - "grad_norm": 0.309792697429657, - "learning_rate": 2e-05, - "loss": 0.9693, - "mean_token_accuracy": 0.7264594528578409, - "step": 1870 - }, - { - "epoch": 1.3664845173041895, - "grad_norm": 0.3256780803203583, - "learning_rate": 2e-05, - "loss": 0.965, - "mean_token_accuracy": 0.7258304836345872, - "step": 1875 - }, - { - "epoch": 1.370127504553734, - "grad_norm": 0.3005531132221222, - "learning_rate": 2e-05, - "loss": 0.9479, - "mean_token_accuracy": 0.7296256717147045, - "step": 1880 - }, - { - "epoch": 1.3737704918032787, - "grad_norm": 0.32955485582351685, - "learning_rate": 2e-05, - "loss": 0.9692, - "mean_token_accuracy": 0.7237634342940888, - "step": 1885 - }, - { - "epoch": 1.3774134790528234, - "grad_norm": 0.3305559456348419, - "learning_rate": 2e-05, - "loss": 0.9599, - "mean_token_accuracy": 0.7268288959452859, - "step": 1890 - }, - { - "epoch": 1.381056466302368, - "grad_norm": 0.3320905268192291, - "learning_rate": 2e-05, - "loss": 0.9589, - "mean_token_accuracy": 0.7275158768930142, - "step": 1895 - }, - { - "epoch": 1.3846994535519126, - "grad_norm": 0.3323926329612732, - "learning_rate": 2e-05, - "loss": 0.9708, - "mean_token_accuracy": 0.7250183194919394, - "step": 1900 - }, - { - "epoch": 1.3846994535519126, - "eval_loss": 1.0094226598739624, - "eval_mean_token_accuracy": 0.7149347002206592, - "eval_runtime": 14.3499, - "eval_samples_per_second": 18.258, - "eval_steps_per_second": 1.185, - "step": 1900 - }, - { - "epoch": 1.3883424408014573, - "grad_norm": 0.33546602725982666, - "learning_rate": 2e-05, - "loss": 0.9688, - "mean_token_accuracy": 0.725732779677577, - "step": 1905 - }, - { - "epoch": 1.3919854280510018, - "grad_norm": 0.34871605038642883, - "learning_rate": 2e-05, - "loss": 0.9566, - "mean_token_accuracy": 0.7274456521739128, - "step": 1910 - }, - { - "epoch": 1.3956284153005465, - "grad_norm": 0.3340302109718323, - "learning_rate": 2e-05, - "loss": 0.9665, - "mean_token_accuracy": 0.7259342940889107, - "step": 1915 - }, - { - "epoch": 1.3992714025500912, - "grad_norm": 0.30206888914108276, - "learning_rate": 2e-05, - "loss": 0.9742, - "mean_token_accuracy": 0.7233756717147044, - "step": 1920 - }, - { - "epoch": 1.4029143897996357, - "grad_norm": 0.29673662781715393, - "learning_rate": 2e-05, - "loss": 0.9485, - "mean_token_accuracy": 0.731414875427455, - "step": 1925 - }, - { - "epoch": 1.4065573770491804, - "grad_norm": 0.3166307508945465, - "learning_rate": 2e-05, - "loss": 0.9447, - "mean_token_accuracy": 0.7307217879824133, - "step": 1930 - }, - { - "epoch": 1.410200364298725, - "grad_norm": 0.29243019223213196, - "learning_rate": 2e-05, - "loss": 0.9695, - "mean_token_accuracy": 0.7265571568148511, - "step": 1935 - }, - { - "epoch": 1.4138433515482696, - "grad_norm": 0.32718804478645325, - "learning_rate": 2e-05, - "loss": 0.9611, - "mean_token_accuracy": 0.7267220322423057, - "step": 1940 - }, - { - "epoch": 1.4174863387978143, - "grad_norm": 0.32046985626220703, - "learning_rate": 2e-05, - "loss": 0.9627, - "mean_token_accuracy": 0.7269021739130435, - "step": 1945 - }, - { - "epoch": 1.4211293260473588, - "grad_norm": 0.31391212344169617, - "learning_rate": 2e-05, - "loss": 0.9792, - "mean_token_accuracy": 0.7224474841231071, - "step": 1950 - }, - { - "epoch": 1.4247723132969035, - "grad_norm": 0.2963239848613739, - "learning_rate": 2e-05, - "loss": 0.9413, - "mean_token_accuracy": 0.7336468001954081, - "step": 1955 - }, - { - "epoch": 1.4284153005464482, - "grad_norm": 0.30880090594291687, - "learning_rate": 2e-05, - "loss": 0.9516, - "mean_token_accuracy": 0.7294607962872497, - "step": 1960 - }, - { - "epoch": 1.4320582877959926, - "grad_norm": 0.2999388873577118, - "learning_rate": 2e-05, - "loss": 0.9428, - "mean_token_accuracy": 0.7320010991695163, - "step": 1965 - }, - { - "epoch": 1.4357012750455374, - "grad_norm": 0.3056471049785614, - "learning_rate": 2e-05, - "loss": 0.9523, - "mean_token_accuracy": 0.7304927943331705, - "step": 1970 - }, - { - "epoch": 1.4393442622950818, - "grad_norm": 0.3117208182811737, - "learning_rate": 2e-05, - "loss": 0.9727, - "mean_token_accuracy": 0.7226520517830972, - "step": 1975 - }, - { - "epoch": 1.4429872495446265, - "grad_norm": 0.3220628499984741, - "learning_rate": 2e-05, - "loss": 0.9786, - "mean_token_accuracy": 0.723250488519785, - "step": 1980 - }, - { - "epoch": 1.4466302367941712, - "grad_norm": 0.3221818804740906, - "learning_rate": 2e-05, - "loss": 0.9713, - "mean_token_accuracy": 0.7243985100146556, - "step": 1985 - }, - { - "epoch": 1.4502732240437157, - "grad_norm": 0.30782631039619446, - "learning_rate": 2e-05, - "loss": 0.9525, - "mean_token_accuracy": 0.7300378602833414, - "step": 1990 - }, - { - "epoch": 1.4539162112932604, - "grad_norm": 0.32362139225006104, - "learning_rate": 2e-05, - "loss": 0.9706, - "mean_token_accuracy": 0.7234978016609672, - "step": 1995 - }, - { - "epoch": 1.4575591985428051, - "grad_norm": 0.31744489073753357, - "learning_rate": 2e-05, - "loss": 0.9497, - "mean_token_accuracy": 0.7307401074743527, - "step": 2000 - }, - { - "epoch": 1.4575591985428051, - "eval_loss": 1.0048604011535645, - "eval_mean_token_accuracy": 0.7154876811925385, - "eval_runtime": 14.3295, - "eval_samples_per_second": 18.284, - "eval_steps_per_second": 1.186, - "step": 2000 - }, - { - "epoch": 1.4612021857923496, - "grad_norm": 0.34834545850753784, - "learning_rate": 2e-05, - "loss": 0.9786, - "mean_token_accuracy": 0.7242458475818271, - "step": 2005 - }, - { - "epoch": 1.4648451730418943, - "grad_norm": 0.344738632440567, - "learning_rate": 2e-05, - "loss": 0.9607, - "mean_token_accuracy": 0.7274670249145089, - "step": 2010 - }, - { - "epoch": 1.468488160291439, - "grad_norm": 0.32923632860183716, - "learning_rate": 2e-05, - "loss": 0.9651, - "mean_token_accuracy": 0.7261327552515875, - "step": 2015 - }, - { - "epoch": 1.4721311475409835, - "grad_norm": 0.33209770917892456, - "learning_rate": 2e-05, - "loss": 0.9664, - "mean_token_accuracy": 0.7245175867122619, - "step": 2020 - }, - { - "epoch": 1.4757741347905282, - "grad_norm": 0.30448663234710693, - "learning_rate": 2e-05, - "loss": 0.961, - "mean_token_accuracy": 0.7256442354665364, - "step": 2025 - }, - { - "epoch": 1.479417122040073, - "grad_norm": 0.29810741543769836, - "learning_rate": 2e-05, - "loss": 0.9683, - "mean_token_accuracy": 0.7260777967757693, - "step": 2030 - }, - { - "epoch": 1.4830601092896174, - "grad_norm": 0.3135621249675751, - "learning_rate": 2e-05, - "loss": 0.9706, - "mean_token_accuracy": 0.7255373717635566, - "step": 2035 - }, - { - "epoch": 1.486703096539162, - "grad_norm": 0.32791656255722046, - "learning_rate": 2e-05, - "loss": 0.943, - "mean_token_accuracy": 0.7303187591597459, - "step": 2040 - }, - { - "epoch": 1.4903460837887068, - "grad_norm": 0.3219871520996094, - "learning_rate": 2e-05, - "loss": 0.9485, - "mean_token_accuracy": 0.7297630679042502, - "step": 2045 - }, - { - "epoch": 1.4939890710382513, - "grad_norm": 0.30337387323379517, - "learning_rate": 2e-05, - "loss": 0.9512, - "mean_token_accuracy": 0.7309752076209086, - "step": 2050 - }, - { - "epoch": 1.497632058287796, - "grad_norm": 0.3070669174194336, - "learning_rate": 2e-05, - "loss": 0.9529, - "mean_token_accuracy": 0.7285600879335614, - "step": 2055 - }, - { - "epoch": 1.5012750455373407, - "grad_norm": 0.3109239637851715, - "learning_rate": 2e-05, - "loss": 0.9473, - "mean_token_accuracy": 0.730019540791402, - "step": 2060 - }, - { - "epoch": 1.5049180327868852, - "grad_norm": 0.3539317846298218, - "learning_rate": 2e-05, - "loss": 0.9641, - "mean_token_accuracy": 0.7243801905227162, - "step": 2065 - }, - { - "epoch": 1.50856102003643, - "grad_norm": 0.33243849873542786, - "learning_rate": 2e-05, - "loss": 0.9736, - "mean_token_accuracy": 0.722575720566683, - "step": 2070 - }, - { - "epoch": 1.5122040072859746, - "grad_norm": 0.3301512897014618, - "learning_rate": 2e-05, - "loss": 0.9709, - "mean_token_accuracy": 0.7232603488129995, - "step": 2075 - }, - { - "epoch": 1.515846994535519, - "grad_norm": 0.34674686193466187, - "learning_rate": 2e-05, - "loss": 0.9957, - "mean_token_accuracy": 0.716304347826087, - "step": 2080 - }, - { - "epoch": 1.5194899817850638, - "grad_norm": 0.34269919991493225, - "learning_rate": 2e-05, - "loss": 0.9578, - "mean_token_accuracy": 0.7280013434294089, - "step": 2085 - }, - { - "epoch": 1.5231329690346085, - "grad_norm": 0.3565721809864044, - "learning_rate": 2e-05, - "loss": 0.9462, - "mean_token_accuracy": 0.7306362970200293, - "step": 2090 - }, - { - "epoch": 1.526775956284153, - "grad_norm": 0.32590150833129883, - "learning_rate": 2e-05, - "loss": 0.9778, - "mean_token_accuracy": 0.7213727405959941, - "step": 2095 - }, - { - "epoch": 1.5304189435336977, - "grad_norm": 0.3151698112487793, - "learning_rate": 2e-05, - "loss": 0.9566, - "mean_token_accuracy": 0.7274731314118223, - "step": 2100 - }, - { - "epoch": 1.5304189435336977, - "eval_loss": 1.0019382238388062, - "eval_mean_token_accuracy": 0.7163066606906044, - "eval_runtime": 14.3168, - "eval_samples_per_second": 18.3, - "eval_steps_per_second": 1.187, - "step": 2100 - }, - { - "epoch": 1.5340619307832424, - "grad_norm": 0.31888285279273987, - "learning_rate": 2e-05, - "loss": 0.9413, - "mean_token_accuracy": 0.7308591841719589, - "step": 2105 - }, - { - "epoch": 1.5377049180327869, - "grad_norm": 0.33658313751220703, - "learning_rate": 2e-05, - "loss": 0.9439, - "mean_token_accuracy": 0.7299126770884221, - "step": 2110 - }, - { - "epoch": 1.5413479052823316, - "grad_norm": 0.3298831880092621, - "learning_rate": 2e-05, - "loss": 0.9699, - "mean_token_accuracy": 0.7238916707376648, - "step": 2115 - }, - { - "epoch": 1.5449908925318763, - "grad_norm": 0.323344886302948, - "learning_rate": 2e-05, - "loss": 0.9464, - "mean_token_accuracy": 0.7299035173424525, - "step": 2120 - }, - { - "epoch": 1.5486338797814208, - "grad_norm": 0.3089609742164612, - "learning_rate": 2e-05, - "loss": 0.9593, - "mean_token_accuracy": 0.726954079140205, - "step": 2125 - }, - { - "epoch": 1.5522768670309652, - "grad_norm": 0.2949562072753906, - "learning_rate": 2e-05, - "loss": 0.9563, - "mean_token_accuracy": 0.7273143624816807, - "step": 2130 - }, - { - "epoch": 1.5559198542805102, - "grad_norm": 0.3209057152271271, - "learning_rate": 2e-05, - "loss": 0.9712, - "mean_token_accuracy": 0.7229115779189057, - "step": 2135 - }, - { - "epoch": 1.5595628415300546, - "grad_norm": 0.34897011518478394, - "learning_rate": 2e-05, - "loss": 0.9625, - "mean_token_accuracy": 0.7247007816316561, - "step": 2140 - }, - { - "epoch": 1.5632058287795991, - "grad_norm": 0.3226200044155121, - "learning_rate": 2e-05, - "loss": 0.9363, - "mean_token_accuracy": 0.7320926966292134, - "step": 2145 - }, - { - "epoch": 1.566848816029144, - "grad_norm": 0.3218037486076355, - "learning_rate": 2e-05, - "loss": 0.9749, - "mean_token_accuracy": 0.7230795065950171, - "step": 2150 - }, - { - "epoch": 1.5704918032786885, - "grad_norm": 0.309461772441864, - "learning_rate": 2e-05, - "loss": 0.9367, - "mean_token_accuracy": 0.7339643380556913, - "step": 2155 - }, - { - "epoch": 1.574134790528233, - "grad_norm": 0.35405707359313965, - "learning_rate": 2e-05, - "loss": 0.9549, - "mean_token_accuracy": 0.7288776257938446, - "step": 2160 - }, - { - "epoch": 1.5777777777777777, - "grad_norm": 0.3519957661628723, - "learning_rate": 2e-05, - "loss": 0.961, - "mean_token_accuracy": 0.7253358573522226, - "step": 2165 - }, - { - "epoch": 1.5814207650273224, - "grad_norm": 0.3647383451461792, - "learning_rate": 2e-05, - "loss": 0.9372, - "mean_token_accuracy": 0.7328651685393259, - "step": 2170 - }, - { - "epoch": 1.585063752276867, - "grad_norm": 0.3361128568649292, - "learning_rate": 2e-05, - "loss": 0.9645, - "mean_token_accuracy": 0.7258732291157791, - "step": 2175 - }, - { - "epoch": 1.5887067395264116, - "grad_norm": 0.3090117275714874, - "learning_rate": 2e-05, - "loss": 0.9645, - "mean_token_accuracy": 0.7250837591040762, - "step": 2180 - }, - { - "epoch": 1.5923497267759563, - "grad_norm": 0.3278610110282898, - "learning_rate": 2e-05, - "loss": 0.9489, - "mean_token_accuracy": 0.7295432340009771, - "step": 2185 - }, - { - "epoch": 1.5959927140255008, - "grad_norm": 0.3234911561012268, - "learning_rate": 2e-05, - "loss": 0.9484, - "mean_token_accuracy": 0.7298241328773816, - "step": 2190 - }, - { - "epoch": 1.5996357012750455, - "grad_norm": 0.3117011487483978, - "learning_rate": 2e-05, - "loss": 0.957, - "mean_token_accuracy": 0.7265479970688812, - "step": 2195 - }, - { - "epoch": 1.6032786885245902, - "grad_norm": 0.3146551549434662, - "learning_rate": 2e-05, - "loss": 0.9702, - "mean_token_accuracy": 0.7246213971665852, - "step": 2200 - }, - { - "epoch": 1.6032786885245902, - "eval_loss": 0.9981810450553894, - "eval_mean_token_accuracy": 0.7169864646888545, - "eval_runtime": 14.4002, - "eval_samples_per_second": 18.194, - "eval_steps_per_second": 1.181, - "step": 2200 - }, - { - "epoch": 1.6069216757741347, - "grad_norm": 0.3198413848876953, - "learning_rate": 2e-05, - "loss": 0.9378, - "mean_token_accuracy": 0.7318881289692233, - "step": 2205 - }, - { - "epoch": 1.6105646630236794, - "grad_norm": 0.3311745524406433, - "learning_rate": 2e-05, - "loss": 0.9751, - "mean_token_accuracy": 0.7230550806057646, - "step": 2210 - }, - { - "epoch": 1.614207650273224, - "grad_norm": 0.3199496269226074, - "learning_rate": 2e-05, - "loss": 0.9589, - "mean_token_accuracy": 0.7272288715192966, - "step": 2215 - }, - { - "epoch": 1.6178506375227686, - "grad_norm": 0.3042522668838501, - "learning_rate": 2e-05, - "loss": 0.9541, - "mean_token_accuracy": 0.7277082315583782, - "step": 2220 - }, - { - "epoch": 1.6214936247723133, - "grad_norm": 0.32280251383781433, - "learning_rate": 2e-05, - "loss": 0.9378, - "mean_token_accuracy": 0.7322545188080117, - "step": 2225 - }, - { - "epoch": 1.625136612021858, - "grad_norm": 0.3113802969455719, - "learning_rate": 2e-05, - "loss": 0.9678, - "mean_token_accuracy": 0.7245267464582316, - "step": 2230 - }, - { - "epoch": 1.6287795992714025, - "grad_norm": 0.32498499751091003, - "learning_rate": 2e-05, - "loss": 0.9649, - "mean_token_accuracy": 0.7253541768441623, - "step": 2235 - }, - { - "epoch": 1.6324225865209472, - "grad_norm": 0.3127484619617462, - "learning_rate": 2e-05, - "loss": 0.94, - "mean_token_accuracy": 0.7314881533952124, - "step": 2240 - }, - { - "epoch": 1.6360655737704919, - "grad_norm": 0.34295135736465454, - "learning_rate": 2e-05, - "loss": 0.98, - "mean_token_accuracy": 0.7219894968246214, - "step": 2245 - }, - { - "epoch": 1.6397085610200364, - "grad_norm": 0.32535672187805176, - "learning_rate": 2e-05, - "loss": 0.9454, - "mean_token_accuracy": 0.7287986041154254, - "step": 2250 - }, - { - "epoch": 1.643351548269581, - "grad_norm": 0.3259055018424988, - "learning_rate": 2e-05, - "loss": 0.9604, - "mean_token_accuracy": 0.7256808744504151, - "step": 2255 - }, - { - "epoch": 1.6469945355191258, - "grad_norm": 0.3303998112678528, - "learning_rate": 2e-05, - "loss": 0.9551, - "mean_token_accuracy": 0.7276705991907064, - "step": 2260 - }, - { - "epoch": 1.6506375227686703, - "grad_norm": 0.3288848400115967, - "learning_rate": 2e-05, - "loss": 0.9473, - "mean_token_accuracy": 0.7294455300439667, - "step": 2265 - }, - { - "epoch": 1.654280510018215, - "grad_norm": 0.3175695240497589, - "learning_rate": 2e-05, - "loss": 0.9757, - "mean_token_accuracy": 0.7240504396678065, - "step": 2270 - }, - { - "epoch": 1.6579234972677597, - "grad_norm": 0.3215230703353882, - "learning_rate": 2e-05, - "loss": 0.9631, - "mean_token_accuracy": 0.7257022471910114, - "step": 2275 - }, - { - "epoch": 1.6615664845173042, - "grad_norm": 0.31356170773506165, - "learning_rate": 2e-05, - "loss": 0.9771, - "mean_token_accuracy": 0.7222734489496825, - "step": 2280 - }, - { - "epoch": 1.6652094717668489, - "grad_norm": 0.31009095907211304, - "learning_rate": 2e-05, - "loss": 0.9339, - "mean_token_accuracy": 0.7347184904738644, - "step": 2285 - }, - { - "epoch": 1.6688524590163936, - "grad_norm": 0.3301193118095398, - "learning_rate": 2e-05, - "loss": 0.9575, - "mean_token_accuracy": 0.728318881289692, - "step": 2290 - }, - { - "epoch": 1.672495446265938, - "grad_norm": 0.31997668743133545, - "learning_rate": 2e-05, - "loss": 0.9648, - "mean_token_accuracy": 0.7259159745969711, - "step": 2295 - }, - { - "epoch": 1.6761384335154828, - "grad_norm": 0.32590940594673157, - "learning_rate": 2e-05, - "loss": 0.9509, - "mean_token_accuracy": 0.7274975574010748, - "step": 2300 - }, - { - "epoch": 1.6761384335154828, - "eval_loss": 0.9955868124961853, - "eval_mean_token_accuracy": 0.7172010781674456, - "eval_runtime": 14.3325, - "eval_samples_per_second": 18.28, - "eval_steps_per_second": 1.186, - "step": 2300 - }, - { - "epoch": 1.6797814207650275, - "grad_norm": 0.3190097510814667, - "learning_rate": 2e-05, - "loss": 0.9659, - "mean_token_accuracy": 0.7243710307767464, - "step": 2305 - }, - { - "epoch": 1.683424408014572, - "grad_norm": 0.3566085994243622, - "learning_rate": 2e-05, - "loss": 0.9797, - "mean_token_accuracy": 0.7204598192476794, - "step": 2310 - }, - { - "epoch": 1.6870673952641164, - "grad_norm": 0.3309180438518524, - "learning_rate": 2e-05, - "loss": 0.943, - "mean_token_accuracy": 0.7305782852955545, - "step": 2315 - }, - { - "epoch": 1.6907103825136613, - "grad_norm": 0.32704347372055054, - "learning_rate": 2e-05, - "loss": 0.9572, - "mean_token_accuracy": 0.7284349047386419, - "step": 2320 - }, - { - "epoch": 1.6943533697632058, - "grad_norm": 0.31231483817100525, - "learning_rate": 2e-05, - "loss": 0.9482, - "mean_token_accuracy": 0.7299615290669271, - "step": 2325 - }, - { - "epoch": 1.6979963570127503, - "grad_norm": 0.31142398715019226, - "learning_rate": 2e-05, - "loss": 0.9633, - "mean_token_accuracy": 0.7263281631656083, - "step": 2330 - }, - { - "epoch": 1.7016393442622952, - "grad_norm": 0.35823601484298706, - "learning_rate": 2e-05, - "loss": 0.9453, - "mean_token_accuracy": 0.7308439179286761, - "step": 2335 - }, - { - "epoch": 1.7052823315118397, - "grad_norm": 0.3349340260028839, - "learning_rate": 2e-05, - "loss": 0.9602, - "mean_token_accuracy": 0.7278425744992671, - "step": 2340 - }, - { - "epoch": 1.7089253187613842, - "grad_norm": 0.3404028117656708, - "learning_rate": 2e-05, - "loss": 0.9668, - "mean_token_accuracy": 0.7247404738641916, - "step": 2345 - }, - { - "epoch": 1.712568306010929, - "grad_norm": 0.3519185781478882, - "learning_rate": 2e-05, - "loss": 0.9325, - "mean_token_accuracy": 0.7344589643380556, - "step": 2350 - }, - { - "epoch": 1.7162112932604736, - "grad_norm": 0.33176666498184204, - "learning_rate": 2e-05, - "loss": 0.9474, - "mean_token_accuracy": 0.7299890083048365, - "step": 2355 - }, - { - "epoch": 1.719854280510018, - "grad_norm": 0.31135764718055725, - "learning_rate": 2e-05, - "loss": 0.9707, - "mean_token_accuracy": 0.7230459208597948, - "step": 2360 - }, - { - "epoch": 1.7234972677595628, - "grad_norm": 0.31730037927627563, - "learning_rate": 2e-05, - "loss": 0.9358, - "mean_token_accuracy": 0.7335399364924279, - "step": 2365 - }, - { - "epoch": 1.7271402550091075, - "grad_norm": 0.31346791982650757, - "learning_rate": 2e-05, - "loss": 0.9462, - "mean_token_accuracy": 0.7301294577430386, - "step": 2370 - }, - { - "epoch": 1.730783242258652, - "grad_norm": 0.30968010425567627, - "learning_rate": 2e-05, - "loss": 0.9683, - "mean_token_accuracy": 0.7245206399609183, - "step": 2375 - }, - { - "epoch": 1.7344262295081967, - "grad_norm": 0.3089336156845093, - "learning_rate": 2e-05, - "loss": 0.9386, - "mean_token_accuracy": 0.7323888617489008, - "step": 2380 - }, - { - "epoch": 1.7380692167577414, - "grad_norm": 0.3058148920536041, - "learning_rate": 2e-05, - "loss": 0.9559, - "mean_token_accuracy": 0.728135686370298, - "step": 2385 - }, - { - "epoch": 1.7417122040072859, - "grad_norm": 0.3387933075428009, - "learning_rate": 2e-05, - "loss": 0.9714, - "mean_token_accuracy": 0.7253419638495358, - "step": 2390 - }, - { - "epoch": 1.7453551912568306, - "grad_norm": 0.33751383423805237, - "learning_rate": 2e-05, - "loss": 0.9236, - "mean_token_accuracy": 0.7382297264289204, - "step": 2395 - }, - { - "epoch": 1.7489981785063753, - "grad_norm": 0.3331749439239502, - "learning_rate": 2e-05, - "loss": 0.9384, - "mean_token_accuracy": 0.7333597948216903, - "step": 2400 - }, - { - "epoch": 1.7489981785063753, - "eval_loss": 0.9930522441864014, - "eval_mean_token_accuracy": 0.717845759297467, - "eval_runtime": 14.2644, - "eval_samples_per_second": 18.367, - "eval_steps_per_second": 1.192, - "step": 2400 - }, - { - "epoch": 1.7526411657559198, - "grad_norm": 0.3001438081264496, - "learning_rate": 2e-05, - "loss": 0.9468, - "mean_token_accuracy": 0.7304439423546653, - "step": 2405 - }, - { - "epoch": 1.7562841530054645, - "grad_norm": 0.3136955201625824, - "learning_rate": 2e-05, - "loss": 0.9627, - "mean_token_accuracy": 0.725497679531021, - "step": 2410 - }, - { - "epoch": 1.7599271402550092, - "grad_norm": 0.33199629187583923, - "learning_rate": 2e-05, - "loss": 0.9689, - "mean_token_accuracy": 0.7236971195681857, - "step": 2415 - }, - { - "epoch": 1.7635701275045537, - "grad_norm": 0.34084373712539673, - "learning_rate": 2e-05, - "loss": 0.9479, - "mean_token_accuracy": 0.7301996824621397, - "step": 2420 - }, - { - "epoch": 1.7672131147540984, - "grad_norm": 0.3298395276069641, - "learning_rate": 2e-05, - "loss": 0.959, - "mean_token_accuracy": 0.7253541768441623, - "step": 2425 - }, - { - "epoch": 1.770856102003643, - "grad_norm": 0.3308035731315613, - "learning_rate": 2e-05, - "loss": 0.966, - "mean_token_accuracy": 0.7240626526624327, - "step": 2430 - }, - { - "epoch": 1.7744990892531876, - "grad_norm": 0.33812233805656433, - "learning_rate": 2e-05, - "loss": 0.9408, - "mean_token_accuracy": 0.7327674645823156, - "step": 2435 - }, - { - "epoch": 1.7781420765027323, - "grad_norm": 0.31434857845306396, - "learning_rate": 2e-05, - "loss": 0.9408, - "mean_token_accuracy": 0.7304866878358574, - "step": 2440 - }, - { - "epoch": 1.781785063752277, - "grad_norm": 0.33050814270973206, - "learning_rate": 2e-05, - "loss": 0.9496, - "mean_token_accuracy": 0.7288967664624881, - "step": 2445 - }, - { - "epoch": 1.7854280510018214, - "grad_norm": 0.3517625629901886, - "learning_rate": 2e-05, - "loss": 0.9708, - "mean_token_accuracy": 0.7233634587200781, - "step": 2450 - }, - { - "epoch": 1.7890710382513662, - "grad_norm": 0.39435386657714844, - "learning_rate": 2e-05, - "loss": 0.9768, - "mean_token_accuracy": 0.720649120664387, - "step": 2455 - }, - { - "epoch": 1.7927140255009109, - "grad_norm": 0.3447003662586212, - "learning_rate": 2e-05, - "loss": 0.9488, - "mean_token_accuracy": 0.7297325354176845, - "step": 2460 - }, - { - "epoch": 1.7963570127504553, - "grad_norm": 0.3322184681892395, - "learning_rate": 2e-05, - "loss": 0.9641, - "mean_token_accuracy": 0.7258030043966781, - "step": 2465 - }, - { - "epoch": 1.8, - "grad_norm": 0.31109198927879333, - "learning_rate": 2e-05, - "loss": 0.9415, - "mean_token_accuracy": 0.7302851734245237, - "step": 2470 - }, - { - "epoch": 1.8036429872495447, - "grad_norm": 0.32298213243484497, - "learning_rate": 2e-05, - "loss": 0.9645, - "mean_token_accuracy": 0.7249114557889594, - "step": 2475 - }, - { - "epoch": 1.8072859744990892, - "grad_norm": 0.31393465399742126, - "learning_rate": 2e-05, - "loss": 0.9373, - "mean_token_accuracy": 0.7328315828041037, - "step": 2480 - }, - { - "epoch": 1.8109289617486337, - "grad_norm": 0.343615859746933, - "learning_rate": 2e-05, - "loss": 0.9546, - "mean_token_accuracy": 0.7293600390815828, - "step": 2485 - }, - { - "epoch": 1.8145719489981786, - "grad_norm": 0.3566816747188568, - "learning_rate": 2e-05, - "loss": 0.9335, - "mean_token_accuracy": 0.7325537371763557, - "step": 2490 - }, - { - "epoch": 1.8182149362477231, - "grad_norm": 0.3791857957839966, - "learning_rate": 2e-05, - "loss": 0.9633, - "mean_token_accuracy": 0.7262063928835035, - "step": 2495 - }, - { - "epoch": 1.8218579234972676, - "grad_norm": 0.34828808903694153, - "learning_rate": 2e-05, - "loss": 0.9461, - "mean_token_accuracy": 0.7305782852955542, - "step": 2500 - }, - { - "epoch": 1.8218579234972676, - "eval_loss": 0.9900703430175781, - "eval_mean_token_accuracy": 0.7183870589182986, - "eval_runtime": 14.2978, - "eval_samples_per_second": 18.324, - "eval_steps_per_second": 1.189, - "step": 2500 - }, - { - "epoch": 1.8255009107468125, - "grad_norm": 0.34125182032585144, - "learning_rate": 2e-05, - "loss": 0.9546, - "mean_token_accuracy": 0.727842574499267, - "step": 2505 - }, - { - "epoch": 1.829143897996357, - "grad_norm": 0.31761088967323303, - "learning_rate": 2e-05, - "loss": 0.9251, - "mean_token_accuracy": 0.7357932340009771, - "step": 2510 - }, - { - "epoch": 1.8327868852459015, - "grad_norm": 0.3065294027328491, - "learning_rate": 2e-05, - "loss": 0.9495, - "mean_token_accuracy": 0.728977635288093, - "step": 2515 - }, - { - "epoch": 1.8364298724954464, - "grad_norm": 0.31213077902793884, - "learning_rate": 2e-05, - "loss": 0.9637, - "mean_token_accuracy": 0.7256961406936981, - "step": 2520 - }, - { - "epoch": 1.840072859744991, - "grad_norm": 0.35743096470832825, - "learning_rate": 2e-05, - "loss": 0.9703, - "mean_token_accuracy": 0.7234367366878359, - "step": 2525 - }, - { - "epoch": 1.8437158469945354, - "grad_norm": 0.3187105655670166, - "learning_rate": 2e-05, - "loss": 0.9289, - "mean_token_accuracy": 0.7341841719589643, - "step": 2530 - }, - { - "epoch": 1.84735883424408, - "grad_norm": 0.3167194426059723, - "learning_rate": 2e-05, - "loss": 0.9534, - "mean_token_accuracy": 0.7265876893014166, - "step": 2535 - }, - { - "epoch": 1.8510018214936248, - "grad_norm": 0.34710872173309326, - "learning_rate": 2e-05, - "loss": 0.9519, - "mean_token_accuracy": 0.727842574499267, - "step": 2540 - }, - { - "epoch": 1.8546448087431693, - "grad_norm": 0.3225826919078827, - "learning_rate": 2e-05, - "loss": 0.9613, - "mean_token_accuracy": 0.725058011724475, - "step": 2545 - }, - { - "epoch": 1.858287795992714, - "grad_norm": 0.31684067845344543, - "learning_rate": 2e-05, - "loss": 0.9578, - "mean_token_accuracy": 0.7259953590620422, - "step": 2550 - }, - { - "epoch": 1.8619307832422587, - "grad_norm": 0.3433721959590912, - "learning_rate": 2e-05, - "loss": 0.9457, - "mean_token_accuracy": 0.7303157059110894, - "step": 2555 - }, - { - "epoch": 1.8655737704918032, - "grad_norm": 0.3335418403148651, - "learning_rate": 2e-05, - "loss": 0.9461, - "mean_token_accuracy": 0.7287860283341476, - "step": 2560 - }, - { - "epoch": 1.8692167577413479, - "grad_norm": 0.34308183193206787, - "learning_rate": 2e-05, - "loss": 0.9524, - "mean_token_accuracy": 0.7277173913043478, - "step": 2565 - }, - { - "epoch": 1.8728597449908926, - "grad_norm": 0.3250623047351837, - "learning_rate": 2e-05, - "loss": 0.9246, - "mean_token_accuracy": 0.7354726428920371, - "step": 2570 - }, - { - "epoch": 1.876502732240437, - "grad_norm": 0.3206217586994171, - "learning_rate": 2e-05, - "loss": 0.9416, - "mean_token_accuracy": 0.730801172447484, - "step": 2575 - }, - { - "epoch": 1.8801457194899818, - "grad_norm": 0.33022913336753845, - "learning_rate": 2e-05, - "loss": 0.9636, - "mean_token_accuracy": 0.723464215925745, - "step": 2580 - }, - { - "epoch": 1.8837887067395265, - "grad_norm": 0.35024046897888184, - "learning_rate": 2e-05, - "loss": 0.9481, - "mean_token_accuracy": 0.7290852467024915, - "step": 2585 - }, - { - "epoch": 1.887431693989071, - "grad_norm": 0.3380562365055084, - "learning_rate": 2e-05, - "loss": 0.9613, - "mean_token_accuracy": 0.7261419149975574, - "step": 2590 - }, - { - "epoch": 1.8910746812386157, - "grad_norm": 0.34045740962028503, - "learning_rate": 2e-05, - "loss": 0.9627, - "mean_token_accuracy": 0.7240901319003419, - "step": 2595 - }, - { - "epoch": 1.8947176684881604, - "grad_norm": 0.3662465810775757, - "learning_rate": 2e-05, - "loss": 0.9463, - "mean_token_accuracy": 0.7302302149487054, - "step": 2600 - }, - { - "epoch": 1.8947176684881604, - "eval_loss": 0.9878339767456055, - "eval_mean_token_accuracy": 0.7185170775381476, - "eval_runtime": 14.2544, - "eval_samples_per_second": 18.38, - "eval_steps_per_second": 1.193, - "step": 2600 - }, - { - "epoch": 1.8983606557377048, - "grad_norm": 0.3937079906463623, - "learning_rate": 2e-05, - "loss": 0.9448, - "mean_token_accuracy": 0.7303340254030288, - "step": 2605 - }, - { - "epoch": 1.9020036429872496, - "grad_norm": 0.3231543004512787, - "learning_rate": 2e-05, - "loss": 0.9596, - "mean_token_accuracy": 0.725903761602345, - "step": 2610 - }, - { - "epoch": 1.9056466302367943, - "grad_norm": 0.3279549181461334, - "learning_rate": 2e-05, - "loss": 0.9598, - "mean_token_accuracy": 0.7254793600390816, - "step": 2615 - }, - { - "epoch": 1.9092896174863387, - "grad_norm": 0.300364226102829, - "learning_rate": 2e-05, - "loss": 0.9611, - "mean_token_accuracy": 0.7248473375671716, - "step": 2620 - }, - { - "epoch": 1.9129326047358834, - "grad_norm": 0.32046106457710266, - "learning_rate": 2e-05, - "loss": 0.9604, - "mean_token_accuracy": 0.72521372740596, - "step": 2625 - }, - { - "epoch": 1.9165755919854282, - "grad_norm": 0.30070799589157104, - "learning_rate": 2e-05, - "loss": 0.9282, - "mean_token_accuracy": 0.7357260625305325, - "step": 2630 - }, - { - "epoch": 1.9202185792349726, - "grad_norm": 0.3148041069507599, - "learning_rate": 2e-05, - "loss": 0.9495, - "mean_token_accuracy": 0.7294027148905409, - "step": 2635 - }, - { - "epoch": 1.9238615664845173, - "grad_norm": 0.30489620566368103, - "learning_rate": 2e-05, - "loss": 0.9302, - "mean_token_accuracy": 0.7349658036150464, - "step": 2640 - }, - { - "epoch": 1.927504553734062, - "grad_norm": 0.31464579701423645, - "learning_rate": 2e-05, - "loss": 0.9469, - "mean_token_accuracy": 0.7290333414753298, - "step": 2645 - }, - { - "epoch": 1.9311475409836065, - "grad_norm": 0.34819769859313965, - "learning_rate": 2e-05, - "loss": 0.95, - "mean_token_accuracy": 0.727873106985833, - "step": 2650 - }, - { - "epoch": 1.9347905282331512, - "grad_norm": 0.30317771434783936, - "learning_rate": 2e-05, - "loss": 0.931, - "mean_token_accuracy": 0.7324041279921838, - "step": 2655 - }, - { - "epoch": 1.938433515482696, - "grad_norm": 0.33199000358581543, - "learning_rate": 2e-05, - "loss": 0.9447, - "mean_token_accuracy": 0.7300042745481192, - "step": 2660 - }, - { - "epoch": 1.9420765027322404, - "grad_norm": 0.32186630368232727, - "learning_rate": 2e-05, - "loss": 0.9223, - "mean_token_accuracy": 0.7355062286272593, - "step": 2665 - }, - { - "epoch": 1.945719489981785, - "grad_norm": 0.3189879357814789, - "learning_rate": 2e-05, - "loss": 0.9214, - "mean_token_accuracy": 0.7362176355642402, - "step": 2670 - }, - { - "epoch": 1.9493624772313298, - "grad_norm": 0.3124241232872009, - "learning_rate": 2e-05, - "loss": 0.952, - "mean_token_accuracy": 0.7294241573033708, - "step": 2675 - }, - { - "epoch": 1.9530054644808743, - "grad_norm": 0.33368435502052307, - "learning_rate": 2e-05, - "loss": 0.9541, - "mean_token_accuracy": 0.726944973067412, - "step": 2680 - }, - { - "epoch": 1.9566484517304188, - "grad_norm": 0.31824326515197754, - "learning_rate": 2e-05, - "loss": 0.9595, - "mean_token_accuracy": 0.7259007083536885, - "step": 2685 - }, - { - "epoch": 1.9602914389799637, - "grad_norm": 0.33006536960601807, - "learning_rate": 2e-05, - "loss": 0.9401, - "mean_token_accuracy": 0.7317018808011726, - "step": 2690 - }, - { - "epoch": 1.9639344262295082, - "grad_norm": 0.32560908794403076, - "learning_rate": 2e-05, - "loss": 0.9497, - "mean_token_accuracy": 0.7289753297508549, - "step": 2695 - }, - { - "epoch": 1.9675774134790527, - "grad_norm": 0.32261034846305847, - "learning_rate": 2e-05, - "loss": 0.9432, - "mean_token_accuracy": 0.7300744992672203, - "step": 2700 - }, - { - "epoch": 1.9675774134790527, - "eval_loss": 0.9857168197631836, - "eval_mean_token_accuracy": 0.719198261994567, - "eval_runtime": 14.2692, - "eval_samples_per_second": 18.361, - "eval_steps_per_second": 1.191, - "step": 2700 - }, - { - "epoch": 1.9712204007285976, - "grad_norm": 0.3109206259250641, - "learning_rate": 2e-05, - "loss": 0.9401, - "mean_token_accuracy": 0.7307981191988278, - "step": 2705 - }, - { - "epoch": 1.974863387978142, - "grad_norm": 0.3254808783531189, - "learning_rate": 2e-05, - "loss": 0.9255, - "mean_token_accuracy": 0.7365412799218367, - "step": 2710 - }, - { - "epoch": 1.9785063752276866, - "grad_norm": 0.348579078912735, - "learning_rate": 2e-05, - "loss": 0.9367, - "mean_token_accuracy": 0.7322361993160723, - "step": 2715 - }, - { - "epoch": 1.9821493624772313, - "grad_norm": 0.2932336628437042, - "learning_rate": 2e-05, - "loss": 0.9373, - "mean_token_accuracy": 0.7305111138251098, - "step": 2720 - }, - { - "epoch": 1.985792349726776, - "grad_norm": 0.33148205280303955, - "learning_rate": 2e-05, - "loss": 0.9334, - "mean_token_accuracy": 0.7327063996091842, - "step": 2725 - }, - { - "epoch": 1.9894353369763205, - "grad_norm": 0.3459117114543915, - "learning_rate": 2e-05, - "loss": 0.9458, - "mean_token_accuracy": 0.7299727053900504, - "step": 2730 - }, - { - "epoch": 1.9930783242258652, - "grad_norm": 0.3014063835144043, - "learning_rate": 2e-05, - "loss": 0.9354, - "mean_token_accuracy": 0.7322545188080116, - "step": 2735 - }, - { - "epoch": 1.9967213114754099, - "grad_norm": 0.3319181501865387, - "learning_rate": 2e-05, - "loss": 0.9575, - "mean_token_accuracy": 0.7258640693698095, - "step": 2740 - }, - { - "epoch": 2.0007285974499087, - "grad_norm": 0.34205400943756104, - "learning_rate": 2e-05, - "loss": 0.9841, - "mean_token_accuracy": 0.7305871542559377, - "step": 2745 - }, - { - "epoch": 2.0043715846994536, - "grad_norm": 0.36871999502182007, - "learning_rate": 2e-05, - "loss": 0.9157, - "mean_token_accuracy": 0.7358542989741086, - "step": 2750 - }, - { - "epoch": 2.008014571948998, - "grad_norm": 0.32711997628211975, - "learning_rate": 2e-05, - "loss": 0.8998, - "mean_token_accuracy": 0.7395395701025891, - "step": 2755 - }, - { - "epoch": 2.0116575591985426, - "grad_norm": 0.31132447719573975, - "learning_rate": 2e-05, - "loss": 0.9142, - "mean_token_accuracy": 0.7366878358573523, - "step": 2760 - }, - { - "epoch": 2.0153005464480875, - "grad_norm": 0.3128187656402588, - "learning_rate": 2e-05, - "loss": 0.9041, - "mean_token_accuracy": 0.7384342940889106, - "step": 2765 - }, - { - "epoch": 2.018943533697632, - "grad_norm": 0.3222801983356476, - "learning_rate": 2e-05, - "loss": 0.9163, - "mean_token_accuracy": 0.735851245725452, - "step": 2770 - }, - { - "epoch": 2.0225865209471765, - "grad_norm": 0.31613680720329285, - "learning_rate": 2e-05, - "loss": 0.8652, - "mean_token_accuracy": 0.7491267708842207, - "step": 2775 - }, - { - "epoch": 2.0262295081967214, - "grad_norm": 0.3172297179698944, - "learning_rate": 2e-05, - "loss": 0.8856, - "mean_token_accuracy": 0.7436980947728384, - "step": 2780 - }, - { - "epoch": 2.029872495446266, - "grad_norm": 0.31364375352859497, - "learning_rate": 2e-05, - "loss": 0.8942, - "mean_token_accuracy": 0.7424645823155838, - "step": 2785 - }, - { - "epoch": 2.0335154826958104, - "grad_norm": 0.30311858654022217, - "learning_rate": 2e-05, - "loss": 0.8925, - "mean_token_accuracy": 0.740699804592086, - "step": 2790 - }, - { - "epoch": 2.0371584699453553, - "grad_norm": 0.3112095892429352, - "learning_rate": 2e-05, - "loss": 0.8659, - "mean_token_accuracy": 0.7491664631167564, - "step": 2795 - }, - { - "epoch": 2.0408014571949, - "grad_norm": 0.3168626129627228, - "learning_rate": 2e-05, - "loss": 0.9142, - "mean_token_accuracy": 0.7359581094284315, - "step": 2800 - }, - { - "epoch": 2.0408014571949, - "eval_loss": 0.9864921569824219, - "eval_mean_token_accuracy": 0.7194168831008534, - "eval_runtime": 14.3174, - "eval_samples_per_second": 18.299, - "eval_steps_per_second": 1.187, - "step": 2800 - }, - { - "epoch": 2.0444444444444443, - "grad_norm": 0.31390464305877686, - "learning_rate": 2e-05, - "loss": 0.9135, - "mean_token_accuracy": 0.7361962628236444, - "step": 2805 - }, - { - "epoch": 2.048087431693989, - "grad_norm": 0.3033863306045532, - "learning_rate": 2e-05, - "loss": 0.8846, - "mean_token_accuracy": 0.7426691499755742, - "step": 2810 - }, - { - "epoch": 2.0517304189435337, - "grad_norm": 0.31870099902153015, - "learning_rate": 2e-05, - "loss": 0.8978, - "mean_token_accuracy": 0.7406356863702981, - "step": 2815 - }, - { - "epoch": 2.055373406193078, - "grad_norm": 0.33986830711364746, - "learning_rate": 2e-05, - "loss": 0.9034, - "mean_token_accuracy": 0.73845261358085, - "step": 2820 - }, - { - "epoch": 2.059016393442623, - "grad_norm": 0.31481096148490906, - "learning_rate": 2e-05, - "loss": 0.9133, - "mean_token_accuracy": 0.7372191011235955, - "step": 2825 - }, - { - "epoch": 2.0626593806921676, - "grad_norm": 0.3070632517337799, - "learning_rate": 2e-05, - "loss": 0.9003, - "mean_token_accuracy": 0.7388953346360528, - "step": 2830 - }, - { - "epoch": 2.066302367941712, - "grad_norm": 0.31041911244392395, - "learning_rate": 2e-05, - "loss": 0.9196, - "mean_token_accuracy": 0.734300195407914, - "step": 2835 - }, - { - "epoch": 2.069945355191257, - "grad_norm": 0.35703134536743164, - "learning_rate": 2e-05, - "loss": 0.9083, - "mean_token_accuracy": 0.7365473864191501, - "step": 2840 - }, - { - "epoch": 2.0735883424408015, - "grad_norm": 0.3322874903678894, - "learning_rate": 2e-05, - "loss": 0.8989, - "mean_token_accuracy": 0.7415150219833901, - "step": 2845 - }, - { - "epoch": 2.077231329690346, - "grad_norm": 0.3197586238384247, - "learning_rate": 2e-05, - "loss": 0.914, - "mean_token_accuracy": 0.7365657059110894, - "step": 2850 - }, - { - "epoch": 2.080874316939891, - "grad_norm": 0.3333277404308319, - "learning_rate": 2e-05, - "loss": 0.8899, - "mean_token_accuracy": 0.7442049340498289, - "step": 2855 - }, - { - "epoch": 2.0845173041894354, - "grad_norm": 0.3269352912902832, - "learning_rate": 2e-05, - "loss": 0.9002, - "mean_token_accuracy": 0.7409226917440156, - "step": 2860 - }, - { - "epoch": 2.08816029143898, - "grad_norm": 0.35753223299980164, - "learning_rate": 2e-05, - "loss": 0.8883, - "mean_token_accuracy": 0.7434691011235955, - "step": 2865 - }, - { - "epoch": 2.091803278688525, - "grad_norm": 0.33091840147972107, - "learning_rate": 2e-05, - "loss": 0.9363, - "mean_token_accuracy": 0.7311492427943331, - "step": 2870 - }, - { - "epoch": 2.0954462659380693, - "grad_norm": 0.3381136953830719, - "learning_rate": 2e-05, - "loss": 0.9195, - "mean_token_accuracy": 0.7348192476795311, - "step": 2875 - }, - { - "epoch": 2.0990892531876137, - "grad_norm": 0.311617374420166, - "learning_rate": 2e-05, - "loss": 0.895, - "mean_token_accuracy": 0.740876893014167, - "step": 2880 - }, - { - "epoch": 2.1027322404371587, - "grad_norm": 0.30609360337257385, - "learning_rate": 2e-05, - "loss": 0.8873, - "mean_token_accuracy": 0.7443514899853445, - "step": 2885 - }, - { - "epoch": 2.106375227686703, - "grad_norm": 0.39882877469062805, - "learning_rate": 2e-05, - "loss": 0.8927, - "mean_token_accuracy": 0.7430172203224231, - "step": 2890 - }, - { - "epoch": 2.1100182149362476, - "grad_norm": 0.35035115480422974, - "learning_rate": 2e-05, - "loss": 0.909, - "mean_token_accuracy": 0.7358333581223833, - "step": 2895 - }, - { - "epoch": 2.1136612021857926, - "grad_norm": 0.32378336787223816, - "learning_rate": 2e-05, - "loss": 0.9167, - "mean_token_accuracy": 0.7358481924767953, - "step": 2900 - }, - { - "epoch": 2.1136612021857926, - "eval_loss": 0.9846732020378113, - "eval_mean_token_accuracy": 0.7199527994124922, - "eval_runtime": 14.3227, - "eval_samples_per_second": 18.293, - "eval_steps_per_second": 1.187, - "step": 2900 - }, - { - "epoch": 2.117304189435337, - "grad_norm": 0.3474068343639374, - "learning_rate": 2e-05, - "loss": 0.8941, - "mean_token_accuracy": 0.7412768685881779, - "step": 2905 - }, - { - "epoch": 2.1209471766848815, - "grad_norm": 0.3359163999557495, - "learning_rate": 2e-05, - "loss": 0.9217, - "mean_token_accuracy": 0.7339460185637519, - "step": 2910 - }, - { - "epoch": 2.1245901639344265, - "grad_norm": 0.33082684874534607, - "learning_rate": 2e-05, - "loss": 0.879, - "mean_token_accuracy": 0.7462780898876404, - "step": 2915 - }, - { - "epoch": 2.128233151183971, - "grad_norm": 0.3506236672401428, - "learning_rate": 2e-05, - "loss": 0.8999, - "mean_token_accuracy": 0.7397227650219833, - "step": 2920 - }, - { - "epoch": 2.1318761384335154, - "grad_norm": 0.3336649239063263, - "learning_rate": 2e-05, - "loss": 0.8984, - "mean_token_accuracy": 0.7394846116267708, - "step": 2925 - }, - { - "epoch": 2.13551912568306, - "grad_norm": 0.3224109709262848, - "learning_rate": 2e-05, - "loss": 0.9222, - "mean_token_accuracy": 0.7358970444553004, - "step": 2930 - }, - { - "epoch": 2.139162112932605, - "grad_norm": 0.3256509602069855, - "learning_rate": 2e-05, - "loss": 0.8949, - "mean_token_accuracy": 0.7407028578407425, - "step": 2935 - }, - { - "epoch": 2.1428051001821493, - "grad_norm": 0.3140164911746979, - "learning_rate": 2e-05, - "loss": 0.9041, - "mean_token_accuracy": 0.7382724719101122, - "step": 2940 - }, - { - "epoch": 2.146448087431694, - "grad_norm": 0.3872627019882202, - "learning_rate": 2e-05, - "loss": 0.9106, - "mean_token_accuracy": 0.736156570591109, - "step": 2945 - }, - { - "epoch": 2.1500910746812387, - "grad_norm": 0.332190603017807, - "learning_rate": 2e-05, - "loss": 0.9018, - "mean_token_accuracy": 0.7400036638983879, - "step": 2950 - }, - { - "epoch": 2.153734061930783, - "grad_norm": 0.3359198570251465, - "learning_rate": 2e-05, - "loss": 0.9245, - "mean_token_accuracy": 0.7332101856375183, - "step": 2955 - }, - { - "epoch": 2.1573770491803277, - "grad_norm": 0.33812880516052246, - "learning_rate": 2e-05, - "loss": 0.888, - "mean_token_accuracy": 0.7440675378602833, - "step": 2960 - }, - { - "epoch": 2.1610200364298726, - "grad_norm": 0.34355029463768005, - "learning_rate": 2e-05, - "loss": 0.9231, - "mean_token_accuracy": 0.7337109184171959, - "step": 2965 - }, - { - "epoch": 2.164663023679417, - "grad_norm": 0.3436281979084015, - "learning_rate": 2e-05, - "loss": 0.8767, - "mean_token_accuracy": 0.7456704934049829, - "step": 2970 - }, - { - "epoch": 2.1683060109289616, - "grad_norm": 0.321349561214447, - "learning_rate": 2e-05, - "loss": 0.8905, - "mean_token_accuracy": 0.7416035661944308, - "step": 2975 - }, - { - "epoch": 2.1719489981785065, - "grad_norm": 0.33591070771217346, - "learning_rate": 2e-05, - "loss": 0.9287, - "mean_token_accuracy": 0.7313660234489496, - "step": 2980 - }, - { - "epoch": 2.175591985428051, - "grad_norm": 0.3216773271560669, - "learning_rate": 2e-05, - "loss": 0.893, - "mean_token_accuracy": 0.7406540058622374, - "step": 2985 - }, - { - "epoch": 2.1792349726775955, - "grad_norm": 0.3402141332626343, - "learning_rate": 2e-05, - "loss": 0.9236, - "mean_token_accuracy": 0.7342177576941865, - "step": 2990 - }, - { - "epoch": 2.1828779599271404, - "grad_norm": 0.3489401936531067, - "learning_rate": 2e-05, - "loss": 0.9054, - "mean_token_accuracy": 0.7382938446507085, - "step": 2995 - }, - { - "epoch": 2.186520947176685, - "grad_norm": 0.32912924885749817, - "learning_rate": 2e-05, - "loss": 0.8798, - "mean_token_accuracy": 0.7457101856375183, - "step": 3000 - }, - { - "epoch": 2.186520947176685, - "eval_loss": 0.982496440410614, - "eval_mean_token_accuracy": 0.7202835672208421, - "eval_runtime": 14.2889, - "eval_samples_per_second": 18.336, - "eval_steps_per_second": 1.19, - "step": 3000 - }, - { - "epoch": 2.1901639344262294, - "grad_norm": 0.3348352909088135, - "learning_rate": 2e-05, - "loss": 0.9096, - "mean_token_accuracy": 0.7373412310698583, - "step": 3005 - }, - { - "epoch": 2.1938069216757743, - "grad_norm": 0.32950860261917114, - "learning_rate": 2e-05, - "loss": 0.8937, - "mean_token_accuracy": 0.7408616267708842, - "step": 3010 - }, - { - "epoch": 2.1974499089253188, - "grad_norm": 0.2998691201210022, - "learning_rate": 2e-05, - "loss": 0.8911, - "mean_token_accuracy": 0.7423943575964826, - "step": 3015 - }, - { - "epoch": 2.2010928961748633, - "grad_norm": 0.32780349254608154, - "learning_rate": 2e-05, - "loss": 0.9153, - "mean_token_accuracy": 0.735530654616512, - "step": 3020 - }, - { - "epoch": 2.204735883424408, - "grad_norm": 0.31766557693481445, - "learning_rate": 2e-05, - "loss": 0.9022, - "mean_token_accuracy": 0.7394968246213971, - "step": 3025 - }, - { - "epoch": 2.2083788706739527, - "grad_norm": 0.32309776544570923, - "learning_rate": 2e-05, - "loss": 0.8915, - "mean_token_accuracy": 0.7415302882266731, - "step": 3030 - }, - { - "epoch": 2.212021857923497, - "grad_norm": 0.319327712059021, - "learning_rate": 2e-05, - "loss": 0.9192, - "mean_token_accuracy": 0.734413165608207, - "step": 3035 - }, - { - "epoch": 2.215664845173042, - "grad_norm": 0.30570122599601746, - "learning_rate": 2e-05, - "loss": 0.8889, - "mean_token_accuracy": 0.7435271128480703, - "step": 3040 - }, - { - "epoch": 2.2193078324225866, - "grad_norm": 0.30299875140190125, - "learning_rate": 2e-05, - "loss": 0.9105, - "mean_token_accuracy": 0.7357810210063506, - "step": 3045 - }, - { - "epoch": 2.222950819672131, - "grad_norm": 0.3085125982761383, - "learning_rate": 2e-05, - "loss": 0.9112, - "mean_token_accuracy": 0.7353413531998043, - "step": 3050 - }, - { - "epoch": 2.226593806921676, - "grad_norm": 0.32464104890823364, - "learning_rate": 2e-05, - "loss": 0.8885, - "mean_token_accuracy": 0.7432745613269548, - "step": 3055 - }, - { - "epoch": 2.2302367941712204, - "grad_norm": 0.309283047914505, - "learning_rate": 2e-05, - "loss": 0.8881, - "mean_token_accuracy": 0.7437042012701514, - "step": 3060 - }, - { - "epoch": 2.233879781420765, - "grad_norm": 0.3193177282810211, - "learning_rate": 2e-05, - "loss": 0.9189, - "mean_token_accuracy": 0.7337139716658526, - "step": 3065 - }, - { - "epoch": 2.23752276867031, - "grad_norm": 0.31948578357696533, - "learning_rate": 2e-05, - "loss": 0.9111, - "mean_token_accuracy": 0.7373259648265754, - "step": 3070 - }, - { - "epoch": 2.2411657559198543, - "grad_norm": 0.3639989495277405, - "learning_rate": 2e-05, - "loss": 0.9098, - "mean_token_accuracy": 0.7367061553492917, - "step": 3075 - }, - { - "epoch": 2.244808743169399, - "grad_norm": 0.31007903814315796, - "learning_rate": 2e-05, - "loss": 0.9083, - "mean_token_accuracy": 0.737045065950171, - "step": 3080 - }, - { - "epoch": 2.2484517304189433, - "grad_norm": 0.34219416975975037, - "learning_rate": 2e-05, - "loss": 0.9011, - "mean_token_accuracy": 0.737329018075232, - "step": 3085 - }, - { - "epoch": 2.2520947176684882, - "grad_norm": 0.3155016005039215, - "learning_rate": 2e-05, - "loss": 0.886, - "mean_token_accuracy": 0.7428920371275037, - "step": 3090 - }, - { - "epoch": 2.2557377049180327, - "grad_norm": 0.34369996190071106, - "learning_rate": 2e-05, - "loss": 0.8909, - "mean_token_accuracy": 0.7413593063019053, - "step": 3095 - }, - { - "epoch": 2.2593806921675776, - "grad_norm": 0.3238091468811035, - "learning_rate": 2e-05, - "loss": 0.9196, - "mean_token_accuracy": 0.7337597703957011, - "step": 3100 - }, - { - "epoch": 2.2593806921675776, - "eval_loss": 0.9811844229698181, - "eval_mean_token_accuracy": 0.7203799509057488, - "eval_runtime": 14.3438, - "eval_samples_per_second": 18.266, - "eval_steps_per_second": 1.185, - "step": 3100 - }, - { - "epoch": 2.263023679417122, - "grad_norm": 0.34170106053352356, - "learning_rate": 2e-05, - "loss": 0.9115, - "mean_token_accuracy": 0.736837445041524, - "step": 3105 - }, - { - "epoch": 2.2666666666666666, - "grad_norm": 0.34322506189346313, - "learning_rate": 2e-05, - "loss": 0.9017, - "mean_token_accuracy": 0.738431240840254, - "step": 3110 - }, - { - "epoch": 2.270309653916211, - "grad_norm": 0.3346925973892212, - "learning_rate": 2e-05, - "loss": 0.89, - "mean_token_accuracy": 0.7419211040547142, - "step": 3115 - }, - { - "epoch": 2.273952641165756, - "grad_norm": 0.30860498547554016, - "learning_rate": 2e-05, - "loss": 0.8928, - "mean_token_accuracy": 0.7410234489496824, - "step": 3120 - }, - { - "epoch": 2.2775956284153005, - "grad_norm": 0.34893983602523804, - "learning_rate": 2e-05, - "loss": 0.8988, - "mean_token_accuracy": 0.7408524670249145, - "step": 3125 - }, - { - "epoch": 2.281238615664845, - "grad_norm": 0.33216506242752075, - "learning_rate": 2e-05, - "loss": 0.8947, - "mean_token_accuracy": 0.7412585490962382, - "step": 3130 - }, - { - "epoch": 2.28488160291439, - "grad_norm": 0.34543153643608093, - "learning_rate": 2e-05, - "loss": 0.9253, - "mean_token_accuracy": 0.7330850024425989, - "step": 3135 - }, - { - "epoch": 2.2885245901639344, - "grad_norm": 0.31388479471206665, - "learning_rate": 2e-05, - "loss": 0.9089, - "mean_token_accuracy": 0.7361382510991696, - "step": 3140 - }, - { - "epoch": 2.292167577413479, - "grad_norm": 0.3121238350868225, - "learning_rate": 2e-05, - "loss": 0.9108, - "mean_token_accuracy": 0.7372282608695653, - "step": 3145 - }, - { - "epoch": 2.295810564663024, - "grad_norm": 0.33222267031669617, - "learning_rate": 2e-05, - "loss": 0.9163, - "mean_token_accuracy": 0.7346329995114803, - "step": 3150 - }, - { - "epoch": 2.2994535519125683, - "grad_norm": 0.3138783872127533, - "learning_rate": 2e-05, - "loss": 0.8981, - "mean_token_accuracy": 0.7387548851978505, - "step": 3155 - }, - { - "epoch": 2.3030965391621128, - "grad_norm": 0.3232060372829437, - "learning_rate": 2e-05, - "loss": 0.8933, - "mean_token_accuracy": 0.7417134831460673, - "step": 3160 - }, - { - "epoch": 2.3067395264116577, - "grad_norm": 0.3669489920139313, - "learning_rate": 2e-05, - "loss": 0.9146, - "mean_token_accuracy": 0.7371519296531508, - "step": 3165 - }, - { - "epoch": 2.310382513661202, - "grad_norm": 0.31355297565460205, - "learning_rate": 2e-05, - "loss": 0.9092, - "mean_token_accuracy": 0.7390754763067905, - "step": 3170 - }, - { - "epoch": 2.3140255009107467, - "grad_norm": 0.3025984764099121, - "learning_rate": 2e-05, - "loss": 0.8852, - "mean_token_accuracy": 0.7443026380068392, - "step": 3175 - }, - { - "epoch": 2.3176684881602916, - "grad_norm": 0.3665474057197571, - "learning_rate": 2e-05, - "loss": 0.9014, - "mean_token_accuracy": 0.7385075720566683, - "step": 3180 - }, - { - "epoch": 2.321311475409836, - "grad_norm": 0.33963456749916077, - "learning_rate": 2e-05, - "loss": 0.8944, - "mean_token_accuracy": 0.7402723497801661, - "step": 3185 - }, - { - "epoch": 2.3249544626593805, - "grad_norm": 0.3448450565338135, - "learning_rate": 2e-05, - "loss": 0.9042, - "mean_token_accuracy": 0.7382358329262335, - "step": 3190 - }, - { - "epoch": 2.3285974499089255, - "grad_norm": 0.32416313886642456, - "learning_rate": 2e-05, - "loss": 0.9044, - "mean_token_accuracy": 0.7392951526599021, - "step": 3195 - }, - { - "epoch": 2.33224043715847, - "grad_norm": 0.32076016068458557, - "learning_rate": 2e-05, - "loss": 0.899, - "mean_token_accuracy": 0.7407059110893991, - "step": 3200 - }, - { - "epoch": 2.33224043715847, - "eval_loss": 0.9800810813903809, - "eval_mean_token_accuracy": 0.7205252333805446, - "eval_runtime": 14.3063, - "eval_samples_per_second": 18.314, - "eval_steps_per_second": 1.188, - "step": 3200 - }, - { - "epoch": 2.3358834244080144, - "grad_norm": 0.31426194310188293, - "learning_rate": 2e-05, - "loss": 0.9071, - "mean_token_accuracy": 0.7373351245725452, - "step": 3205 - }, - { - "epoch": 2.3395264116575594, - "grad_norm": 0.3155065178871155, - "learning_rate": 2e-05, - "loss": 0.9108, - "mean_token_accuracy": 0.7373076453346361, - "step": 3210 - }, - { - "epoch": 2.343169398907104, - "grad_norm": 0.32219287753105164, - "learning_rate": 2e-05, - "loss": 0.9271, - "mean_token_accuracy": 0.7313629702002931, - "step": 3215 - }, - { - "epoch": 2.3468123861566483, - "grad_norm": 0.3496605455875397, - "learning_rate": 2e-05, - "loss": 0.8893, - "mean_token_accuracy": 0.7418984967469944, - "step": 3220 - }, - { - "epoch": 2.3504553734061933, - "grad_norm": 0.31775274872779846, - "learning_rate": 2e-05, - "loss": 0.9079, - "mean_token_accuracy": 0.7362817537860283, - "step": 3225 - }, - { - "epoch": 2.3540983606557377, - "grad_norm": 0.3471783399581909, - "learning_rate": 2e-05, - "loss": 0.8733, - "mean_token_accuracy": 0.7464673913043477, - "step": 3230 - }, - { - "epoch": 2.3577413479052822, - "grad_norm": 0.3281533420085907, - "learning_rate": 2e-05, - "loss": 0.8887, - "mean_token_accuracy": 0.7437988519785051, - "step": 3235 - }, - { - "epoch": 2.361384335154827, - "grad_norm": 0.31612518429756165, - "learning_rate": 2e-05, - "loss": 0.9116, - "mean_token_accuracy": 0.7362176355642404, - "step": 3240 - }, - { - "epoch": 2.3650273224043716, - "grad_norm": 0.3653080463409424, - "learning_rate": 2e-05, - "loss": 0.9029, - "mean_token_accuracy": 0.7385961162677088, - "step": 3245 - }, - { - "epoch": 2.368670309653916, - "grad_norm": 0.33496755361557007, - "learning_rate": 2e-05, - "loss": 0.9137, - "mean_token_accuracy": 0.7352253297508549, - "step": 3250 - }, - { - "epoch": 2.372313296903461, - "grad_norm": 0.3244491219520569, - "learning_rate": 2e-05, - "loss": 0.9206, - "mean_token_accuracy": 0.734910845139228, - "step": 3255 - }, - { - "epoch": 2.3759562841530055, - "grad_norm": 0.31501951813697815, - "learning_rate": 2e-05, - "loss": 0.8909, - "mean_token_accuracy": 0.7428553981436248, - "step": 3260 - }, - { - "epoch": 2.37959927140255, - "grad_norm": 0.3064330518245697, - "learning_rate": 2e-05, - "loss": 0.9008, - "mean_token_accuracy": 0.7384281875915975, - "step": 3265 - }, - { - "epoch": 2.3832422586520945, - "grad_norm": 0.31080517172813416, - "learning_rate": 2e-05, - "loss": 0.8998, - "mean_token_accuracy": 0.7391243282852955, - "step": 3270 - }, - { - "epoch": 2.3868852459016394, - "grad_norm": 0.3312147855758667, - "learning_rate": 2e-05, - "loss": 0.8757, - "mean_token_accuracy": 0.7458524138337579, - "step": 3275 - }, - { - "epoch": 2.390528233151184, - "grad_norm": 0.3071138858795166, - "learning_rate": 2e-05, - "loss": 0.908, - "mean_token_accuracy": 0.7380068392769907, - "step": 3280 - }, - { - "epoch": 2.394171220400729, - "grad_norm": 0.3064194619655609, - "learning_rate": 2e-05, - "loss": 0.8614, - "mean_token_accuracy": 0.7494107230092817, - "step": 3285 - }, - { - "epoch": 2.3978142076502733, - "grad_norm": 0.30981016159057617, - "learning_rate": 2e-05, - "loss": 0.8708, - "mean_token_accuracy": 0.7473253541768441, - "step": 3290 - }, - { - "epoch": 2.401457194899818, - "grad_norm": 0.30898359417915344, - "learning_rate": 2e-05, - "loss": 0.8938, - "mean_token_accuracy": 0.7413928920371275, - "step": 3295 - }, - { - "epoch": 2.4051001821493623, - "grad_norm": 0.3338389992713928, - "learning_rate": 2e-05, - "loss": 0.8809, - "mean_token_accuracy": 0.74310881778212, - "step": 3300 - }, - { - "epoch": 2.4051001821493623, - "eval_loss": 0.9772781729698181, - "eval_mean_token_accuracy": 0.7212848507889992, - "eval_runtime": 14.2945, - "eval_samples_per_second": 18.329, - "eval_steps_per_second": 1.189, - "step": 3300 - }, - { - "epoch": 2.408743169398907, - "grad_norm": 0.33642616868019104, - "learning_rate": 2e-05, - "loss": 0.8856, - "mean_token_accuracy": 0.743920981924768, - "step": 3305 - }, - { - "epoch": 2.4123861566484517, - "grad_norm": 0.33066093921661377, - "learning_rate": 2e-05, - "loss": 0.8851, - "mean_token_accuracy": 0.7444217147044455, - "step": 3310 - }, - { - "epoch": 2.416029143897996, - "grad_norm": 0.3612671196460724, - "learning_rate": 2e-05, - "loss": 0.9075, - "mean_token_accuracy": 0.7383610161211529, - "step": 3315 - }, - { - "epoch": 2.419672131147541, - "grad_norm": 0.3145372271537781, - "learning_rate": 2e-05, - "loss": 0.9059, - "mean_token_accuracy": 0.7378511235955056, - "step": 3320 - }, - { - "epoch": 2.4233151183970856, - "grad_norm": 0.31871461868286133, - "learning_rate": 2e-05, - "loss": 0.9106, - "mean_token_accuracy": 0.7359000977039571, - "step": 3325 - }, - { - "epoch": 2.42695810564663, - "grad_norm": 0.3099087178707123, - "learning_rate": 2e-05, - "loss": 0.9293, - "mean_token_accuracy": 0.7312988519785052, - "step": 3330 - }, - { - "epoch": 2.430601092896175, - "grad_norm": 0.32242926955223083, - "learning_rate": 2e-05, - "loss": 0.9075, - "mean_token_accuracy": 0.737692354665364, - "step": 3335 - }, - { - "epoch": 2.4342440801457195, - "grad_norm": 0.3814336955547333, - "learning_rate": 2e-05, - "loss": 0.911, - "mean_token_accuracy": 0.7348680996580361, - "step": 3340 - }, - { - "epoch": 2.437887067395264, - "grad_norm": 0.332454115152359, - "learning_rate": 2e-05, - "loss": 0.8897, - "mean_token_accuracy": 0.7413684660478751, - "step": 3345 - }, - { - "epoch": 2.441530054644809, - "grad_norm": 0.3682723045349121, - "learning_rate": 2e-05, - "loss": 0.929, - "mean_token_accuracy": 0.7310484855886664, - "step": 3350 - }, - { - "epoch": 2.4451730418943534, - "grad_norm": 0.34098002314567566, - "learning_rate": 2e-05, - "loss": 0.8978, - "mean_token_accuracy": 0.7393344319677048, - "step": 3355 - }, - { - "epoch": 2.448816029143898, - "grad_norm": 0.3256863057613373, - "learning_rate": 2e-05, - "loss": 0.9005, - "mean_token_accuracy": 0.739646433805569, - "step": 3360 - }, - { - "epoch": 2.4524590163934428, - "grad_norm": 0.3080803155899048, - "learning_rate": 2e-05, - "loss": 0.9018, - "mean_token_accuracy": 0.7381930874450416, - "step": 3365 - }, - { - "epoch": 2.4561020036429873, - "grad_norm": 0.3441084623336792, - "learning_rate": 2e-05, - "loss": 0.8865, - "mean_token_accuracy": 0.7436248168050805, - "step": 3370 - }, - { - "epoch": 2.4597449908925317, - "grad_norm": 0.30771470069885254, - "learning_rate": 2e-05, - "loss": 0.8927, - "mean_token_accuracy": 0.7419821690278456, - "step": 3375 - }, - { - "epoch": 2.4633879781420767, - "grad_norm": 0.36730024218559265, - "learning_rate": 2e-05, - "loss": 0.9168, - "mean_token_accuracy": 0.736474364845075, - "step": 3380 - }, - { - "epoch": 2.467030965391621, - "grad_norm": 0.33307135105133057, - "learning_rate": 2e-05, - "loss": 0.9013, - "mean_token_accuracy": 0.7384861993160723, - "step": 3385 - }, - { - "epoch": 2.4706739526411656, - "grad_norm": 0.3176257014274597, - "learning_rate": 2e-05, - "loss": 0.8822, - "mean_token_accuracy": 0.7452796775769418, - "step": 3390 - }, - { - "epoch": 2.4743169398907106, - "grad_norm": 0.3291166126728058, - "learning_rate": 2e-05, - "loss": 0.9069, - "mean_token_accuracy": 0.7390083048363458, - "step": 3395 - }, - { - "epoch": 2.477959927140255, - "grad_norm": 0.3459428548812866, - "learning_rate": 2e-05, - "loss": 0.9279, - "mean_token_accuracy": 0.7312042012701515, - "step": 3400 - }, - { - "epoch": 2.477959927140255, - "eval_loss": 0.9747435450553894, - "eval_mean_token_accuracy": 0.7213943106131382, - "eval_runtime": 14.2613, - "eval_samples_per_second": 18.371, - "eval_steps_per_second": 1.192, - "step": 3400 - }, - { - "epoch": 2.4816029143897995, - "grad_norm": 0.30401602387428284, + "epoch": 0.9617486338797814, + "grad_norm": 0.24876342713832855, "learning_rate": 2e-05, - "loss": 0.8715, - "mean_token_accuracy": 0.746794088910601, - "step": 3405 + "loss": 0.9561, + "mean_token_accuracy": 0.7260304714215927, + "step": 660 }, { - "epoch": 2.4852459016393444, - "grad_norm": 0.320230096578598, + "epoch": 0.9690346083788707, + "grad_norm": 0.25081324577331543, "learning_rate": 2e-05, - "loss": 0.8973, - "mean_token_accuracy": 0.7395816801558677, - "step": 3410 + "loss": 0.9534, + "mean_token_accuracy": 0.7289707498778701, + "step": 665 }, { - "epoch": 2.488888888888889, - "grad_norm": 0.32111015915870667, + "epoch": 0.97632058287796, + "grad_norm": 0.22985951602458954, "learning_rate": 2e-05, - "loss": 0.8763, - "mean_token_accuracy": 0.7474658036150463, - "step": 3415 + "loss": 0.9596, + "mean_token_accuracy": 0.726289997557401, + "step": 670 }, { - "epoch": 2.4925318761384334, - "grad_norm": 0.31403398513793945, + "epoch": 0.9836065573770492, + "grad_norm": 0.24084888398647308, "learning_rate": 2e-05, - "loss": 0.8985, - "mean_token_accuracy": 0.7403731069858328, - "step": 3420 + "loss": 0.9573, + "mean_token_accuracy": 0.7278120420127016, + "step": 675 }, { - "epoch": 2.496174863387978, - "grad_norm": 0.32882779836654663, + "epoch": 0.9908925318761385, + "grad_norm": 0.2884751558303833, "learning_rate": 2e-05, - "loss": 0.8912, - "mean_token_accuracy": 0.7424951148021495, - "step": 3425 + "loss": 0.9637, + "mean_token_accuracy": 0.7256558724047968, + "step": 680 }, { - "epoch": 2.499817850637523, - "grad_norm": 0.3540889620780945, + "epoch": 0.9981785063752276, + "grad_norm": 0.23881231248378754, "learning_rate": 2e-05, - "loss": 0.8965, - "mean_token_accuracy": 0.7381808744504154, - "step": 3430 + "loss": 0.9658, + "mean_token_accuracy": 0.7258991125185024, + "step": 685 }, { - "epoch": 2.5034608378870673, - "grad_norm": 0.31427791714668274, + "epoch": 1.0043715846994536, + "grad_norm": 0.2694164514541626, "learning_rate": 2e-05, - "loss": 0.888, - "mean_token_accuracy": 0.7435668050806058, - "step": 3435 + "loss": 0.78, + "mean_token_accuracy": 0.7354216356791862, + "step": 690 }, { - "epoch": 2.5071038251366122, - "grad_norm": 0.32461288571357727, + "epoch": 1.0116575591985428, + "grad_norm": 0.28284427523612976, "learning_rate": 2e-05, - "loss": 0.9023, - "mean_token_accuracy": 0.7397227650219834, - "step": 3440 + "loss": 0.9199, + "mean_token_accuracy": 0.7356466780654618, + "step": 695 }, { - "epoch": 2.5107468123861567, - "grad_norm": 0.31609442830085754, + "epoch": 1.018943533697632, + "grad_norm": 0.24294410645961761, "learning_rate": 2e-05, - "loss": 0.9039, - "mean_token_accuracy": 0.7388678553981436, - "step": 3445 + "loss": 0.9269, + "mean_token_accuracy": 0.7344269052271617, + "step": 700 }, { - "epoch": 2.514389799635701, - "grad_norm": 0.34672296047210693, - "learning_rate": 2e-05, - "loss": 0.8981, - "mean_token_accuracy": 0.7388814035812061, - "step": 3450 + "epoch": 1.018943533697632, + "eval_loss": 0.9708074927330017, + "eval_mean_token_accuracy": 0.7221159128785493, + "eval_runtime": 39.0528, + "eval_samples_per_second": 6.709, + "eval_steps_per_second": 0.435, + "step": 700 }, { - "epoch": 2.5180327868852457, - "grad_norm": 0.363460510969162, + "epoch": 1.0262295081967212, + "grad_norm": 0.2431807816028595, "learning_rate": 2e-05, - "loss": 0.8908, - "mean_token_accuracy": 0.7427638006839276, - "step": 3455 + "loss": 0.9302, + "mean_token_accuracy": 0.7342864557889597, + "step": 705 }, { - "epoch": 2.5216757741347906, - "grad_norm": 0.32321321964263916, + "epoch": 1.0335154826958106, + "grad_norm": 0.23510615527629852, "learning_rate": 2e-05, - "loss": 0.8851, - "mean_token_accuracy": 0.7424738649440817, - "step": 3460 + "loss": 0.9161, + "mean_token_accuracy": 0.7370954445530045, + "step": 710 }, { - "epoch": 2.525318761384335, - "grad_norm": 0.30429279804229736, + "epoch": 1.0408014571948998, + "grad_norm": 0.25080975890159607, "learning_rate": 2e-05, - "loss": 0.9026, - "mean_token_accuracy": 0.7386907669760625, - "step": 3465 + "loss": 0.9284, + "mean_token_accuracy": 0.7337506106497316, + "step": 715 }, { - "epoch": 2.52896174863388, - "grad_norm": 0.3118142783641815, + "epoch": 1.048087431693989, + "grad_norm": 0.22864864766597748, "learning_rate": 2e-05, - "loss": 0.8841, - "mean_token_accuracy": 0.7432004152418175, - "step": 3470 + "loss": 0.9224, + "mean_token_accuracy": 0.7355474474841233, + "step": 720 }, { - "epoch": 2.5326047358834245, - "grad_norm": 0.34448346495628357, + "epoch": 1.0553734061930784, + "grad_norm": 0.22231045365333557, "learning_rate": 2e-05, - "loss": 0.9038, - "mean_token_accuracy": 0.7381159228060409, - "step": 3475 + "loss": 0.9309, + "mean_token_accuracy": 0.7322361993160723, + "step": 725 }, { - "epoch": 2.536247723132969, - "grad_norm": 0.31645679473876953, + "epoch": 1.0626593806921676, + "grad_norm": 0.26311782002449036, "learning_rate": 2e-05, - "loss": 0.8834, - "mean_token_accuracy": 0.7446140693698095, - "step": 3480 + "loss": 0.927, + "mean_token_accuracy": 0.7335413663120914, + "step": 730 }, { - "epoch": 2.5398907103825135, - "grad_norm": 0.3462752401828766, + "epoch": 1.0699453551912568, + "grad_norm": 0.27380216121673584, "learning_rate": 2e-05, - "loss": 0.9091, - "mean_token_accuracy": 0.7369626282364435, - "step": 3485 + "loss": 0.9099, + "mean_token_accuracy": 0.7381701880801176, + "step": 735 }, { - "epoch": 2.5435336976320584, - "grad_norm": 0.3447904586791992, + "epoch": 1.0772313296903462, + "grad_norm": 0.23722966015338898, "learning_rate": 2e-05, - "loss": 0.8708, - "mean_token_accuracy": 0.7476581582804104, - "step": 3490 + "loss": 0.9104, + "mean_token_accuracy": 0.7384098680996581, + "step": 740 }, { - "epoch": 2.547176684881603, - "grad_norm": 0.34256285429000854, + "epoch": 1.0845173041894354, + "grad_norm": 0.23841090500354767, "learning_rate": 2e-05, - "loss": 0.8867, - "mean_token_accuracy": 0.7426874694675135, - "step": 3495 + "loss": 0.9326, + "mean_token_accuracy": 0.730987809040814, + "step": 745 }, { - "epoch": 2.550819672131148, - "grad_norm": 0.33325037360191345, + "epoch": 1.0918032786885246, + "grad_norm": 0.21909235417842865, "learning_rate": 2e-05, - "loss": 0.9018, - "mean_token_accuracy": 0.7395792623351245, - "step": 3500 + "loss": 0.9216, + "mean_token_accuracy": 0.7349217046220666, + "step": 750 }, { - "epoch": 2.550819672131148, - "eval_loss": 0.973312258720398, - "eval_mean_token_accuracy": 0.7216909538776792, - "eval_runtime": 14.34, - "eval_samples_per_second": 18.271, - "eval_steps_per_second": 1.185, - "step": 3500 + "epoch": 1.0990892531876137, + "grad_norm": 0.23422910273075104, + "learning_rate": 2e-05, + "loss": 0.9153, + "mean_token_accuracy": 0.737058805569126, + "step": 755 }, { - "epoch": 2.5544626593806923, - "grad_norm": 0.3150700032711029, + "epoch": 1.1063752276867032, + "grad_norm": 0.25514715909957886, "learning_rate": 2e-05, - "loss": 0.865, - "mean_token_accuracy": 0.7495542256961406, - "step": 3505 + "loss": 0.9185, + "mean_token_accuracy": 0.7353154005862238, + "step": 760 }, { - "epoch": 2.5581056466302368, - "grad_norm": 0.31783822178840637, + "epoch": 1.1136612021857923, + "grad_norm": 0.2466048300266266, "learning_rate": 2e-05, - "loss": 0.9029, - "mean_token_accuracy": 0.739710552027357, - "step": 3510 + "loss": 0.9171, + "mean_token_accuracy": 0.7364954811919885, + "step": 765 }, { - "epoch": 2.5617486338797812, - "grad_norm": 0.2993064224720001, + "epoch": 1.1209471766848815, + "grad_norm": 0.2330339401960373, "learning_rate": 2e-05, - "loss": 0.8922, - "mean_token_accuracy": 0.7411700048851977, - "step": 3515 + "loss": 0.9181, + "mean_token_accuracy": 0.7352024303859309, + "step": 770 }, { - "epoch": 2.565391621129326, - "grad_norm": 0.3268680274486542, + "epoch": 1.128233151183971, + "grad_norm": 0.23831795156002045, "learning_rate": 2e-05, - "loss": 0.9183, - "mean_token_accuracy": 0.7340380082411782, - "step": 3520 + "loss": 0.93, + "mean_token_accuracy": 0.7330208842208114, + "step": 775 }, { - "epoch": 2.5690346083788707, - "grad_norm": 0.313725084066391, + "epoch": 1.1355191256830601, + "grad_norm": 0.23831504583358765, "learning_rate": 2e-05, - "loss": 0.9055, - "mean_token_accuracy": 0.7373168050806058, - "step": 3525 + "loss": 0.928, + "mean_token_accuracy": 0.7325506839276994, + "step": 780 }, { - "epoch": 2.572677595628415, - "grad_norm": 0.31111860275268555, + "epoch": 1.1428051001821493, + "grad_norm": 0.243895024061203, "learning_rate": 2e-05, - "loss": 0.8665, - "mean_token_accuracy": 0.7484214704445529, - "step": 3530 + "loss": 0.926, + "mean_token_accuracy": 0.7334284929164633, + "step": 785 }, { - "epoch": 2.57632058287796, - "grad_norm": 0.36401307582855225, + "epoch": 1.1500910746812387, + "grad_norm": 0.23136085271835327, "learning_rate": 2e-05, - "loss": 0.9009, - "mean_token_accuracy": 0.7393563751831947, - "step": 3535 + "loss": 0.9135, + "mean_token_accuracy": 0.736092452369321, + "step": 790 }, { - "epoch": 2.5799635701275045, - "grad_norm": 0.32261475920677185, + "epoch": 1.157377049180328, + "grad_norm": 0.2278895527124405, "learning_rate": 2e-05, - "loss": 0.904, - "mean_token_accuracy": 0.7369870542256962, - "step": 3540 + "loss": 0.8998, + "mean_token_accuracy": 0.7414035784074257, + "step": 795 }, { - "epoch": 2.583606557377049, - "grad_norm": 0.3348728120326996, + "epoch": 1.164663023679417, + "grad_norm": 0.21887780725955963, "learning_rate": 2e-05, - "loss": 0.8999, - "mean_token_accuracy": 0.7393777479237909, - "step": 3545 + "loss": 0.931, + "mean_token_accuracy": 0.7319369809477286, + "step": 800 }, { - "epoch": 2.587249544626594, - "grad_norm": 0.3013302683830261, - "learning_rate": 2e-05, - "loss": 0.8864, - "mean_token_accuracy": 0.7442293600390817, - "step": 3550 + "epoch": 1.164663023679417, + "eval_loss": 0.962488055229187, + "eval_mean_token_accuracy": 0.7237455079222439, + "eval_runtime": 35.8741, + "eval_samples_per_second": 7.303, + "eval_steps_per_second": 0.474, + "step": 800 }, { - "epoch": 2.5908925318761384, - "grad_norm": 0.3235645890235901, + "epoch": 1.1719489981785063, + "grad_norm": 0.24219731986522675, "learning_rate": 2e-05, - "loss": 0.8804, - "mean_token_accuracy": 0.7435545920859795, - "step": 3555 + "loss": 0.9308, + "mean_token_accuracy": 0.7323140571568149, + "step": 805 }, { - "epoch": 2.594535519125683, - "grad_norm": 0.30001094937324524, + "epoch": 1.1792349726775957, + "grad_norm": 0.23919643461704254, "learning_rate": 2e-05, - "loss": 0.8755, - "mean_token_accuracy": 0.7467116511968734, - "step": 3560 + "loss": 0.9198, + "mean_token_accuracy": 0.73474902296043, + "step": 810 }, { - "epoch": 2.598178506375228, - "grad_norm": 0.3245663642883301, + "epoch": 1.1865209471766849, + "grad_norm": 0.2236970216035843, "learning_rate": 2e-05, - "loss": 0.9012, - "mean_token_accuracy": 0.7391304347826086, - "step": 3565 + "loss": 0.8978, + "mean_token_accuracy": 0.7420615534929167, + "step": 815 }, { - "epoch": 2.6018214936247723, - "grad_norm": 0.30933406949043274, + "epoch": 1.193806921675774, + "grad_norm": 0.22984063625335693, "learning_rate": 2e-05, - "loss": 0.8758, - "mean_token_accuracy": 0.7474138983878847, - "step": 3570 + "loss": 0.9075, + "mean_token_accuracy": 0.7390708964338057, + "step": 820 }, { - "epoch": 2.605464480874317, - "grad_norm": 0.33006611466407776, + "epoch": 1.2010928961748635, + "grad_norm": 0.24438656866550446, "learning_rate": 2e-05, - "loss": 0.9045, - "mean_token_accuracy": 0.7370572789447973, - "step": 3575 + "loss": 0.9321, + "mean_token_accuracy": 0.7320885006237531, + "step": 825 }, { - "epoch": 2.6091074681238613, - "grad_norm": 0.3191071152687073, + "epoch": 1.2083788706739527, + "grad_norm": 0.23722946643829346, "learning_rate": 2e-05, - "loss": 0.8865, - "mean_token_accuracy": 0.7418264533463605, - "step": 3580 + "loss": 0.912, + "mean_token_accuracy": 0.7364176233512458, + "step": 830 }, { - "epoch": 2.612750455373406, - "grad_norm": 0.3415972888469696, + "epoch": 1.2156648451730419, + "grad_norm": 0.24531032145023346, "learning_rate": 2e-05, - "loss": 0.8861, - "mean_token_accuracy": 0.7413806790425013, - "step": 3585 + "loss": 0.9119, + "mean_token_accuracy": 0.7369840009770396, + "step": 835 }, { - "epoch": 2.6163934426229507, - "grad_norm": 0.3029073178768158, + "epoch": 1.222950819672131, + "grad_norm": 0.2706034481525421, "learning_rate": 2e-05, - "loss": 0.8917, - "mean_token_accuracy": 0.7411974841231072, - "step": 3590 + "loss": 0.9024, + "mean_token_accuracy": 0.7401547997068881, + "step": 840 }, { - "epoch": 2.6200364298724956, - "grad_norm": 0.325960636138916, + "epoch": 1.2302367941712204, + "grad_norm": 0.23218482732772827, "learning_rate": 2e-05, - "loss": 0.9157, - "mean_token_accuracy": 0.7347215437225207, - "step": 3595 + "loss": 0.9054, + "mean_token_accuracy": 0.7385793234000977, + "step": 845 }, { - "epoch": 2.62367941712204, - "grad_norm": 0.3525567054748535, + "epoch": 1.2375227686703096, + "grad_norm": 0.23459894955158234, "learning_rate": 2e-05, - "loss": 0.8991, - "mean_token_accuracy": 0.7409623839765509, - "step": 3600 + "loss": 0.922, + "mean_token_accuracy": 0.7354253175378604, + "step": 850 }, { - "epoch": 2.62367941712204, - "eval_loss": 0.9719704389572144, - "eval_mean_token_accuracy": 0.7224160455018883, - "eval_runtime": 14.2805, - "eval_samples_per_second": 18.347, - "eval_steps_per_second": 1.19, - "step": 3600 + "epoch": 1.2448087431693988, + "grad_norm": 0.25812986493110657, + "learning_rate": 2e-05, + "loss": 0.9228, + "mean_token_accuracy": 0.7338277706546423, + "step": 855 }, { - "epoch": 2.6273224043715846, - "grad_norm": 0.3182205557823181, + "epoch": 1.2520947176684882, + "grad_norm": 0.2745145261287689, "learning_rate": 2e-05, - "loss": 0.8875, - "mean_token_accuracy": 0.7440828041035662, - "step": 3605 + "loss": 0.9108, + "mean_token_accuracy": 0.7370404860771861, + "step": 860 }, { - "epoch": 2.630965391621129, - "grad_norm": 0.3188170790672302, + "epoch": 1.2593806921675774, + "grad_norm": 0.2526284158229828, "learning_rate": 2e-05, - "loss": 0.8778, - "mean_token_accuracy": 0.7443850757205668, - "step": 3610 + "loss": 0.898, + "mean_token_accuracy": 0.7409898632144605, + "step": 865 }, { - "epoch": 2.634608378870674, - "grad_norm": 0.320301353931427, + "epoch": 1.2666666666666666, + "grad_norm": 0.24674515426158905, "learning_rate": 2e-05, - "loss": 0.8714, - "mean_token_accuracy": 0.7470810942843185, - "step": 3615 + "loss": 0.8939, + "mean_token_accuracy": 0.7410646678065462, + "step": 870 }, { - "epoch": 2.6382513661202185, - "grad_norm": 0.3600040078163147, + "epoch": 1.273952641165756, + "grad_norm": 0.23100045323371887, "learning_rate": 2e-05, - "loss": 0.9075, - "mean_token_accuracy": 0.7374511480214949, - "step": 3620 + "loss": 0.9145, + "mean_token_accuracy": 0.7354451636541282, + "step": 875 }, { - "epoch": 2.6418943533697634, - "grad_norm": 0.34417158365249634, + "epoch": 1.2812386156648452, + "grad_norm": 0.23644477128982544, "learning_rate": 2e-05, - "loss": 0.8938, - "mean_token_accuracy": 0.743273693209575, - "step": 3625 + "loss": 0.9088, + "mean_token_accuracy": 0.7372084147532976, + "step": 880 }, { - "epoch": 2.645537340619308, - "grad_norm": 0.3184971213340759, + "epoch": 1.2885245901639344, + "grad_norm": 0.25217151641845703, "learning_rate": 2e-05, - "loss": 0.884, - "mean_token_accuracy": 0.7432584269662921, - "step": 3630 + "loss": 0.9217, + "mean_token_accuracy": 0.7347871885686371, + "step": 885 }, { - "epoch": 2.6491803278688524, - "grad_norm": 0.34727537631988525, + "epoch": 1.2958105646630238, + "grad_norm": 0.25527337193489075, "learning_rate": 2e-05, - "loss": 0.9198, - "mean_token_accuracy": 0.7346726917440157, - "step": 3635 + "loss": 0.9227, + "mean_token_accuracy": 0.7333750610649734, + "step": 890 }, { - "epoch": 2.652823315118397, - "grad_norm": 0.3686891496181488, + "epoch": 1.303096539162113, + "grad_norm": 0.2655749022960663, "learning_rate": 2e-05, - "loss": 0.8614, - "mean_token_accuracy": 0.7494320957498779, - "step": 3640 + "loss": 0.8971, + "mean_token_accuracy": 0.7408488269506435, + "step": 895 }, { - "epoch": 2.656466302367942, - "grad_norm": 0.3379385769367218, + "epoch": 1.3103825136612022, + "grad_norm": 0.26281771063804626, "learning_rate": 2e-05, - "loss": 0.8768, - "mean_token_accuracy": 0.7454476062530533, - "step": 3645 + "loss": 0.9074, + "mean_token_accuracy": 0.7382098803126528, + "step": 900 }, { - "epoch": 2.6601092896174863, - "grad_norm": 0.3417653441429138, - "learning_rate": 2e-05, - "loss": 0.8937, - "mean_token_accuracy": 0.7402234978016609, - "step": 3650 + "epoch": 1.3103825136612022, + "eval_loss": 0.9559279680252075, + "eval_mean_token_accuracy": 0.7254805694923981, + "eval_runtime": 37.1818, + "eval_samples_per_second": 7.046, + "eval_steps_per_second": 0.457, + "step": 900 }, { - "epoch": 2.663752276867031, - "grad_norm": 0.32817432284355164, + "epoch": 1.3176684881602914, + "grad_norm": 0.24635250866413116, "learning_rate": 2e-05, - "loss": 0.8892, - "mean_token_accuracy": 0.7415791402051782, - "step": 3655 + "loss": 0.9268, + "mean_token_accuracy": 0.7334086468001956, + "step": 905 }, { - "epoch": 2.6673952641165757, - "grad_norm": 0.3513006865978241, + "epoch": 1.3249544626593808, + "grad_norm": 0.2402939647436142, "learning_rate": 2e-05, - "loss": 0.8658, - "mean_token_accuracy": 0.7484397899364924, - "step": 3660 + "loss": 0.8963, + "mean_token_accuracy": 0.7412554958475819, + "step": 910 }, { - "epoch": 2.67103825136612, - "grad_norm": 0.36159011721611023, + "epoch": 1.33224043715847, + "grad_norm": 0.23926663398742676, "learning_rate": 2e-05, - "loss": 0.8869, - "mean_token_accuracy": 0.7432492672203225, - "step": 3665 + "loss": 0.9015, + "mean_token_accuracy": 0.7405654616511969, + "step": 915 }, { - "epoch": 2.6746812386156646, - "grad_norm": 0.3413192927837372, + "epoch": 1.3395264116575591, + "grad_norm": 0.23617114126682281, "learning_rate": 2e-05, - "loss": 0.8909, - "mean_token_accuracy": 0.7409226917440156, - "step": 3670 + "loss": 0.9204, + "mean_token_accuracy": 0.734159745969712, + "step": 920 }, { - "epoch": 2.6783242258652096, - "grad_norm": 0.30612418055534363, + "epoch": 1.3468123861566483, + "grad_norm": 0.24529297649860382, "learning_rate": 2e-05, - "loss": 0.8743, - "mean_token_accuracy": 0.744739252564729, - "step": 3675 + "loss": 0.91, + "mean_token_accuracy": 0.7378984489496825, + "step": 925 }, { - "epoch": 2.681967213114754, - "grad_norm": 0.3212524354457855, + "epoch": 1.3540983606557377, + "grad_norm": 0.2535146176815033, "learning_rate": 2e-05, - "loss": 0.865, - "mean_token_accuracy": 0.7500091597459695, - "step": 3680 + "loss": 0.8946, + "mean_token_accuracy": 0.740582254518808, + "step": 930 }, { - "epoch": 2.685610200364299, - "grad_norm": 0.3098811209201813, + "epoch": 1.361384335154827, + "grad_norm": 0.24595101177692413, "learning_rate": 2e-05, - "loss": 0.9011, - "mean_token_accuracy": 0.7383976551050317, - "step": 3685 + "loss": 0.9149, + "mean_token_accuracy": 0.7362970200293113, + "step": 935 }, { - "epoch": 2.6892531876138435, - "grad_norm": 0.32596051692962646, + "epoch": 1.3686703096539161, + "grad_norm": 0.22996020317077637, "learning_rate": 2e-05, - "loss": 0.9077, - "mean_token_accuracy": 0.7350451880801172, - "step": 3690 + "loss": 0.8912, + "mean_token_accuracy": 0.7419562164142649, + "step": 940 }, { - "epoch": 2.692896174863388, - "grad_norm": 0.333390474319458, + "epoch": 1.3759562841530055, + "grad_norm": 0.27776476740837097, "learning_rate": 2e-05, - "loss": 0.8947, - "mean_token_accuracy": 0.7402296042989742, - "step": 3695 + "loss": 0.9057, + "mean_token_accuracy": 0.7376755617977531, + "step": 945 }, { - "epoch": 2.6965391621129324, - "grad_norm": 0.3289414048194885, + "epoch": 1.3832422586520947, + "grad_norm": 0.2726893126964569, "learning_rate": 2e-05, - "loss": 0.9052, - "mean_token_accuracy": 0.7383640693698095, - "step": 3700 + "loss": 0.8975, + "mean_token_accuracy": 0.7407517098192475, + "step": 950 }, { - "epoch": 2.6965391621129324, - "eval_loss": 0.970956563949585, - "eval_mean_token_accuracy": 0.722475216524625, - "eval_runtime": 14.3324, - "eval_samples_per_second": 18.28, - "eval_steps_per_second": 1.186, - "step": 3700 + "epoch": 1.390528233151184, + "grad_norm": 0.24319769442081451, + "learning_rate": 2e-05, + "loss": 0.911, + "mean_token_accuracy": 0.7371152906692723, + "step": 955 }, { - "epoch": 2.7001821493624774, - "grad_norm": 0.30651167035102844, + "epoch": 1.3978142076502733, + "grad_norm": 0.24976621568202972, "learning_rate": 2e-05, - "loss": 0.885, - "mean_token_accuracy": 0.7432095749877871, - "step": 3705 + "loss": 0.9148, + "mean_token_accuracy": 0.7358390327308257, + "step": 960 }, { - "epoch": 2.703825136612022, - "grad_norm": 0.30320581793785095, + "epoch": 1.4051001821493625, + "grad_norm": 0.23896424472332, "learning_rate": 2e-05, - "loss": 0.8982, - "mean_token_accuracy": 0.7390021983390329, - "step": 3710 + "loss": 0.8832, + "mean_token_accuracy": 0.7439560942843186, + "step": 965 }, { - "epoch": 2.7074681238615663, - "grad_norm": 0.30784809589385986, + "epoch": 1.4123861566484517, + "grad_norm": 0.23795920610427856, "learning_rate": 2e-05, - "loss": 0.9066, - "mean_token_accuracy": 0.7373412310698583, - "step": 3715 + "loss": 0.9021, + "mean_token_accuracy": 0.7406188935026868, + "step": 970 }, { - "epoch": 2.7111111111111112, - "grad_norm": 0.34543514251708984, + "epoch": 1.419672131147541, + "grad_norm": 0.2664782404899597, "learning_rate": 2e-05, - "loss": 0.9057, - "mean_token_accuracy": 0.7384190278456277, - "step": 3720 + "loss": 0.9101, + "mean_token_accuracy": 0.7370694919394236, + "step": 975 }, { - "epoch": 2.7147540983606557, - "grad_norm": 0.30098143219947815, + "epoch": 1.4269581056466303, + "grad_norm": 0.2434052973985672, "learning_rate": 2e-05, - "loss": 0.877, - "mean_token_accuracy": 0.7448186370297997, - "step": 3725 + "loss": 0.8896, + "mean_token_accuracy": 0.7435683317049342, + "step": 980 }, { - "epoch": 2.7183970856102, - "grad_norm": 0.3111075460910797, + "epoch": 1.4342440801457195, + "grad_norm": 0.2583552300930023, "learning_rate": 2e-05, - "loss": 0.8778, - "mean_token_accuracy": 0.7456033219345384, - "step": 3730 + "loss": 0.8936, + "mean_token_accuracy": 0.7419669027845627, + "step": 985 }, { - "epoch": 2.722040072859745, - "grad_norm": 0.316897988319397, + "epoch": 1.4415300546448089, + "grad_norm": 0.24104474484920502, "learning_rate": 2e-05, - "loss": 0.8799, - "mean_token_accuracy": 0.7437561064973132, - "step": 3735 + "loss": 0.9149, + "mean_token_accuracy": 0.7361550439667807, + "step": 990 }, { - "epoch": 2.7256830601092896, - "grad_norm": 0.3311839699745178, + "epoch": 1.448816029143898, + "grad_norm": 0.247370645403862, "learning_rate": 2e-05, - "loss": 0.8764, - "mean_token_accuracy": 0.7459161556975303, - "step": 3740 + "loss": 0.9037, + "mean_token_accuracy": 0.7384266609672692, + "step": 995 }, { - "epoch": 2.729326047358834, - "grad_norm": 0.31449511647224426, + "epoch": 1.4561020036429873, + "grad_norm": 0.22899129986763, "learning_rate": 2e-05, - "loss": 0.9195, - "mean_token_accuracy": 0.7339704445530045, - "step": 3745 + "loss": 0.9055, + "mean_token_accuracy": 0.7383884953590621, + "step": 1000 }, { - "epoch": 2.732969034608379, - "grad_norm": 0.3076689541339874, - "learning_rate": 2e-05, - "loss": 0.9106, - "mean_token_accuracy": 0.7375, - "step": 3750 + "epoch": 1.4561020036429873, + "eval_loss": 0.9484136700630188, + "eval_mean_token_accuracy": 0.7270321376011663, + "eval_runtime": 36.2695, + "eval_samples_per_second": 7.224, + "eval_steps_per_second": 0.469, + "step": 1000 }, { - "epoch": 2.7366120218579235, - "grad_norm": 0.32780900597572327, + "epoch": 1.4633879781420764, + "grad_norm": 0.2408648431301117, "learning_rate": 2e-05, - "loss": 0.8937, - "mean_token_accuracy": 0.7403578407425501, - "step": 3755 + "loss": 0.8974, + "mean_token_accuracy": 0.7416111993160724, + "step": 1005 }, { - "epoch": 2.740255009107468, - "grad_norm": 0.33124321699142456, + "epoch": 1.4706739526411656, + "grad_norm": 0.22463728487491608, "learning_rate": 2e-05, - "loss": 0.8945, - "mean_token_accuracy": 0.7399700781631655, - "step": 3760 + "loss": 0.9098, + "mean_token_accuracy": 0.7365138006839276, + "step": 1010 }, { - "epoch": 2.7438979963570125, - "grad_norm": 0.3387695848941803, + "epoch": 1.477959927140255, + "grad_norm": 0.2450459599494934, "learning_rate": 2e-05, - "loss": 0.8765, - "mean_token_accuracy": 0.7466475329750856, - "step": 3765 + "loss": 0.9068, + "mean_token_accuracy": 0.7383442232535418, + "step": 1015 }, { - "epoch": 2.7475409836065574, - "grad_norm": 0.3448026478290558, + "epoch": 1.4852459016393442, + "grad_norm": 0.25232186913490295, "learning_rate": 2e-05, - "loss": 0.8911, - "mean_token_accuracy": 0.7415677923927882, - "step": 3770 + "loss": 0.8972, + "mean_token_accuracy": 0.7410585613092329, + "step": 1020 }, { - "epoch": 2.751183970856102, - "grad_norm": 0.3187689781188965, + "epoch": 1.4925318761384334, + "grad_norm": 0.23277677595615387, "learning_rate": 2e-05, - "loss": 0.8754, - "mean_token_accuracy": 0.745847581827064, - "step": 3775 + "loss": 0.8895, + "mean_token_accuracy": 0.7428874572545188, + "step": 1025 }, { - "epoch": 2.754826958105647, - "grad_norm": 0.31393617391586304, + "epoch": 1.4998178506375228, + "grad_norm": 0.24832560122013092, "learning_rate": 2e-05, - "loss": 0.9051, - "mean_token_accuracy": 0.7382816316560822, - "step": 3780 + "loss": 0.8962, + "mean_token_accuracy": 0.7403868466047876, + "step": 1030 }, { - "epoch": 2.7584699453551913, - "grad_norm": 0.3261832594871521, + "epoch": 1.507103825136612, + "grad_norm": 0.24917817115783691, "learning_rate": 2e-05, - "loss": 0.8941, - "mean_token_accuracy": 0.7418172936003908, - "step": 3785 + "loss": 0.899, + "mean_token_accuracy": 0.7395075109916951, + "step": 1035 }, { - "epoch": 2.762112932604736, - "grad_norm": 0.30824825167655945, + "epoch": 1.5143897996357012, + "grad_norm": 0.2454776167869568, "learning_rate": 2e-05, - "loss": 0.8949, - "mean_token_accuracy": 0.7418473941582479, - "step": 3790 + "loss": 0.925, + "mean_token_accuracy": 0.7319293695039306, + "step": 1040 }, { - "epoch": 2.7657559198542803, - "grad_norm": 0.3116508722305298, + "epoch": 1.5216757741347906, + "grad_norm": 0.2297164499759674, "learning_rate": 2e-05, - "loss": 0.8786, - "mean_token_accuracy": 0.7441438690766975, - "step": 3795 + "loss": 0.9006, + "mean_token_accuracy": 0.7397594040058624, + "step": 1045 }, { - "epoch": 2.769398907103825, - "grad_norm": 0.365998238325119, + "epoch": 1.5289617486338798, + "grad_norm": 0.23188287019729614, "learning_rate": 2e-05, - "loss": 0.8915, - "mean_token_accuracy": 0.741508915486077, - "step": 3800 + "loss": 0.909, + "mean_token_accuracy": 0.7355245481191989, + "step": 1050 }, { - "epoch": 2.769398907103825, - "eval_loss": 0.9683325290679932, - "eval_mean_token_accuracy": 0.7231605948989424, - "eval_runtime": 14.2659, - "eval_samples_per_second": 18.366, - "eval_steps_per_second": 1.192, - "step": 3800 + "epoch": 1.536247723132969, + "grad_norm": 0.2465352863073349, + "learning_rate": 2e-05, + "loss": 0.8836, + "mean_token_accuracy": 0.7430340131900344, + "step": 1055 }, { - "epoch": 2.7730418943533697, - "grad_norm": 0.31595027446746826, + "epoch": 1.5435336976320584, + "grad_norm": 0.24740713834762573, "learning_rate": 2e-05, - "loss": 0.9011, - "mean_token_accuracy": 0.7383213238886175, - "step": 3805 + "loss": 0.9019, + "mean_token_accuracy": 0.7388159501709819, + "step": 1060 }, { - "epoch": 2.7766848816029146, - "grad_norm": 0.3480280041694641, + "epoch": 1.5508196721311476, + "grad_norm": 0.24196745455265045, "learning_rate": 2e-05, - "loss": 0.8807, - "mean_token_accuracy": 0.744873595505618, - "step": 3810 + "loss": 0.9029, + "mean_token_accuracy": 0.7383976551050319, + "step": 1065 }, { - "epoch": 2.780327868852459, - "grad_norm": 0.3077247440814972, + "epoch": 1.5581056466302368, + "grad_norm": 0.26808851957321167, "learning_rate": 2e-05, - "loss": 0.8804, - "mean_token_accuracy": 0.7448308500244261, - "step": 3815 + "loss": 0.8949, + "mean_token_accuracy": 0.7395578895945287, + "step": 1070 }, { - "epoch": 2.7839708561020036, - "grad_norm": 0.31197667121887207, + "epoch": 1.5653916211293262, + "grad_norm": 0.23099803924560547, "learning_rate": 2e-05, - "loss": 0.8889, - "mean_token_accuracy": 0.7428950903761603, - "step": 3820 + "loss": 0.8969, + "mean_token_accuracy": 0.740501343429409, + "step": 1075 }, { - "epoch": 2.787613843351548, - "grad_norm": 0.30954769253730774, + "epoch": 1.5726775956284151, + "grad_norm": 0.23486842215061188, "learning_rate": 2e-05, - "loss": 0.8974, - "mean_token_accuracy": 0.739206765999023, - "step": 3825 + "loss": 0.8934, + "mean_token_accuracy": 0.7417089032730828, + "step": 1080 }, { - "epoch": 2.791256830601093, - "grad_norm": 0.30114755034446716, + "epoch": 1.5799635701275045, + "grad_norm": 0.27474597096443176, "learning_rate": 2e-05, - "loss": 0.9126, - "mean_token_accuracy": 0.73676111382511, - "step": 3830 + "loss": 0.8868, + "mean_token_accuracy": 0.7421821568148511, + "step": 1085 }, { - "epoch": 2.7948998178506375, - "grad_norm": 0.3321165442466736, + "epoch": 1.587249544626594, + "grad_norm": 0.25371241569519043, "learning_rate": 2e-05, - "loss": 0.8907, - "mean_token_accuracy": 0.7411974841231069, - "step": 3835 + "loss": 0.9051, + "mean_token_accuracy": 0.7380175256472887, + "step": 1090 }, { - "epoch": 2.7985428051001824, - "grad_norm": 0.3215268552303314, + "epoch": 1.594535519125683, + "grad_norm": 0.24320611357688904, "learning_rate": 2e-05, - "loss": 0.8832, - "mean_token_accuracy": 0.7422920089136892, - "step": 3840 + "loss": 0.8896, + "mean_token_accuracy": 0.7424699888439653, + "step": 1095 }, { - "epoch": 2.802185792349727, - "grad_norm": 0.3101475238800049, + "epoch": 1.6018214936247723, + "grad_norm": 0.24586845934391022, "learning_rate": 2e-05, - "loss": 0.8845, - "mean_token_accuracy": 0.7431729360039082, - "step": 3845 + "loss": 0.9074, + "mean_token_accuracy": 0.7369397288715193, + "step": 1100 }, { - "epoch": 2.8058287795992713, - "grad_norm": 0.30512532591819763, - "learning_rate": 2e-05, - "loss": 0.8797, - "mean_token_accuracy": 0.7461132144601857, - "step": 3850 + "epoch": 1.6018214936247723, + "eval_loss": 0.9426586627960205, + "eval_mean_token_accuracy": 0.7278746708682018, + "eval_runtime": 36.7765, + "eval_samples_per_second": 7.124, + "eval_steps_per_second": 0.462, + "step": 1100 }, { - "epoch": 2.809471766848816, - "grad_norm": 0.30885010957717896, + "epoch": 1.6091074681238615, + "grad_norm": 0.2685466408729553, "learning_rate": 2e-05, - "loss": 0.8835, - "mean_token_accuracy": 0.7433317049340498, - "step": 3855 + "loss": 0.8912, + "mean_token_accuracy": 0.7420249145090378, + "step": 1105 }, { - "epoch": 2.8131147540983608, - "grad_norm": 0.31740885972976685, + "epoch": 1.6163934426229507, + "grad_norm": 0.24883808195590973, "learning_rate": 2e-05, - "loss": 0.9041, - "mean_token_accuracy": 0.7379488275525158, - "step": 3860 + "loss": 0.8973, + "mean_token_accuracy": 0.7395487298485591, + "step": 1110 }, { - "epoch": 2.8167577413479052, - "grad_norm": 0.3246329426765442, + "epoch": 1.6236794171220401, + "grad_norm": 0.24616660177707672, "learning_rate": 2e-05, - "loss": 0.8886, - "mean_token_accuracy": 0.7418539325842698, - "step": 3865 + "loss": 0.9001, + "mean_token_accuracy": 0.7390174645823157, + "step": 1115 }, { - "epoch": 2.82040072859745, - "grad_norm": 0.3088524341583252, + "epoch": 1.6309653916211293, + "grad_norm": 0.24163974821567535, "learning_rate": 2e-05, - "loss": 0.891, - "mean_token_accuracy": 0.7422966536394723, - "step": 3870 + "loss": 0.8976, + "mean_token_accuracy": 0.7398876404494382, + "step": 1120 }, { - "epoch": 2.8240437158469947, - "grad_norm": 0.33513322472572327, + "epoch": 1.6382513661202185, + "grad_norm": 0.3287493586540222, "learning_rate": 2e-05, - "loss": 0.9228, - "mean_token_accuracy": 0.732694186614558, - "step": 3875 + "loss": 0.9024, + "mean_token_accuracy": 0.7385472780685156, + "step": 1125 }, { - "epoch": 2.827686703096539, - "grad_norm": 0.33118847012519836, + "epoch": 1.645537340619308, + "grad_norm": 0.27125054597854614, "learning_rate": 2e-05, - "loss": 0.8659, - "mean_token_accuracy": 0.7481375183194922, - "step": 3880 + "loss": 0.8934, + "mean_token_accuracy": 0.7404921836834394, + "step": 1130 }, { - "epoch": 2.8313296903460836, - "grad_norm": 0.3234773576259613, + "epoch": 1.652823315118397, + "grad_norm": 0.2530953884124756, "learning_rate": 2e-05, - "loss": 0.9047, - "mean_token_accuracy": 0.7398235222276502, - "step": 3885 + "loss": 0.8947, + "mean_token_accuracy": 0.7405977039159446, + "step": 1135 }, { - "epoch": 2.8349726775956285, - "grad_norm": 0.3078250586986542, + "epoch": 1.6601092896174863, + "grad_norm": 0.24643385410308838, "learning_rate": 2e-05, - "loss": 0.8619, - "mean_token_accuracy": 0.7485436003908157, - "step": 3890 + "loss": 0.9148, + "mean_token_accuracy": 0.734823827552516, + "step": 1140 }, { - "epoch": 2.838615664845173, - "grad_norm": 0.3281988203525543, + "epoch": 1.6673952641165757, + "grad_norm": 0.2359907627105713, "learning_rate": 2e-05, - "loss": 0.8847, - "mean_token_accuracy": 0.7424554225696142, - "step": 3895 + "loss": 0.8908, + "mean_token_accuracy": 0.7428370786516856, + "step": 1145 }, { - "epoch": 2.8422586520947175, - "grad_norm": 0.3436889946460724, + "epoch": 1.6746812386156649, + "grad_norm": 0.2252684086561203, "learning_rate": 2e-05, - "loss": 0.9215, - "mean_token_accuracy": 0.7336040547142159, - "step": 3900 + "loss": 0.9013, + "mean_token_accuracy": 0.7393166829506594, + "step": 1150 }, { - "epoch": 2.8422586520947175, - "eval_loss": 0.9669012427330017, - "eval_mean_token_accuracy": 0.7233897867800676, - "eval_runtime": 14.2753, - "eval_samples_per_second": 18.353, - "eval_steps_per_second": 1.191, - "step": 3900 + "epoch": 1.681967213114754, + "grad_norm": 0.24328413605690002, + "learning_rate": 2e-05, + "loss": 0.9094, + "mean_token_accuracy": 0.7362924401563264, + "step": 1155 }, { - "epoch": 2.8459016393442624, - "grad_norm": 0.32411956787109375, + "epoch": 1.6892531876138435, + "grad_norm": 0.25157198309898376, "learning_rate": 2e-05, - "loss": 0.9043, - "mean_token_accuracy": 0.7381014899853444, - "step": 3905 + "loss": 0.8964, + "mean_token_accuracy": 0.740287616023449, + "step": 1160 }, { - "epoch": 2.849544626593807, - "grad_norm": 0.293822318315506, + "epoch": 1.6965391621129327, + "grad_norm": 0.2484838217496872, "learning_rate": 2e-05, - "loss": 0.8795, - "mean_token_accuracy": 0.745911700048852, - "step": 3910 + "loss": 0.8947, + "mean_token_accuracy": 0.7410952002931117, + "step": 1165 }, { - "epoch": 2.8531876138433514, - "grad_norm": 0.31667500734329224, + "epoch": 1.7038251366120218, + "grad_norm": 0.23148389160633087, "learning_rate": 2e-05, - "loss": 0.8954, - "mean_token_accuracy": 0.7399792379091353, - "step": 3915 + "loss": 0.8939, + "mean_token_accuracy": 0.7413776257938448, + "step": 1170 }, { - "epoch": 2.8568306010928963, - "grad_norm": 0.29437676072120667, + "epoch": 1.7111111111111112, + "grad_norm": 0.2488527148962021, "learning_rate": 2e-05, - "loss": 0.8796, - "mean_token_accuracy": 0.7442171470444553, - "step": 3920 + "loss": 0.8938, + "mean_token_accuracy": 0.7411043600390815, + "step": 1175 }, { - "epoch": 2.860473588342441, - "grad_norm": 0.3104299306869507, + "epoch": 1.7183970856102002, + "grad_norm": 0.24698656797409058, "learning_rate": 2e-05, - "loss": 0.8805, - "mean_token_accuracy": 0.7429195163654126, - "step": 3925 + "loss": 0.8933, + "mean_token_accuracy": 0.7404463849535907, + "step": 1180 }, { - "epoch": 2.8641165755919853, - "grad_norm": 0.3084963858127594, + "epoch": 1.7256830601092896, + "grad_norm": 0.2506196200847626, "learning_rate": 2e-05, - "loss": 0.8997, - "mean_token_accuracy": 0.7388525891548607, - "step": 3930 + "loss": 0.8828, + "mean_token_accuracy": 0.7443102711284807, + "step": 1185 }, { - "epoch": 2.86775956284153, - "grad_norm": 0.3696286678314209, + "epoch": 1.732969034608379, + "grad_norm": 0.22465619444847107, "learning_rate": 2e-05, - "loss": 0.8896, - "mean_token_accuracy": 0.7407944553004396, - "step": 3935 + "loss": 0.8947, + "mean_token_accuracy": 0.740437225207621, + "step": 1190 }, { - "epoch": 2.8714025500910747, - "grad_norm": 0.30959826707839966, + "epoch": 1.740255009107468, + "grad_norm": 0.2372632622718811, "learning_rate": 2e-05, - "loss": 0.8934, - "mean_token_accuracy": 0.7408810524700733, - "step": 3940 + "loss": 0.9029, + "mean_token_accuracy": 0.7392021861260379, + "step": 1195 }, { - "epoch": 2.875045537340619, - "grad_norm": 0.3243521451950073, + "epoch": 1.7475409836065574, + "grad_norm": 0.259969025850296, "learning_rate": 2e-05, - "loss": 0.8602, - "mean_token_accuracy": 0.7506778212017586, - "step": 3945 + "loss": 0.878, + "mean_token_accuracy": 0.7458796409379582, + "step": 1200 }, { - "epoch": 2.8786885245901637, - "grad_norm": 0.33037546277046204, - "learning_rate": 2e-05, - "loss": 0.8975, - "mean_token_accuracy": 0.7393472154372251, - "step": 3950 + "epoch": 1.7475409836065574, + "eval_loss": 0.9372912645339966, + "eval_mean_token_accuracy": 0.7289227538789632, + "eval_runtime": 36.6965, + "eval_samples_per_second": 7.14, + "eval_steps_per_second": 0.463, + "step": 1200 }, { - "epoch": 2.8823315118397086, - "grad_norm": 0.3091663718223572, + "epoch": 1.7548269581056466, + "grad_norm": 0.23141299188137054, "learning_rate": 2e-05, - "loss": 0.8705, - "mean_token_accuracy": 0.7468124084025402, - "step": 3955 + "loss": 0.8977, + "mean_token_accuracy": 0.7405669882755255, + "step": 1205 }, { - "epoch": 2.885974499089253, - "grad_norm": 0.32397231459617615, + "epoch": 1.7621129326047358, + "grad_norm": 0.241252601146698, "learning_rate": 2e-05, - "loss": 0.896, - "mean_token_accuracy": 0.7402326575476307, - "step": 3960 + "loss": 0.8983, + "mean_token_accuracy": 0.7395714566117425, + "step": 1210 }, { - "epoch": 2.889617486338798, - "grad_norm": 0.34801802039146423, + "epoch": 1.7693989071038252, + "grad_norm": 0.2374604344367981, "learning_rate": 2e-05, - "loss": 0.8859, - "mean_token_accuracy": 0.7428798241328776, - "step": 3965 + "loss": 0.8941, + "mean_token_accuracy": 0.7401441133365902, + "step": 1215 }, { - "epoch": 2.8932604735883425, - "grad_norm": 0.33170145750045776, + "epoch": 1.7766848816029144, + "grad_norm": 0.2405879646539688, "learning_rate": 2e-05, - "loss": 0.8748, - "mean_token_accuracy": 0.7472826086956522, - "step": 3970 + "loss": 0.8926, + "mean_token_accuracy": 0.7409089521250611, + "step": 1220 }, { - "epoch": 2.896903460837887, - "grad_norm": 0.3254745602607727, + "epoch": 1.7839708561020036, + "grad_norm": 0.2612534761428833, "learning_rate": 2e-05, - "loss": 0.8793, - "mean_token_accuracy": 0.7446232291157793, - "step": 3975 + "loss": 0.8894, + "mean_token_accuracy": 0.741570077275746, + "step": 1225 }, { - "epoch": 2.9005464480874315, - "grad_norm": 0.3225265145301819, + "epoch": 1.791256830601093, + "grad_norm": 0.22933730483055115, "learning_rate": 2e-05, - "loss": 0.8896, - "mean_token_accuracy": 0.7418783585735221, - "step": 3980 + "loss": 0.9101, + "mean_token_accuracy": 0.736487848070347, + "step": 1230 }, { - "epoch": 2.9041894353369764, - "grad_norm": 0.3272905945777893, + "epoch": 1.7985428051001822, + "grad_norm": 0.22422315180301666, "learning_rate": 2e-05, - "loss": 0.8889, - "mean_token_accuracy": 0.7421134587200783, - "step": 3985 + "loss": 0.8908, + "mean_token_accuracy": 0.7419745359062043, + "step": 1235 }, { - "epoch": 2.907832422586521, - "grad_norm": 0.34882068634033203, + "epoch": 1.8058287795992713, + "grad_norm": 0.22796830534934998, "learning_rate": 2e-05, - "loss": 0.8992, - "mean_token_accuracy": 0.7385655837811432, - "step": 3990 + "loss": 0.8922, + "mean_token_accuracy": 0.7410585613092332, + "step": 1240 }, { - "epoch": 2.911475409836066, - "grad_norm": 0.3148573040962219, + "epoch": 1.8131147540983608, + "grad_norm": 0.244882270693779, "learning_rate": 2e-05, - "loss": 0.8773, - "mean_token_accuracy": 0.7454720322423058, - "step": 3995 + "loss": 0.8833, + "mean_token_accuracy": 0.7435210063507574, + "step": 1245 }, { - "epoch": 2.9151183970856103, - "grad_norm": 0.31475773453712463, + "epoch": 1.82040072859745, + "grad_norm": 0.24056288599967957, "learning_rate": 2e-05, - "loss": 0.9036, - "mean_token_accuracy": 0.7386785539814363, - "step": 4000 + "loss": 0.8965, + "mean_token_accuracy": 0.7406986070429273, + "step": 1250 }, { - "epoch": 2.9151183970856103, - "eval_loss": 0.9656786918640137, - "eval_mean_token_accuracy": 0.723362347188535, - "eval_runtime": 14.2833, - "eval_samples_per_second": 18.343, - "eval_steps_per_second": 1.19, - "step": 4000 + "epoch": 1.8276867030965391, + "grad_norm": 0.2528102695941925, + "learning_rate": 2e-05, + "loss": 0.8877, + "mean_token_accuracy": 0.7422752808988765, + "step": 1255 }, { - "epoch": 2.9187613843351548, - "grad_norm": 0.301011323928833, + "epoch": 1.8349726775956285, + "grad_norm": 0.2547694444656372, "learning_rate": 2e-05, - "loss": 0.8715, - "mean_token_accuracy": 0.7476062530532486, - "step": 4005 + "loss": 0.8942, + "mean_token_accuracy": 0.7405499980795623, + "step": 1260 }, { - "epoch": 2.9224043715846992, - "grad_norm": 0.3263828158378601, + "epoch": 1.8422586520947175, + "grad_norm": 0.24326466023921967, "learning_rate": 2e-05, - "loss": 0.8848, - "mean_token_accuracy": 0.7429073033707867, - "step": 4010 + "loss": 0.8889, + "mean_token_accuracy": 0.7411135197850514, + "step": 1265 }, { - "epoch": 2.926047358834244, - "grad_norm": 0.36500218510627747, + "epoch": 1.849544626593807, + "grad_norm": 0.2283967137336731, "learning_rate": 2e-05, - "loss": 0.8937, - "mean_token_accuracy": 0.7400250366389841, - "step": 4015 + "loss": 0.8918, + "mean_token_accuracy": 0.7406066805080608, + "step": 1270 }, { - "epoch": 2.9296903460837886, - "grad_norm": 0.32235583662986755, + "epoch": 1.8568306010928963, + "grad_norm": 0.2283177226781845, "learning_rate": 2e-05, - "loss": 0.8906, - "mean_token_accuracy": 0.7414417440156326, - "step": 4020 + "loss": 0.9075, + "mean_token_accuracy": 0.7364283097215438, + "step": 1275 }, { - "epoch": 2.9333333333333336, - "grad_norm": 0.32114923000335693, + "epoch": 1.8641165755919853, + "grad_norm": 0.2293919175863266, "learning_rate": 2e-05, - "loss": 0.8742, - "mean_token_accuracy": 0.7465345627747924, - "step": 4025 + "loss": 0.883, + "mean_token_accuracy": 0.7429531021006351, + "step": 1280 }, { - "epoch": 2.936976320582878, - "grad_norm": 0.30394190549850464, + "epoch": 1.8714025500910747, + "grad_norm": 0.2358449548482895, "learning_rate": 2e-05, - "loss": 0.8815, - "mean_token_accuracy": 0.742946995603322, - "step": 4030 + "loss": 0.873, + "mean_token_accuracy": 0.7455468368343917, + "step": 1285 }, { - "epoch": 2.9406193078324225, - "grad_norm": 0.32967275381088257, + "epoch": 1.8786885245901639, + "grad_norm": 0.24609410762786865, "learning_rate": 2e-05, - "loss": 0.8831, - "mean_token_accuracy": 0.7421653639472399, - "step": 4035 + "loss": 0.8909, + "mean_token_accuracy": 0.7408952125061068, + "step": 1290 }, { - "epoch": 2.944262295081967, - "grad_norm": 0.312377005815506, + "epoch": 1.885974499089253, + "grad_norm": 0.24984125792980194, "learning_rate": 2e-05, - "loss": 0.8916, - "mean_token_accuracy": 0.7408097215437226, - "step": 4040 + "loss": 0.8979, + "mean_token_accuracy": 0.7388785417684416, + "step": 1295 }, { - "epoch": 2.947905282331512, - "grad_norm": 0.3285759389400482, + "epoch": 1.8932604735883425, + "grad_norm": 0.2732260227203369, "learning_rate": 2e-05, - "loss": 0.8649, - "mean_token_accuracy": 0.7494778944797267, - "step": 4045 + "loss": 0.8971, + "mean_token_accuracy": 0.7392968368343918, + "step": 1300 }, { - "epoch": 2.9515482695810564, - "grad_norm": 0.320372611284256, - "learning_rate": 2e-05, - "loss": 0.8772, - "mean_token_accuracy": 0.7447942110405471, - "step": 4050 + "epoch": 1.8932604735883425, + "eval_loss": 0.931566059589386, + "eval_mean_token_accuracy": 0.7301682973396459, + "eval_runtime": 39.2113, + "eval_samples_per_second": 6.682, + "eval_steps_per_second": 0.434, + "step": 1300 }, { - "epoch": 2.9551912568306014, - "grad_norm": 0.33457741141319275, + "epoch": 1.9005464480874317, + "grad_norm": 0.2397509217262268, "learning_rate": 2e-05, - "loss": 0.8985, - "mean_token_accuracy": 0.7386907669760626, - "step": 4055 + "loss": 0.8939, + "mean_token_accuracy": 0.7408295676599905, + "step": 1305 }, { - "epoch": 2.958834244080146, - "grad_norm": 0.3132217526435852, + "epoch": 1.9078324225865209, + "grad_norm": 0.24797864258289337, "learning_rate": 2e-05, - "loss": 0.9028, - "mean_token_accuracy": 0.7372679531021007, - "step": 4060 + "loss": 0.9037, + "mean_token_accuracy": 0.7368725574010748, + "step": 1310 }, { - "epoch": 2.9624772313296903, - "grad_norm": 0.3045112192630768, + "epoch": 1.9151183970856103, + "grad_norm": 0.2255106419324875, "learning_rate": 2e-05, - "loss": 0.9002, - "mean_token_accuracy": 0.7387335124572545, - "step": 4065 + "loss": 0.8858, + "mean_token_accuracy": 0.7422493282852957, + "step": 1315 }, { - "epoch": 2.966120218579235, - "grad_norm": 0.3133973479270935, + "epoch": 1.9224043715846995, + "grad_norm": 0.22497308254241943, "learning_rate": 2e-05, - "loss": 0.8899, - "mean_token_accuracy": 0.7421562042012703, - "step": 4070 + "loss": 0.8869, + "mean_token_accuracy": 0.7429964063296582, + "step": 1320 }, { - "epoch": 2.9697632058287797, - "grad_norm": 0.3029925227165222, + "epoch": 1.9296903460837886, + "grad_norm": 0.23234759271144867, "learning_rate": 2e-05, - "loss": 0.8778, - "mean_token_accuracy": 0.7461406936980948, - "step": 4075 + "loss": 0.8769, + "mean_token_accuracy": 0.7449453468490475, + "step": 1325 }, { - "epoch": 2.973406193078324, - "grad_norm": 0.35844218730926514, + "epoch": 1.936976320582878, + "grad_norm": 0.23880702257156372, "learning_rate": 2e-05, - "loss": 0.8842, - "mean_token_accuracy": 0.7436828285295553, - "step": 4080 + "loss": 0.8829, + "mean_token_accuracy": 0.7424783219345384, + "step": 1330 }, { - "epoch": 2.9770491803278687, - "grad_norm": 0.32251685857772827, + "epoch": 1.9442622950819672, + "grad_norm": 0.24366316199302673, "learning_rate": 2e-05, - "loss": 0.9023, - "mean_token_accuracy": 0.7384923058133854, - "step": 4085 + "loss": 0.8627, + "mean_token_accuracy": 0.7484535295554471, + "step": 1335 }, { - "epoch": 2.9806921675774136, - "grad_norm": 0.31728076934814453, + "epoch": 1.9515482695810564, + "grad_norm": 0.23241998255252838, "learning_rate": 2e-05, - "loss": 0.8827, - "mean_token_accuracy": 0.7427424279433319, - "step": 4090 + "loss": 0.8974, + "mean_token_accuracy": 0.7385976428920371, + "step": 1340 }, { - "epoch": 2.984335154826958, - "grad_norm": 0.3193369507789612, + "epoch": 1.9588342440801458, + "grad_norm": 0.2727753818035126, "learning_rate": 2e-05, - "loss": 0.8938, - "mean_token_accuracy": 0.7414539570102588, - "step": 4095 + "loss": 0.885, + "mean_token_accuracy": 0.7431951350665288, + "step": 1345 }, { - "epoch": 2.9879781420765026, - "grad_norm": 0.330759733915329, + "epoch": 1.966120218579235, + "grad_norm": 0.24258148670196533, "learning_rate": 2e-05, - "loss": 0.8971, - "mean_token_accuracy": 0.7398968001954079, - "step": 4100 + "loss": 0.8952, + "mean_token_accuracy": 0.7398082559843674, + "step": 1350 }, { - "epoch": 2.9879781420765026, - "eval_loss": 0.9635317325592041, - "eval_mean_token_accuracy": 0.723665181019814, - "eval_runtime": 14.2498, - "eval_samples_per_second": 18.386, - "eval_steps_per_second": 1.193, - "step": 4100 + "epoch": 1.9734061930783242, + "grad_norm": 0.2493925392627716, + "learning_rate": 2e-05, + "loss": 0.8824, + "mean_token_accuracy": 0.7440339521250612, + "step": 1355 }, { - "epoch": 2.9916211293260475, - "grad_norm": 0.31818100810050964, + "epoch": 1.9806921675774136, + "grad_norm": 0.26210054755210876, "learning_rate": 2e-05, - "loss": 0.8821, - "mean_token_accuracy": 0.7423058133854422, - "step": 4105 + "loss": 0.8782, + "mean_token_accuracy": 0.7450537371763556, + "step": 1360 }, { - "epoch": 2.995264116575592, - "grad_norm": 0.3011719882488251, + "epoch": 1.9879781420765026, + "grad_norm": 0.28852924704551697, "learning_rate": 2e-05, - "loss": 0.8524, - "mean_token_accuracy": 0.7518441621885686, - "step": 4110 + "loss": 0.876, + "mean_token_accuracy": 0.7450354176844163, + "step": 1365 }, { - "epoch": 2.9989071038251365, - "grad_norm": 0.3335364758968353, + "epoch": 1.995264116575592, + "grad_norm": 0.23531539738178253, "learning_rate": 2e-05, - "loss": 0.9245, - "mean_token_accuracy": 0.7316560820713238, - "step": 4115 + "loss": 0.882, + "mean_token_accuracy": 0.7429299400447796, + "step": 1370 }, { - "epoch": 2.9996357012750456, - "mean_token_accuracy": 0.7347490229604299, - "step": 4116, + "epoch": 1.9981785063752278, + "mean_token_accuracy": 0.7337910661944309, + "step": 1372, "total_flos": 0.0, - "train_loss": 0.984842965855427, - "train_runtime": 39952.7795, - "train_samples_per_second": 3.298, - "train_steps_per_second": 0.103 + "train_loss": 0.9694986148756377, + "train_runtime": 60321.7593, + "train_samples_per_second": 1.456, + "train_steps_per_second": 0.023 } ], "logging_steps": 5, - "max_steps": 4116, + "max_steps": 1372, "num_input_tokens_seen": 0, - "num_train_epochs": 3, + "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { @@ -6983,8 +2339,8 @@ "should_epoch_stop": false, "should_evaluate": false, "should_log": false, - "should_save": false, - "should_training_stop": false + "should_save": true, + "should_training_stop": true }, "attributes": {} }