diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,580 +1,18035 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.9992277992277993, + "epoch": 0.9999758529930215, "eval_steps": 100, - "global_step": 647, + "global_step": 20706, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.015444015444015444, - "grad_norm": 42.98995701464493, - "learning_rate": 3.0769230769230774e-06, - "loss": 8.2594, - "mean_token_accuracy": 0.10587121248245239, + "epoch": 0.0004829401395697003, + "grad_norm": 86.61036024405438, + "learning_rate": 9.657170449058426e-08, + "loss": 9.5563, + "mean_token_accuracy": 0.08225806429982185, "step": 10 }, { - "epoch": 0.03088803088803089, - "grad_norm": 40.1390719983803, - "learning_rate": 6.153846153846155e-06, - "loss": 8.0227, - "mean_token_accuracy": 0.1078659575432539, + "epoch": 0.0009658802791394006, + "grad_norm": 123.58741881187208, + "learning_rate": 1.9314340898116852e-07, + "loss": 9.3203, + "mean_token_accuracy": 0.08860887065529824, "step": 20 }, { - "epoch": 0.04633204633204633, - "grad_norm": 32.03219460040334, - "learning_rate": 9.230769230769232e-06, - "loss": 7.2273, - "mean_token_accuracy": 0.11569220423698426, + "epoch": 0.001448820418709101, + "grad_norm": 43.63197078149982, + "learning_rate": 2.897151134717528e-07, + "loss": 9.3719, + "mean_token_accuracy": 0.08770161308348179, "step": 30 }, { - "epoch": 0.06177606177606178, - "grad_norm": 11.444295356765586, - "learning_rate": 1.230769230769231e-05, - "loss": 6.1875, - "mean_token_accuracy": 0.13410007320344447, + "epoch": 0.0019317605582788013, + "grad_norm": 56.11384023907413, + "learning_rate": 3.8628681796233705e-07, + "loss": 9.3297, + "mean_token_accuracy": 0.08316532205790281, "step": 40 }, { - "epoch": 0.07722007722007722, - "grad_norm": 8.828148588388997, - "learning_rate": 1.5384615384615387e-05, - "loss": 5.4797, - "mean_token_accuracy": 0.17271810993552209, + "epoch": 0.0024147006978485017, + "grad_norm": 111.94241070044647, + "learning_rate": 4.828585224529214e-07, + "loss": 9.525, + "mean_token_accuracy": 0.07893145252019167, "step": 50 }, { - "epoch": 0.09266409266409266, - "grad_norm": 4.29140253345486, - "learning_rate": 1.8461538461538465e-05, - "loss": 4.7477, - "mean_token_accuracy": 0.2291911043226719, + "epoch": 0.002897640837418202, + "grad_norm": 87.47872168525997, + "learning_rate": 5.794302269435056e-07, + "loss": 9.2312, + "mean_token_accuracy": 0.08457661308348179, "step": 60 }, { - "epoch": 0.10810810810810811, - "grad_norm": 3.6498072205880834, - "learning_rate": 1.9996358021096174e-05, - "loss": 4.2266, - "mean_token_accuracy": 0.2794737696647644, + "epoch": 0.0033805809769879023, + "grad_norm": 44.16636970911555, + "learning_rate": 6.760019314340899e-07, + "loss": 9.2469, + "mean_token_accuracy": 0.08669354766607285, "step": 70 }, { - "epoch": 0.12355212355212356, - "grad_norm": 2.4885713862379912, - "learning_rate": 1.9967238104745695e-05, - "loss": 3.9047, - "mean_token_accuracy": 0.3135935992002487, + "epoch": 0.0038635211165576025, + "grad_norm": 41.62348347154347, + "learning_rate": 7.725736359246741e-07, + "loss": 9.5484, + "mean_token_accuracy": 0.0815524198114872, "step": 80 }, { - "epoch": 0.138996138996139, - "grad_norm": 1.744563941216085, - "learning_rate": 1.9909083099891682e-05, - "loss": 3.559, - "mean_token_accuracy": 0.3519733637571335, + "epoch": 0.004346461256127303, + "grad_norm": 60.14104875747736, + "learning_rate": 8.691453404152583e-07, + "loss": 9.1422, + "mean_token_accuracy": 0.08770161271095275, "step": 90 }, { - "epoch": 0.15444015444015444, - "grad_norm": 1.2706812171088628, - "learning_rate": 1.9822062415120053e-05, - "loss": 3.441, - "mean_token_accuracy": 0.3674425706267357, + "epoch": 0.0048294013956970035, + "grad_norm": 75.73130908570681, + "learning_rate": 9.657170449058428e-07, + "loss": 9.1875, + "mean_token_accuracy": 0.08941532261669635, "step": 100 }, { - "epoch": 0.15444015444015444, - "eval_runtime": 0.3607, - "eval_samples_per_second": 257.851, - "eval_steps_per_second": 16.636, + "epoch": 0.0048294013956970035, + "eval_runtime": 7.7611, + "eval_samples_per_second": 380.616, + "eval_steps_per_second": 23.837, "step": 100 }, { - "epoch": 0.16988416988416988, - "grad_norm": 1.159034895523178, - "learning_rate": 1.9706429546259592e-05, - "loss": 3.4188, - "mean_token_accuracy": 0.3701730087399483, + "epoch": 0.005312341535266703, + "grad_norm": 117.3695190602538, + "learning_rate": 1.062288749396427e-06, + "loss": 8.9477, + "mean_token_accuracy": 0.08780241906642913, "step": 110 }, { - "epoch": 0.18532818532818532, - "grad_norm": 1.1341139919684715, - "learning_rate": 1.9562521337935255e-05, - "loss": 3.3375, - "mean_token_accuracy": 0.37968291640281676, + "epoch": 0.005795281674836404, + "grad_norm": 62.298764753082835, + "learning_rate": 1.1588604538870113e-06, + "loss": 8.9211, + "mean_token_accuracy": 0.08397177457809449, "step": 120 }, { - "epoch": 0.20077220077220076, - "grad_norm": 1.1760013263316322, - "learning_rate": 1.939075700232209e-05, - "loss": 3.3172, - "mean_token_accuracy": 0.3814638316631317, + "epoch": 0.006278221814406105, + "grad_norm": 61.87255784883387, + "learning_rate": 1.2554321583775955e-06, + "loss": 8.8445, + "mean_token_accuracy": 0.08659274317324162, "step": 130 }, { - "epoch": 0.21621621621621623, - "grad_norm": 1.161176266224095, - "learning_rate": 1.9191636897958123e-05, - "loss": 3.3266, - "mean_token_accuracy": 0.38159212917089463, + "epoch": 0.0067611619539758045, + "grad_norm": 40.09540993639851, + "learning_rate": 1.3520038628681797e-06, + "loss": 8.7438, + "mean_token_accuracy": 0.08639112897217274, "step": 140 }, { - "epoch": 0.23166023166023167, - "grad_norm": 1.0603780531054983, - "learning_rate": 1.8965741072173647e-05, - "loss": 3.3289, - "mean_token_accuracy": 0.38168987780809405, + "epoch": 0.007244102093545505, + "grad_norm": 53.951627839893334, + "learning_rate": 1.448575567358764e-06, + "loss": 8.4688, + "mean_token_accuracy": 0.0918346781283617, "step": 150 }, { - "epoch": 0.2471042471042471, - "grad_norm": 1.1211648128868228, - "learning_rate": 1.8713727571382857e-05, - "loss": 3.3199, - "mean_token_accuracy": 0.3806579351425171, + "epoch": 0.007727042233115205, + "grad_norm": 44.44604487004671, + "learning_rate": 1.5451472718493482e-06, + "loss": 8.5523, + "mean_token_accuracy": 0.0860887099057436, "step": 160 }, { - "epoch": 0.2625482625482625, - "grad_norm": 1.0851646580539058, - "learning_rate": 1.8436330524160048e-05, - "loss": 3.3172, - "mean_token_accuracy": 0.38043438643217087, + "epoch": 0.008209982372684905, + "grad_norm": 40.12808283651626, + "learning_rate": 1.6417189763399324e-06, + "loss": 8.2422, + "mean_token_accuracy": 0.09122983925044537, "step": 170 }, { - "epoch": 0.277992277992278, - "grad_norm": 1.1539100078602913, - "learning_rate": 1.8134358002684504e-05, - "loss": 3.2984, - "mean_token_accuracy": 0.38442752957344056, + "epoch": 0.008692922512254606, + "grad_norm": 30.19029302720017, + "learning_rate": 1.7382906808305167e-06, + "loss": 8.0227, + "mean_token_accuracy": 0.08991935513913632, "step": 180 }, { - "epoch": 0.29343629343629346, - "grad_norm": 1.0382547421808463, - "learning_rate": 1.7808689668783762e-05, - "loss": 3.2707, - "mean_token_accuracy": 0.38669049441814424, + "epoch": 0.009175862651824306, + "grad_norm": 64.96587558878197, + "learning_rate": 1.8348623853211011e-06, + "loss": 7.8492, + "mean_token_accuracy": 0.08810483925044536, "step": 190 }, { - "epoch": 0.3088803088803089, - "grad_norm": 1.0822222546327178, - "learning_rate": 1.7460274211432463e-05, - "loss": 3.3223, - "mean_token_accuracy": 0.3834402486681938, + "epoch": 0.009658802791394007, + "grad_norm": 32.51514438330331, + "learning_rate": 1.9314340898116856e-06, + "loss": 7.6289, + "mean_token_accuracy": 0.09506048373878002, "step": 200 }, { - "epoch": 0.3088803088803089, - "eval_runtime": 0.3594, - "eval_samples_per_second": 258.777, - "eval_steps_per_second": 16.695, + "epoch": 0.009658802791394007, + "eval_runtime": 7.7531, + "eval_samples_per_second": 381.011, + "eval_steps_per_second": 23.862, "step": 200 }, { - "epoch": 0.32432432432432434, - "grad_norm": 1.0759072756730748, - "learning_rate": 1.7090126583171503e-05, - "loss": 3.3051, - "mean_token_accuracy": 0.38562438935041427, + "epoch": 0.010141742930963708, + "grad_norm": 44.57479787298447, + "learning_rate": 2.0280057943022696e-06, + "loss": 7.4602, + "mean_token_accuracy": 0.09778225757181644, "step": 210 }, { - "epoch": 0.33976833976833976, - "grad_norm": 1.0899543450508766, - "learning_rate": 1.6699325043497957e-05, - "loss": 3.277, - "mean_token_accuracy": 0.38706011772155763, + "epoch": 0.010624683070533407, + "grad_norm": 44.43161424366738, + "learning_rate": 2.124577498792854e-06, + "loss": 7.382, + "mean_token_accuracy": 0.09465725794434547, "step": 220 }, { - "epoch": 0.3552123552123552, - "grad_norm": 1.0888745843131695, - "learning_rate": 1.6289008017838447e-05, - "loss": 3.2484, - "mean_token_accuracy": 0.3877565965056419, + "epoch": 0.011107623210103107, + "grad_norm": 19.835010827719895, + "learning_rate": 2.221149203283438e-06, + "loss": 7.2172, + "mean_token_accuracy": 0.10131048299372196, "step": 230 }, { - "epoch": 0.37065637065637064, - "grad_norm": 1.0687266206034804, - "learning_rate": 1.586037078125607e-05, - "loss": 3.2484, - "mean_token_accuracy": 0.39076298773288726, + "epoch": 0.011590563349672808, + "grad_norm": 44.481799789933895, + "learning_rate": 2.3177209077740225e-06, + "loss": 7.1703, + "mean_token_accuracy": 0.09455645158886909, "step": 240 }, { - "epoch": 0.3861003861003861, - "grad_norm": 1.065599177703124, - "learning_rate": 1.54146619765513e-05, - "loss": 3.252, - "mean_token_accuracy": 0.38923814594745637, + "epoch": 0.012073503489242509, + "grad_norm": 15.888481832711705, + "learning_rate": 2.4142926122646065e-06, + "loss": 6.993, + "mean_token_accuracy": 0.0969124186784029, "step": 250 }, { - "epoch": 0.4015444015444015, - "grad_norm": 1.0534666626792726, - "learning_rate": 1.4953179976899878e-05, - "loss": 3.2898, - "mean_token_accuracy": 0.3861650750041008, + "epoch": 0.01255644362881221, + "grad_norm": 19.533213158788257, + "learning_rate": 2.510864316755191e-06, + "loss": 6.8922, + "mean_token_accuracy": 0.10161290280520915, "step": 260 }, { - "epoch": 0.416988416988417, - "grad_norm": 1.0857869655420103, - "learning_rate": 1.4477269103623496e-05, - "loss": 3.2473, - "mean_token_accuracy": 0.3895466774702072, + "epoch": 0.013039383768381908, + "grad_norm": 21.36034584462512, + "learning_rate": 2.607436021245775e-06, + "loss": 6.7648, + "mean_token_accuracy": 0.10776209682226182, "step": 270 }, { - "epoch": 0.43243243243243246, - "grad_norm": 1.0559408912325952, - "learning_rate": 1.3988315710111151e-05, - "loss": 3.2305, - "mean_token_accuracy": 0.3926136389374733, + "epoch": 0.013522323907951609, + "grad_norm": 12.507709531888091, + "learning_rate": 2.7040077257363594e-06, + "loss": 6.725, + "mean_token_accuracy": 0.1073588702827692, "step": 280 }, { - "epoch": 0.44787644787644787, - "grad_norm": 1.0847010732629856, - "learning_rate": 1.3487744143298822e-05, - "loss": 3.2504, - "mean_token_accuracy": 0.3903006136417389, + "epoch": 0.01400526404752131, + "grad_norm": 12.963157072765076, + "learning_rate": 2.800579430226944e-06, + "loss": 6.5703, + "mean_token_accuracy": 0.11703629083931447, "step": 290 }, { - "epoch": 0.46332046332046334, - "grad_norm": 0.9793151207941333, - "learning_rate": 1.2977012594472008e-05, - "loss": 3.25, - "mean_token_accuracy": 0.38797043114900587, + "epoch": 0.01448820418709101, + "grad_norm": 22.02834139909145, + "learning_rate": 2.897151134717528e-06, + "loss": 6.4898, + "mean_token_accuracy": 0.12318548522889614, "step": 300 }, { - "epoch": 0.46332046332046334, - "eval_runtime": 0.3596, - "eval_samples_per_second": 258.588, - "eval_steps_per_second": 16.683, + "epoch": 0.01448820418709101, + "eval_runtime": 7.8018, + "eval_samples_per_second": 378.63, + "eval_steps_per_second": 23.712, "step": 300 }, { - "epoch": 0.47876447876447875, - "grad_norm": 1.001746885258135, - "learning_rate": 1.2457608851477833e-05, - "loss": 3.2687, - "mean_token_accuracy": 0.38685850501060487, + "epoch": 0.014971144326660711, + "grad_norm": 11.320048994516409, + "learning_rate": 2.9937228392081124e-06, + "loss": 6.418, + "mean_token_accuracy": 0.11633064523339272, "step": 310 }, { - "epoch": 0.4942084942084942, - "grad_norm": 1.0372559843317462, - "learning_rate": 1.1931045964720882e-05, - "loss": 3.1965, - "mean_token_accuracy": 0.3949413478374481, + "epoch": 0.01545408446623041, + "grad_norm": 19.74022910633883, + "learning_rate": 3.0902945436986964e-06, + "loss": 6.3891, + "mean_token_accuracy": 0.12268145121634007, "step": 320 }, { - "epoch": 0.5096525096525096, - "grad_norm": 1.0255559620343244, - "learning_rate": 1.1398857839567811e-05, - "loss": 3.2348, - "mean_token_accuracy": 0.3925586506724358, + "epoch": 0.015937024605800112, + "grad_norm": 13.693407143559101, + "learning_rate": 3.186866248189281e-06, + "loss": 6.25, + "mean_token_accuracy": 0.1266129020601511, "step": 330 }, { - "epoch": 0.525096525096525, - "grad_norm": 1.0569722959009906, - "learning_rate": 1.086259476800041e-05, - "loss": 3.2184, - "mean_token_accuracy": 0.3907044231891632, + "epoch": 0.01641996474536981, + "grad_norm": 18.620314421238913, + "learning_rate": 3.283437952679865e-06, + "loss": 6.2508, + "mean_token_accuracy": 0.12681451588869094, "step": 340 }, { - "epoch": 0.5405405405405406, - "grad_norm": 1.0427772185171493, - "learning_rate": 1.0323818912533561e-05, - "loss": 3.2445, - "mean_token_accuracy": 0.38938239961862564, + "epoch": 0.01690290488493951, + "grad_norm": 12.777325276570116, + "learning_rate": 3.3800096571704493e-06, + "loss": 6.1133, + "mean_token_accuracy": 0.13225806467235088, "step": 350 }, { - "epoch": 0.555984555984556, - "grad_norm": 1.0256028596579942, - "learning_rate": 9.784099755553723e-06, - "loss": 3.2621, - "mean_token_accuracy": 0.3908414125442505, + "epoch": 0.01738584502450921, + "grad_norm": 11.72514226678663, + "learning_rate": 3.4765813616610333e-06, + "loss": 6.0352, + "mean_token_accuracy": 0.1365927428007126, "step": 360 }, { - "epoch": 0.5714285714285714, - "grad_norm": 0.9932814831249556, - "learning_rate": 9.245009527334243e-06, - "loss": 3.252, - "mean_token_accuracy": 0.3901301324367523, + "epoch": 0.017868785164078912, + "grad_norm": 8.912693254717752, + "learning_rate": 3.5731530661516178e-06, + "loss": 6.0945, + "mean_token_accuracy": 0.13094758056104183, "step": 370 }, { - "epoch": 0.5868725868725869, - "grad_norm": 1.0250250249915, - "learning_rate": 8.708118626045939e-06, - "loss": 3.2543, - "mean_token_accuracy": 0.38824535608291627, + "epoch": 0.018351725303648612, + "grad_norm": 29.402027928221635, + "learning_rate": 3.6697247706422022e-06, + "loss": 5.9836, + "mean_token_accuracy": 0.14576612897217273, "step": 380 }, { - "epoch": 0.6023166023166023, - "grad_norm": 1.0413127972450467, - "learning_rate": 8.174991043104662e-06, - "loss": 3.2559, - "mean_token_accuracy": 0.38956108689308167, + "epoch": 0.018834665443218313, + "grad_norm": 14.16575018639554, + "learning_rate": 3.7662964751327863e-06, + "loss": 5.9086, + "mean_token_accuracy": 0.1471774186939001, "step": 390 }, { - "epoch": 0.6177606177606177, - "grad_norm": 1.0350187771944364, - "learning_rate": 7.647179807182125e-06, - "loss": 3.2285, - "mean_token_accuracy": 0.39289467632770536, + "epoch": 0.019317605582788014, + "grad_norm": 9.897396578369301, + "learning_rate": 3.862868179623371e-06, + "loss": 5.8781, + "mean_token_accuracy": 0.14737903252243995, "step": 400 }, { - "epoch": 0.6177606177606177, - "eval_runtime": 0.3592, - "eval_samples_per_second": 258.925, - "eval_steps_per_second": 16.705, + "epoch": 0.019317605582788014, + "eval_runtime": 7.8031, + "eval_samples_per_second": 378.566, + "eval_steps_per_second": 23.708, "step": 400 }, { - "epoch": 0.6332046332046332, - "grad_norm": 1.0632043827118496, - "learning_rate": 7.126222460151719e-06, - "loss": 3.2043, - "mean_token_accuracy": 0.39414961636066437, + "epoch": 0.019800545722357715, + "grad_norm": 9.580434319364983, + "learning_rate": 3.959439884113955e-06, + "loss": 5.843, + "mean_token_accuracy": 0.1519153229892254, "step": 410 }, { - "epoch": 0.6486486486486487, - "grad_norm": 0.9869305853216295, - "learning_rate": 6.613636578148242e-06, - "loss": 3.2301, - "mean_token_accuracy": 0.3913428604602814, + "epoch": 0.020283485861927415, + "grad_norm": 9.785106533868191, + "learning_rate": 4.056011588604539e-06, + "loss": 5.7188, + "mean_token_accuracy": 0.16239919289946556, "step": 420 }, { - "epoch": 0.6640926640926641, - "grad_norm": 1.0493326593085537, - "learning_rate": 6.110915350788846e-06, - "loss": 3.2211, - "mean_token_accuracy": 0.39174955785274507, + "epoch": 0.020766426001497116, + "grad_norm": 10.405009225619438, + "learning_rate": 4.152583293095124e-06, + "loss": 5.6664, + "mean_token_accuracy": 0.16360887214541436, "step": 430 }, { - "epoch": 0.6795366795366795, - "grad_norm": 1.0205446931794757, - "learning_rate": 5.619523231433177e-06, - "loss": 3.2578, - "mean_token_accuracy": 0.3872604101896286, + "epoch": 0.021249366141066813, + "grad_norm": 11.591917579819466, + "learning_rate": 4.249154997585708e-06, + "loss": 5.6148, + "mean_token_accuracy": 0.16381048336625098, "step": 440 }, { - "epoch": 0.694980694980695, - "grad_norm": 1.0300075002455658, - "learning_rate": 5.140891671153797e-06, - "loss": 3.2836, - "mean_token_accuracy": 0.38629337400197983, + "epoch": 0.021732306280636514, + "grad_norm": 26.190074462558268, + "learning_rate": 4.3457267020762925e-06, + "loss": 5.4719, + "mean_token_accuracy": 0.1736895151436329, "step": 450 }, { - "epoch": 0.7104247104247104, - "grad_norm": 0.9838291672080677, - "learning_rate": 4.676414948843934e-06, - "loss": 3.207, - "mean_token_accuracy": 0.39439760595560075, + "epoch": 0.022215246420206215, + "grad_norm": 8.413665658649023, + "learning_rate": 4.442298406566876e-06, + "loss": 5.4727, + "mean_token_accuracy": 0.18094758093357086, "step": 460 }, { - "epoch": 0.7258687258687259, - "grad_norm": 1.0775473314799002, - "learning_rate": 4.2274461096098085e-06, - "loss": 3.2121, - "mean_token_accuracy": 0.39395820051431657, + "epoch": 0.022698186559775915, + "grad_norm": 14.092203313066426, + "learning_rate": 4.5388701110574606e-06, + "loss": 5.3945, + "mean_token_accuracy": 0.1817540317773819, "step": 470 }, { - "epoch": 0.7413127413127413, - "grad_norm": 1.0369017860922958, - "learning_rate": 3.795293023279093e-06, - "loss": 3.2305, - "mean_token_accuracy": 0.39412878900766374, + "epoch": 0.023181126699345616, + "grad_norm": 9.583297265678569, + "learning_rate": 4.635441815548045e-06, + "loss": 5.3266, + "mean_token_accuracy": 0.18991935551166533, "step": 480 }, { - "epoch": 0.7567567567567568, - "grad_norm": 0.9806680315971065, - "learning_rate": 3.3812145745073834e-06, - "loss": 3.2641, - "mean_token_accuracy": 0.3884286418557167, + "epoch": 0.023664066838915317, + "grad_norm": 10.43916281216745, + "learning_rate": 4.7320135200386295e-06, + "loss": 5.25, + "mean_token_accuracy": 0.1966733880341053, "step": 490 }, { - "epoch": 0.7722007722007722, - "grad_norm": 0.9401032118500524, - "learning_rate": 2.9864169955810085e-06, - "loss": 3.2332, - "mean_token_accuracy": 0.39272971749305724, + "epoch": 0.024147006978485017, + "grad_norm": 9.419476847259737, + "learning_rate": 4.828585224529213e-06, + "loss": 5.2266, + "mean_token_accuracy": 0.19637096896767617, "step": 500 }, { - "epoch": 0.7722007722007722, - "eval_runtime": 0.3583, - "eval_samples_per_second": 259.573, - "eval_steps_per_second": 16.747, + "epoch": 0.024147006978485017, + "eval_runtime": 7.799, + "eval_samples_per_second": 378.765, + "eval_steps_per_second": 23.721, "step": 500 }, { - "epoch": 0.7876447876447876, - "grad_norm": 1.177478632276624, - "learning_rate": 2.6120503525989894e-06, - "loss": 3.2062, - "mean_token_accuracy": 0.3941104575991631, + "epoch": 0.024629947118054718, + "grad_norm": 9.063478033216443, + "learning_rate": 4.9251569290197975e-06, + "loss": 5.1898, + "mean_token_accuracy": 0.20735886916518212, "step": 510 }, { - "epoch": 0.803088803088803, - "grad_norm": 1.0196873232040227, - "learning_rate": 2.25920519527003e-06, - "loss": 3.2375, - "mean_token_accuracy": 0.39012096673250196, + "epoch": 0.02511288725762442, + "grad_norm": 7.6407757225586925, + "learning_rate": 5.021728633510382e-06, + "loss": 5.0984, + "mean_token_accuracy": 0.20796370953321458, "step": 520 }, { - "epoch": 0.8185328185328186, - "grad_norm": 0.9984465118490191, - "learning_rate": 1.9289093800839067e-06, - "loss": 3.248, - "mean_token_accuracy": 0.3905516877770424, + "epoch": 0.02559582739719412, + "grad_norm": 7.56966381564491, + "learning_rate": 5.118300338000966e-06, + "loss": 5.0875, + "mean_token_accuracy": 0.20544354915618895, "step": 530 }, { - "epoch": 0.833976833976834, - "grad_norm": 0.9871370926118326, - "learning_rate": 1.6221250761114803e-06, - "loss": 3.2148, - "mean_token_accuracy": 0.39371639788150786, + "epoch": 0.026078767536763817, + "grad_norm": 7.151939418897148, + "learning_rate": 5.21487204249155e-06, + "loss": 4.9336, + "mean_token_accuracy": 0.21118951588869095, "step": 540 }, { - "epoch": 0.8494208494208494, - "grad_norm": 1.0443669852905835, - "learning_rate": 1.339745962155613e-06, - "loss": 3.2434, - "mean_token_accuracy": 0.3890929415822029, + "epoch": 0.026561707676333517, + "grad_norm": 7.891718493300589, + "learning_rate": 5.3114437469821344e-06, + "loss": 4.9742, + "mean_token_accuracy": 0.21895161271095276, "step": 550 }, { - "epoch": 0.8648648648648649, - "grad_norm": 0.993800834286733, - "learning_rate": 1.0825946234178575e-06, - "loss": 3.2687, - "mean_token_accuracy": 0.3884435802698135, + "epoch": 0.027044647815903218, + "grad_norm": 7.022858601401243, + "learning_rate": 5.408015451472719e-06, + "loss": 4.9344, + "mean_token_accuracy": 0.21784274280071259, "step": 560 }, { - "epoch": 0.8803088803088803, - "grad_norm": 0.9734947406228226, - "learning_rate": 8.514201552645052e-07, - "loss": 3.2516, - "mean_token_accuracy": 0.3875122174620628, + "epoch": 0.02752758795547292, + "grad_norm": 8.03316781863098, + "learning_rate": 5.504587155963303e-06, + "loss": 4.9422, + "mean_token_accuracy": 0.22177419289946557, "step": 570 }, { - "epoch": 0.8957528957528957, - "grad_norm": 1.0630859235563528, - "learning_rate": 6.468959810724329e-07, - "loss": 3.2141, - "mean_token_accuracy": 0.39336204826831817, + "epoch": 0.02801052809504262, + "grad_norm": 6.416473059586127, + "learning_rate": 5.601158860453888e-06, + "loss": 4.9328, + "mean_token_accuracy": 0.216935483366251, "step": 580 }, { - "epoch": 0.9111969111969112, - "grad_norm": 0.947391901217572, - "learning_rate": 4.696178905113913e-07, - "loss": 3.2309, - "mean_token_accuracy": 0.39211184084415435, + "epoch": 0.02849346823461232, + "grad_norm": 6.947560111550373, + "learning_rate": 5.697730564944471e-06, + "loss": 4.8422, + "mean_token_accuracy": 0.2323588691651821, "step": 590 }, { - "epoch": 0.9266409266409267, - "grad_norm": 0.9951831133521754, - "learning_rate": 3.2010230397739206e-07, - "loss": 3.2234, - "mean_token_accuracy": 0.39296826124191286, + "epoch": 0.02897640837418202, + "grad_norm": 6.751214653454153, + "learning_rate": 5.794302269435056e-06, + "loss": 4.8609, + "mean_token_accuracy": 0.2269153229892254, "step": 600 }, { - "epoch": 0.9266409266409267, - "eval_runtime": 0.3594, - "eval_samples_per_second": 258.756, - "eval_steps_per_second": 16.694, + "epoch": 0.02897640837418202, + "eval_runtime": 7.8282, + "eval_samples_per_second": 377.356, + "eval_steps_per_second": 23.633, "step": 600 }, { - "epoch": 0.9420849420849421, - "grad_norm": 1.1028094081214548, - "learning_rate": 1.9878476823294467e-07, - "loss": 3.2227, - "mean_token_accuracy": 0.39292670488357545, + "epoch": 0.02945934851375172, + "grad_norm": 6.568818826918091, + "learning_rate": 5.89087397392564e-06, + "loss": 4.8547, + "mean_token_accuracy": 0.23074596747756004, "step": 610 }, { - "epoch": 0.9575289575289575, - "grad_norm": 0.9598974406726024, - "learning_rate": 1.0601868763643997e-07, - "loss": 3.2152, - "mean_token_accuracy": 0.39489552527666094, + "epoch": 0.029942288653321422, + "grad_norm": 6.768505739040085, + "learning_rate": 5.987445678416225e-06, + "loss": 4.8031, + "mean_token_accuracy": 0.23568548262119293, "step": 620 }, { - "epoch": 0.972972972972973, - "grad_norm": 1.056929496505018, - "learning_rate": 4.207429465668877e-08, - "loss": 3.2141, - "mean_token_accuracy": 0.39310239255428314, + "epoch": 0.03042522879289112, + "grad_norm": 7.312646650877884, + "learning_rate": 6.084017382906808e-06, + "loss": 4.7609, + "mean_token_accuracy": 0.23659274280071257, "step": 630 }, { - "epoch": 0.9884169884169884, - "grad_norm": 1.0002172665298499, - "learning_rate": 7.1378626715268295e-09, - "loss": 3.2508, - "mean_token_accuracy": 0.39017595201730726, + "epoch": 0.03090816893246082, + "grad_norm": 6.751931563442281, + "learning_rate": 6.180589087397393e-06, + "loss": 4.7914, + "mean_token_accuracy": 0.2309475801885128, "step": 640 }, { - "epoch": 0.9992277992277993, - "mean_token_accuracy": 0.3900075065238135, - "step": 647, - "total_flos": 5418484972388352.0, - "train_loss": 3.603820517774343, - "train_runtime": 400.5994, - "train_samples_per_second": 51.708, - "train_steps_per_second": 1.615 + "epoch": 0.03139110907203052, + "grad_norm": 6.35129343291686, + "learning_rate": 6.277160791887977e-06, + "loss": 4.8023, + "mean_token_accuracy": 0.23679435327649118, + "step": 650 + }, + { + "epoch": 0.031874049211600225, + "grad_norm": 6.798276877968672, + "learning_rate": 6.373732496378562e-06, + "loss": 4.7078, + "mean_token_accuracy": 0.2406249985098839, + "step": 660 + }, + { + "epoch": 0.03235698935116992, + "grad_norm": 6.41128184388978, + "learning_rate": 6.470304200869146e-06, + "loss": 4.6484, + "mean_token_accuracy": 0.24546370953321456, + "step": 670 + }, + { + "epoch": 0.03283992949073962, + "grad_norm": 6.786893678077557, + "learning_rate": 6.56687590535973e-06, + "loss": 4.7242, + "mean_token_accuracy": 0.23760080561041833, + "step": 680 + }, + { + "epoch": 0.033322869630309324, + "grad_norm": 6.874307740254109, + "learning_rate": 6.663447609850314e-06, + "loss": 4.6531, + "mean_token_accuracy": 0.2535282254219055, + "step": 690 + }, + { + "epoch": 0.03380580976987902, + "grad_norm": 6.529334198552244, + "learning_rate": 6.760019314340899e-06, + "loss": 4.6836, + "mean_token_accuracy": 0.24284274205565454, + "step": 700 + }, + { + "epoch": 0.03380580976987902, + "eval_runtime": 7.7775, + "eval_samples_per_second": 379.811, + "eval_steps_per_second": 23.786, + "step": 700 + }, + { + "epoch": 0.034288749909448725, + "grad_norm": 6.571496054562236, + "learning_rate": 6.856591018831483e-06, + "loss": 4.6344, + "mean_token_accuracy": 0.25030241534113884, + "step": 710 + }, + { + "epoch": 0.03477169004901842, + "grad_norm": 6.3473260026917275, + "learning_rate": 6.953162723322067e-06, + "loss": 4.6398, + "mean_token_accuracy": 0.2475806452333927, + "step": 720 + }, + { + "epoch": 0.035254630188588126, + "grad_norm": 6.562261892713004, + "learning_rate": 7.049734427812651e-06, + "loss": 4.6148, + "mean_token_accuracy": 0.24868951588869095, + "step": 730 + }, + { + "epoch": 0.035737570328157824, + "grad_norm": 5.858977665193235, + "learning_rate": 7.1463061323032356e-06, + "loss": 4.657, + "mean_token_accuracy": 0.24516128972172738, + "step": 740 + }, + { + "epoch": 0.03622051046772753, + "grad_norm": 6.800295538759567, + "learning_rate": 7.24287783679382e-06, + "loss": 4.6344, + "mean_token_accuracy": 0.24808467850089072, + "step": 750 + }, + { + "epoch": 0.036703450607297225, + "grad_norm": 6.529416845885988, + "learning_rate": 7.3394495412844045e-06, + "loss": 4.6594, + "mean_token_accuracy": 0.24889112785458564, + "step": 760 + }, + { + "epoch": 0.03718639074686693, + "grad_norm": 6.697630717561144, + "learning_rate": 7.436021245774988e-06, + "loss": 4.6055, + "mean_token_accuracy": 0.2462701603770256, + "step": 770 + }, + { + "epoch": 0.037669330886436626, + "grad_norm": 5.95009608166028, + "learning_rate": 7.5325929502655725e-06, + "loss": 4.6008, + "mean_token_accuracy": 0.2548387087881565, + "step": 780 + }, + { + "epoch": 0.038152271026006324, + "grad_norm": 5.647138551872577, + "learning_rate": 7.629164654756157e-06, + "loss": 4.5828, + "mean_token_accuracy": 0.25282258093357085, + "step": 790 + }, + { + "epoch": 0.03863521116557603, + "grad_norm": 6.436677460184058, + "learning_rate": 7.725736359246742e-06, + "loss": 4.6082, + "mean_token_accuracy": 0.25241935551166533, + "step": 800 + }, + { + "epoch": 0.03863521116557603, + "eval_runtime": 7.7986, + "eval_samples_per_second": 378.785, + "eval_steps_per_second": 23.722, + "step": 800 + }, + { + "epoch": 0.039118151305145725, + "grad_norm": 6.557950012681355, + "learning_rate": 7.822308063737327e-06, + "loss": 4.543, + "mean_token_accuracy": 0.2562500022351742, + "step": 810 + }, + { + "epoch": 0.03960109144471543, + "grad_norm": 5.60271360072184, + "learning_rate": 7.91887976822791e-06, + "loss": 4.5906, + "mean_token_accuracy": 0.257358867675066, + "step": 820 + }, + { + "epoch": 0.040084031584285126, + "grad_norm": 5.891590551442568, + "learning_rate": 8.015451472718494e-06, + "loss": 4.4961, + "mean_token_accuracy": 0.26139112934470177, + "step": 830 + }, + { + "epoch": 0.04056697172385483, + "grad_norm": 6.319141995609288, + "learning_rate": 8.112023177209078e-06, + "loss": 4.5438, + "mean_token_accuracy": 0.26350806280970573, + "step": 840 + }, + { + "epoch": 0.04104991186342453, + "grad_norm": 5.9347957915729515, + "learning_rate": 8.208594881699663e-06, + "loss": 4.5984, + "mean_token_accuracy": 0.2543346740305424, + "step": 850 + }, + { + "epoch": 0.04153285200299423, + "grad_norm": 6.21887500489896, + "learning_rate": 8.305166586190247e-06, + "loss": 4.4961, + "mean_token_accuracy": 0.26088709756731987, + "step": 860 + }, + { + "epoch": 0.04201579214256393, + "grad_norm": 5.8005895616062535, + "learning_rate": 8.401738290680832e-06, + "loss": 4.5555, + "mean_token_accuracy": 0.2596774183213711, + "step": 870 + }, + { + "epoch": 0.042498732282133626, + "grad_norm": 5.645546026576973, + "learning_rate": 8.498309995171416e-06, + "loss": 4.4992, + "mean_token_accuracy": 0.2630040317773819, + "step": 880 + }, + { + "epoch": 0.04298167242170333, + "grad_norm": 5.867461510415207, + "learning_rate": 8.594881699662e-06, + "loss": 4.4867, + "mean_token_accuracy": 0.262298384308815, + "step": 890 + }, + { + "epoch": 0.04346461256127303, + "grad_norm": 6.4078157319256155, + "learning_rate": 8.691453404152585e-06, + "loss": 4.5141, + "mean_token_accuracy": 0.2566532239317894, + "step": 900 + }, + { + "epoch": 0.04346461256127303, + "eval_runtime": 7.7925, + "eval_samples_per_second": 379.084, + "eval_steps_per_second": 23.741, + "step": 900 + }, + { + "epoch": 0.04394755270084273, + "grad_norm": 6.1661426514813185, + "learning_rate": 8.788025108643168e-06, + "loss": 4.416, + "mean_token_accuracy": 0.26330645233392713, + "step": 910 + }, + { + "epoch": 0.04443049284041243, + "grad_norm": 5.817055841219091, + "learning_rate": 8.884596813133752e-06, + "loss": 4.4391, + "mean_token_accuracy": 0.27298386916518214, + "step": 920 + }, + { + "epoch": 0.04491343297998213, + "grad_norm": 5.984395658176951, + "learning_rate": 8.981168517624337e-06, + "loss": 4.5164, + "mean_token_accuracy": 0.2577620968222618, + "step": 930 + }, + { + "epoch": 0.04539637311955183, + "grad_norm": 5.882810812790242, + "learning_rate": 9.077740222114921e-06, + "loss": 4.4363, + "mean_token_accuracy": 0.27116935402154924, + "step": 940 + }, + { + "epoch": 0.045879313259121535, + "grad_norm": 5.11555968599847, + "learning_rate": 9.174311926605506e-06, + "loss": 4.4656, + "mean_token_accuracy": 0.26764112785458566, + "step": 950 + }, + { + "epoch": 0.04636225339869123, + "grad_norm": 5.590901293802474, + "learning_rate": 9.27088363109609e-06, + "loss": 4.4117, + "mean_token_accuracy": 0.26743951588869097, + "step": 960 + }, + { + "epoch": 0.04684519353826093, + "grad_norm": 5.647756025719848, + "learning_rate": 9.367455335586674e-06, + "loss": 4.5121, + "mean_token_accuracy": 0.260080648213625, + "step": 970 + }, + { + "epoch": 0.04732813367783063, + "grad_norm": 5.153246489988432, + "learning_rate": 9.464027040077259e-06, + "loss": 4.482, + "mean_token_accuracy": 0.2638104811310768, + "step": 980 + }, + { + "epoch": 0.04781107381740033, + "grad_norm": 5.47031302012429, + "learning_rate": 9.560598744567843e-06, + "loss": 4.432, + "mean_token_accuracy": 0.2661290317773819, + "step": 990 + }, + { + "epoch": 0.048294013956970035, + "grad_norm": 5.496662380685803, + "learning_rate": 9.657170449058426e-06, + "loss": 4.4934, + "mean_token_accuracy": 0.2628024198114872, + "step": 1000 + }, + { + "epoch": 0.048294013956970035, + "eval_runtime": 7.8503, + "eval_samples_per_second": 376.294, + "eval_steps_per_second": 23.566, + "step": 1000 + }, + { + "epoch": 0.04877695409653973, + "grad_norm": 5.415310676462476, + "learning_rate": 9.75374215354901e-06, + "loss": 4.3781, + "mean_token_accuracy": 0.2776209689676762, + "step": 1010 + }, + { + "epoch": 0.049259894236109436, + "grad_norm": 5.5124107735369465, + "learning_rate": 9.850313858039595e-06, + "loss": 4.4715, + "mean_token_accuracy": 0.27147177159786223, + "step": 1020 + }, + { + "epoch": 0.04974283437567913, + "grad_norm": 5.649516777672486, + "learning_rate": 9.94688556253018e-06, + "loss": 4.45, + "mean_token_accuracy": 0.27530241534113886, + "step": 1030 + }, + { + "epoch": 0.05022577451524884, + "grad_norm": 5.504346017846997, + "learning_rate": 1.0043457267020764e-05, + "loss": 4.4496, + "mean_token_accuracy": 0.2676411300897598, + "step": 1040 + }, + { + "epoch": 0.050708714654818535, + "grad_norm": 5.315690449608703, + "learning_rate": 1.0140028971511348e-05, + "loss": 4.5195, + "mean_token_accuracy": 0.26239918991923333, + "step": 1050 + }, + { + "epoch": 0.05119165479438824, + "grad_norm": 5.2296044318266635, + "learning_rate": 1.0236600676001933e-05, + "loss": 4.4371, + "mean_token_accuracy": 0.27227822244167327, + "step": 1060 + }, + { + "epoch": 0.051674594933957936, + "grad_norm": 5.403037573405085, + "learning_rate": 1.0333172380492516e-05, + "loss": 4.3746, + "mean_token_accuracy": 0.2784274183213711, + "step": 1070 + }, + { + "epoch": 0.05215753507352763, + "grad_norm": 5.199701390121784, + "learning_rate": 1.04297440849831e-05, + "loss": 4.4359, + "mean_token_accuracy": 0.27046370729804037, + "step": 1080 + }, + { + "epoch": 0.05264047521309734, + "grad_norm": 5.215482834477001, + "learning_rate": 1.0526315789473684e-05, + "loss": 4.4203, + "mean_token_accuracy": 0.2684475801885128, + "step": 1090 + }, + { + "epoch": 0.053123415352667035, + "grad_norm": 5.66227835348697, + "learning_rate": 1.0622887493964269e-05, + "loss": 4.4219, + "mean_token_accuracy": 0.27338709458708765, + "step": 1100 + }, + { + "epoch": 0.053123415352667035, + "eval_runtime": 7.7947, + "eval_samples_per_second": 378.975, + "eval_steps_per_second": 23.734, + "step": 1100 + }, + { + "epoch": 0.05360635549223674, + "grad_norm": 4.747092470280433, + "learning_rate": 1.0719459198454853e-05, + "loss": 4.3652, + "mean_token_accuracy": 0.27106855139136316, + "step": 1110 + }, + { + "epoch": 0.054089295631806436, + "grad_norm": 5.013979725224065, + "learning_rate": 1.0816030902945438e-05, + "loss": 4.402, + "mean_token_accuracy": 0.277318549156189, + "step": 1120 + }, + { + "epoch": 0.05457223577137614, + "grad_norm": 5.26115325559556, + "learning_rate": 1.0912602607436022e-05, + "loss": 4.4031, + "mean_token_accuracy": 0.2803427383303642, + "step": 1130 + }, + { + "epoch": 0.05505517591094584, + "grad_norm": 5.033659584260112, + "learning_rate": 1.1009174311926607e-05, + "loss": 4.4023, + "mean_token_accuracy": 0.27883064597845075, + "step": 1140 + }, + { + "epoch": 0.05553811605051554, + "grad_norm": 4.954391238616325, + "learning_rate": 1.1105746016417191e-05, + "loss": 4.4113, + "mean_token_accuracy": 0.2770161248743534, + "step": 1150 + }, + { + "epoch": 0.05602105619008524, + "grad_norm": 5.263067395585244, + "learning_rate": 1.1202317720907776e-05, + "loss": 4.3461, + "mean_token_accuracy": 0.2686491936445236, + "step": 1160 + }, + { + "epoch": 0.056503996329654936, + "grad_norm": 5.16759966644505, + "learning_rate": 1.1298889425398358e-05, + "loss": 4.432, + "mean_token_accuracy": 0.2745967745780945, + "step": 1170 + }, + { + "epoch": 0.05698693646922464, + "grad_norm": 5.21914041586483, + "learning_rate": 1.1395461129888943e-05, + "loss": 4.4094, + "mean_token_accuracy": 0.2734879031777382, + "step": 1180 + }, + { + "epoch": 0.05746987660879434, + "grad_norm": 4.972840372050854, + "learning_rate": 1.1492032834379527e-05, + "loss": 4.4453, + "mean_token_accuracy": 0.273689516633749, + "step": 1190 + }, + { + "epoch": 0.05795281674836404, + "grad_norm": 4.91837651816623, + "learning_rate": 1.1588604538870112e-05, + "loss": 4.3688, + "mean_token_accuracy": 0.27631047964096067, + "step": 1200 + }, + { + "epoch": 0.05795281674836404, + "eval_runtime": 7.7933, + "eval_samples_per_second": 379.046, + "eval_steps_per_second": 23.738, + "step": 1200 + }, + { + "epoch": 0.05843575688793374, + "grad_norm": 5.120258110700915, + "learning_rate": 1.1685176243360696e-05, + "loss": 4.393, + "mean_token_accuracy": 0.27358871027827264, + "step": 1210 + }, + { + "epoch": 0.05891869702750344, + "grad_norm": 4.715854893519893, + "learning_rate": 1.178174794785128e-05, + "loss": 4.3922, + "mean_token_accuracy": 0.26824596524238586, + "step": 1220 + }, + { + "epoch": 0.05940163716707314, + "grad_norm": 4.748569022412669, + "learning_rate": 1.1878319652341865e-05, + "loss": 4.4379, + "mean_token_accuracy": 0.26653225868940356, + "step": 1230 + }, + { + "epoch": 0.059884577306642844, + "grad_norm": 5.220648990494797, + "learning_rate": 1.197489135683245e-05, + "loss": 4.3531, + "mean_token_accuracy": 0.2740927390754223, + "step": 1240 + }, + { + "epoch": 0.06036751744621254, + "grad_norm": 5.028586477606002, + "learning_rate": 1.2071463061323034e-05, + "loss": 4.3551, + "mean_token_accuracy": 0.27772177308797835, + "step": 1250 + }, + { + "epoch": 0.06085045758578224, + "grad_norm": 5.068781345504261, + "learning_rate": 1.2168034765813617e-05, + "loss": 4.325, + "mean_token_accuracy": 0.2738911293447018, + "step": 1260 + }, + { + "epoch": 0.06133339772535194, + "grad_norm": 5.221080689633655, + "learning_rate": 1.2264606470304201e-05, + "loss": 4.3875, + "mean_token_accuracy": 0.27278225794434546, + "step": 1270 + }, + { + "epoch": 0.06181633786492164, + "grad_norm": 5.085764550248687, + "learning_rate": 1.2361178174794786e-05, + "loss": 4.3203, + "mean_token_accuracy": 0.2834677413105965, + "step": 1280 + }, + { + "epoch": 0.062299278004491344, + "grad_norm": 5.367307973698626, + "learning_rate": 1.245774987928537e-05, + "loss": 4.316, + "mean_token_accuracy": 0.28387096524238586, + "step": 1290 + }, + { + "epoch": 0.06278221814406104, + "grad_norm": 4.814088937557432, + "learning_rate": 1.2554321583775954e-05, + "loss": 4.4062, + "mean_token_accuracy": 0.27338709533214567, + "step": 1300 + }, + { + "epoch": 0.06278221814406104, + "eval_runtime": 7.7872, + "eval_samples_per_second": 379.342, + "eval_steps_per_second": 23.757, + "step": 1300 + }, + { + "epoch": 0.06326515828363075, + "grad_norm": 4.830576508274259, + "learning_rate": 1.2650893288266539e-05, + "loss": 4.3211, + "mean_token_accuracy": 0.28417338654398916, + "step": 1310 + }, + { + "epoch": 0.06374809842320045, + "grad_norm": 4.9332935687133785, + "learning_rate": 1.2747464992757123e-05, + "loss": 4.3812, + "mean_token_accuracy": 0.27560483366250993, + "step": 1320 + }, + { + "epoch": 0.06423103856277014, + "grad_norm": 5.4934363818373635, + "learning_rate": 1.2844036697247708e-05, + "loss": 4.3316, + "mean_token_accuracy": 0.2795362912118435, + "step": 1330 + }, + { + "epoch": 0.06471397870233984, + "grad_norm": 4.800260532111643, + "learning_rate": 1.2940608401738292e-05, + "loss": 4.3551, + "mean_token_accuracy": 0.280947582423687, + "step": 1340 + }, + { + "epoch": 0.06519691884190955, + "grad_norm": 4.880148264641701, + "learning_rate": 1.3037180106228875e-05, + "loss": 4.3531, + "mean_token_accuracy": 0.2753034979104996, + "step": 1350 + }, + { + "epoch": 0.06567985898147924, + "grad_norm": 4.808834621682347, + "learning_rate": 1.313375181071946e-05, + "loss": 4.3152, + "mean_token_accuracy": 0.27631048709154127, + "step": 1360 + }, + { + "epoch": 0.06616279912104894, + "grad_norm": 4.466947481484539, + "learning_rate": 1.3230323515210044e-05, + "loss": 4.4105, + "mean_token_accuracy": 0.2709677405655384, + "step": 1370 + }, + { + "epoch": 0.06664573926061865, + "grad_norm": 4.735459855718925, + "learning_rate": 1.3326895219700628e-05, + "loss": 4.309, + "mean_token_accuracy": 0.2811491943895817, + "step": 1380 + }, + { + "epoch": 0.06712867940018835, + "grad_norm": 4.515857419173914, + "learning_rate": 1.3423466924191213e-05, + "loss": 4.2199, + "mean_token_accuracy": 0.28780241683125496, + "step": 1390 + }, + { + "epoch": 0.06761161953975804, + "grad_norm": 4.750233472061156, + "learning_rate": 1.3520038628681797e-05, + "loss": 4.291, + "mean_token_accuracy": 0.2888104856014252, + "step": 1400 + }, + { + "epoch": 0.06761161953975804, + "eval_runtime": 7.8003, + "eval_samples_per_second": 378.703, + "eval_steps_per_second": 23.717, + "step": 1400 + }, + { + "epoch": 0.06809455967932775, + "grad_norm": 4.680421546574031, + "learning_rate": 1.3616610333172382e-05, + "loss": 4.3289, + "mean_token_accuracy": 0.27913306653499603, + "step": 1410 + }, + { + "epoch": 0.06857749981889745, + "grad_norm": 4.601224269285572, + "learning_rate": 1.3713182037662966e-05, + "loss": 4.3352, + "mean_token_accuracy": 0.2777217745780945, + "step": 1420 + }, + { + "epoch": 0.06906043995846715, + "grad_norm": 4.472689233093631, + "learning_rate": 1.380975374215355e-05, + "loss": 4.209, + "mean_token_accuracy": 0.2897177398204803, + "step": 1430 + }, + { + "epoch": 0.06954338009803684, + "grad_norm": 4.619177247391706, + "learning_rate": 1.3906325446644133e-05, + "loss": 4.284, + "mean_token_accuracy": 0.28467742130160334, + "step": 1440 + }, + { + "epoch": 0.07002632023760655, + "grad_norm": 4.799177042165004, + "learning_rate": 1.4002897151134718e-05, + "loss": 4.3965, + "mean_token_accuracy": 0.27217742130160333, + "step": 1450 + }, + { + "epoch": 0.07050926037717625, + "grad_norm": 4.393611048681592, + "learning_rate": 1.4099468855625302e-05, + "loss": 4.2645, + "mean_token_accuracy": 0.2820564515888691, + "step": 1460 + }, + { + "epoch": 0.07099220051674594, + "grad_norm": 4.454390565066716, + "learning_rate": 1.4196040560115887e-05, + "loss": 4.3277, + "mean_token_accuracy": 0.2783266119658947, + "step": 1470 + }, + { + "epoch": 0.07147514065631565, + "grad_norm": 4.199911067866371, + "learning_rate": 1.4292612264606471e-05, + "loss": 4.2387, + "mean_token_accuracy": 0.2902217753231525, + "step": 1480 + }, + { + "epoch": 0.07195808079588535, + "grad_norm": 4.438915347397052, + "learning_rate": 1.4389183969097056e-05, + "loss": 4.2637, + "mean_token_accuracy": 0.286995966732502, + "step": 1490 + }, + { + "epoch": 0.07244102093545506, + "grad_norm": 4.579843923204942, + "learning_rate": 1.448575567358764e-05, + "loss": 4.2477, + "mean_token_accuracy": 0.2926411271095276, + "step": 1500 + }, + { + "epoch": 0.07244102093545506, + "eval_runtime": 7.81, + "eval_samples_per_second": 378.231, + "eval_steps_per_second": 23.687, + "step": 1500 + }, + { + "epoch": 0.07292396107502475, + "grad_norm": 4.431824023740648, + "learning_rate": 1.4582327378078224e-05, + "loss": 4.2543, + "mean_token_accuracy": 0.285181450843811, + "step": 1510 + }, + { + "epoch": 0.07340690121459445, + "grad_norm": 4.510306609633914, + "learning_rate": 1.4678899082568809e-05, + "loss": 4.2793, + "mean_token_accuracy": 0.28054435551166534, + "step": 1520 + }, + { + "epoch": 0.07388984135416415, + "grad_norm": 4.365911038914563, + "learning_rate": 1.4775470787059393e-05, + "loss": 4.2566, + "mean_token_accuracy": 0.28346774354577065, + "step": 1530 + }, + { + "epoch": 0.07437278149373386, + "grad_norm": 4.088488922706824, + "learning_rate": 1.4872042491549976e-05, + "loss": 4.2594, + "mean_token_accuracy": 0.2848790302872658, + "step": 1540 + }, + { + "epoch": 0.07485572163330355, + "grad_norm": 4.120686586249447, + "learning_rate": 1.496861419604056e-05, + "loss": 4.3441, + "mean_token_accuracy": 0.28104838728904724, + "step": 1550 + }, + { + "epoch": 0.07533866177287325, + "grad_norm": 4.43368775520899, + "learning_rate": 1.5065185900531145e-05, + "loss": 4.3773, + "mean_token_accuracy": 0.2772177442908287, + "step": 1560 + }, + { + "epoch": 0.07582160191244296, + "grad_norm": 4.696316687482952, + "learning_rate": 1.516175760502173e-05, + "loss": 4.2699, + "mean_token_accuracy": 0.28266128823161124, + "step": 1570 + }, + { + "epoch": 0.07630454205201265, + "grad_norm": 4.258982028304477, + "learning_rate": 1.5258329309512314e-05, + "loss": 4.2465, + "mean_token_accuracy": 0.2902217723429203, + "step": 1580 + }, + { + "epoch": 0.07678748219158235, + "grad_norm": 4.183184611426825, + "learning_rate": 1.53549010140029e-05, + "loss": 4.2719, + "mean_token_accuracy": 0.281552417576313, + "step": 1590 + }, + { + "epoch": 0.07727042233115206, + "grad_norm": 4.475490851991573, + "learning_rate": 1.5451472718493484e-05, + "loss": 4.2766, + "mean_token_accuracy": 0.2819556452333927, + "step": 1600 + }, + { + "epoch": 0.07727042233115206, + "eval_runtime": 7.7922, + "eval_samples_per_second": 379.095, + "eval_steps_per_second": 23.742, + "step": 1600 + }, + { + "epoch": 0.07775336247072176, + "grad_norm": 4.195793070651388, + "learning_rate": 1.5548044422984067e-05, + "loss": 4.2336, + "mean_token_accuracy": 0.2904233880341053, + "step": 1610 + }, + { + "epoch": 0.07823630261029145, + "grad_norm": 4.280488621876795, + "learning_rate": 1.5644616127474653e-05, + "loss": 4.2367, + "mean_token_accuracy": 0.2879032239317894, + "step": 1620 + }, + { + "epoch": 0.07871924274986115, + "grad_norm": 4.247019183705438, + "learning_rate": 1.5741187831965236e-05, + "loss": 4.2648, + "mean_token_accuracy": 0.28840725868940353, + "step": 1630 + }, + { + "epoch": 0.07920218288943086, + "grad_norm": 4.07130581710393, + "learning_rate": 1.583775953645582e-05, + "loss": 4.2324, + "mean_token_accuracy": 0.2824596792459488, + "step": 1640 + }, + { + "epoch": 0.07968512302900055, + "grad_norm": 4.218784263208886, + "learning_rate": 1.5934331240946405e-05, + "loss": 4.2477, + "mean_token_accuracy": 0.28618951737880705, + "step": 1650 + }, + { + "epoch": 0.08016806316857025, + "grad_norm": 4.2590102178275755, + "learning_rate": 1.6030902945436988e-05, + "loss": 4.1945, + "mean_token_accuracy": 0.29002016186714175, + "step": 1660 + }, + { + "epoch": 0.08065100330813996, + "grad_norm": 4.022664270818587, + "learning_rate": 1.6127474649927574e-05, + "loss": 4.3285, + "mean_token_accuracy": 0.2779233880341053, + "step": 1670 + }, + { + "epoch": 0.08113394344770966, + "grad_norm": 3.8988764717241056, + "learning_rate": 1.6224046354418157e-05, + "loss": 4.2473, + "mean_token_accuracy": 0.2934475809335709, + "step": 1680 + }, + { + "epoch": 0.08161688358727935, + "grad_norm": 4.252351158534765, + "learning_rate": 1.6320618058908743e-05, + "loss": 4.2309, + "mean_token_accuracy": 0.2824596777558327, + "step": 1690 + }, + { + "epoch": 0.08209982372684906, + "grad_norm": 3.999145949805774, + "learning_rate": 1.6417189763399326e-05, + "loss": 4.2441, + "mean_token_accuracy": 0.2934475809335709, + "step": 1700 + }, + { + "epoch": 0.08209982372684906, + "eval_runtime": 7.766, + "eval_samples_per_second": 380.376, + "eval_steps_per_second": 23.822, + "step": 1700 + }, + { + "epoch": 0.08258276386641876, + "grad_norm": 4.492602153549056, + "learning_rate": 1.6513761467889912e-05, + "loss": 4.2141, + "mean_token_accuracy": 0.28719758093357084, + "step": 1710 + }, + { + "epoch": 0.08306570400598846, + "grad_norm": 3.961285953203282, + "learning_rate": 1.6610333172380494e-05, + "loss": 4.2934, + "mean_token_accuracy": 0.27923387214541434, + "step": 1720 + }, + { + "epoch": 0.08354864414555815, + "grad_norm": 4.236996974317345, + "learning_rate": 1.6706904876871077e-05, + "loss": 4.2449, + "mean_token_accuracy": 0.28639113157987595, + "step": 1730 + }, + { + "epoch": 0.08403158428512786, + "grad_norm": 3.9002420355103173, + "learning_rate": 1.6803476581361663e-05, + "loss": 4.2094, + "mean_token_accuracy": 0.2850806437432766, + "step": 1740 + }, + { + "epoch": 0.08451452442469756, + "grad_norm": 4.283672233634765, + "learning_rate": 1.6900048285852246e-05, + "loss": 4.2703, + "mean_token_accuracy": 0.2866935506463051, + "step": 1750 + }, + { + "epoch": 0.08499746456426725, + "grad_norm": 3.7545429095344107, + "learning_rate": 1.6996619990342832e-05, + "loss": 4.1707, + "mean_token_accuracy": 0.29284274131059645, + "step": 1760 + }, + { + "epoch": 0.08548040470383696, + "grad_norm": 4.064213012685202, + "learning_rate": 1.7093191694833415e-05, + "loss": 4.1891, + "mean_token_accuracy": 0.29143145456910136, + "step": 1770 + }, + { + "epoch": 0.08596334484340666, + "grad_norm": 4.158862971978716, + "learning_rate": 1.7189763399324e-05, + "loss": 4.2285, + "mean_token_accuracy": 0.2854838721454144, + "step": 1780 + }, + { + "epoch": 0.08644628498297637, + "grad_norm": 4.271117505211573, + "learning_rate": 1.7286335103814584e-05, + "loss": 4.2121, + "mean_token_accuracy": 0.29284274354577067, + "step": 1790 + }, + { + "epoch": 0.08692922512254606, + "grad_norm": 3.960333913637192, + "learning_rate": 1.738290680830517e-05, + "loss": 4.2281, + "mean_token_accuracy": 0.2903225809335709, + "step": 1800 + }, + { + "epoch": 0.08692922512254606, + "eval_runtime": 7.7754, + "eval_samples_per_second": 379.917, + "eval_steps_per_second": 23.793, + "step": 1800 + }, + { + "epoch": 0.08741216526211576, + "grad_norm": 3.9117115145614343, + "learning_rate": 1.7479478512795753e-05, + "loss": 4.1781, + "mean_token_accuracy": 0.2910282269120216, + "step": 1810 + }, + { + "epoch": 0.08789510540168546, + "grad_norm": 3.905931678714368, + "learning_rate": 1.7576050217286336e-05, + "loss": 4.2211, + "mean_token_accuracy": 0.2852822542190552, + "step": 1820 + }, + { + "epoch": 0.08837804554125517, + "grad_norm": 4.0562310282381535, + "learning_rate": 1.7672621921776922e-05, + "loss": 4.1496, + "mean_token_accuracy": 0.284375, + "step": 1830 + }, + { + "epoch": 0.08886098568082486, + "grad_norm": 3.930219040624092, + "learning_rate": 1.7769193626267504e-05, + "loss": 4.2453, + "mean_token_accuracy": 0.28830645233392715, + "step": 1840 + }, + { + "epoch": 0.08934392582039456, + "grad_norm": 3.7955300751416616, + "learning_rate": 1.786576533075809e-05, + "loss": 4.2336, + "mean_token_accuracy": 0.28578629046678544, + "step": 1850 + }, + { + "epoch": 0.08982686595996427, + "grad_norm": 3.827481874435193, + "learning_rate": 1.7962337035248673e-05, + "loss": 4.1898, + "mean_token_accuracy": 0.2917338728904724, + "step": 1860 + }, + { + "epoch": 0.09030980609953396, + "grad_norm": 3.96875879591722, + "learning_rate": 1.805890873973926e-05, + "loss": 4.1773, + "mean_token_accuracy": 0.2904233902692795, + "step": 1870 + }, + { + "epoch": 0.09079274623910366, + "grad_norm": 4.469603855470973, + "learning_rate": 1.8155480444229842e-05, + "loss": 4.3141, + "mean_token_accuracy": 0.28276209607720376, + "step": 1880 + }, + { + "epoch": 0.09127568637867337, + "grad_norm": 3.8364012516557833, + "learning_rate": 1.825205214872043e-05, + "loss": 4.2082, + "mean_token_accuracy": 0.2919354811310768, + "step": 1890 + }, + { + "epoch": 0.09175862651824307, + "grad_norm": 3.789969887449224, + "learning_rate": 1.834862385321101e-05, + "loss": 4.1961, + "mean_token_accuracy": 0.29455645084381105, + "step": 1900 + }, + { + "epoch": 0.09175862651824307, + "eval_runtime": 7.7993, + "eval_samples_per_second": 378.752, + "eval_steps_per_second": 23.72, + "step": 1900 + }, + { + "epoch": 0.09224156665781276, + "grad_norm": 4.1914990070207665, + "learning_rate": 1.8445195557701594e-05, + "loss": 4.1949, + "mean_token_accuracy": 0.292943549156189, + "step": 1910 + }, + { + "epoch": 0.09272450679738246, + "grad_norm": 3.796855649734539, + "learning_rate": 1.854176726219218e-05, + "loss": 4.2289, + "mean_token_accuracy": 0.28387096896767616, + "step": 1920 + }, + { + "epoch": 0.09320744693695217, + "grad_norm": 3.7432314938765665, + "learning_rate": 1.8638338966682763e-05, + "loss": 4.2934, + "mean_token_accuracy": 0.28387096971273423, + "step": 1930 + }, + { + "epoch": 0.09369038707652186, + "grad_norm": 3.8498893083454835, + "learning_rate": 1.873491067117335e-05, + "loss": 4.1922, + "mean_token_accuracy": 0.29002015963196753, + "step": 1940 + }, + { + "epoch": 0.09417332721609156, + "grad_norm": 3.8439065882660004, + "learning_rate": 1.883148237566393e-05, + "loss": 4.1785, + "mean_token_accuracy": 0.28729838579893113, + "step": 1950 + }, + { + "epoch": 0.09465626735566127, + "grad_norm": 3.8852014685722884, + "learning_rate": 1.8928054080154518e-05, + "loss": 4.2266, + "mean_token_accuracy": 0.2824596770107746, + "step": 1960 + }, + { + "epoch": 0.09513920749523097, + "grad_norm": 3.679777948886876, + "learning_rate": 1.90246257846451e-05, + "loss": 4.2246, + "mean_token_accuracy": 0.28931451588869095, + "step": 1970 + }, + { + "epoch": 0.09562214763480066, + "grad_norm": 3.729761142862288, + "learning_rate": 1.9121197489135687e-05, + "loss": 4.2203, + "mean_token_accuracy": 0.2883064493536949, + "step": 1980 + }, + { + "epoch": 0.09610508777437037, + "grad_norm": 3.981413827863695, + "learning_rate": 1.921776919362627e-05, + "loss": 4.1734, + "mean_token_accuracy": 0.29052419364452364, + "step": 1990 + }, + { + "epoch": 0.09658802791394007, + "grad_norm": 3.689671413395015, + "learning_rate": 1.9314340898116852e-05, + "loss": 4.1859, + "mean_token_accuracy": 0.28558467552065847, + "step": 2000 + }, + { + "epoch": 0.09658802791394007, + "eval_runtime": 7.8265, + "eval_samples_per_second": 377.433, + "eval_steps_per_second": 23.638, + "step": 2000 + }, + { + "epoch": 0.09707096805350977, + "grad_norm": 3.3695174549915956, + "learning_rate": 1.941091260260744e-05, + "loss": 4.1332, + "mean_token_accuracy": 0.2949596747756004, + "step": 2010 + }, + { + "epoch": 0.09755390819307946, + "grad_norm": 3.824944050520389, + "learning_rate": 1.950748430709802e-05, + "loss": 4.1762, + "mean_token_accuracy": 0.2873991928994656, + "step": 2020 + }, + { + "epoch": 0.09803684833264917, + "grad_norm": 3.770968937762511, + "learning_rate": 1.9604056011588607e-05, + "loss": 4.2035, + "mean_token_accuracy": 0.2889112919569016, + "step": 2030 + }, + { + "epoch": 0.09851978847221887, + "grad_norm": 3.6748457634050813, + "learning_rate": 1.970062771607919e-05, + "loss": 4.1648, + "mean_token_accuracy": 0.29838709682226183, + "step": 2040 + }, + { + "epoch": 0.09900272861178856, + "grad_norm": 3.4417568990801595, + "learning_rate": 1.9797199420569776e-05, + "loss": 4.1871, + "mean_token_accuracy": 0.2861895151436329, + "step": 2050 + }, + { + "epoch": 0.09948566875135827, + "grad_norm": 3.5192581295855567, + "learning_rate": 1.989377112506036e-05, + "loss": 4.1605, + "mean_token_accuracy": 0.29586693495512006, + "step": 2060 + }, + { + "epoch": 0.09996860889092797, + "grad_norm": 3.564576590092872, + "learning_rate": 1.9990342829550945e-05, + "loss": 4.0996, + "mean_token_accuracy": 0.2984879031777382, + "step": 2070 + }, + { + "epoch": 0.10045154903049767, + "grad_norm": 3.404975178767239, + "learning_rate": 1.9999988489454894e-05, + "loss": 4.216, + "mean_token_accuracy": 0.29022177383303643, + "step": 2080 + }, + { + "epoch": 0.10093448917006737, + "grad_norm": 3.496957526602745, + "learning_rate": 1.999994869995027e-05, + "loss": 4.1328, + "mean_token_accuracy": 0.2946572571992874, + "step": 2090 + }, + { + "epoch": 0.10141742930963707, + "grad_norm": 3.4605287084179563, + "learning_rate": 1.9999880489493693e-05, + "loss": 4.2566, + "mean_token_accuracy": 0.28326612934470174, + "step": 2100 + }, + { + "epoch": 0.10141742930963707, + "eval_runtime": 7.789, + "eval_samples_per_second": 379.252, + "eval_steps_per_second": 23.751, + "step": 2100 + }, + { + "epoch": 0.10190036944920677, + "grad_norm": 3.483317885353719, + "learning_rate": 1.999978385827903e-05, + "loss": 4.1437, + "mean_token_accuracy": 0.2941532261669636, + "step": 2110 + }, + { + "epoch": 0.10238330958877648, + "grad_norm": 3.5647520449079333, + "learning_rate": 1.9999658806580906e-05, + "loss": 4.2262, + "mean_token_accuracy": 0.290423384308815, + "step": 2120 + }, + { + "epoch": 0.10286624972834617, + "grad_norm": 3.7643349384881755, + "learning_rate": 1.9999505334754743e-05, + "loss": 4.157, + "mean_token_accuracy": 0.28901209533214567, + "step": 2130 + }, + { + "epoch": 0.10334918986791587, + "grad_norm": 3.2285226338348383, + "learning_rate": 1.999932344323672e-05, + "loss": 4.1895, + "mean_token_accuracy": 0.2916330635547638, + "step": 2140 + }, + { + "epoch": 0.10383213000748558, + "grad_norm": 3.5042926196337785, + "learning_rate": 1.9999113132543795e-05, + "loss": 4.1281, + "mean_token_accuracy": 0.2879032239317894, + "step": 2150 + }, + { + "epoch": 0.10431507014705527, + "grad_norm": 3.253912585958381, + "learning_rate": 1.999887440327369e-05, + "loss": 4.1699, + "mean_token_accuracy": 0.29606854915618896, + "step": 2160 + }, + { + "epoch": 0.10479801028662497, + "grad_norm": 3.472984439660169, + "learning_rate": 1.9998607256104902e-05, + "loss": 4.2035, + "mean_token_accuracy": 0.288306450843811, + "step": 2170 + }, + { + "epoch": 0.10528095042619467, + "grad_norm": 3.331946274518246, + "learning_rate": 1.9998311691796695e-05, + "loss": 4.1824, + "mean_token_accuracy": 0.2936491914093494, + "step": 2180 + }, + { + "epoch": 0.10576389056576438, + "grad_norm": 3.3809721423831878, + "learning_rate": 1.9997987711189088e-05, + "loss": 4.1719, + "mean_token_accuracy": 0.29324596375226974, + "step": 2190 + }, + { + "epoch": 0.10624683070533407, + "grad_norm": 3.4062492330108185, + "learning_rate": 1.9997635315202876e-05, + "loss": 4.2039, + "mean_token_accuracy": 0.29022177308797836, + "step": 2200 + }, + { + "epoch": 0.10624683070533407, + "eval_runtime": 7.8107, + "eval_samples_per_second": 378.2, + "eval_steps_per_second": 23.685, + "step": 2200 + }, + { + "epoch": 0.10672977084490377, + "grad_norm": 3.343433237267684, + "learning_rate": 1.999725450483961e-05, + "loss": 4.1156, + "mean_token_accuracy": 0.29495968073606493, + "step": 2210 + }, + { + "epoch": 0.10721271098447348, + "grad_norm": 3.3718022631206317, + "learning_rate": 1.9996845281181583e-05, + "loss": 4.1953, + "mean_token_accuracy": 0.28971774131059647, + "step": 2220 + }, + { + "epoch": 0.10769565112404317, + "grad_norm": 3.33376451036372, + "learning_rate": 1.9996407645391868e-05, + "loss": 4.1168, + "mean_token_accuracy": 0.29606854617595674, + "step": 2230 + }, + { + "epoch": 0.10817859126361287, + "grad_norm": 3.2845288968838577, + "learning_rate": 1.9995941598714263e-05, + "loss": 4.1551, + "mean_token_accuracy": 0.293951615691185, + "step": 2240 + }, + { + "epoch": 0.10866153140318258, + "grad_norm": 3.3637315136255124, + "learning_rate": 1.9995447142473327e-05, + "loss": 4.1141, + "mean_token_accuracy": 0.29868951439857483, + "step": 2250 + }, + { + "epoch": 0.10914447154275228, + "grad_norm": 3.4123615176518225, + "learning_rate": 1.999492427807436e-05, + "loss": 4.1992, + "mean_token_accuracy": 0.30312499701976775, + "step": 2260 + }, + { + "epoch": 0.10962741168232197, + "grad_norm": 3.3718098621564194, + "learning_rate": 1.99943730070034e-05, + "loss": 4.0992, + "mean_token_accuracy": 0.301008066534996, + "step": 2270 + }, + { + "epoch": 0.11011035182189167, + "grad_norm": 3.3948366801832646, + "learning_rate": 1.999379333082722e-05, + "loss": 4.1141, + "mean_token_accuracy": 0.30120967477560046, + "step": 2280 + }, + { + "epoch": 0.11059329196146138, + "grad_norm": 3.3914235079525676, + "learning_rate": 1.999318525119332e-05, + "loss": 4.2668, + "mean_token_accuracy": 0.29092741906642916, + "step": 2290 + }, + { + "epoch": 0.11107623210103108, + "grad_norm": 3.205971200422679, + "learning_rate": 1.9992548769829933e-05, + "loss": 4.1477, + "mean_token_accuracy": 0.2950604870915413, + "step": 2300 + }, + { + "epoch": 0.11107623210103108, + "eval_runtime": 7.8009, + "eval_samples_per_second": 378.673, + "eval_steps_per_second": 23.715, + "step": 2300 + }, + { + "epoch": 0.11155917224060077, + "grad_norm": 3.393777518949162, + "learning_rate": 1.999188388854601e-05, + "loss": 4.132, + "mean_token_accuracy": 0.29082661047577857, + "step": 2310 + }, + { + "epoch": 0.11204211238017048, + "grad_norm": 3.094440841604228, + "learning_rate": 1.9991190609231214e-05, + "loss": 4.1207, + "mean_token_accuracy": 0.2965725839138031, + "step": 2320 + }, + { + "epoch": 0.11252505251974018, + "grad_norm": 3.3479123424365915, + "learning_rate": 1.999046893385592e-05, + "loss": 4.1105, + "mean_token_accuracy": 0.2955645158886909, + "step": 2330 + }, + { + "epoch": 0.11300799265930987, + "grad_norm": 3.0668023779433766, + "learning_rate": 1.998971886447121e-05, + "loss": 4.1785, + "mean_token_accuracy": 0.28941532522439956, + "step": 2340 + }, + { + "epoch": 0.11349093279887958, + "grad_norm": 3.143968664214931, + "learning_rate": 1.998894040320886e-05, + "loss": 4.1613, + "mean_token_accuracy": 0.291633065789938, + "step": 2350 + }, + { + "epoch": 0.11397387293844928, + "grad_norm": 3.1427389765606955, + "learning_rate": 1.9988133552281348e-05, + "loss": 4.1406, + "mean_token_accuracy": 0.2945564478635788, + "step": 2360 + }, + { + "epoch": 0.11445681307801898, + "grad_norm": 3.2771465720865285, + "learning_rate": 1.998729831398183e-05, + "loss": 4.1137, + "mean_token_accuracy": 0.2974798411130905, + "step": 2370 + }, + { + "epoch": 0.11493975321758867, + "grad_norm": 3.4001197997830976, + "learning_rate": 1.998643469068415e-05, + "loss": 4.1777, + "mean_token_accuracy": 0.28921371400356294, + "step": 2380 + }, + { + "epoch": 0.11542269335715838, + "grad_norm": 3.126329774194059, + "learning_rate": 1.9985542684842813e-05, + "loss": 4.1, + "mean_token_accuracy": 0.29294354766607283, + "step": 2390 + }, + { + "epoch": 0.11590563349672808, + "grad_norm": 3.1907225391859932, + "learning_rate": 1.9984622298992996e-05, + "loss": 4.0848, + "mean_token_accuracy": 0.29758064448833466, + "step": 2400 + }, + { + "epoch": 0.11590563349672808, + "eval_runtime": 7.7913, + "eval_samples_per_second": 379.139, + "eval_steps_per_second": 23.744, + "step": 2400 + }, + { + "epoch": 0.11638857363629779, + "grad_norm": 3.134878526360008, + "learning_rate": 1.9983673535750547e-05, + "loss": 4.1465, + "mean_token_accuracy": 0.2937500014901161, + "step": 2410 + }, + { + "epoch": 0.11687151377586748, + "grad_norm": 3.236939441567008, + "learning_rate": 1.9982696397811944e-05, + "loss": 4.2105, + "mean_token_accuracy": 0.2894153207540512, + "step": 2420 + }, + { + "epoch": 0.11735445391543718, + "grad_norm": 3.084502916032764, + "learning_rate": 1.998169088795433e-05, + "loss": 4.1746, + "mean_token_accuracy": 0.2940524220466614, + "step": 2430 + }, + { + "epoch": 0.11783739405500689, + "grad_norm": 3.060352329478195, + "learning_rate": 1.998065700903547e-05, + "loss": 4.1437, + "mean_token_accuracy": 0.2989919319748878, + "step": 2440 + }, + { + "epoch": 0.11832033419457658, + "grad_norm": 3.1678451476640643, + "learning_rate": 1.997959476399376e-05, + "loss": 4.1527, + "mean_token_accuracy": 0.29425403028726577, + "step": 2450 + }, + { + "epoch": 0.11880327433414628, + "grad_norm": 3.094998419824479, + "learning_rate": 1.9978504155848217e-05, + "loss": 4.2156, + "mean_token_accuracy": 0.2966733887791634, + "step": 2460 + }, + { + "epoch": 0.11928621447371598, + "grad_norm": 3.2866241351872123, + "learning_rate": 1.997738518769847e-05, + "loss": 4.2145, + "mean_token_accuracy": 0.29163306653499604, + "step": 2470 + }, + { + "epoch": 0.11976915461328569, + "grad_norm": 3.3145211795801326, + "learning_rate": 1.9976237862724752e-05, + "loss": 4.2016, + "mean_token_accuracy": 0.28296371176838875, + "step": 2480 + }, + { + "epoch": 0.12025209475285538, + "grad_norm": 2.9865965006326918, + "learning_rate": 1.9975062184187884e-05, + "loss": 4.1645, + "mean_token_accuracy": 0.2926411278545856, + "step": 2490 + }, + { + "epoch": 0.12073503489242508, + "grad_norm": 3.149402413863351, + "learning_rate": 1.9973858155429272e-05, + "loss": 4.1547, + "mean_token_accuracy": 0.29475806280970573, + "step": 2500 + }, + { + "epoch": 0.12073503489242508, + "eval_runtime": 7.7765, + "eval_samples_per_second": 379.863, + "eval_steps_per_second": 23.79, + "step": 2500 + }, + { + "epoch": 0.12121797503199479, + "grad_norm": 3.053530624006074, + "learning_rate": 1.99726257798709e-05, + "loss": 4.1641, + "mean_token_accuracy": 0.2897177398204803, + "step": 2510 + }, + { + "epoch": 0.12170091517156448, + "grad_norm": 2.818520679482681, + "learning_rate": 1.9971365061015314e-05, + "loss": 4.1824, + "mean_token_accuracy": 0.29042338877916335, + "step": 2520 + }, + { + "epoch": 0.12218385531113418, + "grad_norm": 3.0383887925537554, + "learning_rate": 1.9970076002445616e-05, + "loss": 4.1348, + "mean_token_accuracy": 0.2960685446858406, + "step": 2530 + }, + { + "epoch": 0.12266679545070389, + "grad_norm": 3.0440103948940167, + "learning_rate": 1.9968758607825455e-05, + "loss": 4.1137, + "mean_token_accuracy": 0.29667339026927947, + "step": 2540 + }, + { + "epoch": 0.12314973559027359, + "grad_norm": 3.137336762278146, + "learning_rate": 1.9967412880899002e-05, + "loss": 4.152, + "mean_token_accuracy": 0.28568548560142515, + "step": 2550 + }, + { + "epoch": 0.12363267572984328, + "grad_norm": 3.2256735052882033, + "learning_rate": 1.996603882549097e-05, + "loss": 4.1059, + "mean_token_accuracy": 0.29092742055654525, + "step": 2560 + }, + { + "epoch": 0.12411561586941298, + "grad_norm": 2.810525058032397, + "learning_rate": 1.9964636445506567e-05, + "loss": 4.1738, + "mean_token_accuracy": 0.2917338714003563, + "step": 2570 + }, + { + "epoch": 0.12459855600898269, + "grad_norm": 3.0368595802556615, + "learning_rate": 1.996320574493152e-05, + "loss": 4.1023, + "mean_token_accuracy": 0.2969758063554764, + "step": 2580 + }, + { + "epoch": 0.12508149614855238, + "grad_norm": 3.107936710751673, + "learning_rate": 1.9961746727832035e-05, + "loss": 4.1766, + "mean_token_accuracy": 0.2967741936445236, + "step": 2590 + }, + { + "epoch": 0.12556443628812208, + "grad_norm": 3.1934944513060377, + "learning_rate": 1.996025939835479e-05, + "loss": 4.1219, + "mean_token_accuracy": 0.3009072571992874, + "step": 2600 + }, + { + "epoch": 0.12556443628812208, + "eval_runtime": 7.7742, + "eval_samples_per_second": 379.974, + "eval_steps_per_second": 23.797, + "step": 2600 + }, + { + "epoch": 0.1260473764276918, + "grad_norm": 2.9318511625725474, + "learning_rate": 1.995874376072695e-05, + "loss": 4.1039, + "mean_token_accuracy": 0.29314516112208366, + "step": 2610 + }, + { + "epoch": 0.1265303165672615, + "grad_norm": 2.9737147117589364, + "learning_rate": 1.9957199819256114e-05, + "loss": 4.1355, + "mean_token_accuracy": 0.2991935461759567, + "step": 2620 + }, + { + "epoch": 0.1270132567068312, + "grad_norm": 2.994991653253815, + "learning_rate": 1.9955627578330342e-05, + "loss": 4.1125, + "mean_token_accuracy": 0.29899193346500397, + "step": 2630 + }, + { + "epoch": 0.1274961968464009, + "grad_norm": 3.141545019029578, + "learning_rate": 1.995402704241811e-05, + "loss": 4.1816, + "mean_token_accuracy": 0.29324597045779227, + "step": 2640 + }, + { + "epoch": 0.12797913698597058, + "grad_norm": 3.1399846562592284, + "learning_rate": 1.9952398216068313e-05, + "loss": 4.1461, + "mean_token_accuracy": 0.29395161271095277, + "step": 2650 + }, + { + "epoch": 0.12846207712554028, + "grad_norm": 3.2048341512162204, + "learning_rate": 1.9950741103910266e-05, + "loss": 4.0828, + "mean_token_accuracy": 0.30231854766607286, + "step": 2660 + }, + { + "epoch": 0.12894501726510998, + "grad_norm": 3.1432823321613714, + "learning_rate": 1.9949055710653652e-05, + "loss": 4.1328, + "mean_token_accuracy": 0.3035282254219055, + "step": 2670 + }, + { + "epoch": 0.1294279574046797, + "grad_norm": 3.0971336034072174, + "learning_rate": 1.9947342041088548e-05, + "loss": 4.1203, + "mean_token_accuracy": 0.29374999925494194, + "step": 2680 + }, + { + "epoch": 0.1299108975442494, + "grad_norm": 2.8419491657941887, + "learning_rate": 1.9945600100085394e-05, + "loss": 4.2234, + "mean_token_accuracy": 0.29052419364452364, + "step": 2690 + }, + { + "epoch": 0.1303938376838191, + "grad_norm": 2.98093428485918, + "learning_rate": 1.9943829892594975e-05, + "loss": 4.0855, + "mean_token_accuracy": 0.2977822571992874, + "step": 2700 + }, + { + "epoch": 0.1303938376838191, + "eval_runtime": 7.7829, + "eval_samples_per_second": 379.55, + "eval_steps_per_second": 23.77, + "step": 2700 + }, + { + "epoch": 0.1308767778233888, + "grad_norm": 3.0655424493867027, + "learning_rate": 1.9942031423648412e-05, + "loss": 4.1461, + "mean_token_accuracy": 0.29586693495512006, + "step": 2710 + }, + { + "epoch": 0.13135971796295848, + "grad_norm": 2.917395046685627, + "learning_rate": 1.9940204698357157e-05, + "loss": 4.0898, + "mean_token_accuracy": 0.30181451588869096, + "step": 2720 + }, + { + "epoch": 0.13184265810252818, + "grad_norm": 2.780683782637958, + "learning_rate": 1.993834972191296e-05, + "loss": 4.1449, + "mean_token_accuracy": 0.29334677159786227, + "step": 2730 + }, + { + "epoch": 0.13232559824209789, + "grad_norm": 2.780131520760637, + "learning_rate": 1.9936466499587867e-05, + "loss": 4.1094, + "mean_token_accuracy": 0.2979838714003563, + "step": 2740 + }, + { + "epoch": 0.1328085383816676, + "grad_norm": 2.894335714716186, + "learning_rate": 1.9934555036734204e-05, + "loss": 4.0707, + "mean_token_accuracy": 0.29848790168762207, + "step": 2750 + }, + { + "epoch": 0.1332914785212373, + "grad_norm": 2.8123238985658383, + "learning_rate": 1.9932615338784563e-05, + "loss": 4.1219, + "mean_token_accuracy": 0.2932459652423859, + "step": 2760 + }, + { + "epoch": 0.133774418660807, + "grad_norm": 3.1669770335511807, + "learning_rate": 1.993064741125177e-05, + "loss": 4.1277, + "mean_token_accuracy": 0.28961693644523623, + "step": 2770 + }, + { + "epoch": 0.1342573588003767, + "grad_norm": 2.8788532089749053, + "learning_rate": 1.9928651259728895e-05, + "loss": 4.1324, + "mean_token_accuracy": 0.2976814493536949, + "step": 2780 + }, + { + "epoch": 0.1347402989399464, + "grad_norm": 2.902731924766266, + "learning_rate": 1.992662688988922e-05, + "loss": 4.1277, + "mean_token_accuracy": 0.29949596896767616, + "step": 2790 + }, + { + "epoch": 0.13522323907951608, + "grad_norm": 2.9026165656263068, + "learning_rate": 1.9924574307486226e-05, + "loss": 4.0316, + "mean_token_accuracy": 0.30332661271095274, + "step": 2800 + }, + { + "epoch": 0.13522323907951608, + "eval_runtime": 7.7959, + "eval_samples_per_second": 378.917, + "eval_steps_per_second": 23.73, + "step": 2800 + }, + { + "epoch": 0.1357061792190858, + "grad_norm": 2.959141394406022, + "learning_rate": 1.992249351835358e-05, + "loss": 4.116, + "mean_token_accuracy": 0.296572582423687, + "step": 2810 + }, + { + "epoch": 0.1361891193586555, + "grad_norm": 2.9565526284260835, + "learning_rate": 1.992038452840511e-05, + "loss": 4.132, + "mean_token_accuracy": 0.2971774205565453, + "step": 2820 + }, + { + "epoch": 0.1366720594982252, + "grad_norm": 3.0187909126734094, + "learning_rate": 1.9918247343634792e-05, + "loss": 4.0859, + "mean_token_accuracy": 0.30312499701976775, + "step": 2830 + }, + { + "epoch": 0.1371549996377949, + "grad_norm": 3.2696135134499875, + "learning_rate": 1.9916081970116754e-05, + "loss": 4.15, + "mean_token_accuracy": 0.2948588699102402, + "step": 2840 + }, + { + "epoch": 0.1376379397773646, + "grad_norm": 3.110202672973678, + "learning_rate": 1.991388841400521e-05, + "loss": 4.0668, + "mean_token_accuracy": 0.3077620968222618, + "step": 2850 + }, + { + "epoch": 0.1381208799169343, + "grad_norm": 3.0364024561026004, + "learning_rate": 1.9911666681534498e-05, + "loss": 4.1109, + "mean_token_accuracy": 0.30060483813285827, + "step": 2860 + }, + { + "epoch": 0.13860382005650398, + "grad_norm": 2.8116312935370122, + "learning_rate": 1.990941677901902e-05, + "loss": 4.0707, + "mean_token_accuracy": 0.29868951737880706, + "step": 2870 + }, + { + "epoch": 0.1390867601960737, + "grad_norm": 2.8181773456766335, + "learning_rate": 1.9907138712853247e-05, + "loss": 4.1582, + "mean_token_accuracy": 0.2958669364452362, + "step": 2880 + }, + { + "epoch": 0.1395697003356434, + "grad_norm": 2.72404271407935, + "learning_rate": 1.9904832489511694e-05, + "loss": 4.0938, + "mean_token_accuracy": 0.2978830635547638, + "step": 2890 + }, + { + "epoch": 0.1400526404752131, + "grad_norm": 2.8388884078244674, + "learning_rate": 1.99024981155489e-05, + "loss": 4.1098, + "mean_token_accuracy": 0.29707661271095276, + "step": 2900 + }, + { + "epoch": 0.1400526404752131, + "eval_runtime": 7.7975, + "eval_samples_per_second": 378.84, + "eval_steps_per_second": 23.726, + "step": 2900 + }, + { + "epoch": 0.1405355806147828, + "grad_norm": 2.7411410608071587, + "learning_rate": 1.9900135597599412e-05, + "loss": 4.0812, + "mean_token_accuracy": 0.30100806802511215, + "step": 2910 + }, + { + "epoch": 0.1410185207543525, + "grad_norm": 2.74663069342885, + "learning_rate": 1.989774494237777e-05, + "loss": 4.1445, + "mean_token_accuracy": 0.29445564597845075, + "step": 2920 + }, + { + "epoch": 0.1415014608939222, + "grad_norm": 2.953799847289113, + "learning_rate": 1.9895326156678466e-05, + "loss": 4.1664, + "mean_token_accuracy": 0.2919354811310768, + "step": 2930 + }, + { + "epoch": 0.14198440103349189, + "grad_norm": 2.8073107926018626, + "learning_rate": 1.989287924737597e-05, + "loss": 4.1949, + "mean_token_accuracy": 0.288306450098753, + "step": 2940 + }, + { + "epoch": 0.1424673411730616, + "grad_norm": 2.743598961280255, + "learning_rate": 1.9890404221424658e-05, + "loss": 4.109, + "mean_token_accuracy": 0.2983870953321457, + "step": 2950 + }, + { + "epoch": 0.1429502813126313, + "grad_norm": 3.0012326335675636, + "learning_rate": 1.9887901085858826e-05, + "loss": 4.0977, + "mean_token_accuracy": 0.29838709682226183, + "step": 2960 + }, + { + "epoch": 0.143433221452201, + "grad_norm": 2.8868559864713546, + "learning_rate": 1.988536984779266e-05, + "loss": 3.9898, + "mean_token_accuracy": 0.30514112710952757, + "step": 2970 + }, + { + "epoch": 0.1439161615917707, + "grad_norm": 3.2907404789514145, + "learning_rate": 1.9882810514420222e-05, + "loss": 4.1109, + "mean_token_accuracy": 0.29707661718130113, + "step": 2980 + }, + { + "epoch": 0.1443991017313404, + "grad_norm": 2.724490848885843, + "learning_rate": 1.988022309301541e-05, + "loss": 4.0551, + "mean_token_accuracy": 0.3006048396229744, + "step": 2990 + }, + { + "epoch": 0.1448820418709101, + "grad_norm": 2.7436759004887366, + "learning_rate": 1.9877607590931966e-05, + "loss": 4.1488, + "mean_token_accuracy": 0.2912298396229744, + "step": 3000 + }, + { + "epoch": 0.1448820418709101, + "eval_runtime": 7.7911, + "eval_samples_per_second": 379.152, + "eval_steps_per_second": 23.745, + "step": 3000 + }, + { + "epoch": 0.1453649820104798, + "grad_norm": 2.692265011369886, + "learning_rate": 1.987496401560343e-05, + "loss": 4.1621, + "mean_token_accuracy": 0.29082661122083664, + "step": 3010 + }, + { + "epoch": 0.1458479221500495, + "grad_norm": 2.7831114038649365, + "learning_rate": 1.9872292374543137e-05, + "loss": 4.0285, + "mean_token_accuracy": 0.30302419513463974, + "step": 3020 + }, + { + "epoch": 0.1463308622896192, + "grad_norm": 2.8814211491029686, + "learning_rate": 1.9869592675344185e-05, + "loss": 4.0773, + "mean_token_accuracy": 0.29979838877916337, + "step": 3030 + }, + { + "epoch": 0.1468138024291889, + "grad_norm": 2.884743053840664, + "learning_rate": 1.9866864925679407e-05, + "loss": 4.0977, + "mean_token_accuracy": 0.294052417576313, + "step": 3040 + }, + { + "epoch": 0.1472967425687586, + "grad_norm": 2.654526475003225, + "learning_rate": 1.9864109133301376e-05, + "loss": 4.123, + "mean_token_accuracy": 0.2966733887791634, + "step": 3050 + }, + { + "epoch": 0.1477796827083283, + "grad_norm": 2.9486897935026013, + "learning_rate": 1.9861325306042352e-05, + "loss": 4.1078, + "mean_token_accuracy": 0.2957661285996437, + "step": 3060 + }, + { + "epoch": 0.148262622847898, + "grad_norm": 2.634288213557304, + "learning_rate": 1.9858513451814278e-05, + "loss": 4.1004, + "mean_token_accuracy": 0.29687499850988386, + "step": 3070 + }, + { + "epoch": 0.14874556298746772, + "grad_norm": 2.941275199589005, + "learning_rate": 1.9855673578608755e-05, + "loss": 4.082, + "mean_token_accuracy": 0.3023185521364212, + "step": 3080 + }, + { + "epoch": 0.1492285031270374, + "grad_norm": 2.8403359075616947, + "learning_rate": 1.9852805694497012e-05, + "loss": 4.0883, + "mean_token_accuracy": 0.30131048560142515, + "step": 3090 + }, + { + "epoch": 0.1497114432666071, + "grad_norm": 2.8047399921946536, + "learning_rate": 1.9849909807629892e-05, + "loss": 4.1461, + "mean_token_accuracy": 0.29848790615797044, + "step": 3100 + }, + { + "epoch": 0.1497114432666071, + "eval_runtime": 7.7828, + "eval_samples_per_second": 379.553, + "eval_steps_per_second": 23.77, + "step": 3100 + }, + { + "epoch": 0.1501943834061768, + "grad_norm": 2.744187832623059, + "learning_rate": 1.984698592623782e-05, + "loss": 4.184, + "mean_token_accuracy": 0.28760080486536027, + "step": 3110 + }, + { + "epoch": 0.1506773235457465, + "grad_norm": 2.746691952115286, + "learning_rate": 1.9844034058630795e-05, + "loss": 4.1148, + "mean_token_accuracy": 0.2985887095332146, + "step": 3120 + }, + { + "epoch": 0.1511602636853162, + "grad_norm": 2.8818710937151355, + "learning_rate": 1.984105421319834e-05, + "loss": 4.1711, + "mean_token_accuracy": 0.29546370804309846, + "step": 3130 + }, + { + "epoch": 0.1516432038248859, + "grad_norm": 2.702875712828679, + "learning_rate": 1.9838046398409507e-05, + "loss": 4.1145, + "mean_token_accuracy": 0.29989919662475584, + "step": 3140 + }, + { + "epoch": 0.15212614396445562, + "grad_norm": 2.856644546684219, + "learning_rate": 1.983501062281284e-05, + "loss": 4.1246, + "mean_token_accuracy": 0.3035282254219055, + "step": 3150 + }, + { + "epoch": 0.1526090841040253, + "grad_norm": 2.700008163445202, + "learning_rate": 1.983194689503634e-05, + "loss": 4.0668, + "mean_token_accuracy": 0.3047379031777382, + "step": 3160 + }, + { + "epoch": 0.153092024243595, + "grad_norm": 2.8706883733850765, + "learning_rate": 1.982885522378746e-05, + "loss": 4.1195, + "mean_token_accuracy": 0.30161290168762206, + "step": 3170 + }, + { + "epoch": 0.1535749643831647, + "grad_norm": 2.6616090481057513, + "learning_rate": 1.9825735617853064e-05, + "loss": 4.0949, + "mean_token_accuracy": 0.30171371102333067, + "step": 3180 + }, + { + "epoch": 0.1540579045227344, + "grad_norm": 3.01874133105245, + "learning_rate": 1.9822588086099425e-05, + "loss": 4.0676, + "mean_token_accuracy": 0.3040322571992874, + "step": 3190 + }, + { + "epoch": 0.1545408446623041, + "grad_norm": 2.675413319261908, + "learning_rate": 1.9819412637472166e-05, + "loss": 4.1273, + "mean_token_accuracy": 0.2976814515888691, + "step": 3200 + }, + { + "epoch": 0.1545408446623041, + "eval_runtime": 7.7928, + "eval_samples_per_second": 379.069, + "eval_steps_per_second": 23.74, + "step": 3200 + }, + { + "epoch": 0.15502378480187382, + "grad_norm": 2.766899311175021, + "learning_rate": 1.9816209280996265e-05, + "loss": 4.0605, + "mean_token_accuracy": 0.30584677308797836, + "step": 3210 + }, + { + "epoch": 0.15550672494144352, + "grad_norm": 2.791158894607005, + "learning_rate": 1.981297802577601e-05, + "loss": 4.1082, + "mean_token_accuracy": 0.29828629121184347, + "step": 3220 + }, + { + "epoch": 0.1559896650810132, + "grad_norm": 3.02082827149008, + "learning_rate": 1.9809718880994984e-05, + "loss": 4.1406, + "mean_token_accuracy": 0.29637096673250196, + "step": 3230 + }, + { + "epoch": 0.1564726052205829, + "grad_norm": 2.759968414381429, + "learning_rate": 1.980643185591603e-05, + "loss": 4.0215, + "mean_token_accuracy": 0.29969757944345476, + "step": 3240 + }, + { + "epoch": 0.1569555453601526, + "grad_norm": 2.683569058322477, + "learning_rate": 1.9803116959881243e-05, + "loss": 4.066, + "mean_token_accuracy": 0.30231854915618894, + "step": 3250 + }, + { + "epoch": 0.1574384854997223, + "grad_norm": 2.5285953993153893, + "learning_rate": 1.9799774202311917e-05, + "loss": 4.1395, + "mean_token_accuracy": 0.30070564299821856, + "step": 3260 + }, + { + "epoch": 0.157921425639292, + "grad_norm": 2.580832491779241, + "learning_rate": 1.979640359270853e-05, + "loss": 4.082, + "mean_token_accuracy": 0.30030242055654527, + "step": 3270 + }, + { + "epoch": 0.15840436577886172, + "grad_norm": 2.7382317260883715, + "learning_rate": 1.9793005140650738e-05, + "loss": 4.1094, + "mean_token_accuracy": 0.29879032224416735, + "step": 3280 + }, + { + "epoch": 0.15888730591843142, + "grad_norm": 2.7128435969484563, + "learning_rate": 1.97895788557973e-05, + "loss": 4.0668, + "mean_token_accuracy": 0.3035282254219055, + "step": 3290 + }, + { + "epoch": 0.1593702460580011, + "grad_norm": 2.662083350826242, + "learning_rate": 1.97861247478861e-05, + "loss": 4.0465, + "mean_token_accuracy": 0.30756048411130904, + "step": 3300 + }, + { + "epoch": 0.1593702460580011, + "eval_runtime": 7.7695, + "eval_samples_per_second": 380.205, + "eval_steps_per_second": 23.811, + "step": 3300 + }, + { + "epoch": 0.1598531861975708, + "grad_norm": 2.6896017438928532, + "learning_rate": 1.9782642826734095e-05, + "loss": 4.125, + "mean_token_accuracy": 0.2977187469601631, + "step": 3310 + }, + { + "epoch": 0.1603361263371405, + "grad_norm": 2.6343597072414275, + "learning_rate": 1.9779133102237285e-05, + "loss": 3.9711, + "mean_token_accuracy": 0.3121975839138031, + "step": 3320 + }, + { + "epoch": 0.1608190664767102, + "grad_norm": 2.6208106907293707, + "learning_rate": 1.977559558437069e-05, + "loss": 4.0887, + "mean_token_accuracy": 0.30060483887791634, + "step": 3330 + }, + { + "epoch": 0.1613020066162799, + "grad_norm": 2.733434376395422, + "learning_rate": 1.9772030283188327e-05, + "loss": 4.032, + "mean_token_accuracy": 0.30191532224416734, + "step": 3340 + }, + { + "epoch": 0.16178494675584962, + "grad_norm": 2.528211328004999, + "learning_rate": 1.9768437208823173e-05, + "loss": 4.0785, + "mean_token_accuracy": 0.30332661122083665, + "step": 3350 + }, + { + "epoch": 0.16226788689541932, + "grad_norm": 2.722087051300455, + "learning_rate": 1.9764816371487137e-05, + "loss": 4.1055, + "mean_token_accuracy": 0.29657257795333863, + "step": 3360 + }, + { + "epoch": 0.16275082703498903, + "grad_norm": 2.5219048235861528, + "learning_rate": 1.976116778147104e-05, + "loss": 4.1, + "mean_token_accuracy": 0.29778225943446157, + "step": 3370 + }, + { + "epoch": 0.1632337671745587, + "grad_norm": 2.680587551133981, + "learning_rate": 1.975749144914457e-05, + "loss": 4.052, + "mean_token_accuracy": 0.2977822571992874, + "step": 3380 + }, + { + "epoch": 0.1637167073141284, + "grad_norm": 2.9112199550423377, + "learning_rate": 1.9753787384956276e-05, + "loss": 4.1535, + "mean_token_accuracy": 0.30362903475761416, + "step": 3390 + }, + { + "epoch": 0.1641996474536981, + "grad_norm": 2.621013954914834, + "learning_rate": 1.9750055599433503e-05, + "loss": 4.0965, + "mean_token_accuracy": 0.2966733857989311, + "step": 3400 + }, + { + "epoch": 0.1641996474536981, + "eval_runtime": 7.838, + "eval_samples_per_second": 376.884, + "eval_steps_per_second": 23.603, + "step": 3400 + }, + { + "epoch": 0.16468258759326782, + "grad_norm": 2.692676483705507, + "learning_rate": 1.9746296103182406e-05, + "loss": 4.068, + "mean_token_accuracy": 0.30221773982048034, + "step": 3410 + }, + { + "epoch": 0.16516552773283752, + "grad_norm": 2.75647521867288, + "learning_rate": 1.974250890688788e-05, + "loss": 4.0297, + "mean_token_accuracy": 0.3034274227917194, + "step": 3420 + }, + { + "epoch": 0.16564846787240722, + "grad_norm": 2.736914462919695, + "learning_rate": 1.973869402131356e-05, + "loss": 4.057, + "mean_token_accuracy": 0.30070564299821856, + "step": 3430 + }, + { + "epoch": 0.16613140801197693, + "grad_norm": 2.7217035002411603, + "learning_rate": 1.9734851457301757e-05, + "loss": 4.0801, + "mean_token_accuracy": 0.29868951588869097, + "step": 3440 + }, + { + "epoch": 0.1666143481515466, + "grad_norm": 2.5327655563603177, + "learning_rate": 1.973098122577347e-05, + "loss": 4.1125, + "mean_token_accuracy": 0.30332661271095274, + "step": 3450 + }, + { + "epoch": 0.1670972882911163, + "grad_norm": 2.655328286104727, + "learning_rate": 1.9727083337728316e-05, + "loss": 4.084, + "mean_token_accuracy": 0.30584677308797836, + "step": 3460 + }, + { + "epoch": 0.167580228430686, + "grad_norm": 2.6751770694202093, + "learning_rate": 1.9723157804244522e-05, + "loss": 4.0687, + "mean_token_accuracy": 0.30262096524238585, + "step": 3470 + }, + { + "epoch": 0.16806316857025572, + "grad_norm": 2.828274606821587, + "learning_rate": 1.9719204636478893e-05, + "loss": 4.0469, + "mean_token_accuracy": 0.3049395151436329, + "step": 3480 + }, + { + "epoch": 0.16854610870982542, + "grad_norm": 2.666808629903219, + "learning_rate": 1.9715223845666754e-05, + "loss": 4.0746, + "mean_token_accuracy": 0.2991935446858406, + "step": 3490 + }, + { + "epoch": 0.16902904884939512, + "grad_norm": 2.775065079718119, + "learning_rate": 1.9711215443121955e-05, + "loss": 4.0316, + "mean_token_accuracy": 0.3129032254219055, + "step": 3500 + }, + { + "epoch": 0.16902904884939512, + "eval_runtime": 7.8054, + "eval_samples_per_second": 378.457, + "eval_steps_per_second": 23.702, + "step": 3500 + }, + { + "epoch": 0.16951198898896483, + "grad_norm": 2.5983820917312026, + "learning_rate": 1.9707179440236815e-05, + "loss": 4.0781, + "mean_token_accuracy": 0.30453629046678543, + "step": 3510 + }, + { + "epoch": 0.1699949291285345, + "grad_norm": 2.61383874753937, + "learning_rate": 1.97031158484821e-05, + "loss": 4.0926, + "mean_token_accuracy": 0.3074596762657166, + "step": 3520 + }, + { + "epoch": 0.1704778692681042, + "grad_norm": 2.613847766199223, + "learning_rate": 1.969902467940698e-05, + "loss": 4.1414, + "mean_token_accuracy": 0.2979838714003563, + "step": 3530 + }, + { + "epoch": 0.1709608094076739, + "grad_norm": 2.4731166444967645, + "learning_rate": 1.9694905944639014e-05, + "loss": 4.0578, + "mean_token_accuracy": 0.3025201618671417, + "step": 3540 + }, + { + "epoch": 0.17144374954724362, + "grad_norm": 2.5600193940786995, + "learning_rate": 1.9690759655884085e-05, + "loss": 4.0527, + "mean_token_accuracy": 0.3074596807360649, + "step": 3550 + }, + { + "epoch": 0.17192668968681332, + "grad_norm": 2.738596210941972, + "learning_rate": 1.9686585824926412e-05, + "loss": 4.1148, + "mean_token_accuracy": 0.29213709831237794, + "step": 3560 + }, + { + "epoch": 0.17240962982638303, + "grad_norm": 2.599359635619837, + "learning_rate": 1.9682384463628477e-05, + "loss": 4.0469, + "mean_token_accuracy": 0.30080644935369494, + "step": 3570 + }, + { + "epoch": 0.17289256996595273, + "grad_norm": 2.691353570094291, + "learning_rate": 1.967815558393101e-05, + "loss": 4.0758, + "mean_token_accuracy": 0.3056451603770256, + "step": 3580 + }, + { + "epoch": 0.1733755101055224, + "grad_norm": 2.7396764808543943, + "learning_rate": 1.967389919785295e-05, + "loss": 4.0512, + "mean_token_accuracy": 0.3, + "step": 3590 + }, + { + "epoch": 0.1738584502450921, + "grad_norm": 2.7179381888390814, + "learning_rate": 1.9669615317491418e-05, + "loss": 4.068, + "mean_token_accuracy": 0.2933467745780945, + "step": 3600 + }, + { + "epoch": 0.1738584502450921, + "eval_runtime": 7.8039, + "eval_samples_per_second": 378.529, + "eval_steps_per_second": 23.706, + "step": 3600 + }, + { + "epoch": 0.17434139038466182, + "grad_norm": 2.7449641837357737, + "learning_rate": 1.966530395502167e-05, + "loss": 4.0566, + "mean_token_accuracy": 0.2998991921544075, + "step": 3610 + }, + { + "epoch": 0.17482433052423152, + "grad_norm": 2.6189015079358233, + "learning_rate": 1.9660965122697067e-05, + "loss": 4.0191, + "mean_token_accuracy": 0.3065524235367775, + "step": 3620 + }, + { + "epoch": 0.17530727066380122, + "grad_norm": 2.9319900971973567, + "learning_rate": 1.965659883284905e-05, + "loss": 4.0141, + "mean_token_accuracy": 0.3042338714003563, + "step": 3630 + }, + { + "epoch": 0.17579021080337093, + "grad_norm": 2.6196512440180566, + "learning_rate": 1.9652205097887097e-05, + "loss": 4.0523, + "mean_token_accuracy": 0.30383064299821855, + "step": 3640 + }, + { + "epoch": 0.17627315094294063, + "grad_norm": 2.618384526978317, + "learning_rate": 1.9647783930298683e-05, + "loss": 4.1137, + "mean_token_accuracy": 0.2987903207540512, + "step": 3650 + }, + { + "epoch": 0.17675609108251034, + "grad_norm": 2.5126722492749822, + "learning_rate": 1.9643335342649253e-05, + "loss": 4.0223, + "mean_token_accuracy": 0.3099798396229744, + "step": 3660 + }, + { + "epoch": 0.17723903122208, + "grad_norm": 2.494061535290806, + "learning_rate": 1.9638859347582176e-05, + "loss": 4.0906, + "mean_token_accuracy": 0.2984879031777382, + "step": 3670 + }, + { + "epoch": 0.17772197136164972, + "grad_norm": 2.550490711171001, + "learning_rate": 1.9634355957818724e-05, + "loss": 4.0457, + "mean_token_accuracy": 0.3073588699102402, + "step": 3680 + }, + { + "epoch": 0.17820491150121942, + "grad_norm": 2.4847633736880357, + "learning_rate": 1.9629825186158033e-05, + "loss": 4.0473, + "mean_token_accuracy": 0.3077620968222618, + "step": 3690 + }, + { + "epoch": 0.17868785164078912, + "grad_norm": 2.7663336779028618, + "learning_rate": 1.962526704547704e-05, + "loss": 4.0941, + "mean_token_accuracy": 0.30010080337524414, + "step": 3700 + }, + { + "epoch": 0.17868785164078912, + "eval_runtime": 7.8038, + "eval_samples_per_second": 378.533, + "eval_steps_per_second": 23.706, + "step": 3700 + }, + { + "epoch": 0.17917079178035883, + "grad_norm": 2.5166344462922665, + "learning_rate": 1.962068154873049e-05, + "loss": 4.0844, + "mean_token_accuracy": 0.3015120968222618, + "step": 3710 + }, + { + "epoch": 0.17965373191992853, + "grad_norm": 2.671642786530828, + "learning_rate": 1.9616068708950865e-05, + "loss": 4.109, + "mean_token_accuracy": 0.29949596524238586, + "step": 3720 + }, + { + "epoch": 0.18013667205949824, + "grad_norm": 2.5424801288239878, + "learning_rate": 1.9611428539248364e-05, + "loss": 4.0547, + "mean_token_accuracy": 0.30453629046678543, + "step": 3730 + }, + { + "epoch": 0.1806196121990679, + "grad_norm": 2.4744737906443475, + "learning_rate": 1.9606761052810858e-05, + "loss": 4.0434, + "mean_token_accuracy": 0.3083669379353523, + "step": 3740 + }, + { + "epoch": 0.18110255233863762, + "grad_norm": 2.661085975386548, + "learning_rate": 1.9602066262903855e-05, + "loss": 4.1684, + "mean_token_accuracy": 0.29294354766607283, + "step": 3750 + }, + { + "epoch": 0.18158549247820732, + "grad_norm": 2.551128771570501, + "learning_rate": 1.9597344182870463e-05, + "loss": 4.0281, + "mean_token_accuracy": 0.3139112889766693, + "step": 3760 + }, + { + "epoch": 0.18206843261777703, + "grad_norm": 2.4986089678252035, + "learning_rate": 1.9592594826131352e-05, + "loss": 4.1246, + "mean_token_accuracy": 0.30141129195690153, + "step": 3770 + }, + { + "epoch": 0.18255137275734673, + "grad_norm": 2.5241612996931884, + "learning_rate": 1.9587818206184718e-05, + "loss": 4.0605, + "mean_token_accuracy": 0.30463709980249404, + "step": 3780 + }, + { + "epoch": 0.18303431289691643, + "grad_norm": 2.574199584048626, + "learning_rate": 1.958301433660623e-05, + "loss": 4.0734, + "mean_token_accuracy": 0.3029233872890472, + "step": 3790 + }, + { + "epoch": 0.18351725303648614, + "grad_norm": 2.6257630018910145, + "learning_rate": 1.9578183231049028e-05, + "loss": 4.0523, + "mean_token_accuracy": 0.3030241936445236, + "step": 3800 + }, + { + "epoch": 0.18351725303648614, + "eval_runtime": 7.7827, + "eval_samples_per_second": 379.557, + "eval_steps_per_second": 23.771, + "step": 3800 + }, + { + "epoch": 0.18400019317605582, + "grad_norm": 2.8490950431431727, + "learning_rate": 1.9573324903243633e-05, + "loss": 4.098, + "mean_token_accuracy": 0.3015120938420296, + "step": 3810 + }, + { + "epoch": 0.18448313331562552, + "grad_norm": 2.6442037232679105, + "learning_rate": 1.956843936699795e-05, + "loss": 4.0773, + "mean_token_accuracy": 0.30846773982048037, + "step": 3820 + }, + { + "epoch": 0.18496607345519522, + "grad_norm": 2.440382964429585, + "learning_rate": 1.9563526636197205e-05, + "loss": 4.0621, + "mean_token_accuracy": 0.31008064597845075, + "step": 3830 + }, + { + "epoch": 0.18544901359476493, + "grad_norm": 2.506411414128234, + "learning_rate": 1.9558586724803926e-05, + "loss": 4.0563, + "mean_token_accuracy": 0.3015120968222618, + "step": 3840 + }, + { + "epoch": 0.18593195373433463, + "grad_norm": 2.4416581277853506, + "learning_rate": 1.9553619646857876e-05, + "loss": 4.05, + "mean_token_accuracy": 0.300504033267498, + "step": 3850 + }, + { + "epoch": 0.18641489387390434, + "grad_norm": 2.538610139614956, + "learning_rate": 1.9548625416476037e-05, + "loss": 4.0324, + "mean_token_accuracy": 0.3061491921544075, + "step": 3860 + }, + { + "epoch": 0.18689783401347404, + "grad_norm": 2.4205853818127934, + "learning_rate": 1.9543604047852565e-05, + "loss": 4.066, + "mean_token_accuracy": 0.3055443525314331, + "step": 3870 + }, + { + "epoch": 0.18738077415304372, + "grad_norm": 2.587017003337788, + "learning_rate": 1.9538555555258737e-05, + "loss": 4.0422, + "mean_token_accuracy": 0.29848790168762207, + "step": 3880 + }, + { + "epoch": 0.18786371429261342, + "grad_norm": 2.500367801105121, + "learning_rate": 1.9533479953042923e-05, + "loss": 4.0195, + "mean_token_accuracy": 0.305745966732502, + "step": 3890 + }, + { + "epoch": 0.18834665443218312, + "grad_norm": 2.4991761246992277, + "learning_rate": 1.9528377255630543e-05, + "loss": 4.0637, + "mean_token_accuracy": 0.3063508063554764, + "step": 3900 + }, + { + "epoch": 0.18834665443218312, + "eval_runtime": 7.7872, + "eval_samples_per_second": 379.342, + "eval_steps_per_second": 23.757, + "step": 3900 + }, + { + "epoch": 0.18882959457175283, + "grad_norm": 2.514616543709038, + "learning_rate": 1.9523247477524024e-05, + "loss": 4.0043, + "mean_token_accuracy": 0.3078629016876221, + "step": 3910 + }, + { + "epoch": 0.18931253471132253, + "grad_norm": 2.521595783266744, + "learning_rate": 1.9518090633302755e-05, + "loss": 4.0191, + "mean_token_accuracy": 0.300806450843811, + "step": 3920 + }, + { + "epoch": 0.18979547485089224, + "grad_norm": 2.4025892379321174, + "learning_rate": 1.9512906737623054e-05, + "loss": 4.0715, + "mean_token_accuracy": 0.30372984111309054, + "step": 3930 + }, + { + "epoch": 0.19027841499046194, + "grad_norm": 2.3736905652478364, + "learning_rate": 1.950769580521812e-05, + "loss": 3.9996, + "mean_token_accuracy": 0.3108870968222618, + "step": 3940 + }, + { + "epoch": 0.19076135513003165, + "grad_norm": 2.5732494964441366, + "learning_rate": 1.9502457850898007e-05, + "loss": 4.0187, + "mean_token_accuracy": 0.31018145084381105, + "step": 3950 + }, + { + "epoch": 0.19124429526960132, + "grad_norm": 2.574772993521237, + "learning_rate": 1.9497192889549544e-05, + "loss": 4.0824, + "mean_token_accuracy": 0.3025201603770256, + "step": 3960 + }, + { + "epoch": 0.19172723540917103, + "grad_norm": 2.403181142089266, + "learning_rate": 1.949190093613633e-05, + "loss": 4.0613, + "mean_token_accuracy": 0.30302419513463974, + "step": 3970 + }, + { + "epoch": 0.19221017554874073, + "grad_norm": 2.6461715859268677, + "learning_rate": 1.948658200569868e-05, + "loss": 4.1051, + "mean_token_accuracy": 0.3, + "step": 3980 + }, + { + "epoch": 0.19269311568831043, + "grad_norm": 2.5242849341583167, + "learning_rate": 1.948123611335358e-05, + "loss": 4.0645, + "mean_token_accuracy": 0.3054435446858406, + "step": 3990 + }, + { + "epoch": 0.19317605582788014, + "grad_norm": 2.534385061138649, + "learning_rate": 1.947586327429464e-05, + "loss": 4.0613, + "mean_token_accuracy": 0.30665321946144103, + "step": 4000 + }, + { + "epoch": 0.19317605582788014, + "eval_runtime": 7.8081, + "eval_samples_per_second": 378.323, + "eval_steps_per_second": 23.693, + "step": 4000 + }, + { + "epoch": 0.19365899596744984, + "grad_norm": 2.516760678379009, + "learning_rate": 1.9470463503792058e-05, + "loss": 4.0867, + "mean_token_accuracy": 0.29929435551166533, + "step": 4010 + }, + { + "epoch": 0.19414193610701955, + "grad_norm": 2.52633694862886, + "learning_rate": 1.9465036817192576e-05, + "loss": 3.966, + "mean_token_accuracy": 0.3123991906642914, + "step": 4020 + }, + { + "epoch": 0.19462487624658922, + "grad_norm": 2.694992357818292, + "learning_rate": 1.9459583229919436e-05, + "loss": 4.0941, + "mean_token_accuracy": 0.30514113008975985, + "step": 4030 + }, + { + "epoch": 0.19510781638615893, + "grad_norm": 2.42494813979197, + "learning_rate": 1.9454102757472325e-05, + "loss": 4.0844, + "mean_token_accuracy": 0.30564516186714175, + "step": 4040 + }, + { + "epoch": 0.19559075652572863, + "grad_norm": 2.7997016430800534, + "learning_rate": 1.9448595415427348e-05, + "loss": 4.0699, + "mean_token_accuracy": 0.30967741906642915, + "step": 4050 + }, + { + "epoch": 0.19607369666529834, + "grad_norm": 2.450927188900928, + "learning_rate": 1.9443061219436984e-05, + "loss": 4.0328, + "mean_token_accuracy": 0.30493951588869095, + "step": 4060 + }, + { + "epoch": 0.19655663680486804, + "grad_norm": 2.4520710916361503, + "learning_rate": 1.943750018523002e-05, + "loss": 4.0465, + "mean_token_accuracy": 0.30141129195690153, + "step": 4070 + }, + { + "epoch": 0.19703957694443774, + "grad_norm": 2.548513085821371, + "learning_rate": 1.9431912328611523e-05, + "loss": 4.0676, + "mean_token_accuracy": 0.30866935551166536, + "step": 4080 + }, + { + "epoch": 0.19752251708400745, + "grad_norm": 2.4211864889240675, + "learning_rate": 1.94262976654628e-05, + "loss": 4.0, + "mean_token_accuracy": 0.30413306653499605, + "step": 4090 + }, + { + "epoch": 0.19800545722357712, + "grad_norm": 2.534300677746175, + "learning_rate": 1.9420656211741335e-05, + "loss": 4.0172, + "mean_token_accuracy": 0.3080645173788071, + "step": 4100 + }, + { + "epoch": 0.19800545722357712, + "eval_runtime": 7.8179, + "eval_samples_per_second": 377.853, + "eval_steps_per_second": 23.664, + "step": 4100 + }, + { + "epoch": 0.19848839736314683, + "grad_norm": 2.4251826832719137, + "learning_rate": 1.9414987983480764e-05, + "loss": 4.0523, + "mean_token_accuracy": 0.30443548411130905, + "step": 4110 + }, + { + "epoch": 0.19897133750271653, + "grad_norm": 2.5359288027916804, + "learning_rate": 1.940929299679081e-05, + "loss": 4.0242, + "mean_token_accuracy": 0.3116935506463051, + "step": 4120 + }, + { + "epoch": 0.19945427764228624, + "grad_norm": 2.41459622269149, + "learning_rate": 1.940357126785725e-05, + "loss": 4.0703, + "mean_token_accuracy": 0.30262096896767615, + "step": 4130 + }, + { + "epoch": 0.19993721778185594, + "grad_norm": 2.548625471100205, + "learning_rate": 1.939782281294187e-05, + "loss": 4.0781, + "mean_token_accuracy": 0.2980846785008907, + "step": 4140 + }, + { + "epoch": 0.20042015792142565, + "grad_norm": 2.5293055980853896, + "learning_rate": 1.9392047648382405e-05, + "loss": 4.0387, + "mean_token_accuracy": 0.3074596792459488, + "step": 4150 + }, + { + "epoch": 0.20090309806099535, + "grad_norm": 2.5377151154619035, + "learning_rate": 1.9386245790592513e-05, + "loss": 4.0367, + "mean_token_accuracy": 0.3019153207540512, + "step": 4160 + }, + { + "epoch": 0.20138603820056503, + "grad_norm": 2.3948393404934385, + "learning_rate": 1.9380417256061707e-05, + "loss": 4.0332, + "mean_token_accuracy": 0.31018145233392713, + "step": 4170 + }, + { + "epoch": 0.20186897834013473, + "grad_norm": 2.4518674883472453, + "learning_rate": 1.9374562061355315e-05, + "loss": 4.0863, + "mean_token_accuracy": 0.30050402879714966, + "step": 4180 + }, + { + "epoch": 0.20235191847970443, + "grad_norm": 2.419370833469503, + "learning_rate": 1.9368680223114457e-05, + "loss": 4.0539, + "mean_token_accuracy": 0.3035282254219055, + "step": 4190 + }, + { + "epoch": 0.20283485861927414, + "grad_norm": 2.4552380224389645, + "learning_rate": 1.9362771758055952e-05, + "loss": 4.0672, + "mean_token_accuracy": 0.30221773982048034, + "step": 4200 + }, + { + "epoch": 0.20283485861927414, + "eval_runtime": 7.8164, + "eval_samples_per_second": 377.925, + "eval_steps_per_second": 23.668, + "step": 4200 + }, + { + "epoch": 0.20331779875884384, + "grad_norm": 2.5764697890208894, + "learning_rate": 1.935683668297231e-05, + "loss": 4.0445, + "mean_token_accuracy": 0.3090725839138031, + "step": 4210 + }, + { + "epoch": 0.20380073889841355, + "grad_norm": 2.5877640791755927, + "learning_rate": 1.9350875014731664e-05, + "loss": 4.0734, + "mean_token_accuracy": 0.30030241757631304, + "step": 4220 + }, + { + "epoch": 0.20428367903798325, + "grad_norm": 2.5672705433903324, + "learning_rate": 1.9344886770277735e-05, + "loss": 4.0711, + "mean_token_accuracy": 0.29818548560142516, + "step": 4230 + }, + { + "epoch": 0.20476661917755296, + "grad_norm": 2.6070097591406136, + "learning_rate": 1.9338871966629767e-05, + "loss": 4.0594, + "mean_token_accuracy": 0.2996975779533386, + "step": 4240 + }, + { + "epoch": 0.20524955931712263, + "grad_norm": 2.5056060856865385, + "learning_rate": 1.9332830620882493e-05, + "loss": 4.0156, + "mean_token_accuracy": 0.30766128599643705, + "step": 4250 + }, + { + "epoch": 0.20573249945669234, + "grad_norm": 2.536954497339716, + "learning_rate": 1.9326762750206082e-05, + "loss": 4.0355, + "mean_token_accuracy": 0.30493951588869095, + "step": 4260 + }, + { + "epoch": 0.20621543959626204, + "grad_norm": 2.483717821604835, + "learning_rate": 1.9320668371846087e-05, + "loss": 3.9477, + "mean_token_accuracy": 0.3126008063554764, + "step": 4270 + }, + { + "epoch": 0.20669837973583174, + "grad_norm": 2.439588924034685, + "learning_rate": 1.9314547503123396e-05, + "loss": 4.0891, + "mean_token_accuracy": 0.3006048366427422, + "step": 4280 + }, + { + "epoch": 0.20718131987540145, + "grad_norm": 2.512676808493711, + "learning_rate": 1.93084001614342e-05, + "loss": 4.093, + "mean_token_accuracy": 0.29627016335725787, + "step": 4290 + }, + { + "epoch": 0.20766426001497115, + "grad_norm": 2.5338270946674397, + "learning_rate": 1.930222636424991e-05, + "loss": 4.0172, + "mean_token_accuracy": 0.2998991921544075, + "step": 4300 + }, + { + "epoch": 0.20766426001497115, + "eval_runtime": 7.8133, + "eval_samples_per_second": 378.072, + "eval_steps_per_second": 23.677, + "step": 4300 + }, + { + "epoch": 0.20814720015454086, + "grad_norm": 2.3935398550737017, + "learning_rate": 1.929602612911714e-05, + "loss": 4.1211, + "mean_token_accuracy": 0.2969758063554764, + "step": 4310 + }, + { + "epoch": 0.20863014029411053, + "grad_norm": 2.5121903523760496, + "learning_rate": 1.928979947365764e-05, + "loss": 3.9719, + "mean_token_accuracy": 0.3152217730879784, + "step": 4320 + }, + { + "epoch": 0.20911308043368024, + "grad_norm": 2.3193582128852923, + "learning_rate": 1.928354641556824e-05, + "loss": 4.0297, + "mean_token_accuracy": 0.3068548396229744, + "step": 4330 + }, + { + "epoch": 0.20959602057324994, + "grad_norm": 2.3912353508871553, + "learning_rate": 1.9277266972620828e-05, + "loss": 4.0625, + "mean_token_accuracy": 0.30030241757631304, + "step": 4340 + }, + { + "epoch": 0.21007896071281965, + "grad_norm": 2.4598227549349585, + "learning_rate": 1.927096116266226e-05, + "loss": 4.0727, + "mean_token_accuracy": 0.30231854766607286, + "step": 4350 + }, + { + "epoch": 0.21056190085238935, + "grad_norm": 2.461571886061888, + "learning_rate": 1.9264629003614352e-05, + "loss": 4.034, + "mean_token_accuracy": 0.3034274160861969, + "step": 4360 + }, + { + "epoch": 0.21104484099195905, + "grad_norm": 2.3895560003156677, + "learning_rate": 1.9258270513473788e-05, + "loss": 4.0336, + "mean_token_accuracy": 0.3137096747756004, + "step": 4370 + }, + { + "epoch": 0.21152778113152876, + "grad_norm": 2.501270819608181, + "learning_rate": 1.9251885710312096e-05, + "loss": 4.0816, + "mean_token_accuracy": 0.2969758063554764, + "step": 4380 + }, + { + "epoch": 0.21201072127109843, + "grad_norm": 2.4225682062797573, + "learning_rate": 1.9245474612275583e-05, + "loss": 4.009, + "mean_token_accuracy": 0.3053427398204803, + "step": 4390 + }, + { + "epoch": 0.21249366141066814, + "grad_norm": 2.5966437690923576, + "learning_rate": 1.92390372375853e-05, + "loss": 4.0742, + "mean_token_accuracy": 0.30655242055654525, + "step": 4400 + }, + { + "epoch": 0.21249366141066814, + "eval_runtime": 7.7937, + "eval_samples_per_second": 379.026, + "eval_steps_per_second": 23.737, + "step": 4400 + }, + { + "epoch": 0.21297660155023784, + "grad_norm": 2.4000872847358217, + "learning_rate": 1.923257360453697e-05, + "loss": 4.0328, + "mean_token_accuracy": 0.30141128972172737, + "step": 4410 + }, + { + "epoch": 0.21345954168980755, + "grad_norm": 2.4318047007054773, + "learning_rate": 1.922608373150095e-05, + "loss": 3.9641, + "mean_token_accuracy": 0.31270161271095276, + "step": 4420 + }, + { + "epoch": 0.21394248182937725, + "grad_norm": 2.3537304374562993, + "learning_rate": 1.921956763692217e-05, + "loss": 4.0258, + "mean_token_accuracy": 0.30967741906642915, + "step": 4430 + }, + { + "epoch": 0.21442542196894696, + "grad_norm": 2.4907276209319975, + "learning_rate": 1.9213025339320083e-05, + "loss": 4.0141, + "mean_token_accuracy": 0.30927419364452363, + "step": 4440 + }, + { + "epoch": 0.21490836210851666, + "grad_norm": 2.3957895079994493, + "learning_rate": 1.920645685728862e-05, + "loss": 4.0098, + "mean_token_accuracy": 0.3104838699102402, + "step": 4450 + }, + { + "epoch": 0.21539130224808634, + "grad_norm": 2.544550132805664, + "learning_rate": 1.919986220949613e-05, + "loss": 4.0543, + "mean_token_accuracy": 0.3125, + "step": 4460 + }, + { + "epoch": 0.21587424238765604, + "grad_norm": 2.4014975958752403, + "learning_rate": 1.9193241414685318e-05, + "loss": 4.0, + "mean_token_accuracy": 0.30322580635547636, + "step": 4470 + }, + { + "epoch": 0.21635718252722574, + "grad_norm": 2.466015174751556, + "learning_rate": 1.9186594491673217e-05, + "loss": 3.9961, + "mean_token_accuracy": 0.31471773982048035, + "step": 4480 + }, + { + "epoch": 0.21684012266679545, + "grad_norm": 2.5329011496134886, + "learning_rate": 1.917992145935111e-05, + "loss": 4.1074, + "mean_token_accuracy": 0.30383064299821855, + "step": 4490 + }, + { + "epoch": 0.21732306280636515, + "grad_norm": 2.3974894070580572, + "learning_rate": 1.9173222336684492e-05, + "loss": 4.0496, + "mean_token_accuracy": 0.3053427398204803, + "step": 4500 + }, + { + "epoch": 0.21732306280636515, + "eval_runtime": 7.7965, + "eval_samples_per_second": 378.888, + "eval_steps_per_second": 23.729, + "step": 4500 + }, + { + "epoch": 0.21780600294593486, + "grad_norm": 2.3904189884183205, + "learning_rate": 1.9166497142712995e-05, + "loss": 4.0852, + "mean_token_accuracy": 0.302620966732502, + "step": 4510 + }, + { + "epoch": 0.21828894308550456, + "grad_norm": 2.512258121567953, + "learning_rate": 1.9159745896550367e-05, + "loss": 4.032, + "mean_token_accuracy": 0.3066532239317894, + "step": 4520 + }, + { + "epoch": 0.21877188322507427, + "grad_norm": 2.4135957188161914, + "learning_rate": 1.915296861738439e-05, + "loss": 4.073, + "mean_token_accuracy": 0.31098789870738985, + "step": 4530 + }, + { + "epoch": 0.21925482336464394, + "grad_norm": 2.4447776108764976, + "learning_rate": 1.914616532447683e-05, + "loss": 4.0187, + "mean_token_accuracy": 0.3093750014901161, + "step": 4540 + }, + { + "epoch": 0.21973776350421365, + "grad_norm": 2.494262130906879, + "learning_rate": 1.9139336037163394e-05, + "loss": 4.0305, + "mean_token_accuracy": 0.3131048381328583, + "step": 4550 + }, + { + "epoch": 0.22022070364378335, + "grad_norm": 2.318031299502848, + "learning_rate": 1.913248077485367e-05, + "loss": 4.0391, + "mean_token_accuracy": 0.30957661420106886, + "step": 4560 + }, + { + "epoch": 0.22070364378335305, + "grad_norm": 2.634787930670605, + "learning_rate": 1.912559955703106e-05, + "loss": 3.9695, + "mean_token_accuracy": 0.312298384308815, + "step": 4570 + }, + { + "epoch": 0.22118658392292276, + "grad_norm": 2.489743329988923, + "learning_rate": 1.9118692403252747e-05, + "loss": 3.9906, + "mean_token_accuracy": 0.3118951588869095, + "step": 4580 + }, + { + "epoch": 0.22166952406249246, + "grad_norm": 2.473567565951065, + "learning_rate": 1.9111759333149615e-05, + "loss": 4.0449, + "mean_token_accuracy": 0.30614919364452364, + "step": 4590 + }, + { + "epoch": 0.22215246420206217, + "grad_norm": 2.397229662717697, + "learning_rate": 1.9104800366426216e-05, + "loss": 4.0438, + "mean_token_accuracy": 0.30816532075405123, + "step": 4600 + }, + { + "epoch": 0.22215246420206217, + "eval_runtime": 7.7744, + "eval_samples_per_second": 379.963, + "eval_steps_per_second": 23.796, + "step": 4600 + }, + { + "epoch": 0.22263540434163184, + "grad_norm": 2.409335257933568, + "learning_rate": 1.9097815522860692e-05, + "loss": 3.9949, + "mean_token_accuracy": 0.31088709384202956, + "step": 4610 + }, + { + "epoch": 0.22311834448120155, + "grad_norm": 2.4997405268347372, + "learning_rate": 1.909080482230474e-05, + "loss": 4.0527, + "mean_token_accuracy": 0.3063508063554764, + "step": 4620 + }, + { + "epoch": 0.22360128462077125, + "grad_norm": 2.442181270108447, + "learning_rate": 1.9083768284683533e-05, + "loss": 4.0812, + "mean_token_accuracy": 0.3057459682226181, + "step": 4630 + }, + { + "epoch": 0.22408422476034096, + "grad_norm": 2.4272279353962065, + "learning_rate": 1.907670592999569e-05, + "loss": 4.0672, + "mean_token_accuracy": 0.3069556444883347, + "step": 4640 + }, + { + "epoch": 0.22456716489991066, + "grad_norm": 2.405488540267964, + "learning_rate": 1.9069617778313196e-05, + "loss": 4.059, + "mean_token_accuracy": 0.3022177413105965, + "step": 4650 + }, + { + "epoch": 0.22505010503948036, + "grad_norm": 2.312409121561029, + "learning_rate": 1.9062503849781356e-05, + "loss": 4.0168, + "mean_token_accuracy": 0.31098790317773817, + "step": 4660 + }, + { + "epoch": 0.22553304517905007, + "grad_norm": 2.598683537126093, + "learning_rate": 1.9055364164618738e-05, + "loss": 4.0605, + "mean_token_accuracy": 0.30705645084381106, + "step": 4670 + }, + { + "epoch": 0.22601598531861974, + "grad_norm": 2.391835886913309, + "learning_rate": 1.904819874311711e-05, + "loss": 3.9828, + "mean_token_accuracy": 0.31723790615797043, + "step": 4680 + }, + { + "epoch": 0.22649892545818945, + "grad_norm": 2.4726092766464505, + "learning_rate": 1.9041007605641387e-05, + "loss": 3.9828, + "mean_token_accuracy": 0.30987902730703354, + "step": 4690 + }, + { + "epoch": 0.22698186559775915, + "grad_norm": 2.280213411304886, + "learning_rate": 1.9033790772629566e-05, + "loss": 4.1113, + "mean_token_accuracy": 0.30614919513463973, + "step": 4700 + }, + { + "epoch": 0.22698186559775915, + "eval_runtime": 7.8102, + "eval_samples_per_second": 378.223, + "eval_steps_per_second": 23.687, + "step": 4700 + }, + { + "epoch": 0.22746480573732886, + "grad_norm": 2.3442566846349977, + "learning_rate": 1.9026548264592682e-05, + "loss": 4.0648, + "mean_token_accuracy": 0.30282258093357084, + "step": 4710 + }, + { + "epoch": 0.22794774587689856, + "grad_norm": 2.4219609056524773, + "learning_rate": 1.9019280102114743e-05, + "loss": 3.9504, + "mean_token_accuracy": 0.30967741906642915, + "step": 4720 + }, + { + "epoch": 0.22843068601646827, + "grad_norm": 2.5403332570120254, + "learning_rate": 1.9011986305852656e-05, + "loss": 4.043, + "mean_token_accuracy": 0.31088709384202956, + "step": 4730 + }, + { + "epoch": 0.22891362615603797, + "grad_norm": 2.469312361045575, + "learning_rate": 1.90046668965362e-05, + "loss": 4.1277, + "mean_token_accuracy": 0.29979838207364085, + "step": 4740 + }, + { + "epoch": 0.22939656629560765, + "grad_norm": 2.4236133149561994, + "learning_rate": 1.8997321894967927e-05, + "loss": 4.1148, + "mean_token_accuracy": 0.29919354766607287, + "step": 4750 + }, + { + "epoch": 0.22987950643517735, + "grad_norm": 2.5307277765411715, + "learning_rate": 1.898995132202315e-05, + "loss": 4.0148, + "mean_token_accuracy": 0.3024193555116653, + "step": 4760 + }, + { + "epoch": 0.23036244657474705, + "grad_norm": 2.462445980377924, + "learning_rate": 1.8982555198649843e-05, + "loss": 4.0047, + "mean_token_accuracy": 0.30796370953321456, + "step": 4770 + }, + { + "epoch": 0.23084538671431676, + "grad_norm": 2.499054204821308, + "learning_rate": 1.8975133545868595e-05, + "loss": 4.0594, + "mean_token_accuracy": 0.30181451588869096, + "step": 4780 + }, + { + "epoch": 0.23132832685388646, + "grad_norm": 2.3215590357733156, + "learning_rate": 1.8967686384772566e-05, + "loss": 3.9836, + "mean_token_accuracy": 0.3108870968222618, + "step": 4790 + }, + { + "epoch": 0.23181126699345617, + "grad_norm": 2.4739045579549193, + "learning_rate": 1.8960213736527403e-05, + "loss": 4.027, + "mean_token_accuracy": 0.306955648958683, + "step": 4800 + }, + { + "epoch": 0.23181126699345617, + "eval_runtime": 7.7829, + "eval_samples_per_second": 379.549, + "eval_steps_per_second": 23.77, + "step": 4800 + }, + { + "epoch": 0.23229420713302587, + "grad_norm": 2.42768862130812, + "learning_rate": 1.8952715622371183e-05, + "loss": 3.9781, + "mean_token_accuracy": 0.3152217775583267, + "step": 4810 + }, + { + "epoch": 0.23277714727259557, + "grad_norm": 2.3453972872455546, + "learning_rate": 1.8945192063614384e-05, + "loss": 4.1219, + "mean_token_accuracy": 0.29899193346500397, + "step": 4820 + }, + { + "epoch": 0.23326008741216525, + "grad_norm": 2.322289983616877, + "learning_rate": 1.893764308163978e-05, + "loss": 4.0668, + "mean_token_accuracy": 0.30332661271095274, + "step": 4830 + }, + { + "epoch": 0.23374302755173496, + "grad_norm": 2.3257425058685555, + "learning_rate": 1.8930068697902405e-05, + "loss": 3.9918, + "mean_token_accuracy": 0.30614919513463973, + "step": 4840 + }, + { + "epoch": 0.23422596769130466, + "grad_norm": 2.453908435447591, + "learning_rate": 1.892246893392949e-05, + "loss": 3.9914, + "mean_token_accuracy": 0.31834677755832674, + "step": 4850 + }, + { + "epoch": 0.23470890783087436, + "grad_norm": 2.304456121714858, + "learning_rate": 1.89148438113204e-05, + "loss": 3.9898, + "mean_token_accuracy": 0.3147177428007126, + "step": 4860 + }, + { + "epoch": 0.23519184797044407, + "grad_norm": 2.317812799182742, + "learning_rate": 1.8907193351746567e-05, + "loss": 4.0, + "mean_token_accuracy": 0.3084677428007126, + "step": 4870 + }, + { + "epoch": 0.23567478811001377, + "grad_norm": 2.416255377936248, + "learning_rate": 1.8899517576951438e-05, + "loss": 4.0121, + "mean_token_accuracy": 0.31360886693000795, + "step": 4880 + }, + { + "epoch": 0.23615772824958348, + "grad_norm": 2.377659782204465, + "learning_rate": 1.889181650875041e-05, + "loss": 3.9875, + "mean_token_accuracy": 0.31219758093357086, + "step": 4890 + }, + { + "epoch": 0.23664066838915315, + "grad_norm": 2.409949073682637, + "learning_rate": 1.888409016903076e-05, + "loss": 4.082, + "mean_token_accuracy": 0.30181451737880705, + "step": 4900 + }, + { + "epoch": 0.23664066838915315, + "eval_runtime": 7.7906, + "eval_samples_per_second": 379.177, + "eval_steps_per_second": 23.747, + "step": 4900 + }, + { + "epoch": 0.23712360852872286, + "grad_norm": 2.5229186101860743, + "learning_rate": 1.8876338579751604e-05, + "loss": 3.9629, + "mean_token_accuracy": 0.31350806504487994, + "step": 4910 + }, + { + "epoch": 0.23760654866829256, + "grad_norm": 2.4904309689203314, + "learning_rate": 1.8868561762943796e-05, + "loss": 4.059, + "mean_token_accuracy": 0.30635080486536026, + "step": 4920 + }, + { + "epoch": 0.23808948880786227, + "grad_norm": 2.5161866260229986, + "learning_rate": 1.886075974070991e-05, + "loss": 4.0559, + "mean_token_accuracy": 0.3110887065529823, + "step": 4930 + }, + { + "epoch": 0.23857242894743197, + "grad_norm": 2.3755953881544096, + "learning_rate": 1.8852932535224152e-05, + "loss": 4.0434, + "mean_token_accuracy": 0.30453629046678543, + "step": 4940 + }, + { + "epoch": 0.23905536908700167, + "grad_norm": 2.2664228117675336, + "learning_rate": 1.884508016873229e-05, + "loss": 3.9973, + "mean_token_accuracy": 0.31199596971273424, + "step": 4950 + }, + { + "epoch": 0.23953830922657138, + "grad_norm": 2.333137006574579, + "learning_rate": 1.8837202663551623e-05, + "loss": 4.0246, + "mean_token_accuracy": 0.3083669349551201, + "step": 4960 + }, + { + "epoch": 0.24002124936614105, + "grad_norm": 2.311318342982302, + "learning_rate": 1.882930004207088e-05, + "loss": 3.9625, + "mean_token_accuracy": 0.3132056400179863, + "step": 4970 + }, + { + "epoch": 0.24050418950571076, + "grad_norm": 2.302982732308885, + "learning_rate": 1.8821372326750175e-05, + "loss": 4.0613, + "mean_token_accuracy": 0.30715726017951966, + "step": 4980 + }, + { + "epoch": 0.24098712964528046, + "grad_norm": 2.394695239765169, + "learning_rate": 1.881341954012095e-05, + "loss": 4.0289, + "mean_token_accuracy": 0.3076612904667854, + "step": 4990 + }, + { + "epoch": 0.24147006978485017, + "grad_norm": 2.4550501235440985, + "learning_rate": 1.88054417047859e-05, + "loss": 4.0441, + "mean_token_accuracy": 0.30292338579893113, + "step": 5000 + }, + { + "epoch": 0.24147006978485017, + "eval_runtime": 7.8018, + "eval_samples_per_second": 378.632, + "eval_steps_per_second": 23.713, + "step": 5000 + }, + { + "epoch": 0.24195300992441987, + "grad_norm": 2.5326888838733743, + "learning_rate": 1.8797438843418906e-05, + "loss": 4.0234, + "mean_token_accuracy": 0.3087701633572578, + "step": 5010 + }, + { + "epoch": 0.24243595006398957, + "grad_norm": 2.363472762144466, + "learning_rate": 1.8789410978764972e-05, + "loss": 4.0305, + "mean_token_accuracy": 0.30917338728904725, + "step": 5020 + }, + { + "epoch": 0.24291889020355928, + "grad_norm": 2.5754690547802315, + "learning_rate": 1.878135813364018e-05, + "loss": 4.1043, + "mean_token_accuracy": 0.29858870804309845, + "step": 5030 + }, + { + "epoch": 0.24340183034312896, + "grad_norm": 2.473461141961251, + "learning_rate": 1.87732803309316e-05, + "loss": 4.0371, + "mean_token_accuracy": 0.3082661285996437, + "step": 5040 + }, + { + "epoch": 0.24388477048269866, + "grad_norm": 2.3109907906029945, + "learning_rate": 1.8765177593597225e-05, + "loss": 4.0055, + "mean_token_accuracy": 0.30252016335725784, + "step": 5050 + }, + { + "epoch": 0.24436771062226836, + "grad_norm": 2.490930908371213, + "learning_rate": 1.875704994466593e-05, + "loss": 4.1031, + "mean_token_accuracy": 0.30524193644523623, + "step": 5060 + }, + { + "epoch": 0.24485065076183807, + "grad_norm": 2.3890853308173745, + "learning_rate": 1.874889740723739e-05, + "loss": 4.0176, + "mean_token_accuracy": 0.3140120953321457, + "step": 5070 + }, + { + "epoch": 0.24533359090140777, + "grad_norm": 2.310256812584567, + "learning_rate": 1.8740720004482003e-05, + "loss": 3.9996, + "mean_token_accuracy": 0.31270160973072053, + "step": 5080 + }, + { + "epoch": 0.24581653104097748, + "grad_norm": 2.3601953691482374, + "learning_rate": 1.873251775964085e-05, + "loss": 4.0211, + "mean_token_accuracy": 0.3055443540215492, + "step": 5090 + }, + { + "epoch": 0.24629947118054718, + "grad_norm": 2.493184998826223, + "learning_rate": 1.8724290696025606e-05, + "loss": 4.0047, + "mean_token_accuracy": 0.3021169379353523, + "step": 5100 + }, + { + "epoch": 0.24629947118054718, + "eval_runtime": 7.7994, + "eval_samples_per_second": 378.746, + "eval_steps_per_second": 23.72, + "step": 5100 + }, + { + "epoch": 0.24678241132011688, + "grad_norm": 2.3883737233663416, + "learning_rate": 1.8716038837018496e-05, + "loss": 3.9617, + "mean_token_accuracy": 0.3137096792459488, + "step": 5110 + }, + { + "epoch": 0.24726535145968656, + "grad_norm": 2.407067649736428, + "learning_rate": 1.8707762206072203e-05, + "loss": 4.0223, + "mean_token_accuracy": 0.3102822542190552, + "step": 5120 + }, + { + "epoch": 0.24774829159925627, + "grad_norm": 2.331924706482883, + "learning_rate": 1.8699460826709828e-05, + "loss": 3.9387, + "mean_token_accuracy": 0.31340725868940356, + "step": 5130 + }, + { + "epoch": 0.24823123173882597, + "grad_norm": 2.398068312035197, + "learning_rate": 1.8691134722524794e-05, + "loss": 4.0398, + "mean_token_accuracy": 0.3116935476660728, + "step": 5140 + }, + { + "epoch": 0.24871417187839567, + "grad_norm": 2.341430158591099, + "learning_rate": 1.8682783917180808e-05, + "loss": 4.0211, + "mean_token_accuracy": 0.31653226017951963, + "step": 5150 + }, + { + "epoch": 0.24919711201796538, + "grad_norm": 2.1777730832714246, + "learning_rate": 1.8674408434411778e-05, + "loss": 3.9914, + "mean_token_accuracy": 0.31169354915618896, + "step": 5160 + }, + { + "epoch": 0.24968005215753508, + "grad_norm": 2.368915441612645, + "learning_rate": 1.8666008298021738e-05, + "loss": 3.9723, + "mean_token_accuracy": 0.31290322840213775, + "step": 5170 + }, + { + "epoch": 0.25016299229710476, + "grad_norm": 2.4409459468408214, + "learning_rate": 1.8657583531884804e-05, + "loss": 4.0809, + "mean_token_accuracy": 0.3044354856014252, + "step": 5180 + }, + { + "epoch": 0.2506459324366745, + "grad_norm": 2.2763184014052045, + "learning_rate": 1.8649134159945083e-05, + "loss": 3.9469, + "mean_token_accuracy": 0.3229838714003563, + "step": 5190 + }, + { + "epoch": 0.25112887257624417, + "grad_norm": 2.3311617057759557, + "learning_rate": 1.8640660206216622e-05, + "loss": 4.0914, + "mean_token_accuracy": 0.30272177457809446, + "step": 5200 + }, + { + "epoch": 0.25112887257624417, + "eval_runtime": 7.7925, + "eval_samples_per_second": 379.084, + "eval_steps_per_second": 23.741, + "step": 5200 + }, + { + "epoch": 0.2516118127158139, + "grad_norm": 2.309320261908768, + "learning_rate": 1.863216169478332e-05, + "loss": 4.0309, + "mean_token_accuracy": 0.3045362874865532, + "step": 5210 + }, + { + "epoch": 0.2520947528553836, + "grad_norm": 2.3046826489539063, + "learning_rate": 1.8623638649798886e-05, + "loss": 4.0602, + "mean_token_accuracy": 0.31320564448833466, + "step": 5220 + }, + { + "epoch": 0.25257769299495325, + "grad_norm": 2.3767757884245615, + "learning_rate": 1.8615091095486745e-05, + "loss": 4.1125, + "mean_token_accuracy": 0.29727822095155715, + "step": 5230 + }, + { + "epoch": 0.253060633134523, + "grad_norm": 2.3274956242869926, + "learning_rate": 1.860651905613999e-05, + "loss": 3.993, + "mean_token_accuracy": 0.30947580337524416, + "step": 5240 + }, + { + "epoch": 0.25354357327409266, + "grad_norm": 2.3604895458445903, + "learning_rate": 1.859792255612129e-05, + "loss": 4.0355, + "mean_token_accuracy": 0.305544351041317, + "step": 5250 + }, + { + "epoch": 0.2540265134136624, + "grad_norm": 2.340736150567122, + "learning_rate": 1.8589301619862852e-05, + "loss": 3.984, + "mean_token_accuracy": 0.30967742055654524, + "step": 5260 + }, + { + "epoch": 0.25450945355323207, + "grad_norm": 2.5036716573170503, + "learning_rate": 1.8580656271866317e-05, + "loss": 4.0289, + "mean_token_accuracy": 0.3067540302872658, + "step": 5270 + }, + { + "epoch": 0.2549923936928018, + "grad_norm": 2.603590775132617, + "learning_rate": 1.857198653670271e-05, + "loss": 4.0816, + "mean_token_accuracy": 0.29989919513463975, + "step": 5280 + }, + { + "epoch": 0.2554753338323715, + "grad_norm": 2.3112711829783756, + "learning_rate": 1.8563292439012376e-05, + "loss": 4.0676, + "mean_token_accuracy": 0.3093749970197678, + "step": 5290 + }, + { + "epoch": 0.25595827397194115, + "grad_norm": 2.399626459631329, + "learning_rate": 1.8554574003504893e-05, + "loss": 4.0102, + "mean_token_accuracy": 0.30700733959674836, + "step": 5300 + }, + { + "epoch": 0.25595827397194115, + "eval_runtime": 7.7952, + "eval_samples_per_second": 378.951, + "eval_steps_per_second": 23.733, + "step": 5300 + }, + { + "epoch": 0.2564412141115109, + "grad_norm": 2.3447856428896325, + "learning_rate": 1.8545831254959014e-05, + "loss": 4.0438, + "mean_token_accuracy": 0.30171371102333067, + "step": 5310 + }, + { + "epoch": 0.25692415425108056, + "grad_norm": 2.367630972367812, + "learning_rate": 1.8537064218222586e-05, + "loss": 4.0352, + "mean_token_accuracy": 0.30766129195690156, + "step": 5320 + }, + { + "epoch": 0.2574070943906503, + "grad_norm": 2.3729306020193657, + "learning_rate": 1.8528272918212487e-05, + "loss": 4.0512, + "mean_token_accuracy": 0.3099798411130905, + "step": 5330 + }, + { + "epoch": 0.25789003453021997, + "grad_norm": 2.314830670765685, + "learning_rate": 1.851945737991457e-05, + "loss": 3.9586, + "mean_token_accuracy": 0.3130040317773819, + "step": 5340 + }, + { + "epoch": 0.2583729746697897, + "grad_norm": 2.4771361605823046, + "learning_rate": 1.8510617628383544e-05, + "loss": 3.9879, + "mean_token_accuracy": 0.31411290168762207, + "step": 5350 + }, + { + "epoch": 0.2588559148093594, + "grad_norm": 2.332880643688743, + "learning_rate": 1.850175368874297e-05, + "loss": 4.0176, + "mean_token_accuracy": 0.3073588699102402, + "step": 5360 + }, + { + "epoch": 0.25933885494892905, + "grad_norm": 2.279526720713524, + "learning_rate": 1.8492865586185127e-05, + "loss": 4.0496, + "mean_token_accuracy": 0.30866935551166536, + "step": 5370 + }, + { + "epoch": 0.2598217950884988, + "grad_norm": 2.2715043499782044, + "learning_rate": 1.8483953345970983e-05, + "loss": 4.0449, + "mean_token_accuracy": 0.2966733857989311, + "step": 5380 + }, + { + "epoch": 0.26030473522806846, + "grad_norm": 2.417856469263586, + "learning_rate": 1.8475016993430102e-05, + "loss": 4.0316, + "mean_token_accuracy": 0.3053427442908287, + "step": 5390 + }, + { + "epoch": 0.2607876753676382, + "grad_norm": 2.3939657066886317, + "learning_rate": 1.8466056553960576e-05, + "loss": 3.9688, + "mean_token_accuracy": 0.31421370804309845, + "step": 5400 + }, + { + "epoch": 0.2607876753676382, + "eval_runtime": 7.7982, + "eval_samples_per_second": 378.807, + "eval_steps_per_second": 23.723, + "step": 5400 + }, + { + "epoch": 0.26127061550720787, + "grad_norm": 2.3629090392686325, + "learning_rate": 1.8457072053028962e-05, + "loss": 3.9867, + "mean_token_accuracy": 0.31522177457809447, + "step": 5410 + }, + { + "epoch": 0.2617535556467776, + "grad_norm": 2.3251406340563907, + "learning_rate": 1.84480635161702e-05, + "loss": 3.9727, + "mean_token_accuracy": 0.3161290377378464, + "step": 5420 + }, + { + "epoch": 0.2622364957863473, + "grad_norm": 2.337857664117226, + "learning_rate": 1.843903096898753e-05, + "loss": 3.9801, + "mean_token_accuracy": 0.3078629061579704, + "step": 5430 + }, + { + "epoch": 0.26271943592591696, + "grad_norm": 2.4673230321655577, + "learning_rate": 1.8429974437152454e-05, + "loss": 4.0332, + "mean_token_accuracy": 0.31491935551166533, + "step": 5440 + }, + { + "epoch": 0.2632023760654867, + "grad_norm": 2.1333952726018865, + "learning_rate": 1.8420893946404623e-05, + "loss": 4.0789, + "mean_token_accuracy": 0.3047379031777382, + "step": 5450 + }, + { + "epoch": 0.26368531620505636, + "grad_norm": 2.3968585259124864, + "learning_rate": 1.841178952255179e-05, + "loss": 3.943, + "mean_token_accuracy": 0.31602822691202165, + "step": 5460 + }, + { + "epoch": 0.2641682563446261, + "grad_norm": 2.3898328278252765, + "learning_rate": 1.840266119146973e-05, + "loss": 4.0254, + "mean_token_accuracy": 0.31471773982048035, + "step": 5470 + }, + { + "epoch": 0.26465119648419577, + "grad_norm": 2.3747369956171855, + "learning_rate": 1.8393508979102163e-05, + "loss": 3.9707, + "mean_token_accuracy": 0.31129032373428345, + "step": 5480 + }, + { + "epoch": 0.2651341366237655, + "grad_norm": 2.3128205536022657, + "learning_rate": 1.8384332911460672e-05, + "loss": 3.9594, + "mean_token_accuracy": 0.3176411300897598, + "step": 5490 + }, + { + "epoch": 0.2656170767633352, + "grad_norm": 2.379800505518838, + "learning_rate": 1.8375133014624654e-05, + "loss": 3.966, + "mean_token_accuracy": 0.3194556459784508, + "step": 5500 + }, + { + "epoch": 0.2656170767633352, + "eval_runtime": 7.7902, + "eval_samples_per_second": 379.196, + "eval_steps_per_second": 23.748, + "step": 5500 + }, + { + "epoch": 0.2661000169029049, + "grad_norm": 2.3470868818778694, + "learning_rate": 1.8365909314741232e-05, + "loss": 4.0016, + "mean_token_accuracy": 0.30826612561941147, + "step": 5510 + }, + { + "epoch": 0.2665829570424746, + "grad_norm": 2.3216580874961776, + "learning_rate": 1.8356661838025162e-05, + "loss": 4.0672, + "mean_token_accuracy": 0.3048387080430984, + "step": 5520 + }, + { + "epoch": 0.26706589718204427, + "grad_norm": 2.317526685490564, + "learning_rate": 1.8347390610758798e-05, + "loss": 4.0211, + "mean_token_accuracy": 0.3072580650448799, + "step": 5530 + }, + { + "epoch": 0.267548837321614, + "grad_norm": 2.364153300933969, + "learning_rate": 1.833809565929198e-05, + "loss": 3.9344, + "mean_token_accuracy": 0.3083669379353523, + "step": 5540 + }, + { + "epoch": 0.2680317774611837, + "grad_norm": 2.3604797273813602, + "learning_rate": 1.832877701004198e-05, + "loss": 4.0441, + "mean_token_accuracy": 0.30907257944345473, + "step": 5550 + }, + { + "epoch": 0.2685147176007534, + "grad_norm": 2.3251109589491397, + "learning_rate": 1.8319434689493424e-05, + "loss": 4.0059, + "mean_token_accuracy": 0.3059475839138031, + "step": 5560 + }, + { + "epoch": 0.2689976577403231, + "grad_norm": 2.2868448114703916, + "learning_rate": 1.8310068724198213e-05, + "loss": 3.9762, + "mean_token_accuracy": 0.3176411271095276, + "step": 5570 + }, + { + "epoch": 0.2694805978798928, + "grad_norm": 2.4192872652574082, + "learning_rate": 1.830067914077545e-05, + "loss": 4.0133, + "mean_token_accuracy": 0.30695564597845076, + "step": 5580 + }, + { + "epoch": 0.2699635380194625, + "grad_norm": 2.3705732549526077, + "learning_rate": 1.8291265965911358e-05, + "loss": 3.9937, + "mean_token_accuracy": 0.3160282254219055, + "step": 5590 + }, + { + "epoch": 0.27044647815903217, + "grad_norm": 2.329064013791676, + "learning_rate": 1.8281829226359216e-05, + "loss": 3.9742, + "mean_token_accuracy": 0.3175403237342834, + "step": 5600 + }, + { + "epoch": 0.27044647815903217, + "eval_runtime": 7.7957, + "eval_samples_per_second": 378.927, + "eval_steps_per_second": 23.731, + "step": 5600 + }, + { + "epoch": 0.2709294182986019, + "grad_norm": 2.3676441570772373, + "learning_rate": 1.827236894893927e-05, + "loss": 3.991, + "mean_token_accuracy": 0.3101814493536949, + "step": 5610 + }, + { + "epoch": 0.2714123584381716, + "grad_norm": 2.275190405273513, + "learning_rate": 1.8262885160538676e-05, + "loss": 4.0, + "mean_token_accuracy": 0.3052419304847717, + "step": 5620 + }, + { + "epoch": 0.2718952985777413, + "grad_norm": 2.3252384232163097, + "learning_rate": 1.825337788811139e-05, + "loss": 4.0281, + "mean_token_accuracy": 0.3046370968222618, + "step": 5630 + }, + { + "epoch": 0.272378238717311, + "grad_norm": 2.3927840022508207, + "learning_rate": 1.8243847158678133e-05, + "loss": 4.0023, + "mean_token_accuracy": 0.31330645009875296, + "step": 5640 + }, + { + "epoch": 0.2728611788568807, + "grad_norm": 2.334558087459042, + "learning_rate": 1.8234292999326277e-05, + "loss": 4.0418, + "mean_token_accuracy": 0.3017137125134468, + "step": 5650 + }, + { + "epoch": 0.2733441189964504, + "grad_norm": 2.2926672240841426, + "learning_rate": 1.8224715437209798e-05, + "loss": 4.0664, + "mean_token_accuracy": 0.3090725809335709, + "step": 5660 + }, + { + "epoch": 0.27382705913602007, + "grad_norm": 2.2422588275577477, + "learning_rate": 1.8215114499549176e-05, + "loss": 4.0184, + "mean_token_accuracy": 0.30625, + "step": 5670 + }, + { + "epoch": 0.2743099992755898, + "grad_norm": 2.2692370292422153, + "learning_rate": 1.8205490213631328e-05, + "loss": 4.0055, + "mean_token_accuracy": 0.30907257944345473, + "step": 5680 + }, + { + "epoch": 0.2747929394151595, + "grad_norm": 2.3184674910987595, + "learning_rate": 1.8195842606809536e-05, + "loss": 3.9398, + "mean_token_accuracy": 0.31834677755832674, + "step": 5690 + }, + { + "epoch": 0.2752758795547292, + "grad_norm": 2.4104798829582395, + "learning_rate": 1.8186171706503354e-05, + "loss": 4.0023, + "mean_token_accuracy": 0.3135080635547638, + "step": 5700 + }, + { + "epoch": 0.2752758795547292, + "eval_runtime": 7.8, + "eval_samples_per_second": 378.718, + "eval_steps_per_second": 23.718, + "step": 5700 + }, + { + "epoch": 0.2757588196942989, + "grad_norm": 2.771761647922274, + "learning_rate": 1.8176477540198547e-05, + "loss": 4.0348, + "mean_token_accuracy": 0.30937499850988387, + "step": 5710 + }, + { + "epoch": 0.2762417598338686, + "grad_norm": 2.3544110496302055, + "learning_rate": 1.816676013544699e-05, + "loss": 3.9828, + "mean_token_accuracy": 0.3074596792459488, + "step": 5720 + }, + { + "epoch": 0.2767246999734383, + "grad_norm": 2.3375642096045075, + "learning_rate": 1.815701951986662e-05, + "loss": 3.9117, + "mean_token_accuracy": 0.31118951439857484, + "step": 5730 + }, + { + "epoch": 0.27720764011300797, + "grad_norm": 2.3809022847712193, + "learning_rate": 1.814725572114134e-05, + "loss": 4.0848, + "mean_token_accuracy": 0.29979838281869886, + "step": 5740 + }, + { + "epoch": 0.2776905802525777, + "grad_norm": 2.300537379180946, + "learning_rate": 1.813746876702093e-05, + "loss": 4.025, + "mean_token_accuracy": 0.305745966732502, + "step": 5750 + }, + { + "epoch": 0.2781735203921474, + "grad_norm": 2.3679605419463248, + "learning_rate": 1.8127658685320996e-05, + "loss": 3.9797, + "mean_token_accuracy": 0.31431451737880706, + "step": 5760 + }, + { + "epoch": 0.2786564605317171, + "grad_norm": 2.219694442047168, + "learning_rate": 1.8117825503922858e-05, + "loss": 4.0605, + "mean_token_accuracy": 0.309879033267498, + "step": 5770 + }, + { + "epoch": 0.2791394006712868, + "grad_norm": 2.457312995322759, + "learning_rate": 1.81079692507735e-05, + "loss": 4.0773, + "mean_token_accuracy": 0.30645161420106887, + "step": 5780 + }, + { + "epoch": 0.2796223408108565, + "grad_norm": 2.376516180002499, + "learning_rate": 1.809808995388548e-05, + "loss": 4.1172, + "mean_token_accuracy": 0.29667338728904724, + "step": 5790 + }, + { + "epoch": 0.2801052809504262, + "grad_norm": 2.34405592895596, + "learning_rate": 1.8088187641336846e-05, + "loss": 3.9707, + "mean_token_accuracy": 0.3206653207540512, + "step": 5800 + }, + { + "epoch": 0.2801052809504262, + "eval_runtime": 7.8092, + "eval_samples_per_second": 378.27, + "eval_steps_per_second": 23.69, + "step": 5800 + }, + { + "epoch": 0.28058822108999587, + "grad_norm": 2.4583421902969524, + "learning_rate": 1.8078262341271044e-05, + "loss": 3.9945, + "mean_token_accuracy": 0.31401209980249406, + "step": 5810 + }, + { + "epoch": 0.2810711612295656, + "grad_norm": 2.3363306807246365, + "learning_rate": 1.8068314081896877e-05, + "loss": 3.9973, + "mean_token_accuracy": 0.30897177159786227, + "step": 5820 + }, + { + "epoch": 0.2815541013691353, + "grad_norm": 2.448743622770045, + "learning_rate": 1.8058342891488392e-05, + "loss": 4.0687, + "mean_token_accuracy": 0.3038306459784508, + "step": 5830 + }, + { + "epoch": 0.282037041508705, + "grad_norm": 2.3124185813096902, + "learning_rate": 1.8048348798384802e-05, + "loss": 3.9844, + "mean_token_accuracy": 0.31068548411130903, + "step": 5840 + }, + { + "epoch": 0.2825199816482747, + "grad_norm": 2.273467667922356, + "learning_rate": 1.8038331830990416e-05, + "loss": 4.0, + "mean_token_accuracy": 0.3031250014901161, + "step": 5850 + }, + { + "epoch": 0.2830029217878444, + "grad_norm": 2.3418750143560194, + "learning_rate": 1.8028292017774556e-05, + "loss": 3.9664, + "mean_token_accuracy": 0.3056451566517353, + "step": 5860 + }, + { + "epoch": 0.2834858619274141, + "grad_norm": 2.4395724436683146, + "learning_rate": 1.8018229387271472e-05, + "loss": 4.0223, + "mean_token_accuracy": 0.3108870968222618, + "step": 5870 + }, + { + "epoch": 0.28396880206698377, + "grad_norm": 2.3306774837969146, + "learning_rate": 1.8008143968080273e-05, + "loss": 3.9578, + "mean_token_accuracy": 0.31622983813285827, + "step": 5880 + }, + { + "epoch": 0.2844517422065535, + "grad_norm": 2.3061022402294635, + "learning_rate": 1.7998035788864815e-05, + "loss": 3.9645, + "mean_token_accuracy": 0.3126008063554764, + "step": 5890 + }, + { + "epoch": 0.2849346823461232, + "grad_norm": 2.350985131599099, + "learning_rate": 1.798790487835366e-05, + "loss": 3.9238, + "mean_token_accuracy": 0.3196572557091713, + "step": 5900 + }, + { + "epoch": 0.2849346823461232, + "eval_runtime": 7.7908, + "eval_samples_per_second": 379.167, + "eval_steps_per_second": 23.746, + "step": 5900 + }, + { + "epoch": 0.2854176224856929, + "grad_norm": 2.301420782412212, + "learning_rate": 1.7977751265339967e-05, + "loss": 4.068, + "mean_token_accuracy": 0.30937499850988387, + "step": 5910 + }, + { + "epoch": 0.2859005626252626, + "grad_norm": 2.2861918585040044, + "learning_rate": 1.796757497868142e-05, + "loss": 3.9934, + "mean_token_accuracy": 0.30947580486536025, + "step": 5920 + }, + { + "epoch": 0.2863835027648323, + "grad_norm": 2.3410887630206685, + "learning_rate": 1.7957376047300135e-05, + "loss": 3.9977, + "mean_token_accuracy": 0.3123991936445236, + "step": 5930 + }, + { + "epoch": 0.286866442904402, + "grad_norm": 2.3398464731768764, + "learning_rate": 1.7947154500182605e-05, + "loss": 4.0187, + "mean_token_accuracy": 0.3138104796409607, + "step": 5940 + }, + { + "epoch": 0.2873493830439717, + "grad_norm": 2.2449216061438557, + "learning_rate": 1.7936910366379587e-05, + "loss": 3.9395, + "mean_token_accuracy": 0.32268145233392714, + "step": 5950 + }, + { + "epoch": 0.2878323231835414, + "grad_norm": 2.318184758343338, + "learning_rate": 1.7926643675006027e-05, + "loss": 4.0031, + "mean_token_accuracy": 0.30221773982048034, + "step": 5960 + }, + { + "epoch": 0.2883152633231111, + "grad_norm": 2.2948538362379534, + "learning_rate": 1.7916354455241e-05, + "loss": 3.9879, + "mean_token_accuracy": 0.3088709682226181, + "step": 5970 + }, + { + "epoch": 0.2887982034626808, + "grad_norm": 2.299339809625044, + "learning_rate": 1.7906042736327583e-05, + "loss": 3.9789, + "mean_token_accuracy": 0.3131048396229744, + "step": 5980 + }, + { + "epoch": 0.2892811436022505, + "grad_norm": 2.3323406552667016, + "learning_rate": 1.7895708547572828e-05, + "loss": 4.0512, + "mean_token_accuracy": 0.3063508063554764, + "step": 5990 + }, + { + "epoch": 0.2897640837418202, + "grad_norm": 2.366436181394848, + "learning_rate": 1.7885351918347625e-05, + "loss": 3.993, + "mean_token_accuracy": 0.31149193793535235, + "step": 6000 + }, + { + "epoch": 0.2897640837418202, + "eval_runtime": 7.8014, + "eval_samples_per_second": 378.652, + "eval_steps_per_second": 23.714, + "step": 6000 + }, + { + "epoch": 0.2902470238813899, + "grad_norm": 2.3568261989143098, + "learning_rate": 1.7874972878086653e-05, + "loss": 3.9637, + "mean_token_accuracy": 0.31441532224416735, + "step": 6010 + }, + { + "epoch": 0.2907299640209596, + "grad_norm": 2.3092386997302032, + "learning_rate": 1.7864571456288286e-05, + "loss": 3.9551, + "mean_token_accuracy": 0.3186491966247559, + "step": 6020 + }, + { + "epoch": 0.2912129041605293, + "grad_norm": 2.425376774244814, + "learning_rate": 1.7854147682514505e-05, + "loss": 3.9777, + "mean_token_accuracy": 0.3087701633572578, + "step": 6030 + }, + { + "epoch": 0.291695844300099, + "grad_norm": 2.1874338268109907, + "learning_rate": 1.7843701586390815e-05, + "loss": 4.0047, + "mean_token_accuracy": 0.3167338714003563, + "step": 6040 + }, + { + "epoch": 0.2921787844396687, + "grad_norm": 2.2979702515249962, + "learning_rate": 1.783323319760618e-05, + "loss": 4.0727, + "mean_token_accuracy": 0.3062499985098839, + "step": 6050 + }, + { + "epoch": 0.2926617245792384, + "grad_norm": 2.3467958301553713, + "learning_rate": 1.7822742545912897e-05, + "loss": 3.9926, + "mean_token_accuracy": 0.31118951439857484, + "step": 6060 + }, + { + "epoch": 0.2931446647188081, + "grad_norm": 2.160488858973282, + "learning_rate": 1.7812229661126554e-05, + "loss": 4.0066, + "mean_token_accuracy": 0.3072580650448799, + "step": 6070 + }, + { + "epoch": 0.2936276048583778, + "grad_norm": 2.359186013504172, + "learning_rate": 1.7801694573125927e-05, + "loss": 4.0707, + "mean_token_accuracy": 0.3089717760682106, + "step": 6080 + }, + { + "epoch": 0.29411054499794753, + "grad_norm": 2.273310488578305, + "learning_rate": 1.779113731185289e-05, + "loss": 3.9664, + "mean_token_accuracy": 0.3099798396229744, + "step": 6090 + }, + { + "epoch": 0.2945934851375172, + "grad_norm": 2.313191289463342, + "learning_rate": 1.7780557907312338e-05, + "loss": 3.95, + "mean_token_accuracy": 0.3112903207540512, + "step": 6100 + }, + { + "epoch": 0.2945934851375172, + "eval_runtime": 7.797, + "eval_samples_per_second": 378.861, + "eval_steps_per_second": 23.727, + "step": 6100 + }, + { + "epoch": 0.2950764252770869, + "grad_norm": 2.4133379570023874, + "learning_rate": 1.7769956389572103e-05, + "loss": 3.9832, + "mean_token_accuracy": 0.31421370804309845, + "step": 6110 + }, + { + "epoch": 0.2955593654166566, + "grad_norm": 2.4454949190189743, + "learning_rate": 1.775933278876286e-05, + "loss": 3.9859, + "mean_token_accuracy": 0.31350806802511216, + "step": 6120 + }, + { + "epoch": 0.2960423055562263, + "grad_norm": 2.3296075809571026, + "learning_rate": 1.7748687135078048e-05, + "loss": 3.9578, + "mean_token_accuracy": 0.32026209533214567, + "step": 6130 + }, + { + "epoch": 0.296525245695796, + "grad_norm": 2.396511974580495, + "learning_rate": 1.773801945877378e-05, + "loss": 4.002, + "mean_token_accuracy": 0.3178427442908287, + "step": 6140 + }, + { + "epoch": 0.2970081858353657, + "grad_norm": 2.203523945577575, + "learning_rate": 1.772732979016877e-05, + "loss": 3.9719, + "mean_token_accuracy": 0.3086693525314331, + "step": 6150 + }, + { + "epoch": 0.29749112597493543, + "grad_norm": 2.360976073490464, + "learning_rate": 1.7716618159644228e-05, + "loss": 4.0012, + "mean_token_accuracy": 0.3068548411130905, + "step": 6160 + }, + { + "epoch": 0.2979740661145051, + "grad_norm": 2.3598974847464174, + "learning_rate": 1.7705884597643783e-05, + "loss": 3.9164, + "mean_token_accuracy": 0.3168346807360649, + "step": 6170 + }, + { + "epoch": 0.2984570062540748, + "grad_norm": 2.398539588723626, + "learning_rate": 1.7695129134673396e-05, + "loss": 3.9957, + "mean_token_accuracy": 0.3072580620646477, + "step": 6180 + }, + { + "epoch": 0.2989399463936445, + "grad_norm": 2.293329734876782, + "learning_rate": 1.768435180130127e-05, + "loss": 3.9508, + "mean_token_accuracy": 0.3196572601795197, + "step": 6190 + }, + { + "epoch": 0.2994228865332142, + "grad_norm": 2.174807099599974, + "learning_rate": 1.767355262815778e-05, + "loss": 3.9727, + "mean_token_accuracy": 0.3173387095332146, + "step": 6200 + }, + { + "epoch": 0.2994228865332142, + "eval_runtime": 7.8005, + "eval_samples_per_second": 378.695, + "eval_steps_per_second": 23.717, + "step": 6200 + }, + { + "epoch": 0.2999058266727839, + "grad_norm": 2.2990879033306793, + "learning_rate": 1.766273164593535e-05, + "loss": 4.0105, + "mean_token_accuracy": 0.3074596762657166, + "step": 6210 + }, + { + "epoch": 0.3003887668123536, + "grad_norm": 2.622382771294755, + "learning_rate": 1.76518888853884e-05, + "loss": 3.9035, + "mean_token_accuracy": 0.3196572601795197, + "step": 6220 + }, + { + "epoch": 0.30087170695192333, + "grad_norm": 2.2749746177018655, + "learning_rate": 1.7641024377333254e-05, + "loss": 4.025, + "mean_token_accuracy": 0.3048387080430984, + "step": 6230 + }, + { + "epoch": 0.301354647091493, + "grad_norm": 2.2327096640255313, + "learning_rate": 1.7630138152648036e-05, + "loss": 3.9844, + "mean_token_accuracy": 0.31300403326749804, + "step": 6240 + }, + { + "epoch": 0.3018375872310627, + "grad_norm": 2.227496928899117, + "learning_rate": 1.7619230242272586e-05, + "loss": 3.9461, + "mean_token_accuracy": 0.3133064553141594, + "step": 6250 + }, + { + "epoch": 0.3023205273706324, + "grad_norm": 2.3784554510990104, + "learning_rate": 1.760830067720838e-05, + "loss": 3.925, + "mean_token_accuracy": 0.3194556474685669, + "step": 6260 + }, + { + "epoch": 0.3028034675102021, + "grad_norm": 2.2579992858785003, + "learning_rate": 1.7597349488518452e-05, + "loss": 3.9316, + "mean_token_accuracy": 0.318548384308815, + "step": 6270 + }, + { + "epoch": 0.3032864076497718, + "grad_norm": 2.294492670101738, + "learning_rate": 1.7586376707327273e-05, + "loss": 4.0078, + "mean_token_accuracy": 0.3031249962747097, + "step": 6280 + }, + { + "epoch": 0.3037693477893415, + "grad_norm": 2.2467390160866145, + "learning_rate": 1.7575382364820697e-05, + "loss": 3.9383, + "mean_token_accuracy": 0.3201612919569016, + "step": 6290 + }, + { + "epoch": 0.30425228792891124, + "grad_norm": 2.301583546317634, + "learning_rate": 1.756436649224585e-05, + "loss": 4.0238, + "mean_token_accuracy": 0.30715725272893907, + "step": 6300 + }, + { + "epoch": 0.30425228792891124, + "eval_runtime": 7.8096, + "eval_samples_per_second": 378.254, + "eval_steps_per_second": 23.689, + "step": 6300 + }, + { + "epoch": 0.3047352280684809, + "grad_norm": 2.282605851373963, + "learning_rate": 1.7553329120911052e-05, + "loss": 3.9641, + "mean_token_accuracy": 0.3104838699102402, + "step": 6310 + }, + { + "epoch": 0.3052181682080506, + "grad_norm": 2.273058203677823, + "learning_rate": 1.7542270282185724e-05, + "loss": 3.9902, + "mean_token_accuracy": 0.31068548560142517, + "step": 6320 + }, + { + "epoch": 0.3057011083476203, + "grad_norm": 2.3399972661739454, + "learning_rate": 1.75311900075003e-05, + "loss": 3.8977, + "mean_token_accuracy": 0.31441532373428344, + "step": 6330 + }, + { + "epoch": 0.30618404848719, + "grad_norm": 2.2333003307684054, + "learning_rate": 1.7520088328346138e-05, + "loss": 3.9914, + "mean_token_accuracy": 0.3073588743805885, + "step": 6340 + }, + { + "epoch": 0.30666698862675973, + "grad_norm": 2.3255225647707514, + "learning_rate": 1.7508965276275424e-05, + "loss": 4.0156, + "mean_token_accuracy": 0.31048387438058855, + "step": 6350 + }, + { + "epoch": 0.3071499287663294, + "grad_norm": 2.243317344334465, + "learning_rate": 1.7497820882901098e-05, + "loss": 4.0066, + "mean_token_accuracy": 0.3132056474685669, + "step": 6360 + }, + { + "epoch": 0.30763286890589914, + "grad_norm": 2.2924556285616275, + "learning_rate": 1.7486655179896747e-05, + "loss": 4.0406, + "mean_token_accuracy": 0.30181451588869096, + "step": 6370 + }, + { + "epoch": 0.3081158090454688, + "grad_norm": 2.3063926676659676, + "learning_rate": 1.7475468198996525e-05, + "loss": 3.923, + "mean_token_accuracy": 0.3242943540215492, + "step": 6380 + }, + { + "epoch": 0.3085987491850385, + "grad_norm": 2.391693480237752, + "learning_rate": 1.746425997199506e-05, + "loss": 3.8898, + "mean_token_accuracy": 0.31885080933570864, + "step": 6390 + }, + { + "epoch": 0.3090816893246082, + "grad_norm": 2.3597982085372484, + "learning_rate": 1.7453030530747364e-05, + "loss": 4.0312, + "mean_token_accuracy": 0.30544354766607285, + "step": 6400 + }, + { + "epoch": 0.3090816893246082, + "eval_runtime": 7.8112, + "eval_samples_per_second": 378.173, + "eval_steps_per_second": 23.684, + "step": 6400 + }, + { + "epoch": 0.3095646294641779, + "grad_norm": 2.194724301086377, + "learning_rate": 1.7441779907168745e-05, + "loss": 3.9488, + "mean_token_accuracy": 0.32197580933570863, + "step": 6410 + }, + { + "epoch": 0.31004756960374763, + "grad_norm": 2.522540882686619, + "learning_rate": 1.7430508133234702e-05, + "loss": 3.9648, + "mean_token_accuracy": 0.31048387140035627, + "step": 6420 + }, + { + "epoch": 0.3105305097433173, + "grad_norm": 2.3446865957219734, + "learning_rate": 1.741921524098086e-05, + "loss": 4.0629, + "mean_token_accuracy": 0.3073588714003563, + "step": 6430 + }, + { + "epoch": 0.31101344988288704, + "grad_norm": 2.267605067107876, + "learning_rate": 1.7407901262502855e-05, + "loss": 3.9762, + "mean_token_accuracy": 0.30846773982048037, + "step": 6440 + }, + { + "epoch": 0.3114963900224567, + "grad_norm": 2.2985752161065007, + "learning_rate": 1.739656622995626e-05, + "loss": 3.9434, + "mean_token_accuracy": 0.3181451603770256, + "step": 6450 + }, + { + "epoch": 0.3119793301620264, + "grad_norm": 2.22405506620997, + "learning_rate": 1.738521017555648e-05, + "loss": 4.0066, + "mean_token_accuracy": 0.3078629031777382, + "step": 6460 + }, + { + "epoch": 0.3124622703015961, + "grad_norm": 2.4051041058811533, + "learning_rate": 1.7373833131578667e-05, + "loss": 4.048, + "mean_token_accuracy": 0.3053427435457706, + "step": 6470 + }, + { + "epoch": 0.3129452104411658, + "grad_norm": 2.290105618837381, + "learning_rate": 1.7362435130357633e-05, + "loss": 4.0113, + "mean_token_accuracy": 0.3079637110233307, + "step": 6480 + }, + { + "epoch": 0.31342815058073553, + "grad_norm": 2.3770429874571852, + "learning_rate": 1.735101620428774e-05, + "loss": 3.9371, + "mean_token_accuracy": 0.31985886842012407, + "step": 6490 + }, + { + "epoch": 0.3139110907203052, + "grad_norm": 2.1643821886932604, + "learning_rate": 1.7339576385822837e-05, + "loss": 3.9945, + "mean_token_accuracy": 0.31401209682226183, + "step": 6500 + }, + { + "epoch": 0.3139110907203052, + "eval_runtime": 7.782, + "eval_samples_per_second": 379.593, + "eval_steps_per_second": 23.773, + "step": 6500 + }, + { + "epoch": 0.31439403085987494, + "grad_norm": 2.33468782646229, + "learning_rate": 1.7328115707476143e-05, + "loss": 3.9449, + "mean_token_accuracy": 0.3156249985098839, + "step": 6510 + }, + { + "epoch": 0.3148769709994446, + "grad_norm": 2.2244561010714414, + "learning_rate": 1.731663420182016e-05, + "loss": 3.9426, + "mean_token_accuracy": 0.3172379083931446, + "step": 6520 + }, + { + "epoch": 0.3153599111390143, + "grad_norm": 2.1989794630573445, + "learning_rate": 1.7305131901486594e-05, + "loss": 3.9879, + "mean_token_accuracy": 0.3126008093357086, + "step": 6530 + }, + { + "epoch": 0.315842851278584, + "grad_norm": 2.28891198117997, + "learning_rate": 1.729360883916624e-05, + "loss": 3.9672, + "mean_token_accuracy": 0.3138104841113091, + "step": 6540 + }, + { + "epoch": 0.3163257914181537, + "grad_norm": 2.3101288529255846, + "learning_rate": 1.7282065047608906e-05, + "loss": 3.9621, + "mean_token_accuracy": 0.31703629046678544, + "step": 6550 + }, + { + "epoch": 0.31680873155772343, + "grad_norm": 2.416214398465251, + "learning_rate": 1.7270500559623315e-05, + "loss": 4.0199, + "mean_token_accuracy": 0.3084677457809448, + "step": 6560 + }, + { + "epoch": 0.3172916716972931, + "grad_norm": 2.2385735416834502, + "learning_rate": 1.7258915408077014e-05, + "loss": 3.9742, + "mean_token_accuracy": 0.30806451588869094, + "step": 6570 + }, + { + "epoch": 0.31777461183686284, + "grad_norm": 2.3290882483172166, + "learning_rate": 1.7247309625896275e-05, + "loss": 4.0238, + "mean_token_accuracy": 0.3094758078455925, + "step": 6580 + }, + { + "epoch": 0.3182575519764325, + "grad_norm": 2.3401643027405843, + "learning_rate": 1.7235683246066004e-05, + "loss": 3.9742, + "mean_token_accuracy": 0.316431450843811, + "step": 6590 + }, + { + "epoch": 0.3187404921160022, + "grad_norm": 2.1333934219974733, + "learning_rate": 1.722403630162965e-05, + "loss": 3.9773, + "mean_token_accuracy": 0.31088709384202956, + "step": 6600 + }, + { + "epoch": 0.3187404921160022, + "eval_runtime": 7.778, + "eval_samples_per_second": 379.788, + "eval_steps_per_second": 23.785, + "step": 6600 + }, + { + "epoch": 0.3192234322555719, + "grad_norm": 2.2208459993722136, + "learning_rate": 1.7212368825689103e-05, + "loss": 3.9668, + "mean_token_accuracy": 0.31512096524238586, + "step": 6610 + }, + { + "epoch": 0.3197063723951416, + "grad_norm": 2.359267365961413, + "learning_rate": 1.7200680851404618e-05, + "loss": 3.9828, + "mean_token_accuracy": 0.3094758063554764, + "step": 6620 + }, + { + "epoch": 0.32018931253471133, + "grad_norm": 2.216233674902618, + "learning_rate": 1.7188972411994692e-05, + "loss": 3.9285, + "mean_token_accuracy": 0.3195564553141594, + "step": 6630 + }, + { + "epoch": 0.320672252674281, + "grad_norm": 2.502862582329525, + "learning_rate": 1.7177243540736e-05, + "loss": 4.0441, + "mean_token_accuracy": 0.3050403207540512, + "step": 6640 + }, + { + "epoch": 0.32115519281385074, + "grad_norm": 2.437597504964185, + "learning_rate": 1.7165494270963286e-05, + "loss": 4.0027, + "mean_token_accuracy": 0.3105846792459488, + "step": 6650 + }, + { + "epoch": 0.3216381329534204, + "grad_norm": 2.313393770355039, + "learning_rate": 1.7153724636069258e-05, + "loss": 3.9859, + "mean_token_accuracy": 0.3132056459784508, + "step": 6660 + }, + { + "epoch": 0.32212107309299015, + "grad_norm": 2.3105426977274743, + "learning_rate": 1.7141934669504514e-05, + "loss": 4.0621, + "mean_token_accuracy": 0.29979838579893114, + "step": 6670 + }, + { + "epoch": 0.3226040132325598, + "grad_norm": 2.3483809214252642, + "learning_rate": 1.7130124404777428e-05, + "loss": 3.9746, + "mean_token_accuracy": 0.3112903252243996, + "step": 6680 + }, + { + "epoch": 0.3230869533721295, + "grad_norm": 2.1739337127979077, + "learning_rate": 1.7118293875454077e-05, + "loss": 3.9113, + "mean_token_accuracy": 0.318044351041317, + "step": 6690 + }, + { + "epoch": 0.32356989351169924, + "grad_norm": 2.322400227042309, + "learning_rate": 1.7106443115158114e-05, + "loss": 4.0137, + "mean_token_accuracy": 0.31391129046678545, + "step": 6700 + }, + { + "epoch": 0.32356989351169924, + "eval_runtime": 7.7776, + "eval_samples_per_second": 379.81, + "eval_steps_per_second": 23.786, + "step": 6700 + }, + { + "epoch": 0.3240528336512689, + "grad_norm": 2.352950881936604, + "learning_rate": 1.709457215757071e-05, + "loss": 3.9652, + "mean_token_accuracy": 0.309879033267498, + "step": 6710 + }, + { + "epoch": 0.32453577379083864, + "grad_norm": 2.2879967792380693, + "learning_rate": 1.7082681036430426e-05, + "loss": 4.0363, + "mean_token_accuracy": 0.30816532373428346, + "step": 6720 + }, + { + "epoch": 0.3250187139304083, + "grad_norm": 2.3073019576661684, + "learning_rate": 1.707076978553313e-05, + "loss": 4.0613, + "mean_token_accuracy": 0.30947580486536025, + "step": 6730 + }, + { + "epoch": 0.32550165406997805, + "grad_norm": 2.355077349448005, + "learning_rate": 1.705883843873191e-05, + "loss": 3.9422, + "mean_token_accuracy": 0.3134072571992874, + "step": 6740 + }, + { + "epoch": 0.32598459420954773, + "grad_norm": 2.242615298248909, + "learning_rate": 1.7046887029936962e-05, + "loss": 3.9125, + "mean_token_accuracy": 0.31693548411130906, + "step": 6750 + }, + { + "epoch": 0.3264675343491174, + "grad_norm": 2.3218418604028583, + "learning_rate": 1.7034915593115502e-05, + "loss": 4.0098, + "mean_token_accuracy": 0.3086693540215492, + "step": 6760 + }, + { + "epoch": 0.32695047448868714, + "grad_norm": 2.2112851894813974, + "learning_rate": 1.7022924162291667e-05, + "loss": 3.9285, + "mean_token_accuracy": 0.3201612919569016, + "step": 6770 + }, + { + "epoch": 0.3274334146282568, + "grad_norm": 2.1722341674024768, + "learning_rate": 1.701091277154642e-05, + "loss": 3.9543, + "mean_token_accuracy": 0.323387099802494, + "step": 6780 + }, + { + "epoch": 0.32791635476782655, + "grad_norm": 2.3522008685694122, + "learning_rate": 1.6998881455017448e-05, + "loss": 3.9836, + "mean_token_accuracy": 0.30493951588869095, + "step": 6790 + }, + { + "epoch": 0.3283992949073962, + "grad_norm": 2.172941073489028, + "learning_rate": 1.698683024689908e-05, + "loss": 3.9684, + "mean_token_accuracy": 0.31602822840213773, + "step": 6800 + }, + { + "epoch": 0.3283992949073962, + "eval_runtime": 7.778, + "eval_samples_per_second": 379.789, + "eval_steps_per_second": 23.785, + "step": 6800 + }, + { + "epoch": 0.32888223504696595, + "grad_norm": 2.2967390741857394, + "learning_rate": 1.6974759181442165e-05, + "loss": 4.0223, + "mean_token_accuracy": 0.30917338877916334, + "step": 6810 + }, + { + "epoch": 0.32936517518653563, + "grad_norm": 2.3853410377504636, + "learning_rate": 1.6962668292954004e-05, + "loss": 4.0043, + "mean_token_accuracy": 0.30957661122083663, + "step": 6820 + }, + { + "epoch": 0.3298481153261053, + "grad_norm": 2.328304698510418, + "learning_rate": 1.695055761579823e-05, + "loss": 3.9078, + "mean_token_accuracy": 0.32137096524238584, + "step": 6830 + }, + { + "epoch": 0.33033105546567504, + "grad_norm": 2.4057333311056124, + "learning_rate": 1.693842718439471e-05, + "loss": 3.9516, + "mean_token_accuracy": 0.3177419379353523, + "step": 6840 + }, + { + "epoch": 0.3308139956052447, + "grad_norm": 2.336966053167253, + "learning_rate": 1.692627703321946e-05, + "loss": 4.0426, + "mean_token_accuracy": 0.3061491891741753, + "step": 6850 + }, + { + "epoch": 0.33129693574481445, + "grad_norm": 2.3512761115617726, + "learning_rate": 1.691410719680455e-05, + "loss": 3.932, + "mean_token_accuracy": 0.3092741951346397, + "step": 6860 + }, + { + "epoch": 0.3317798758843841, + "grad_norm": 2.2427326694619465, + "learning_rate": 1.6901917709737988e-05, + "loss": 4.0445, + "mean_token_accuracy": 0.3020161330699921, + "step": 6870 + }, + { + "epoch": 0.33226281602395386, + "grad_norm": 2.2925829819531813, + "learning_rate": 1.688970860666364e-05, + "loss": 4.0129, + "mean_token_accuracy": 0.3090725839138031, + "step": 6880 + }, + { + "epoch": 0.33274575616352353, + "grad_norm": 2.238877296471795, + "learning_rate": 1.687747992228111e-05, + "loss": 4.0363, + "mean_token_accuracy": 0.307762099802494, + "step": 6890 + }, + { + "epoch": 0.3332286963030932, + "grad_norm": 2.318578708273, + "learning_rate": 1.6865231691345663e-05, + "loss": 3.9574, + "mean_token_accuracy": 0.315625, + "step": 6900 + }, + { + "epoch": 0.3332286963030932, + "eval_runtime": 7.7861, + "eval_samples_per_second": 379.393, + "eval_steps_per_second": 23.76, + "step": 6900 + }, + { + "epoch": 0.33371163644266294, + "grad_norm": 2.4347999850906823, + "learning_rate": 1.6852963948668114e-05, + "loss": 3.9707, + "mean_token_accuracy": 0.31008064597845075, + "step": 6910 + }, + { + "epoch": 0.3341945765822326, + "grad_norm": 2.2178276285514262, + "learning_rate": 1.684067672911474e-05, + "loss": 3.9242, + "mean_token_accuracy": 0.31784274578094485, + "step": 6920 + }, + { + "epoch": 0.33467751672180235, + "grad_norm": 2.3206908076372996, + "learning_rate": 1.6828370067607166e-05, + "loss": 4.043, + "mean_token_accuracy": 0.3073588669300079, + "step": 6930 + }, + { + "epoch": 0.335160456861372, + "grad_norm": 2.2500304159130926, + "learning_rate": 1.681604399912227e-05, + "loss": 3.966, + "mean_token_accuracy": 0.3155241906642914, + "step": 6940 + }, + { + "epoch": 0.33564339700094176, + "grad_norm": 2.181974160572126, + "learning_rate": 1.68036985586921e-05, + "loss": 4.0199, + "mean_token_accuracy": 0.3102822571992874, + "step": 6950 + }, + { + "epoch": 0.33612633714051143, + "grad_norm": 2.283425489086256, + "learning_rate": 1.6791333781403747e-05, + "loss": 3.9734, + "mean_token_accuracy": 0.31844757944345475, + "step": 6960 + }, + { + "epoch": 0.3366092772800811, + "grad_norm": 2.2393146725876814, + "learning_rate": 1.6778949702399266e-05, + "loss": 3.9312, + "mean_token_accuracy": 0.3215725839138031, + "step": 6970 + }, + { + "epoch": 0.33709221741965084, + "grad_norm": 2.367171414398298, + "learning_rate": 1.676654635687557e-05, + "loss": 3.9805, + "mean_token_accuracy": 0.30957661420106886, + "step": 6980 + }, + { + "epoch": 0.3375751575592205, + "grad_norm": 2.257205199857503, + "learning_rate": 1.675412378008433e-05, + "loss": 3.9688, + "mean_token_accuracy": 0.3132056429982185, + "step": 6990 + }, + { + "epoch": 0.33805809769879025, + "grad_norm": 2.3866668524460413, + "learning_rate": 1.674168200733187e-05, + "loss": 4.0309, + "mean_token_accuracy": 0.3129032254219055, + "step": 7000 + }, + { + "epoch": 0.33805809769879025, + "eval_runtime": 7.776, + "eval_samples_per_second": 379.888, + "eval_steps_per_second": 23.791, + "step": 7000 + }, + { + "epoch": 0.3385410378383599, + "grad_norm": 2.3452923804444175, + "learning_rate": 1.6729221073979078e-05, + "loss": 3.9207, + "mean_token_accuracy": 0.31895161271095274, + "step": 7010 + }, + { + "epoch": 0.33902397797792966, + "grad_norm": 2.3079760616919582, + "learning_rate": 1.671674101544129e-05, + "loss": 4.0207, + "mean_token_accuracy": 0.3066532239317894, + "step": 7020 + }, + { + "epoch": 0.33950691811749933, + "grad_norm": 2.2099399443930845, + "learning_rate": 1.6704241867188202e-05, + "loss": 3.966, + "mean_token_accuracy": 0.31401209682226183, + "step": 7030 + }, + { + "epoch": 0.339989858257069, + "grad_norm": 2.347282765374838, + "learning_rate": 1.6691723664743774e-05, + "loss": 3.993, + "mean_token_accuracy": 0.3141129046678543, + "step": 7040 + }, + { + "epoch": 0.34047279839663874, + "grad_norm": 2.41141185648113, + "learning_rate": 1.66791864436861e-05, + "loss": 3.9246, + "mean_token_accuracy": 0.32147177308797836, + "step": 7050 + }, + { + "epoch": 0.3409557385362084, + "grad_norm": 2.3865850421326535, + "learning_rate": 1.6666630239647345e-05, + "loss": 3.8871, + "mean_token_accuracy": 0.3296370983123779, + "step": 7060 + }, + { + "epoch": 0.34143867867577815, + "grad_norm": 2.3534648212718663, + "learning_rate": 1.6654055088313614e-05, + "loss": 4.1, + "mean_token_accuracy": 0.301008066534996, + "step": 7070 + }, + { + "epoch": 0.3419216188153478, + "grad_norm": 2.2459606706905575, + "learning_rate": 1.6641461025424876e-05, + "loss": 3.9703, + "mean_token_accuracy": 0.31784274280071256, + "step": 7080 + }, + { + "epoch": 0.34240455895491756, + "grad_norm": 2.174141593439425, + "learning_rate": 1.6628848086774836e-05, + "loss": 3.9152, + "mean_token_accuracy": 0.31723790243268013, + "step": 7090 + }, + { + "epoch": 0.34288749909448724, + "grad_norm": 2.3933393971085413, + "learning_rate": 1.661621630821085e-05, + "loss": 3.9453, + "mean_token_accuracy": 0.31602822840213773, + "step": 7100 + }, + { + "epoch": 0.34288749909448724, + "eval_runtime": 7.8005, + "eval_samples_per_second": 378.693, + "eval_steps_per_second": 23.716, + "step": 7100 + }, + { + "epoch": 0.3433704392340569, + "grad_norm": 2.198487978497135, + "learning_rate": 1.6603565725633816e-05, + "loss": 3.9359, + "mean_token_accuracy": 0.3143145129084587, + "step": 7110 + }, + { + "epoch": 0.34385337937362664, + "grad_norm": 2.503913516569388, + "learning_rate": 1.6590896374998084e-05, + "loss": 4.0527, + "mean_token_accuracy": 0.3056451603770256, + "step": 7120 + }, + { + "epoch": 0.3443363195131963, + "grad_norm": 2.301727766189351, + "learning_rate": 1.6578208292311336e-05, + "loss": 3.9906, + "mean_token_accuracy": 0.30383064448833463, + "step": 7130 + }, + { + "epoch": 0.34481925965276605, + "grad_norm": 2.2129990274286757, + "learning_rate": 1.65655015136345e-05, + "loss": 3.9406, + "mean_token_accuracy": 0.3174395188689232, + "step": 7140 + }, + { + "epoch": 0.34530219979233573, + "grad_norm": 2.2052477974110762, + "learning_rate": 1.655277607508163e-05, + "loss": 3.9262, + "mean_token_accuracy": 0.31542338579893114, + "step": 7150 + }, + { + "epoch": 0.34578513993190546, + "grad_norm": 2.187364445349514, + "learning_rate": 1.6540032012819822e-05, + "loss": 3.9453, + "mean_token_accuracy": 0.32237903624773023, + "step": 7160 + }, + { + "epoch": 0.34626808007147514, + "grad_norm": 2.1950098819251203, + "learning_rate": 1.6527269363069104e-05, + "loss": 3.9715, + "mean_token_accuracy": 0.3137096777558327, + "step": 7170 + }, + { + "epoch": 0.3467510202110448, + "grad_norm": 2.216177434209642, + "learning_rate": 1.651448816210232e-05, + "loss": 3.9219, + "mean_token_accuracy": 0.31844757944345475, + "step": 7180 + }, + { + "epoch": 0.34723396035061455, + "grad_norm": 2.36810281830132, + "learning_rate": 1.6501688446245046e-05, + "loss": 3.9516, + "mean_token_accuracy": 0.318951615691185, + "step": 7190 + }, + { + "epoch": 0.3477169004901842, + "grad_norm": 2.3448721178663083, + "learning_rate": 1.6488870251875488e-05, + "loss": 3.9438, + "mean_token_accuracy": 0.3159274190664291, + "step": 7200 + }, + { + "epoch": 0.3477169004901842, + "eval_runtime": 7.7785, + "eval_samples_per_second": 379.766, + "eval_steps_per_second": 23.784, + "step": 7200 + }, + { + "epoch": 0.34819984062975395, + "grad_norm": 2.3616374782903953, + "learning_rate": 1.6476033615424358e-05, + "loss": 4.009, + "mean_token_accuracy": 0.3084677428007126, + "step": 7210 + }, + { + "epoch": 0.34868278076932363, + "grad_norm": 2.1638043387752206, + "learning_rate": 1.6463178573374784e-05, + "loss": 3.9508, + "mean_token_accuracy": 0.31542338728904723, + "step": 7220 + }, + { + "epoch": 0.34916572090889336, + "grad_norm": 2.2885376591805238, + "learning_rate": 1.6450305162262203e-05, + "loss": 3.9305, + "mean_token_accuracy": 0.3120967738330364, + "step": 7230 + }, + { + "epoch": 0.34964866104846304, + "grad_norm": 2.287172788102938, + "learning_rate": 1.6437413418674273e-05, + "loss": 4.0156, + "mean_token_accuracy": 0.31602822691202165, + "step": 7240 + }, + { + "epoch": 0.35013160118803277, + "grad_norm": 2.1779051064389123, + "learning_rate": 1.6424503379250735e-05, + "loss": 3.9602, + "mean_token_accuracy": 0.3115927428007126, + "step": 7250 + }, + { + "epoch": 0.35061454132760245, + "grad_norm": 2.0871353987714745, + "learning_rate": 1.6411575080683343e-05, + "loss": 4.0141, + "mean_token_accuracy": 0.30463709831237795, + "step": 7260 + }, + { + "epoch": 0.3510974814671721, + "grad_norm": 2.2193379463158, + "learning_rate": 1.639862855971574e-05, + "loss": 4.007, + "mean_token_accuracy": 0.30635080933570863, + "step": 7270 + }, + { + "epoch": 0.35158042160674186, + "grad_norm": 2.280612581787595, + "learning_rate": 1.6385663853143355e-05, + "loss": 4.0191, + "mean_token_accuracy": 0.3054435484111309, + "step": 7280 + }, + { + "epoch": 0.35206336174631153, + "grad_norm": 2.2214063243459554, + "learning_rate": 1.6372680997813315e-05, + "loss": 3.9973, + "mean_token_accuracy": 0.31179435551166534, + "step": 7290 + }, + { + "epoch": 0.35254630188588126, + "grad_norm": 2.296752881513919, + "learning_rate": 1.6359680030624318e-05, + "loss": 3.9945, + "mean_token_accuracy": 0.30907257944345473, + "step": 7300 + }, + { + "epoch": 0.35254630188588126, + "eval_runtime": 7.7721, + "eval_samples_per_second": 380.076, + "eval_steps_per_second": 23.803, + "step": 7300 + }, + { + "epoch": 0.35302924202545094, + "grad_norm": 2.2489755150132837, + "learning_rate": 1.634666098852654e-05, + "loss": 3.9039, + "mean_token_accuracy": 0.3200604826211929, + "step": 7310 + }, + { + "epoch": 0.35351218216502067, + "grad_norm": 2.1856828168168208, + "learning_rate": 1.633362390852152e-05, + "loss": 3.9832, + "mean_token_accuracy": 0.31602822840213773, + "step": 7320 + }, + { + "epoch": 0.35399512230459035, + "grad_norm": 2.3085684332824288, + "learning_rate": 1.6320568827662083e-05, + "loss": 3.9922, + "mean_token_accuracy": 0.3146169349551201, + "step": 7330 + }, + { + "epoch": 0.35447806244416, + "grad_norm": 2.26913729212531, + "learning_rate": 1.630749578305219e-05, + "loss": 4.0238, + "mean_token_accuracy": 0.30977822840213776, + "step": 7340 + }, + { + "epoch": 0.35496100258372976, + "grad_norm": 2.3402266186887606, + "learning_rate": 1.629440481184688e-05, + "loss": 3.9488, + "mean_token_accuracy": 0.3134072557091713, + "step": 7350 + }, + { + "epoch": 0.35544394272329943, + "grad_norm": 2.209580816580286, + "learning_rate": 1.6281295951252124e-05, + "loss": 3.8887, + "mean_token_accuracy": 0.3208669349551201, + "step": 7360 + }, + { + "epoch": 0.35592688286286916, + "grad_norm": 2.252813356788115, + "learning_rate": 1.6268169238524742e-05, + "loss": 3.9789, + "mean_token_accuracy": 0.31350806206464765, + "step": 7370 + }, + { + "epoch": 0.35640982300243884, + "grad_norm": 2.3898285632577076, + "learning_rate": 1.6255024710972295e-05, + "loss": 4.0363, + "mean_token_accuracy": 0.3119959682226181, + "step": 7380 + }, + { + "epoch": 0.3568927631420086, + "grad_norm": 2.0969880498936573, + "learning_rate": 1.624186240595297e-05, + "loss": 3.9418, + "mean_token_accuracy": 0.31703629195690153, + "step": 7390 + }, + { + "epoch": 0.35737570328157825, + "grad_norm": 2.223692148575789, + "learning_rate": 1.622868236087549e-05, + "loss": 3.9727, + "mean_token_accuracy": 0.30897177308797835, + "step": 7400 + }, + { + "epoch": 0.35737570328157825, + "eval_runtime": 7.806, + "eval_samples_per_second": 378.425, + "eval_steps_per_second": 23.7, + "step": 7400 + }, + { + "epoch": 0.3578586434211479, + "grad_norm": 2.325578031025663, + "learning_rate": 1.6215484613198982e-05, + "loss": 4.0258, + "mean_token_accuracy": 0.31209677308797834, + "step": 7410 + }, + { + "epoch": 0.35834158356071766, + "grad_norm": 2.476555569341456, + "learning_rate": 1.62022692004329e-05, + "loss": 4.0598, + "mean_token_accuracy": 0.30181451588869096, + "step": 7420 + }, + { + "epoch": 0.35882452370028733, + "grad_norm": 2.266881035928201, + "learning_rate": 1.618903616013689e-05, + "loss": 4.0301, + "mean_token_accuracy": 0.3078629031777382, + "step": 7430 + }, + { + "epoch": 0.35930746383985707, + "grad_norm": 2.2966846022527636, + "learning_rate": 1.6175785529920713e-05, + "loss": 3.9391, + "mean_token_accuracy": 0.32207661122083664, + "step": 7440 + }, + { + "epoch": 0.35979040397942674, + "grad_norm": 2.349804419385055, + "learning_rate": 1.6162517347444112e-05, + "loss": 3.9059, + "mean_token_accuracy": 0.3203629031777382, + "step": 7450 + }, + { + "epoch": 0.3602733441189965, + "grad_norm": 2.20560920505012, + "learning_rate": 1.6149231650416718e-05, + "loss": 4.0027, + "mean_token_accuracy": 0.31088710129261016, + "step": 7460 + }, + { + "epoch": 0.36075628425856615, + "grad_norm": 2.2392022644333203, + "learning_rate": 1.6135928476597937e-05, + "loss": 3.9527, + "mean_token_accuracy": 0.30846774131059645, + "step": 7470 + }, + { + "epoch": 0.3612392243981358, + "grad_norm": 2.1744927379007586, + "learning_rate": 1.6122607863796854e-05, + "loss": 3.968, + "mean_token_accuracy": 0.31794354915618894, + "step": 7480 + }, + { + "epoch": 0.36172216453770556, + "grad_norm": 2.1556922176031685, + "learning_rate": 1.6109269849872117e-05, + "loss": 3.9727, + "mean_token_accuracy": 0.31461693346500397, + "step": 7490 + }, + { + "epoch": 0.36220510467727524, + "grad_norm": 2.2754901303161197, + "learning_rate": 1.6095914472731813e-05, + "loss": 4.0094, + "mean_token_accuracy": 0.3083669364452362, + "step": 7500 + }, + { + "epoch": 0.36220510467727524, + "eval_runtime": 7.7782, + "eval_samples_per_second": 379.779, + "eval_steps_per_second": 23.784, + "step": 7500 + }, + { + "epoch": 0.36268804481684497, + "grad_norm": 2.3576399930936116, + "learning_rate": 1.60825417703334e-05, + "loss": 3.9918, + "mean_token_accuracy": 0.31270161718130113, + "step": 7510 + }, + { + "epoch": 0.36317098495641464, + "grad_norm": 2.161842792240655, + "learning_rate": 1.606915178068356e-05, + "loss": 4.0711, + "mean_token_accuracy": 0.3039314493536949, + "step": 7520 + }, + { + "epoch": 0.3636539250959844, + "grad_norm": 2.224554755731731, + "learning_rate": 1.605574454183812e-05, + "loss": 3.9766, + "mean_token_accuracy": 0.318245966732502, + "step": 7530 + }, + { + "epoch": 0.36413686523555405, + "grad_norm": 2.2959269342884854, + "learning_rate": 1.6042320091901918e-05, + "loss": 3.8844, + "mean_token_accuracy": 0.32368951886892317, + "step": 7540 + }, + { + "epoch": 0.36461980537512373, + "grad_norm": 2.2806347103495135, + "learning_rate": 1.602887846902872e-05, + "loss": 3.9969, + "mean_token_accuracy": 0.3185483917593956, + "step": 7550 + }, + { + "epoch": 0.36510274551469346, + "grad_norm": 2.2406801044540394, + "learning_rate": 1.6015419711421093e-05, + "loss": 3.9875, + "mean_token_accuracy": 0.3098790317773819, + "step": 7560 + }, + { + "epoch": 0.36558568565426314, + "grad_norm": 2.3253377666358217, + "learning_rate": 1.60019438573303e-05, + "loss": 3.9105, + "mean_token_accuracy": 0.3145161300897598, + "step": 7570 + }, + { + "epoch": 0.36606862579383287, + "grad_norm": 2.1932508333597203, + "learning_rate": 1.5988450945056202e-05, + "loss": 4.0008, + "mean_token_accuracy": 0.3133064493536949, + "step": 7580 + }, + { + "epoch": 0.36655156593340255, + "grad_norm": 2.206922499068559, + "learning_rate": 1.5974941012947133e-05, + "loss": 3.975, + "mean_token_accuracy": 0.31149193495512006, + "step": 7590 + }, + { + "epoch": 0.3670345060729723, + "grad_norm": 2.2426919737010262, + "learning_rate": 1.5961414099399802e-05, + "loss": 3.984, + "mean_token_accuracy": 0.31189516335725787, + "step": 7600 + }, + { + "epoch": 0.3670345060729723, + "eval_runtime": 7.7741, + "eval_samples_per_second": 379.98, + "eval_steps_per_second": 23.797, + "step": 7600 + }, + { + "epoch": 0.36751744621254195, + "grad_norm": 2.225392847432725, + "learning_rate": 1.5947870242859188e-05, + "loss": 3.9598, + "mean_token_accuracy": 0.31330645382404326, + "step": 7610 + }, + { + "epoch": 0.36800038635211163, + "grad_norm": 2.3060781296421933, + "learning_rate": 1.5934309481818414e-05, + "loss": 3.9707, + "mean_token_accuracy": 0.31542338877916337, + "step": 7620 + }, + { + "epoch": 0.36848332649168136, + "grad_norm": 2.3585163707006824, + "learning_rate": 1.592073185481865e-05, + "loss": 3.9453, + "mean_token_accuracy": 0.315826615691185, + "step": 7630 + }, + { + "epoch": 0.36896626663125104, + "grad_norm": 2.2081498545111007, + "learning_rate": 1.590713740044901e-05, + "loss": 3.9375, + "mean_token_accuracy": 0.315625, + "step": 7640 + }, + { + "epoch": 0.36944920677082077, + "grad_norm": 2.347466197507941, + "learning_rate": 1.5893526157346416e-05, + "loss": 3.9582, + "mean_token_accuracy": 0.3138104811310768, + "step": 7650 + }, + { + "epoch": 0.36993214691039045, + "grad_norm": 2.328017784756564, + "learning_rate": 1.587989816419552e-05, + "loss": 4.0203, + "mean_token_accuracy": 0.3035282284021378, + "step": 7660 + }, + { + "epoch": 0.3704150870499602, + "grad_norm": 2.2293863274912398, + "learning_rate": 1.5866253459728574e-05, + "loss": 4.0051, + "mean_token_accuracy": 0.31129032373428345, + "step": 7670 + }, + { + "epoch": 0.37089802718952986, + "grad_norm": 2.384547800693373, + "learning_rate": 1.585259208272533e-05, + "loss": 4.0281, + "mean_token_accuracy": 0.30483870953321457, + "step": 7680 + }, + { + "epoch": 0.37138096732909953, + "grad_norm": 2.4612956238927244, + "learning_rate": 1.583891407201291e-05, + "loss": 3.9922, + "mean_token_accuracy": 0.31415562331676483, + "step": 7690 + }, + { + "epoch": 0.37186390746866926, + "grad_norm": 2.327182082327985, + "learning_rate": 1.5825219466465734e-05, + "loss": 4.0098, + "mean_token_accuracy": 0.30957661420106886, + "step": 7700 + }, + { + "epoch": 0.37186390746866926, + "eval_runtime": 7.7596, + "eval_samples_per_second": 380.691, + "eval_steps_per_second": 23.842, + "step": 7700 + }, + { + "epoch": 0.37234684760823894, + "grad_norm": 2.4641206129952424, + "learning_rate": 1.581150830500537e-05, + "loss": 4.0785, + "mean_token_accuracy": 0.30372984111309054, + "step": 7710 + }, + { + "epoch": 0.37282978774780867, + "grad_norm": 2.1559255914426236, + "learning_rate": 1.5797780626600444e-05, + "loss": 3.9988, + "mean_token_accuracy": 0.31592742055654527, + "step": 7720 + }, + { + "epoch": 0.37331272788737835, + "grad_norm": 2.2097018480771355, + "learning_rate": 1.5784036470266524e-05, + "loss": 3.9465, + "mean_token_accuracy": 0.3174395188689232, + "step": 7730 + }, + { + "epoch": 0.3737956680269481, + "grad_norm": 2.1729835502772623, + "learning_rate": 1.577027587506601e-05, + "loss": 3.8246, + "mean_token_accuracy": 0.3202620968222618, + "step": 7740 + }, + { + "epoch": 0.37427860816651776, + "grad_norm": 2.1650976320133632, + "learning_rate": 1.5756498880108027e-05, + "loss": 3.932, + "mean_token_accuracy": 0.32207661420106887, + "step": 7750 + }, + { + "epoch": 0.37476154830608743, + "grad_norm": 2.271808348234846, + "learning_rate": 1.57427055245483e-05, + "loss": 3.9785, + "mean_token_accuracy": 0.31532258093357085, + "step": 7760 + }, + { + "epoch": 0.37524448844565716, + "grad_norm": 2.4107833507934298, + "learning_rate": 1.5728895847589073e-05, + "loss": 4.0227, + "mean_token_accuracy": 0.304217104613781, + "step": 7770 + }, + { + "epoch": 0.37572742858522684, + "grad_norm": 2.235251553932183, + "learning_rate": 1.571506988847895e-05, + "loss": 4.0187, + "mean_token_accuracy": 0.30423386991024015, + "step": 7780 + }, + { + "epoch": 0.3762103687247966, + "grad_norm": 2.4843361279684637, + "learning_rate": 1.5701227686512836e-05, + "loss": 3.9539, + "mean_token_accuracy": 0.3168346777558327, + "step": 7790 + }, + { + "epoch": 0.37669330886436625, + "grad_norm": 2.191000039618457, + "learning_rate": 1.568736928103178e-05, + "loss": 4.0094, + "mean_token_accuracy": 0.31491935551166533, + "step": 7800 + }, + { + "epoch": 0.37669330886436625, + "eval_runtime": 7.7585, + "eval_samples_per_second": 380.745, + "eval_steps_per_second": 23.845, + "step": 7800 + }, + { + "epoch": 0.377176249003936, + "grad_norm": 2.1437180306225576, + "learning_rate": 1.56734947114229e-05, + "loss": 3.9578, + "mean_token_accuracy": 0.31068548262119294, + "step": 7810 + }, + { + "epoch": 0.37765918914350566, + "grad_norm": 2.381090588792243, + "learning_rate": 1.5659604017119233e-05, + "loss": 4.0047, + "mean_token_accuracy": 0.31320564448833466, + "step": 7820 + }, + { + "epoch": 0.3781421292830754, + "grad_norm": 2.3369925433015073, + "learning_rate": 1.564569723759967e-05, + "loss": 3.9406, + "mean_token_accuracy": 0.31431451588869097, + "step": 7830 + }, + { + "epoch": 0.37862506942264507, + "grad_norm": 2.2286225438782328, + "learning_rate": 1.56317744123888e-05, + "loss": 3.9727, + "mean_token_accuracy": 0.3090725809335709, + "step": 7840 + }, + { + "epoch": 0.37910800956221474, + "grad_norm": 2.196373245240866, + "learning_rate": 1.561783558105682e-05, + "loss": 3.909, + "mean_token_accuracy": 0.3228830635547638, + "step": 7850 + }, + { + "epoch": 0.3795909497017845, + "grad_norm": 2.2697919408351357, + "learning_rate": 1.560388078321942e-05, + "loss": 3.9746, + "mean_token_accuracy": 0.31582661122083666, + "step": 7860 + }, + { + "epoch": 0.38007388984135415, + "grad_norm": 2.184099157719294, + "learning_rate": 1.5589910058537666e-05, + "loss": 3.977, + "mean_token_accuracy": 0.3138373658061028, + "step": 7870 + }, + { + "epoch": 0.3805568299809239, + "grad_norm": 2.2440503338997018, + "learning_rate": 1.5575923446717893e-05, + "loss": 3.9629, + "mean_token_accuracy": 0.3138104841113091, + "step": 7880 + }, + { + "epoch": 0.38103977012049356, + "grad_norm": 2.5432355518332384, + "learning_rate": 1.556192098751158e-05, + "loss": 3.9684, + "mean_token_accuracy": 0.3126008093357086, + "step": 7890 + }, + { + "epoch": 0.3815227102600633, + "grad_norm": 2.280111413169912, + "learning_rate": 1.5547902720715265e-05, + "loss": 3.9809, + "mean_token_accuracy": 0.3133064523339272, + "step": 7900 + }, + { + "epoch": 0.3815227102600633, + "eval_runtime": 7.762, + "eval_samples_per_second": 380.572, + "eval_steps_per_second": 23.834, + "step": 7900 + }, + { + "epoch": 0.38200565039963297, + "grad_norm": 2.177443611410207, + "learning_rate": 1.5533868686170396e-05, + "loss": 4.0441, + "mean_token_accuracy": 0.3105846792459488, + "step": 7910 + }, + { + "epoch": 0.38248859053920264, + "grad_norm": 2.2264488495344397, + "learning_rate": 1.5519818923763235e-05, + "loss": 4.0105, + "mean_token_accuracy": 0.30645161122083664, + "step": 7920 + }, + { + "epoch": 0.3829715306787724, + "grad_norm": 2.4923483307127805, + "learning_rate": 1.5505753473424757e-05, + "loss": 3.957, + "mean_token_accuracy": 0.316431450843811, + "step": 7930 + }, + { + "epoch": 0.38345447081834205, + "grad_norm": 2.3275734075195946, + "learning_rate": 1.549167237513051e-05, + "loss": 3.9637, + "mean_token_accuracy": 0.3182459682226181, + "step": 7940 + }, + { + "epoch": 0.3839374109579118, + "grad_norm": 2.3410843465541618, + "learning_rate": 1.5477575668900523e-05, + "loss": 3.9215, + "mean_token_accuracy": 0.3196572601795197, + "step": 7950 + }, + { + "epoch": 0.38442035109748146, + "grad_norm": 2.2839220796747415, + "learning_rate": 1.5463463394799184e-05, + "loss": 3.9203, + "mean_token_accuracy": 0.3147177428007126, + "step": 7960 + }, + { + "epoch": 0.3849032912370512, + "grad_norm": 2.2019522922585764, + "learning_rate": 1.5449335592935125e-05, + "loss": 4.0023, + "mean_token_accuracy": 0.3155241936445236, + "step": 7970 + }, + { + "epoch": 0.38538623137662087, + "grad_norm": 2.3933252938518867, + "learning_rate": 1.543519230346111e-05, + "loss": 4.0477, + "mean_token_accuracy": 0.3072580635547638, + "step": 7980 + }, + { + "epoch": 0.38586917151619055, + "grad_norm": 2.2673013759721576, + "learning_rate": 1.542103356657391e-05, + "loss": 3.9945, + "mean_token_accuracy": 0.315625, + "step": 7990 + }, + { + "epoch": 0.3863521116557603, + "grad_norm": 2.245016953142722, + "learning_rate": 1.540685942251423e-05, + "loss": 3.9141, + "mean_token_accuracy": 0.3180443555116653, + "step": 8000 + }, + { + "epoch": 0.3863521116557603, + "eval_runtime": 7.7913, + "eval_samples_per_second": 379.142, + "eval_steps_per_second": 23.745, + "step": 8000 + }, + { + "epoch": 0.38683505179532995, + "grad_norm": 2.3171600703872017, + "learning_rate": 1.5392669911566525e-05, + "loss": 3.9766, + "mean_token_accuracy": 0.31723790168762206, + "step": 8010 + }, + { + "epoch": 0.3873179919348997, + "grad_norm": 2.2051650337965265, + "learning_rate": 1.5378465074058953e-05, + "loss": 3.9844, + "mean_token_accuracy": 0.32086693644523623, + "step": 8020 + }, + { + "epoch": 0.38780093207446936, + "grad_norm": 2.317902630186066, + "learning_rate": 1.5364244950363216e-05, + "loss": 3.9902, + "mean_token_accuracy": 0.31663306057453156, + "step": 8030 + }, + { + "epoch": 0.3882838722140391, + "grad_norm": 2.302866413568975, + "learning_rate": 1.535000958089447e-05, + "loss": 3.9898, + "mean_token_accuracy": 0.3165322616696358, + "step": 8040 + }, + { + "epoch": 0.38876681235360877, + "grad_norm": 2.2651441307335185, + "learning_rate": 1.5335759006111195e-05, + "loss": 3.9566, + "mean_token_accuracy": 0.3184475839138031, + "step": 8050 + }, + { + "epoch": 0.38924975249317845, + "grad_norm": 2.2371477945459577, + "learning_rate": 1.532149326651509e-05, + "loss": 4.0086, + "mean_token_accuracy": 0.30977822542190553, + "step": 8060 + }, + { + "epoch": 0.3897326926327482, + "grad_norm": 2.267391115586042, + "learning_rate": 1.5307212402650956e-05, + "loss": 3.973, + "mean_token_accuracy": 0.3146169349551201, + "step": 8070 + }, + { + "epoch": 0.39021563277231786, + "grad_norm": 2.221125299434015, + "learning_rate": 1.5292916455106572e-05, + "loss": 4.0359, + "mean_token_accuracy": 0.3105846747756004, + "step": 8080 + }, + { + "epoch": 0.3906985729118876, + "grad_norm": 2.2532059615380153, + "learning_rate": 1.527860546451259e-05, + "loss": 4.0367, + "mean_token_accuracy": 0.30625, + "step": 8090 + }, + { + "epoch": 0.39118151305145726, + "grad_norm": 2.249756955836091, + "learning_rate": 1.526427947154242e-05, + "loss": 3.9863, + "mean_token_accuracy": 0.31249999850988386, + "step": 8100 + }, + { + "epoch": 0.39118151305145726, + "eval_runtime": 7.7701, + "eval_samples_per_second": 380.175, + "eval_steps_per_second": 23.809, + "step": 8100 + }, + { + "epoch": 0.391664453191027, + "grad_norm": 2.3681585876910405, + "learning_rate": 1.5249938516912104e-05, + "loss": 3.9473, + "mean_token_accuracy": 0.319556450843811, + "step": 8110 + }, + { + "epoch": 0.39214739333059667, + "grad_norm": 2.273513562099052, + "learning_rate": 1.5235582641380208e-05, + "loss": 3.9297, + "mean_token_accuracy": 0.3137096747756004, + "step": 8120 + }, + { + "epoch": 0.39263033347016635, + "grad_norm": 2.434611549389301, + "learning_rate": 1.522121188574771e-05, + "loss": 3.9719, + "mean_token_accuracy": 0.31532258093357085, + "step": 8130 + }, + { + "epoch": 0.3931132736097361, + "grad_norm": 2.2711279245845137, + "learning_rate": 1.520682629085787e-05, + "loss": 3.9609, + "mean_token_accuracy": 0.31683467626571654, + "step": 8140 + }, + { + "epoch": 0.39359621374930576, + "grad_norm": 2.43464539956847, + "learning_rate": 1.5192425897596134e-05, + "loss": 3.8906, + "mean_token_accuracy": 0.32499999850988387, + "step": 8150 + }, + { + "epoch": 0.3940791538888755, + "grad_norm": 2.354420987733528, + "learning_rate": 1.5178010746889995e-05, + "loss": 3.9301, + "mean_token_accuracy": 0.3187499985098839, + "step": 8160 + }, + { + "epoch": 0.39456209402844516, + "grad_norm": 2.2094552811465746, + "learning_rate": 1.5163580879708897e-05, + "loss": 3.9504, + "mean_token_accuracy": 0.3152217760682106, + "step": 8170 + }, + { + "epoch": 0.3950450341680149, + "grad_norm": 2.2521642433158, + "learning_rate": 1.5149136337064105e-05, + "loss": 3.9172, + "mean_token_accuracy": 0.3178427413105965, + "step": 8180 + }, + { + "epoch": 0.3955279743075846, + "grad_norm": 2.3528903523414084, + "learning_rate": 1.5134677160008594e-05, + "loss": 3.927, + "mean_token_accuracy": 0.3142137140035629, + "step": 8190 + }, + { + "epoch": 0.39601091444715425, + "grad_norm": 2.3613415375312554, + "learning_rate": 1.5120203389636937e-05, + "loss": 3.9484, + "mean_token_accuracy": 0.3144153207540512, + "step": 8200 + }, + { + "epoch": 0.39601091444715425, + "eval_runtime": 7.7794, + "eval_samples_per_second": 379.722, + "eval_steps_per_second": 23.781, + "step": 8200 + }, + { + "epoch": 0.396493854586724, + "grad_norm": 2.3690149146054877, + "learning_rate": 1.5105715067085173e-05, + "loss": 3.9527, + "mean_token_accuracy": 0.3152217760682106, + "step": 8210 + }, + { + "epoch": 0.39697679472629366, + "grad_norm": 2.2257570825117794, + "learning_rate": 1.5091212233530707e-05, + "loss": 3.9906, + "mean_token_accuracy": 0.3203629061579704, + "step": 8220 + }, + { + "epoch": 0.3974597348658634, + "grad_norm": 2.4680611405500144, + "learning_rate": 1.5076694930192187e-05, + "loss": 4.0457, + "mean_token_accuracy": 0.305443549156189, + "step": 8230 + }, + { + "epoch": 0.39794267500543307, + "grad_norm": 2.644212903219562, + "learning_rate": 1.5062163198329376e-05, + "loss": 3.9098, + "mean_token_accuracy": 0.32157257944345474, + "step": 8240 + }, + { + "epoch": 0.3984256151450028, + "grad_norm": 2.1640338436490767, + "learning_rate": 1.5047617079243057e-05, + "loss": 3.9254, + "mean_token_accuracy": 0.3203629061579704, + "step": 8250 + }, + { + "epoch": 0.3989085552845725, + "grad_norm": 2.356569793571822, + "learning_rate": 1.5033056614274898e-05, + "loss": 3.9813, + "mean_token_accuracy": 0.31401209980249406, + "step": 8260 + }, + { + "epoch": 0.39939149542414215, + "grad_norm": 2.201265363950054, + "learning_rate": 1.5018481844807333e-05, + "loss": 3.9473, + "mean_token_accuracy": 0.3187500014901161, + "step": 8270 + }, + { + "epoch": 0.3998744355637119, + "grad_norm": 2.3178787761333455, + "learning_rate": 1.5003892812263461e-05, + "loss": 3.8969, + "mean_token_accuracy": 0.31955645233392715, + "step": 8280 + }, + { + "epoch": 0.40035737570328156, + "grad_norm": 2.3310885134933015, + "learning_rate": 1.4989289558106913e-05, + "loss": 3.9312, + "mean_token_accuracy": 0.3175403207540512, + "step": 8290 + }, + { + "epoch": 0.4008403158428513, + "grad_norm": 2.3309224168306426, + "learning_rate": 1.4974672123841739e-05, + "loss": 3.9328, + "mean_token_accuracy": 0.3181451618671417, + "step": 8300 + }, + { + "epoch": 0.4008403158428513, + "eval_runtime": 7.7759, + "eval_samples_per_second": 379.893, + "eval_steps_per_second": 23.792, + "step": 8300 + }, + { + "epoch": 0.40132325598242097, + "grad_norm": 2.253461363076083, + "learning_rate": 1.4960040551012293e-05, + "loss": 3.9582, + "mean_token_accuracy": 0.31522177159786224, + "step": 8310 + }, + { + "epoch": 0.4018061961219907, + "grad_norm": 2.182744451847417, + "learning_rate": 1.4945394881203115e-05, + "loss": 3.9953, + "mean_token_accuracy": 0.30826613008975984, + "step": 8320 + }, + { + "epoch": 0.4022891362615604, + "grad_norm": 2.2082836776661137, + "learning_rate": 1.4930735156038798e-05, + "loss": 3.9375, + "mean_token_accuracy": 0.3208669349551201, + "step": 8330 + }, + { + "epoch": 0.40277207640113005, + "grad_norm": 2.152139964086219, + "learning_rate": 1.4916061417183899e-05, + "loss": 3.9145, + "mean_token_accuracy": 0.3189400926232338, + "step": 8340 + }, + { + "epoch": 0.4032550165406998, + "grad_norm": 2.322393374678207, + "learning_rate": 1.4901373706342788e-05, + "loss": 4.0086, + "mean_token_accuracy": 0.3099798396229744, + "step": 8350 + }, + { + "epoch": 0.40373795668026946, + "grad_norm": 2.2454702269187448, + "learning_rate": 1.4886672065259553e-05, + "loss": 3.9633, + "mean_token_accuracy": 0.31058468073606493, + "step": 8360 + }, + { + "epoch": 0.4042208968198392, + "grad_norm": 2.2150750047492487, + "learning_rate": 1.4871956535717875e-05, + "loss": 3.966, + "mean_token_accuracy": 0.31350806504487994, + "step": 8370 + }, + { + "epoch": 0.40470383695940887, + "grad_norm": 2.3506011177671615, + "learning_rate": 1.4857227159540901e-05, + "loss": 3.9621, + "mean_token_accuracy": 0.3196572601795197, + "step": 8380 + }, + { + "epoch": 0.4051867770989786, + "grad_norm": 2.2359460578761396, + "learning_rate": 1.484248397859114e-05, + "loss": 4.009, + "mean_token_accuracy": 0.3127016112208366, + "step": 8390 + }, + { + "epoch": 0.4056697172385483, + "grad_norm": 2.2535177698933393, + "learning_rate": 1.4827727034770326e-05, + "loss": 3.95, + "mean_token_accuracy": 0.3102822571992874, + "step": 8400 + }, + { + "epoch": 0.4056697172385483, + "eval_runtime": 7.7816, + "eval_samples_per_second": 379.612, + "eval_steps_per_second": 23.774, + "step": 8400 + }, + { + "epoch": 0.406152657378118, + "grad_norm": 2.4127913079682517, + "learning_rate": 1.4812956370019311e-05, + "loss": 4.0125, + "mean_token_accuracy": 0.30453629046678543, + "step": 8410 + }, + { + "epoch": 0.4066355975176877, + "grad_norm": 2.173502998824011, + "learning_rate": 1.4798172026317949e-05, + "loss": 3.9602, + "mean_token_accuracy": 0.31548836827278137, + "step": 8420 + }, + { + "epoch": 0.40711853765725736, + "grad_norm": 2.1571546369952515, + "learning_rate": 1.4783374045684971e-05, + "loss": 4.0199, + "mean_token_accuracy": 0.30756048262119295, + "step": 8430 + }, + { + "epoch": 0.4076014777968271, + "grad_norm": 2.2908423105756834, + "learning_rate": 1.4768562470177856e-05, + "loss": 3.9863, + "mean_token_accuracy": 0.30463709533214567, + "step": 8440 + }, + { + "epoch": 0.40808441793639677, + "grad_norm": 2.303398828783417, + "learning_rate": 1.4753737341892733e-05, + "loss": 3.9504, + "mean_token_accuracy": 0.3122983857989311, + "step": 8450 + }, + { + "epoch": 0.4085673580759665, + "grad_norm": 2.1881230355524606, + "learning_rate": 1.4738898702964239e-05, + "loss": 3.932, + "mean_token_accuracy": 0.31683467477560046, + "step": 8460 + }, + { + "epoch": 0.4090502982155362, + "grad_norm": 2.098728300090589, + "learning_rate": 1.472404659556542e-05, + "loss": 3.9145, + "mean_token_accuracy": 0.32026209831237795, + "step": 8470 + }, + { + "epoch": 0.4095332383551059, + "grad_norm": 2.2391037790745467, + "learning_rate": 1.4709181061907591e-05, + "loss": 3.9684, + "mean_token_accuracy": 0.31653225868940355, + "step": 8480 + }, + { + "epoch": 0.4100161784946756, + "grad_norm": 2.4090827424703534, + "learning_rate": 1.4694302144240234e-05, + "loss": 3.991, + "mean_token_accuracy": 0.3087701603770256, + "step": 8490 + }, + { + "epoch": 0.41049911863424526, + "grad_norm": 2.427448304656656, + "learning_rate": 1.4679409884850866e-05, + "loss": 4.0211, + "mean_token_accuracy": 0.3073588699102402, + "step": 8500 + }, + { + "epoch": 0.41049911863424526, + "eval_runtime": 7.77, + "eval_samples_per_second": 380.179, + "eval_steps_per_second": 23.809, + "step": 8500 + }, + { + "epoch": 0.410982058773815, + "grad_norm": 2.2633867160742454, + "learning_rate": 1.4664504326064919e-05, + "loss": 3.9004, + "mean_token_accuracy": 0.3173387095332146, + "step": 8510 + }, + { + "epoch": 0.41146499891338467, + "grad_norm": 2.3383005703347934, + "learning_rate": 1.4649585510245632e-05, + "loss": 3.9965, + "mean_token_accuracy": 0.31421370804309845, + "step": 8520 + }, + { + "epoch": 0.4119479390529544, + "grad_norm": 2.1632549205395146, + "learning_rate": 1.4634653479793917e-05, + "loss": 4.0121, + "mean_token_accuracy": 0.3075604856014252, + "step": 8530 + }, + { + "epoch": 0.4124308791925241, + "grad_norm": 2.3189173200001245, + "learning_rate": 1.4619708277148242e-05, + "loss": 3.9887, + "mean_token_accuracy": 0.31391129046678545, + "step": 8540 + }, + { + "epoch": 0.4129138193320938, + "grad_norm": 2.1664619049782603, + "learning_rate": 1.460474994478451e-05, + "loss": 3.9121, + "mean_token_accuracy": 0.31552419662475584, + "step": 8550 + }, + { + "epoch": 0.4133967594716635, + "grad_norm": 2.1446203636972916, + "learning_rate": 1.4589778525215952e-05, + "loss": 3.9684, + "mean_token_accuracy": 0.31602822691202165, + "step": 8560 + }, + { + "epoch": 0.41387969961123316, + "grad_norm": 2.234965721040499, + "learning_rate": 1.4574794060992976e-05, + "loss": 4.007, + "mean_token_accuracy": 0.3116935506463051, + "step": 8570 + }, + { + "epoch": 0.4143626397508029, + "grad_norm": 2.3350292840778666, + "learning_rate": 1.4559796594703084e-05, + "loss": 3.9723, + "mean_token_accuracy": 0.319254033267498, + "step": 8580 + }, + { + "epoch": 0.4148455798903726, + "grad_norm": 2.2834840773216483, + "learning_rate": 1.4544786168970715e-05, + "loss": 4.0074, + "mean_token_accuracy": 0.3156250029802322, + "step": 8590 + }, + { + "epoch": 0.4153285200299423, + "grad_norm": 2.522144799497461, + "learning_rate": 1.452976282645715e-05, + "loss": 3.8715, + "mean_token_accuracy": 0.327116933465004, + "step": 8600 + }, + { + "epoch": 0.4153285200299423, + "eval_runtime": 7.7737, + "eval_samples_per_second": 379.998, + "eval_steps_per_second": 23.798, + "step": 8600 + }, + { + "epoch": 0.415811460169512, + "grad_norm": 2.26372699187666, + "learning_rate": 1.4514726609860374e-05, + "loss": 3.9582, + "mean_token_accuracy": 0.3166330620646477, + "step": 8610 + }, + { + "epoch": 0.4162944003090817, + "grad_norm": 2.359948200481088, + "learning_rate": 1.4499677561914969e-05, + "loss": 3.9809, + "mean_token_accuracy": 0.31653225868940355, + "step": 8620 + }, + { + "epoch": 0.4167773404486514, + "grad_norm": 2.2059864287873068, + "learning_rate": 1.448461572539198e-05, + "loss": 3.9836, + "mean_token_accuracy": 0.3127016142010689, + "step": 8630 + }, + { + "epoch": 0.41726028058822107, + "grad_norm": 2.2288805568568475, + "learning_rate": 1.44695411430988e-05, + "loss": 3.8973, + "mean_token_accuracy": 0.3197580650448799, + "step": 8640 + }, + { + "epoch": 0.4177432207277908, + "grad_norm": 2.4082297684586793, + "learning_rate": 1.4454453857879047e-05, + "loss": 4.0121, + "mean_token_accuracy": 0.3090725764632225, + "step": 8650 + }, + { + "epoch": 0.4182261608673605, + "grad_norm": 2.2782643806095133, + "learning_rate": 1.4439353912612441e-05, + "loss": 3.882, + "mean_token_accuracy": 0.3180443555116653, + "step": 8660 + }, + { + "epoch": 0.4187091010069302, + "grad_norm": 2.273361530716404, + "learning_rate": 1.4424241350214683e-05, + "loss": 3.9695, + "mean_token_accuracy": 0.3146169364452362, + "step": 8670 + }, + { + "epoch": 0.4191920411464999, + "grad_norm": 2.1614064668476947, + "learning_rate": 1.4409116213637335e-05, + "loss": 3.9875, + "mean_token_accuracy": 0.31622984260320663, + "step": 8680 + }, + { + "epoch": 0.4196749812860696, + "grad_norm": 2.1749090104092113, + "learning_rate": 1.4393978545867699e-05, + "loss": 3.95, + "mean_token_accuracy": 0.3173387095332146, + "step": 8690 + }, + { + "epoch": 0.4201579214256393, + "grad_norm": 2.1901106971585516, + "learning_rate": 1.4378828389928683e-05, + "loss": 3.9254, + "mean_token_accuracy": 0.3138104841113091, + "step": 8700 + }, + { + "epoch": 0.4201579214256393, + "eval_runtime": 7.7431, + "eval_samples_per_second": 381.5, + "eval_steps_per_second": 23.892, + "step": 8700 + }, + { + "epoch": 0.42064086156520897, + "grad_norm": 2.3886563997614583, + "learning_rate": 1.4363665788878698e-05, + "loss": 3.9629, + "mean_token_accuracy": 0.31784274280071256, + "step": 8710 + }, + { + "epoch": 0.4211238017047787, + "grad_norm": 2.187173443011444, + "learning_rate": 1.4348490785811516e-05, + "loss": 3.957, + "mean_token_accuracy": 0.3177419379353523, + "step": 8720 + }, + { + "epoch": 0.4216067418443484, + "grad_norm": 2.336368136735689, + "learning_rate": 1.4333303423856161e-05, + "loss": 3.9625, + "mean_token_accuracy": 0.312802417576313, + "step": 8730 + }, + { + "epoch": 0.4220896819839181, + "grad_norm": 2.1307192899137193, + "learning_rate": 1.4318103746176793e-05, + "loss": 3.9059, + "mean_token_accuracy": 0.32520160973072054, + "step": 8740 + }, + { + "epoch": 0.4225726221234878, + "grad_norm": 2.2181936867558063, + "learning_rate": 1.4302891795972553e-05, + "loss": 3.9238, + "mean_token_accuracy": 0.3177419379353523, + "step": 8750 + }, + { + "epoch": 0.4230555622630575, + "grad_norm": 2.3146108820988402, + "learning_rate": 1.428766761647748e-05, + "loss": 4.0324, + "mean_token_accuracy": 0.30786290392279625, + "step": 8760 + }, + { + "epoch": 0.4235385024026272, + "grad_norm": 2.107398900918669, + "learning_rate": 1.4272431250960362e-05, + "loss": 3.9082, + "mean_token_accuracy": 0.3193548396229744, + "step": 8770 + }, + { + "epoch": 0.42402144254219687, + "grad_norm": 2.2395181923870235, + "learning_rate": 1.425718274272462e-05, + "loss": 3.9879, + "mean_token_accuracy": 0.3102822571992874, + "step": 8780 + }, + { + "epoch": 0.4245043826817666, + "grad_norm": 2.321570759451804, + "learning_rate": 1.4241922135108188e-05, + "loss": 3.9383, + "mean_token_accuracy": 0.31693548411130906, + "step": 8790 + }, + { + "epoch": 0.4249873228213363, + "grad_norm": 2.3972951483660623, + "learning_rate": 1.4226649471483396e-05, + "loss": 3.959, + "mean_token_accuracy": 0.3133064493536949, + "step": 8800 + }, + { + "epoch": 0.4249873228213363, + "eval_runtime": 7.7675, + "eval_samples_per_second": 380.301, + "eval_steps_per_second": 23.817, + "step": 8800 + }, + { + "epoch": 0.425470262960906, + "grad_norm": 2.2281928237376114, + "learning_rate": 1.4211364795256819e-05, + "loss": 3.9672, + "mean_token_accuracy": 0.31804435700178146, + "step": 8810 + }, + { + "epoch": 0.4259532031004757, + "grad_norm": 2.330225329925326, + "learning_rate": 1.4196068149869194e-05, + "loss": 3.9516, + "mean_token_accuracy": 0.32036290466785433, + "step": 8820 + }, + { + "epoch": 0.4264361432400454, + "grad_norm": 2.213484525240926, + "learning_rate": 1.4180759578795258e-05, + "loss": 3.9992, + "mean_token_accuracy": 0.3104838728904724, + "step": 8830 + }, + { + "epoch": 0.4269190833796151, + "grad_norm": 2.3257342531686045, + "learning_rate": 1.4165439125543659e-05, + "loss": 3.9672, + "mean_token_accuracy": 0.3222782239317894, + "step": 8840 + }, + { + "epoch": 0.42740202351918477, + "grad_norm": 2.270032724911227, + "learning_rate": 1.4150106833656801e-05, + "loss": 3.9555, + "mean_token_accuracy": 0.3232862904667854, + "step": 8850 + }, + { + "epoch": 0.4278849636587545, + "grad_norm": 2.225139220617103, + "learning_rate": 1.4134762746710741e-05, + "loss": 3.9465, + "mean_token_accuracy": 0.31743951588869096, + "step": 8860 + }, + { + "epoch": 0.4283679037983242, + "grad_norm": 2.143885802204563, + "learning_rate": 1.4119406908315065e-05, + "loss": 3.9324, + "mean_token_accuracy": 0.3220766097307205, + "step": 8870 + }, + { + "epoch": 0.4288508439378939, + "grad_norm": 2.2760174263761077, + "learning_rate": 1.4104039362112745e-05, + "loss": 3.9664, + "mean_token_accuracy": 0.3151209682226181, + "step": 8880 + }, + { + "epoch": 0.4293337840774636, + "grad_norm": 2.2498136887954576, + "learning_rate": 1.4088660151780035e-05, + "loss": 3.9051, + "mean_token_accuracy": 0.3233870953321457, + "step": 8890 + }, + { + "epoch": 0.4298167242170333, + "grad_norm": 2.3821445970295216, + "learning_rate": 1.4073269321026342e-05, + "loss": 3.9484, + "mean_token_accuracy": 0.3211693540215492, + "step": 8900 + }, + { + "epoch": 0.4298167242170333, + "eval_runtime": 7.783, + "eval_samples_per_second": 379.545, + "eval_steps_per_second": 23.77, + "step": 8900 + }, + { + "epoch": 0.430299664356603, + "grad_norm": 2.401569834089016, + "learning_rate": 1.4057866913594092e-05, + "loss": 3.9688, + "mean_token_accuracy": 0.3162298396229744, + "step": 8910 + }, + { + "epoch": 0.43078260449617267, + "grad_norm": 2.154146649871768, + "learning_rate": 1.404245297325862e-05, + "loss": 3.9457, + "mean_token_accuracy": 0.31804435700178146, + "step": 8920 + }, + { + "epoch": 0.4312655446357424, + "grad_norm": 2.3684647515183177, + "learning_rate": 1.4027027543828043e-05, + "loss": 3.9621, + "mean_token_accuracy": 0.30473790168762205, + "step": 8930 + }, + { + "epoch": 0.4317484847753121, + "grad_norm": 2.3698024233540362, + "learning_rate": 1.4011590669143112e-05, + "loss": 3.957, + "mean_token_accuracy": 0.3072580635547638, + "step": 8940 + }, + { + "epoch": 0.4322314249148818, + "grad_norm": 2.214949557739117, + "learning_rate": 1.3996142393077128e-05, + "loss": 3.9004, + "mean_token_accuracy": 0.32247983664274216, + "step": 8950 + }, + { + "epoch": 0.4327143650544515, + "grad_norm": 2.2609835092620583, + "learning_rate": 1.3980682759535784e-05, + "loss": 4.0207, + "mean_token_accuracy": 0.3058467753231525, + "step": 8960 + }, + { + "epoch": 0.4331973051940212, + "grad_norm": 2.276536125550772, + "learning_rate": 1.3965211812457055e-05, + "loss": 3.9355, + "mean_token_accuracy": 0.316129033267498, + "step": 8970 + }, + { + "epoch": 0.4336802453335909, + "grad_norm": 2.2358364256812724, + "learning_rate": 1.3949729595811077e-05, + "loss": 3.982, + "mean_token_accuracy": 0.3171370968222618, + "step": 8980 + }, + { + "epoch": 0.43416318547316063, + "grad_norm": 2.1560697570679688, + "learning_rate": 1.3934236153600002e-05, + "loss": 3.9379, + "mean_token_accuracy": 0.31401209384202955, + "step": 8990 + }, + { + "epoch": 0.4346461256127303, + "grad_norm": 2.1936154624335344, + "learning_rate": 1.3918731529857904e-05, + "loss": 3.9953, + "mean_token_accuracy": 0.3138104841113091, + "step": 9000 + }, + { + "epoch": 0.4346461256127303, + "eval_runtime": 7.7834, + "eval_samples_per_second": 379.528, + "eval_steps_per_second": 23.769, + "step": 9000 + }, + { + "epoch": 0.4351290657523, + "grad_norm": 2.1575081284022177, + "learning_rate": 1.3903215768650609e-05, + "loss": 3.9266, + "mean_token_accuracy": 0.319556450843811, + "step": 9010 + }, + { + "epoch": 0.4356120058918697, + "grad_norm": 2.345633670110874, + "learning_rate": 1.3887688914075628e-05, + "loss": 3.9949, + "mean_token_accuracy": 0.3073588714003563, + "step": 9020 + }, + { + "epoch": 0.4360949460314394, + "grad_norm": 2.3863605971117625, + "learning_rate": 1.387215101026198e-05, + "loss": 4.0414, + "mean_token_accuracy": 0.3067540302872658, + "step": 9030 + }, + { + "epoch": 0.4365778861710091, + "grad_norm": 2.232717047721671, + "learning_rate": 1.3856602101370095e-05, + "loss": 3.9133, + "mean_token_accuracy": 0.3209677428007126, + "step": 9040 + }, + { + "epoch": 0.4370608263105788, + "grad_norm": 2.267300866113889, + "learning_rate": 1.384104223159168e-05, + "loss": 3.9359, + "mean_token_accuracy": 0.31844758093357084, + "step": 9050 + }, + { + "epoch": 0.43754376645014853, + "grad_norm": 2.4453508923457736, + "learning_rate": 1.3825471445149589e-05, + "loss": 4.0, + "mean_token_accuracy": 0.3066532239317894, + "step": 9060 + }, + { + "epoch": 0.4380267065897182, + "grad_norm": 2.2267880358266043, + "learning_rate": 1.380988978629771e-05, + "loss": 3.9133, + "mean_token_accuracy": 0.322379033267498, + "step": 9070 + }, + { + "epoch": 0.4385096467292879, + "grad_norm": 2.384098935035482, + "learning_rate": 1.379429729932083e-05, + "loss": 4.0133, + "mean_token_accuracy": 0.3161290273070335, + "step": 9080 + }, + { + "epoch": 0.4389925868688576, + "grad_norm": 2.303557148223578, + "learning_rate": 1.3778694028534498e-05, + "loss": 3.941, + "mean_token_accuracy": 0.3209677428007126, + "step": 9090 + }, + { + "epoch": 0.4394755270084273, + "grad_norm": 2.3744461420478533, + "learning_rate": 1.3763080018284933e-05, + "loss": 3.9652, + "mean_token_accuracy": 0.3171370968222618, + "step": 9100 + }, + { + "epoch": 0.4394755270084273, + "eval_runtime": 7.7948, + "eval_samples_per_second": 378.969, + "eval_steps_per_second": 23.734, + "step": 9100 + }, + { + "epoch": 0.439958467147997, + "grad_norm": 2.394834932677048, + "learning_rate": 1.3747455312948866e-05, + "loss": 4.0305, + "mean_token_accuracy": 0.3071572557091713, + "step": 9110 + }, + { + "epoch": 0.4404414072875667, + "grad_norm": 2.172997247257797, + "learning_rate": 1.3731819956933419e-05, + "loss": 3.8891, + "mean_token_accuracy": 0.3120967745780945, + "step": 9120 + }, + { + "epoch": 0.44092434742713643, + "grad_norm": 2.3921038051430323, + "learning_rate": 1.3716173994675995e-05, + "loss": 3.9277, + "mean_token_accuracy": 0.30856855064630506, + "step": 9130 + }, + { + "epoch": 0.4414072875667061, + "grad_norm": 2.380354175518894, + "learning_rate": 1.370051747064413e-05, + "loss": 3.9543, + "mean_token_accuracy": 0.3088709689676762, + "step": 9140 + }, + { + "epoch": 0.4418902277062758, + "grad_norm": 2.2681764924645833, + "learning_rate": 1.3684850429335392e-05, + "loss": 3.9375, + "mean_token_accuracy": 0.3178427413105965, + "step": 9150 + }, + { + "epoch": 0.4423731678458455, + "grad_norm": 2.231651451929135, + "learning_rate": 1.3669172915277228e-05, + "loss": 3.9336, + "mean_token_accuracy": 0.3242943525314331, + "step": 9160 + }, + { + "epoch": 0.4428561079854152, + "grad_norm": 2.2826170704440396, + "learning_rate": 1.3653484973026854e-05, + "loss": 3.959, + "mean_token_accuracy": 0.31703629195690153, + "step": 9170 + }, + { + "epoch": 0.4433390481249849, + "grad_norm": 2.2816822868383797, + "learning_rate": 1.3637786647171122e-05, + "loss": 3.9797, + "mean_token_accuracy": 0.3132056429982185, + "step": 9180 + }, + { + "epoch": 0.4438219882645546, + "grad_norm": 2.282689470466177, + "learning_rate": 1.3622077982326403e-05, + "loss": 3.9418, + "mean_token_accuracy": 0.3214717760682106, + "step": 9190 + }, + { + "epoch": 0.44430492840412433, + "grad_norm": 2.2627975614159492, + "learning_rate": 1.3606359023138445e-05, + "loss": 3.9309, + "mean_token_accuracy": 0.3224798381328583, + "step": 9200 + }, + { + "epoch": 0.44430492840412433, + "eval_runtime": 7.8104, + "eval_samples_per_second": 378.212, + "eval_steps_per_second": 23.686, + "step": 9200 + }, + { + "epoch": 0.444787868543694, + "grad_norm": 2.2483442824641036, + "learning_rate": 1.359062981428225e-05, + "loss": 3.9121, + "mean_token_accuracy": 0.3189516142010689, + "step": 9210 + }, + { + "epoch": 0.4452708086832637, + "grad_norm": 2.269327754254287, + "learning_rate": 1.3574890400461963e-05, + "loss": 4.0125, + "mean_token_accuracy": 0.31300403326749804, + "step": 9220 + }, + { + "epoch": 0.4457537488228334, + "grad_norm": 2.445299738412008, + "learning_rate": 1.3559140826410724e-05, + "loss": 3.9691, + "mean_token_accuracy": 0.3160282239317894, + "step": 9230 + }, + { + "epoch": 0.4462366889624031, + "grad_norm": 2.2992504773357854, + "learning_rate": 1.354338113689055e-05, + "loss": 3.9277, + "mean_token_accuracy": 0.31834677457809446, + "step": 9240 + }, + { + "epoch": 0.4467196291019728, + "grad_norm": 2.273439600994875, + "learning_rate": 1.3527611376692209e-05, + "loss": 3.9535, + "mean_token_accuracy": 0.3162298396229744, + "step": 9250 + }, + { + "epoch": 0.4472025692415425, + "grad_norm": 2.238553038082488, + "learning_rate": 1.3511831590635091e-05, + "loss": 3.9395, + "mean_token_accuracy": 0.31340725868940356, + "step": 9260 + }, + { + "epoch": 0.44768550938111223, + "grad_norm": 2.225002583957854, + "learning_rate": 1.3496041823567082e-05, + "loss": 3.8457, + "mean_token_accuracy": 0.32419354617595675, + "step": 9270 + }, + { + "epoch": 0.4481684495206819, + "grad_norm": 2.440731510413359, + "learning_rate": 1.348024212036443e-05, + "loss": 3.9398, + "mean_token_accuracy": 0.3142137110233307, + "step": 9280 + }, + { + "epoch": 0.4486513896602516, + "grad_norm": 2.338191980385382, + "learning_rate": 1.3464432525931627e-05, + "loss": 3.9293, + "mean_token_accuracy": 0.31502016335725785, + "step": 9290 + }, + { + "epoch": 0.4491343297998213, + "grad_norm": 2.371082201919232, + "learning_rate": 1.3448613085201278e-05, + "loss": 3.9152, + "mean_token_accuracy": 0.3167338728904724, + "step": 9300 + }, + { + "epoch": 0.4491343297998213, + "eval_runtime": 7.7579, + "eval_samples_per_second": 380.771, + "eval_steps_per_second": 23.847, + "step": 9300 + }, + { + "epoch": 0.449617269939391, + "grad_norm": 2.3399554490922645, + "learning_rate": 1.3432783843133965e-05, + "loss": 3.9305, + "mean_token_accuracy": 0.3200604856014252, + "step": 9310 + }, + { + "epoch": 0.4501002100789607, + "grad_norm": 2.1538933836986507, + "learning_rate": 1.341694484471814e-05, + "loss": 3.9668, + "mean_token_accuracy": 0.31754032224416734, + "step": 9320 + }, + { + "epoch": 0.4505831502185304, + "grad_norm": 2.293289133417718, + "learning_rate": 1.3401096134969969e-05, + "loss": 3.9414, + "mean_token_accuracy": 0.3149193570017815, + "step": 9330 + }, + { + "epoch": 0.45106609035810014, + "grad_norm": 2.326297513006786, + "learning_rate": 1.3385237758933225e-05, + "loss": 3.9285, + "mean_token_accuracy": 0.3254032239317894, + "step": 9340 + }, + { + "epoch": 0.4515490304976698, + "grad_norm": 2.2071577920584455, + "learning_rate": 1.3369369761679156e-05, + "loss": 3.9957, + "mean_token_accuracy": 0.3078629046678543, + "step": 9350 + }, + { + "epoch": 0.4520319706372395, + "grad_norm": 2.241496164141993, + "learning_rate": 1.3353492188306349e-05, + "loss": 3.934, + "mean_token_accuracy": 0.3123991936445236, + "step": 9360 + }, + { + "epoch": 0.4525149107768092, + "grad_norm": 2.3380920777547494, + "learning_rate": 1.3337605083940614e-05, + "loss": 3.9848, + "mean_token_accuracy": 0.31139112561941146, + "step": 9370 + }, + { + "epoch": 0.4529978509163789, + "grad_norm": 2.231851607449588, + "learning_rate": 1.3321708493734844e-05, + "loss": 3.8746, + "mean_token_accuracy": 0.3270161300897598, + "step": 9380 + }, + { + "epoch": 0.45348079105594863, + "grad_norm": 2.367890514060785, + "learning_rate": 1.3305802462868894e-05, + "loss": 3.9422, + "mean_token_accuracy": 0.31905241757631303, + "step": 9390 + }, + { + "epoch": 0.4539637311955183, + "grad_norm": 2.2455674457503187, + "learning_rate": 1.3289887036549447e-05, + "loss": 4.0, + "mean_token_accuracy": 0.312802417576313, + "step": 9400 + }, + { + "epoch": 0.4539637311955183, + "eval_runtime": 7.7648, + "eval_samples_per_second": 380.434, + "eval_steps_per_second": 23.825, + "step": 9400 + }, + { + "epoch": 0.45444667133508804, + "grad_norm": 2.323137998201826, + "learning_rate": 1.3273962260009895e-05, + "loss": 3.9586, + "mean_token_accuracy": 0.3180443555116653, + "step": 9410 + }, + { + "epoch": 0.4549296114746577, + "grad_norm": 2.398052115634056, + "learning_rate": 1.32580281785102e-05, + "loss": 3.959, + "mean_token_accuracy": 0.31754032522439957, + "step": 9420 + }, + { + "epoch": 0.4554125516142274, + "grad_norm": 2.3919912724781427, + "learning_rate": 1.3242084837336772e-05, + "loss": 3.8762, + "mean_token_accuracy": 0.3264112904667854, + "step": 9430 + }, + { + "epoch": 0.4558954917537971, + "grad_norm": 2.2675232139312205, + "learning_rate": 1.3226132281802335e-05, + "loss": 3.9656, + "mean_token_accuracy": 0.31683467477560046, + "step": 9440 + }, + { + "epoch": 0.4563784318933668, + "grad_norm": 2.228346077251232, + "learning_rate": 1.3210170557245806e-05, + "loss": 4.0035, + "mean_token_accuracy": 0.3098790317773819, + "step": 9450 + }, + { + "epoch": 0.45686137203293653, + "grad_norm": 2.2966577634723664, + "learning_rate": 1.319419970903215e-05, + "loss": 3.9625, + "mean_token_accuracy": 0.3170362889766693, + "step": 9460 + }, + { + "epoch": 0.4573443121725062, + "grad_norm": 2.215148758529441, + "learning_rate": 1.3178219782552282e-05, + "loss": 3.9332, + "mean_token_accuracy": 0.3218749985098839, + "step": 9470 + }, + { + "epoch": 0.45782725231207594, + "grad_norm": 2.2667547610082326, + "learning_rate": 1.3162230823222901e-05, + "loss": 4.0102, + "mean_token_accuracy": 0.3093750014901161, + "step": 9480 + }, + { + "epoch": 0.4583101924516456, + "grad_norm": 2.085511179511814, + "learning_rate": 1.3146232876486384e-05, + "loss": 3.9547, + "mean_token_accuracy": 0.31068548411130903, + "step": 9490 + }, + { + "epoch": 0.4587931325912153, + "grad_norm": 2.263111990078179, + "learning_rate": 1.3130225987810657e-05, + "loss": 3.9461, + "mean_token_accuracy": 0.3170362874865532, + "step": 9500 + }, + { + "epoch": 0.4587931325912153, + "eval_runtime": 7.7917, + "eval_samples_per_second": 379.123, + "eval_steps_per_second": 23.743, + "step": 9500 + }, + { + "epoch": 0.459276072730785, + "grad_norm": 2.2530238073022417, + "learning_rate": 1.3114210202689048e-05, + "loss": 3.934, + "mean_token_accuracy": 0.3177419364452362, + "step": 9510 + }, + { + "epoch": 0.4597590128703547, + "grad_norm": 2.2585304557956176, + "learning_rate": 1.309818556664018e-05, + "loss": 3.9332, + "mean_token_accuracy": 0.322379033267498, + "step": 9520 + }, + { + "epoch": 0.46024195300992443, + "grad_norm": 2.207587094785256, + "learning_rate": 1.308215212520783e-05, + "loss": 3.9406, + "mean_token_accuracy": 0.31370967626571655, + "step": 9530 + }, + { + "epoch": 0.4607248931494941, + "grad_norm": 2.2335953303518257, + "learning_rate": 1.3066109923960794e-05, + "loss": 3.9105, + "mean_token_accuracy": 0.32237903475761415, + "step": 9540 + }, + { + "epoch": 0.46120783328906384, + "grad_norm": 2.3310224784615086, + "learning_rate": 1.3050059008492771e-05, + "loss": 3.9855, + "mean_token_accuracy": 0.31451612859964373, + "step": 9550 + }, + { + "epoch": 0.4616907734286335, + "grad_norm": 2.116490389888414, + "learning_rate": 1.3033999424422228e-05, + "loss": 3.9207, + "mean_token_accuracy": 0.3198588743805885, + "step": 9560 + }, + { + "epoch": 0.46217371356820325, + "grad_norm": 2.2997641313449866, + "learning_rate": 1.3017931217392259e-05, + "loss": 4.0242, + "mean_token_accuracy": 0.3093750014901161, + "step": 9570 + }, + { + "epoch": 0.4626566537077729, + "grad_norm": 2.181443807057096, + "learning_rate": 1.3001854433070478e-05, + "loss": 3.9418, + "mean_token_accuracy": 0.3158266082406044, + "step": 9580 + }, + { + "epoch": 0.4631395938473426, + "grad_norm": 2.403186175938397, + "learning_rate": 1.2985769117148867e-05, + "loss": 3.993, + "mean_token_accuracy": 0.31713709980249405, + "step": 9590 + }, + { + "epoch": 0.46362253398691233, + "grad_norm": 2.2126270345800663, + "learning_rate": 1.2969675315343663e-05, + "loss": 3.9359, + "mean_token_accuracy": 0.3174395188689232, + "step": 9600 + }, + { + "epoch": 0.46362253398691233, + "eval_runtime": 7.7796, + "eval_samples_per_second": 379.71, + "eval_steps_per_second": 23.78, + "step": 9600 + }, + { + "epoch": 0.464105474126482, + "grad_norm": 2.45734268606812, + "learning_rate": 1.2953573073395219e-05, + "loss": 3.9441, + "mean_token_accuracy": 0.3205645173788071, + "step": 9610 + }, + { + "epoch": 0.46458841426605174, + "grad_norm": 2.3243357566422427, + "learning_rate": 1.2937462437067866e-05, + "loss": 3.9609, + "mean_token_accuracy": 0.3123991936445236, + "step": 9620 + }, + { + "epoch": 0.4650713544056214, + "grad_norm": 2.458474031772993, + "learning_rate": 1.2921343452149811e-05, + "loss": 3.8781, + "mean_token_accuracy": 0.33306451588869096, + "step": 9630 + }, + { + "epoch": 0.46555429454519115, + "grad_norm": 2.4502888914113004, + "learning_rate": 1.290521616445297e-05, + "loss": 3.9398, + "mean_token_accuracy": 0.32046370953321457, + "step": 9640 + }, + { + "epoch": 0.4660372346847608, + "grad_norm": 2.298589153856968, + "learning_rate": 1.288908061981287e-05, + "loss": 4.0453, + "mean_token_accuracy": 0.30846774131059645, + "step": 9650 + }, + { + "epoch": 0.4665201748243305, + "grad_norm": 2.278982290215792, + "learning_rate": 1.28729368640885e-05, + "loss": 3.916, + "mean_token_accuracy": 0.3150201618671417, + "step": 9660 + }, + { + "epoch": 0.46700311496390023, + "grad_norm": 2.2573462629551253, + "learning_rate": 1.2856784943162181e-05, + "loss": 3.923, + "mean_token_accuracy": 0.3208669349551201, + "step": 9670 + }, + { + "epoch": 0.4674860551034699, + "grad_norm": 2.396623800295243, + "learning_rate": 1.2840624902939452e-05, + "loss": 3.9633, + "mean_token_accuracy": 0.31512096971273423, + "step": 9680 + }, + { + "epoch": 0.46796899524303964, + "grad_norm": 2.396254030297396, + "learning_rate": 1.282445678934892e-05, + "loss": 3.9676, + "mean_token_accuracy": 0.31774193346500396, + "step": 9690 + }, + { + "epoch": 0.4684519353826093, + "grad_norm": 2.3470138750190097, + "learning_rate": 1.2808280648342134e-05, + "loss": 3.9488, + "mean_token_accuracy": 0.3152217760682106, + "step": 9700 + }, + { + "epoch": 0.4684519353826093, + "eval_runtime": 7.7998, + "eval_samples_per_second": 378.728, + "eval_steps_per_second": 23.719, + "step": 9700 + }, + { + "epoch": 0.46893487552217905, + "grad_norm": 2.3919409766177164, + "learning_rate": 1.279209652589347e-05, + "loss": 3.9918, + "mean_token_accuracy": 0.31602822467684744, + "step": 9710 + }, + { + "epoch": 0.4694178156617487, + "grad_norm": 2.32929020746566, + "learning_rate": 1.2775904467999973e-05, + "loss": 3.9418, + "mean_token_accuracy": 0.31723790615797043, + "step": 9720 + }, + { + "epoch": 0.4699007558013184, + "grad_norm": 2.2356462523592735, + "learning_rate": 1.2759704520681253e-05, + "loss": 3.957, + "mean_token_accuracy": 0.31229838728904724, + "step": 9730 + }, + { + "epoch": 0.47038369594088814, + "grad_norm": 2.275634011665633, + "learning_rate": 1.2743496729979338e-05, + "loss": 3.9527, + "mean_token_accuracy": 0.31683467924594877, + "step": 9740 + }, + { + "epoch": 0.4708666360804578, + "grad_norm": 2.3433856130443513, + "learning_rate": 1.272728114195855e-05, + "loss": 3.9996, + "mean_token_accuracy": 0.3057459682226181, + "step": 9750 + }, + { + "epoch": 0.47134957622002754, + "grad_norm": 2.2118099297941742, + "learning_rate": 1.2711057802705369e-05, + "loss": 3.9645, + "mean_token_accuracy": 0.32076613008975985, + "step": 9760 + }, + { + "epoch": 0.4718325163595972, + "grad_norm": 2.0910877221753332, + "learning_rate": 1.2694826758328303e-05, + "loss": 3.9074, + "mean_token_accuracy": 0.32127016335725783, + "step": 9770 + }, + { + "epoch": 0.47231545649916695, + "grad_norm": 2.305421910998035, + "learning_rate": 1.2678588054957766e-05, + "loss": 3.9648, + "mean_token_accuracy": 0.3132056459784508, + "step": 9780 + }, + { + "epoch": 0.47279839663873663, + "grad_norm": 2.447527686786078, + "learning_rate": 1.2662341738745934e-05, + "loss": 3.9348, + "mean_token_accuracy": 0.3136088699102402, + "step": 9790 + }, + { + "epoch": 0.4732813367783063, + "grad_norm": 2.394433415830251, + "learning_rate": 1.264608785586662e-05, + "loss": 4.0125, + "mean_token_accuracy": 0.3113911300897598, + "step": 9800 + }, + { + "epoch": 0.4732813367783063, + "eval_runtime": 7.7965, + "eval_samples_per_second": 378.886, + "eval_steps_per_second": 23.728, + "step": 9800 + }, + { + "epoch": 0.47376427691787604, + "grad_norm": 2.261396764613608, + "learning_rate": 1.2629826452515146e-05, + "loss": 3.9477, + "mean_token_accuracy": 0.31250000149011614, + "step": 9810 + }, + { + "epoch": 0.4742472170574457, + "grad_norm": 2.310636829001386, + "learning_rate": 1.2613557574908203e-05, + "loss": 3.9453, + "mean_token_accuracy": 0.31481854915618895, + "step": 9820 + }, + { + "epoch": 0.47473015719701545, + "grad_norm": 2.3013881999511248, + "learning_rate": 1.2597281269283727e-05, + "loss": 3.9102, + "mean_token_accuracy": 0.3211693540215492, + "step": 9830 + }, + { + "epoch": 0.4752130973365851, + "grad_norm": 2.1879800213534644, + "learning_rate": 1.2580997581900771e-05, + "loss": 3.9777, + "mean_token_accuracy": 0.309173384308815, + "step": 9840 + }, + { + "epoch": 0.47569603747615485, + "grad_norm": 2.3377746248218534, + "learning_rate": 1.2564706559039355e-05, + "loss": 3.9937, + "mean_token_accuracy": 0.31118951216340063, + "step": 9850 + }, + { + "epoch": 0.47617897761572453, + "grad_norm": 2.1164390877555084, + "learning_rate": 1.2548408247000356e-05, + "loss": 3.9363, + "mean_token_accuracy": 0.3177419364452362, + "step": 9860 + }, + { + "epoch": 0.4766619177552942, + "grad_norm": 2.321940221914267, + "learning_rate": 1.2532102692105368e-05, + "loss": 3.9176, + "mean_token_accuracy": 0.3163306459784508, + "step": 9870 + }, + { + "epoch": 0.47714485789486394, + "grad_norm": 2.3141071723130775, + "learning_rate": 1.2515789940696568e-05, + "loss": 3.9109, + "mean_token_accuracy": 0.322379033267498, + "step": 9880 + }, + { + "epoch": 0.4776277980344336, + "grad_norm": 2.294320812990155, + "learning_rate": 1.2499470039136586e-05, + "loss": 3.9469, + "mean_token_accuracy": 0.3117943540215492, + "step": 9890 + }, + { + "epoch": 0.47811073817400335, + "grad_norm": 2.4159952300234844, + "learning_rate": 1.248314303380837e-05, + "loss": 3.9961, + "mean_token_accuracy": 0.3142137065529823, + "step": 9900 + }, + { + "epoch": 0.47811073817400335, + "eval_runtime": 7.7835, + "eval_samples_per_second": 379.522, + "eval_steps_per_second": 23.768, + "step": 9900 + }, + { + "epoch": 0.478593678313573, + "grad_norm": 2.1712414152335473, + "learning_rate": 1.2466808971115065e-05, + "loss": 3.9656, + "mean_token_accuracy": 0.3052419349551201, + "step": 9910 + }, + { + "epoch": 0.47907661845314276, + "grad_norm": 2.3495237798146036, + "learning_rate": 1.2450467897479868e-05, + "loss": 3.8773, + "mean_token_accuracy": 0.32268145233392714, + "step": 9920 + }, + { + "epoch": 0.47955955859271243, + "grad_norm": 2.2143312103701334, + "learning_rate": 1.2434119859345908e-05, + "loss": 3.943, + "mean_token_accuracy": 0.31653226017951963, + "step": 9930 + }, + { + "epoch": 0.4800424987322821, + "grad_norm": 2.180722180897748, + "learning_rate": 1.24177649031761e-05, + "loss": 3.8539, + "mean_token_accuracy": 0.3285282239317894, + "step": 9940 + }, + { + "epoch": 0.48052543887185184, + "grad_norm": 2.2574779401170684, + "learning_rate": 1.2401403075453029e-05, + "loss": 3.9254, + "mean_token_accuracy": 0.3148185446858406, + "step": 9950 + }, + { + "epoch": 0.4810083790114215, + "grad_norm": 2.1759463002031767, + "learning_rate": 1.23850344226788e-05, + "loss": 3.9422, + "mean_token_accuracy": 0.32006048411130905, + "step": 9960 + }, + { + "epoch": 0.48149131915099125, + "grad_norm": 2.3677686123349577, + "learning_rate": 1.2368658991374926e-05, + "loss": 3.9238, + "mean_token_accuracy": 0.31522177532315254, + "step": 9970 + }, + { + "epoch": 0.4819742592905609, + "grad_norm": 2.175187690610891, + "learning_rate": 1.2352276828082177e-05, + "loss": 3.9129, + "mean_token_accuracy": 0.3162298396229744, + "step": 9980 + }, + { + "epoch": 0.48245719943013066, + "grad_norm": 2.280563514433045, + "learning_rate": 1.2335887979360462e-05, + "loss": 3.85, + "mean_token_accuracy": 0.3267137110233307, + "step": 9990 + }, + { + "epoch": 0.48294013956970033, + "grad_norm": 2.283340175136506, + "learning_rate": 1.2319492491788685e-05, + "loss": 3.9324, + "mean_token_accuracy": 0.31754032224416734, + "step": 10000 + }, + { + "epoch": 0.48294013956970033, + "eval_runtime": 7.7816, + "eval_samples_per_second": 379.615, + "eval_steps_per_second": 23.774, + "step": 10000 + }, + { + "epoch": 0.48342307970927, + "grad_norm": 2.1493249415035782, + "learning_rate": 1.2303090411964627e-05, + "loss": 3.9215, + "mean_token_accuracy": 0.3209677428007126, + "step": 10010 + }, + { + "epoch": 0.48390601984883974, + "grad_norm": 2.4424066440382037, + "learning_rate": 1.2286681786504795e-05, + "loss": 3.9117, + "mean_token_accuracy": 0.3243951588869095, + "step": 10020 + }, + { + "epoch": 0.4843889599884094, + "grad_norm": 2.2762859628611185, + "learning_rate": 1.2270266662044301e-05, + "loss": 3.8781, + "mean_token_accuracy": 0.3213709697127342, + "step": 10030 + }, + { + "epoch": 0.48487190012797915, + "grad_norm": 2.3210041214373103, + "learning_rate": 1.225384508523674e-05, + "loss": 4.0113, + "mean_token_accuracy": 0.30967741906642915, + "step": 10040 + }, + { + "epoch": 0.4853548402675488, + "grad_norm": 2.495495862554966, + "learning_rate": 1.2237417102754025e-05, + "loss": 3.9047, + "mean_token_accuracy": 0.3147177375853062, + "step": 10050 + }, + { + "epoch": 0.48583778040711856, + "grad_norm": 2.339844516174924, + "learning_rate": 1.2220982761286294e-05, + "loss": 3.9547, + "mean_token_accuracy": 0.3172379046678543, + "step": 10060 + }, + { + "epoch": 0.48632072054668823, + "grad_norm": 2.2196723011029826, + "learning_rate": 1.2204542107541746e-05, + "loss": 3.8898, + "mean_token_accuracy": 0.322177417576313, + "step": 10070 + }, + { + "epoch": 0.4868036606862579, + "grad_norm": 2.332789857876502, + "learning_rate": 1.2188095188246524e-05, + "loss": 3.9613, + "mean_token_accuracy": 0.3122983857989311, + "step": 10080 + }, + { + "epoch": 0.48728660082582764, + "grad_norm": 2.259344403341194, + "learning_rate": 1.2171642050144576e-05, + "loss": 3.9891, + "mean_token_accuracy": 0.3157258093357086, + "step": 10090 + }, + { + "epoch": 0.4877695409653973, + "grad_norm": 2.472643268483903, + "learning_rate": 1.2155182739997533e-05, + "loss": 4.0141, + "mean_token_accuracy": 0.31108870804309846, + "step": 10100 + }, + { + "epoch": 0.4877695409653973, + "eval_runtime": 7.7784, + "eval_samples_per_second": 379.772, + "eval_steps_per_second": 23.784, + "step": 10100 + }, + { + "epoch": 0.48825248110496705, + "grad_norm": 2.202889164832667, + "learning_rate": 1.2138717304584555e-05, + "loss": 3.9086, + "mean_token_accuracy": 0.32459677308797835, + "step": 10110 + }, + { + "epoch": 0.4887354212445367, + "grad_norm": 2.182021867116535, + "learning_rate": 1.212224579070222e-05, + "loss": 3.9539, + "mean_token_accuracy": 0.31733871102333067, + "step": 10120 + }, + { + "epoch": 0.48921836138410646, + "grad_norm": 2.265691829180602, + "learning_rate": 1.2105768245164377e-05, + "loss": 3.9965, + "mean_token_accuracy": 0.3166330635547638, + "step": 10130 + }, + { + "epoch": 0.48970130152367614, + "grad_norm": 2.3732455717470695, + "learning_rate": 1.2089284714802021e-05, + "loss": 3.9504, + "mean_token_accuracy": 0.3175403192639351, + "step": 10140 + }, + { + "epoch": 0.49018424166324587, + "grad_norm": 2.201656192904955, + "learning_rate": 1.2072795246463156e-05, + "loss": 3.9332, + "mean_token_accuracy": 0.3156249985098839, + "step": 10150 + }, + { + "epoch": 0.49066718180281554, + "grad_norm": 2.413824988697088, + "learning_rate": 1.2056299887012654e-05, + "loss": 3.9051, + "mean_token_accuracy": 0.3150201618671417, + "step": 10160 + }, + { + "epoch": 0.4911501219423852, + "grad_norm": 2.19345059928571, + "learning_rate": 1.2039798683332145e-05, + "loss": 3.9703, + "mean_token_accuracy": 0.30756048709154127, + "step": 10170 + }, + { + "epoch": 0.49163306208195495, + "grad_norm": 2.169462612804747, + "learning_rate": 1.2023291682319858e-05, + "loss": 3.9129, + "mean_token_accuracy": 0.32046370953321457, + "step": 10180 + }, + { + "epoch": 0.49211600222152463, + "grad_norm": 2.2622377753071885, + "learning_rate": 1.2006778930890503e-05, + "loss": 3.975, + "mean_token_accuracy": 0.31129032373428345, + "step": 10190 + }, + { + "epoch": 0.49259894236109436, + "grad_norm": 2.2386350798000385, + "learning_rate": 1.199026047597513e-05, + "loss": 3.9305, + "mean_token_accuracy": 0.32328629195690156, + "step": 10200 + }, + { + "epoch": 0.49259894236109436, + "eval_runtime": 7.7922, + "eval_samples_per_second": 379.099, + "eval_steps_per_second": 23.742, + "step": 10200 + }, + { + "epoch": 0.49308188250066404, + "grad_norm": 2.3260327986868417, + "learning_rate": 1.1973736364521005e-05, + "loss": 3.982, + "mean_token_accuracy": 0.31098790317773817, + "step": 10210 + }, + { + "epoch": 0.49356482264023377, + "grad_norm": 2.2568709878274, + "learning_rate": 1.1957206643491463e-05, + "loss": 3.9852, + "mean_token_accuracy": 0.31784273982048034, + "step": 10220 + }, + { + "epoch": 0.49404776277980345, + "grad_norm": 2.398317256605666, + "learning_rate": 1.194067135986579e-05, + "loss": 3.9937, + "mean_token_accuracy": 0.3069556444883347, + "step": 10230 + }, + { + "epoch": 0.4945307029193731, + "grad_norm": 2.238419212183823, + "learning_rate": 1.1924130560639072e-05, + "loss": 3.9172, + "mean_token_accuracy": 0.3240927442908287, + "step": 10240 + }, + { + "epoch": 0.49501364305894285, + "grad_norm": 2.26556200416511, + "learning_rate": 1.190758429282208e-05, + "loss": 3.9523, + "mean_token_accuracy": 0.31602822691202165, + "step": 10250 + }, + { + "epoch": 0.49549658319851253, + "grad_norm": 2.200531837278337, + "learning_rate": 1.1891032603441121e-05, + "loss": 3.9422, + "mean_token_accuracy": 0.31834677457809446, + "step": 10260 + }, + { + "epoch": 0.49597952333808226, + "grad_norm": 2.279611482245076, + "learning_rate": 1.1874475539537917e-05, + "loss": 3.9008, + "mean_token_accuracy": 0.32368951439857485, + "step": 10270 + }, + { + "epoch": 0.49646246347765194, + "grad_norm": 2.3156117711132564, + "learning_rate": 1.1857913148169455e-05, + "loss": 3.9438, + "mean_token_accuracy": 0.31451612859964373, + "step": 10280 + }, + { + "epoch": 0.49694540361722167, + "grad_norm": 2.1610310175257283, + "learning_rate": 1.1841345476407877e-05, + "loss": 3.9379, + "mean_token_accuracy": 0.3158266142010689, + "step": 10290 + }, + { + "epoch": 0.49742834375679135, + "grad_norm": 2.2393961935207556, + "learning_rate": 1.1824772571340319e-05, + "loss": 3.993, + "mean_token_accuracy": 0.31683467626571654, + "step": 10300 + }, + { + "epoch": 0.49742834375679135, + "eval_runtime": 7.8092, + "eval_samples_per_second": 378.272, + "eval_steps_per_second": 23.69, + "step": 10300 + }, + { + "epoch": 0.497911283896361, + "grad_norm": 2.246921590257132, + "learning_rate": 1.1808194480068798e-05, + "loss": 3.9641, + "mean_token_accuracy": 0.31693548560142515, + "step": 10310 + }, + { + "epoch": 0.49839422403593076, + "grad_norm": 2.266915245346127, + "learning_rate": 1.179161124971007e-05, + "loss": 3.9164, + "mean_token_accuracy": 0.3166330635547638, + "step": 10320 + }, + { + "epoch": 0.49887716417550043, + "grad_norm": 2.3696333857943777, + "learning_rate": 1.1775022927395496e-05, + "loss": 4.0281, + "mean_token_accuracy": 0.3110887110233307, + "step": 10330 + }, + { + "epoch": 0.49936010431507016, + "grad_norm": 2.2861514387756565, + "learning_rate": 1.175842956027091e-05, + "loss": 3.991, + "mean_token_accuracy": 0.31381048262119293, + "step": 10340 + }, + { + "epoch": 0.49984304445463984, + "grad_norm": 2.2991366720026, + "learning_rate": 1.1741831195496478e-05, + "loss": 3.8738, + "mean_token_accuracy": 0.32086693346500395, + "step": 10350 + }, + { + "epoch": 0.5003259845942095, + "grad_norm": 2.3171652950450308, + "learning_rate": 1.1725227880246581e-05, + "loss": 3.9305, + "mean_token_accuracy": 0.31784274280071256, + "step": 10360 + }, + { + "epoch": 0.5008089247337792, + "grad_norm": 2.3714053289075117, + "learning_rate": 1.1708619661709662e-05, + "loss": 3.9137, + "mean_token_accuracy": 0.31834677159786223, + "step": 10370 + }, + { + "epoch": 0.501291864873349, + "grad_norm": 2.466585158068514, + "learning_rate": 1.1692006587088098e-05, + "loss": 3.941, + "mean_token_accuracy": 0.3130040317773819, + "step": 10380 + }, + { + "epoch": 0.5017748050129186, + "grad_norm": 2.2030803067840794, + "learning_rate": 1.1675388703598074e-05, + "loss": 3.8672, + "mean_token_accuracy": 0.32167338877916335, + "step": 10390 + }, + { + "epoch": 0.5022577451524883, + "grad_norm": 2.2053148137272016, + "learning_rate": 1.1658766058469437e-05, + "loss": 3.9215, + "mean_token_accuracy": 0.32217742055654525, + "step": 10400 + }, + { + "epoch": 0.5022577451524883, + "eval_runtime": 7.7878, + "eval_samples_per_second": 379.309, + "eval_steps_per_second": 23.755, + "step": 10400 + }, + { + "epoch": 0.5027406852920581, + "grad_norm": 2.4747415131909123, + "learning_rate": 1.1642138698945573e-05, + "loss": 3.932, + "mean_token_accuracy": 0.3148185506463051, + "step": 10410 + }, + { + "epoch": 0.5032236254316278, + "grad_norm": 2.3673901546378464, + "learning_rate": 1.162550667228326e-05, + "loss": 3.8859, + "mean_token_accuracy": 0.31905242055654526, + "step": 10420 + }, + { + "epoch": 0.5037065655711974, + "grad_norm": 2.3899713683933803, + "learning_rate": 1.1608870025752544e-05, + "loss": 3.966, + "mean_token_accuracy": 0.3223790302872658, + "step": 10430 + }, + { + "epoch": 0.5041895057107671, + "grad_norm": 2.298399917558338, + "learning_rate": 1.1592228806636598e-05, + "loss": 3.9344, + "mean_token_accuracy": 0.3208669349551201, + "step": 10440 + }, + { + "epoch": 0.5046724458503369, + "grad_norm": 2.2654227165459977, + "learning_rate": 1.1575583062231599e-05, + "loss": 3.9668, + "mean_token_accuracy": 0.3087701633572578, + "step": 10450 + }, + { + "epoch": 0.5051553859899065, + "grad_norm": 2.3169942952651272, + "learning_rate": 1.1558932839846575e-05, + "loss": 3.9359, + "mean_token_accuracy": 0.31915322691202164, + "step": 10460 + }, + { + "epoch": 0.5056383261294762, + "grad_norm": 2.3090126349842515, + "learning_rate": 1.154227818680329e-05, + "loss": 3.9637, + "mean_token_accuracy": 0.3152217760682106, + "step": 10470 + }, + { + "epoch": 0.506121266269046, + "grad_norm": 2.306503415929606, + "learning_rate": 1.1525619150436095e-05, + "loss": 3.9676, + "mean_token_accuracy": 0.311491933465004, + "step": 10480 + }, + { + "epoch": 0.5066042064086157, + "grad_norm": 2.337187519533768, + "learning_rate": 1.1508955778091795e-05, + "loss": 3.9492, + "mean_token_accuracy": 0.3140120983123779, + "step": 10490 + }, + { + "epoch": 0.5070871465481853, + "grad_norm": 2.3344131919011497, + "learning_rate": 1.1492288117129531e-05, + "loss": 3.9926, + "mean_token_accuracy": 0.3085685446858406, + "step": 10500 + }, + { + "epoch": 0.5070871465481853, + "eval_runtime": 7.8283, + "eval_samples_per_second": 377.347, + "eval_steps_per_second": 23.632, + "step": 10500 + }, + { + "epoch": 0.507570086687755, + "grad_norm": 2.3624904907751696, + "learning_rate": 1.1475616214920622e-05, + "loss": 3.9664, + "mean_token_accuracy": 0.3083669364452362, + "step": 10510 + }, + { + "epoch": 0.5080530268273248, + "grad_norm": 2.2940469285090974, + "learning_rate": 1.145894011884844e-05, + "loss": 3.9941, + "mean_token_accuracy": 0.3200604856014252, + "step": 10520 + }, + { + "epoch": 0.5085359669668944, + "grad_norm": 2.2291193041702595, + "learning_rate": 1.1442259876308288e-05, + "loss": 3.943, + "mean_token_accuracy": 0.31864919513463974, + "step": 10530 + }, + { + "epoch": 0.5090189071064641, + "grad_norm": 2.3136245512748954, + "learning_rate": 1.1425575534707244e-05, + "loss": 3.9, + "mean_token_accuracy": 0.3193548396229744, + "step": 10540 + }, + { + "epoch": 0.5095018472460339, + "grad_norm": 2.3174983672935205, + "learning_rate": 1.1408887141464033e-05, + "loss": 3.9684, + "mean_token_accuracy": 0.30856855064630506, + "step": 10550 + }, + { + "epoch": 0.5099847873856036, + "grad_norm": 2.511282812333674, + "learning_rate": 1.1392194744008914e-05, + "loss": 3.9777, + "mean_token_accuracy": 0.31895161271095274, + "step": 10560 + }, + { + "epoch": 0.5104677275251732, + "grad_norm": 2.2478055922913343, + "learning_rate": 1.1375498389783498e-05, + "loss": 3.934, + "mean_token_accuracy": 0.3179435461759567, + "step": 10570 + }, + { + "epoch": 0.510950667664743, + "grad_norm": 2.3186034837060454, + "learning_rate": 1.1358798126240662e-05, + "loss": 3.9203, + "mean_token_accuracy": 0.32127016186714175, + "step": 10580 + }, + { + "epoch": 0.5114336078043127, + "grad_norm": 2.238581092322491, + "learning_rate": 1.1342094000844388e-05, + "loss": 3.9383, + "mean_token_accuracy": 0.31713709980249405, + "step": 10590 + }, + { + "epoch": 0.5119165479438823, + "grad_norm": 2.3716603791992927, + "learning_rate": 1.1325386061069639e-05, + "loss": 4.0141, + "mean_token_accuracy": 0.3174395188689232, + "step": 10600 + }, + { + "epoch": 0.5119165479438823, + "eval_runtime": 7.8217, + "eval_samples_per_second": 377.666, + "eval_steps_per_second": 23.652, + "step": 10600 + }, + { + "epoch": 0.512399488083452, + "grad_norm": 2.3212554081107797, + "learning_rate": 1.1308674354402207e-05, + "loss": 4.0004, + "mean_token_accuracy": 0.31411290168762207, + "step": 10610 + }, + { + "epoch": 0.5128824282230218, + "grad_norm": 2.303960307323921, + "learning_rate": 1.12919589283386e-05, + "loss": 3.9234, + "mean_token_accuracy": 0.3179435521364212, + "step": 10620 + }, + { + "epoch": 0.5133653683625915, + "grad_norm": 2.2702987121612144, + "learning_rate": 1.1275239830385894e-05, + "loss": 3.9254, + "mean_token_accuracy": 0.32389113008975984, + "step": 10630 + }, + { + "epoch": 0.5138483085021611, + "grad_norm": 2.395402624771453, + "learning_rate": 1.1258517108061598e-05, + "loss": 3.9223, + "mean_token_accuracy": 0.3180443525314331, + "step": 10640 + }, + { + "epoch": 0.5143312486417309, + "grad_norm": 2.4870151406189733, + "learning_rate": 1.124179080889353e-05, + "loss": 3.8977, + "mean_token_accuracy": 0.3171370968222618, + "step": 10650 + }, + { + "epoch": 0.5148141887813006, + "grad_norm": 2.1750747428090214, + "learning_rate": 1.1225060980419661e-05, + "loss": 3.9453, + "mean_token_accuracy": 0.31431451588869097, + "step": 10660 + }, + { + "epoch": 0.5152971289208702, + "grad_norm": 2.287363578196008, + "learning_rate": 1.120832767018801e-05, + "loss": 3.9469, + "mean_token_accuracy": 0.3174395188689232, + "step": 10670 + }, + { + "epoch": 0.5157800690604399, + "grad_norm": 2.4797993128254543, + "learning_rate": 1.1191590925756473e-05, + "loss": 3.984, + "mean_token_accuracy": 0.31562500447034836, + "step": 10680 + }, + { + "epoch": 0.5162630092000097, + "grad_norm": 2.362031912899414, + "learning_rate": 1.117485079469272e-05, + "loss": 3.941, + "mean_token_accuracy": 0.31391128748655317, + "step": 10690 + }, + { + "epoch": 0.5167459493395794, + "grad_norm": 2.2458500672311548, + "learning_rate": 1.1158107324574037e-05, + "loss": 3.948, + "mean_token_accuracy": 0.31280241906642914, + "step": 10700 + }, + { + "epoch": 0.5167459493395794, + "eval_runtime": 7.8, + "eval_samples_per_second": 378.718, + "eval_steps_per_second": 23.718, + "step": 10700 + }, + { + "epoch": 0.517228889479149, + "grad_norm": 2.27954095190706, + "learning_rate": 1.1141360562987206e-05, + "loss": 3.9387, + "mean_token_accuracy": 0.31985886991024015, + "step": 10710 + }, + { + "epoch": 0.5177118296187188, + "grad_norm": 2.3040922937651604, + "learning_rate": 1.112461055752836e-05, + "loss": 3.9059, + "mean_token_accuracy": 0.32076613008975985, + "step": 10720 + }, + { + "epoch": 0.5181947697582885, + "grad_norm": 2.204297655001981, + "learning_rate": 1.110785735580286e-05, + "loss": 3.9332, + "mean_token_accuracy": 0.3213709697127342, + "step": 10730 + }, + { + "epoch": 0.5186777098978581, + "grad_norm": 2.2882252399119505, + "learning_rate": 1.1091101005425135e-05, + "loss": 3.9496, + "mean_token_accuracy": 0.322379033267498, + "step": 10740 + }, + { + "epoch": 0.5191606500374278, + "grad_norm": 2.341400370606141, + "learning_rate": 1.107434155401858e-05, + "loss": 3.9656, + "mean_token_accuracy": 0.3123991906642914, + "step": 10750 + }, + { + "epoch": 0.5196435901769976, + "grad_norm": 2.372411346313528, + "learning_rate": 1.105757904921539e-05, + "loss": 3.9699, + "mean_token_accuracy": 0.3140120983123779, + "step": 10760 + }, + { + "epoch": 0.5201265303165673, + "grad_norm": 2.4067753613053418, + "learning_rate": 1.1040813538656445e-05, + "loss": 3.9688, + "mean_token_accuracy": 0.31411290168762207, + "step": 10770 + }, + { + "epoch": 0.5206094704561369, + "grad_norm": 2.3964796483430413, + "learning_rate": 1.1024045069991172e-05, + "loss": 3.9305, + "mean_token_accuracy": 0.31915322244167327, + "step": 10780 + }, + { + "epoch": 0.5210924105957067, + "grad_norm": 2.2471462878199917, + "learning_rate": 1.1007273690877392e-05, + "loss": 3.9629, + "mean_token_accuracy": 0.31411290168762207, + "step": 10790 + }, + { + "epoch": 0.5215753507352764, + "grad_norm": 2.3748495250402057, + "learning_rate": 1.099049944898121e-05, + "loss": 3.9555, + "mean_token_accuracy": 0.316129033267498, + "step": 10800 + }, + { + "epoch": 0.5215753507352764, + "eval_runtime": 7.8065, + "eval_samples_per_second": 378.402, + "eval_steps_per_second": 23.698, + "step": 10800 + }, + { + "epoch": 0.522058290874846, + "grad_norm": 2.3859897231005394, + "learning_rate": 1.097372239197686e-05, + "loss": 3.9426, + "mean_token_accuracy": 0.31491935551166533, + "step": 10810 + }, + { + "epoch": 0.5225412310144157, + "grad_norm": 2.390124828104098, + "learning_rate": 1.0956942567546583e-05, + "loss": 3.9844, + "mean_token_accuracy": 0.3099798381328583, + "step": 10820 + }, + { + "epoch": 0.5230241711539855, + "grad_norm": 2.442322466247052, + "learning_rate": 1.0940160023380482e-05, + "loss": 3.9586, + "mean_token_accuracy": 0.31955645233392715, + "step": 10830 + }, + { + "epoch": 0.5235071112935552, + "grad_norm": 2.36435218742294, + "learning_rate": 1.0923374807176386e-05, + "loss": 3.9289, + "mean_token_accuracy": 0.32429435551166536, + "step": 10840 + }, + { + "epoch": 0.5239900514331248, + "grad_norm": 2.4045762621804583, + "learning_rate": 1.0906586966639724e-05, + "loss": 3.9348, + "mean_token_accuracy": 0.3165322557091713, + "step": 10850 + }, + { + "epoch": 0.5244729915726946, + "grad_norm": 2.417811215817686, + "learning_rate": 1.0889796549483383e-05, + "loss": 3.9047, + "mean_token_accuracy": 0.3194556474685669, + "step": 10860 + }, + { + "epoch": 0.5249559317122643, + "grad_norm": 2.2478094349703475, + "learning_rate": 1.087300360342757e-05, + "loss": 3.9504, + "mean_token_accuracy": 0.3219758078455925, + "step": 10870 + }, + { + "epoch": 0.5254388718518339, + "grad_norm": 2.2546786410288324, + "learning_rate": 1.0856208176199683e-05, + "loss": 3.8969, + "mean_token_accuracy": 0.3184575974941254, + "step": 10880 + }, + { + "epoch": 0.5259218119914036, + "grad_norm": 2.5084391879776216, + "learning_rate": 1.0839410315534166e-05, + "loss": 3.9516, + "mean_token_accuracy": 0.3171370983123779, + "step": 10890 + }, + { + "epoch": 0.5264047521309734, + "grad_norm": 2.374367668553866, + "learning_rate": 1.0822610069172388e-05, + "loss": 3.9586, + "mean_token_accuracy": 0.31764113157987595, + "step": 10900 + }, + { + "epoch": 0.5264047521309734, + "eval_runtime": 7.8066, + "eval_samples_per_second": 378.396, + "eval_steps_per_second": 23.698, + "step": 10900 + }, + { + "epoch": 0.5268876922705431, + "grad_norm": 2.344279403985828, + "learning_rate": 1.0805807484862491e-05, + "loss": 3.877, + "mean_token_accuracy": 0.33064516335725785, + "step": 10910 + }, + { + "epoch": 0.5273706324101127, + "grad_norm": 2.3718471707951894, + "learning_rate": 1.0789002610359263e-05, + "loss": 3.9363, + "mean_token_accuracy": 0.3161290317773819, + "step": 10920 + }, + { + "epoch": 0.5278535725496825, + "grad_norm": 2.327565851459907, + "learning_rate": 1.0772195493424005e-05, + "loss": 3.9121, + "mean_token_accuracy": 0.32409274131059645, + "step": 10930 + }, + { + "epoch": 0.5283365126892522, + "grad_norm": 2.416244321524799, + "learning_rate": 1.0755386181824386e-05, + "loss": 3.8828, + "mean_token_accuracy": 0.32368951588869094, + "step": 10940 + }, + { + "epoch": 0.5288194528288218, + "grad_norm": 2.3202515296480866, + "learning_rate": 1.0738574723334317e-05, + "loss": 3.9805, + "mean_token_accuracy": 0.30423387289047243, + "step": 10950 + }, + { + "epoch": 0.5293023929683915, + "grad_norm": 2.3575938023043124, + "learning_rate": 1.0721761165733807e-05, + "loss": 3.8828, + "mean_token_accuracy": 0.3261088699102402, + "step": 10960 + }, + { + "epoch": 0.5297853331079613, + "grad_norm": 2.389861043165183, + "learning_rate": 1.0704945556808832e-05, + "loss": 3.9766, + "mean_token_accuracy": 0.32127015888690946, + "step": 10970 + }, + { + "epoch": 0.530268273247531, + "grad_norm": 2.5340142582520104, + "learning_rate": 1.06881279443512e-05, + "loss": 3.907, + "mean_token_accuracy": 0.3184475779533386, + "step": 10980 + }, + { + "epoch": 0.5307512133871006, + "grad_norm": 2.355080602588245, + "learning_rate": 1.0671308376158408e-05, + "loss": 4.0246, + "mean_token_accuracy": 0.30887096375226974, + "step": 10990 + }, + { + "epoch": 0.5312341535266704, + "grad_norm": 2.376495740197977, + "learning_rate": 1.0654486900033518e-05, + "loss": 3.968, + "mean_token_accuracy": 0.3168346777558327, + "step": 11000 + }, + { + "epoch": 0.5312341535266704, + "eval_runtime": 7.7917, + "eval_samples_per_second": 379.12, + "eval_steps_per_second": 23.743, + "step": 11000 + }, + { + "epoch": 0.5317170936662401, + "grad_norm": 2.3428629742516445, + "learning_rate": 1.0637663563785013e-05, + "loss": 3.9023, + "mean_token_accuracy": 0.32268145233392714, + "step": 11010 + }, + { + "epoch": 0.5322000338058098, + "grad_norm": 2.2645437045799346, + "learning_rate": 1.062083841522666e-05, + "loss": 3.884, + "mean_token_accuracy": 0.3223790317773819, + "step": 11020 + }, + { + "epoch": 0.5326829739453794, + "grad_norm": 2.319140422066057, + "learning_rate": 1.0604011502177376e-05, + "loss": 3.8789, + "mean_token_accuracy": 0.3240927457809448, + "step": 11030 + }, + { + "epoch": 0.5331659140849492, + "grad_norm": 2.3837471862106607, + "learning_rate": 1.0587182872461102e-05, + "loss": 3.9473, + "mean_token_accuracy": 0.3145161300897598, + "step": 11040 + }, + { + "epoch": 0.5336488542245189, + "grad_norm": 2.2134495943303603, + "learning_rate": 1.0570352573906641e-05, + "loss": 3.9727, + "mean_token_accuracy": 0.31471773982048035, + "step": 11050 + }, + { + "epoch": 0.5341317943640885, + "grad_norm": 2.228727056930029, + "learning_rate": 1.055352065434756e-05, + "loss": 3.9137, + "mean_token_accuracy": 0.31905242055654526, + "step": 11060 + }, + { + "epoch": 0.5346147345036583, + "grad_norm": 2.3886647426886274, + "learning_rate": 1.0536687161622012e-05, + "loss": 3.8305, + "mean_token_accuracy": 0.3269153222441673, + "step": 11070 + }, + { + "epoch": 0.535097674643228, + "grad_norm": 2.480877813492789, + "learning_rate": 1.0519852143572638e-05, + "loss": 3.9457, + "mean_token_accuracy": 0.31280242204666137, + "step": 11080 + }, + { + "epoch": 0.5355806147827977, + "grad_norm": 2.266460194605464, + "learning_rate": 1.0503015648046402e-05, + "loss": 3.9848, + "mean_token_accuracy": 0.3245967745780945, + "step": 11090 + }, + { + "epoch": 0.5360635549223673, + "grad_norm": 2.374578298133916, + "learning_rate": 1.0486177722894482e-05, + "loss": 3.8977, + "mean_token_accuracy": 0.31965725421905516, + "step": 11100 + }, + { + "epoch": 0.5360635549223673, + "eval_runtime": 7.8063, + "eval_samples_per_second": 378.412, + "eval_steps_per_second": 23.699, + "step": 11100 + }, + { + "epoch": 0.5365464950619371, + "grad_norm": 2.320960704411755, + "learning_rate": 1.0469338415972099e-05, + "loss": 3.9395, + "mean_token_accuracy": 0.32076613008975985, + "step": 11110 + }, + { + "epoch": 0.5370294352015068, + "grad_norm": 2.3530541764856405, + "learning_rate": 1.0452497775138417e-05, + "loss": 3.8898, + "mean_token_accuracy": 0.3242943540215492, + "step": 11120 + }, + { + "epoch": 0.5375123753410764, + "grad_norm": 2.223179542471163, + "learning_rate": 1.0435655848256382e-05, + "loss": 3.9848, + "mean_token_accuracy": 0.321370966732502, + "step": 11130 + }, + { + "epoch": 0.5379953154806462, + "grad_norm": 2.3937974926676833, + "learning_rate": 1.0418812683192603e-05, + "loss": 4.0148, + "mean_token_accuracy": 0.3041330650448799, + "step": 11140 + }, + { + "epoch": 0.5384782556202159, + "grad_norm": 2.375824739824623, + "learning_rate": 1.0401968327817206e-05, + "loss": 3.9188, + "mean_token_accuracy": 0.313508066534996, + "step": 11150 + }, + { + "epoch": 0.5389611957597856, + "grad_norm": 2.1913264553811067, + "learning_rate": 1.0385122830003694e-05, + "loss": 3.9766, + "mean_token_accuracy": 0.31522177457809447, + "step": 11160 + }, + { + "epoch": 0.5394441358993552, + "grad_norm": 2.353812281790984, + "learning_rate": 1.036827623762882e-05, + "loss": 3.9738, + "mean_token_accuracy": 0.31229838728904724, + "step": 11170 + }, + { + "epoch": 0.539927076038925, + "grad_norm": 2.2045320603283427, + "learning_rate": 1.0351428598572453e-05, + "loss": 3.907, + "mean_token_accuracy": 0.32389113306999207, + "step": 11180 + }, + { + "epoch": 0.5404100161784947, + "grad_norm": 2.2653742662058276, + "learning_rate": 1.0334579960717432e-05, + "loss": 3.9461, + "mean_token_accuracy": 0.3148185446858406, + "step": 11190 + }, + { + "epoch": 0.5408929563180643, + "grad_norm": 2.2064719315934798, + "learning_rate": 1.031773037194943e-05, + "loss": 3.8781, + "mean_token_accuracy": 0.3208669349551201, + "step": 11200 + }, + { + "epoch": 0.5408929563180643, + "eval_runtime": 7.8046, + "eval_samples_per_second": 378.493, + "eval_steps_per_second": 23.704, + "step": 11200 + }, + { + "epoch": 0.5413758964576341, + "grad_norm": 2.4647689060259537, + "learning_rate": 1.0300879880156836e-05, + "loss": 3.9629, + "mean_token_accuracy": 0.3076612904667854, + "step": 11210 + }, + { + "epoch": 0.5418588365972038, + "grad_norm": 2.1410016454859697, + "learning_rate": 1.0284028533230593e-05, + "loss": 3.8711, + "mean_token_accuracy": 0.3243951603770256, + "step": 11220 + }, + { + "epoch": 0.5423417767367735, + "grad_norm": 2.387697158122027, + "learning_rate": 1.0267176379064076e-05, + "loss": 3.9371, + "mean_token_accuracy": 0.3259072586894035, + "step": 11230 + }, + { + "epoch": 0.5428247168763431, + "grad_norm": 2.269711835329223, + "learning_rate": 1.0250323465552964e-05, + "loss": 3.9402, + "mean_token_accuracy": 0.32076612710952757, + "step": 11240 + }, + { + "epoch": 0.5433076570159129, + "grad_norm": 2.344636114368095, + "learning_rate": 1.0233469840595083e-05, + "loss": 3.9113, + "mean_token_accuracy": 0.3166330635547638, + "step": 11250 + }, + { + "epoch": 0.5437905971554826, + "grad_norm": 2.2553448211770966, + "learning_rate": 1.0216615552090285e-05, + "loss": 3.9902, + "mean_token_accuracy": 0.3181451603770256, + "step": 11260 + }, + { + "epoch": 0.5442735372950522, + "grad_norm": 2.240013445261615, + "learning_rate": 1.0199760647940308e-05, + "loss": 3.9777, + "mean_token_accuracy": 0.30897177308797835, + "step": 11270 + }, + { + "epoch": 0.544756477434622, + "grad_norm": 2.3231306384272377, + "learning_rate": 1.0182905176048643e-05, + "loss": 3.9523, + "mean_token_accuracy": 0.31794354915618894, + "step": 11280 + }, + { + "epoch": 0.5452394175741917, + "grad_norm": 2.341563149896763, + "learning_rate": 1.0166049184320386e-05, + "loss": 3.9375, + "mean_token_accuracy": 0.30816532075405123, + "step": 11290 + }, + { + "epoch": 0.5457223577137614, + "grad_norm": 2.3258022595342105, + "learning_rate": 1.0149192720662122e-05, + "loss": 3.9734, + "mean_token_accuracy": 0.3102822571992874, + "step": 11300 + }, + { + "epoch": 0.5457223577137614, + "eval_runtime": 7.8043, + "eval_samples_per_second": 378.507, + "eval_steps_per_second": 23.705, + "step": 11300 + }, + { + "epoch": 0.546205297853331, + "grad_norm": 2.40866319180267, + "learning_rate": 1.0132335832981765e-05, + "loss": 3.9434, + "mean_token_accuracy": 0.3118951603770256, + "step": 11310 + }, + { + "epoch": 0.5466882379929008, + "grad_norm": 2.3244668549304497, + "learning_rate": 1.0115478569188448e-05, + "loss": 3.9926, + "mean_token_accuracy": 0.31693548411130906, + "step": 11320 + }, + { + "epoch": 0.5471711781324705, + "grad_norm": 2.3252721244410735, + "learning_rate": 1.0098620977192356e-05, + "loss": 3.9047, + "mean_token_accuracy": 0.3171370968222618, + "step": 11330 + }, + { + "epoch": 0.5476541182720401, + "grad_norm": 2.3901838842617176, + "learning_rate": 1.0081763104904625e-05, + "loss": 3.898, + "mean_token_accuracy": 0.3221774220466614, + "step": 11340 + }, + { + "epoch": 0.5481370584116099, + "grad_norm": 2.2742395633915318, + "learning_rate": 1.006490500023717e-05, + "loss": 3.9441, + "mean_token_accuracy": 0.3181451603770256, + "step": 11350 + }, + { + "epoch": 0.5486199985511796, + "grad_norm": 2.345525038506473, + "learning_rate": 1.0048046711102584e-05, + "loss": 3.9562, + "mean_token_accuracy": 0.31350806057453157, + "step": 11360 + }, + { + "epoch": 0.5491029386907493, + "grad_norm": 2.3075893914563137, + "learning_rate": 1.0031188285413969e-05, + "loss": 3.9691, + "mean_token_accuracy": 0.3171370968222618, + "step": 11370 + }, + { + "epoch": 0.549585878830319, + "grad_norm": 2.4733962445569393, + "learning_rate": 1.0014329771084822e-05, + "loss": 4.0105, + "mean_token_accuracy": 0.3080645173788071, + "step": 11380 + }, + { + "epoch": 0.5500688189698887, + "grad_norm": 2.3411508446767733, + "learning_rate": 9.997471216028893e-06, + "loss": 3.9012, + "mean_token_accuracy": 0.3288306474685669, + "step": 11390 + }, + { + "epoch": 0.5505517591094584, + "grad_norm": 2.676127249800124, + "learning_rate": 9.980612668160046e-06, + "loss": 3.8801, + "mean_token_accuracy": 0.32066532373428347, + "step": 11400 + }, + { + "epoch": 0.5505517591094584, + "eval_runtime": 7.7952, + "eval_samples_per_second": 378.953, + "eval_steps_per_second": 23.733, + "step": 11400 + }, + { + "epoch": 0.551034699249028, + "grad_norm": 2.250586146003107, + "learning_rate": 9.963754175392124e-06, + "loss": 3.8809, + "mean_token_accuracy": 0.3181451603770256, + "step": 11410 + }, + { + "epoch": 0.5515176393885978, + "grad_norm": 2.3292220907663443, + "learning_rate": 9.946895785638814e-06, + "loss": 3.8988, + "mean_token_accuracy": 0.3243951603770256, + "step": 11420 + }, + { + "epoch": 0.5520005795281675, + "grad_norm": 2.217312220725214, + "learning_rate": 9.930037546813513e-06, + "loss": 3.9523, + "mean_token_accuracy": 0.3222782269120216, + "step": 11430 + }, + { + "epoch": 0.5524835196677372, + "grad_norm": 2.3205444966822455, + "learning_rate": 9.913179506829182e-06, + "loss": 3.893, + "mean_token_accuracy": 0.31622983515262604, + "step": 11440 + }, + { + "epoch": 0.5529664598073069, + "grad_norm": 2.41816032361089, + "learning_rate": 9.896321713598222e-06, + "loss": 3.9715, + "mean_token_accuracy": 0.3160282224416733, + "step": 11450 + }, + { + "epoch": 0.5534493999468766, + "grad_norm": 2.396254560045998, + "learning_rate": 9.879464215032337e-06, + "loss": 3.9297, + "mean_token_accuracy": 0.3211693525314331, + "step": 11460 + }, + { + "epoch": 0.5539323400864463, + "grad_norm": 2.2817830440751825, + "learning_rate": 9.862607059042381e-06, + "loss": 3.9281, + "mean_token_accuracy": 0.3254032269120216, + "step": 11470 + }, + { + "epoch": 0.5544152802260159, + "grad_norm": 2.325569099250059, + "learning_rate": 9.84575029353825e-06, + "loss": 3.9148, + "mean_token_accuracy": 0.3155241906642914, + "step": 11480 + }, + { + "epoch": 0.5548982203655857, + "grad_norm": 2.49117197996408, + "learning_rate": 9.828893966428712e-06, + "loss": 3.9949, + "mean_token_accuracy": 0.31209677159786225, + "step": 11490 + }, + { + "epoch": 0.5553811605051554, + "grad_norm": 2.4464685178320598, + "learning_rate": 9.812038125621308e-06, + "loss": 3.9898, + "mean_token_accuracy": 0.3132056504487991, + "step": 11500 + }, + { + "epoch": 0.5553811605051554, + "eval_runtime": 7.8159, + "eval_samples_per_second": 377.946, + "eval_steps_per_second": 23.67, + "step": 11500 + }, + { + "epoch": 0.5558641006447251, + "grad_norm": 2.4246638993009784, + "learning_rate": 9.795182819022182e-06, + "loss": 3.9281, + "mean_token_accuracy": 0.3229838728904724, + "step": 11510 + }, + { + "epoch": 0.5563470407842948, + "grad_norm": 2.459596532539133, + "learning_rate": 9.77832809453597e-06, + "loss": 3.9813, + "mean_token_accuracy": 0.30524193346500395, + "step": 11520 + }, + { + "epoch": 0.5568299809238645, + "grad_norm": 2.1643566153937845, + "learning_rate": 9.761474000065649e-06, + "loss": 3.8715, + "mean_token_accuracy": 0.32106854766607285, + "step": 11530 + }, + { + "epoch": 0.5573129210634342, + "grad_norm": 2.412486105184616, + "learning_rate": 9.744620583512403e-06, + "loss": 3.9602, + "mean_token_accuracy": 0.3075604856014252, + "step": 11540 + }, + { + "epoch": 0.5577958612030038, + "grad_norm": 2.2652121668305085, + "learning_rate": 9.727767892775491e-06, + "loss": 3.923, + "mean_token_accuracy": 0.3234879031777382, + "step": 11550 + }, + { + "epoch": 0.5582788013425736, + "grad_norm": 2.4037233821602926, + "learning_rate": 9.710915975752116e-06, + "loss": 3.9438, + "mean_token_accuracy": 0.31743951588869096, + "step": 11560 + }, + { + "epoch": 0.5587617414821433, + "grad_norm": 2.3119405511393274, + "learning_rate": 9.694064880337267e-06, + "loss": 3.9055, + "mean_token_accuracy": 0.3221774235367775, + "step": 11570 + }, + { + "epoch": 0.559244681621713, + "grad_norm": 2.3914437892968476, + "learning_rate": 9.677214654423618e-06, + "loss": 3.9312, + "mean_token_accuracy": 0.3207661285996437, + "step": 11580 + }, + { + "epoch": 0.5597276217612827, + "grad_norm": 2.600317317789711, + "learning_rate": 9.660365345901351e-06, + "loss": 3.9789, + "mean_token_accuracy": 0.31885080635547636, + "step": 11590 + }, + { + "epoch": 0.5602105619008524, + "grad_norm": 2.3190814614590214, + "learning_rate": 9.643517002658055e-06, + "loss": 3.902, + "mean_token_accuracy": 0.3232862904667854, + "step": 11600 + }, + { + "epoch": 0.5602105619008524, + "eval_runtime": 7.8061, + "eval_samples_per_second": 378.422, + "eval_steps_per_second": 23.699, + "step": 11600 + }, + { + "epoch": 0.5606935020404221, + "grad_norm": 2.48315326365775, + "learning_rate": 9.62666967257857e-06, + "loss": 3.9438, + "mean_token_accuracy": 0.3162298411130905, + "step": 11610 + }, + { + "epoch": 0.5611764421799917, + "grad_norm": 2.3032605096738896, + "learning_rate": 9.609823403544858e-06, + "loss": 3.8871, + "mean_token_accuracy": 0.3181451618671417, + "step": 11620 + }, + { + "epoch": 0.5616593823195615, + "grad_norm": 2.313723676072136, + "learning_rate": 9.592978243435867e-06, + "loss": 3.9109, + "mean_token_accuracy": 0.31915322691202164, + "step": 11630 + }, + { + "epoch": 0.5621423224591312, + "grad_norm": 2.492365331637608, + "learning_rate": 9.576134240127387e-06, + "loss": 3.9258, + "mean_token_accuracy": 0.3150201618671417, + "step": 11640 + }, + { + "epoch": 0.5626252625987009, + "grad_norm": 2.3474685174521626, + "learning_rate": 9.55929144149193e-06, + "loss": 3.9117, + "mean_token_accuracy": 0.3222782239317894, + "step": 11650 + }, + { + "epoch": 0.5631082027382706, + "grad_norm": 2.1915421526225205, + "learning_rate": 9.54244989539857e-06, + "loss": 3.9324, + "mean_token_accuracy": 0.31995967775583267, + "step": 11660 + }, + { + "epoch": 0.5635911428778403, + "grad_norm": 2.4628609017806293, + "learning_rate": 9.525609649712838e-06, + "loss": 3.993, + "mean_token_accuracy": 0.32076613008975985, + "step": 11670 + }, + { + "epoch": 0.56407408301741, + "grad_norm": 2.497022895162499, + "learning_rate": 9.508770752296557e-06, + "loss": 3.9898, + "mean_token_accuracy": 0.315322582423687, + "step": 11680 + }, + { + "epoch": 0.5645570231569796, + "grad_norm": 2.3586812171901554, + "learning_rate": 9.491933251007723e-06, + "loss": 3.916, + "mean_token_accuracy": 0.3128024205565453, + "step": 11690 + }, + { + "epoch": 0.5650399632965494, + "grad_norm": 2.18913763633722, + "learning_rate": 9.475097193700362e-06, + "loss": 3.8754, + "mean_token_accuracy": 0.33034273982048035, + "step": 11700 + }, + { + "epoch": 0.5650399632965494, + "eval_runtime": 7.8115, + "eval_samples_per_second": 378.163, + "eval_steps_per_second": 23.683, + "step": 11700 + }, + { + "epoch": 0.5655229034361191, + "grad_norm": 2.3360906122348397, + "learning_rate": 9.4582626282244e-06, + "loss": 3.9098, + "mean_token_accuracy": 0.3188508078455925, + "step": 11710 + }, + { + "epoch": 0.5660058435756888, + "grad_norm": 2.480090500671984, + "learning_rate": 9.441429602425518e-06, + "loss": 3.9766, + "mean_token_accuracy": 0.31572580337524414, + "step": 11720 + }, + { + "epoch": 0.5664887837152585, + "grad_norm": 2.3157149068680547, + "learning_rate": 9.42459816414502e-06, + "loss": 4.0152, + "mean_token_accuracy": 0.31491935551166533, + "step": 11730 + }, + { + "epoch": 0.5669717238548282, + "grad_norm": 2.4681760983235583, + "learning_rate": 9.407768361219707e-06, + "loss": 3.8789, + "mean_token_accuracy": 0.32893145084381104, + "step": 11740 + }, + { + "epoch": 0.5674546639943979, + "grad_norm": 2.163160934296099, + "learning_rate": 9.390940241481722e-06, + "loss": 3.9195, + "mean_token_accuracy": 0.32207661122083664, + "step": 11750 + }, + { + "epoch": 0.5679376041339675, + "grad_norm": 2.3419831974967527, + "learning_rate": 9.374113852758432e-06, + "loss": 3.884, + "mean_token_accuracy": 0.3201612919569016, + "step": 11760 + }, + { + "epoch": 0.5684205442735373, + "grad_norm": 2.280715914252699, + "learning_rate": 9.357289242872277e-06, + "loss": 3.8645, + "mean_token_accuracy": 0.3285282254219055, + "step": 11770 + }, + { + "epoch": 0.568903484413107, + "grad_norm": 2.3055525311385434, + "learning_rate": 9.34046645964065e-06, + "loss": 3.9184, + "mean_token_accuracy": 0.31995967775583267, + "step": 11780 + }, + { + "epoch": 0.5693864245526767, + "grad_norm": 2.4661423209753583, + "learning_rate": 9.323645550875743e-06, + "loss": 3.9766, + "mean_token_accuracy": 0.3115927428007126, + "step": 11790 + }, + { + "epoch": 0.5698693646922464, + "grad_norm": 2.548462482871013, + "learning_rate": 9.30682656438443e-06, + "loss": 3.9113, + "mean_token_accuracy": 0.3269153252243996, + "step": 11800 + }, + { + "epoch": 0.5698693646922464, + "eval_runtime": 7.7874, + "eval_samples_per_second": 379.333, + "eval_steps_per_second": 23.756, + "step": 11800 + }, + { + "epoch": 0.5703523048318161, + "grad_norm": 2.392113069324535, + "learning_rate": 9.290009547968111e-06, + "loss": 3.9367, + "mean_token_accuracy": 0.3148185461759567, + "step": 11810 + }, + { + "epoch": 0.5708352449713858, + "grad_norm": 2.3701005453591617, + "learning_rate": 9.2731945494226e-06, + "loss": 3.9137, + "mean_token_accuracy": 0.3216733857989311, + "step": 11820 + }, + { + "epoch": 0.5713181851109554, + "grad_norm": 2.4702942758554247, + "learning_rate": 9.256381616537958e-06, + "loss": 3.9223, + "mean_token_accuracy": 0.32379032373428346, + "step": 11830 + }, + { + "epoch": 0.5718011252505252, + "grad_norm": 2.325168256903362, + "learning_rate": 9.2395707970984e-06, + "loss": 3.9918, + "mean_token_accuracy": 0.3133064478635788, + "step": 11840 + }, + { + "epoch": 0.5722840653900949, + "grad_norm": 2.485310226148532, + "learning_rate": 9.222762138882113e-06, + "loss": 3.9906, + "mean_token_accuracy": 0.3068548396229744, + "step": 11850 + }, + { + "epoch": 0.5727670055296646, + "grad_norm": 2.377160008463382, + "learning_rate": 9.205955689661144e-06, + "loss": 3.9418, + "mean_token_accuracy": 0.32197580486536026, + "step": 11860 + }, + { + "epoch": 0.5732499456692343, + "grad_norm": 2.3302635537296985, + "learning_rate": 9.189151497201273e-06, + "loss": 3.8973, + "mean_token_accuracy": 0.319556450843811, + "step": 11870 + }, + { + "epoch": 0.573732885808804, + "grad_norm": 2.422412499545331, + "learning_rate": 9.172349609261859e-06, + "loss": 3.9164, + "mean_token_accuracy": 0.3197580620646477, + "step": 11880 + }, + { + "epoch": 0.5742158259483737, + "grad_norm": 2.2368465530388772, + "learning_rate": 9.155550073595712e-06, + "loss": 3.9434, + "mean_token_accuracy": 0.31703629344701767, + "step": 11890 + }, + { + "epoch": 0.5746987660879433, + "grad_norm": 2.457396731204454, + "learning_rate": 9.138752937948953e-06, + "loss": 3.9305, + "mean_token_accuracy": 0.3182459697127342, + "step": 11900 + }, + { + "epoch": 0.5746987660879433, + "eval_runtime": 7.8237, + "eval_samples_per_second": 377.572, + "eval_steps_per_second": 23.646, + "step": 11900 + }, + { + "epoch": 0.5751817062275131, + "grad_norm": 2.4210992392868467, + "learning_rate": 9.121958250060889e-06, + "loss": 3.9445, + "mean_token_accuracy": 0.32066532224416733, + "step": 11910 + }, + { + "epoch": 0.5756646463670828, + "grad_norm": 2.3403407054192464, + "learning_rate": 9.105166057663864e-06, + "loss": 3.8945, + "mean_token_accuracy": 0.31844758093357084, + "step": 11920 + }, + { + "epoch": 0.5761475865066525, + "grad_norm": 2.2856354948411397, + "learning_rate": 9.088376408483137e-06, + "loss": 3.984, + "mean_token_accuracy": 0.3168346777558327, + "step": 11930 + }, + { + "epoch": 0.5766305266462222, + "grad_norm": 2.4176027554030255, + "learning_rate": 9.071589350236727e-06, + "loss": 3.8922, + "mean_token_accuracy": 0.3244959682226181, + "step": 11940 + }, + { + "epoch": 0.5771134667857919, + "grad_norm": 2.3349999288145424, + "learning_rate": 9.054804930635304e-06, + "loss": 3.8887, + "mean_token_accuracy": 0.3199596792459488, + "step": 11950 + }, + { + "epoch": 0.5775964069253616, + "grad_norm": 2.326350755884072, + "learning_rate": 9.038023197382023e-06, + "loss": 3.8801, + "mean_token_accuracy": 0.3221451297402382, + "step": 11960 + }, + { + "epoch": 0.5780793470649312, + "grad_norm": 2.406344004202903, + "learning_rate": 9.021244198172421e-06, + "loss": 3.9602, + "mean_token_accuracy": 0.30957660973072054, + "step": 11970 + }, + { + "epoch": 0.578562287204501, + "grad_norm": 2.3392853291595004, + "learning_rate": 9.00446798069425e-06, + "loss": 3.9234, + "mean_token_accuracy": 0.3192540377378464, + "step": 11980 + }, + { + "epoch": 0.5790452273440707, + "grad_norm": 2.3558877376188363, + "learning_rate": 8.98769459262736e-06, + "loss": 3.9219, + "mean_token_accuracy": 0.3214717715978622, + "step": 11990 + }, + { + "epoch": 0.5795281674836404, + "grad_norm": 2.461191680076343, + "learning_rate": 8.970924081643566e-06, + "loss": 3.9043, + "mean_token_accuracy": 0.3205645173788071, + "step": 12000 + }, + { + "epoch": 0.5795281674836404, + "eval_runtime": 7.8191, + "eval_samples_per_second": 377.794, + "eval_steps_per_second": 23.66, + "step": 12000 + }, + { + "epoch": 0.5800111076232101, + "grad_norm": 2.4685072931996785, + "learning_rate": 8.954156495406497e-06, + "loss": 3.9227, + "mean_token_accuracy": 0.31703629046678544, + "step": 12010 + }, + { + "epoch": 0.5804940477627798, + "grad_norm": 2.333901134265098, + "learning_rate": 8.937391881571479e-06, + "loss": 3.9207, + "mean_token_accuracy": 0.32207661122083664, + "step": 12020 + }, + { + "epoch": 0.5809769879023495, + "grad_norm": 2.346218130006008, + "learning_rate": 8.920630287785377e-06, + "loss": 3.9172, + "mean_token_accuracy": 0.32479838877916334, + "step": 12030 + }, + { + "epoch": 0.5814599280419191, + "grad_norm": 2.345848427930141, + "learning_rate": 8.903871761686487e-06, + "loss": 3.8883, + "mean_token_accuracy": 0.3265120953321457, + "step": 12040 + }, + { + "epoch": 0.5819428681814889, + "grad_norm": 2.4235354075177864, + "learning_rate": 8.887116350904378e-06, + "loss": 3.8875, + "mean_token_accuracy": 0.31955645233392715, + "step": 12050 + }, + { + "epoch": 0.5824258083210586, + "grad_norm": 2.348899335064894, + "learning_rate": 8.87036410305977e-06, + "loss": 3.9398, + "mean_token_accuracy": 0.3224798396229744, + "step": 12060 + }, + { + "epoch": 0.5829087484606283, + "grad_norm": 2.4310303799046076, + "learning_rate": 8.85361506576438e-06, + "loss": 3.9242, + "mean_token_accuracy": 0.3147177428007126, + "step": 12070 + }, + { + "epoch": 0.583391688600198, + "grad_norm": 2.3188114823528476, + "learning_rate": 8.836869286620827e-06, + "loss": 3.9148, + "mean_token_accuracy": 0.31431451588869097, + "step": 12080 + }, + { + "epoch": 0.5838746287397677, + "grad_norm": 2.3406640174114153, + "learning_rate": 8.820126813222436e-06, + "loss": 3.923, + "mean_token_accuracy": 0.3131048381328583, + "step": 12090 + }, + { + "epoch": 0.5843575688793374, + "grad_norm": 2.479455770239232, + "learning_rate": 8.803387693153169e-06, + "loss": 3.9801, + "mean_token_accuracy": 0.3151209712028503, + "step": 12100 + }, + { + "epoch": 0.5843575688793374, + "eval_runtime": 7.8199, + "eval_samples_per_second": 377.752, + "eval_steps_per_second": 23.657, + "step": 12100 + }, + { + "epoch": 0.584840509018907, + "grad_norm": 2.376407361536894, + "learning_rate": 8.78665197398743e-06, + "loss": 3.9539, + "mean_token_accuracy": 0.3192540317773819, + "step": 12110 + }, + { + "epoch": 0.5853234491584768, + "grad_norm": 2.2861915670853734, + "learning_rate": 8.769919703289985e-06, + "loss": 3.9445, + "mean_token_accuracy": 0.31834677457809446, + "step": 12120 + }, + { + "epoch": 0.5858063892980465, + "grad_norm": 2.3229430651807528, + "learning_rate": 8.753190928615773e-06, + "loss": 3.8785, + "mean_token_accuracy": 0.32368951886892317, + "step": 12130 + }, + { + "epoch": 0.5862893294376162, + "grad_norm": 2.3473446680131467, + "learning_rate": 8.736465697509807e-06, + "loss": 3.9035, + "mean_token_accuracy": 0.31229838728904724, + "step": 12140 + }, + { + "epoch": 0.5867722695771859, + "grad_norm": 2.5268537210273183, + "learning_rate": 8.719744057507036e-06, + "loss": 3.9605, + "mean_token_accuracy": 0.31263890117406845, + "step": 12150 + }, + { + "epoch": 0.5872552097167556, + "grad_norm": 2.347070448429671, + "learning_rate": 8.703026056132191e-06, + "loss": 3.9918, + "mean_token_accuracy": 0.312298384308815, + "step": 12160 + }, + { + "epoch": 0.5877381498563253, + "grad_norm": 2.455137886945934, + "learning_rate": 8.686311740899673e-06, + "loss": 3.9012, + "mean_token_accuracy": 0.31794354915618894, + "step": 12170 + }, + { + "epoch": 0.5882210899958951, + "grad_norm": 2.461354202215206, + "learning_rate": 8.669601159313396e-06, + "loss": 3.9309, + "mean_token_accuracy": 0.3152217730879784, + "step": 12180 + }, + { + "epoch": 0.5887040301354647, + "grad_norm": 2.4164386323857747, + "learning_rate": 8.652894358866672e-06, + "loss": 4.0621, + "mean_token_accuracy": 0.30796370953321456, + "step": 12190 + }, + { + "epoch": 0.5891869702750344, + "grad_norm": 2.3317146862006366, + "learning_rate": 8.636191387042055e-06, + "loss": 3.9422, + "mean_token_accuracy": 0.3167338714003563, + "step": 12200 + }, + { + "epoch": 0.5891869702750344, + "eval_runtime": 7.8041, + "eval_samples_per_second": 378.517, + "eval_steps_per_second": 23.705, + "step": 12200 + }, + { + "epoch": 0.5896699104146041, + "grad_norm": 2.382845058326198, + "learning_rate": 8.619492291311232e-06, + "loss": 3.9211, + "mean_token_accuracy": 0.3157753825187683, + "step": 12210 + }, + { + "epoch": 0.5901528505541738, + "grad_norm": 2.3838188339279496, + "learning_rate": 8.602797119134857e-06, + "loss": 3.909, + "mean_token_accuracy": 0.31844758093357084, + "step": 12220 + }, + { + "epoch": 0.5906357906937435, + "grad_norm": 2.2911175279726685, + "learning_rate": 8.586105917962456e-06, + "loss": 3.898, + "mean_token_accuracy": 0.31602822691202165, + "step": 12230 + }, + { + "epoch": 0.5911187308333132, + "grad_norm": 2.5265796074142326, + "learning_rate": 8.56941873523224e-06, + "loss": 3.8953, + "mean_token_accuracy": 0.3108870953321457, + "step": 12240 + }, + { + "epoch": 0.591601670972883, + "grad_norm": 2.4085651802137846, + "learning_rate": 8.552735618371027e-06, + "loss": 3.9387, + "mean_token_accuracy": 0.31895161122083665, + "step": 12250 + }, + { + "epoch": 0.5920846111124526, + "grad_norm": 2.4501739619412195, + "learning_rate": 8.536056614794058e-06, + "loss": 3.9391, + "mean_token_accuracy": 0.31693548560142515, + "step": 12260 + }, + { + "epoch": 0.5925675512520223, + "grad_norm": 2.4531039553366107, + "learning_rate": 8.51938177190489e-06, + "loss": 3.8828, + "mean_token_accuracy": 0.31502016335725785, + "step": 12270 + }, + { + "epoch": 0.593050491391592, + "grad_norm": 2.275677901505879, + "learning_rate": 8.502711137095268e-06, + "loss": 3.948, + "mean_token_accuracy": 0.3147177442908287, + "step": 12280 + }, + { + "epoch": 0.5935334315311617, + "grad_norm": 2.7275202797910327, + "learning_rate": 8.486044757744955e-06, + "loss": 3.9289, + "mean_token_accuracy": 0.3179435461759567, + "step": 12290 + }, + { + "epoch": 0.5940163716707314, + "grad_norm": 2.308175116377804, + "learning_rate": 8.469382681221638e-06, + "loss": 3.8691, + "mean_token_accuracy": 0.32328628897666933, + "step": 12300 + }, + { + "epoch": 0.5940163716707314, + "eval_runtime": 7.8025, + "eval_samples_per_second": 378.596, + "eval_steps_per_second": 23.71, + "step": 12300 + }, + { + "epoch": 0.5944993118103011, + "grad_norm": 2.4988315147767026, + "learning_rate": 8.45272495488076e-06, + "loss": 3.9902, + "mean_token_accuracy": 0.3171370968222618, + "step": 12310 + }, + { + "epoch": 0.5949822519498709, + "grad_norm": 2.336446395859129, + "learning_rate": 8.43607162606542e-06, + "loss": 4.0223, + "mean_token_accuracy": 0.31431451588869097, + "step": 12320 + }, + { + "epoch": 0.5954651920894405, + "grad_norm": 2.3516115918214773, + "learning_rate": 8.419422742106192e-06, + "loss": 3.9191, + "mean_token_accuracy": 0.3229838714003563, + "step": 12330 + }, + { + "epoch": 0.5959481322290102, + "grad_norm": 2.3736666646366906, + "learning_rate": 8.402778350321047e-06, + "loss": 3.9344, + "mean_token_accuracy": 0.32237903475761415, + "step": 12340 + }, + { + "epoch": 0.59643107236858, + "grad_norm": 2.296403879228017, + "learning_rate": 8.386138498015157e-06, + "loss": 3.9012, + "mean_token_accuracy": 0.32056451588869095, + "step": 12350 + }, + { + "epoch": 0.5969140125081496, + "grad_norm": 2.3583058482072237, + "learning_rate": 8.369503232480825e-06, + "loss": 3.9758, + "mean_token_accuracy": 0.3149193570017815, + "step": 12360 + }, + { + "epoch": 0.5973969526477193, + "grad_norm": 2.314194923594278, + "learning_rate": 8.352872600997289e-06, + "loss": 3.9047, + "mean_token_accuracy": 0.31975806355476377, + "step": 12370 + }, + { + "epoch": 0.597879892787289, + "grad_norm": 2.4114810572442185, + "learning_rate": 8.336246650830642e-06, + "loss": 3.8652, + "mean_token_accuracy": 0.32631048560142517, + "step": 12380 + }, + { + "epoch": 0.5983628329268588, + "grad_norm": 2.3430620758953835, + "learning_rate": 8.319625429233649e-06, + "loss": 3.9531, + "mean_token_accuracy": 0.31895161271095274, + "step": 12390 + }, + { + "epoch": 0.5988457730664284, + "grad_norm": 2.3480062433608264, + "learning_rate": 8.303008983445647e-06, + "loss": 3.9781, + "mean_token_accuracy": 0.31451612859964373, + "step": 12400 + }, + { + "epoch": 0.5988457730664284, + "eval_runtime": 7.7997, + "eval_samples_per_second": 378.734, + "eval_steps_per_second": 23.719, + "step": 12400 + }, + { + "epoch": 0.5993287132059981, + "grad_norm": 2.4846036268181964, + "learning_rate": 8.286397360692403e-06, + "loss": 3.9492, + "mean_token_accuracy": 0.3157258078455925, + "step": 12410 + }, + { + "epoch": 0.5998116533455679, + "grad_norm": 2.271801156314681, + "learning_rate": 8.269790608185971e-06, + "loss": 3.9371, + "mean_token_accuracy": 0.31451612859964373, + "step": 12420 + }, + { + "epoch": 0.6002945934851375, + "grad_norm": 2.5306507378807432, + "learning_rate": 8.253188773124565e-06, + "loss": 3.8914, + "mean_token_accuracy": 0.31875, + "step": 12430 + }, + { + "epoch": 0.6007775336247072, + "grad_norm": 2.3723808856815443, + "learning_rate": 8.23659190269242e-06, + "loss": 3.9098, + "mean_token_accuracy": 0.3179435506463051, + "step": 12440 + }, + { + "epoch": 0.6012604737642769, + "grad_norm": 2.440474777149874, + "learning_rate": 8.22000004405967e-06, + "loss": 3.9199, + "mean_token_accuracy": 0.3185483902692795, + "step": 12450 + }, + { + "epoch": 0.6017434139038467, + "grad_norm": 2.4337366972057013, + "learning_rate": 8.203413244382191e-06, + "loss": 4.0355, + "mean_token_accuracy": 0.3094758056104183, + "step": 12460 + }, + { + "epoch": 0.6022263540434163, + "grad_norm": 2.4108682935467383, + "learning_rate": 8.186831550801498e-06, + "loss": 3.9516, + "mean_token_accuracy": 0.3181451618671417, + "step": 12470 + }, + { + "epoch": 0.602709294182986, + "grad_norm": 2.490215018819108, + "learning_rate": 8.17025501044457e-06, + "loss": 3.8852, + "mean_token_accuracy": 0.3260080650448799, + "step": 12480 + }, + { + "epoch": 0.6031922343225558, + "grad_norm": 2.346653145172493, + "learning_rate": 8.153683670423772e-06, + "loss": 3.9492, + "mean_token_accuracy": 0.3160282239317894, + "step": 12490 + }, + { + "epoch": 0.6036751744621254, + "grad_norm": 2.44406010865299, + "learning_rate": 8.137117577836654e-06, + "loss": 3.8703, + "mean_token_accuracy": 0.319254033267498, + "step": 12500 + }, + { + "epoch": 0.6036751744621254, + "eval_runtime": 7.8222, + "eval_samples_per_second": 377.645, + "eval_steps_per_second": 23.651, + "step": 12500 + }, + { + "epoch": 0.6041581146016951, + "grad_norm": 2.293463921830506, + "learning_rate": 8.120556779765886e-06, + "loss": 3.9297, + "mean_token_accuracy": 0.3200604826211929, + "step": 12510 + }, + { + "epoch": 0.6046410547412648, + "grad_norm": 2.3423937214795587, + "learning_rate": 8.10400132327906e-06, + "loss": 3.9344, + "mean_token_accuracy": 0.31633064448833464, + "step": 12520 + }, + { + "epoch": 0.6051239948808346, + "grad_norm": 2.2193064020173416, + "learning_rate": 8.087451255428614e-06, + "loss": 3.941, + "mean_token_accuracy": 0.31481854766607287, + "step": 12530 + }, + { + "epoch": 0.6056069350204042, + "grad_norm": 2.274125797565075, + "learning_rate": 8.070906623251646e-06, + "loss": 3.8508, + "mean_token_accuracy": 0.3283266112208366, + "step": 12540 + }, + { + "epoch": 0.6060898751599739, + "grad_norm": 2.450745719002159, + "learning_rate": 8.054367473769822e-06, + "loss": 3.9395, + "mean_token_accuracy": 0.3186491921544075, + "step": 12550 + }, + { + "epoch": 0.6065728152995437, + "grad_norm": 2.3467240545768635, + "learning_rate": 8.03783385398922e-06, + "loss": 3.9297, + "mean_token_accuracy": 0.3268145158886909, + "step": 12560 + }, + { + "epoch": 0.6070557554391133, + "grad_norm": 2.3832867287564587, + "learning_rate": 8.021305810900198e-06, + "loss": 3.8937, + "mean_token_accuracy": 0.322076615691185, + "step": 12570 + }, + { + "epoch": 0.607538695578683, + "grad_norm": 2.333120906074266, + "learning_rate": 8.004783391477281e-06, + "loss": 3.9285, + "mean_token_accuracy": 0.315625, + "step": 12580 + }, + { + "epoch": 0.6080216357182527, + "grad_norm": 2.637448853043515, + "learning_rate": 7.988266642678983e-06, + "loss": 3.9594, + "mean_token_accuracy": 0.31532258093357085, + "step": 12590 + }, + { + "epoch": 0.6085045758578225, + "grad_norm": 2.378957418572957, + "learning_rate": 7.971755611447732e-06, + "loss": 3.8285, + "mean_token_accuracy": 0.3304435461759567, + "step": 12600 + }, + { + "epoch": 0.6085045758578225, + "eval_runtime": 7.829, + "eval_samples_per_second": 377.316, + "eval_steps_per_second": 23.63, + "step": 12600 + }, + { + "epoch": 0.6089875159973921, + "grad_norm": 2.327415186219916, + "learning_rate": 7.955250344709677e-06, + "loss": 3.9434, + "mean_token_accuracy": 0.3135080635547638, + "step": 12610 + }, + { + "epoch": 0.6094704561369618, + "grad_norm": 2.3591974534040183, + "learning_rate": 7.938750889374614e-06, + "loss": 3.9398, + "mean_token_accuracy": 0.3084677442908287, + "step": 12620 + }, + { + "epoch": 0.6099533962765316, + "grad_norm": 2.385798003771275, + "learning_rate": 7.92225729233579e-06, + "loss": 3.9516, + "mean_token_accuracy": 0.3189516142010689, + "step": 12630 + }, + { + "epoch": 0.6104363364161012, + "grad_norm": 2.4359180069415722, + "learning_rate": 7.90576960046983e-06, + "loss": 3.8957, + "mean_token_accuracy": 0.32177419513463973, + "step": 12640 + }, + { + "epoch": 0.6109192765556709, + "grad_norm": 2.3362767954011403, + "learning_rate": 7.889287860636556e-06, + "loss": 3.9156, + "mean_token_accuracy": 0.3185483872890472, + "step": 12650 + }, + { + "epoch": 0.6114022166952406, + "grad_norm": 2.5328674143723178, + "learning_rate": 7.872812119678893e-06, + "loss": 4.0055, + "mean_token_accuracy": 0.3107862934470177, + "step": 12660 + }, + { + "epoch": 0.6118851568348104, + "grad_norm": 2.5996935950577473, + "learning_rate": 7.856342424422693e-06, + "loss": 3.9906, + "mean_token_accuracy": 0.3160282239317894, + "step": 12670 + }, + { + "epoch": 0.61236809697438, + "grad_norm": 2.3733602776773997, + "learning_rate": 7.839878821676642e-06, + "loss": 3.9418, + "mean_token_accuracy": 0.31743951588869096, + "step": 12680 + }, + { + "epoch": 0.6128510371139497, + "grad_norm": 2.433579799147079, + "learning_rate": 7.823421358232113e-06, + "loss": 3.9031, + "mean_token_accuracy": 0.32338709235191343, + "step": 12690 + }, + { + "epoch": 0.6133339772535195, + "grad_norm": 2.435948421081829, + "learning_rate": 7.806970080863013e-06, + "loss": 3.8687, + "mean_token_accuracy": 0.32893145084381104, + "step": 12700 + }, + { + "epoch": 0.6133339772535195, + "eval_runtime": 7.8186, + "eval_samples_per_second": 377.818, + "eval_steps_per_second": 23.662, + "step": 12700 + }, + { + "epoch": 0.6138169173930891, + "grad_norm": 2.5919961474739925, + "learning_rate": 7.790525036325688e-06, + "loss": 4.0113, + "mean_token_accuracy": 0.31008064150810244, + "step": 12710 + }, + { + "epoch": 0.6142998575326588, + "grad_norm": 2.324324256109048, + "learning_rate": 7.774086271358752e-06, + "loss": 3.9492, + "mean_token_accuracy": 0.31118951439857484, + "step": 12720 + }, + { + "epoch": 0.6147827976722285, + "grad_norm": 2.456467135348635, + "learning_rate": 7.757653832682988e-06, + "loss": 4.0051, + "mean_token_accuracy": 0.31491935402154925, + "step": 12730 + }, + { + "epoch": 0.6152657378117983, + "grad_norm": 2.4413301215839813, + "learning_rate": 7.741227767001178e-06, + "loss": 3.8988, + "mean_token_accuracy": 0.3238911271095276, + "step": 12740 + }, + { + "epoch": 0.6157486779513679, + "grad_norm": 2.4955358813010773, + "learning_rate": 7.724808120998019e-06, + "loss": 3.9805, + "mean_token_accuracy": 0.3141129031777382, + "step": 12750 + }, + { + "epoch": 0.6162316180909376, + "grad_norm": 2.506360797782655, + "learning_rate": 7.708394941339933e-06, + "loss": 3.8996, + "mean_token_accuracy": 0.3160282298922539, + "step": 12760 + }, + { + "epoch": 0.6167145582305074, + "grad_norm": 2.2751606705591367, + "learning_rate": 7.691988274674991e-06, + "loss": 3.8918, + "mean_token_accuracy": 0.3219758063554764, + "step": 12770 + }, + { + "epoch": 0.617197498370077, + "grad_norm": 2.3632981596438487, + "learning_rate": 7.67558816763273e-06, + "loss": 3.9844, + "mean_token_accuracy": 0.3179435506463051, + "step": 12780 + }, + { + "epoch": 0.6176804385096467, + "grad_norm": 2.6603198143339464, + "learning_rate": 7.659194666824065e-06, + "loss": 3.9547, + "mean_token_accuracy": 0.3224798396229744, + "step": 12790 + }, + { + "epoch": 0.6181633786492164, + "grad_norm": 2.4044952755815827, + "learning_rate": 7.642807818841117e-06, + "loss": 3.9102, + "mean_token_accuracy": 0.3218749985098839, + "step": 12800 + }, + { + "epoch": 0.6181633786492164, + "eval_runtime": 7.8228, + "eval_samples_per_second": 377.612, + "eval_steps_per_second": 23.649, + "step": 12800 + }, + { + "epoch": 0.6186463187887862, + "grad_norm": 2.568865982176715, + "learning_rate": 7.626427670257106e-06, + "loss": 3.8984, + "mean_token_accuracy": 0.31965726166963576, + "step": 12810 + }, + { + "epoch": 0.6191292589283558, + "grad_norm": 2.242642250851699, + "learning_rate": 7.610054267626221e-06, + "loss": 3.9457, + "mean_token_accuracy": 0.32066532373428347, + "step": 12820 + }, + { + "epoch": 0.6196121990679255, + "grad_norm": 2.3693172932920716, + "learning_rate": 7.593687657483459e-06, + "loss": 3.9426, + "mean_token_accuracy": 0.32066532224416733, + "step": 12830 + }, + { + "epoch": 0.6200951392074953, + "grad_norm": 2.2068181216476486, + "learning_rate": 7.577327886344532e-06, + "loss": 3.9043, + "mean_token_accuracy": 0.3173387095332146, + "step": 12840 + }, + { + "epoch": 0.6205780793470649, + "grad_norm": 2.429565571275932, + "learning_rate": 7.560975000705697e-06, + "loss": 4.0078, + "mean_token_accuracy": 0.30997983664274215, + "step": 12850 + }, + { + "epoch": 0.6210610194866346, + "grad_norm": 2.418662179857565, + "learning_rate": 7.544629047043661e-06, + "loss": 3.9344, + "mean_token_accuracy": 0.3155241936445236, + "step": 12860 + }, + { + "epoch": 0.6215439596262043, + "grad_norm": 2.3463015231804767, + "learning_rate": 7.528290071815405e-06, + "loss": 3.9633, + "mean_token_accuracy": 0.31703629046678544, + "step": 12870 + }, + { + "epoch": 0.6220268997657741, + "grad_norm": 2.317958165392629, + "learning_rate": 7.511958121458105e-06, + "loss": 3.9246, + "mean_token_accuracy": 0.3205645143985748, + "step": 12880 + }, + { + "epoch": 0.6225098399053437, + "grad_norm": 2.4637128813599634, + "learning_rate": 7.495633242388942e-06, + "loss": 3.9512, + "mean_token_accuracy": 0.3074596792459488, + "step": 12890 + }, + { + "epoch": 0.6229927800449134, + "grad_norm": 2.3577747313384916, + "learning_rate": 7.479315481005027e-06, + "loss": 3.8328, + "mean_token_accuracy": 0.33014112859964373, + "step": 12900 + }, + { + "epoch": 0.6229927800449134, + "eval_runtime": 7.8122, + "eval_samples_per_second": 378.128, + "eval_steps_per_second": 23.681, + "step": 12900 + }, + { + "epoch": 0.6234757201844832, + "grad_norm": 2.3936035472824497, + "learning_rate": 7.463004883683219e-06, + "loss": 3.8766, + "mean_token_accuracy": 0.3201612874865532, + "step": 12910 + }, + { + "epoch": 0.6239586603240528, + "grad_norm": 2.397476131748257, + "learning_rate": 7.446701496780034e-06, + "loss": 3.943, + "mean_token_accuracy": 0.3152217760682106, + "step": 12920 + }, + { + "epoch": 0.6244416004636225, + "grad_norm": 2.3223812381697777, + "learning_rate": 7.430405366631488e-06, + "loss": 3.918, + "mean_token_accuracy": 0.3207661285996437, + "step": 12930 + }, + { + "epoch": 0.6249245406031922, + "grad_norm": 2.2315053966770484, + "learning_rate": 7.41411653955296e-06, + "loss": 3.9148, + "mean_token_accuracy": 0.322379033267498, + "step": 12940 + }, + { + "epoch": 0.625407480742762, + "grad_norm": 2.3335703162067882, + "learning_rate": 7.3978350618390985e-06, + "loss": 3.9332, + "mean_token_accuracy": 0.31995967775583267, + "step": 12950 + }, + { + "epoch": 0.6258904208823316, + "grad_norm": 2.3212974620696682, + "learning_rate": 7.381560979763639e-06, + "loss": 3.9594, + "mean_token_accuracy": 0.3174395143985748, + "step": 12960 + }, + { + "epoch": 0.6263733610219013, + "grad_norm": 2.3574048410155775, + "learning_rate": 7.365294339579321e-06, + "loss": 3.9246, + "mean_token_accuracy": 0.31774193346500396, + "step": 12970 + }, + { + "epoch": 0.6268563011614711, + "grad_norm": 2.294702562502172, + "learning_rate": 7.349035187517709e-06, + "loss": 3.9031, + "mean_token_accuracy": 0.32379032373428346, + "step": 12980 + }, + { + "epoch": 0.6273392413010407, + "grad_norm": 2.321542868350157, + "learning_rate": 7.332783569789111e-06, + "loss": 3.9711, + "mean_token_accuracy": 0.31381048262119293, + "step": 12990 + }, + { + "epoch": 0.6278221814406104, + "grad_norm": 2.3054377755348927, + "learning_rate": 7.316539532582395e-06, + "loss": 3.9422, + "mean_token_accuracy": 0.32217742055654525, + "step": 13000 + }, + { + "epoch": 0.6278221814406104, + "eval_runtime": 7.7865, + "eval_samples_per_second": 379.376, + "eval_steps_per_second": 23.759, + "step": 13000 + }, + { + "epoch": 0.6283051215801801, + "grad_norm": 2.46251436645923, + "learning_rate": 7.300303122064913e-06, + "loss": 3.8961, + "mean_token_accuracy": 0.3270161271095276, + "step": 13010 + }, + { + "epoch": 0.6287880617197499, + "grad_norm": 2.5061581647684847, + "learning_rate": 7.284074384382309e-06, + "loss": 3.8523, + "mean_token_accuracy": 0.3245967745780945, + "step": 13020 + }, + { + "epoch": 0.6292710018593195, + "grad_norm": 2.409695697482411, + "learning_rate": 7.267853365658453e-06, + "loss": 3.9711, + "mean_token_accuracy": 0.3150201618671417, + "step": 13030 + }, + { + "epoch": 0.6297539419988892, + "grad_norm": 2.4323365334076623, + "learning_rate": 7.251640111995248e-06, + "loss": 3.9613, + "mean_token_accuracy": 0.318548384308815, + "step": 13040 + }, + { + "epoch": 0.630236882138459, + "grad_norm": 2.388842497691783, + "learning_rate": 7.235434669472552e-06, + "loss": 3.9395, + "mean_token_accuracy": 0.31562500149011613, + "step": 13050 + }, + { + "epoch": 0.6307198222780286, + "grad_norm": 2.3337639860673467, + "learning_rate": 7.2192370841480035e-06, + "loss": 3.8188, + "mean_token_accuracy": 0.33185483813285827, + "step": 13060 + }, + { + "epoch": 0.6312027624175983, + "grad_norm": 2.4249027758975994, + "learning_rate": 7.2030474020569216e-06, + "loss": 3.9527, + "mean_token_accuracy": 0.3166330650448799, + "step": 13070 + }, + { + "epoch": 0.631685702557168, + "grad_norm": 2.455308685176902, + "learning_rate": 7.186865669212162e-06, + "loss": 3.9477, + "mean_token_accuracy": 0.3134072571992874, + "step": 13080 + }, + { + "epoch": 0.6321686426967378, + "grad_norm": 2.498420458961923, + "learning_rate": 7.170691931603977e-06, + "loss": 3.9316, + "mean_token_accuracy": 0.32157257944345474, + "step": 13090 + }, + { + "epoch": 0.6326515828363074, + "grad_norm": 2.481104271475756, + "learning_rate": 7.154526235199917e-06, + "loss": 3.9301, + "mean_token_accuracy": 0.319556450843811, + "step": 13100 + }, + { + "epoch": 0.6326515828363074, + "eval_runtime": 7.8352, + "eval_samples_per_second": 377.015, + "eval_steps_per_second": 23.611, + "step": 13100 + }, + { + "epoch": 0.6331345229758771, + "grad_norm": 2.385737967049577, + "learning_rate": 7.138368625944652e-06, + "loss": 3.8805, + "mean_token_accuracy": 0.3213709697127342, + "step": 13110 + }, + { + "epoch": 0.6336174631154469, + "grad_norm": 2.3991427423310174, + "learning_rate": 7.1222191497598945e-06, + "loss": 3.9078, + "mean_token_accuracy": 0.32036290466785433, + "step": 13120 + }, + { + "epoch": 0.6341004032550165, + "grad_norm": 2.449850530644843, + "learning_rate": 7.106077852544218e-06, + "loss": 4.0055, + "mean_token_accuracy": 0.31118951737880707, + "step": 13130 + }, + { + "epoch": 0.6345833433945862, + "grad_norm": 2.4961170883202426, + "learning_rate": 7.089944780172971e-06, + "loss": 3.9934, + "mean_token_accuracy": 0.30423387289047243, + "step": 13140 + }, + { + "epoch": 0.635066283534156, + "grad_norm": 2.360820203905407, + "learning_rate": 7.073819978498102e-06, + "loss": 3.9316, + "mean_token_accuracy": 0.32157257944345474, + "step": 13150 + }, + { + "epoch": 0.6355492236737257, + "grad_norm": 2.2824310704213926, + "learning_rate": 7.057703493348085e-06, + "loss": 3.9227, + "mean_token_accuracy": 0.31582661122083666, + "step": 13160 + }, + { + "epoch": 0.6360321638132953, + "grad_norm": 2.441871581416318, + "learning_rate": 7.041595370527725e-06, + "loss": 3.9133, + "mean_token_accuracy": 0.3216733857989311, + "step": 13170 + }, + { + "epoch": 0.636515103952865, + "grad_norm": 2.407221197623045, + "learning_rate": 7.025495655818084e-06, + "loss": 3.9371, + "mean_token_accuracy": 0.3126008078455925, + "step": 13180 + }, + { + "epoch": 0.6369980440924348, + "grad_norm": 2.4327088299934143, + "learning_rate": 7.009404394976315e-06, + "loss": 3.973, + "mean_token_accuracy": 0.3137096777558327, + "step": 13190 + }, + { + "epoch": 0.6374809842320044, + "grad_norm": 2.491724091378324, + "learning_rate": 6.993321633735553e-06, + "loss": 3.9207, + "mean_token_accuracy": 0.32006048411130905, + "step": 13200 + }, + { + "epoch": 0.6374809842320044, + "eval_runtime": 7.8027, + "eval_samples_per_second": 378.589, + "eval_steps_per_second": 23.71, + "step": 13200 + }, + { + "epoch": 0.6379639243715741, + "grad_norm": 2.7785307670964987, + "learning_rate": 6.977247417804766e-06, + "loss": 3.9078, + "mean_token_accuracy": 0.3141129046678543, + "step": 13210 + }, + { + "epoch": 0.6384468645111439, + "grad_norm": 2.391861656939564, + "learning_rate": 6.961181792868637e-06, + "loss": 3.898, + "mean_token_accuracy": 0.3157258093357086, + "step": 13220 + }, + { + "epoch": 0.6389298046507136, + "grad_norm": 2.4026082956779273, + "learning_rate": 6.945124804587444e-06, + "loss": 3.9121, + "mean_token_accuracy": 0.3226814493536949, + "step": 13230 + }, + { + "epoch": 0.6394127447902832, + "grad_norm": 2.33323761553131, + "learning_rate": 6.9290764985969e-06, + "loss": 3.991, + "mean_token_accuracy": 0.31169354617595674, + "step": 13240 + }, + { + "epoch": 0.6398956849298529, + "grad_norm": 2.38750310086146, + "learning_rate": 6.9130369205080646e-06, + "loss": 3.9328, + "mean_token_accuracy": 0.3231854856014252, + "step": 13250 + }, + { + "epoch": 0.6403786250694227, + "grad_norm": 2.4831110194490997, + "learning_rate": 6.897006115907168e-06, + "loss": 3.9871, + "mean_token_accuracy": 0.3108870968222618, + "step": 13260 + }, + { + "epoch": 0.6408615652089924, + "grad_norm": 2.45661553413711, + "learning_rate": 6.880984130355528e-06, + "loss": 3.909, + "mean_token_accuracy": 0.32570564597845075, + "step": 13270 + }, + { + "epoch": 0.641344505348562, + "grad_norm": 2.367788164306088, + "learning_rate": 6.864971009389373e-06, + "loss": 3.8633, + "mean_token_accuracy": 0.3243951588869095, + "step": 13280 + }, + { + "epoch": 0.6418274454881318, + "grad_norm": 2.4928497571389343, + "learning_rate": 6.848966798519763e-06, + "loss": 3.9535, + "mean_token_accuracy": 0.3149193525314331, + "step": 13290 + }, + { + "epoch": 0.6423103856277015, + "grad_norm": 2.411054866056794, + "learning_rate": 6.832971543232414e-06, + "loss": 3.8816, + "mean_token_accuracy": 0.32298386842012405, + "step": 13300 + }, + { + "epoch": 0.6423103856277015, + "eval_runtime": 7.8101, + "eval_samples_per_second": 378.23, + "eval_steps_per_second": 23.687, + "step": 13300 + }, + { + "epoch": 0.6427933257672711, + "grad_norm": 2.2097641685763514, + "learning_rate": 6.816985288987603e-06, + "loss": 3.9855, + "mean_token_accuracy": 0.31542338877916337, + "step": 13310 + }, + { + "epoch": 0.6432762659068408, + "grad_norm": 2.388671729476071, + "learning_rate": 6.801008081220015e-06, + "loss": 3.9508, + "mean_token_accuracy": 0.321169351041317, + "step": 13320 + }, + { + "epoch": 0.6437592060464106, + "grad_norm": 2.462324067346631, + "learning_rate": 6.785039965338632e-06, + "loss": 3.9867, + "mean_token_accuracy": 0.30544354766607285, + "step": 13330 + }, + { + "epoch": 0.6442421461859803, + "grad_norm": 2.4048972164718423, + "learning_rate": 6.769080986726593e-06, + "loss": 3.9375, + "mean_token_accuracy": 0.32086693346500395, + "step": 13340 + }, + { + "epoch": 0.6447250863255499, + "grad_norm": 2.457099332342788, + "learning_rate": 6.753131190741058e-06, + "loss": 3.8625, + "mean_token_accuracy": 0.3244959682226181, + "step": 13350 + }, + { + "epoch": 0.6452080264651197, + "grad_norm": 2.702352747281538, + "learning_rate": 6.7371906227131125e-06, + "loss": 3.948, + "mean_token_accuracy": 0.3256048396229744, + "step": 13360 + }, + { + "epoch": 0.6456909666046894, + "grad_norm": 2.450534668684664, + "learning_rate": 6.721259327947585e-06, + "loss": 3.891, + "mean_token_accuracy": 0.3180443555116653, + "step": 13370 + }, + { + "epoch": 0.646173906744259, + "grad_norm": 2.4241048462215202, + "learning_rate": 6.705337351722978e-06, + "loss": 3.8945, + "mean_token_accuracy": 0.31875, + "step": 13380 + }, + { + "epoch": 0.6466568468838287, + "grad_norm": 2.4140939409017346, + "learning_rate": 6.689424739291284e-06, + "loss": 3.9734, + "mean_token_accuracy": 0.3208669379353523, + "step": 13390 + }, + { + "epoch": 0.6471397870233985, + "grad_norm": 2.470336158681267, + "learning_rate": 6.673521535877907e-06, + "loss": 3.9711, + "mean_token_accuracy": 0.31300403028726576, + "step": 13400 + }, + { + "epoch": 0.6471397870233985, + "eval_runtime": 7.8005, + "eval_samples_per_second": 378.695, + "eval_steps_per_second": 23.717, + "step": 13400 + }, + { + "epoch": 0.6476227271629682, + "grad_norm": 2.5353353960367, + "learning_rate": 6.657627786681484e-06, + "loss": 3.9293, + "mean_token_accuracy": 0.32026209533214567, + "step": 13410 + }, + { + "epoch": 0.6481056673025378, + "grad_norm": 2.3230612136218216, + "learning_rate": 6.641743536873804e-06, + "loss": 3.9219, + "mean_token_accuracy": 0.3149193525314331, + "step": 13420 + }, + { + "epoch": 0.6485886074421076, + "grad_norm": 2.533045045902366, + "learning_rate": 6.625868831599645e-06, + "loss": 3.9008, + "mean_token_accuracy": 0.3262096747756004, + "step": 13430 + }, + { + "epoch": 0.6490715475816773, + "grad_norm": 2.422131897303017, + "learning_rate": 6.610003715976663e-06, + "loss": 3.9062, + "mean_token_accuracy": 0.32389113008975984, + "step": 13440 + }, + { + "epoch": 0.6495544877212469, + "grad_norm": 2.4198555502996633, + "learning_rate": 6.594148235095257e-06, + "loss": 3.9398, + "mean_token_accuracy": 0.3145161300897598, + "step": 13450 + }, + { + "epoch": 0.6500374278608166, + "grad_norm": 2.607695233539751, + "learning_rate": 6.578302434018446e-06, + "loss": 3.9508, + "mean_token_accuracy": 0.31844758093357084, + "step": 13460 + }, + { + "epoch": 0.6505203680003864, + "grad_norm": 2.390740146052622, + "learning_rate": 6.562466357781738e-06, + "loss": 3.8449, + "mean_token_accuracy": 0.3236895173788071, + "step": 13470 + }, + { + "epoch": 0.6510033081399561, + "grad_norm": 2.469974288889219, + "learning_rate": 6.546640051392992e-06, + "loss": 3.8391, + "mean_token_accuracy": 0.32963709682226183, + "step": 13480 + }, + { + "epoch": 0.6514862482795257, + "grad_norm": 2.4665650951223754, + "learning_rate": 6.530823559832318e-06, + "loss": 3.9434, + "mean_token_accuracy": 0.3211693525314331, + "step": 13490 + }, + { + "epoch": 0.6519691884190955, + "grad_norm": 2.3379100061768, + "learning_rate": 6.515016928051911e-06, + "loss": 3.8563, + "mean_token_accuracy": 0.3257056474685669, + "step": 13500 + }, + { + "epoch": 0.6519691884190955, + "eval_runtime": 7.8268, + "eval_samples_per_second": 377.419, + "eval_steps_per_second": 23.637, + "step": 13500 + }, + { + "epoch": 0.6524521285586652, + "grad_norm": 2.3116877312986888, + "learning_rate": 6.499220200975967e-06, + "loss": 3.8906, + "mean_token_accuracy": 0.3240927457809448, + "step": 13510 + }, + { + "epoch": 0.6529350686982348, + "grad_norm": 2.4643670214315514, + "learning_rate": 6.483433423500503e-06, + "loss": 3.9195, + "mean_token_accuracy": 0.31935483664274217, + "step": 13520 + }, + { + "epoch": 0.6534180088378045, + "grad_norm": 2.7259989461781156, + "learning_rate": 6.467656640493285e-06, + "loss": 3.9254, + "mean_token_accuracy": 0.31733871400356295, + "step": 13530 + }, + { + "epoch": 0.6539009489773743, + "grad_norm": 2.4693318113401057, + "learning_rate": 6.451889896793657e-06, + "loss": 3.9773, + "mean_token_accuracy": 0.3107862904667854, + "step": 13540 + }, + { + "epoch": 0.654383889116944, + "grad_norm": 2.486509368633819, + "learning_rate": 6.4361332372124395e-06, + "loss": 3.868, + "mean_token_accuracy": 0.32550403028726577, + "step": 13550 + }, + { + "epoch": 0.6548668292565136, + "grad_norm": 2.416245832401473, + "learning_rate": 6.420386706531784e-06, + "loss": 3.9191, + "mean_token_accuracy": 0.31915322989225386, + "step": 13560 + }, + { + "epoch": 0.6553497693960834, + "grad_norm": 2.4438580785603117, + "learning_rate": 6.404650349505064e-06, + "loss": 3.975, + "mean_token_accuracy": 0.31018145084381105, + "step": 13570 + }, + { + "epoch": 0.6558327095356531, + "grad_norm": 2.370352554275622, + "learning_rate": 6.388924210856728e-06, + "loss": 3.9355, + "mean_token_accuracy": 0.31542338877916337, + "step": 13580 + }, + { + "epoch": 0.6563156496752227, + "grad_norm": 2.554690529825553, + "learning_rate": 6.373208335282194e-06, + "loss": 3.9961, + "mean_token_accuracy": 0.3079637125134468, + "step": 13590 + }, + { + "epoch": 0.6567985898147924, + "grad_norm": 2.393189770980954, + "learning_rate": 6.357502767447701e-06, + "loss": 3.8934, + "mean_token_accuracy": 0.3205645129084587, + "step": 13600 + }, + { + "epoch": 0.6567985898147924, + "eval_runtime": 7.8081, + "eval_samples_per_second": 378.325, + "eval_steps_per_second": 23.693, + "step": 13600 + }, + { + "epoch": 0.6572815299543622, + "grad_norm": 2.4939519544957505, + "learning_rate": 6.3418075519902e-06, + "loss": 3.8641, + "mean_token_accuracy": 0.3302419349551201, + "step": 13610 + }, + { + "epoch": 0.6577644700939319, + "grad_norm": 2.498907949016617, + "learning_rate": 6.326122733517219e-06, + "loss": 3.898, + "mean_token_accuracy": 0.3212701603770256, + "step": 13620 + }, + { + "epoch": 0.6582474102335015, + "grad_norm": 2.4503704301524225, + "learning_rate": 6.310448356606722e-06, + "loss": 3.9438, + "mean_token_accuracy": 0.3220766127109528, + "step": 13630 + }, + { + "epoch": 0.6587303503730713, + "grad_norm": 2.4254164909360627, + "learning_rate": 6.294784465807024e-06, + "loss": 3.9262, + "mean_token_accuracy": 0.31532257944345476, + "step": 13640 + }, + { + "epoch": 0.659213290512641, + "grad_norm": 2.862695016604388, + "learning_rate": 6.27913110563661e-06, + "loss": 3.9422, + "mean_token_accuracy": 0.3191532254219055, + "step": 13650 + }, + { + "epoch": 0.6596962306522106, + "grad_norm": 2.449129559009671, + "learning_rate": 6.2634883205840566e-06, + "loss": 3.9691, + "mean_token_accuracy": 0.31602822467684744, + "step": 13660 + }, + { + "epoch": 0.6601791707917803, + "grad_norm": 2.3446670577028264, + "learning_rate": 6.24785615510787e-06, + "loss": 3.9137, + "mean_token_accuracy": 0.3180443584918976, + "step": 13670 + }, + { + "epoch": 0.6606621109313501, + "grad_norm": 2.557532204979107, + "learning_rate": 6.232234653636386e-06, + "loss": 3.9473, + "mean_token_accuracy": 0.3185483872890472, + "step": 13680 + }, + { + "epoch": 0.6611450510709198, + "grad_norm": 2.420993812282275, + "learning_rate": 6.216623860567621e-06, + "loss": 3.9418, + "mean_token_accuracy": 0.31754032522439957, + "step": 13690 + }, + { + "epoch": 0.6616279912104894, + "grad_norm": 2.3094562537185346, + "learning_rate": 6.201023820269168e-06, + "loss": 3.9016, + "mean_token_accuracy": 0.3211693540215492, + "step": 13700 + }, + { + "epoch": 0.6616279912104894, + "eval_runtime": 7.8045, + "eval_samples_per_second": 378.501, + "eval_steps_per_second": 23.704, + "step": 13700 + }, + { + "epoch": 0.6621109313500592, + "grad_norm": 2.381447202179818, + "learning_rate": 6.185434577078048e-06, + "loss": 3.8926, + "mean_token_accuracy": 0.321370966732502, + "step": 13710 + }, + { + "epoch": 0.6625938714896289, + "grad_norm": 2.4072660215105306, + "learning_rate": 6.169856175300608e-06, + "loss": 3.925, + "mean_token_accuracy": 0.31673386991024016, + "step": 13720 + }, + { + "epoch": 0.6630768116291985, + "grad_norm": 2.4079167228823573, + "learning_rate": 6.15428865921237e-06, + "loss": 3.952, + "mean_token_accuracy": 0.32086693644523623, + "step": 13730 + }, + { + "epoch": 0.6635597517687682, + "grad_norm": 2.5129848355695947, + "learning_rate": 6.138732073057929e-06, + "loss": 3.9418, + "mean_token_accuracy": 0.32268145233392714, + "step": 13740 + }, + { + "epoch": 0.664042691908338, + "grad_norm": 2.5972387787368905, + "learning_rate": 6.123186461050809e-06, + "loss": 3.9574, + "mean_token_accuracy": 0.3197580650448799, + "step": 13750 + }, + { + "epoch": 0.6645256320479077, + "grad_norm": 2.5199844587444713, + "learning_rate": 6.10765186737334e-06, + "loss": 3.8922, + "mean_token_accuracy": 0.3214717715978622, + "step": 13760 + }, + { + "epoch": 0.6650085721874773, + "grad_norm": 2.4518189167491977, + "learning_rate": 6.09212833617655e-06, + "loss": 3.9418, + "mean_token_accuracy": 0.3166330650448799, + "step": 13770 + }, + { + "epoch": 0.6654915123270471, + "grad_norm": 2.638132964794942, + "learning_rate": 6.076615911580015e-06, + "loss": 3.8254, + "mean_token_accuracy": 0.32510080486536025, + "step": 13780 + }, + { + "epoch": 0.6659744524666168, + "grad_norm": 2.424275930362351, + "learning_rate": 6.061114637671752e-06, + "loss": 3.8687, + "mean_token_accuracy": 0.3265120968222618, + "step": 13790 + }, + { + "epoch": 0.6664573926061864, + "grad_norm": 2.948220453970061, + "learning_rate": 6.045624558508079e-06, + "loss": 3.9344, + "mean_token_accuracy": 0.318245966732502, + "step": 13800 + }, + { + "epoch": 0.6664573926061864, + "eval_runtime": 7.7802, + "eval_samples_per_second": 379.679, + "eval_steps_per_second": 23.778, + "step": 13800 + }, + { + "epoch": 0.6669403327457561, + "grad_norm": 2.359312713919547, + "learning_rate": 6.030145718113505e-06, + "loss": 3.9305, + "mean_token_accuracy": 0.32469757795333865, + "step": 13810 + }, + { + "epoch": 0.6674232728853259, + "grad_norm": 2.5959829401054186, + "learning_rate": 6.014678160480589e-06, + "loss": 3.9172, + "mean_token_accuracy": 0.32197580486536026, + "step": 13820 + }, + { + "epoch": 0.6679062130248956, + "grad_norm": 2.3969151206364008, + "learning_rate": 5.999221929569834e-06, + "loss": 3.9574, + "mean_token_accuracy": 0.3176411271095276, + "step": 13830 + }, + { + "epoch": 0.6683891531644652, + "grad_norm": 2.4227529562048105, + "learning_rate": 5.983777069309539e-06, + "loss": 3.9023, + "mean_token_accuracy": 0.32086693644523623, + "step": 13840 + }, + { + "epoch": 0.668872093304035, + "grad_norm": 2.418079183322335, + "learning_rate": 5.968343623595696e-06, + "loss": 3.9148, + "mean_token_accuracy": 0.3179435461759567, + "step": 13850 + }, + { + "epoch": 0.6693550334436047, + "grad_norm": 2.418766494528195, + "learning_rate": 5.952921636291851e-06, + "loss": 3.9578, + "mean_token_accuracy": 0.3129032254219055, + "step": 13860 + }, + { + "epoch": 0.6698379735831743, + "grad_norm": 2.5143601669096887, + "learning_rate": 5.937511151228984e-06, + "loss": 3.9035, + "mean_token_accuracy": 0.3157258085906506, + "step": 13870 + }, + { + "epoch": 0.670320913722744, + "grad_norm": 2.5177516821141315, + "learning_rate": 5.922112212205389e-06, + "loss": 3.9613, + "mean_token_accuracy": 0.3132056429982185, + "step": 13880 + }, + { + "epoch": 0.6708038538623138, + "grad_norm": 2.428742140891193, + "learning_rate": 5.906724862986533e-06, + "loss": 3.9398, + "mean_token_accuracy": 0.31451613157987596, + "step": 13890 + }, + { + "epoch": 0.6712867940018835, + "grad_norm": 2.4586403759398663, + "learning_rate": 5.891349147304959e-06, + "loss": 3.9664, + "mean_token_accuracy": 0.3160282254219055, + "step": 13900 + }, + { + "epoch": 0.6712867940018835, + "eval_runtime": 7.819, + "eval_samples_per_second": 377.799, + "eval_steps_per_second": 23.66, + "step": 13900 + }, + { + "epoch": 0.6717697341414531, + "grad_norm": 2.3892298855439003, + "learning_rate": 5.8759851088601365e-06, + "loss": 3.8863, + "mean_token_accuracy": 0.3189516142010689, + "step": 13910 + }, + { + "epoch": 0.6722526742810229, + "grad_norm": 2.4162785767272976, + "learning_rate": 5.860632791318349e-06, + "loss": 3.932, + "mean_token_accuracy": 0.31491935849189756, + "step": 13920 + }, + { + "epoch": 0.6727356144205926, + "grad_norm": 2.4674787625719508, + "learning_rate": 5.845292238312568e-06, + "loss": 3.9031, + "mean_token_accuracy": 0.32116935551166537, + "step": 13930 + }, + { + "epoch": 0.6732185545601622, + "grad_norm": 2.6481593182014507, + "learning_rate": 5.829963493442332e-06, + "loss": 3.9152, + "mean_token_accuracy": 0.3179435461759567, + "step": 13940 + }, + { + "epoch": 0.673701494699732, + "grad_norm": 2.4627066242099094, + "learning_rate": 5.814646600273611e-06, + "loss": 3.8961, + "mean_token_accuracy": 0.3146169364452362, + "step": 13950 + }, + { + "epoch": 0.6741844348393017, + "grad_norm": 2.5241716916536503, + "learning_rate": 5.799341602338706e-06, + "loss": 3.9008, + "mean_token_accuracy": 0.32096774131059647, + "step": 13960 + }, + { + "epoch": 0.6746673749788714, + "grad_norm": 2.4278502281401866, + "learning_rate": 5.784048543136089e-06, + "loss": 3.959, + "mean_token_accuracy": 0.3180443555116653, + "step": 13970 + }, + { + "epoch": 0.675150315118441, + "grad_norm": 2.480624354040345, + "learning_rate": 5.768767466130323e-06, + "loss": 3.8937, + "mean_token_accuracy": 0.3196572571992874, + "step": 13980 + }, + { + "epoch": 0.6756332552580108, + "grad_norm": 2.4312966451771953, + "learning_rate": 5.753498414751901e-06, + "loss": 3.9695, + "mean_token_accuracy": 0.3206653192639351, + "step": 13990 + }, + { + "epoch": 0.6761161953975805, + "grad_norm": 2.4529403596977306, + "learning_rate": 5.738241432397148e-06, + "loss": 3.852, + "mean_token_accuracy": 0.3302419364452362, + "step": 14000 + }, + { + "epoch": 0.6761161953975805, + "eval_runtime": 7.8069, + "eval_samples_per_second": 378.384, + "eval_steps_per_second": 23.697, + "step": 14000 + }, + { + "epoch": 0.6765991355371501, + "grad_norm": 2.8264414353458815, + "learning_rate": 5.722996562428073e-06, + "loss": 3.9027, + "mean_token_accuracy": 0.3213709697127342, + "step": 14010 + }, + { + "epoch": 0.6770820756767199, + "grad_norm": 2.49225939231994, + "learning_rate": 5.707763848172284e-06, + "loss": 3.8805, + "mean_token_accuracy": 0.3251008078455925, + "step": 14020 + }, + { + "epoch": 0.6775650158162896, + "grad_norm": 2.414729369619342, + "learning_rate": 5.69254333292282e-06, + "loss": 3.9215, + "mean_token_accuracy": 0.3220766082406044, + "step": 14030 + }, + { + "epoch": 0.6780479559558593, + "grad_norm": 2.482081685297335, + "learning_rate": 5.677335059938052e-06, + "loss": 3.8676, + "mean_token_accuracy": 0.32550403028726577, + "step": 14040 + }, + { + "epoch": 0.6785308960954289, + "grad_norm": 2.611387177885236, + "learning_rate": 5.662139072441566e-06, + "loss": 3.9324, + "mean_token_accuracy": 0.31108871251344683, + "step": 14050 + }, + { + "epoch": 0.6790138362349987, + "grad_norm": 2.384795669222592, + "learning_rate": 5.646955413622024e-06, + "loss": 3.9078, + "mean_token_accuracy": 0.3208669379353523, + "step": 14060 + }, + { + "epoch": 0.6794967763745684, + "grad_norm": 2.32157343155579, + "learning_rate": 5.631784126633058e-06, + "loss": 3.8996, + "mean_token_accuracy": 0.3175403267145157, + "step": 14070 + }, + { + "epoch": 0.679979716514138, + "grad_norm": 2.504607872955561, + "learning_rate": 5.616625254593122e-06, + "loss": 3.9629, + "mean_token_accuracy": 0.309677417576313, + "step": 14080 + }, + { + "epoch": 0.6804626566537078, + "grad_norm": 2.7441888026175203, + "learning_rate": 5.601478840585399e-06, + "loss": 3.8898, + "mean_token_accuracy": 0.32046370953321457, + "step": 14090 + }, + { + "epoch": 0.6809455967932775, + "grad_norm": 2.6365495571998814, + "learning_rate": 5.5863449276576595e-06, + "loss": 3.9422, + "mean_token_accuracy": 0.3156249985098839, + "step": 14100 + }, + { + "epoch": 0.6809455967932775, + "eval_runtime": 7.8276, + "eval_samples_per_second": 377.384, + "eval_steps_per_second": 23.634, + "step": 14100 + }, + { + "epoch": 0.6814285369328472, + "grad_norm": 2.371646624269115, + "learning_rate": 5.57122355882215e-06, + "loss": 3.9062, + "mean_token_accuracy": 0.3235887140035629, + "step": 14110 + }, + { + "epoch": 0.6819114770724168, + "grad_norm": 2.4822156340525194, + "learning_rate": 5.556114777055448e-06, + "loss": 3.9262, + "mean_token_accuracy": 0.3153225779533386, + "step": 14120 + }, + { + "epoch": 0.6823944172119866, + "grad_norm": 2.686293541984887, + "learning_rate": 5.541018625298387e-06, + "loss": 3.9156, + "mean_token_accuracy": 0.32127015888690946, + "step": 14130 + }, + { + "epoch": 0.6828773573515563, + "grad_norm": 2.5840822066386684, + "learning_rate": 5.5259351464558716e-06, + "loss": 3.8797, + "mean_token_accuracy": 0.32106854766607285, + "step": 14140 + }, + { + "epoch": 0.6833602974911259, + "grad_norm": 2.5815636065214167, + "learning_rate": 5.510864383396819e-06, + "loss": 3.9922, + "mean_token_accuracy": 0.3129032254219055, + "step": 14150 + }, + { + "epoch": 0.6838432376306957, + "grad_norm": 2.580527635344399, + "learning_rate": 5.4958063789539785e-06, + "loss": 3.9617, + "mean_token_accuracy": 0.3160282239317894, + "step": 14160 + }, + { + "epoch": 0.6843261777702654, + "grad_norm": 2.487363174552614, + "learning_rate": 5.480761175923858e-06, + "loss": 3.9844, + "mean_token_accuracy": 0.3126008063554764, + "step": 14170 + }, + { + "epoch": 0.6848091179098351, + "grad_norm": 2.634061485900264, + "learning_rate": 5.4657288170665826e-06, + "loss": 3.8746, + "mean_token_accuracy": 0.3256048411130905, + "step": 14180 + }, + { + "epoch": 0.6852920580494047, + "grad_norm": 2.5070766485362665, + "learning_rate": 5.450709345105753e-06, + "loss": 3.9535, + "mean_token_accuracy": 0.31542338579893114, + "step": 14190 + }, + { + "epoch": 0.6857749981889745, + "grad_norm": 2.432497984129058, + "learning_rate": 5.435702802728366e-06, + "loss": 3.9016, + "mean_token_accuracy": 0.32358870655298233, + "step": 14200 + }, + { + "epoch": 0.6857749981889745, + "eval_runtime": 7.8074, + "eval_samples_per_second": 378.357, + "eval_steps_per_second": 23.695, + "step": 14200 + }, + { + "epoch": 0.6862579383285442, + "grad_norm": 2.405571423456711, + "learning_rate": 5.42070923258466e-06, + "loss": 3.907, + "mean_token_accuracy": 0.3243951588869095, + "step": 14210 + }, + { + "epoch": 0.6867408784681138, + "grad_norm": 2.371988315484479, + "learning_rate": 5.405728677288011e-06, + "loss": 3.9602, + "mean_token_accuracy": 0.3175403237342834, + "step": 14220 + }, + { + "epoch": 0.6872238186076836, + "grad_norm": 2.3963169699532805, + "learning_rate": 5.390761179414792e-06, + "loss": 3.8766, + "mean_token_accuracy": 0.32268145233392714, + "step": 14230 + }, + { + "epoch": 0.6877067587472533, + "grad_norm": 2.304178036628061, + "learning_rate": 5.375806781504288e-06, + "loss": 3.9398, + "mean_token_accuracy": 0.32086693644523623, + "step": 14240 + }, + { + "epoch": 0.688189698886823, + "grad_norm": 2.4471313784025908, + "learning_rate": 5.3608655260585294e-06, + "loss": 3.8777, + "mean_token_accuracy": 0.32147177010774614, + "step": 14250 + }, + { + "epoch": 0.6886726390263926, + "grad_norm": 2.6521343840359113, + "learning_rate": 5.345937455542212e-06, + "loss": 3.934, + "mean_token_accuracy": 0.317237900197506, + "step": 14260 + }, + { + "epoch": 0.6891555791659624, + "grad_norm": 2.4685010593263916, + "learning_rate": 5.331022612382537e-06, + "loss": 3.9535, + "mean_token_accuracy": 0.3150201603770256, + "step": 14270 + }, + { + "epoch": 0.6896385193055321, + "grad_norm": 2.55936453226136, + "learning_rate": 5.316121038969146e-06, + "loss": 3.9203, + "mean_token_accuracy": 0.3237903222441673, + "step": 14280 + }, + { + "epoch": 0.6901214594451017, + "grad_norm": 2.4982568171465944, + "learning_rate": 5.301232777653935e-06, + "loss": 3.991, + "mean_token_accuracy": 0.3178427413105965, + "step": 14290 + }, + { + "epoch": 0.6906043995846715, + "grad_norm": 2.4175112580335574, + "learning_rate": 5.286357870750976e-06, + "loss": 3.9367, + "mean_token_accuracy": 0.3175403207540512, + "step": 14300 + }, + { + "epoch": 0.6906043995846715, + "eval_runtime": 7.8215, + "eval_samples_per_second": 377.677, + "eval_steps_per_second": 23.653, + "step": 14300 + }, + { + "epoch": 0.6910873397242412, + "grad_norm": 2.425430169136526, + "learning_rate": 5.271496360536388e-06, + "loss": 3.9793, + "mean_token_accuracy": 0.3131048381328583, + "step": 14310 + }, + { + "epoch": 0.6915702798638109, + "grad_norm": 2.314037880061397, + "learning_rate": 5.256648289248215e-06, + "loss": 3.9266, + "mean_token_accuracy": 0.31532257944345476, + "step": 14320 + }, + { + "epoch": 0.6920532200033805, + "grad_norm": 2.3724551691365363, + "learning_rate": 5.241813699086311e-06, + "loss": 3.9227, + "mean_token_accuracy": 0.32318548262119295, + "step": 14330 + }, + { + "epoch": 0.6925361601429503, + "grad_norm": 2.446218547681278, + "learning_rate": 5.2269926322122026e-06, + "loss": 3.9043, + "mean_token_accuracy": 0.3210685506463051, + "step": 14340 + }, + { + "epoch": 0.69301910028252, + "grad_norm": 2.516961842821772, + "learning_rate": 5.212185130748991e-06, + "loss": 3.9129, + "mean_token_accuracy": 0.3183467760682106, + "step": 14350 + }, + { + "epoch": 0.6935020404220896, + "grad_norm": 2.3091894518979195, + "learning_rate": 5.197391236781221e-06, + "loss": 3.8883, + "mean_token_accuracy": 0.3285282254219055, + "step": 14360 + }, + { + "epoch": 0.6939849805616594, + "grad_norm": 2.4512283398448704, + "learning_rate": 5.182610992354768e-06, + "loss": 3.9641, + "mean_token_accuracy": 0.3127016112208366, + "step": 14370 + }, + { + "epoch": 0.6944679207012291, + "grad_norm": 2.370448114144957, + "learning_rate": 5.167844439476697e-06, + "loss": 3.9059, + "mean_token_accuracy": 0.3180443555116653, + "step": 14380 + }, + { + "epoch": 0.6949508608407988, + "grad_norm": 2.358065363678817, + "learning_rate": 5.153091620115187e-06, + "loss": 3.9398, + "mean_token_accuracy": 0.32167338877916335, + "step": 14390 + }, + { + "epoch": 0.6954338009803684, + "grad_norm": 2.644788406142881, + "learning_rate": 5.138352576199359e-06, + "loss": 3.9617, + "mean_token_accuracy": 0.31965725868940353, + "step": 14400 + }, + { + "epoch": 0.6954338009803684, + "eval_runtime": 7.8204, + "eval_samples_per_second": 377.729, + "eval_steps_per_second": 23.656, + "step": 14400 + }, + { + "epoch": 0.6959167411199382, + "grad_norm": 2.489715561821747, + "learning_rate": 5.1236273496192035e-06, + "loss": 3.9211, + "mean_token_accuracy": 0.3196572601795197, + "step": 14410 + }, + { + "epoch": 0.6963996812595079, + "grad_norm": 2.6532229560459504, + "learning_rate": 5.108915982225421e-06, + "loss": 3.8781, + "mean_token_accuracy": 0.3236895129084587, + "step": 14420 + }, + { + "epoch": 0.6968826213990776, + "grad_norm": 2.425560989655647, + "learning_rate": 5.0942185158293365e-06, + "loss": 3.9344, + "mean_token_accuracy": 0.3158266142010689, + "step": 14430 + }, + { + "epoch": 0.6973655615386473, + "grad_norm": 2.498177671354803, + "learning_rate": 5.079534992202767e-06, + "loss": 3.934, + "mean_token_accuracy": 0.3134072616696358, + "step": 14440 + }, + { + "epoch": 0.697848501678217, + "grad_norm": 2.40580931359974, + "learning_rate": 5.064865453077892e-06, + "loss": 3.9203, + "mean_token_accuracy": 0.31885080859065057, + "step": 14450 + }, + { + "epoch": 0.6983314418177867, + "grad_norm": 2.3475911608267492, + "learning_rate": 5.050209940147154e-06, + "loss": 3.9828, + "mean_token_accuracy": 0.31018145233392713, + "step": 14460 + }, + { + "epoch": 0.6988143819573563, + "grad_norm": 2.3605570313582374, + "learning_rate": 5.03556849506313e-06, + "loss": 3.9059, + "mean_token_accuracy": 0.32288306653499604, + "step": 14470 + }, + { + "epoch": 0.6992973220969261, + "grad_norm": 2.295470415797191, + "learning_rate": 5.02094115943842e-06, + "loss": 3.9293, + "mean_token_accuracy": 0.32167338728904726, + "step": 14480 + }, + { + "epoch": 0.6997802622364958, + "grad_norm": 2.5862582890249928, + "learning_rate": 5.006327974845504e-06, + "loss": 3.9164, + "mean_token_accuracy": 0.3180443525314331, + "step": 14490 + }, + { + "epoch": 0.7002632023760655, + "grad_norm": 2.3676120963198635, + "learning_rate": 4.991728982816672e-06, + "loss": 3.9469, + "mean_token_accuracy": 0.3202620968222618, + "step": 14500 + }, + { + "epoch": 0.7002632023760655, + "eval_runtime": 7.8114, + "eval_samples_per_second": 378.165, + "eval_steps_per_second": 23.683, + "step": 14500 + }, + { + "epoch": 0.7007461425156352, + "grad_norm": 2.6551673981950086, + "learning_rate": 4.977144224843853e-06, + "loss": 3.9465, + "mean_token_accuracy": 0.3246975809335709, + "step": 14510 + }, + { + "epoch": 0.7012290826552049, + "grad_norm": 2.5382371856969557, + "learning_rate": 4.962573742378534e-06, + "loss": 3.9574, + "mean_token_accuracy": 0.31179435551166534, + "step": 14520 + }, + { + "epoch": 0.7017120227947746, + "grad_norm": 2.532309626583744, + "learning_rate": 4.948017576831617e-06, + "loss": 3.9258, + "mean_token_accuracy": 0.31592742204666135, + "step": 14530 + }, + { + "epoch": 0.7021949629343442, + "grad_norm": 2.491087270974582, + "learning_rate": 4.933475769573337e-06, + "loss": 3.8625, + "mean_token_accuracy": 0.33125, + "step": 14540 + }, + { + "epoch": 0.702677903073914, + "grad_norm": 2.456895204287448, + "learning_rate": 4.918948361933096e-06, + "loss": 3.891, + "mean_token_accuracy": 0.3257056474685669, + "step": 14550 + }, + { + "epoch": 0.7031608432134837, + "grad_norm": 2.590229869518269, + "learning_rate": 4.904435395199386e-06, + "loss": 3.9656, + "mean_token_accuracy": 0.30957661122083663, + "step": 14560 + }, + { + "epoch": 0.7036437833530534, + "grad_norm": 2.316206424021573, + "learning_rate": 4.889936910619647e-06, + "loss": 3.8652, + "mean_token_accuracy": 0.3272177428007126, + "step": 14570 + }, + { + "epoch": 0.7041267234926231, + "grad_norm": 2.6372734864838874, + "learning_rate": 4.875452949400166e-06, + "loss": 3.8891, + "mean_token_accuracy": 0.31864919513463974, + "step": 14580 + }, + { + "epoch": 0.7046096636321928, + "grad_norm": 2.4704041222080932, + "learning_rate": 4.860983552705955e-06, + "loss": 3.9645, + "mean_token_accuracy": 0.31340725868940356, + "step": 14590 + }, + { + "epoch": 0.7050926037717625, + "grad_norm": 2.4519715493817986, + "learning_rate": 4.846528761660616e-06, + "loss": 3.9297, + "mean_token_accuracy": 0.31885080486536027, + "step": 14600 + }, + { + "epoch": 0.7050926037717625, + "eval_runtime": 7.8098, + "eval_samples_per_second": 378.242, + "eval_steps_per_second": 23.688, + "step": 14600 + }, + { + "epoch": 0.7055755439113321, + "grad_norm": 2.4573889374384668, + "learning_rate": 4.832088617346269e-06, + "loss": 3.9016, + "mean_token_accuracy": 0.32328628897666933, + "step": 14610 + }, + { + "epoch": 0.7060584840509019, + "grad_norm": 2.522003340352901, + "learning_rate": 4.817663160803375e-06, + "loss": 3.923, + "mean_token_accuracy": 0.321068549156189, + "step": 14620 + }, + { + "epoch": 0.7065414241904716, + "grad_norm": 2.582041652621467, + "learning_rate": 4.803252433030675e-06, + "loss": 3.8738, + "mean_token_accuracy": 0.31965725868940353, + "step": 14630 + }, + { + "epoch": 0.7070243643300413, + "grad_norm": 2.4274847926799024, + "learning_rate": 4.788856474985027e-06, + "loss": 3.952, + "mean_token_accuracy": 0.31673386991024016, + "step": 14640 + }, + { + "epoch": 0.707507304469611, + "grad_norm": 2.555341376776363, + "learning_rate": 4.774475327581338e-06, + "loss": 3.8648, + "mean_token_accuracy": 0.3256048411130905, + "step": 14650 + }, + { + "epoch": 0.7079902446091807, + "grad_norm": 2.614824409703627, + "learning_rate": 4.760109031692398e-06, + "loss": 3.9508, + "mean_token_accuracy": 0.31592742055654527, + "step": 14660 + }, + { + "epoch": 0.7084731847487504, + "grad_norm": 2.2531611451556093, + "learning_rate": 4.745757628148804e-06, + "loss": 3.8855, + "mean_token_accuracy": 0.3239919379353523, + "step": 14670 + }, + { + "epoch": 0.70895612488832, + "grad_norm": 2.49962981853155, + "learning_rate": 4.731421157738809e-06, + "loss": 3.8937, + "mean_token_accuracy": 0.3175403207540512, + "step": 14680 + }, + { + "epoch": 0.7094390650278898, + "grad_norm": 2.403220615500529, + "learning_rate": 4.717099661208251e-06, + "loss": 3.907, + "mean_token_accuracy": 0.3205645173788071, + "step": 14690 + }, + { + "epoch": 0.7099220051674595, + "grad_norm": 2.4963327525832333, + "learning_rate": 4.702793179260387e-06, + "loss": 3.9488, + "mean_token_accuracy": 0.3166330650448799, + "step": 14700 + }, + { + "epoch": 0.7099220051674595, + "eval_runtime": 7.8138, + "eval_samples_per_second": 378.047, + "eval_steps_per_second": 23.676, + "step": 14700 + }, + { + "epoch": 0.7104049453070292, + "grad_norm": 2.5411854440802295, + "learning_rate": 4.6885017525558074e-06, + "loss": 3.8969, + "mean_token_accuracy": 0.3195564538240433, + "step": 14710 + }, + { + "epoch": 0.7108878854465989, + "grad_norm": 2.612826539910147, + "learning_rate": 4.674225421712317e-06, + "loss": 3.9684, + "mean_token_accuracy": 0.3085685521364212, + "step": 14720 + }, + { + "epoch": 0.7113708255861686, + "grad_norm": 2.323685904255609, + "learning_rate": 4.659964227304816e-06, + "loss": 3.9621, + "mean_token_accuracy": 0.31058467775583265, + "step": 14730 + }, + { + "epoch": 0.7118537657257383, + "grad_norm": 2.4355541715215208, + "learning_rate": 4.645718209865189e-06, + "loss": 3.9695, + "mean_token_accuracy": 0.3129032239317894, + "step": 14740 + }, + { + "epoch": 0.712336705865308, + "grad_norm": 2.46237779541236, + "learning_rate": 4.6314874098821696e-06, + "loss": 3.9191, + "mean_token_accuracy": 0.32358870804309847, + "step": 14750 + }, + { + "epoch": 0.7128196460048777, + "grad_norm": 2.8244521530754603, + "learning_rate": 4.617271867801268e-06, + "loss": 3.9863, + "mean_token_accuracy": 0.31370967626571655, + "step": 14760 + }, + { + "epoch": 0.7133025861444474, + "grad_norm": 2.342530027987054, + "learning_rate": 4.603071624024605e-06, + "loss": 3.9066, + "mean_token_accuracy": 0.3244959697127342, + "step": 14770 + }, + { + "epoch": 0.7137855262840171, + "grad_norm": 2.552251880164965, + "learning_rate": 4.58888671891084e-06, + "loss": 3.9418, + "mean_token_accuracy": 0.31875, + "step": 14780 + }, + { + "epoch": 0.7142684664235868, + "grad_norm": 2.339784549556923, + "learning_rate": 4.5747171927750175e-06, + "loss": 3.9496, + "mean_token_accuracy": 0.3142137140035629, + "step": 14790 + }, + { + "epoch": 0.7147514065631565, + "grad_norm": 2.3843810229299995, + "learning_rate": 4.560563085888503e-06, + "loss": 3.9184, + "mean_token_accuracy": 0.31905242055654526, + "step": 14800 + }, + { + "epoch": 0.7147514065631565, + "eval_runtime": 7.8314, + "eval_samples_per_second": 377.198, + "eval_steps_per_second": 23.623, + "step": 14800 + }, + { + "epoch": 0.7152343467027262, + "grad_norm": 2.428513651369846, + "learning_rate": 4.54642443847881e-06, + "loss": 3.9254, + "mean_token_accuracy": 0.31824596524238585, + "step": 14810 + }, + { + "epoch": 0.7157172868422959, + "grad_norm": 2.6030870042909204, + "learning_rate": 4.532301290729535e-06, + "loss": 3.9707, + "mean_token_accuracy": 0.32258064597845076, + "step": 14820 + }, + { + "epoch": 0.7162002269818656, + "grad_norm": 2.5543406619630935, + "learning_rate": 4.518193682780205e-06, + "loss": 3.9523, + "mean_token_accuracy": 0.31491935551166533, + "step": 14830 + }, + { + "epoch": 0.7166831671214353, + "grad_norm": 2.605715236873571, + "learning_rate": 4.504101654726195e-06, + "loss": 3.943, + "mean_token_accuracy": 0.31542338579893114, + "step": 14840 + }, + { + "epoch": 0.717166107261005, + "grad_norm": 2.509142932962603, + "learning_rate": 4.4900252466186e-06, + "loss": 3.9023, + "mean_token_accuracy": 0.3197351634502411, + "step": 14850 + }, + { + "epoch": 0.7176490474005747, + "grad_norm": 2.499479951166928, + "learning_rate": 4.475964498464106e-06, + "loss": 3.8676, + "mean_token_accuracy": 0.3274193570017815, + "step": 14860 + }, + { + "epoch": 0.7181319875401444, + "grad_norm": 2.44542708774872, + "learning_rate": 4.4619194502249165e-06, + "loss": 3.9789, + "mean_token_accuracy": 0.31340223997831346, + "step": 14870 + }, + { + "epoch": 0.7186149276797141, + "grad_norm": 2.4438661074715426, + "learning_rate": 4.44789014181859e-06, + "loss": 3.9961, + "mean_token_accuracy": 0.3086693525314331, + "step": 14880 + }, + { + "epoch": 0.7190978678192838, + "grad_norm": 2.3923380712423414, + "learning_rate": 4.433876613117968e-06, + "loss": 3.902, + "mean_token_accuracy": 0.31733871102333067, + "step": 14890 + }, + { + "epoch": 0.7195808079588535, + "grad_norm": 2.5628356632297313, + "learning_rate": 4.419878903951027e-06, + "loss": 3.9184, + "mean_token_accuracy": 0.3243951633572578, + "step": 14900 + }, + { + "epoch": 0.7195808079588535, + "eval_runtime": 7.812, + "eval_samples_per_second": 378.138, + "eval_steps_per_second": 23.682, + "step": 14900 + }, + { + "epoch": 0.7200637480984232, + "grad_norm": 2.391454109510873, + "learning_rate": 4.405897054100808e-06, + "loss": 3.9438, + "mean_token_accuracy": 0.32056451588869095, + "step": 14910 + }, + { + "epoch": 0.720546688237993, + "grad_norm": 2.4146831039124046, + "learning_rate": 4.391931103305251e-06, + "loss": 3.8344, + "mean_token_accuracy": 0.322076615691185, + "step": 14920 + }, + { + "epoch": 0.7210296283775626, + "grad_norm": 2.2980539186703433, + "learning_rate": 4.37798109125713e-06, + "loss": 4.002, + "mean_token_accuracy": 0.31108870804309846, + "step": 14930 + }, + { + "epoch": 0.7215125685171323, + "grad_norm": 2.673315473006604, + "learning_rate": 4.364047057603897e-06, + "loss": 3.9367, + "mean_token_accuracy": 0.31885080635547636, + "step": 14940 + }, + { + "epoch": 0.721995508656702, + "grad_norm": 2.3936242046831175, + "learning_rate": 4.350129041947623e-06, + "loss": 3.9316, + "mean_token_accuracy": 0.31300403028726576, + "step": 14950 + }, + { + "epoch": 0.7224784487962717, + "grad_norm": 2.590915431177032, + "learning_rate": 4.3362270838448275e-06, + "loss": 3.8977, + "mean_token_accuracy": 0.31985886991024015, + "step": 14960 + }, + { + "epoch": 0.7229613889358414, + "grad_norm": 2.375467613791358, + "learning_rate": 4.322341222806394e-06, + "loss": 3.9246, + "mean_token_accuracy": 0.30735886842012405, + "step": 14970 + }, + { + "epoch": 0.7234443290754111, + "grad_norm": 2.507571735217888, + "learning_rate": 4.30847149829748e-06, + "loss": 3.9066, + "mean_token_accuracy": 0.3207661330699921, + "step": 14980 + }, + { + "epoch": 0.7239272692149809, + "grad_norm": 2.4644818301505262, + "learning_rate": 4.294617949737353e-06, + "loss": 4.0098, + "mean_token_accuracy": 0.31330645084381104, + "step": 14990 + }, + { + "epoch": 0.7244102093545505, + "grad_norm": 2.408721104937917, + "learning_rate": 4.280780616499325e-06, + "loss": 3.873, + "mean_token_accuracy": 0.3273185506463051, + "step": 15000 + }, + { + "epoch": 0.7244102093545505, + "eval_runtime": 7.7943, + "eval_samples_per_second": 378.994, + "eval_steps_per_second": 23.735, + "step": 15000 + }, + { + "epoch": 0.7248931494941202, + "grad_norm": 2.593428874376227, + "learning_rate": 4.266959537910608e-06, + "loss": 3.9258, + "mean_token_accuracy": 0.31784274280071256, + "step": 15010 + }, + { + "epoch": 0.7253760896336899, + "grad_norm": 2.6411816877271255, + "learning_rate": 4.253154753252235e-06, + "loss": 3.902, + "mean_token_accuracy": 0.31370967626571655, + "step": 15020 + }, + { + "epoch": 0.7258590297732596, + "grad_norm": 2.542398653412803, + "learning_rate": 4.239366301758914e-06, + "loss": 3.9395, + "mean_token_accuracy": 0.3205645129084587, + "step": 15030 + }, + { + "epoch": 0.7263419699128293, + "grad_norm": 2.521715865718538, + "learning_rate": 4.225594222618939e-06, + "loss": 3.8879, + "mean_token_accuracy": 0.32651209831237793, + "step": 15040 + }, + { + "epoch": 0.726824910052399, + "grad_norm": 2.4930738109906505, + "learning_rate": 4.211838554974065e-06, + "loss": 3.923, + "mean_token_accuracy": 0.3111895158886909, + "step": 15050 + }, + { + "epoch": 0.7273078501919688, + "grad_norm": 2.4315326034226166, + "learning_rate": 4.198099337919421e-06, + "loss": 3.9227, + "mean_token_accuracy": 0.3218749985098839, + "step": 15060 + }, + { + "epoch": 0.7277907903315384, + "grad_norm": 2.4449574507272636, + "learning_rate": 4.18437661050336e-06, + "loss": 3.9441, + "mean_token_accuracy": 0.3107862897217274, + "step": 15070 + }, + { + "epoch": 0.7282737304711081, + "grad_norm": 2.370351988418548, + "learning_rate": 4.1706704117273845e-06, + "loss": 3.9156, + "mean_token_accuracy": 0.3192540317773819, + "step": 15080 + }, + { + "epoch": 0.7287566706106778, + "grad_norm": 2.591991883355936, + "learning_rate": 4.156980780546014e-06, + "loss": 3.9797, + "mean_token_accuracy": 0.31673386693000793, + "step": 15090 + }, + { + "epoch": 0.7292396107502475, + "grad_norm": 2.4510274646424866, + "learning_rate": 4.1433077558666876e-06, + "loss": 3.8547, + "mean_token_accuracy": 0.32661290317773817, + "step": 15100 + }, + { + "epoch": 0.7292396107502475, + "eval_runtime": 7.8426, + "eval_samples_per_second": 376.663, + "eval_steps_per_second": 23.589, + "step": 15100 + }, + { + "epoch": 0.7297225508898172, + "grad_norm": 2.5027839013131756, + "learning_rate": 4.129651376549639e-06, + "loss": 3.8727, + "mean_token_accuracy": 0.316129033267498, + "step": 15110 + }, + { + "epoch": 0.7302054910293869, + "grad_norm": 2.5042297187716707, + "learning_rate": 4.11601168140779e-06, + "loss": 3.807, + "mean_token_accuracy": 0.334072582423687, + "step": 15120 + }, + { + "epoch": 0.7306884311689567, + "grad_norm": 2.4037398800644545, + "learning_rate": 4.102388709206666e-06, + "loss": 3.8957, + "mean_token_accuracy": 0.3201612919569016, + "step": 15130 + }, + { + "epoch": 0.7311713713085263, + "grad_norm": 2.5212822272654987, + "learning_rate": 4.088782498664238e-06, + "loss": 3.9094, + "mean_token_accuracy": 0.3208669349551201, + "step": 15140 + }, + { + "epoch": 0.731654311448096, + "grad_norm": 2.6670973617578593, + "learning_rate": 4.0751930884508586e-06, + "loss": 3.9637, + "mean_token_accuracy": 0.3102822571992874, + "step": 15150 + }, + { + "epoch": 0.7321372515876657, + "grad_norm": 2.354301800171696, + "learning_rate": 4.061620517189111e-06, + "loss": 3.9531, + "mean_token_accuracy": 0.3113911300897598, + "step": 15160 + }, + { + "epoch": 0.7326201917272354, + "grad_norm": 2.5479479795334816, + "learning_rate": 4.048064823453748e-06, + "loss": 3.9254, + "mean_token_accuracy": 0.31854838579893113, + "step": 15170 + }, + { + "epoch": 0.7331031318668051, + "grad_norm": 2.4479654094931718, + "learning_rate": 4.034526045771529e-06, + "loss": 3.8668, + "mean_token_accuracy": 0.3244959697127342, + "step": 15180 + }, + { + "epoch": 0.7335860720063748, + "grad_norm": 2.44521805835857, + "learning_rate": 4.021004222621151e-06, + "loss": 3.9937, + "mean_token_accuracy": 0.3016129031777382, + "step": 15190 + }, + { + "epoch": 0.7340690121459446, + "grad_norm": 2.5390031389189214, + "learning_rate": 4.007499392433113e-06, + "loss": 3.9332, + "mean_token_accuracy": 0.317237900197506, + "step": 15200 + }, + { + "epoch": 0.7340690121459446, + "eval_runtime": 7.8288, + "eval_samples_per_second": 377.324, + "eval_steps_per_second": 23.631, + "step": 15200 + }, + { + "epoch": 0.7345519522855142, + "grad_norm": 2.6208555190571587, + "learning_rate": 3.994011593589635e-06, + "loss": 3.9801, + "mean_token_accuracy": 0.31562500149011613, + "step": 15210 + }, + { + "epoch": 0.7350348924250839, + "grad_norm": 2.367527893585157, + "learning_rate": 3.9805408644245145e-06, + "loss": 3.9496, + "mean_token_accuracy": 0.31784273982048034, + "step": 15220 + }, + { + "epoch": 0.7355178325646536, + "grad_norm": 2.707198956784325, + "learning_rate": 3.967087243223046e-06, + "loss": 3.9797, + "mean_token_accuracy": 0.3088709652423859, + "step": 15230 + }, + { + "epoch": 0.7360007727042233, + "grad_norm": 2.4441958593750677, + "learning_rate": 3.953650768221898e-06, + "loss": 3.8805, + "mean_token_accuracy": 0.3265120953321457, + "step": 15240 + }, + { + "epoch": 0.736483712843793, + "grad_norm": 2.33734175337742, + "learning_rate": 3.940231477609004e-06, + "loss": 3.9145, + "mean_token_accuracy": 0.31653225868940355, + "step": 15250 + }, + { + "epoch": 0.7369666529833627, + "grad_norm": 2.468068568695171, + "learning_rate": 3.926829409523466e-06, + "loss": 3.9555, + "mean_token_accuracy": 0.3209677398204803, + "step": 15260 + }, + { + "epoch": 0.7374495931229325, + "grad_norm": 2.5370706100855136, + "learning_rate": 3.913444602055424e-06, + "loss": 3.893, + "mean_token_accuracy": 0.32237903475761415, + "step": 15270 + }, + { + "epoch": 0.7379325332625021, + "grad_norm": 2.4632435230577596, + "learning_rate": 3.900077093245982e-06, + "loss": 3.9699, + "mean_token_accuracy": 0.3167338714003563, + "step": 15280 + }, + { + "epoch": 0.7384154734020718, + "grad_norm": 2.561719935990509, + "learning_rate": 3.886726921087058e-06, + "loss": 3.8734, + "mean_token_accuracy": 0.3237903222441673, + "step": 15290 + }, + { + "epoch": 0.7388984135416415, + "grad_norm": 2.790950404677061, + "learning_rate": 3.873394123521315e-06, + "loss": 3.9156, + "mean_token_accuracy": 0.31804435700178146, + "step": 15300 + }, + { + "epoch": 0.7388984135416415, + "eval_runtime": 7.8425, + "eval_samples_per_second": 376.665, + "eval_steps_per_second": 23.589, + "step": 15300 + }, + { + "epoch": 0.7393813536812112, + "grad_norm": 2.4446765590082546, + "learning_rate": 3.860078738442014e-06, + "loss": 3.9148, + "mean_token_accuracy": 0.3146169364452362, + "step": 15310 + }, + { + "epoch": 0.7398642938207809, + "grad_norm": 2.65967736413874, + "learning_rate": 3.846780803692958e-06, + "loss": 3.941, + "mean_token_accuracy": 0.31804435700178146, + "step": 15320 + }, + { + "epoch": 0.7403472339603506, + "grad_norm": 2.491670541254839, + "learning_rate": 3.833500357068325e-06, + "loss": 3.9438, + "mean_token_accuracy": 0.31441531926393507, + "step": 15330 + }, + { + "epoch": 0.7408301740999204, + "grad_norm": 2.3460682734120524, + "learning_rate": 3.820237436312606e-06, + "loss": 3.9336, + "mean_token_accuracy": 0.3200604826211929, + "step": 15340 + }, + { + "epoch": 0.74131311423949, + "grad_norm": 2.388014872853918, + "learning_rate": 3.806992079120477e-06, + "loss": 3.8742, + "mean_token_accuracy": 0.32610887438058855, + "step": 15350 + }, + { + "epoch": 0.7417960543790597, + "grad_norm": 2.4530352255650296, + "learning_rate": 3.7937643231367038e-06, + "loss": 3.9059, + "mean_token_accuracy": 0.31794354766607286, + "step": 15360 + }, + { + "epoch": 0.7422789945186294, + "grad_norm": 2.664514435291622, + "learning_rate": 3.780554205956014e-06, + "loss": 3.9906, + "mean_token_accuracy": 0.3133064493536949, + "step": 15370 + }, + { + "epoch": 0.7427619346581991, + "grad_norm": 2.3191899016908537, + "learning_rate": 3.7673617651230055e-06, + "loss": 3.8566, + "mean_token_accuracy": 0.32510080486536025, + "step": 15380 + }, + { + "epoch": 0.7432448747977688, + "grad_norm": 2.474256875227795, + "learning_rate": 3.7541870381320564e-06, + "loss": 3.941, + "mean_token_accuracy": 0.31542338579893114, + "step": 15390 + }, + { + "epoch": 0.7437278149373385, + "grad_norm": 2.555223276304514, + "learning_rate": 3.7410300624271768e-06, + "loss": 3.902, + "mean_token_accuracy": 0.3140120983123779, + "step": 15400 + }, + { + "epoch": 0.7437278149373385, + "eval_runtime": 7.7795, + "eval_samples_per_second": 379.716, + "eval_steps_per_second": 23.78, + "step": 15400 + }, + { + "epoch": 0.7442107550769083, + "grad_norm": 2.537189244982377, + "learning_rate": 3.7278908754019438e-06, + "loss": 3.9645, + "mean_token_accuracy": 0.31733870804309844, + "step": 15410 + }, + { + "epoch": 0.7446936952164779, + "grad_norm": 2.442429163563041, + "learning_rate": 3.714769514399358e-06, + "loss": 3.9188, + "mean_token_accuracy": 0.3183467760682106, + "step": 15420 + }, + { + "epoch": 0.7451766353560476, + "grad_norm": 2.430822312159094, + "learning_rate": 3.7016660167117826e-06, + "loss": 3.902, + "mean_token_accuracy": 0.32631048411130903, + "step": 15430 + }, + { + "epoch": 0.7456595754956173, + "grad_norm": 2.4044508435609604, + "learning_rate": 3.688580419580785e-06, + "loss": 3.8777, + "mean_token_accuracy": 0.3193548396229744, + "step": 15440 + }, + { + "epoch": 0.746142515635187, + "grad_norm": 2.4335772557300683, + "learning_rate": 3.675512760197072e-06, + "loss": 3.9574, + "mean_token_accuracy": 0.3194556415081024, + "step": 15450 + }, + { + "epoch": 0.7466254557747567, + "grad_norm": 2.7082274819114507, + "learning_rate": 3.6624630757003676e-06, + "loss": 3.9371, + "mean_token_accuracy": 0.31512096971273423, + "step": 15460 + }, + { + "epoch": 0.7471083959143264, + "grad_norm": 2.5587950414668255, + "learning_rate": 3.6494314031793087e-06, + "loss": 3.9238, + "mean_token_accuracy": 0.3152217730879784, + "step": 15470 + }, + { + "epoch": 0.7475913360538962, + "grad_norm": 2.664794005054441, + "learning_rate": 3.6364177796713353e-06, + "loss": 3.9676, + "mean_token_accuracy": 0.3129032254219055, + "step": 15480 + }, + { + "epoch": 0.7480742761934658, + "grad_norm": 2.531862524210361, + "learning_rate": 3.6234222421625955e-06, + "loss": 3.9008, + "mean_token_accuracy": 0.3285282254219055, + "step": 15490 + }, + { + "epoch": 0.7485572163330355, + "grad_norm": 2.6051289402257902, + "learning_rate": 3.6104448275878335e-06, + "loss": 3.9234, + "mean_token_accuracy": 0.325, + "step": 15500 + }, + { + "epoch": 0.7485572163330355, + "eval_runtime": 7.8145, + "eval_samples_per_second": 378.017, + "eval_steps_per_second": 23.674, + "step": 15500 + }, + { + "epoch": 0.7490401564726052, + "grad_norm": 2.5638234621717126, + "learning_rate": 3.5974855728302893e-06, + "loss": 3.9395, + "mean_token_accuracy": 0.31935483664274217, + "step": 15510 + }, + { + "epoch": 0.7495230966121749, + "grad_norm": 2.7391427381924123, + "learning_rate": 3.5845445147215853e-06, + "loss": 3.9414, + "mean_token_accuracy": 0.316431450843811, + "step": 15520 + }, + { + "epoch": 0.7500060367517446, + "grad_norm": 2.425258497299364, + "learning_rate": 3.5716216900416223e-06, + "loss": 3.9055, + "mean_token_accuracy": 0.3233870953321457, + "step": 15530 + }, + { + "epoch": 0.7504889768913143, + "grad_norm": 2.366059174869697, + "learning_rate": 3.5587171355184993e-06, + "loss": 3.9266, + "mean_token_accuracy": 0.3136088728904724, + "step": 15540 + }, + { + "epoch": 0.7509719170308841, + "grad_norm": 2.636300062613671, + "learning_rate": 3.5458308878283664e-06, + "loss": 4.0047, + "mean_token_accuracy": 0.318044351041317, + "step": 15550 + }, + { + "epoch": 0.7514548571704537, + "grad_norm": 2.6542044032252843, + "learning_rate": 3.532962983595363e-06, + "loss": 3.8215, + "mean_token_accuracy": 0.3284274205565453, + "step": 15560 + }, + { + "epoch": 0.7519377973100234, + "grad_norm": 2.526497619039477, + "learning_rate": 3.520113459391473e-06, + "loss": 3.9012, + "mean_token_accuracy": 0.3254032239317894, + "step": 15570 + }, + { + "epoch": 0.7524207374495931, + "grad_norm": 2.5230094856956358, + "learning_rate": 3.5072823517364696e-06, + "loss": 3.9469, + "mean_token_accuracy": 0.31905241757631303, + "step": 15580 + }, + { + "epoch": 0.7529036775891629, + "grad_norm": 2.58256065884894, + "learning_rate": 3.4944696970977597e-06, + "loss": 3.9438, + "mean_token_accuracy": 0.3193548426032066, + "step": 15590 + }, + { + "epoch": 0.7533866177287325, + "grad_norm": 2.4598479476918675, + "learning_rate": 3.4816755318903162e-06, + "loss": 3.9102, + "mean_token_accuracy": 0.32409273982048037, + "step": 15600 + }, + { + "epoch": 0.7533866177287325, + "eval_runtime": 7.8083, + "eval_samples_per_second": 378.317, + "eval_steps_per_second": 23.693, + "step": 15600 + }, + { + "epoch": 0.7538695578683022, + "grad_norm": 2.415970107077365, + "learning_rate": 3.4688998924765615e-06, + "loss": 3.9352, + "mean_token_accuracy": 0.31794354766607286, + "step": 15610 + }, + { + "epoch": 0.754352498007872, + "grad_norm": 2.4321884051867384, + "learning_rate": 3.456142815166269e-06, + "loss": 3.943, + "mean_token_accuracy": 0.31955645233392715, + "step": 15620 + }, + { + "epoch": 0.7548354381474416, + "grad_norm": 2.392784198660163, + "learning_rate": 3.443404336216446e-06, + "loss": 3.9535, + "mean_token_accuracy": 0.31491935402154925, + "step": 15630 + }, + { + "epoch": 0.7553183782870113, + "grad_norm": 2.2782457512809136, + "learning_rate": 3.430684491831251e-06, + "loss": 3.9594, + "mean_token_accuracy": 0.3191532254219055, + "step": 15640 + }, + { + "epoch": 0.755801318426581, + "grad_norm": 2.473267520440249, + "learning_rate": 3.4179833181618815e-06, + "loss": 3.834, + "mean_token_accuracy": 0.3323588728904724, + "step": 15650 + }, + { + "epoch": 0.7562842585661508, + "grad_norm": 2.4249260049491, + "learning_rate": 3.405300851306462e-06, + "loss": 3.9762, + "mean_token_accuracy": 0.3129032239317894, + "step": 15660 + }, + { + "epoch": 0.7567671987057204, + "grad_norm": 2.6116856637486876, + "learning_rate": 3.39263712730996e-06, + "loss": 3.9023, + "mean_token_accuracy": 0.3192540302872658, + "step": 15670 + }, + { + "epoch": 0.7572501388452901, + "grad_norm": 2.7517678667929193, + "learning_rate": 3.3799921821640614e-06, + "loss": 3.9215, + "mean_token_accuracy": 0.32026209831237795, + "step": 15680 + }, + { + "epoch": 0.7577330789848599, + "grad_norm": 2.5161610627735715, + "learning_rate": 3.3673660518071004e-06, + "loss": 3.859, + "mean_token_accuracy": 0.3259072557091713, + "step": 15690 + }, + { + "epoch": 0.7582160191244295, + "grad_norm": 2.6017300673422197, + "learning_rate": 3.3547587721239163e-06, + "loss": 3.9094, + "mean_token_accuracy": 0.31885080635547636, + "step": 15700 + }, + { + "epoch": 0.7582160191244295, + "eval_runtime": 7.819, + "eval_samples_per_second": 377.8, + "eval_steps_per_second": 23.66, + "step": 15700 + }, + { + "epoch": 0.7586989592639992, + "grad_norm": 2.767250808746583, + "learning_rate": 3.3421703789457837e-06, + "loss": 3.9621, + "mean_token_accuracy": 0.3126008063554764, + "step": 15710 + }, + { + "epoch": 0.759181899403569, + "grad_norm": 2.329018495591996, + "learning_rate": 3.3296009080502987e-06, + "loss": 3.9062, + "mean_token_accuracy": 0.3284274205565453, + "step": 15720 + }, + { + "epoch": 0.7596648395431387, + "grad_norm": 2.5092994722659188, + "learning_rate": 3.317050395161282e-06, + "loss": 3.9113, + "mean_token_accuracy": 0.31330645084381104, + "step": 15730 + }, + { + "epoch": 0.7601477796827083, + "grad_norm": 2.6546587795161884, + "learning_rate": 3.304518875948661e-06, + "loss": 3.9152, + "mean_token_accuracy": 0.3172379046678543, + "step": 15740 + }, + { + "epoch": 0.760630719822278, + "grad_norm": 2.582949496137021, + "learning_rate": 3.2920063860283935e-06, + "loss": 3.9598, + "mean_token_accuracy": 0.31431451737880706, + "step": 15750 + }, + { + "epoch": 0.7611136599618478, + "grad_norm": 2.461486046197451, + "learning_rate": 3.2795129609623477e-06, + "loss": 3.9254, + "mean_token_accuracy": 0.31915322691202164, + "step": 15760 + }, + { + "epoch": 0.7615966001014174, + "grad_norm": 2.6381647773561756, + "learning_rate": 3.267038636258213e-06, + "loss": 3.9652, + "mean_token_accuracy": 0.3188508078455925, + "step": 15770 + }, + { + "epoch": 0.7620795402409871, + "grad_norm": 2.5098004155536167, + "learning_rate": 3.254583447369387e-06, + "loss": 3.9238, + "mean_token_accuracy": 0.31854838579893113, + "step": 15780 + }, + { + "epoch": 0.7625624803805569, + "grad_norm": 2.415709958900494, + "learning_rate": 3.2421474296948764e-06, + "loss": 3.9754, + "mean_token_accuracy": 0.3189516082406044, + "step": 15790 + }, + { + "epoch": 0.7630454205201266, + "grad_norm": 2.4976840406476124, + "learning_rate": 3.229730618579222e-06, + "loss": 3.9324, + "mean_token_accuracy": 0.3132056474685669, + "step": 15800 + }, + { + "epoch": 0.7630454205201266, + "eval_runtime": 7.7948, + "eval_samples_per_second": 378.971, + "eval_steps_per_second": 23.734, + "step": 15800 + }, + { + "epoch": 0.7635283606596962, + "grad_norm": 2.493601075255913, + "learning_rate": 3.2173330493123557e-06, + "loss": 3.9641, + "mean_token_accuracy": 0.32227822542190554, + "step": 15810 + }, + { + "epoch": 0.7640113007992659, + "grad_norm": 2.510355193627828, + "learning_rate": 3.2049547571295303e-06, + "loss": 3.9555, + "mean_token_accuracy": 0.31774193197488787, + "step": 15820 + }, + { + "epoch": 0.7644942409388357, + "grad_norm": 2.7045016229318986, + "learning_rate": 3.1925957772112136e-06, + "loss": 3.9082, + "mean_token_accuracy": 0.32147177010774614, + "step": 15830 + }, + { + "epoch": 0.7649771810784053, + "grad_norm": 2.3423531235386252, + "learning_rate": 3.180256144682986e-06, + "loss": 3.9355, + "mean_token_accuracy": 0.31350806504487994, + "step": 15840 + }, + { + "epoch": 0.765460121217975, + "grad_norm": 2.4603035111451534, + "learning_rate": 3.1679358946154306e-06, + "loss": 3.9418, + "mean_token_accuracy": 0.31945564299821855, + "step": 15850 + }, + { + "epoch": 0.7659430613575448, + "grad_norm": 2.366092766507224, + "learning_rate": 3.155635062024055e-06, + "loss": 3.9059, + "mean_token_accuracy": 0.3220766097307205, + "step": 15860 + }, + { + "epoch": 0.7664260014971145, + "grad_norm": 2.3563125980306787, + "learning_rate": 3.1433536818691734e-06, + "loss": 3.8848, + "mean_token_accuracy": 0.3233870968222618, + "step": 15870 + }, + { + "epoch": 0.7669089416366841, + "grad_norm": 2.472913696601303, + "learning_rate": 3.131091789055822e-06, + "loss": 3.8574, + "mean_token_accuracy": 0.3309475839138031, + "step": 15880 + }, + { + "epoch": 0.7673918817762538, + "grad_norm": 2.416212989032887, + "learning_rate": 3.1188494184336394e-06, + "loss": 3.927, + "mean_token_accuracy": 0.32026209831237795, + "step": 15890 + }, + { + "epoch": 0.7678748219158236, + "grad_norm": 2.4954487608961102, + "learning_rate": 3.1066266047967895e-06, + "loss": 3.8809, + "mean_token_accuracy": 0.3209677442908287, + "step": 15900 + }, + { + "epoch": 0.7678748219158236, + "eval_runtime": 7.8189, + "eval_samples_per_second": 377.803, + "eval_steps_per_second": 23.661, + "step": 15900 + }, + { + "epoch": 0.7683577620553932, + "grad_norm": 2.5377497456994824, + "learning_rate": 3.0944233828838533e-06, + "loss": 3.9172, + "mean_token_accuracy": 0.31885080635547636, + "step": 15910 + }, + { + "epoch": 0.7688407021949629, + "grad_norm": 2.3940917924875667, + "learning_rate": 3.082239787377721e-06, + "loss": 3.8754, + "mean_token_accuracy": 0.31844757944345475, + "step": 15920 + }, + { + "epoch": 0.7693236423345327, + "grad_norm": 2.512979259567937, + "learning_rate": 3.070075852905512e-06, + "loss": 3.9828, + "mean_token_accuracy": 0.30735886842012405, + "step": 15930 + }, + { + "epoch": 0.7698065824741024, + "grad_norm": 2.5069447000299054, + "learning_rate": 3.057931614038463e-06, + "loss": 3.9281, + "mean_token_accuracy": 0.32368951886892317, + "step": 15940 + }, + { + "epoch": 0.770289522613672, + "grad_norm": 2.4178966533910344, + "learning_rate": 3.045807105291838e-06, + "loss": 3.9266, + "mean_token_accuracy": 0.3143145129084587, + "step": 15950 + }, + { + "epoch": 0.7707724627532417, + "grad_norm": 2.4809741475818923, + "learning_rate": 3.033702361124814e-06, + "loss": 3.968, + "mean_token_accuracy": 0.3160282254219055, + "step": 15960 + }, + { + "epoch": 0.7712554028928115, + "grad_norm": 2.6846141184788004, + "learning_rate": 3.0216174159404067e-06, + "loss": 3.9156, + "mean_token_accuracy": 0.3222782269120216, + "step": 15970 + }, + { + "epoch": 0.7717383430323811, + "grad_norm": 2.489939738950206, + "learning_rate": 3.009552304085358e-06, + "loss": 3.9672, + "mean_token_accuracy": 0.3148185521364212, + "step": 15980 + }, + { + "epoch": 0.7722212831719508, + "grad_norm": 2.4379455939343044, + "learning_rate": 2.997507059850041e-06, + "loss": 3.8504, + "mean_token_accuracy": 0.32560483664274215, + "step": 15990 + }, + { + "epoch": 0.7727042233115206, + "grad_norm": 2.4669427582835954, + "learning_rate": 2.985481717468358e-06, + "loss": 3.9441, + "mean_token_accuracy": 0.31250000149011614, + "step": 16000 + }, + { + "epoch": 0.7727042233115206, + "eval_runtime": 7.7928, + "eval_samples_per_second": 379.068, + "eval_steps_per_second": 23.74, + "step": 16000 + }, + { + "epoch": 0.7731871634510903, + "grad_norm": 2.519576489907158, + "learning_rate": 2.9734763111176534e-06, + "loss": 3.9273, + "mean_token_accuracy": 0.3140120983123779, + "step": 16010 + }, + { + "epoch": 0.7736701035906599, + "grad_norm": 2.42330825198855, + "learning_rate": 2.9614908749186124e-06, + "loss": 3.909, + "mean_token_accuracy": 0.3224798396229744, + "step": 16020 + }, + { + "epoch": 0.7741530437302296, + "grad_norm": 2.4422860351800817, + "learning_rate": 2.9495254429351604e-06, + "loss": 3.8449, + "mean_token_accuracy": 0.3275201603770256, + "step": 16030 + }, + { + "epoch": 0.7746359838697994, + "grad_norm": 2.633996798829478, + "learning_rate": 2.9375800491743655e-06, + "loss": 3.8957, + "mean_token_accuracy": 0.32459677308797835, + "step": 16040 + }, + { + "epoch": 0.775118924009369, + "grad_norm": 2.4868630036775476, + "learning_rate": 2.925654727586349e-06, + "loss": 3.9402, + "mean_token_accuracy": 0.32258064597845076, + "step": 16050 + }, + { + "epoch": 0.7756018641489387, + "grad_norm": 2.5262480686013413, + "learning_rate": 2.913749512064188e-06, + "loss": 3.9461, + "mean_token_accuracy": 0.3170362889766693, + "step": 16060 + }, + { + "epoch": 0.7760848042885085, + "grad_norm": 2.39883548222417, + "learning_rate": 2.9018644364438053e-06, + "loss": 3.9383, + "mean_token_accuracy": 0.31804435700178146, + "step": 16070 + }, + { + "epoch": 0.7765677444280782, + "grad_norm": 2.6351213080563927, + "learning_rate": 2.889999534503893e-06, + "loss": 3.9387, + "mean_token_accuracy": 0.31834677457809446, + "step": 16080 + }, + { + "epoch": 0.7770506845676478, + "grad_norm": 2.5557718779587892, + "learning_rate": 2.8781548399658065e-06, + "loss": 3.9621, + "mean_token_accuracy": 0.3150201618671417, + "step": 16090 + }, + { + "epoch": 0.7775336247072175, + "grad_norm": 2.5652311494524644, + "learning_rate": 2.866330386493471e-06, + "loss": 3.9215, + "mean_token_accuracy": 0.3173387095332146, + "step": 16100 + }, + { + "epoch": 0.7775336247072175, + "eval_runtime": 7.7936, + "eval_samples_per_second": 379.027, + "eval_steps_per_second": 23.737, + "step": 16100 + }, + { + "epoch": 0.7780165648467873, + "grad_norm": 2.6079461085726923, + "learning_rate": 2.8545262076932734e-06, + "loss": 3.9406, + "mean_token_accuracy": 0.31572580635547637, + "step": 16110 + }, + { + "epoch": 0.7784995049863569, + "grad_norm": 2.5430824277773207, + "learning_rate": 2.8427423371139895e-06, + "loss": 3.9145, + "mean_token_accuracy": 0.3162298411130905, + "step": 16120 + }, + { + "epoch": 0.7789824451259266, + "grad_norm": 2.4719514016389494, + "learning_rate": 2.8309788082466736e-06, + "loss": 3.9121, + "mean_token_accuracy": 0.3245967745780945, + "step": 16130 + }, + { + "epoch": 0.7794653852654964, + "grad_norm": 2.4676565661863927, + "learning_rate": 2.819235654524568e-06, + "loss": 3.8949, + "mean_token_accuracy": 0.32106854766607285, + "step": 16140 + }, + { + "epoch": 0.7799483254050661, + "grad_norm": 2.6278193950196167, + "learning_rate": 2.8075129093230003e-06, + "loss": 3.9059, + "mean_token_accuracy": 0.3162298366427422, + "step": 16150 + }, + { + "epoch": 0.7804312655446357, + "grad_norm": 2.690742344407508, + "learning_rate": 2.7958106059593015e-06, + "loss": 3.8832, + "mean_token_accuracy": 0.3235887110233307, + "step": 16160 + }, + { + "epoch": 0.7809142056842054, + "grad_norm": 2.6272651864157175, + "learning_rate": 2.7841287776927017e-06, + "loss": 3.9227, + "mean_token_accuracy": 0.3182459682226181, + "step": 16170 + }, + { + "epoch": 0.7813971458237752, + "grad_norm": 2.858965289263669, + "learning_rate": 2.772467457724243e-06, + "loss": 3.8965, + "mean_token_accuracy": 0.327822582423687, + "step": 16180 + }, + { + "epoch": 0.7818800859633448, + "grad_norm": 2.730948147511468, + "learning_rate": 2.76082667919667e-06, + "loss": 3.9512, + "mean_token_accuracy": 0.3166330620646477, + "step": 16190 + }, + { + "epoch": 0.7823630261029145, + "grad_norm": 2.4100917419409793, + "learning_rate": 2.749206475194357e-06, + "loss": 3.9395, + "mean_token_accuracy": 0.31108870878815653, + "step": 16200 + }, + { + "epoch": 0.7823630261029145, + "eval_runtime": 7.7859, + "eval_samples_per_second": 379.404, + "eval_steps_per_second": 23.761, + "step": 16200 + }, + { + "epoch": 0.7828459662424843, + "grad_norm": 2.5261171713649153, + "learning_rate": 2.737606878743202e-06, + "loss": 3.9016, + "mean_token_accuracy": 0.32429435551166536, + "step": 16210 + }, + { + "epoch": 0.783328906382054, + "grad_norm": 2.5556710826864326, + "learning_rate": 2.726027922810526e-06, + "loss": 3.9312, + "mean_token_accuracy": 0.3151209682226181, + "step": 16220 + }, + { + "epoch": 0.7838118465216236, + "grad_norm": 2.375014949832733, + "learning_rate": 2.714469640304994e-06, + "loss": 3.8926, + "mean_token_accuracy": 0.32681451439857484, + "step": 16230 + }, + { + "epoch": 0.7842947866611933, + "grad_norm": 2.5494710127341933, + "learning_rate": 2.7029320640765166e-06, + "loss": 3.9289, + "mean_token_accuracy": 0.3190524145960808, + "step": 16240 + }, + { + "epoch": 0.7847777268007631, + "grad_norm": 2.3762506462613633, + "learning_rate": 2.691415226916153e-06, + "loss": 3.8652, + "mean_token_accuracy": 0.32026209533214567, + "step": 16250 + }, + { + "epoch": 0.7852606669403327, + "grad_norm": 2.4240461691185216, + "learning_rate": 2.679919161556014e-06, + "loss": 3.9391, + "mean_token_accuracy": 0.32258064597845076, + "step": 16260 + }, + { + "epoch": 0.7857436070799024, + "grad_norm": 2.3379108180916286, + "learning_rate": 2.6684439006691807e-06, + "loss": 3.8594, + "mean_token_accuracy": 0.32389113008975984, + "step": 16270 + }, + { + "epoch": 0.7862265472194722, + "grad_norm": 2.5169286063457013, + "learning_rate": 2.6569894768696048e-06, + "loss": 3.9629, + "mean_token_accuracy": 0.3142137110233307, + "step": 16280 + }, + { + "epoch": 0.7867094873590419, + "grad_norm": 2.4211109867696003, + "learning_rate": 2.64555592271202e-06, + "loss": 3.9145, + "mean_token_accuracy": 0.31733870804309844, + "step": 16290 + }, + { + "epoch": 0.7871924274986115, + "grad_norm": 2.4890569253235735, + "learning_rate": 2.6341432706918357e-06, + "loss": 3.9414, + "mean_token_accuracy": 0.32227823138237, + "step": 16300 + }, + { + "epoch": 0.7871924274986115, + "eval_runtime": 7.8277, + "eval_samples_per_second": 377.377, + "eval_steps_per_second": 23.634, + "step": 16300 + }, + { + "epoch": 0.7876753676381812, + "grad_norm": 2.414208603695168, + "learning_rate": 2.6227515532450644e-06, + "loss": 3.9043, + "mean_token_accuracy": 0.32449596375226974, + "step": 16310 + }, + { + "epoch": 0.788158307777751, + "grad_norm": 2.842053128996133, + "learning_rate": 2.611380802748221e-06, + "loss": 3.9184, + "mean_token_accuracy": 0.32157257944345474, + "step": 16320 + }, + { + "epoch": 0.7886412479173206, + "grad_norm": 2.511953260541321, + "learning_rate": 2.6000310515182213e-06, + "loss": 3.9387, + "mean_token_accuracy": 0.32399193197488785, + "step": 16330 + }, + { + "epoch": 0.7891241880568903, + "grad_norm": 2.4278924456945976, + "learning_rate": 2.5887023318123063e-06, + "loss": 3.9059, + "mean_token_accuracy": 0.3213709682226181, + "step": 16340 + }, + { + "epoch": 0.7896071281964601, + "grad_norm": 2.4623932720379536, + "learning_rate": 2.57739467582794e-06, + "loss": 3.9242, + "mean_token_accuracy": 0.3230846807360649, + "step": 16350 + }, + { + "epoch": 0.7900900683360298, + "grad_norm": 2.543508384283926, + "learning_rate": 2.566108115702728e-06, + "loss": 3.9363, + "mean_token_accuracy": 0.31834677457809446, + "step": 16360 + }, + { + "epoch": 0.7905730084755994, + "grad_norm": 2.4619000045994666, + "learning_rate": 2.554842683514305e-06, + "loss": 3.9219, + "mean_token_accuracy": 0.321169351041317, + "step": 16370 + }, + { + "epoch": 0.7910559486151691, + "grad_norm": 2.5424361809339002, + "learning_rate": 2.543598411280269e-06, + "loss": 3.8965, + "mean_token_accuracy": 0.32368951886892317, + "step": 16380 + }, + { + "epoch": 0.7915388887547389, + "grad_norm": 2.67327142330878, + "learning_rate": 2.5323753309580766e-06, + "loss": 3.898, + "mean_token_accuracy": 0.3201612889766693, + "step": 16390 + }, + { + "epoch": 0.7920218288943085, + "grad_norm": 2.433649543912036, + "learning_rate": 2.521173474444957e-06, + "loss": 3.9027, + "mean_token_accuracy": 0.32409273982048037, + "step": 16400 + }, + { + "epoch": 0.7920218288943085, + "eval_runtime": 7.8437, + "eval_samples_per_second": 376.608, + "eval_steps_per_second": 23.586, + "step": 16400 + }, + { + "epoch": 0.7925047690338782, + "grad_norm": 2.5224384469736822, + "learning_rate": 2.5099928735778103e-06, + "loss": 3.8809, + "mean_token_accuracy": 0.32469758242368696, + "step": 16410 + }, + { + "epoch": 0.792987709173448, + "grad_norm": 2.504768179751752, + "learning_rate": 2.4988335601331336e-06, + "loss": 3.9047, + "mean_token_accuracy": 0.31784273982048034, + "step": 16420 + }, + { + "epoch": 0.7934706493130177, + "grad_norm": 2.5360401475869305, + "learning_rate": 2.487695565826922e-06, + "loss": 4.0004, + "mean_token_accuracy": 0.31068548411130903, + "step": 16430 + }, + { + "epoch": 0.7939535894525873, + "grad_norm": 2.4926819515576226, + "learning_rate": 2.4765789223145797e-06, + "loss": 3.982, + "mean_token_accuracy": 0.31643145233392717, + "step": 16440 + }, + { + "epoch": 0.794436529592157, + "grad_norm": 2.6332858795459964, + "learning_rate": 2.465483661190824e-06, + "loss": 3.9148, + "mean_token_accuracy": 0.31461693346500397, + "step": 16450 + }, + { + "epoch": 0.7949194697317268, + "grad_norm": 2.483125440054413, + "learning_rate": 2.4544098139896056e-06, + "loss": 4.0039, + "mean_token_accuracy": 0.3110887110233307, + "step": 16460 + }, + { + "epoch": 0.7954024098712964, + "grad_norm": 2.6720920517350173, + "learning_rate": 2.4433574121840178e-06, + "loss": 3.9012, + "mean_token_accuracy": 0.31733870804309844, + "step": 16470 + }, + { + "epoch": 0.7958853500108661, + "grad_norm": 2.6393270847156165, + "learning_rate": 2.4323264871861927e-06, + "loss": 3.8641, + "mean_token_accuracy": 0.32358870804309847, + "step": 16480 + }, + { + "epoch": 0.7963682901504359, + "grad_norm": 2.4294413949842917, + "learning_rate": 2.4213170703472355e-06, + "loss": 3.9723, + "mean_token_accuracy": 0.31794354915618894, + "step": 16490 + }, + { + "epoch": 0.7968512302900056, + "grad_norm": 2.6723150672222635, + "learning_rate": 2.410329192957116e-06, + "loss": 3.8648, + "mean_token_accuracy": 0.32752016335725787, + "step": 16500 + }, + { + "epoch": 0.7968512302900056, + "eval_runtime": 7.8122, + "eval_samples_per_second": 378.128, + "eval_steps_per_second": 23.681, + "step": 16500 + }, + { + "epoch": 0.7973341704295752, + "grad_norm": 2.675929868974572, + "learning_rate": 2.399362886244592e-06, + "loss": 3.8508, + "mean_token_accuracy": 0.32782258093357086, + "step": 16510 + }, + { + "epoch": 0.797817110569145, + "grad_norm": 2.3894708340546615, + "learning_rate": 2.3884181813771025e-06, + "loss": 4.0023, + "mean_token_accuracy": 0.31844758093357084, + "step": 16520 + }, + { + "epoch": 0.7983000507087147, + "grad_norm": 2.6234629143490076, + "learning_rate": 2.377495109460706e-06, + "loss": 3.9281, + "mean_token_accuracy": 0.32520161420106886, + "step": 16530 + }, + { + "epoch": 0.7987829908482843, + "grad_norm": 2.451513525214137, + "learning_rate": 2.36659370153997e-06, + "loss": 3.8617, + "mean_token_accuracy": 0.32923386842012403, + "step": 16540 + }, + { + "epoch": 0.799265930987854, + "grad_norm": 2.499150543671022, + "learning_rate": 2.3557139885978965e-06, + "loss": 3.9324, + "mean_token_accuracy": 0.3196572571992874, + "step": 16550 + }, + { + "epoch": 0.7997488711274238, + "grad_norm": 2.3862913153766336, + "learning_rate": 2.3448560015558177e-06, + "loss": 3.9543, + "mean_token_accuracy": 0.3138104841113091, + "step": 16560 + }, + { + "epoch": 0.8002318112669935, + "grad_norm": 2.41113089862968, + "learning_rate": 2.334019771273327e-06, + "loss": 3.8668, + "mean_token_accuracy": 0.324193549156189, + "step": 16570 + }, + { + "epoch": 0.8007147514065631, + "grad_norm": 2.735495230570863, + "learning_rate": 2.3232053285481814e-06, + "loss": 3.9336, + "mean_token_accuracy": 0.31481854543089866, + "step": 16580 + }, + { + "epoch": 0.8011976915461329, + "grad_norm": 2.5973844807381727, + "learning_rate": 2.312412704116218e-06, + "loss": 3.9051, + "mean_token_accuracy": 0.3264112889766693, + "step": 16590 + }, + { + "epoch": 0.8016806316857026, + "grad_norm": 2.439730137959072, + "learning_rate": 2.3016419286512525e-06, + "loss": 3.8828, + "mean_token_accuracy": 0.3177419364452362, + "step": 16600 + }, + { + "epoch": 0.8016806316857026, + "eval_runtime": 7.7873, + "eval_samples_per_second": 379.336, + "eval_steps_per_second": 23.757, + "step": 16600 + }, + { + "epoch": 0.8021635718252722, + "grad_norm": 2.5416858470207355, + "learning_rate": 2.2908930327650157e-06, + "loss": 3.9117, + "mean_token_accuracy": 0.3216733857989311, + "step": 16610 + }, + { + "epoch": 0.8026465119648419, + "grad_norm": 2.6011642180344268, + "learning_rate": 2.2801660470070507e-06, + "loss": 3.8793, + "mean_token_accuracy": 0.3247262641787529, + "step": 16620 + }, + { + "epoch": 0.8031294521044117, + "grad_norm": 2.534611112505, + "learning_rate": 2.2694610018646245e-06, + "loss": 3.9574, + "mean_token_accuracy": 0.3204637080430984, + "step": 16630 + }, + { + "epoch": 0.8036123922439814, + "grad_norm": 2.486281908750067, + "learning_rate": 2.258777927762653e-06, + "loss": 3.9102, + "mean_token_accuracy": 0.32405434995889665, + "step": 16640 + }, + { + "epoch": 0.804095332383551, + "grad_norm": 2.6224260913507416, + "learning_rate": 2.2481168550636045e-06, + "loss": 3.8547, + "mean_token_accuracy": 0.3225806444883347, + "step": 16650 + }, + { + "epoch": 0.8045782725231208, + "grad_norm": 2.3806781619885204, + "learning_rate": 2.2374778140674225e-06, + "loss": 3.8582, + "mean_token_accuracy": 0.3275201618671417, + "step": 16660 + }, + { + "epoch": 0.8050612126626905, + "grad_norm": 2.4876755798307157, + "learning_rate": 2.226860835011423e-06, + "loss": 3.9195, + "mean_token_accuracy": 0.3277217760682106, + "step": 16670 + }, + { + "epoch": 0.8055441528022601, + "grad_norm": 2.4324733612217715, + "learning_rate": 2.2162659480702275e-06, + "loss": 3.9031, + "mean_token_accuracy": 0.3170362874865532, + "step": 16680 + }, + { + "epoch": 0.8060270929418298, + "grad_norm": 2.5046025756743977, + "learning_rate": 2.2056931833556694e-06, + "loss": 3.8676, + "mean_token_accuracy": 0.3243951588869095, + "step": 16690 + }, + { + "epoch": 0.8065100330813996, + "grad_norm": 2.687273634169363, + "learning_rate": 2.1951425709167095e-06, + "loss": 3.8457, + "mean_token_accuracy": 0.32963709682226183, + "step": 16700 + }, + { + "epoch": 0.8065100330813996, + "eval_runtime": 7.7601, + "eval_samples_per_second": 380.664, + "eval_steps_per_second": 23.84, + "step": 16700 + }, + { + "epoch": 0.8069929732209693, + "grad_norm": 2.6150983680182054, + "learning_rate": 2.1846141407393393e-06, + "loss": 3.9645, + "mean_token_accuracy": 0.3110887110233307, + "step": 16710 + }, + { + "epoch": 0.8074759133605389, + "grad_norm": 2.436027318869901, + "learning_rate": 2.174107922746518e-06, + "loss": 3.8863, + "mean_token_accuracy": 0.31512096524238586, + "step": 16720 + }, + { + "epoch": 0.8079588535001087, + "grad_norm": 2.5461376475714674, + "learning_rate": 2.1636239467980725e-06, + "loss": 3.925, + "mean_token_accuracy": 0.3215725839138031, + "step": 16730 + }, + { + "epoch": 0.8084417936396784, + "grad_norm": 2.4314482623233844, + "learning_rate": 2.1531622426906063e-06, + "loss": 3.8543, + "mean_token_accuracy": 0.32379032075405123, + "step": 16740 + }, + { + "epoch": 0.8089247337792481, + "grad_norm": 2.379302785226099, + "learning_rate": 2.142722840157435e-06, + "loss": 3.9109, + "mean_token_accuracy": 0.32328629195690156, + "step": 16750 + }, + { + "epoch": 0.8094076739188177, + "grad_norm": 2.611400788057758, + "learning_rate": 2.132305768868488e-06, + "loss": 3.95, + "mean_token_accuracy": 0.31945564448833463, + "step": 16760 + }, + { + "epoch": 0.8098906140583875, + "grad_norm": 2.392008594223068, + "learning_rate": 2.121911058430225e-06, + "loss": 3.9102, + "mean_token_accuracy": 0.3190524220466614, + "step": 16770 + }, + { + "epoch": 0.8103735541979572, + "grad_norm": 2.650320084641724, + "learning_rate": 2.1115387383855513e-06, + "loss": 3.9527, + "mean_token_accuracy": 0.31875000298023226, + "step": 16780 + }, + { + "epoch": 0.8108564943375268, + "grad_norm": 2.3822462082754767, + "learning_rate": 2.10118883821374e-06, + "loss": 3.9215, + "mean_token_accuracy": 0.315625, + "step": 16790 + }, + { + "epoch": 0.8113394344770966, + "grad_norm": 2.6889518708907922, + "learning_rate": 2.0908613873303454e-06, + "loss": 3.9277, + "mean_token_accuracy": 0.3151209712028503, + "step": 16800 + }, + { + "epoch": 0.8113394344770966, + "eval_runtime": 7.8206, + "eval_samples_per_second": 377.72, + "eval_steps_per_second": 23.655, + "step": 16800 + }, + { + "epoch": 0.8118223746166663, + "grad_norm": 2.550458750695889, + "learning_rate": 2.0805564150871173e-06, + "loss": 3.873, + "mean_token_accuracy": 0.3179435506463051, + "step": 16810 + }, + { + "epoch": 0.812305314756236, + "grad_norm": 2.6128267204973405, + "learning_rate": 2.070273950771915e-06, + "loss": 3.8691, + "mean_token_accuracy": 0.3272177383303642, + "step": 16820 + }, + { + "epoch": 0.8127882548958056, + "grad_norm": 2.587908446299981, + "learning_rate": 2.0600140236086308e-06, + "loss": 3.8918, + "mean_token_accuracy": 0.3230846762657166, + "step": 16830 + }, + { + "epoch": 0.8132711950353754, + "grad_norm": 2.5807468063594294, + "learning_rate": 2.0497766627571057e-06, + "loss": 3.9773, + "mean_token_accuracy": 0.3130040317773819, + "step": 16840 + }, + { + "epoch": 0.8137541351749451, + "grad_norm": 2.6755023367814896, + "learning_rate": 2.039561897313046e-06, + "loss": 3.9195, + "mean_token_accuracy": 0.3190524190664291, + "step": 16850 + }, + { + "epoch": 0.8142370753145147, + "grad_norm": 2.6044266489922934, + "learning_rate": 2.0293697563079308e-06, + "loss": 3.9348, + "mean_token_accuracy": 0.318245966732502, + "step": 16860 + }, + { + "epoch": 0.8147200154540845, + "grad_norm": 2.4927386975261454, + "learning_rate": 2.019200268708945e-06, + "loss": 3.9402, + "mean_token_accuracy": 0.31340725868940356, + "step": 16870 + }, + { + "epoch": 0.8152029555936542, + "grad_norm": 2.5621006819858887, + "learning_rate": 2.0090534634188928e-06, + "loss": 3.9223, + "mean_token_accuracy": 0.3196572601795197, + "step": 16880 + }, + { + "epoch": 0.8156858957332239, + "grad_norm": 2.4242295628336055, + "learning_rate": 1.9989293692761024e-06, + "loss": 3.9035, + "mean_token_accuracy": 0.3220766127109528, + "step": 16890 + }, + { + "epoch": 0.8161688358727935, + "grad_norm": 2.519424020713278, + "learning_rate": 1.9888280150543647e-06, + "loss": 3.868, + "mean_token_accuracy": 0.31885080337524413, + "step": 16900 + }, + { + "epoch": 0.8161688358727935, + "eval_runtime": 7.8301, + "eval_samples_per_second": 377.261, + "eval_steps_per_second": 23.627, + "step": 16900 + }, + { + "epoch": 0.8166517760123633, + "grad_norm": 2.535137419412962, + "learning_rate": 1.9787494294628373e-06, + "loss": 3.8527, + "mean_token_accuracy": 0.3292338728904724, + "step": 16910 + }, + { + "epoch": 0.817134716151933, + "grad_norm": 2.596490646650906, + "learning_rate": 1.968693641145968e-06, + "loss": 3.9609, + "mean_token_accuracy": 0.3181451603770256, + "step": 16920 + }, + { + "epoch": 0.8176176562915026, + "grad_norm": 2.4151702777889925, + "learning_rate": 1.958660678683406e-06, + "loss": 3.8828, + "mean_token_accuracy": 0.32177419364452364, + "step": 16930 + }, + { + "epoch": 0.8181005964310724, + "grad_norm": 2.573406277489698, + "learning_rate": 1.948650570589936e-06, + "loss": 3.8855, + "mean_token_accuracy": 0.3219758063554764, + "step": 16940 + }, + { + "epoch": 0.8185835365706421, + "grad_norm": 2.57344482841107, + "learning_rate": 1.9386633453153826e-06, + "loss": 3.9406, + "mean_token_accuracy": 0.3123991906642914, + "step": 16950 + }, + { + "epoch": 0.8190664767102118, + "grad_norm": 2.4151219582823202, + "learning_rate": 1.92869903124454e-06, + "loss": 3.8496, + "mean_token_accuracy": 0.32540322542190553, + "step": 16960 + }, + { + "epoch": 0.8195494168497814, + "grad_norm": 2.5543777691813956, + "learning_rate": 1.9187576566970766e-06, + "loss": 3.85, + "mean_token_accuracy": 0.3222782269120216, + "step": 16970 + }, + { + "epoch": 0.8200323569893512, + "grad_norm": 2.381780186947048, + "learning_rate": 1.9088392499274734e-06, + "loss": 3.8914, + "mean_token_accuracy": 0.3225806444883347, + "step": 16980 + }, + { + "epoch": 0.8205152971289209, + "grad_norm": 2.6101620136163994, + "learning_rate": 1.8989438391249315e-06, + "loss": 3.8863, + "mean_token_accuracy": 0.3214717760682106, + "step": 16990 + }, + { + "epoch": 0.8209982372684905, + "grad_norm": 2.502577854036647, + "learning_rate": 1.8890714524132958e-06, + "loss": 3.9422, + "mean_token_accuracy": 0.3118951618671417, + "step": 17000 + }, + { + "epoch": 0.8209982372684905, + "eval_runtime": 7.8065, + "eval_samples_per_second": 378.402, + "eval_steps_per_second": 23.698, + "step": 17000 + }, + { + "epoch": 0.8214811774080603, + "grad_norm": 2.5690215396224483, + "learning_rate": 1.8792221178509696e-06, + "loss": 3.8648, + "mean_token_accuracy": 0.3270161271095276, + "step": 17010 + }, + { + "epoch": 0.82196411754763, + "grad_norm": 2.502535822958402, + "learning_rate": 1.8693958634308452e-06, + "loss": 3.9094, + "mean_token_accuracy": 0.321370966732502, + "step": 17020 + }, + { + "epoch": 0.8224470576871997, + "grad_norm": 2.668974892618196, + "learning_rate": 1.8595927170802175e-06, + "loss": 3.8594, + "mean_token_accuracy": 0.3273185506463051, + "step": 17030 + }, + { + "epoch": 0.8229299978267693, + "grad_norm": 2.643450235908905, + "learning_rate": 1.849812706660702e-06, + "loss": 3.909, + "mean_token_accuracy": 0.3168697118759155, + "step": 17040 + }, + { + "epoch": 0.8234129379663391, + "grad_norm": 2.4585571603267833, + "learning_rate": 1.8400558599681617e-06, + "loss": 3.8957, + "mean_token_accuracy": 0.31673386991024016, + "step": 17050 + }, + { + "epoch": 0.8238958781059088, + "grad_norm": 2.6591025669212005, + "learning_rate": 1.8303222047326275e-06, + "loss": 4.0102, + "mean_token_accuracy": 0.3132056459784508, + "step": 17060 + }, + { + "epoch": 0.8243788182454784, + "grad_norm": 2.4330721163796944, + "learning_rate": 1.820611768618218e-06, + "loss": 3.9203, + "mean_token_accuracy": 0.3226814538240433, + "step": 17070 + }, + { + "epoch": 0.8248617583850482, + "grad_norm": 2.535287842660409, + "learning_rate": 1.8109245792230545e-06, + "loss": 3.8875, + "mean_token_accuracy": 0.32096774131059647, + "step": 17080 + }, + { + "epoch": 0.8253446985246179, + "grad_norm": 2.609166515058401, + "learning_rate": 1.801260664079194e-06, + "loss": 3.8504, + "mean_token_accuracy": 0.3214717745780945, + "step": 17090 + }, + { + "epoch": 0.8258276386641876, + "grad_norm": 2.449982066803611, + "learning_rate": 1.7916200506525462e-06, + "loss": 3.8836, + "mean_token_accuracy": 0.3186491936445236, + "step": 17100 + }, + { + "epoch": 0.8258276386641876, + "eval_runtime": 7.8054, + "eval_samples_per_second": 378.454, + "eval_steps_per_second": 23.701, + "step": 17100 + }, + { + "epoch": 0.8263105788037572, + "grad_norm": 2.6535672183544214, + "learning_rate": 1.7820027663427918e-06, + "loss": 3.9543, + "mean_token_accuracy": 0.3135080635547638, + "step": 17110 + }, + { + "epoch": 0.826793518943327, + "grad_norm": 2.786276944579817, + "learning_rate": 1.772408838483307e-06, + "loss": 3.9449, + "mean_token_accuracy": 0.31219757795333863, + "step": 17120 + }, + { + "epoch": 0.8272764590828967, + "grad_norm": 2.6499136566142107, + "learning_rate": 1.7628382943410882e-06, + "loss": 3.9191, + "mean_token_accuracy": 0.3180443525314331, + "step": 17130 + }, + { + "epoch": 0.8277593992224663, + "grad_norm": 2.5238124344804853, + "learning_rate": 1.753291161116677e-06, + "loss": 3.9758, + "mean_token_accuracy": 0.31703629195690153, + "step": 17140 + }, + { + "epoch": 0.8282423393620361, + "grad_norm": 2.4099653479491594, + "learning_rate": 1.7437674659440663e-06, + "loss": 3.8711, + "mean_token_accuracy": 0.3224798426032066, + "step": 17150 + }, + { + "epoch": 0.8287252795016058, + "grad_norm": 2.5444956940139303, + "learning_rate": 1.7342672358906487e-06, + "loss": 3.882, + "mean_token_accuracy": 0.3190524160861969, + "step": 17160 + }, + { + "epoch": 0.8292082196411755, + "grad_norm": 2.524613062472469, + "learning_rate": 1.7247904979571184e-06, + "loss": 3.8875, + "mean_token_accuracy": 0.32076612710952757, + "step": 17170 + }, + { + "epoch": 0.8296911597807451, + "grad_norm": 2.5925078281534244, + "learning_rate": 1.7153372790774092e-06, + "loss": 3.9406, + "mean_token_accuracy": 0.31905242055654526, + "step": 17180 + }, + { + "epoch": 0.8301740999203149, + "grad_norm": 2.4780013122019193, + "learning_rate": 1.7059076061186008e-06, + "loss": 3.8977, + "mean_token_accuracy": 0.32116935700178145, + "step": 17190 + }, + { + "epoch": 0.8306570400598846, + "grad_norm": 2.574221523746746, + "learning_rate": 1.696501505880863e-06, + "loss": 3.9051, + "mean_token_accuracy": 0.32671370804309846, + "step": 17200 + }, + { + "epoch": 0.8306570400598846, + "eval_runtime": 7.7964, + "eval_samples_per_second": 378.891, + "eval_steps_per_second": 23.729, + "step": 17200 + }, + { + "epoch": 0.8311399801994542, + "grad_norm": 2.7117294620270824, + "learning_rate": 1.687119005097363e-06, + "loss": 3.9316, + "mean_token_accuracy": 0.3150201618671417, + "step": 17210 + }, + { + "epoch": 0.831622920339024, + "grad_norm": 2.4010674996449586, + "learning_rate": 1.6777601304342016e-06, + "loss": 3.9254, + "mean_token_accuracy": 0.321673384308815, + "step": 17220 + }, + { + "epoch": 0.8321058604785937, + "grad_norm": 2.4882554764900915, + "learning_rate": 1.6684249084903225e-06, + "loss": 3.9473, + "mean_token_accuracy": 0.3157258048653603, + "step": 17230 + }, + { + "epoch": 0.8325888006181634, + "grad_norm": 2.4635578182484803, + "learning_rate": 1.6591133657974557e-06, + "loss": 3.8875, + "mean_token_accuracy": 0.32046370655298234, + "step": 17240 + }, + { + "epoch": 0.833071740757733, + "grad_norm": 2.57611782415298, + "learning_rate": 1.6498255288200248e-06, + "loss": 3.9301, + "mean_token_accuracy": 0.31350806802511216, + "step": 17250 + }, + { + "epoch": 0.8335546808973028, + "grad_norm": 2.6001614071423464, + "learning_rate": 1.640561423955086e-06, + "loss": 3.875, + "mean_token_accuracy": 0.3285282239317894, + "step": 17260 + }, + { + "epoch": 0.8340376210368725, + "grad_norm": 2.584795999520936, + "learning_rate": 1.6313210775322375e-06, + "loss": 3.882, + "mean_token_accuracy": 0.32520161122083663, + "step": 17270 + }, + { + "epoch": 0.8345205611764421, + "grad_norm": 2.580457378985107, + "learning_rate": 1.6221045158135606e-06, + "loss": 3.9844, + "mean_token_accuracy": 0.31834677159786223, + "step": 17280 + }, + { + "epoch": 0.8350035013160119, + "grad_norm": 2.5138319706121486, + "learning_rate": 1.6129117649935378e-06, + "loss": 3.923, + "mean_token_accuracy": 0.31743952035903933, + "step": 17290 + }, + { + "epoch": 0.8354864414555816, + "grad_norm": 2.3423205888685645, + "learning_rate": 1.6037428511989705e-06, + "loss": 3.9813, + "mean_token_accuracy": 0.31441532671451566, + "step": 17300 + }, + { + "epoch": 0.8354864414555816, + "eval_runtime": 7.8065, + "eval_samples_per_second": 378.401, + "eval_steps_per_second": 23.698, + "step": 17300 + }, + { + "epoch": 0.8359693815951513, + "grad_norm": 2.5436061994772556, + "learning_rate": 1.5945978004889218e-06, + "loss": 3.9258, + "mean_token_accuracy": 0.318548384308815, + "step": 17310 + }, + { + "epoch": 0.836452321734721, + "grad_norm": 2.741691426941505, + "learning_rate": 1.5854766388546284e-06, + "loss": 3.9387, + "mean_token_accuracy": 0.315120966732502, + "step": 17320 + }, + { + "epoch": 0.8369352618742907, + "grad_norm": 2.686510441187782, + "learning_rate": 1.576379392219437e-06, + "loss": 3.941, + "mean_token_accuracy": 0.31542338579893114, + "step": 17330 + }, + { + "epoch": 0.8374182020138604, + "grad_norm": 2.4551701738351253, + "learning_rate": 1.5673060864387146e-06, + "loss": 3.8379, + "mean_token_accuracy": 0.33397177010774615, + "step": 17340 + }, + { + "epoch": 0.83790114215343, + "grad_norm": 2.6245231178723167, + "learning_rate": 1.5582567472997966e-06, + "loss": 3.8723, + "mean_token_accuracy": 0.3239919349551201, + "step": 17350 + }, + { + "epoch": 0.8383840822929998, + "grad_norm": 2.592285760307571, + "learning_rate": 1.5492314005218967e-06, + "loss": 3.9418, + "mean_token_accuracy": 0.3153225839138031, + "step": 17360 + }, + { + "epoch": 0.8388670224325695, + "grad_norm": 2.486787451717872, + "learning_rate": 1.5402300717560436e-06, + "loss": 3.8746, + "mean_token_accuracy": 0.32590726017951965, + "step": 17370 + }, + { + "epoch": 0.8393499625721392, + "grad_norm": 2.5401410402396567, + "learning_rate": 1.531252786584998e-06, + "loss": 3.9711, + "mean_token_accuracy": 0.3130040317773819, + "step": 17380 + }, + { + "epoch": 0.8398329027117089, + "grad_norm": 2.550211841189419, + "learning_rate": 1.5222995705231912e-06, + "loss": 3.9887, + "mean_token_accuracy": 0.3118951618671417, + "step": 17390 + }, + { + "epoch": 0.8403158428512786, + "grad_norm": 2.503064967082286, + "learning_rate": 1.5133704490166502e-06, + "loss": 3.9324, + "mean_token_accuracy": 0.32066532373428347, + "step": 17400 + }, + { + "epoch": 0.8403158428512786, + "eval_runtime": 7.8477, + "eval_samples_per_second": 376.417, + "eval_steps_per_second": 23.574, + "step": 17400 + }, + { + "epoch": 0.8407987829908483, + "grad_norm": 2.472698593343891, + "learning_rate": 1.504465447442911e-06, + "loss": 3.8543, + "mean_token_accuracy": 0.3254032239317894, + "step": 17410 + }, + { + "epoch": 0.8412817231304179, + "grad_norm": 2.4917730540245064, + "learning_rate": 1.4955845911109713e-06, + "loss": 3.9602, + "mean_token_accuracy": 0.3097782269120216, + "step": 17420 + }, + { + "epoch": 0.8417646632699877, + "grad_norm": 2.5462475749657068, + "learning_rate": 1.486727905261196e-06, + "loss": 3.9543, + "mean_token_accuracy": 0.3137096792459488, + "step": 17430 + }, + { + "epoch": 0.8422476034095574, + "grad_norm": 2.5142472951474737, + "learning_rate": 1.4778954150652646e-06, + "loss": 3.9203, + "mean_token_accuracy": 0.3162298396229744, + "step": 17440 + }, + { + "epoch": 0.8427305435491271, + "grad_norm": 2.5390040650290473, + "learning_rate": 1.4690871456260758e-06, + "loss": 3.9176, + "mean_token_accuracy": 0.31582661271095275, + "step": 17450 + }, + { + "epoch": 0.8432134836886968, + "grad_norm": 2.585182230094246, + "learning_rate": 1.460303121977703e-06, + "loss": 3.8492, + "mean_token_accuracy": 0.3227822586894035, + "step": 17460 + }, + { + "epoch": 0.8436964238282665, + "grad_norm": 2.4832673111880776, + "learning_rate": 1.4515433690853054e-06, + "loss": 3.9402, + "mean_token_accuracy": 0.3161290347576141, + "step": 17470 + }, + { + "epoch": 0.8441793639678362, + "grad_norm": 2.6491581535177966, + "learning_rate": 1.4428079118450634e-06, + "loss": 3.9246, + "mean_token_accuracy": 0.3138104841113091, + "step": 17480 + }, + { + "epoch": 0.8446623041074058, + "grad_norm": 2.6871477807734956, + "learning_rate": 1.4340967750841006e-06, + "loss": 3.9051, + "mean_token_accuracy": 0.3205645173788071, + "step": 17490 + }, + { + "epoch": 0.8451452442469756, + "grad_norm": 2.6754167986754487, + "learning_rate": 1.4254099835604284e-06, + "loss": 4.0109, + "mean_token_accuracy": 0.3129032254219055, + "step": 17500 + }, + { + "epoch": 0.8451452442469756, + "eval_runtime": 8.0421, + "eval_samples_per_second": 367.319, + "eval_steps_per_second": 23.004, + "step": 17500 + }, + { + "epoch": 0.8456281843865453, + "grad_norm": 2.5573298098627895, + "learning_rate": 1.41674756196286e-06, + "loss": 3.8926, + "mean_token_accuracy": 0.3280241936445236, + "step": 17510 + }, + { + "epoch": 0.846111124526115, + "grad_norm": 2.443762118092534, + "learning_rate": 1.408109534910952e-06, + "loss": 3.923, + "mean_token_accuracy": 0.3142137095332146, + "step": 17520 + }, + { + "epoch": 0.8465940646656847, + "grad_norm": 2.512827526384991, + "learning_rate": 1.399495926954919e-06, + "loss": 3.9184, + "mean_token_accuracy": 0.3180443584918976, + "step": 17530 + }, + { + "epoch": 0.8470770048052544, + "grad_norm": 2.492337351741429, + "learning_rate": 1.3909067625755856e-06, + "loss": 3.8785, + "mean_token_accuracy": 0.32893145084381104, + "step": 17540 + }, + { + "epoch": 0.8475599449448241, + "grad_norm": 2.595506866876267, + "learning_rate": 1.382342066184299e-06, + "loss": 3.9031, + "mean_token_accuracy": 0.3213709704577923, + "step": 17550 + }, + { + "epoch": 0.8480428850843937, + "grad_norm": 2.6035833153749395, + "learning_rate": 1.3738018621228643e-06, + "loss": 3.9512, + "mean_token_accuracy": 0.31381049007177353, + "step": 17560 + }, + { + "epoch": 0.8485258252239635, + "grad_norm": 2.7177819832696555, + "learning_rate": 1.3652861746634817e-06, + "loss": 3.9652, + "mean_token_accuracy": 0.31159274131059644, + "step": 17570 + }, + { + "epoch": 0.8490087653635332, + "grad_norm": 2.474075091366069, + "learning_rate": 1.3567950280086673e-06, + "loss": 3.9066, + "mean_token_accuracy": 0.3237903222441673, + "step": 17580 + }, + { + "epoch": 0.8494917055031029, + "grad_norm": 2.5098231785926752, + "learning_rate": 1.348328446291195e-06, + "loss": 3.8488, + "mean_token_accuracy": 0.3262096747756004, + "step": 17590 + }, + { + "epoch": 0.8499746456426726, + "grad_norm": 2.478735544185196, + "learning_rate": 1.3398864535740164e-06, + "loss": 3.9203, + "mean_token_accuracy": 0.3163306474685669, + "step": 17600 + }, + { + "epoch": 0.8499746456426726, + "eval_runtime": 7.7963, + "eval_samples_per_second": 378.896, + "eval_steps_per_second": 23.729, + "step": 17600 + }, + { + "epoch": 0.8504575857822423, + "grad_norm": 2.411240439693436, + "learning_rate": 1.3314690738502024e-06, + "loss": 3.9797, + "mean_token_accuracy": 0.3172379031777382, + "step": 17610 + }, + { + "epoch": 0.850940525921812, + "grad_norm": 2.6229706648875752, + "learning_rate": 1.3230763310428695e-06, + "loss": 3.9004, + "mean_token_accuracy": 0.3230846717953682, + "step": 17620 + }, + { + "epoch": 0.8514234660613816, + "grad_norm": 2.5796257342013367, + "learning_rate": 1.3147082490051189e-06, + "loss": 3.868, + "mean_token_accuracy": 0.322076615691185, + "step": 17630 + }, + { + "epoch": 0.8519064062009514, + "grad_norm": 2.5354239984706286, + "learning_rate": 1.3063648515199522e-06, + "loss": 3.9684, + "mean_token_accuracy": 0.3152217730879784, + "step": 17640 + }, + { + "epoch": 0.8523893463405211, + "grad_norm": 2.716743495005877, + "learning_rate": 1.2980461623002226e-06, + "loss": 3.8937, + "mean_token_accuracy": 0.318951615691185, + "step": 17650 + }, + { + "epoch": 0.8528722864800908, + "grad_norm": 2.5504605263514093, + "learning_rate": 1.2897522049885603e-06, + "loss": 3.8477, + "mean_token_accuracy": 0.32671370804309846, + "step": 17660 + }, + { + "epoch": 0.8533552266196605, + "grad_norm": 2.629616945834561, + "learning_rate": 1.2814830031573023e-06, + "loss": 3.852, + "mean_token_accuracy": 0.3342741921544075, + "step": 17670 + }, + { + "epoch": 0.8538381667592302, + "grad_norm": 2.477653300619644, + "learning_rate": 1.2732385803084235e-06, + "loss": 3.9539, + "mean_token_accuracy": 0.31804435700178146, + "step": 17680 + }, + { + "epoch": 0.8543211068987999, + "grad_norm": 2.663303737764512, + "learning_rate": 1.265018959873483e-06, + "loss": 3.9129, + "mean_token_accuracy": 0.31874999701976775, + "step": 17690 + }, + { + "epoch": 0.8548040470383695, + "grad_norm": 2.42834151952541, + "learning_rate": 1.256824165213545e-06, + "loss": 3.8531, + "mean_token_accuracy": 0.32620967626571656, + "step": 17700 + }, + { + "epoch": 0.8548040470383695, + "eval_runtime": 7.8267, + "eval_samples_per_second": 377.427, + "eval_steps_per_second": 23.637, + "step": 17700 + }, + { + "epoch": 0.8552869871779393, + "grad_norm": 2.545214620236476, + "learning_rate": 1.2486542196191121e-06, + "loss": 3.9281, + "mean_token_accuracy": 0.31491935402154925, + "step": 17710 + }, + { + "epoch": 0.855769927317509, + "grad_norm": 2.6035612715879766, + "learning_rate": 1.2405091463100672e-06, + "loss": 3.902, + "mean_token_accuracy": 0.3232862874865532, + "step": 17720 + }, + { + "epoch": 0.8562528674570787, + "grad_norm": 2.659506168088208, + "learning_rate": 1.232388968435605e-06, + "loss": 3.8754, + "mean_token_accuracy": 0.3211693525314331, + "step": 17730 + }, + { + "epoch": 0.8567358075966484, + "grad_norm": 2.4774107184479877, + "learning_rate": 1.2242937090741624e-06, + "loss": 3.8656, + "mean_token_accuracy": 0.32137097120285035, + "step": 17740 + }, + { + "epoch": 0.8572187477362181, + "grad_norm": 2.520561799141872, + "learning_rate": 1.216223391233352e-06, + "loss": 3.8668, + "mean_token_accuracy": 0.3209677442908287, + "step": 17750 + }, + { + "epoch": 0.8577016878757878, + "grad_norm": 2.558043094812153, + "learning_rate": 1.2081780378499042e-06, + "loss": 3.9691, + "mean_token_accuracy": 0.3159274160861969, + "step": 17760 + }, + { + "epoch": 0.8581846280153574, + "grad_norm": 2.6220234175553068, + "learning_rate": 1.200157671789598e-06, + "loss": 3.8973, + "mean_token_accuracy": 0.31360886842012403, + "step": 17770 + }, + { + "epoch": 0.8586675681549272, + "grad_norm": 2.6051951492583125, + "learning_rate": 1.1921623158471951e-06, + "loss": 3.9566, + "mean_token_accuracy": 0.31481854766607287, + "step": 17780 + }, + { + "epoch": 0.8591505082944969, + "grad_norm": 2.6987052748218874, + "learning_rate": 1.184191992746372e-06, + "loss": 3.9297, + "mean_token_accuracy": 0.31542338579893114, + "step": 17790 + }, + { + "epoch": 0.8596334484340666, + "grad_norm": 2.478355542795563, + "learning_rate": 1.1762467251396614e-06, + "loss": 3.8891, + "mean_token_accuracy": 0.3191532239317894, + "step": 17800 + }, + { + "epoch": 0.8596334484340666, + "eval_runtime": 7.7942, + "eval_samples_per_second": 378.999, + "eval_steps_per_second": 23.736, + "step": 17800 + }, + { + "epoch": 0.8601163885736363, + "grad_norm": 2.5359644602852156, + "learning_rate": 1.1683265356083906e-06, + "loss": 3.9629, + "mean_token_accuracy": 0.31512096524238586, + "step": 17810 + }, + { + "epoch": 0.860599328713206, + "grad_norm": 2.3923709132791346, + "learning_rate": 1.1604314466626032e-06, + "loss": 3.9309, + "mean_token_accuracy": 0.3182459682226181, + "step": 17820 + }, + { + "epoch": 0.8610822688527757, + "grad_norm": 2.521639770013068, + "learning_rate": 1.152561480741009e-06, + "loss": 3.982, + "mean_token_accuracy": 0.3186491966247559, + "step": 17830 + }, + { + "epoch": 0.8615652089923453, + "grad_norm": 2.5079137372187725, + "learning_rate": 1.1447166602109162e-06, + "loss": 3.9332, + "mean_token_accuracy": 0.32096774131059647, + "step": 17840 + }, + { + "epoch": 0.8620481491319151, + "grad_norm": 2.3796392948960956, + "learning_rate": 1.1368970073681685e-06, + "loss": 3.9062, + "mean_token_accuracy": 0.3177419379353523, + "step": 17850 + }, + { + "epoch": 0.8625310892714848, + "grad_norm": 2.510990104182163, + "learning_rate": 1.129102544437074e-06, + "loss": 3.9914, + "mean_token_accuracy": 0.31209677308797834, + "step": 17860 + }, + { + "epoch": 0.8630140294110545, + "grad_norm": 2.4555247578595156, + "learning_rate": 1.1213332935703515e-06, + "loss": 3.9203, + "mean_token_accuracy": 0.31229838728904724, + "step": 17870 + }, + { + "epoch": 0.8634969695506242, + "grad_norm": 2.613419644383833, + "learning_rate": 1.1135892768490674e-06, + "loss": 3.8676, + "mean_token_accuracy": 0.3239919364452362, + "step": 17880 + }, + { + "epoch": 0.8639799096901939, + "grad_norm": 2.5738251401194954, + "learning_rate": 1.1058705162825677e-06, + "loss": 3.9164, + "mean_token_accuracy": 0.311794351041317, + "step": 17890 + }, + { + "epoch": 0.8644628498297636, + "grad_norm": 2.6197753140478475, + "learning_rate": 1.0981770338084152e-06, + "loss": 3.9641, + "mean_token_accuracy": 0.31804435402154924, + "step": 17900 + }, + { + "epoch": 0.8644628498297636, + "eval_runtime": 7.8205, + "eval_samples_per_second": 377.727, + "eval_steps_per_second": 23.656, + "step": 17900 + }, + { + "epoch": 0.8649457899693334, + "grad_norm": 2.5329537127157917, + "learning_rate": 1.0905088512923312e-06, + "loss": 3.9344, + "mean_token_accuracy": 0.32227822542190554, + "step": 17910 + }, + { + "epoch": 0.865428730108903, + "grad_norm": 2.5629187963373985, + "learning_rate": 1.0828659905281347e-06, + "loss": 3.8914, + "mean_token_accuracy": 0.3215725809335709, + "step": 17920 + }, + { + "epoch": 0.8659116702484727, + "grad_norm": 2.469573236383982, + "learning_rate": 1.0752484732376745e-06, + "loss": 3.8172, + "mean_token_accuracy": 0.3268145158886909, + "step": 17930 + }, + { + "epoch": 0.8663946103880424, + "grad_norm": 2.5116301701563466, + "learning_rate": 1.0676563210707658e-06, + "loss": 3.9629, + "mean_token_accuracy": 0.3157258048653603, + "step": 17940 + }, + { + "epoch": 0.8668775505276121, + "grad_norm": 2.5356796089718596, + "learning_rate": 1.0600895556051482e-06, + "loss": 3.9445, + "mean_token_accuracy": 0.3138104841113091, + "step": 17950 + }, + { + "epoch": 0.8673604906671818, + "grad_norm": 2.5825999856145465, + "learning_rate": 1.0525481983463937e-06, + "loss": 3.8207, + "mean_token_accuracy": 0.32570564895868304, + "step": 17960 + }, + { + "epoch": 0.8678434308067515, + "grad_norm": 2.6857045559811072, + "learning_rate": 1.045032270727866e-06, + "loss": 3.8469, + "mean_token_accuracy": 0.3246975809335709, + "step": 17970 + }, + { + "epoch": 0.8683263709463213, + "grad_norm": 2.497678004820524, + "learning_rate": 1.037541794110658e-06, + "loss": 3.9027, + "mean_token_accuracy": 0.3253024145960808, + "step": 17980 + }, + { + "epoch": 0.8688093110858909, + "grad_norm": 2.4785066488440477, + "learning_rate": 1.0300767897835284e-06, + "loss": 3.8891, + "mean_token_accuracy": 0.32933467626571655, + "step": 17990 + }, + { + "epoch": 0.8692922512254606, + "grad_norm": 2.466491452351288, + "learning_rate": 1.0226372789628392e-06, + "loss": 3.9184, + "mean_token_accuracy": 0.3203629061579704, + "step": 18000 + }, + { + "epoch": 0.8692922512254606, + "eval_runtime": 7.8099, + "eval_samples_per_second": 378.237, + "eval_steps_per_second": 23.688, + "step": 18000 + }, + { + "epoch": 0.8697751913650303, + "grad_norm": 2.6259082682652184, + "learning_rate": 1.0152232827924936e-06, + "loss": 4.0012, + "mean_token_accuracy": 0.3131048396229744, + "step": 18010 + }, + { + "epoch": 0.8702581315046, + "grad_norm": 2.782506770476591, + "learning_rate": 1.007834822343884e-06, + "loss": 3.9152, + "mean_token_accuracy": 0.3187499985098839, + "step": 18020 + }, + { + "epoch": 0.8707410716441697, + "grad_norm": 2.442090461501015, + "learning_rate": 1.0004719186158262e-06, + "loss": 3.9586, + "mean_token_accuracy": 0.3195564538240433, + "step": 18030 + }, + { + "epoch": 0.8712240117837394, + "grad_norm": 2.60213599038238, + "learning_rate": 9.931345925345038e-07, + "loss": 3.8836, + "mean_token_accuracy": 0.3230846792459488, + "step": 18040 + }, + { + "epoch": 0.8717069519233092, + "grad_norm": 2.6158348626524375, + "learning_rate": 9.858228649533975e-07, + "loss": 3.8551, + "mean_token_accuracy": 0.32207661420106887, + "step": 18050 + }, + { + "epoch": 0.8721898920628788, + "grad_norm": 2.6223838385893874, + "learning_rate": 9.785367566532434e-07, + "loss": 3.9312, + "mean_token_accuracy": 0.32258064597845076, + "step": 18060 + }, + { + "epoch": 0.8726728322024485, + "grad_norm": 2.379448928779309, + "learning_rate": 9.712762883419591e-07, + "loss": 3.8617, + "mean_token_accuracy": 0.32217741906642916, + "step": 18070 + }, + { + "epoch": 0.8731557723420182, + "grad_norm": 2.5482583997864547, + "learning_rate": 9.64041480654596e-07, + "loss": 3.968, + "mean_token_accuracy": 0.312197582423687, + "step": 18080 + }, + { + "epoch": 0.8736387124815879, + "grad_norm": 2.5431610039802774, + "learning_rate": 9.568323541532686e-07, + "loss": 3.9152, + "mean_token_accuracy": 0.3186491936445236, + "step": 18090 + }, + { + "epoch": 0.8741216526211576, + "grad_norm": 2.528358798814819, + "learning_rate": 9.49648929327105e-07, + "loss": 3.9676, + "mean_token_accuracy": 0.31340725868940356, + "step": 18100 + }, + { + "epoch": 0.8741216526211576, + "eval_runtime": 7.8035, + "eval_samples_per_second": 378.546, + "eval_steps_per_second": 23.707, + "step": 18100 + }, + { + "epoch": 0.8746045927607273, + "grad_norm": 2.5167682618470213, + "learning_rate": 9.424912265921915e-07, + "loss": 4.0215, + "mean_token_accuracy": 0.30443548411130905, + "step": 18110 + }, + { + "epoch": 0.8750875329002971, + "grad_norm": 2.5407010875098934, + "learning_rate": 9.353592662915e-07, + "loss": 3.9145, + "mean_token_accuracy": 0.32812499850988386, + "step": 18120 + }, + { + "epoch": 0.8755704730398667, + "grad_norm": 2.5827157374986025, + "learning_rate": 9.282530686948477e-07, + "loss": 3.9234, + "mean_token_accuracy": 0.31915322244167327, + "step": 18130 + }, + { + "epoch": 0.8760534131794364, + "grad_norm": 2.4776944374572167, + "learning_rate": 9.211726539988264e-07, + "loss": 3.8676, + "mean_token_accuracy": 0.3299395188689232, + "step": 18140 + }, + { + "epoch": 0.8765363533190061, + "grad_norm": 2.4300257563844583, + "learning_rate": 9.141180423267548e-07, + "loss": 3.8188, + "mean_token_accuracy": 0.33286290168762206, + "step": 18150 + }, + { + "epoch": 0.8770192934585758, + "grad_norm": 2.6459751674727943, + "learning_rate": 9.070892537286103e-07, + "loss": 3.9262, + "mean_token_accuracy": 0.31885080635547636, + "step": 18160 + }, + { + "epoch": 0.8775022335981455, + "grad_norm": 2.386959199458145, + "learning_rate": 9.000863081809841e-07, + "loss": 3.9355, + "mean_token_accuracy": 0.32268145233392714, + "step": 18170 + }, + { + "epoch": 0.8779851737377152, + "grad_norm": 2.518960737670266, + "learning_rate": 8.931092255870133e-07, + "loss": 3.8563, + "mean_token_accuracy": 0.32489919662475586, + "step": 18180 + }, + { + "epoch": 0.878468113877285, + "grad_norm": 2.556639602148005, + "learning_rate": 8.861580257763381e-07, + "loss": 3.9148, + "mean_token_accuracy": 0.3240927428007126, + "step": 18190 + }, + { + "epoch": 0.8789510540168546, + "grad_norm": 2.497428041319776, + "learning_rate": 8.792327285050229e-07, + "loss": 3.9277, + "mean_token_accuracy": 0.321068549156189, + "step": 18200 + }, + { + "epoch": 0.8789510540168546, + "eval_runtime": 7.8119, + "eval_samples_per_second": 378.141, + "eval_steps_per_second": 23.682, + "step": 18200 + }, + { + "epoch": 0.8794339941564243, + "grad_norm": 2.8076426527684197, + "learning_rate": 8.723333534555323e-07, + "loss": 3.8789, + "mean_token_accuracy": 0.3240927442908287, + "step": 18210 + }, + { + "epoch": 0.879916934295994, + "grad_norm": 2.5320265775086233, + "learning_rate": 8.654599202366431e-07, + "loss": 3.9105, + "mean_token_accuracy": 0.31925403103232386, + "step": 18220 + }, + { + "epoch": 0.8803998744355637, + "grad_norm": 2.436542761284518, + "learning_rate": 8.586124483834047e-07, + "loss": 3.8496, + "mean_token_accuracy": 0.32157258242368697, + "step": 18230 + }, + { + "epoch": 0.8808828145751334, + "grad_norm": 2.5187499498107475, + "learning_rate": 8.51790957357086e-07, + "loss": 3.9852, + "mean_token_accuracy": 0.3115927383303642, + "step": 18240 + }, + { + "epoch": 0.8813657547147031, + "grad_norm": 2.5069267484758146, + "learning_rate": 8.449954665451133e-07, + "loss": 3.9414, + "mean_token_accuracy": 0.3117943570017815, + "step": 18250 + }, + { + "epoch": 0.8818486948542729, + "grad_norm": 2.5843232539545644, + "learning_rate": 8.382259952610195e-07, + "loss": 3.9438, + "mean_token_accuracy": 0.3169354870915413, + "step": 18260 + }, + { + "epoch": 0.8823316349938425, + "grad_norm": 2.3856609815925465, + "learning_rate": 8.314825627443801e-07, + "loss": 3.8996, + "mean_token_accuracy": 0.3181451618671417, + "step": 18270 + }, + { + "epoch": 0.8828145751334122, + "grad_norm": 2.6416341033003, + "learning_rate": 8.247651881607755e-07, + "loss": 3.9152, + "mean_token_accuracy": 0.3177419364452362, + "step": 18280 + }, + { + "epoch": 0.883297515272982, + "grad_norm": 2.6640006965890657, + "learning_rate": 8.180738906017182e-07, + "loss": 3.8773, + "mean_token_accuracy": 0.3157258078455925, + "step": 18290 + }, + { + "epoch": 0.8837804554125516, + "grad_norm": 2.5647265541526534, + "learning_rate": 8.114086890846151e-07, + "loss": 3.9621, + "mean_token_accuracy": 0.31784274280071256, + "step": 18300 + }, + { + "epoch": 0.8837804554125516, + "eval_runtime": 7.8238, + "eval_samples_per_second": 377.565, + "eval_steps_per_second": 23.646, + "step": 18300 + }, + { + "epoch": 0.8842633955521213, + "grad_norm": 2.4336383373323858, + "learning_rate": 8.04769602552693e-07, + "loss": 3.891, + "mean_token_accuracy": 0.3234879031777382, + "step": 18310 + }, + { + "epoch": 0.884746335691691, + "grad_norm": 2.5498922039186795, + "learning_rate": 7.981566498749737e-07, + "loss": 3.9012, + "mean_token_accuracy": 0.32258064299821854, + "step": 18320 + }, + { + "epoch": 0.8852292758312608, + "grad_norm": 2.6650794974443217, + "learning_rate": 7.915698498461877e-07, + "loss": 3.8953, + "mean_token_accuracy": 0.3231854811310768, + "step": 18330 + }, + { + "epoch": 0.8857122159708304, + "grad_norm": 2.5875047933394644, + "learning_rate": 7.850092211867477e-07, + "loss": 3.9746, + "mean_token_accuracy": 0.3092741951346397, + "step": 18340 + }, + { + "epoch": 0.8861951561104001, + "grad_norm": 2.6501223033817336, + "learning_rate": 7.784747825426764e-07, + "loss": 3.916, + "mean_token_accuracy": 0.32167338877916335, + "step": 18350 + }, + { + "epoch": 0.8866780962499698, + "grad_norm": 2.4253460384541614, + "learning_rate": 7.71966552485569e-07, + "loss": 3.9891, + "mean_token_accuracy": 0.3151209682226181, + "step": 18360 + }, + { + "epoch": 0.8871610363895395, + "grad_norm": 2.7215885550299, + "learning_rate": 7.654845495125318e-07, + "loss": 3.9012, + "mean_token_accuracy": 0.3282258078455925, + "step": 18370 + }, + { + "epoch": 0.8876439765291092, + "grad_norm": 2.540776778450628, + "learning_rate": 7.590287920461225e-07, + "loss": 3.9289, + "mean_token_accuracy": 0.318951615691185, + "step": 18380 + }, + { + "epoch": 0.8881269166686789, + "grad_norm": 2.506801425692361, + "learning_rate": 7.525992984343178e-07, + "loss": 3.957, + "mean_token_accuracy": 0.3157258048653603, + "step": 18390 + }, + { + "epoch": 0.8886098568082487, + "grad_norm": 2.514043619789244, + "learning_rate": 7.461960869504414e-07, + "loss": 3.9254, + "mean_token_accuracy": 0.3174395188689232, + "step": 18400 + }, + { + "epoch": 0.8886098568082487, + "eval_runtime": 7.7965, + "eval_samples_per_second": 378.888, + "eval_steps_per_second": 23.729, + "step": 18400 + }, + { + "epoch": 0.8890927969478183, + "grad_norm": 2.449564853084911, + "learning_rate": 7.398191757931262e-07, + "loss": 3.9117, + "mean_token_accuracy": 0.325, + "step": 18410 + }, + { + "epoch": 0.889575737087388, + "grad_norm": 2.372405229356179, + "learning_rate": 7.334685830862509e-07, + "loss": 3.9176, + "mean_token_accuracy": 0.3192540302872658, + "step": 18420 + }, + { + "epoch": 0.8900586772269578, + "grad_norm": 2.5881527126395363, + "learning_rate": 7.271443268788981e-07, + "loss": 3.9031, + "mean_token_accuracy": 0.323991933465004, + "step": 18430 + }, + { + "epoch": 0.8905416173665274, + "grad_norm": 2.5115084852944047, + "learning_rate": 7.208464251452984e-07, + "loss": 3.9301, + "mean_token_accuracy": 0.31885080933570864, + "step": 18440 + }, + { + "epoch": 0.8910245575060971, + "grad_norm": 2.645389897017754, + "learning_rate": 7.145748957847809e-07, + "loss": 3.9539, + "mean_token_accuracy": 0.3170362874865532, + "step": 18450 + }, + { + "epoch": 0.8915074976456668, + "grad_norm": 2.5105823782977335, + "learning_rate": 7.083297566217163e-07, + "loss": 3.918, + "mean_token_accuracy": 0.3204637080430984, + "step": 18460 + }, + { + "epoch": 0.8919904377852366, + "grad_norm": 2.762280756865794, + "learning_rate": 7.02111025405482e-07, + "loss": 3.9438, + "mean_token_accuracy": 0.31895161271095274, + "step": 18470 + }, + { + "epoch": 0.8924733779248062, + "grad_norm": 2.6178122291116552, + "learning_rate": 6.959187198103901e-07, + "loss": 3.8445, + "mean_token_accuracy": 0.32600805908441544, + "step": 18480 + }, + { + "epoch": 0.8929563180643759, + "grad_norm": 2.675038901268123, + "learning_rate": 6.897528574356549e-07, + "loss": 3.9227, + "mean_token_accuracy": 0.3201612919569016, + "step": 18490 + }, + { + "epoch": 0.8934392582039457, + "grad_norm": 2.424588133790083, + "learning_rate": 6.836134558053331e-07, + "loss": 3.9367, + "mean_token_accuracy": 0.31300402879714967, + "step": 18500 + }, + { + "epoch": 0.8934392582039457, + "eval_runtime": 7.8421, + "eval_samples_per_second": 376.684, + "eval_steps_per_second": 23.591, + "step": 18500 + }, + { + "epoch": 0.8939221983435153, + "grad_norm": 2.5838555063615942, + "learning_rate": 6.775005323682782e-07, + "loss": 3.9551, + "mean_token_accuracy": 0.31431451588869097, + "step": 18510 + }, + { + "epoch": 0.894405138483085, + "grad_norm": 2.4958291171493356, + "learning_rate": 6.714141044980915e-07, + "loss": 3.8871, + "mean_token_accuracy": 0.32177419364452364, + "step": 18520 + }, + { + "epoch": 0.8948880786226547, + "grad_norm": 2.472250140068182, + "learning_rate": 6.65354189493066e-07, + "loss": 3.9043, + "mean_token_accuracy": 0.3266129046678543, + "step": 18530 + }, + { + "epoch": 0.8953710187622245, + "grad_norm": 2.563549558316242, + "learning_rate": 6.593208045761468e-07, + "loss": 3.8895, + "mean_token_accuracy": 0.3219758033752441, + "step": 18540 + }, + { + "epoch": 0.8958539589017941, + "grad_norm": 2.563429011031045, + "learning_rate": 6.533139668948762e-07, + "loss": 3.9164, + "mean_token_accuracy": 0.31854838877916336, + "step": 18550 + }, + { + "epoch": 0.8963368990413638, + "grad_norm": 2.5335162983722888, + "learning_rate": 6.473336935213481e-07, + "loss": 3.8594, + "mean_token_accuracy": 0.3230846792459488, + "step": 18560 + }, + { + "epoch": 0.8968198391809336, + "grad_norm": 2.4202524603323643, + "learning_rate": 6.413800014521521e-07, + "loss": 3.8156, + "mean_token_accuracy": 0.32479838877916334, + "step": 18570 + }, + { + "epoch": 0.8973027793205032, + "grad_norm": 2.500669415268221, + "learning_rate": 6.354529076083383e-07, + "loss": 3.9023, + "mean_token_accuracy": 0.31572580635547637, + "step": 18580 + }, + { + "epoch": 0.8977857194600729, + "grad_norm": 2.689120553609442, + "learning_rate": 6.295524288353561e-07, + "loss": 3.8383, + "mean_token_accuracy": 0.33094758093357085, + "step": 18590 + }, + { + "epoch": 0.8982686595996426, + "grad_norm": 2.515004864037933, + "learning_rate": 6.236785819030155e-07, + "loss": 3.9242, + "mean_token_accuracy": 0.32116935700178145, + "step": 18600 + }, + { + "epoch": 0.8982686595996426, + "eval_runtime": 7.8046, + "eval_samples_per_second": 378.497, + "eval_steps_per_second": 23.704, + "step": 18600 + }, + { + "epoch": 0.8987515997392124, + "grad_norm": 2.436796793368009, + "learning_rate": 6.178313835054295e-07, + "loss": 3.9789, + "mean_token_accuracy": 0.3178427442908287, + "step": 18610 + }, + { + "epoch": 0.899234539878782, + "grad_norm": 2.741369435917347, + "learning_rate": 6.120108502609845e-07, + "loss": 3.8687, + "mean_token_accuracy": 0.32439515739679337, + "step": 18620 + }, + { + "epoch": 0.8997174800183517, + "grad_norm": 2.5219506860062633, + "learning_rate": 6.062169987122724e-07, + "loss": 3.8992, + "mean_token_accuracy": 0.3222782269120216, + "step": 18630 + }, + { + "epoch": 0.9002004201579215, + "grad_norm": 2.7278126051330323, + "learning_rate": 6.004498453260532e-07, + "loss": 3.8949, + "mean_token_accuracy": 0.3157258078455925, + "step": 18640 + }, + { + "epoch": 0.9006833602974911, + "grad_norm": 2.6415796007207444, + "learning_rate": 5.947094064932113e-07, + "loss": 3.9766, + "mean_token_accuracy": 0.3200604811310768, + "step": 18650 + }, + { + "epoch": 0.9011663004370608, + "grad_norm": 2.4746303635471545, + "learning_rate": 5.889956985287049e-07, + "loss": 3.8977, + "mean_token_accuracy": 0.3229838728904724, + "step": 18660 + }, + { + "epoch": 0.9016492405766305, + "grad_norm": 2.7078937827890077, + "learning_rate": 5.833087376715185e-07, + "loss": 3.891, + "mean_token_accuracy": 0.3146169394254684, + "step": 18670 + }, + { + "epoch": 0.9021321807162003, + "grad_norm": 2.381055839751177, + "learning_rate": 5.776485400846177e-07, + "loss": 3.9328, + "mean_token_accuracy": 0.3219758063554764, + "step": 18680 + }, + { + "epoch": 0.9026151208557699, + "grad_norm": 2.444002346736162, + "learning_rate": 5.720151218549097e-07, + "loss": 3.8453, + "mean_token_accuracy": 0.3275201618671417, + "step": 18690 + }, + { + "epoch": 0.9030980609953396, + "grad_norm": 2.72492144695871, + "learning_rate": 5.664084989931829e-07, + "loss": 3.9266, + "mean_token_accuracy": 0.3153225839138031, + "step": 18700 + }, + { + "epoch": 0.9030980609953396, + "eval_runtime": 7.8284, + "eval_samples_per_second": 377.346, + "eval_steps_per_second": 23.632, + "step": 18700 + }, + { + "epoch": 0.9035810011349094, + "grad_norm": 2.6411950085154, + "learning_rate": 5.608286874340774e-07, + "loss": 3.9137, + "mean_token_accuracy": 0.3228830635547638, + "step": 18710 + }, + { + "epoch": 0.904063941274479, + "grad_norm": 2.611648832232795, + "learning_rate": 5.552757030360279e-07, + "loss": 3.9133, + "mean_token_accuracy": 0.3105846792459488, + "step": 18720 + }, + { + "epoch": 0.9045468814140487, + "grad_norm": 2.5337979197329634, + "learning_rate": 5.497495615812298e-07, + "loss": 3.8461, + "mean_token_accuracy": 0.3324596777558327, + "step": 18730 + }, + { + "epoch": 0.9050298215536184, + "grad_norm": 2.4826600829370196, + "learning_rate": 5.442502787755788e-07, + "loss": 3.9656, + "mean_token_accuracy": 0.3231854856014252, + "step": 18740 + }, + { + "epoch": 0.9055127616931882, + "grad_norm": 2.5017046847777564, + "learning_rate": 5.387778702486457e-07, + "loss": 3.9, + "mean_token_accuracy": 0.3176411300897598, + "step": 18750 + }, + { + "epoch": 0.9059957018327578, + "grad_norm": 2.4882045700749456, + "learning_rate": 5.333323515536104e-07, + "loss": 3.9445, + "mean_token_accuracy": 0.316129033267498, + "step": 18760 + }, + { + "epoch": 0.9064786419723275, + "grad_norm": 2.549440181267693, + "learning_rate": 5.279137381672395e-07, + "loss": 3.927, + "mean_token_accuracy": 0.31360886842012403, + "step": 18770 + }, + { + "epoch": 0.9069615821118973, + "grad_norm": 2.5172067210049796, + "learning_rate": 5.22522045489825e-07, + "loss": 3.8648, + "mean_token_accuracy": 0.3209677428007126, + "step": 18780 + }, + { + "epoch": 0.9074445222514669, + "grad_norm": 2.7296527838127735, + "learning_rate": 5.171572888451482e-07, + "loss": 3.8293, + "mean_token_accuracy": 0.32973790615797044, + "step": 18790 + }, + { + "epoch": 0.9079274623910366, + "grad_norm": 2.4374331740842092, + "learning_rate": 5.118194834804391e-07, + "loss": 3.9012, + "mean_token_accuracy": 0.32137097269296644, + "step": 18800 + }, + { + "epoch": 0.9079274623910366, + "eval_runtime": 7.7904, + "eval_samples_per_second": 379.186, + "eval_steps_per_second": 23.747, + "step": 18800 + }, + { + "epoch": 0.9084104025306063, + "grad_norm": 2.4773505001982508, + "learning_rate": 5.065086445663248e-07, + "loss": 3.8863, + "mean_token_accuracy": 0.32620967626571656, + "step": 18810 + }, + { + "epoch": 0.9088933426701761, + "grad_norm": 2.4670580941703752, + "learning_rate": 5.012247871967945e-07, + "loss": 3.85, + "mean_token_accuracy": 0.324798384308815, + "step": 18820 + }, + { + "epoch": 0.9093762828097457, + "grad_norm": 2.684457024602209, + "learning_rate": 4.959679263891471e-07, + "loss": 3.9586, + "mean_token_accuracy": 0.3166330635547638, + "step": 18830 + }, + { + "epoch": 0.9098592229493154, + "grad_norm": 2.488340701069753, + "learning_rate": 4.90738077083962e-07, + "loss": 3.9383, + "mean_token_accuracy": 0.318245966732502, + "step": 18840 + }, + { + "epoch": 0.9103421630888852, + "grad_norm": 2.5644966863299303, + "learning_rate": 4.85535254145042e-07, + "loss": 3.9203, + "mean_token_accuracy": 0.31169354617595674, + "step": 18850 + }, + { + "epoch": 0.9108251032284548, + "grad_norm": 2.5045931433189375, + "learning_rate": 4.80359472359384e-07, + "loss": 3.9109, + "mean_token_accuracy": 0.3205645173788071, + "step": 18860 + }, + { + "epoch": 0.9113080433680245, + "grad_norm": 2.690638461618957, + "learning_rate": 4.7521074643712473e-07, + "loss": 3.8918, + "mean_token_accuracy": 0.3225806415081024, + "step": 18870 + }, + { + "epoch": 0.9117909835075942, + "grad_norm": 2.587146130473699, + "learning_rate": 4.700890910115119e-07, + "loss": 3.9676, + "mean_token_accuracy": 0.31552419513463975, + "step": 18880 + }, + { + "epoch": 0.912273923647164, + "grad_norm": 2.485785065387238, + "learning_rate": 4.6499452063885064e-07, + "loss": 3.9277, + "mean_token_accuracy": 0.3143145129084587, + "step": 18890 + }, + { + "epoch": 0.9127568637867336, + "grad_norm": 2.6035654302146107, + "learning_rate": 4.599270497984676e-07, + "loss": 3.8852, + "mean_token_accuracy": 0.32338709831237794, + "step": 18900 + }, + { + "epoch": 0.9127568637867336, + "eval_runtime": 7.8079, + "eval_samples_per_second": 378.337, + "eval_steps_per_second": 23.694, + "step": 18900 + }, + { + "epoch": 0.9132398039263033, + "grad_norm": 2.6154486306538867, + "learning_rate": 4.548866928926732e-07, + "loss": 3.8309, + "mean_token_accuracy": 0.3325604870915413, + "step": 18910 + }, + { + "epoch": 0.9137227440658731, + "grad_norm": 2.6372359708400093, + "learning_rate": 4.498734642467151e-07, + "loss": 3.9453, + "mean_token_accuracy": 0.32308468222618103, + "step": 18920 + }, + { + "epoch": 0.9142056842054427, + "grad_norm": 2.7690705503226543, + "learning_rate": 4.4488737810874037e-07, + "loss": 3.893, + "mean_token_accuracy": 0.321370966732502, + "step": 18930 + }, + { + "epoch": 0.9146886243450124, + "grad_norm": 2.5262319425991637, + "learning_rate": 4.3992844864974905e-07, + "loss": 3.8953, + "mean_token_accuracy": 0.32600806653499603, + "step": 18940 + }, + { + "epoch": 0.9151715644845821, + "grad_norm": 2.537055947158476, + "learning_rate": 4.3499668996356824e-07, + "loss": 3.8543, + "mean_token_accuracy": 0.32278225719928744, + "step": 18950 + }, + { + "epoch": 0.9156545046241519, + "grad_norm": 2.7171126783994657, + "learning_rate": 4.300921160667937e-07, + "loss": 3.9355, + "mean_token_accuracy": 0.3167338714003563, + "step": 18960 + }, + { + "epoch": 0.9161374447637215, + "grad_norm": 2.602038936275718, + "learning_rate": 4.2521474089876614e-07, + "loss": 3.8523, + "mean_token_accuracy": 0.3256048381328583, + "step": 18970 + }, + { + "epoch": 0.9166203849032912, + "grad_norm": 2.619720114968416, + "learning_rate": 4.20364578321516e-07, + "loss": 3.8934, + "mean_token_accuracy": 0.3287298396229744, + "step": 18980 + }, + { + "epoch": 0.917103325042861, + "grad_norm": 2.5304704087644145, + "learning_rate": 4.1554164211974447e-07, + "loss": 3.9309, + "mean_token_accuracy": 0.32721774131059644, + "step": 18990 + }, + { + "epoch": 0.9175862651824306, + "grad_norm": 2.698019022324581, + "learning_rate": 4.107459460007601e-07, + "loss": 3.8473, + "mean_token_accuracy": 0.3297379031777382, + "step": 19000 + }, + { + "epoch": 0.9175862651824306, + "eval_runtime": 7.8214, + "eval_samples_per_second": 377.683, + "eval_steps_per_second": 23.653, + "step": 19000 + }, + { + "epoch": 0.9180692053220003, + "grad_norm": 2.8081133944101726, + "learning_rate": 4.059775035944613e-07, + "loss": 3.9379, + "mean_token_accuracy": 0.3162298336625099, + "step": 19010 + }, + { + "epoch": 0.91855214546157, + "grad_norm": 2.505467578492602, + "learning_rate": 4.0123632845328167e-07, + "loss": 3.8598, + "mean_token_accuracy": 0.3239919364452362, + "step": 19020 + }, + { + "epoch": 0.9190350856011398, + "grad_norm": 2.5358846827134105, + "learning_rate": 3.965224340521645e-07, + "loss": 3.8695, + "mean_token_accuracy": 0.32641129195690155, + "step": 19030 + }, + { + "epoch": 0.9195180257407094, + "grad_norm": 2.5024701792955395, + "learning_rate": 3.918358337885153e-07, + "loss": 3.8687, + "mean_token_accuracy": 0.32268145233392714, + "step": 19040 + }, + { + "epoch": 0.9200009658802791, + "grad_norm": 2.5629357644973654, + "learning_rate": 3.871765409821615e-07, + "loss": 3.8844, + "mean_token_accuracy": 0.32227822840213777, + "step": 19050 + }, + { + "epoch": 0.9204839060198489, + "grad_norm": 2.6066892342153416, + "learning_rate": 3.8254456887533156e-07, + "loss": 3.8828, + "mean_token_accuracy": 0.3198588714003563, + "step": 19060 + }, + { + "epoch": 0.9209668461594186, + "grad_norm": 2.488094364056554, + "learning_rate": 3.779399306325937e-07, + "loss": 3.9316, + "mean_token_accuracy": 0.31784273833036425, + "step": 19070 + }, + { + "epoch": 0.9214497862989882, + "grad_norm": 2.7034090356412888, + "learning_rate": 3.7336263934083737e-07, + "loss": 3.8559, + "mean_token_accuracy": 0.3318548396229744, + "step": 19080 + }, + { + "epoch": 0.921932726438558, + "grad_norm": 2.62611611703325, + "learning_rate": 3.688127080092252e-07, + "loss": 3.8559, + "mean_token_accuracy": 0.3225806444883347, + "step": 19090 + }, + { + "epoch": 0.9224156665781277, + "grad_norm": 2.5928103916030656, + "learning_rate": 3.642901495691642e-07, + "loss": 3.8402, + "mean_token_accuracy": 0.3228830650448799, + "step": 19100 + }, + { + "epoch": 0.9224156665781277, + "eval_runtime": 7.8233, + "eval_samples_per_second": 377.588, + "eval_steps_per_second": 23.647, + "step": 19100 + }, + { + "epoch": 0.9228986067176973, + "grad_norm": 2.601053724006797, + "learning_rate": 3.5979497687426036e-07, + "loss": 3.9699, + "mean_token_accuracy": 0.3127016142010689, + "step": 19110 + }, + { + "epoch": 0.923381546857267, + "grad_norm": 2.6358294746115805, + "learning_rate": 3.553272027002885e-07, + "loss": 3.9188, + "mean_token_accuracy": 0.3264112904667854, + "step": 19120 + }, + { + "epoch": 0.9238644869968368, + "grad_norm": 2.770921342024172, + "learning_rate": 3.5088683974515146e-07, + "loss": 3.9043, + "mean_token_accuracy": 0.32701613157987597, + "step": 19130 + }, + { + "epoch": 0.9243474271364065, + "grad_norm": 2.614000634361156, + "learning_rate": 3.464739006288509e-07, + "loss": 3.934, + "mean_token_accuracy": 0.31250000149011614, + "step": 19140 + }, + { + "epoch": 0.9248303672759761, + "grad_norm": 2.4745174672234787, + "learning_rate": 3.4208839789344196e-07, + "loss": 3.9582, + "mean_token_accuracy": 0.31965726166963576, + "step": 19150 + }, + { + "epoch": 0.9253133074155458, + "grad_norm": 2.610724419573157, + "learning_rate": 3.377303440030066e-07, + "loss": 3.9254, + "mean_token_accuracy": 0.3127016112208366, + "step": 19160 + }, + { + "epoch": 0.9257962475551156, + "grad_norm": 2.619733190776688, + "learning_rate": 3.3339975134361157e-07, + "loss": 3.8824, + "mean_token_accuracy": 0.3195564478635788, + "step": 19170 + }, + { + "epoch": 0.9262791876946852, + "grad_norm": 2.783267576792982, + "learning_rate": 3.2909663222327583e-07, + "loss": 3.9414, + "mean_token_accuracy": 0.32368951886892317, + "step": 19180 + }, + { + "epoch": 0.9267621278342549, + "grad_norm": 2.493079950658083, + "learning_rate": 3.248209988719386e-07, + "loss": 3.8648, + "mean_token_accuracy": 0.3292338714003563, + "step": 19190 + }, + { + "epoch": 0.9272450679738247, + "grad_norm": 2.7837353515841983, + "learning_rate": 3.2057286344141515e-07, + "loss": 3.923, + "mean_token_accuracy": 0.31411290615797044, + "step": 19200 + }, + { + "epoch": 0.9272450679738247, + "eval_runtime": 7.828, + "eval_samples_per_second": 377.366, + "eval_steps_per_second": 23.633, + "step": 19200 + }, + { + "epoch": 0.9277280081133944, + "grad_norm": 2.522543368816119, + "learning_rate": 3.163522380053785e-07, + "loss": 3.9141, + "mean_token_accuracy": 0.3188508078455925, + "step": 19210 + }, + { + "epoch": 0.928210948252964, + "grad_norm": 2.4282524879481957, + "learning_rate": 3.1215913455930337e-07, + "loss": 3.8691, + "mean_token_accuracy": 0.32217742055654525, + "step": 19220 + }, + { + "epoch": 0.9286938883925338, + "grad_norm": 2.709508618019017, + "learning_rate": 3.0799356502045464e-07, + "loss": 3.9027, + "mean_token_accuracy": 0.3201612919569016, + "step": 19230 + }, + { + "epoch": 0.9291768285321035, + "grad_norm": 2.543570879721227, + "learning_rate": 3.0385554122783545e-07, + "loss": 3.9539, + "mean_token_accuracy": 0.32227822542190554, + "step": 19240 + }, + { + "epoch": 0.9296597686716731, + "grad_norm": 2.62279741090973, + "learning_rate": 2.9974507494216596e-07, + "loss": 3.952, + "mean_token_accuracy": 0.31522177457809447, + "step": 19250 + }, + { + "epoch": 0.9301427088112428, + "grad_norm": 2.528492635895777, + "learning_rate": 2.9566217784584016e-07, + "loss": 4.0156, + "mean_token_accuracy": 0.3119959682226181, + "step": 19260 + }, + { + "epoch": 0.9306256489508126, + "grad_norm": 2.9014657624963727, + "learning_rate": 2.916068615429013e-07, + "loss": 3.9234, + "mean_token_accuracy": 0.31633064448833464, + "step": 19270 + }, + { + "epoch": 0.9311085890903823, + "grad_norm": 2.403591101207481, + "learning_rate": 2.87579137559002e-07, + "loss": 3.9937, + "mean_token_accuracy": 0.31340725868940356, + "step": 19280 + }, + { + "epoch": 0.9315915292299519, + "grad_norm": 2.6196081554643693, + "learning_rate": 2.835790173413788e-07, + "loss": 3.925, + "mean_token_accuracy": 0.3213709697127342, + "step": 19290 + }, + { + "epoch": 0.9320744693695217, + "grad_norm": 2.724996916497667, + "learning_rate": 2.7960651225881097e-07, + "loss": 3.9227, + "mean_token_accuracy": 0.3177419379353523, + "step": 19300 + }, + { + "epoch": 0.9320744693695217, + "eval_runtime": 7.8211, + "eval_samples_per_second": 377.697, + "eval_steps_per_second": 23.654, + "step": 19300 + }, + { + "epoch": 0.9325574095090914, + "grad_norm": 2.659378795088702, + "learning_rate": 2.756616336015916e-07, + "loss": 3.9969, + "mean_token_accuracy": 0.312197582423687, + "step": 19310 + }, + { + "epoch": 0.933040349648661, + "grad_norm": 2.7999484834304678, + "learning_rate": 2.7174439258150444e-07, + "loss": 3.9086, + "mean_token_accuracy": 0.323387099802494, + "step": 19320 + }, + { + "epoch": 0.9335232897882307, + "grad_norm": 3.156615195210052, + "learning_rate": 2.678548003317727e-07, + "loss": 3.8504, + "mean_token_accuracy": 0.3265120968222618, + "step": 19330 + }, + { + "epoch": 0.9340062299278005, + "grad_norm": 2.5308370104062194, + "learning_rate": 2.6399286790704803e-07, + "loss": 3.9227, + "mean_token_accuracy": 0.3246975809335709, + "step": 19340 + }, + { + "epoch": 0.9344891700673702, + "grad_norm": 2.4821951746772486, + "learning_rate": 2.6015860628336386e-07, + "loss": 3.8996, + "mean_token_accuracy": 0.3173387095332146, + "step": 19350 + }, + { + "epoch": 0.9349721102069398, + "grad_norm": 2.5257136886558467, + "learning_rate": 2.563520263581165e-07, + "loss": 3.907, + "mean_token_accuracy": 0.31915322691202164, + "step": 19360 + }, + { + "epoch": 0.9354550503465096, + "grad_norm": 2.672617688757303, + "learning_rate": 2.5257313895001965e-07, + "loss": 3.9016, + "mean_token_accuracy": 0.3269153222441673, + "step": 19370 + }, + { + "epoch": 0.9359379904860793, + "grad_norm": 2.561879482730617, + "learning_rate": 2.488219547990889e-07, + "loss": 3.9656, + "mean_token_accuracy": 0.3177419349551201, + "step": 19380 + }, + { + "epoch": 0.9364209306256489, + "grad_norm": 2.6147173633418728, + "learning_rate": 2.4509848456659934e-07, + "loss": 3.877, + "mean_token_accuracy": 0.32116935700178145, + "step": 19390 + }, + { + "epoch": 0.9369038707652186, + "grad_norm": 2.8378090427501927, + "learning_rate": 2.414027388350648e-07, + "loss": 3.9926, + "mean_token_accuracy": 0.31532258093357085, + "step": 19400 + }, + { + "epoch": 0.9369038707652186, + "eval_runtime": 7.7969, + "eval_samples_per_second": 378.869, + "eval_steps_per_second": 23.727, + "step": 19400 + }, + { + "epoch": 0.9373868109047884, + "grad_norm": 2.5140470070157894, + "learning_rate": 2.3773472810819874e-07, + "loss": 3.9332, + "mean_token_accuracy": 0.31502016335725785, + "step": 19410 + }, + { + "epoch": 0.9378697510443581, + "grad_norm": 2.593453842238223, + "learning_rate": 2.3409446281088988e-07, + "loss": 3.9145, + "mean_token_accuracy": 0.3166330650448799, + "step": 19420 + }, + { + "epoch": 0.9383526911839277, + "grad_norm": 2.461249695321873, + "learning_rate": 2.3048195328917223e-07, + "loss": 3.8922, + "mean_token_accuracy": 0.31441532373428344, + "step": 19430 + }, + { + "epoch": 0.9388356313234975, + "grad_norm": 2.4725622446835183, + "learning_rate": 2.2689720981019513e-07, + "loss": 3.9277, + "mean_token_accuracy": 0.3158266097307205, + "step": 19440 + }, + { + "epoch": 0.9393185714630672, + "grad_norm": 2.574068805859843, + "learning_rate": 2.2334024256219333e-07, + "loss": 3.816, + "mean_token_accuracy": 0.32711693793535235, + "step": 19450 + }, + { + "epoch": 0.9398015116026368, + "grad_norm": 2.560525805446135, + "learning_rate": 2.1981106165445465e-07, + "loss": 3.9574, + "mean_token_accuracy": 0.32116935551166537, + "step": 19460 + }, + { + "epoch": 0.9402844517422065, + "grad_norm": 2.5105187541810414, + "learning_rate": 2.1630967711730345e-07, + "loss": 3.934, + "mean_token_accuracy": 0.31633064448833464, + "step": 19470 + }, + { + "epoch": 0.9407673918817763, + "grad_norm": 2.4622664871562265, + "learning_rate": 2.1283609890205615e-07, + "loss": 3.8578, + "mean_token_accuracy": 0.3212701603770256, + "step": 19480 + }, + { + "epoch": 0.941250332021346, + "grad_norm": 2.5721867320714416, + "learning_rate": 2.0939033688100574e-07, + "loss": 3.9355, + "mean_token_accuracy": 0.3192540302872658, + "step": 19490 + }, + { + "epoch": 0.9417332721609156, + "grad_norm": 2.564343503918378, + "learning_rate": 2.059724008473818e-07, + "loss": 3.9234, + "mean_token_accuracy": 0.31411290019750593, + "step": 19500 + }, + { + "epoch": 0.9417332721609156, + "eval_runtime": 7.7798, + "eval_samples_per_second": 379.704, + "eval_steps_per_second": 23.78, + "step": 19500 + }, + { + "epoch": 0.9422162123004854, + "grad_norm": 2.6502418226088613, + "learning_rate": 2.0258230051533822e-07, + "loss": 3.9457, + "mean_token_accuracy": 0.31471773982048035, + "step": 19510 + }, + { + "epoch": 0.9426991524400551, + "grad_norm": 2.5163455055308597, + "learning_rate": 1.9922004551990891e-07, + "loss": 3.9641, + "mean_token_accuracy": 0.31905242055654526, + "step": 19520 + }, + { + "epoch": 0.9431820925796247, + "grad_norm": 2.4206413048422912, + "learning_rate": 1.958856454169944e-07, + "loss": 3.9332, + "mean_token_accuracy": 0.32137096524238584, + "step": 19530 + }, + { + "epoch": 0.9436650327191944, + "grad_norm": 2.6683290778560336, + "learning_rate": 1.9257910968332405e-07, + "loss": 3.9551, + "mean_token_accuracy": 0.31995967626571653, + "step": 19540 + }, + { + "epoch": 0.9441479728587642, + "grad_norm": 2.462647437956805, + "learning_rate": 1.893004477164373e-07, + "loss": 3.8719, + "mean_token_accuracy": 0.33014112561941145, + "step": 19550 + }, + { + "epoch": 0.9446309129983339, + "grad_norm": 2.5511477711346804, + "learning_rate": 1.8604966883464804e-07, + "loss": 3.8703, + "mean_token_accuracy": 0.31743951588869096, + "step": 19560 + }, + { + "epoch": 0.9451138531379035, + "grad_norm": 2.5828642009651, + "learning_rate": 1.828267822770302e-07, + "loss": 3.968, + "mean_token_accuracy": 0.31864919066429137, + "step": 19570 + }, + { + "epoch": 0.9455967932774733, + "grad_norm": 2.5446709674868075, + "learning_rate": 1.7963179720338008e-07, + "loss": 3.9684, + "mean_token_accuracy": 0.3165322571992874, + "step": 19580 + }, + { + "epoch": 0.946079733417043, + "grad_norm": 2.6205788486703927, + "learning_rate": 1.7646472269419401e-07, + "loss": 3.9262, + "mean_token_accuracy": 0.31784273982048034, + "step": 19590 + }, + { + "epoch": 0.9465626735566126, + "grad_norm": 2.701050907610122, + "learning_rate": 1.7332556775064845e-07, + "loss": 3.8879, + "mean_token_accuracy": 0.32268145233392714, + "step": 19600 + }, + { + "epoch": 0.9465626735566126, + "eval_runtime": 7.7806, + "eval_samples_per_second": 379.665, + "eval_steps_per_second": 23.777, + "step": 19600 + }, + { + "epoch": 0.9470456136961823, + "grad_norm": 2.4837191107950627, + "learning_rate": 1.7021434129456337e-07, + "loss": 3.8816, + "mean_token_accuracy": 0.3280241906642914, + "step": 19610 + }, + { + "epoch": 0.9475285538357521, + "grad_norm": 2.633241501741633, + "learning_rate": 1.6713105216838887e-07, + "loss": 3.923, + "mean_token_accuracy": 0.3229838743805885, + "step": 19620 + }, + { + "epoch": 0.9480114939753218, + "grad_norm": 2.555522130350023, + "learning_rate": 1.6407570913516967e-07, + "loss": 3.9082, + "mean_token_accuracy": 0.3270161300897598, + "step": 19630 + }, + { + "epoch": 0.9484944341148914, + "grad_norm": 2.6328896730196147, + "learning_rate": 1.6104832087852518e-07, + "loss": 3.8902, + "mean_token_accuracy": 0.3254032269120216, + "step": 19640 + }, + { + "epoch": 0.9489773742544612, + "grad_norm": 2.482082345541495, + "learning_rate": 1.5804889600262607e-07, + "loss": 3.882, + "mean_token_accuracy": 0.3225806444883347, + "step": 19650 + }, + { + "epoch": 0.9494603143940309, + "grad_norm": 2.685352604061562, + "learning_rate": 1.5507744303216777e-07, + "loss": 3.9352, + "mean_token_accuracy": 0.3145161300897598, + "step": 19660 + }, + { + "epoch": 0.9499432545336005, + "grad_norm": 2.5224640241860836, + "learning_rate": 1.521339704123448e-07, + "loss": 3.9273, + "mean_token_accuracy": 0.31985886842012407, + "step": 19670 + }, + { + "epoch": 0.9504261946731702, + "grad_norm": 2.505164318961583, + "learning_rate": 1.4921848650882976e-07, + "loss": 3.8691, + "mean_token_accuracy": 0.32822580337524415, + "step": 19680 + }, + { + "epoch": 0.95090913481274, + "grad_norm": 2.4491558120453267, + "learning_rate": 1.4633099960774777e-07, + "loss": 3.8699, + "mean_token_accuracy": 0.3232862904667854, + "step": 19690 + }, + { + "epoch": 0.9513920749523097, + "grad_norm": 2.67599030505745, + "learning_rate": 1.434715179156554e-07, + "loss": 3.9168, + "mean_token_accuracy": 0.31572581082582474, + "step": 19700 + }, + { + "epoch": 0.9513920749523097, + "eval_runtime": 7.7953, + "eval_samples_per_second": 378.946, + "eval_steps_per_second": 23.732, + "step": 19700 + }, + { + "epoch": 0.9518750150918793, + "grad_norm": 2.511813279636735, + "learning_rate": 1.4064004955951062e-07, + "loss": 3.9086, + "mean_token_accuracy": 0.3171370968222618, + "step": 19710 + }, + { + "epoch": 0.9523579552314491, + "grad_norm": 2.6119050450332453, + "learning_rate": 1.3783660258665733e-07, + "loss": 3.8668, + "mean_token_accuracy": 0.32016129046678543, + "step": 19720 + }, + { + "epoch": 0.9528408953710188, + "grad_norm": 2.7940854989777444, + "learning_rate": 1.3506118496480314e-07, + "loss": 4.0207, + "mean_token_accuracy": 0.31159274131059644, + "step": 19730 + }, + { + "epoch": 0.9533238355105884, + "grad_norm": 2.5316011212641656, + "learning_rate": 1.3231380458198605e-07, + "loss": 3.941, + "mean_token_accuracy": 0.32197580933570863, + "step": 19740 + }, + { + "epoch": 0.9538067756501581, + "grad_norm": 2.5339227212393194, + "learning_rate": 1.2959446924656448e-07, + "loss": 3.9297, + "mean_token_accuracy": 0.3189516142010689, + "step": 19750 + }, + { + "epoch": 0.9542897157897279, + "grad_norm": 2.617854696222853, + "learning_rate": 1.2690318668718726e-07, + "loss": 3.852, + "mean_token_accuracy": 0.31693548411130906, + "step": 19760 + }, + { + "epoch": 0.9547726559292976, + "grad_norm": 2.6508015106758016, + "learning_rate": 1.2423996455277477e-07, + "loss": 3.8594, + "mean_token_accuracy": 0.3305443570017815, + "step": 19770 + }, + { + "epoch": 0.9552555960688672, + "grad_norm": 2.4739673459249607, + "learning_rate": 1.2160481041249783e-07, + "loss": 3.8656, + "mean_token_accuracy": 0.3322580635547638, + "step": 19780 + }, + { + "epoch": 0.955738536208437, + "grad_norm": 2.7077860756096976, + "learning_rate": 1.1899773175575224e-07, + "loss": 3.8406, + "mean_token_accuracy": 0.32540322542190553, + "step": 19790 + }, + { + "epoch": 0.9562214763480067, + "grad_norm": 2.5397501129750637, + "learning_rate": 1.1641873599214204e-07, + "loss": 3.9133, + "mean_token_accuracy": 0.3172379031777382, + "step": 19800 + }, + { + "epoch": 0.9562214763480067, + "eval_runtime": 7.7919, + "eval_samples_per_second": 379.111, + "eval_steps_per_second": 23.743, + "step": 19800 + }, + { + "epoch": 0.9567044164875763, + "grad_norm": 2.6037360027328926, + "learning_rate": 1.1386783045145733e-07, + "loss": 3.95, + "mean_token_accuracy": 0.3125, + "step": 19810 + }, + { + "epoch": 0.957187356627146, + "grad_norm": 2.610495064130943, + "learning_rate": 1.1134502238365097e-07, + "loss": 3.9094, + "mean_token_accuracy": 0.31673387438058853, + "step": 19820 + }, + { + "epoch": 0.9576702967667158, + "grad_norm": 2.5182987998687927, + "learning_rate": 1.0885031895882081e-07, + "loss": 3.9223, + "mean_token_accuracy": 0.3259072557091713, + "step": 19830 + }, + { + "epoch": 0.9581532369062855, + "grad_norm": 2.6583038229965372, + "learning_rate": 1.0638372726718749e-07, + "loss": 3.8641, + "mean_token_accuracy": 0.3239577829837799, + "step": 19840 + }, + { + "epoch": 0.9586361770458551, + "grad_norm": 2.5819519431756266, + "learning_rate": 1.0394525431907443e-07, + "loss": 3.9074, + "mean_token_accuracy": 0.3214717760682106, + "step": 19850 + }, + { + "epoch": 0.9591191171854249, + "grad_norm": 2.815563560391342, + "learning_rate": 1.0153490704489233e-07, + "loss": 3.9242, + "mean_token_accuracy": 0.3147177442908287, + "step": 19860 + }, + { + "epoch": 0.9596020573249946, + "grad_norm": 2.5672375656625035, + "learning_rate": 9.915269229510805e-08, + "loss": 3.902, + "mean_token_accuracy": 0.3168346807360649, + "step": 19870 + }, + { + "epoch": 0.9600849974645642, + "grad_norm": 2.58054879091667, + "learning_rate": 9.679861684024239e-08, + "loss": 3.8988, + "mean_token_accuracy": 0.32056451588869095, + "step": 19880 + }, + { + "epoch": 0.960567937604134, + "grad_norm": 2.7407947990544574, + "learning_rate": 9.447268737083348e-08, + "loss": 3.9449, + "mean_token_accuracy": 0.3187499985098839, + "step": 19890 + }, + { + "epoch": 0.9610508777437037, + "grad_norm": 2.509597303413779, + "learning_rate": 9.217491049742789e-08, + "loss": 4.009, + "mean_token_accuracy": 0.32247983664274216, + "step": 19900 + }, + { + "epoch": 0.9610508777437037, + "eval_runtime": 7.8001, + "eval_samples_per_second": 378.711, + "eval_steps_per_second": 23.718, + "step": 19900 + }, + { + "epoch": 0.9615338178832734, + "grad_norm": 2.5509201174210783, + "learning_rate": 8.990529275056059e-08, + "loss": 3.9504, + "mean_token_accuracy": 0.32177419364452364, + "step": 19910 + }, + { + "epoch": 0.962016758022843, + "grad_norm": 2.809475091884003, + "learning_rate": 8.766384058073618e-08, + "loss": 3.9937, + "mean_token_accuracy": 0.31249999850988386, + "step": 19920 + }, + { + "epoch": 0.9624996981624128, + "grad_norm": 2.5517006156063413, + "learning_rate": 8.545056035840438e-08, + "loss": 3.9203, + "mean_token_accuracy": 0.3196572601795197, + "step": 19930 + }, + { + "epoch": 0.9629826383019825, + "grad_norm": 2.535404226207725, + "learning_rate": 8.326545837395228e-08, + "loss": 3.8465, + "mean_token_accuracy": 0.3216733902692795, + "step": 19940 + }, + { + "epoch": 0.9634655784415521, + "grad_norm": 2.7718206828584284, + "learning_rate": 8.110854083767883e-08, + "loss": 3.9039, + "mean_token_accuracy": 0.32268145084381106, + "step": 19950 + }, + { + "epoch": 0.9639485185811218, + "grad_norm": 2.5763078063963527, + "learning_rate": 7.89798138797826e-08, + "loss": 3.959, + "mean_token_accuracy": 0.3169354826211929, + "step": 19960 + }, + { + "epoch": 0.9644314587206916, + "grad_norm": 2.5595050443811984, + "learning_rate": 7.687928355033736e-08, + "loss": 3.9348, + "mean_token_accuracy": 0.324193549156189, + "step": 19970 + }, + { + "epoch": 0.9649143988602613, + "grad_norm": 2.68061210541419, + "learning_rate": 7.480695581927988e-08, + "loss": 3.8996, + "mean_token_accuracy": 0.3186491936445236, + "step": 19980 + }, + { + "epoch": 0.9653973389998309, + "grad_norm": 2.5632036100421622, + "learning_rate": 7.27628365763966e-08, + "loss": 3.9617, + "mean_token_accuracy": 0.32429435551166536, + "step": 19990 + }, + { + "epoch": 0.9658802791394007, + "grad_norm": 2.543367807101631, + "learning_rate": 7.074693163129476e-08, + "loss": 3.8383, + "mean_token_accuracy": 0.3331653192639351, + "step": 20000 + }, + { + "epoch": 0.9658802791394007, + "eval_runtime": 7.7868, + "eval_samples_per_second": 379.362, + "eval_steps_per_second": 23.758, + "step": 20000 + }, + { + "epoch": 0.9663632192789704, + "grad_norm": 2.6551108398559085, + "learning_rate": 6.875924671340018e-08, + "loss": 3.8672, + "mean_token_accuracy": 0.3329637095332146, + "step": 20010 + }, + { + "epoch": 0.96684615941854, + "grad_norm": 2.585869453538405, + "learning_rate": 6.679978747193061e-08, + "loss": 3.8949, + "mean_token_accuracy": 0.31774193346500396, + "step": 20020 + }, + { + "epoch": 0.9673290995581098, + "grad_norm": 2.6161156467423305, + "learning_rate": 6.486855947588467e-08, + "loss": 3.9406, + "mean_token_accuracy": 0.3146169319748878, + "step": 20030 + }, + { + "epoch": 0.9678120396976795, + "grad_norm": 2.733466732480941, + "learning_rate": 6.2965568214024e-08, + "loss": 3.8789, + "mean_token_accuracy": 0.3240927383303642, + "step": 20040 + }, + { + "epoch": 0.9682949798372492, + "grad_norm": 2.4510361763570763, + "learning_rate": 6.109081909485892e-08, + "loss": 3.884, + "mean_token_accuracy": 0.31975806653499605, + "step": 20050 + }, + { + "epoch": 0.9687779199768188, + "grad_norm": 2.5076379400532827, + "learning_rate": 5.924431744663173e-08, + "loss": 3.8844, + "mean_token_accuracy": 0.315625, + "step": 20060 + }, + { + "epoch": 0.9692608601163886, + "grad_norm": 2.5052522951514105, + "learning_rate": 5.7426068517303366e-08, + "loss": 3.9762, + "mean_token_accuracy": 0.319556450843811, + "step": 20070 + }, + { + "epoch": 0.9697438002559583, + "grad_norm": 2.6687676448832596, + "learning_rate": 5.563607747453681e-08, + "loss": 3.9125, + "mean_token_accuracy": 0.31995967775583267, + "step": 20080 + }, + { + "epoch": 0.9702267403955279, + "grad_norm": 2.649350649940303, + "learning_rate": 5.38743494056837e-08, + "loss": 3.9082, + "mean_token_accuracy": 0.3239919379353523, + "step": 20090 + }, + { + "epoch": 0.9707096805350977, + "grad_norm": 2.67673862199842, + "learning_rate": 5.214088931776662e-08, + "loss": 3.9301, + "mean_token_accuracy": 0.3192540317773819, + "step": 20100 + }, + { + "epoch": 0.9707096805350977, + "eval_runtime": 7.7901, + "eval_samples_per_second": 379.2, + "eval_steps_per_second": 23.748, + "step": 20100 + }, + { + "epoch": 0.9711926206746674, + "grad_norm": 2.6627407389383864, + "learning_rate": 5.0435702137472395e-08, + "loss": 3.9082, + "mean_token_accuracy": 0.3205645143985748, + "step": 20110 + }, + { + "epoch": 0.9716755608142371, + "grad_norm": 2.474276940486427, + "learning_rate": 4.875879271112771e-08, + "loss": 3.9426, + "mean_token_accuracy": 0.31673386991024016, + "step": 20120 + }, + { + "epoch": 0.9721585009538067, + "grad_norm": 2.746346971736028, + "learning_rate": 4.711016580469352e-08, + "loss": 3.925, + "mean_token_accuracy": 0.31592742055654527, + "step": 20130 + }, + { + "epoch": 0.9726414410933765, + "grad_norm": 2.4538207448738385, + "learning_rate": 4.548982610374952e-08, + "loss": 3.9438, + "mean_token_accuracy": 0.31532258689403536, + "step": 20140 + }, + { + "epoch": 0.9731243812329462, + "grad_norm": 2.5715442891879015, + "learning_rate": 4.389777821347862e-08, + "loss": 3.9867, + "mean_token_accuracy": 0.3145161300897598, + "step": 20150 + }, + { + "epoch": 0.9736073213725158, + "grad_norm": 2.5076641603491696, + "learning_rate": 4.2334026658655826e-08, + "loss": 3.8574, + "mean_token_accuracy": 0.32620967775583265, + "step": 20160 + }, + { + "epoch": 0.9740902615120856, + "grad_norm": 2.5807542888545534, + "learning_rate": 4.0798575883633784e-08, + "loss": 3.8766, + "mean_token_accuracy": 0.3172379031777382, + "step": 20170 + }, + { + "epoch": 0.9745732016516553, + "grad_norm": 2.5993565386051665, + "learning_rate": 3.929143025233395e-08, + "loss": 3.9453, + "mean_token_accuracy": 0.31824597120285036, + "step": 20180 + }, + { + "epoch": 0.975056141791225, + "grad_norm": 2.6279745872767406, + "learning_rate": 3.781259404822657e-08, + "loss": 3.8629, + "mean_token_accuracy": 0.32741935551166534, + "step": 20190 + }, + { + "epoch": 0.9755390819307946, + "grad_norm": 2.5852149492915517, + "learning_rate": 3.6362071474329574e-08, + "loss": 3.9547, + "mean_token_accuracy": 0.3145161345601082, + "step": 20200 + }, + { + "epoch": 0.9755390819307946, + "eval_runtime": 7.81, + "eval_samples_per_second": 378.233, + "eval_steps_per_second": 23.688, + "step": 20200 + }, + { + "epoch": 0.9760220220703644, + "grad_norm": 2.754839568962273, + "learning_rate": 3.4939866653186384e-08, + "loss": 3.9133, + "mean_token_accuracy": 0.3183467760682106, + "step": 20210 + }, + { + "epoch": 0.9765049622099341, + "grad_norm": 2.499466225306501, + "learning_rate": 3.354598362685923e-08, + "loss": 3.8711, + "mean_token_accuracy": 0.3232862904667854, + "step": 20220 + }, + { + "epoch": 0.9769879023495038, + "grad_norm": 2.6713748643221082, + "learning_rate": 3.218042635691587e-08, + "loss": 3.9184, + "mean_token_accuracy": 0.32106854766607285, + "step": 20230 + }, + { + "epoch": 0.9774708424890735, + "grad_norm": 2.326302461823539, + "learning_rate": 3.084319872442176e-08, + "loss": 3.9219, + "mean_token_accuracy": 0.31905242055654526, + "step": 20240 + }, + { + "epoch": 0.9779537826286432, + "grad_norm": 2.4955060925733896, + "learning_rate": 2.9534304529922343e-08, + "loss": 3.9242, + "mean_token_accuracy": 0.3198588714003563, + "step": 20250 + }, + { + "epoch": 0.9784367227682129, + "grad_norm": 2.6329725140740394, + "learning_rate": 2.8253747493439677e-08, + "loss": 3.9996, + "mean_token_accuracy": 0.3151209682226181, + "step": 20260 + }, + { + "epoch": 0.9789196629077825, + "grad_norm": 2.7774990851805583, + "learning_rate": 2.7001531254458036e-08, + "loss": 3.9141, + "mean_token_accuracy": 0.32288306802511213, + "step": 20270 + }, + { + "epoch": 0.9794026030473523, + "grad_norm": 2.7290209636423746, + "learning_rate": 2.577765937191279e-08, + "loss": 4.0113, + "mean_token_accuracy": 0.3159274160861969, + "step": 20280 + }, + { + "epoch": 0.979885543186922, + "grad_norm": 2.452912457697491, + "learning_rate": 2.458213532418263e-08, + "loss": 3.8773, + "mean_token_accuracy": 0.3323588684201241, + "step": 20290 + }, + { + "epoch": 0.9803684833264917, + "grad_norm": 2.7315987149614824, + "learning_rate": 2.3414962509077375e-08, + "loss": 3.9172, + "mean_token_accuracy": 0.32036290168762205, + "step": 20300 + }, + { + "epoch": 0.9803684833264917, + "eval_runtime": 7.7919, + "eval_samples_per_second": 379.11, + "eval_steps_per_second": 23.743, + "step": 20300 + }, + { + "epoch": 0.9808514234660614, + "grad_norm": 2.5893200792442834, + "learning_rate": 2.2276144243830177e-08, + "loss": 3.9375, + "mean_token_accuracy": 0.3176411300897598, + "step": 20310 + }, + { + "epoch": 0.9813343636056311, + "grad_norm": 2.5841617000924124, + "learning_rate": 2.116568376508865e-08, + "loss": 3.8, + "mean_token_accuracy": 0.3359879031777382, + "step": 20320 + }, + { + "epoch": 0.9818173037452008, + "grad_norm": 2.6170419293834932, + "learning_rate": 2.0083584228903775e-08, + "loss": 3.8832, + "mean_token_accuracy": 0.32762096971273424, + "step": 20330 + }, + { + "epoch": 0.9823002438847704, + "grad_norm": 2.526407636386915, + "learning_rate": 1.902984871071878e-08, + "loss": 3.9312, + "mean_token_accuracy": 0.3182459697127342, + "step": 20340 + }, + { + "epoch": 0.9827831840243402, + "grad_norm": 2.58792876005009, + "learning_rate": 1.8004480205368046e-08, + "loss": 3.891, + "mean_token_accuracy": 0.322177417576313, + "step": 20350 + }, + { + "epoch": 0.9832661241639099, + "grad_norm": 2.7570470226238784, + "learning_rate": 1.700748162705934e-08, + "loss": 3.8445, + "mean_token_accuracy": 0.32540322542190553, + "step": 20360 + }, + { + "epoch": 0.9837490643034796, + "grad_norm": 2.6759940011610506, + "learning_rate": 1.603885580937492e-08, + "loss": 3.9059, + "mean_token_accuracy": 0.32358870804309847, + "step": 20370 + }, + { + "epoch": 0.9842320044430493, + "grad_norm": 2.55852422932362, + "learning_rate": 1.509860550525266e-08, + "loss": 3.9617, + "mean_token_accuracy": 0.31633064672350886, + "step": 20380 + }, + { + "epoch": 0.984714944582619, + "grad_norm": 2.626404616788194, + "learning_rate": 1.4186733386989393e-08, + "loss": 3.9055, + "mean_token_accuracy": 0.3150201618671417, + "step": 20390 + }, + { + "epoch": 0.9851978847221887, + "grad_norm": 2.592649879971373, + "learning_rate": 1.330324204622424e-08, + "loss": 3.9805, + "mean_token_accuracy": 0.31542338877916337, + "step": 20400 + }, + { + "epoch": 0.9851978847221887, + "eval_runtime": 7.8227, + "eval_samples_per_second": 377.618, + "eval_steps_per_second": 23.649, + "step": 20400 + }, + { + "epoch": 0.9856808248617583, + "grad_norm": 2.7257211880867103, + "learning_rate": 1.2448133993938627e-08, + "loss": 3.8711, + "mean_token_accuracy": 0.3280241936445236, + "step": 20410 + }, + { + "epoch": 0.9861637650013281, + "grad_norm": 2.582473900644627, + "learning_rate": 1.1621411660440728e-08, + "loss": 3.8676, + "mean_token_accuracy": 0.328125, + "step": 20420 + }, + { + "epoch": 0.9866467051408978, + "grad_norm": 2.6080631228828404, + "learning_rate": 1.0823077395367698e-08, + "loss": 3.8563, + "mean_token_accuracy": 0.3216733828186989, + "step": 20430 + }, + { + "epoch": 0.9871296452804675, + "grad_norm": 2.7425221460032576, + "learning_rate": 1.0053133467673448e-08, + "loss": 3.9625, + "mean_token_accuracy": 0.31270160973072053, + "step": 20440 + }, + { + "epoch": 0.9876125854200372, + "grad_norm": 2.4726696547150446, + "learning_rate": 9.311582065623103e-09, + "loss": 3.891, + "mean_token_accuracy": 0.3218750014901161, + "step": 20450 + }, + { + "epoch": 0.9880955255596069, + "grad_norm": 2.4848261646937377, + "learning_rate": 8.598425296786339e-09, + "loss": 3.8535, + "mean_token_accuracy": 0.32731855362653733, + "step": 20460 + }, + { + "epoch": 0.9885784656991766, + "grad_norm": 2.6259157070343924, + "learning_rate": 7.913665188032938e-09, + "loss": 3.9988, + "mean_token_accuracy": 0.3130040317773819, + "step": 20470 + }, + { + "epoch": 0.9890614058387462, + "grad_norm": 2.5753441068460274, + "learning_rate": 7.2573036855272395e-09, + "loss": 3.9273, + "mean_token_accuracy": 0.31693548411130906, + "step": 20480 + }, + { + "epoch": 0.989544345978316, + "grad_norm": 2.532870554202357, + "learning_rate": 6.629342654720372e-09, + "loss": 3.9184, + "mean_token_accuracy": 0.3182459682226181, + "step": 20490 + }, + { + "epoch": 0.9900272861178857, + "grad_norm": 2.5959249282217107, + "learning_rate": 6.029783880345807e-09, + "loss": 3.8594, + "mean_token_accuracy": 0.3258064538240433, + "step": 20500 + }, + { + "epoch": 0.9900272861178857, + "eval_runtime": 7.7915, + "eval_samples_per_second": 379.13, + "eval_steps_per_second": 23.744, + "step": 20500 + }, + { + "epoch": 0.9905102262574554, + "grad_norm": 2.5909835121554745, + "learning_rate": 5.458629066416032e-09, + "loss": 3.898, + "mean_token_accuracy": 0.31935484111309054, + "step": 20510 + }, + { + "epoch": 0.9909931663970251, + "grad_norm": 2.551611851195527, + "learning_rate": 4.915879836216997e-09, + "loss": 3.9621, + "mean_token_accuracy": 0.31602822691202165, + "step": 20520 + }, + { + "epoch": 0.9914761065365948, + "grad_norm": 2.495172896407294, + "learning_rate": 4.4015377322981225e-09, + "loss": 3.9445, + "mean_token_accuracy": 0.31975806355476377, + "step": 20530 + }, + { + "epoch": 0.9919590466761645, + "grad_norm": 2.5717490943703707, + "learning_rate": 3.915604216480074e-09, + "loss": 3.9211, + "mean_token_accuracy": 0.3230846762657166, + "step": 20540 + }, + { + "epoch": 0.9924419868157341, + "grad_norm": 2.530971109670067, + "learning_rate": 3.458080669836994e-09, + "loss": 3.9359, + "mean_token_accuracy": 0.3204637125134468, + "step": 20550 + }, + { + "epoch": 0.9929249269553039, + "grad_norm": 2.6131099234680475, + "learning_rate": 3.0289683927009484e-09, + "loss": 3.9355, + "mean_token_accuracy": 0.3195564538240433, + "step": 20560 + }, + { + "epoch": 0.9934078670948736, + "grad_norm": 2.4800998635135802, + "learning_rate": 2.62826860465637e-09, + "loss": 3.9109, + "mean_token_accuracy": 0.3160282254219055, + "step": 20570 + }, + { + "epoch": 0.9938908072344433, + "grad_norm": 2.6540648843105674, + "learning_rate": 2.255982444536731e-09, + "loss": 3.9586, + "mean_token_accuracy": 0.31905242055654526, + "step": 20580 + }, + { + "epoch": 0.994373747374013, + "grad_norm": 2.727633761771066, + "learning_rate": 1.9121109704201e-09, + "loss": 3.8969, + "mean_token_accuracy": 0.31925403475761416, + "step": 20590 + }, + { + "epoch": 0.9948566875135827, + "grad_norm": 2.5216381608616687, + "learning_rate": 1.596655159625815e-09, + "loss": 3.9207, + "mean_token_accuracy": 0.31844758093357084, + "step": 20600 + }, + { + "epoch": 0.9948566875135827, + "eval_runtime": 7.7835, + "eval_samples_per_second": 379.521, + "eval_steps_per_second": 23.768, + "step": 20600 + }, + { + "epoch": 0.9953396276531524, + "grad_norm": 2.557566970702623, + "learning_rate": 1.3096159087155892e-09, + "loss": 3.9066, + "mean_token_accuracy": 0.3253024220466614, + "step": 20610 + }, + { + "epoch": 0.995822567792722, + "grad_norm": 2.46116757551048, + "learning_rate": 1.0509940334857417e-09, + "loss": 3.8754, + "mean_token_accuracy": 0.32429435551166536, + "step": 20620 + }, + { + "epoch": 0.9963055079322918, + "grad_norm": 2.479608506712216, + "learning_rate": 8.207902689671976e-10, + "loss": 3.8945, + "mean_token_accuracy": 0.32217741906642916, + "step": 20630 + }, + { + "epoch": 0.9967884480718615, + "grad_norm": 2.5148816980131254, + "learning_rate": 6.190052694254877e-10, + "loss": 3.918, + "mean_token_accuracy": 0.3203629031777382, + "step": 20640 + }, + { + "epoch": 0.9972713882114312, + "grad_norm": 2.6897319142489486, + "learning_rate": 4.4563960835519725e-10, + "loss": 3.9133, + "mean_token_accuracy": 0.3217741921544075, + "step": 20650 + }, + { + "epoch": 0.9977543283510009, + "grad_norm": 2.668387661313725, + "learning_rate": 3.0069377847996573e-10, + "loss": 3.9289, + "mean_token_accuracy": 0.3209677442908287, + "step": 20660 + }, + { + "epoch": 0.9982372684905706, + "grad_norm": 2.6502085298332934, + "learning_rate": 1.8416819175359756e-10, + "loss": 3.909, + "mean_token_accuracy": 0.3197580650448799, + "step": 20670 + }, + { + "epoch": 0.9987202086301403, + "grad_norm": 2.6602355596713547, + "learning_rate": 9.606317935229036e-11, + "loss": 3.8676, + "mean_token_accuracy": 0.324193549156189, + "step": 20680 + }, + { + "epoch": 0.99920314876971, + "grad_norm": 2.6061338286329367, + "learning_rate": 3.637899168240644e-11, + "loss": 3.9465, + "mean_token_accuracy": 0.31794354766607286, + "step": 20690 + }, + { + "epoch": 0.9996860889092797, + "grad_norm": 2.588158409191739, + "learning_rate": 5.115798370480818e-12, + "loss": 3.9426, + "mean_token_accuracy": 0.3201612919569016, + "step": 20700 + }, + { + "epoch": 0.9996860889092797, + "eval_runtime": 7.7842, + "eval_samples_per_second": 379.484, + "eval_steps_per_second": 23.766, + "step": 20700 + }, + { + "epoch": 0.9999758529930215, + "mean_token_accuracy": 0.32610887040694553, + "step": 20706, + "total_flos": 5419008396361728.0, + "train_loss": 4.076150745858688, + "train_runtime": 7573.4415, + "train_samples_per_second": 87.49, + "train_steps_per_second": 2.734 } ], "logging_steps": 10, - "max_steps": 647, + "max_steps": 20706, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, @@ -590,7 +18045,7 @@ "attributes": {} } }, - "total_flos": 5418484972388352.0, + "total_flos": 5419008396361728.0, "train_batch_size": 8, "trial_name": null, "trial_params": null