diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,18035 +1,580 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.9999758529930215, + "epoch": 0.9992277992277993, "eval_steps": 100, - "global_step": 20706, + "global_step": 647, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0004829401395697003, - "grad_norm": 86.61036024405438, - "learning_rate": 9.657170449058426e-08, - "loss": 9.5563, - "mean_token_accuracy": 0.08225806429982185, + "epoch": 0.015444015444015444, + "grad_norm": 54.027117924566284, + "learning_rate": 3.0769230769230774e-06, + "loss": 8.2594, + "mean_token_accuracy": 0.10601478479802609, "step": 10 }, { - "epoch": 0.0009658802791394006, - "grad_norm": 123.58741881187208, - "learning_rate": 1.9314340898116852e-07, - "loss": 9.3203, - "mean_token_accuracy": 0.08860887065529824, + "epoch": 0.03088803088803089, + "grad_norm": 36.2771924758843, + "learning_rate": 6.153846153846155e-06, + "loss": 8.0141, + "mean_token_accuracy": 0.10835166163742542, "step": 20 }, { - "epoch": 0.001448820418709101, - "grad_norm": 43.63197078149982, - "learning_rate": 2.897151134717528e-07, - "loss": 9.3719, - "mean_token_accuracy": 0.08770161308348179, + "epoch": 0.04633204633204633, + "grad_norm": 32.30506084518261, + "learning_rate": 9.230769230769232e-06, + "loss": 7.1727, + "mean_token_accuracy": 0.11615957953035831, "step": 30 }, { - "epoch": 0.0019317605582788013, - "grad_norm": 56.11384023907413, - "learning_rate": 3.8628681796233705e-07, - "loss": 9.3297, - "mean_token_accuracy": 0.08316532205790281, + "epoch": 0.06177606177606178, + "grad_norm": 12.644482204441966, + "learning_rate": 1.230769230769231e-05, + "loss": 6.1906, + "mean_token_accuracy": 0.1327559869736433, "step": 40 }, { - "epoch": 0.0024147006978485017, - "grad_norm": 111.94241070044647, - "learning_rate": 4.828585224529214e-07, - "loss": 9.525, - "mean_token_accuracy": 0.07893145252019167, + "epoch": 0.07722007722007722, + "grad_norm": 10.568360790591178, + "learning_rate": 1.5384615384615387e-05, + "loss": 5.4813, + "mean_token_accuracy": 0.17196358889341354, "step": 50 }, { - "epoch": 0.002897640837418202, - "grad_norm": 87.47872168525997, - "learning_rate": 5.794302269435056e-07, - "loss": 9.2312, - "mean_token_accuracy": 0.08457661308348179, + "epoch": 0.09266409266409266, + "grad_norm": 4.068292936765287, + "learning_rate": 1.8461538461538465e-05, + "loss": 4.7438, + "mean_token_accuracy": 0.2288092628121376, "step": 60 }, { - "epoch": 0.0033805809769879023, - "grad_norm": 44.16636970911555, - "learning_rate": 6.760019314340899e-07, - "loss": 9.2469, - "mean_token_accuracy": 0.08669354766607285, + "epoch": 0.10810810810810811, + "grad_norm": 3.5423142348559904, + "learning_rate": 1.9996358021096174e-05, + "loss": 4.2523, + "mean_token_accuracy": 0.2767298325896263, "step": 70 }, { - "epoch": 0.0038635211165576025, - "grad_norm": 41.62348347154347, - "learning_rate": 7.725736359246741e-07, - "loss": 9.5484, - "mean_token_accuracy": 0.0815524198114872, + "epoch": 0.12355212355212356, + "grad_norm": 2.6809187857623313, + "learning_rate": 1.9967238104745695e-05, + "loss": 3.9688, + "mean_token_accuracy": 0.3063569128513336, "step": 80 }, { - "epoch": 0.004346461256127303, - "grad_norm": 60.14104875747736, - "learning_rate": 8.691453404152583e-07, - "loss": 9.1422, - "mean_token_accuracy": 0.08770161271095275, + "epoch": 0.138996138996139, + "grad_norm": 2.1859867880714967, + "learning_rate": 1.9909083099891682e-05, + "loss": 3.6148, + "mean_token_accuracy": 0.3451215773820877, "step": 90 }, { - "epoch": 0.0048294013956970035, - "grad_norm": 75.73130908570681, - "learning_rate": 9.657170449058428e-07, - "loss": 9.1875, - "mean_token_accuracy": 0.08941532261669635, + "epoch": 0.15444015444015444, + "grad_norm": 1.2979500528779593, + "learning_rate": 1.9822062415120053e-05, + "loss": 3.4617, + "mean_token_accuracy": 0.36571358889341354, "step": 100 }, { - "epoch": 0.0048294013956970035, - "eval_runtime": 7.7611, - "eval_samples_per_second": 380.616, - "eval_steps_per_second": 23.837, + "epoch": 0.15444015444015444, + "eval_runtime": 0.3678, + "eval_samples_per_second": 252.838, + "eval_steps_per_second": 16.312, "step": 100 }, { - "epoch": 0.005312341535266703, - "grad_norm": 117.3695190602538, - "learning_rate": 1.062288749396427e-06, - "loss": 8.9477, - "mean_token_accuracy": 0.08780241906642913, + "epoch": 0.16988416988416988, + "grad_norm": 1.2047363502334179, + "learning_rate": 1.9706429546259592e-05, + "loss": 3.4285, + "mean_token_accuracy": 0.3689419463276863, "step": 110 }, { - "epoch": 0.005795281674836404, - "grad_norm": 62.298764753082835, - "learning_rate": 1.1588604538870113e-06, - "loss": 8.9211, - "mean_token_accuracy": 0.08397177457809449, + "epoch": 0.18532818532818532, + "grad_norm": 1.1540457712296708, + "learning_rate": 1.9562521337935255e-05, + "loss": 3.3438, + "mean_token_accuracy": 0.37895588874816893, "step": 120 }, { - "epoch": 0.006278221814406105, - "grad_norm": 61.87255784883387, - "learning_rate": 1.2554321583775955e-06, - "loss": 8.8445, - "mean_token_accuracy": 0.08659274317324162, + "epoch": 0.20077220077220076, + "grad_norm": 1.1919170437393742, + "learning_rate": 1.939075700232209e-05, + "loss": 3.3227, + "mean_token_accuracy": 0.38107282519340513, "step": 130 }, { - "epoch": 0.0067611619539758045, - "grad_norm": 40.09540993639851, - "learning_rate": 1.3520038628681797e-06, - "loss": 8.7438, - "mean_token_accuracy": 0.08639112897217274, + "epoch": 0.21621621621621623, + "grad_norm": 1.0322594052165261, + "learning_rate": 1.9191636897958123e-05, + "loss": 3.3289, + "mean_token_accuracy": 0.38092619478702544, "step": 140 }, { - "epoch": 0.007244102093545505, - "grad_norm": 53.951627839893334, - "learning_rate": 1.448575567358764e-06, - "loss": 8.4688, - "mean_token_accuracy": 0.0918346781283617, + "epoch": 0.23166023166023167, + "grad_norm": 1.072622203579115, + "learning_rate": 1.8965741072173647e-05, + "loss": 3.3309, + "mean_token_accuracy": 0.3811278060078621, "step": 150 }, { - "epoch": 0.007727042233115205, - "grad_norm": 44.44604487004671, - "learning_rate": 1.5451472718493482e-06, - "loss": 8.5523, - "mean_token_accuracy": 0.0860887099057436, + "epoch": 0.2471042471042471, + "grad_norm": 1.1060003860280698, + "learning_rate": 1.8713727571382857e-05, + "loss": 3.3234, + "mean_token_accuracy": 0.38025770634412764, "step": 160 }, { - "epoch": 0.008209982372684905, - "grad_norm": 40.12808283651626, - "learning_rate": 1.6417189763399324e-06, - "loss": 8.2422, - "mean_token_accuracy": 0.09122983925044537, + "epoch": 0.2625482625482625, + "grad_norm": 1.086808022765859, + "learning_rate": 1.8436330524160048e-05, + "loss": 3.318, + "mean_token_accuracy": 0.38055351972579954, "step": 170 }, { - "epoch": 0.008692922512254606, - "grad_norm": 30.19029302720017, - "learning_rate": 1.7382906808305167e-06, - "loss": 8.0227, - "mean_token_accuracy": 0.08991935513913632, + "epoch": 0.277992277992278, + "grad_norm": 1.1385524957672613, + "learning_rate": 1.8134358002684504e-05, + "loss": 3.2988, + "mean_token_accuracy": 0.3846017554402351, "step": 180 }, { - "epoch": 0.009175862651824306, - "grad_norm": 64.96587558878197, - "learning_rate": 1.8348623853211011e-06, - "loss": 7.8492, - "mean_token_accuracy": 0.08810483925044536, + "epoch": 0.29343629343629346, + "grad_norm": 1.0815980244836383, + "learning_rate": 1.7808689668783762e-05, + "loss": 3.2711, + "mean_token_accuracy": 0.3869165450334549, "step": 190 }, { - "epoch": 0.009658802791394007, - "grad_norm": 32.51514438330331, - "learning_rate": 1.9314340898116856e-06, - "loss": 7.6289, - "mean_token_accuracy": 0.09506048373878002, + "epoch": 0.3088803088803089, + "grad_norm": 1.1075978174473033, + "learning_rate": 1.7460274211432463e-05, + "loss": 3.3227, + "mean_token_accuracy": 0.38340970128774643, "step": 200 }, { - "epoch": 0.009658802791394007, - "eval_runtime": 7.7531, - "eval_samples_per_second": 381.011, - "eval_steps_per_second": 23.862, + "epoch": 0.3088803088803089, + "eval_runtime": 0.3689, + "eval_samples_per_second": 252.111, + "eval_steps_per_second": 16.265, "step": 200 }, { - "epoch": 0.010141742930963708, - "grad_norm": 44.57479787298447, - "learning_rate": 2.0280057943022696e-06, - "loss": 7.4602, - "mean_token_accuracy": 0.09778225757181644, + "epoch": 0.32432432432432434, + "grad_norm": 1.2619430199962072, + "learning_rate": 1.7090126583171503e-05, + "loss": 3.3055, + "mean_token_accuracy": 0.3856549397110939, "step": 210 }, { - "epoch": 0.010624683070533407, - "grad_norm": 44.43161424366738, - "learning_rate": 2.124577498792854e-06, - "loss": 7.382, - "mean_token_accuracy": 0.09465725794434547, + "epoch": 0.33976833976833976, + "grad_norm": 1.087491804935316, + "learning_rate": 1.6699325043497957e-05, + "loss": 3.277, + "mean_token_accuracy": 0.3866904929280281, "step": 220 }, { - "epoch": 0.011107623210103107, - "grad_norm": 19.835010827719895, - "learning_rate": 2.221149203283438e-06, - "loss": 7.2172, - "mean_token_accuracy": 0.10131048299372196, + "epoch": 0.3552123552123552, + "grad_norm": 1.1152659184947136, + "learning_rate": 1.6289008017838447e-05, + "loss": 3.2496, + "mean_token_accuracy": 0.3880590170621872, "step": 230 }, { - "epoch": 0.011590563349672808, - "grad_norm": 44.481799789933895, - "learning_rate": 2.3177209077740225e-06, - "loss": 7.1703, - "mean_token_accuracy": 0.09455645158886909, + "epoch": 0.37065637065637064, + "grad_norm": 1.078360383901834, + "learning_rate": 1.586037078125607e-05, + "loss": 3.2484, + "mean_token_accuracy": 0.3903868407011032, "step": 240 }, { - "epoch": 0.012073503489242509, - "grad_norm": 15.888481832711705, - "learning_rate": 2.4142926122646065e-06, - "loss": 6.993, - "mean_token_accuracy": 0.0969124186784029, + "epoch": 0.3861003861003861, + "grad_norm": 1.049519316739431, + "learning_rate": 1.54146619765513e-05, + "loss": 3.252, + "mean_token_accuracy": 0.3888410285115242, "step": 250 }, { - "epoch": 0.01255644362881221, - "grad_norm": 19.533213158788257, - "learning_rate": 2.510864316755191e-06, - "loss": 6.8922, - "mean_token_accuracy": 0.10161290280520915, + "epoch": 0.4015444015444015, + "grad_norm": 1.0716141072297978, + "learning_rate": 1.4953179976899878e-05, + "loss": 3.2891, + "mean_token_accuracy": 0.3861253634095192, "step": 260 }, { - "epoch": 0.013039383768381908, - "grad_norm": 21.36034584462512, - "learning_rate": 2.607436021245775e-06, - "loss": 6.7648, - "mean_token_accuracy": 0.10776209682226182, + "epoch": 0.416988416988417, + "grad_norm": 1.1242108487806568, + "learning_rate": 1.4477269103623496e-05, + "loss": 3.2488, + "mean_token_accuracy": 0.38970552384853363, "step": 270 }, { - "epoch": 0.013522323907951609, - "grad_norm": 12.507709531888091, - "learning_rate": 2.7040077257363594e-06, - "loss": 6.725, - "mean_token_accuracy": 0.1073588702827692, + "epoch": 0.43243243243243246, + "grad_norm": 1.056564744386044, + "learning_rate": 1.3988315710111151e-05, + "loss": 3.232, + "mean_token_accuracy": 0.39249450266361235, "step": 280 }, { - "epoch": 0.01400526404752131, - "grad_norm": 12.963157072765076, - "learning_rate": 2.800579430226944e-06, - "loss": 6.5703, - "mean_token_accuracy": 0.11703629083931447, + "epoch": 0.44787644787644787, + "grad_norm": 1.070870946248766, + "learning_rate": 1.3487744143298822e-05, + "loss": 3.2512, + "mean_token_accuracy": 0.3900837257504463, "step": 290 }, { - "epoch": 0.01448820418709101, - "grad_norm": 22.02834139909145, - "learning_rate": 2.897151134717528e-06, - "loss": 6.4898, - "mean_token_accuracy": 0.12318548522889614, + "epoch": 0.46332046332046334, + "grad_norm": 1.0749246835150736, + "learning_rate": 1.2977012594472008e-05, + "loss": 3.2504, + "mean_token_accuracy": 0.38782380521297455, "step": 300 }, { - "epoch": 0.01448820418709101, - "eval_runtime": 7.8018, - "eval_samples_per_second": 378.63, - "eval_steps_per_second": 23.712, + "epoch": 0.46332046332046334, + "eval_runtime": 0.3696, + "eval_samples_per_second": 251.594, + "eval_steps_per_second": 16.232, "step": 300 }, { - "epoch": 0.014971144326660711, - "grad_norm": 11.320048994516409, - "learning_rate": 2.9937228392081124e-06, - "loss": 6.418, - "mean_token_accuracy": 0.11633064523339272, + "epoch": 0.47876447876447875, + "grad_norm": 0.9656072799953122, + "learning_rate": 1.2457608851477833e-05, + "loss": 3.2687, + "mean_token_accuracy": 0.3866996571421623, "step": 310 }, { - "epoch": 0.01545408446623041, - "grad_norm": 19.74022910633883, - "learning_rate": 3.0902945436986964e-06, - "loss": 6.3891, - "mean_token_accuracy": 0.12268145121634007, + "epoch": 0.4942084942084942, + "grad_norm": 1.0178005185953467, + "learning_rate": 1.1931045964720882e-05, + "loss": 3.198, + "mean_token_accuracy": 0.3944434255361557, "step": 320 }, { - "epoch": 0.015937024605800112, - "grad_norm": 13.693407143559101, - "learning_rate": 3.186866248189281e-06, - "loss": 6.25, - "mean_token_accuracy": 0.1266129020601511, + "epoch": 0.5096525096525096, + "grad_norm": 1.074522942556136, + "learning_rate": 1.1398857839567811e-05, + "loss": 3.2355, + "mean_token_accuracy": 0.39279997497797015, "step": 330 }, { - "epoch": 0.01641996474536981, - "grad_norm": 18.620314421238913, - "learning_rate": 3.283437952679865e-06, - "loss": 6.2508, - "mean_token_accuracy": 0.12681451588869094, + "epoch": 0.525096525096525, + "grad_norm": 1.0410557083388294, + "learning_rate": 1.086259476800041e-05, + "loss": 3.2195, + "mean_token_accuracy": 0.39092436134815217, "step": 340 }, { - "epoch": 0.01690290488493951, - "grad_norm": 12.777325276570116, - "learning_rate": 3.3800096571704493e-06, - "loss": 6.1133, - "mean_token_accuracy": 0.13225806467235088, + "epoch": 0.5405405405405406, + "grad_norm": 0.9762243696283865, + "learning_rate": 1.0323818912533561e-05, + "loss": 3.2445, + "mean_token_accuracy": 0.38936176896095276, "step": 350 }, { - "epoch": 0.01738584502450921, - "grad_norm": 11.72514226678663, - "learning_rate": 3.4765813616610333e-06, - "loss": 6.0352, - "mean_token_accuracy": 0.1365927428007126, + "epoch": 0.555984555984556, + "grad_norm": 1.0744349569161593, + "learning_rate": 9.784099755553723e-06, + "loss": 3.2625, + "mean_token_accuracy": 0.39045931249856947, "step": 360 }, { - "epoch": 0.017868785164078912, - "grad_norm": 8.912693254717752, - "learning_rate": 3.5731530661516178e-06, - "loss": 6.0945, - "mean_token_accuracy": 0.13094758056104183, + "epoch": 0.5714285714285714, + "grad_norm": 0.9548919869257485, + "learning_rate": 9.245009527334243e-06, + "loss": 3.2527, + "mean_token_accuracy": 0.38955584168434143, "step": 370 }, { - "epoch": 0.018351725303648612, - "grad_norm": 29.402027928221635, - "learning_rate": 3.6697247706422022e-06, - "loss": 5.9836, - "mean_token_accuracy": 0.14576612897217273, + "epoch": 0.5868725868725869, + "grad_norm": 1.0268956745848117, + "learning_rate": 8.708118626045939e-06, + "loss": 3.2535, + "mean_token_accuracy": 0.3885325014591217, "step": 380 }, { - "epoch": 0.018834665443218313, - "grad_norm": 14.16575018639554, - "learning_rate": 3.7662964751327863e-06, - "loss": 5.9086, - "mean_token_accuracy": 0.1471774186939001, + "epoch": 0.6023166023166023, + "grad_norm": 1.0496300516856243, + "learning_rate": 8.174991043104662e-06, + "loss": 3.2566, + "mean_token_accuracy": 0.38984403312206267, "step": 390 }, { - "epoch": 0.019317605582788014, - "grad_norm": 9.897396578369301, - "learning_rate": 3.862868179623371e-06, - "loss": 5.8781, - "mean_token_accuracy": 0.14737903252243995, + "epoch": 0.6177606177606177, + "grad_norm": 1.043332590039097, + "learning_rate": 7.647179807182125e-06, + "loss": 3.2281, + "mean_token_accuracy": 0.3923295482993126, "step": 400 }, { - "epoch": 0.019317605582788014, - "eval_runtime": 7.8031, - "eval_samples_per_second": 378.566, - "eval_steps_per_second": 23.708, + "epoch": 0.6177606177606177, + "eval_runtime": 0.3673, + "eval_samples_per_second": 253.192, + "eval_steps_per_second": 16.335, "step": 400 }, { - "epoch": 0.019800545722357715, - "grad_norm": 9.580434319364983, - "learning_rate": 3.959439884113955e-06, - "loss": 5.843, - "mean_token_accuracy": 0.1519153229892254, + "epoch": 0.6332046332046332, + "grad_norm": 1.061649815314641, + "learning_rate": 7.126222460151719e-06, + "loss": 3.2043, + "mean_token_accuracy": 0.39413081407546996, "step": 410 }, { - "epoch": 0.020283485861927415, - "grad_norm": 9.785106533868191, - "learning_rate": 4.056011588604539e-06, - "loss": 5.7188, - "mean_token_accuracy": 0.16239919289946556, + "epoch": 0.6486486486486487, + "grad_norm": 1.0147970765923096, + "learning_rate": 6.613636578148242e-06, + "loss": 3.2316, + "mean_token_accuracy": 0.3912878751754761, "step": 420 }, { - "epoch": 0.020766426001497116, - "grad_norm": 10.405009225619438, - "learning_rate": 4.152583293095124e-06, - "loss": 5.6664, - "mean_token_accuracy": 0.16360887214541436, + "epoch": 0.6640926640926641, + "grad_norm": 1.0058231917151492, + "learning_rate": 6.110915350788846e-06, + "loss": 3.2207, + "mean_token_accuracy": 0.3918399602174759, "step": 430 }, { - "epoch": 0.021249366141066813, - "grad_norm": 11.591917579819466, - "learning_rate": 4.249154997585708e-06, - "loss": 5.6148, - "mean_token_accuracy": 0.16381048336625098, + "epoch": 0.6795366795366795, + "grad_norm": 1.0703873139225168, + "learning_rate": 5.619523231433177e-06, + "loss": 3.2566, + "mean_token_accuracy": 0.38752417266368866, "step": 440 }, { - "epoch": 0.021732306280636514, - "grad_norm": 26.190074462558268, - "learning_rate": 4.3457267020762925e-06, - "loss": 5.4719, - "mean_token_accuracy": 0.1736895151436329, + "epoch": 0.694980694980695, + "grad_norm": 1.123689944709063, + "learning_rate": 5.140891671153797e-06, + "loss": 3.2848, + "mean_token_accuracy": 0.3864888772368431, "step": 450 }, { - "epoch": 0.022215246420206215, - "grad_norm": 8.413665658649023, - "learning_rate": 4.442298406566876e-06, - "loss": 5.4727, - "mean_token_accuracy": 0.18094758093357086, + "epoch": 0.7104247104247104, + "grad_norm": 1.0150276974982517, + "learning_rate": 4.676414948843934e-06, + "loss": 3.2078, + "mean_token_accuracy": 0.3944342628121376, "step": 460 }, { - "epoch": 0.022698186559775915, - "grad_norm": 14.092203313066426, - "learning_rate": 4.5388701110574606e-06, - "loss": 5.3945, - "mean_token_accuracy": 0.1817540317773819, + "epoch": 0.7258687258687259, + "grad_norm": 1.131883688390389, + "learning_rate": 4.2274461096098085e-06, + "loss": 3.2121, + "mean_token_accuracy": 0.3935947135090828, "step": 470 }, { - "epoch": 0.023181126699345616, - "grad_norm": 9.583297265678569, - "learning_rate": 4.635441815548045e-06, - "loss": 5.3266, - "mean_token_accuracy": 0.18991935551166533, + "epoch": 0.7413127413127413, + "grad_norm": 1.0848803772523403, + "learning_rate": 3.795293023279093e-06, + "loss": 3.2309, + "mean_token_accuracy": 0.3939241200685501, "step": 480 }, { - "epoch": 0.023664066838915317, - "grad_norm": 10.43916281216745, - "learning_rate": 4.7320135200386295e-06, - "loss": 5.25, - "mean_token_accuracy": 0.1966733880341053, + "epoch": 0.7567567567567568, + "grad_norm": 0.9911464983867319, + "learning_rate": 3.3812145745073834e-06, + "loss": 3.2645, + "mean_token_accuracy": 0.3887524425983429, "step": 490 }, { - "epoch": 0.024147006978485017, - "grad_norm": 9.419476847259737, - "learning_rate": 4.828585224529213e-06, - "loss": 5.2266, - "mean_token_accuracy": 0.19637096896767617, + "epoch": 0.7722007722007722, + "grad_norm": 0.9468182557450763, + "learning_rate": 2.9864169955810085e-06, + "loss": 3.2348, + "mean_token_accuracy": 0.3921034947037697, "step": 500 }, { - "epoch": 0.024147006978485017, - "eval_runtime": 7.799, - "eval_samples_per_second": 378.765, - "eval_steps_per_second": 23.721, + "epoch": 0.7722007722007722, + "eval_runtime": 0.3674, + "eval_samples_per_second": 253.144, + "eval_steps_per_second": 16.332, "step": 500 }, { - "epoch": 0.024629947118054718, - "grad_norm": 9.063478033216443, - "learning_rate": 4.9251569290197975e-06, - "loss": 5.1898, - "mean_token_accuracy": 0.20735886916518212, + "epoch": 0.7876447876447876, + "grad_norm": 1.153751906362788, + "learning_rate": 2.6120503525989894e-06, + "loss": 3.2051, + "mean_token_accuracy": 0.3940493628382683, "step": 510 }, { - "epoch": 0.02511288725762442, - "grad_norm": 7.6407757225586925, - "learning_rate": 5.021728633510382e-06, - "loss": 5.0984, - "mean_token_accuracy": 0.20796370953321458, + "epoch": 0.803088803088803, + "grad_norm": 1.0023254669711654, + "learning_rate": 2.25920519527003e-06, + "loss": 3.2387, + "mean_token_accuracy": 0.3898582592606544, "step": 520 }, { - "epoch": 0.02559582739719412, - "grad_norm": 7.56966381564491, - "learning_rate": 5.118300338000966e-06, - "loss": 5.0875, - "mean_token_accuracy": 0.20544354915618895, + "epoch": 0.8185328185328186, + "grad_norm": 1.018252078051325, + "learning_rate": 1.9289093800839067e-06, + "loss": 3.2488, + "mean_token_accuracy": 0.39030425548553466, "step": 530 }, { - "epoch": 0.026078767536763817, - "grad_norm": 7.151939418897148, - "learning_rate": 5.21487204249155e-06, - "loss": 4.9336, - "mean_token_accuracy": 0.21118951588869095, + "epoch": 0.833976833976834, + "grad_norm": 1.0191281048265344, + "learning_rate": 1.6221250761114803e-06, + "loss": 3.2156, + "mean_token_accuracy": 0.39363697469234465, "step": 540 }, { - "epoch": 0.026561707676333517, - "grad_norm": 7.891718493300589, - "learning_rate": 5.3114437469821344e-06, - "loss": 4.9742, - "mean_token_accuracy": 0.21895161271095276, + "epoch": 0.8494208494208494, + "grad_norm": 1.0580017660782297, + "learning_rate": 1.339745962155613e-06, + "loss": 3.2449, + "mean_token_accuracy": 0.3889385357499123, "step": 550 }, { - "epoch": 0.027044647815903218, - "grad_norm": 7.022858601401243, - "learning_rate": 5.408015451472719e-06, - "loss": 4.9344, - "mean_token_accuracy": 0.21784274280071259, + "epoch": 0.8648648648648649, + "grad_norm": 1.0638282009844648, + "learning_rate": 1.0825946234178575e-06, + "loss": 3.2687, + "mean_token_accuracy": 0.38850476443767545, "step": 560 }, { - "epoch": 0.02752758795547292, - "grad_norm": 8.03316781863098, - "learning_rate": 5.504587155963303e-06, - "loss": 4.9422, - "mean_token_accuracy": 0.22177419289946557, + "epoch": 0.8803088803088803, + "grad_norm": 0.9647796959764461, + "learning_rate": 8.514201552645052e-07, + "loss": 3.2523, + "mean_token_accuracy": 0.3878818407654762, "step": 570 }, { - "epoch": 0.02801052809504262, - "grad_norm": 6.416473059586127, - "learning_rate": 5.601158860453888e-06, - "loss": 4.9328, - "mean_token_accuracy": 0.216935483366251, + "epoch": 0.8957528957528957, + "grad_norm": 1.0003940081508194, + "learning_rate": 6.468959810724329e-07, + "loss": 3.2141, + "mean_token_accuracy": 0.3934506356716156, "step": 580 }, { - "epoch": 0.02849346823461232, - "grad_norm": 6.947560111550373, - "learning_rate": 5.697730564944471e-06, - "loss": 4.8422, - "mean_token_accuracy": 0.2323588691651821, + "epoch": 0.9111969111969112, + "grad_norm": 0.9373675947022841, + "learning_rate": 4.696178905113913e-07, + "loss": 3.2305, + "mean_token_accuracy": 0.39248495548963547, "step": 590 }, { - "epoch": 0.02897640837418202, - "grad_norm": 6.751214653454153, - "learning_rate": 5.794302269435056e-06, - "loss": 4.8609, - "mean_token_accuracy": 0.2269153229892254, + "epoch": 0.9266409266409267, + "grad_norm": 0.9968667494256308, + "learning_rate": 3.2010230397739206e-07, + "loss": 3.2254, + "mean_token_accuracy": 0.39279315173625945, "step": 600 }, { - "epoch": 0.02897640837418202, - "eval_runtime": 7.8282, - "eval_samples_per_second": 377.356, - "eval_steps_per_second": 23.633, + "epoch": 0.9266409266409267, + "eval_runtime": 0.3665, + "eval_samples_per_second": 253.77, + "eval_steps_per_second": 16.372, "step": 600 }, { - "epoch": 0.02945934851375172, - "grad_norm": 6.568818826918091, - "learning_rate": 5.89087397392564e-06, - "loss": 4.8547, - "mean_token_accuracy": 0.23074596747756004, + "epoch": 0.9420849420849421, + "grad_norm": 1.0683699164488198, + "learning_rate": 1.9878476823294467e-07, + "loss": 3.2227, + "mean_token_accuracy": 0.3929983913898468, "step": 610 }, { - "epoch": 0.029942288653321422, - "grad_norm": 6.768505739040085, - "learning_rate": 5.987445678416225e-06, - "loss": 4.8031, - "mean_token_accuracy": 0.23568548262119293, + "epoch": 0.9575289575289575, + "grad_norm": 0.9681641634376521, + "learning_rate": 1.0601868763643997e-07, + "loss": 3.2156, + "mean_token_accuracy": 0.3948619216680527, "step": 620 }, { - "epoch": 0.03042522879289112, - "grad_norm": 7.312646650877884, - "learning_rate": 6.084017382906808e-06, - "loss": 4.7609, - "mean_token_accuracy": 0.23659274280071257, + "epoch": 0.972972972972973, + "grad_norm": 1.0525452552878858, + "learning_rate": 4.207429465668877e-08, + "loss": 3.2148, + "mean_token_accuracy": 0.39301991611719134, "step": 630 }, { - "epoch": 0.03090816893246082, - "grad_norm": 6.751931563442281, - "learning_rate": 6.180589087397393e-06, - "loss": 4.7914, - "mean_token_accuracy": 0.2309475801885128, + "epoch": 0.9884169884169884, + "grad_norm": 1.007791209844153, + "learning_rate": 7.1378626715268295e-09, + "loss": 3.252, + "mean_token_accuracy": 0.3903378531336784, "step": 640 }, { - "epoch": 0.03139110907203052, - "grad_norm": 6.35129343291686, - "learning_rate": 6.277160791887977e-06, - "loss": 4.8023, - "mean_token_accuracy": 0.23679435327649118, - "step": 650 - }, - { - "epoch": 0.031874049211600225, - "grad_norm": 6.798276877968672, - "learning_rate": 6.373732496378562e-06, - "loss": 4.7078, - "mean_token_accuracy": 0.2406249985098839, - "step": 660 - }, - { - "epoch": 0.03235698935116992, - "grad_norm": 6.41128184388978, - "learning_rate": 6.470304200869146e-06, - "loss": 4.6484, - "mean_token_accuracy": 0.24546370953321456, - "step": 670 - }, - { - "epoch": 0.03283992949073962, - "grad_norm": 6.786893678077557, - "learning_rate": 6.56687590535973e-06, - "loss": 4.7242, - "mean_token_accuracy": 0.23760080561041833, - "step": 680 - }, - { - "epoch": 0.033322869630309324, - "grad_norm": 6.874307740254109, - "learning_rate": 6.663447609850314e-06, - "loss": 4.6531, - "mean_token_accuracy": 0.2535282254219055, - "step": 690 - }, - { - "epoch": 0.03380580976987902, - "grad_norm": 6.529334198552244, - "learning_rate": 6.760019314340899e-06, - "loss": 4.6836, - "mean_token_accuracy": 0.24284274205565454, - "step": 700 - }, - { - "epoch": 0.03380580976987902, - "eval_runtime": 7.7775, - "eval_samples_per_second": 379.811, - "eval_steps_per_second": 23.786, - "step": 700 - }, - { - "epoch": 0.034288749909448725, - "grad_norm": 6.571496054562236, - "learning_rate": 6.856591018831483e-06, - "loss": 4.6344, - "mean_token_accuracy": 0.25030241534113884, - "step": 710 - }, - { - "epoch": 0.03477169004901842, - "grad_norm": 6.3473260026917275, - "learning_rate": 6.953162723322067e-06, - "loss": 4.6398, - "mean_token_accuracy": 0.2475806452333927, - "step": 720 - }, - { - "epoch": 0.035254630188588126, - "grad_norm": 6.562261892713004, - "learning_rate": 7.049734427812651e-06, - "loss": 4.6148, - "mean_token_accuracy": 0.24868951588869095, - "step": 730 - }, - { - "epoch": 0.035737570328157824, - "grad_norm": 5.858977665193235, - "learning_rate": 7.1463061323032356e-06, - "loss": 4.657, - "mean_token_accuracy": 0.24516128972172738, - "step": 740 - }, - { - "epoch": 0.03622051046772753, - "grad_norm": 6.800295538759567, - "learning_rate": 7.24287783679382e-06, - "loss": 4.6344, - "mean_token_accuracy": 0.24808467850089072, - "step": 750 - }, - { - "epoch": 0.036703450607297225, - "grad_norm": 6.529416845885988, - "learning_rate": 7.3394495412844045e-06, - "loss": 4.6594, - "mean_token_accuracy": 0.24889112785458564, - "step": 760 - }, - { - "epoch": 0.03718639074686693, - "grad_norm": 6.697630717561144, - "learning_rate": 7.436021245774988e-06, - "loss": 4.6055, - "mean_token_accuracy": 0.2462701603770256, - "step": 770 - }, - { - "epoch": 0.037669330886436626, - "grad_norm": 5.95009608166028, - "learning_rate": 7.5325929502655725e-06, - "loss": 4.6008, - "mean_token_accuracy": 0.2548387087881565, - "step": 780 - }, - { - "epoch": 0.038152271026006324, - "grad_norm": 5.647138551872577, - "learning_rate": 7.629164654756157e-06, - "loss": 4.5828, - "mean_token_accuracy": 0.25282258093357085, - "step": 790 - }, - { - "epoch": 0.03863521116557603, - "grad_norm": 6.436677460184058, - "learning_rate": 7.725736359246742e-06, - "loss": 4.6082, - "mean_token_accuracy": 0.25241935551166533, - "step": 800 - }, - { - "epoch": 0.03863521116557603, - "eval_runtime": 7.7986, - "eval_samples_per_second": 378.785, - "eval_steps_per_second": 23.722, - "step": 800 - }, - { - "epoch": 0.039118151305145725, - "grad_norm": 6.557950012681355, - "learning_rate": 7.822308063737327e-06, - "loss": 4.543, - "mean_token_accuracy": 0.2562500022351742, - "step": 810 - }, - { - "epoch": 0.03960109144471543, - "grad_norm": 5.60271360072184, - "learning_rate": 7.91887976822791e-06, - "loss": 4.5906, - "mean_token_accuracy": 0.257358867675066, - "step": 820 - }, - { - "epoch": 0.040084031584285126, - "grad_norm": 5.891590551442568, - "learning_rate": 8.015451472718494e-06, - "loss": 4.4961, - "mean_token_accuracy": 0.26139112934470177, - "step": 830 - }, - { - "epoch": 0.04056697172385483, - "grad_norm": 6.319141995609288, - "learning_rate": 8.112023177209078e-06, - "loss": 4.5438, - "mean_token_accuracy": 0.26350806280970573, - "step": 840 - }, - { - "epoch": 0.04104991186342453, - "grad_norm": 5.9347957915729515, - "learning_rate": 8.208594881699663e-06, - "loss": 4.5984, - "mean_token_accuracy": 0.2543346740305424, - "step": 850 - }, - { - "epoch": 0.04153285200299423, - "grad_norm": 6.21887500489896, - "learning_rate": 8.305166586190247e-06, - "loss": 4.4961, - "mean_token_accuracy": 0.26088709756731987, - "step": 860 - }, - { - "epoch": 0.04201579214256393, - "grad_norm": 5.8005895616062535, - "learning_rate": 8.401738290680832e-06, - "loss": 4.5555, - "mean_token_accuracy": 0.2596774183213711, - "step": 870 - }, - { - "epoch": 0.042498732282133626, - "grad_norm": 5.645546026576973, - "learning_rate": 8.498309995171416e-06, - "loss": 4.4992, - "mean_token_accuracy": 0.2630040317773819, - "step": 880 - }, - { - "epoch": 0.04298167242170333, - "grad_norm": 5.867461510415207, - "learning_rate": 8.594881699662e-06, - "loss": 4.4867, - "mean_token_accuracy": 0.262298384308815, - "step": 890 - }, - { - "epoch": 0.04346461256127303, - "grad_norm": 6.4078157319256155, - "learning_rate": 8.691453404152585e-06, - "loss": 4.5141, - "mean_token_accuracy": 0.2566532239317894, - "step": 900 - }, - { - "epoch": 0.04346461256127303, - "eval_runtime": 7.7925, - "eval_samples_per_second": 379.084, - "eval_steps_per_second": 23.741, - "step": 900 - }, - { - "epoch": 0.04394755270084273, - "grad_norm": 6.1661426514813185, - "learning_rate": 8.788025108643168e-06, - "loss": 4.416, - "mean_token_accuracy": 0.26330645233392713, - "step": 910 - }, - { - "epoch": 0.04443049284041243, - "grad_norm": 5.817055841219091, - "learning_rate": 8.884596813133752e-06, - "loss": 4.4391, - "mean_token_accuracy": 0.27298386916518214, - "step": 920 - }, - { - "epoch": 0.04491343297998213, - "grad_norm": 5.984395658176951, - "learning_rate": 8.981168517624337e-06, - "loss": 4.5164, - "mean_token_accuracy": 0.2577620968222618, - "step": 930 - }, - { - "epoch": 0.04539637311955183, - "grad_norm": 5.882810812790242, - "learning_rate": 9.077740222114921e-06, - "loss": 4.4363, - "mean_token_accuracy": 0.27116935402154924, - "step": 940 - }, - { - "epoch": 0.045879313259121535, - "grad_norm": 5.11555968599847, - "learning_rate": 9.174311926605506e-06, - "loss": 4.4656, - "mean_token_accuracy": 0.26764112785458566, - "step": 950 - }, - { - "epoch": 0.04636225339869123, - "grad_norm": 5.590901293802474, - "learning_rate": 9.27088363109609e-06, - "loss": 4.4117, - "mean_token_accuracy": 0.26743951588869097, - "step": 960 - }, - { - "epoch": 0.04684519353826093, - "grad_norm": 5.647756025719848, - "learning_rate": 9.367455335586674e-06, - "loss": 4.5121, - "mean_token_accuracy": 0.260080648213625, - "step": 970 - }, - { - "epoch": 0.04732813367783063, - "grad_norm": 5.153246489988432, - "learning_rate": 9.464027040077259e-06, - "loss": 4.482, - "mean_token_accuracy": 0.2638104811310768, - "step": 980 - }, - { - "epoch": 0.04781107381740033, - "grad_norm": 5.47031302012429, - "learning_rate": 9.560598744567843e-06, - "loss": 4.432, - "mean_token_accuracy": 0.2661290317773819, - "step": 990 - }, - { - "epoch": 0.048294013956970035, - "grad_norm": 5.496662380685803, - "learning_rate": 9.657170449058426e-06, - "loss": 4.4934, - "mean_token_accuracy": 0.2628024198114872, - "step": 1000 - }, - { - "epoch": 0.048294013956970035, - "eval_runtime": 7.8503, - "eval_samples_per_second": 376.294, - "eval_steps_per_second": 23.566, - "step": 1000 - }, - { - "epoch": 0.04877695409653973, - "grad_norm": 5.415310676462476, - "learning_rate": 9.75374215354901e-06, - "loss": 4.3781, - "mean_token_accuracy": 0.2776209689676762, - "step": 1010 - }, - { - "epoch": 0.049259894236109436, - "grad_norm": 5.5124107735369465, - "learning_rate": 9.850313858039595e-06, - "loss": 4.4715, - "mean_token_accuracy": 0.27147177159786223, - "step": 1020 - }, - { - "epoch": 0.04974283437567913, - "grad_norm": 5.649516777672486, - "learning_rate": 9.94688556253018e-06, - "loss": 4.45, - "mean_token_accuracy": 0.27530241534113886, - "step": 1030 - }, - { - "epoch": 0.05022577451524884, - "grad_norm": 5.504346017846997, - "learning_rate": 1.0043457267020764e-05, - "loss": 4.4496, - "mean_token_accuracy": 0.2676411300897598, - "step": 1040 - }, - { - "epoch": 0.050708714654818535, - "grad_norm": 5.315690449608703, - "learning_rate": 1.0140028971511348e-05, - "loss": 4.5195, - "mean_token_accuracy": 0.26239918991923333, - "step": 1050 - }, - { - "epoch": 0.05119165479438824, - "grad_norm": 5.2296044318266635, - "learning_rate": 1.0236600676001933e-05, - "loss": 4.4371, - "mean_token_accuracy": 0.27227822244167327, - "step": 1060 - }, - { - "epoch": 0.051674594933957936, - "grad_norm": 5.403037573405085, - "learning_rate": 1.0333172380492516e-05, - "loss": 4.3746, - "mean_token_accuracy": 0.2784274183213711, - "step": 1070 - }, - { - "epoch": 0.05215753507352763, - "grad_norm": 5.199701390121784, - "learning_rate": 1.04297440849831e-05, - "loss": 4.4359, - "mean_token_accuracy": 0.27046370729804037, - "step": 1080 - }, - { - "epoch": 0.05264047521309734, - "grad_norm": 5.215482834477001, - "learning_rate": 1.0526315789473684e-05, - "loss": 4.4203, - "mean_token_accuracy": 0.2684475801885128, - "step": 1090 - }, - { - "epoch": 0.053123415352667035, - "grad_norm": 5.66227835348697, - "learning_rate": 1.0622887493964269e-05, - "loss": 4.4219, - "mean_token_accuracy": 0.27338709458708765, - "step": 1100 - }, - { - "epoch": 0.053123415352667035, - "eval_runtime": 7.7947, - "eval_samples_per_second": 378.975, - "eval_steps_per_second": 23.734, - "step": 1100 - }, - { - "epoch": 0.05360635549223674, - "grad_norm": 4.747092470280433, - "learning_rate": 1.0719459198454853e-05, - "loss": 4.3652, - "mean_token_accuracy": 0.27106855139136316, - "step": 1110 - }, - { - "epoch": 0.054089295631806436, - "grad_norm": 5.013979725224065, - "learning_rate": 1.0816030902945438e-05, - "loss": 4.402, - "mean_token_accuracy": 0.277318549156189, - "step": 1120 - }, - { - "epoch": 0.05457223577137614, - "grad_norm": 5.26115325559556, - "learning_rate": 1.0912602607436022e-05, - "loss": 4.4031, - "mean_token_accuracy": 0.2803427383303642, - "step": 1130 - }, - { - "epoch": 0.05505517591094584, - "grad_norm": 5.033659584260112, - "learning_rate": 1.1009174311926607e-05, - "loss": 4.4023, - "mean_token_accuracy": 0.27883064597845075, - "step": 1140 - }, - { - "epoch": 0.05553811605051554, - "grad_norm": 4.954391238616325, - "learning_rate": 1.1105746016417191e-05, - "loss": 4.4113, - "mean_token_accuracy": 0.2770161248743534, - "step": 1150 - }, - { - "epoch": 0.05602105619008524, - "grad_norm": 5.263067395585244, - "learning_rate": 1.1202317720907776e-05, - "loss": 4.3461, - "mean_token_accuracy": 0.2686491936445236, - "step": 1160 - }, - { - "epoch": 0.056503996329654936, - "grad_norm": 5.16759966644505, - "learning_rate": 1.1298889425398358e-05, - "loss": 4.432, - "mean_token_accuracy": 0.2745967745780945, - "step": 1170 - }, - { - "epoch": 0.05698693646922464, - "grad_norm": 5.21914041586483, - "learning_rate": 1.1395461129888943e-05, - "loss": 4.4094, - "mean_token_accuracy": 0.2734879031777382, - "step": 1180 - }, - { - "epoch": 0.05746987660879434, - "grad_norm": 4.972840372050854, - "learning_rate": 1.1492032834379527e-05, - "loss": 4.4453, - "mean_token_accuracy": 0.273689516633749, - "step": 1190 - }, - { - "epoch": 0.05795281674836404, - "grad_norm": 4.91837651816623, - "learning_rate": 1.1588604538870112e-05, - "loss": 4.3688, - "mean_token_accuracy": 0.27631047964096067, - "step": 1200 - }, - { - "epoch": 0.05795281674836404, - "eval_runtime": 7.7933, - "eval_samples_per_second": 379.046, - "eval_steps_per_second": 23.738, - "step": 1200 - }, - { - "epoch": 0.05843575688793374, - "grad_norm": 5.120258110700915, - "learning_rate": 1.1685176243360696e-05, - "loss": 4.393, - "mean_token_accuracy": 0.27358871027827264, - "step": 1210 - }, - { - "epoch": 0.05891869702750344, - "grad_norm": 4.715854893519893, - "learning_rate": 1.178174794785128e-05, - "loss": 4.3922, - "mean_token_accuracy": 0.26824596524238586, - "step": 1220 - }, - { - "epoch": 0.05940163716707314, - "grad_norm": 4.748569022412669, - "learning_rate": 1.1878319652341865e-05, - "loss": 4.4379, - "mean_token_accuracy": 0.26653225868940356, - "step": 1230 - }, - { - "epoch": 0.059884577306642844, - "grad_norm": 5.220648990494797, - "learning_rate": 1.197489135683245e-05, - "loss": 4.3531, - "mean_token_accuracy": 0.2740927390754223, - "step": 1240 - }, - { - "epoch": 0.06036751744621254, - "grad_norm": 5.028586477606002, - "learning_rate": 1.2071463061323034e-05, - "loss": 4.3551, - "mean_token_accuracy": 0.27772177308797835, - "step": 1250 - }, - { - "epoch": 0.06085045758578224, - "grad_norm": 5.068781345504261, - "learning_rate": 1.2168034765813617e-05, - "loss": 4.325, - "mean_token_accuracy": 0.2738911293447018, - "step": 1260 - }, - { - "epoch": 0.06133339772535194, - "grad_norm": 5.221080689633655, - "learning_rate": 1.2264606470304201e-05, - "loss": 4.3875, - "mean_token_accuracy": 0.27278225794434546, - "step": 1270 - }, - { - "epoch": 0.06181633786492164, - "grad_norm": 5.085764550248687, - "learning_rate": 1.2361178174794786e-05, - "loss": 4.3203, - "mean_token_accuracy": 0.2834677413105965, - "step": 1280 - }, - { - "epoch": 0.062299278004491344, - "grad_norm": 5.367307973698626, - "learning_rate": 1.245774987928537e-05, - "loss": 4.316, - "mean_token_accuracy": 0.28387096524238586, - "step": 1290 - }, - { - "epoch": 0.06278221814406104, - "grad_norm": 4.814088937557432, - "learning_rate": 1.2554321583775954e-05, - "loss": 4.4062, - "mean_token_accuracy": 0.27338709533214567, - "step": 1300 - }, - { - "epoch": 0.06278221814406104, - "eval_runtime": 7.7872, - "eval_samples_per_second": 379.342, - "eval_steps_per_second": 23.757, - "step": 1300 - }, - { - "epoch": 0.06326515828363075, - "grad_norm": 4.830576508274259, - "learning_rate": 1.2650893288266539e-05, - "loss": 4.3211, - "mean_token_accuracy": 0.28417338654398916, - "step": 1310 - }, - { - "epoch": 0.06374809842320045, - "grad_norm": 4.9332935687133785, - "learning_rate": 1.2747464992757123e-05, - "loss": 4.3812, - "mean_token_accuracy": 0.27560483366250993, - "step": 1320 - }, - { - "epoch": 0.06423103856277014, - "grad_norm": 5.4934363818373635, - "learning_rate": 1.2844036697247708e-05, - "loss": 4.3316, - "mean_token_accuracy": 0.2795362912118435, - "step": 1330 - }, - { - "epoch": 0.06471397870233984, - "grad_norm": 4.800260532111643, - "learning_rate": 1.2940608401738292e-05, - "loss": 4.3551, - "mean_token_accuracy": 0.280947582423687, - "step": 1340 - }, - { - "epoch": 0.06519691884190955, - "grad_norm": 4.880148264641701, - "learning_rate": 1.3037180106228875e-05, - "loss": 4.3531, - "mean_token_accuracy": 0.2753034979104996, - "step": 1350 - }, - { - "epoch": 0.06567985898147924, - "grad_norm": 4.808834621682347, - "learning_rate": 1.313375181071946e-05, - "loss": 4.3152, - "mean_token_accuracy": 0.27631048709154127, - "step": 1360 - }, - { - "epoch": 0.06616279912104894, - "grad_norm": 4.466947481484539, - "learning_rate": 1.3230323515210044e-05, - "loss": 4.4105, - "mean_token_accuracy": 0.2709677405655384, - "step": 1370 - }, - { - "epoch": 0.06664573926061865, - "grad_norm": 4.735459855718925, - "learning_rate": 1.3326895219700628e-05, - "loss": 4.309, - "mean_token_accuracy": 0.2811491943895817, - "step": 1380 - }, - { - "epoch": 0.06712867940018835, - "grad_norm": 4.515857419173914, - "learning_rate": 1.3423466924191213e-05, - "loss": 4.2199, - "mean_token_accuracy": 0.28780241683125496, - "step": 1390 - }, - { - "epoch": 0.06761161953975804, - "grad_norm": 4.750233472061156, - "learning_rate": 1.3520038628681797e-05, - "loss": 4.291, - "mean_token_accuracy": 0.2888104856014252, - "step": 1400 - }, - { - "epoch": 0.06761161953975804, - "eval_runtime": 7.8003, - "eval_samples_per_second": 378.703, - "eval_steps_per_second": 23.717, - "step": 1400 - }, - { - "epoch": 0.06809455967932775, - "grad_norm": 4.680421546574031, - "learning_rate": 1.3616610333172382e-05, - "loss": 4.3289, - "mean_token_accuracy": 0.27913306653499603, - "step": 1410 - }, - { - "epoch": 0.06857749981889745, - "grad_norm": 4.601224269285572, - "learning_rate": 1.3713182037662966e-05, - "loss": 4.3352, - "mean_token_accuracy": 0.2777217745780945, - "step": 1420 - }, - { - "epoch": 0.06906043995846715, - "grad_norm": 4.472689233093631, - "learning_rate": 1.380975374215355e-05, - "loss": 4.209, - "mean_token_accuracy": 0.2897177398204803, - "step": 1430 - }, - { - "epoch": 0.06954338009803684, - "grad_norm": 4.619177247391706, - "learning_rate": 1.3906325446644133e-05, - "loss": 4.284, - "mean_token_accuracy": 0.28467742130160334, - "step": 1440 - }, - { - "epoch": 0.07002632023760655, - "grad_norm": 4.799177042165004, - "learning_rate": 1.4002897151134718e-05, - "loss": 4.3965, - "mean_token_accuracy": 0.27217742130160333, - "step": 1450 - }, - { - "epoch": 0.07050926037717625, - "grad_norm": 4.393611048681592, - "learning_rate": 1.4099468855625302e-05, - "loss": 4.2645, - "mean_token_accuracy": 0.2820564515888691, - "step": 1460 - }, - { - "epoch": 0.07099220051674594, - "grad_norm": 4.454390565066716, - "learning_rate": 1.4196040560115887e-05, - "loss": 4.3277, - "mean_token_accuracy": 0.2783266119658947, - "step": 1470 - }, - { - "epoch": 0.07147514065631565, - "grad_norm": 4.199911067866371, - "learning_rate": 1.4292612264606471e-05, - "loss": 4.2387, - "mean_token_accuracy": 0.2902217753231525, - "step": 1480 - }, - { - "epoch": 0.07195808079588535, - "grad_norm": 4.438915347397052, - "learning_rate": 1.4389183969097056e-05, - "loss": 4.2637, - "mean_token_accuracy": 0.286995966732502, - "step": 1490 - }, - { - "epoch": 0.07244102093545506, - "grad_norm": 4.579843923204942, - "learning_rate": 1.448575567358764e-05, - "loss": 4.2477, - "mean_token_accuracy": 0.2926411271095276, - "step": 1500 - }, - { - "epoch": 0.07244102093545506, - "eval_runtime": 7.81, - "eval_samples_per_second": 378.231, - "eval_steps_per_second": 23.687, - "step": 1500 - }, - { - "epoch": 0.07292396107502475, - "grad_norm": 4.431824023740648, - "learning_rate": 1.4582327378078224e-05, - "loss": 4.2543, - "mean_token_accuracy": 0.285181450843811, - "step": 1510 - }, - { - "epoch": 0.07340690121459445, - "grad_norm": 4.510306609633914, - "learning_rate": 1.4678899082568809e-05, - "loss": 4.2793, - "mean_token_accuracy": 0.28054435551166534, - "step": 1520 - }, - { - "epoch": 0.07388984135416415, - "grad_norm": 4.365911038914563, - "learning_rate": 1.4775470787059393e-05, - "loss": 4.2566, - "mean_token_accuracy": 0.28346774354577065, - "step": 1530 - }, - { - "epoch": 0.07437278149373386, - "grad_norm": 4.088488922706824, - "learning_rate": 1.4872042491549976e-05, - "loss": 4.2594, - "mean_token_accuracy": 0.2848790302872658, - "step": 1540 - }, - { - "epoch": 0.07485572163330355, - "grad_norm": 4.120686586249447, - "learning_rate": 1.496861419604056e-05, - "loss": 4.3441, - "mean_token_accuracy": 0.28104838728904724, - "step": 1550 - }, - { - "epoch": 0.07533866177287325, - "grad_norm": 4.43368775520899, - "learning_rate": 1.5065185900531145e-05, - "loss": 4.3773, - "mean_token_accuracy": 0.2772177442908287, - "step": 1560 - }, - { - "epoch": 0.07582160191244296, - "grad_norm": 4.696316687482952, - "learning_rate": 1.516175760502173e-05, - "loss": 4.2699, - "mean_token_accuracy": 0.28266128823161124, - "step": 1570 - }, - { - "epoch": 0.07630454205201265, - "grad_norm": 4.258982028304477, - "learning_rate": 1.5258329309512314e-05, - "loss": 4.2465, - "mean_token_accuracy": 0.2902217723429203, - "step": 1580 - }, - { - "epoch": 0.07678748219158235, - "grad_norm": 4.183184611426825, - "learning_rate": 1.53549010140029e-05, - "loss": 4.2719, - "mean_token_accuracy": 0.281552417576313, - "step": 1590 - }, - { - "epoch": 0.07727042233115206, - "grad_norm": 4.475490851991573, - "learning_rate": 1.5451472718493484e-05, - "loss": 4.2766, - "mean_token_accuracy": 0.2819556452333927, - "step": 1600 - }, - { - "epoch": 0.07727042233115206, - "eval_runtime": 7.7922, - "eval_samples_per_second": 379.095, - "eval_steps_per_second": 23.742, - "step": 1600 - }, - { - "epoch": 0.07775336247072176, - "grad_norm": 4.195793070651388, - "learning_rate": 1.5548044422984067e-05, - "loss": 4.2336, - "mean_token_accuracy": 0.2904233880341053, - "step": 1610 - }, - { - "epoch": 0.07823630261029145, - "grad_norm": 4.280488621876795, - "learning_rate": 1.5644616127474653e-05, - "loss": 4.2367, - "mean_token_accuracy": 0.2879032239317894, - "step": 1620 - }, - { - "epoch": 0.07871924274986115, - "grad_norm": 4.247019183705438, - "learning_rate": 1.5741187831965236e-05, - "loss": 4.2648, - "mean_token_accuracy": 0.28840725868940353, - "step": 1630 - }, - { - "epoch": 0.07920218288943086, - "grad_norm": 4.07130581710393, - "learning_rate": 1.583775953645582e-05, - "loss": 4.2324, - "mean_token_accuracy": 0.2824596792459488, - "step": 1640 - }, - { - "epoch": 0.07968512302900055, - "grad_norm": 4.218784263208886, - "learning_rate": 1.5934331240946405e-05, - "loss": 4.2477, - "mean_token_accuracy": 0.28618951737880705, - "step": 1650 - }, - { - "epoch": 0.08016806316857025, - "grad_norm": 4.2590102178275755, - "learning_rate": 1.6030902945436988e-05, - "loss": 4.1945, - "mean_token_accuracy": 0.29002016186714175, - "step": 1660 - }, - { - "epoch": 0.08065100330813996, - "grad_norm": 4.022664270818587, - "learning_rate": 1.6127474649927574e-05, - "loss": 4.3285, - "mean_token_accuracy": 0.2779233880341053, - "step": 1670 - }, - { - "epoch": 0.08113394344770966, - "grad_norm": 3.8988764717241056, - "learning_rate": 1.6224046354418157e-05, - "loss": 4.2473, - "mean_token_accuracy": 0.2934475809335709, - "step": 1680 - }, - { - "epoch": 0.08161688358727935, - "grad_norm": 4.252351158534765, - "learning_rate": 1.6320618058908743e-05, - "loss": 4.2309, - "mean_token_accuracy": 0.2824596777558327, - "step": 1690 - }, - { - "epoch": 0.08209982372684906, - "grad_norm": 3.999145949805774, - "learning_rate": 1.6417189763399326e-05, - "loss": 4.2441, - "mean_token_accuracy": 0.2934475809335709, - "step": 1700 - }, - { - "epoch": 0.08209982372684906, - "eval_runtime": 7.766, - "eval_samples_per_second": 380.376, - "eval_steps_per_second": 23.822, - "step": 1700 - }, - { - "epoch": 0.08258276386641876, - "grad_norm": 4.492602153549056, - "learning_rate": 1.6513761467889912e-05, - "loss": 4.2141, - "mean_token_accuracy": 0.28719758093357084, - "step": 1710 - }, - { - "epoch": 0.08306570400598846, - "grad_norm": 3.961285953203282, - "learning_rate": 1.6610333172380494e-05, - "loss": 4.2934, - "mean_token_accuracy": 0.27923387214541434, - "step": 1720 - }, - { - "epoch": 0.08354864414555815, - "grad_norm": 4.236996974317345, - "learning_rate": 1.6706904876871077e-05, - "loss": 4.2449, - "mean_token_accuracy": 0.28639113157987595, - "step": 1730 - }, - { - "epoch": 0.08403158428512786, - "grad_norm": 3.9002420355103173, - "learning_rate": 1.6803476581361663e-05, - "loss": 4.2094, - "mean_token_accuracy": 0.2850806437432766, - "step": 1740 - }, - { - "epoch": 0.08451452442469756, - "grad_norm": 4.283672233634765, - "learning_rate": 1.6900048285852246e-05, - "loss": 4.2703, - "mean_token_accuracy": 0.2866935506463051, - "step": 1750 - }, - { - "epoch": 0.08499746456426725, - "grad_norm": 3.7545429095344107, - "learning_rate": 1.6996619990342832e-05, - "loss": 4.1707, - "mean_token_accuracy": 0.29284274131059645, - "step": 1760 - }, - { - "epoch": 0.08548040470383696, - "grad_norm": 4.064213012685202, - "learning_rate": 1.7093191694833415e-05, - "loss": 4.1891, - "mean_token_accuracy": 0.29143145456910136, - "step": 1770 - }, - { - "epoch": 0.08596334484340666, - "grad_norm": 4.158862971978716, - "learning_rate": 1.7189763399324e-05, - "loss": 4.2285, - "mean_token_accuracy": 0.2854838721454144, - "step": 1780 - }, - { - "epoch": 0.08644628498297637, - "grad_norm": 4.271117505211573, - "learning_rate": 1.7286335103814584e-05, - "loss": 4.2121, - "mean_token_accuracy": 0.29284274354577067, - "step": 1790 - }, - { - "epoch": 0.08692922512254606, - "grad_norm": 3.960333913637192, - "learning_rate": 1.738290680830517e-05, - "loss": 4.2281, - "mean_token_accuracy": 0.2903225809335709, - "step": 1800 - }, - { - "epoch": 0.08692922512254606, - "eval_runtime": 7.7754, - "eval_samples_per_second": 379.917, - "eval_steps_per_second": 23.793, - "step": 1800 - }, - { - "epoch": 0.08741216526211576, - "grad_norm": 3.9117115145614343, - "learning_rate": 1.7479478512795753e-05, - "loss": 4.1781, - "mean_token_accuracy": 0.2910282269120216, - "step": 1810 - }, - { - "epoch": 0.08789510540168546, - "grad_norm": 3.905931678714368, - "learning_rate": 1.7576050217286336e-05, - "loss": 4.2211, - "mean_token_accuracy": 0.2852822542190552, - "step": 1820 - }, - { - "epoch": 0.08837804554125517, - "grad_norm": 4.0562310282381535, - "learning_rate": 1.7672621921776922e-05, - "loss": 4.1496, - "mean_token_accuracy": 0.284375, - "step": 1830 - }, - { - "epoch": 0.08886098568082486, - "grad_norm": 3.930219040624092, - "learning_rate": 1.7769193626267504e-05, - "loss": 4.2453, - "mean_token_accuracy": 0.28830645233392715, - "step": 1840 - }, - { - "epoch": 0.08934392582039456, - "grad_norm": 3.7955300751416616, - "learning_rate": 1.786576533075809e-05, - "loss": 4.2336, - "mean_token_accuracy": 0.28578629046678544, - "step": 1850 - }, - { - "epoch": 0.08982686595996427, - "grad_norm": 3.827481874435193, - "learning_rate": 1.7962337035248673e-05, - "loss": 4.1898, - "mean_token_accuracy": 0.2917338728904724, - "step": 1860 - }, - { - "epoch": 0.09030980609953396, - "grad_norm": 3.96875879591722, - "learning_rate": 1.805890873973926e-05, - "loss": 4.1773, - "mean_token_accuracy": 0.2904233902692795, - "step": 1870 - }, - { - "epoch": 0.09079274623910366, - "grad_norm": 4.469603855470973, - "learning_rate": 1.8155480444229842e-05, - "loss": 4.3141, - "mean_token_accuracy": 0.28276209607720376, - "step": 1880 - }, - { - "epoch": 0.09127568637867337, - "grad_norm": 3.8364012516557833, - "learning_rate": 1.825205214872043e-05, - "loss": 4.2082, - "mean_token_accuracy": 0.2919354811310768, - "step": 1890 - }, - { - "epoch": 0.09175862651824307, - "grad_norm": 3.789969887449224, - "learning_rate": 1.834862385321101e-05, - "loss": 4.1961, - "mean_token_accuracy": 0.29455645084381105, - "step": 1900 - }, - { - "epoch": 0.09175862651824307, - "eval_runtime": 7.7993, - "eval_samples_per_second": 378.752, - "eval_steps_per_second": 23.72, - "step": 1900 - }, - { - "epoch": 0.09224156665781276, - "grad_norm": 4.1914990070207665, - "learning_rate": 1.8445195557701594e-05, - "loss": 4.1949, - "mean_token_accuracy": 0.292943549156189, - "step": 1910 - }, - { - "epoch": 0.09272450679738246, - "grad_norm": 3.796855649734539, - "learning_rate": 1.854176726219218e-05, - "loss": 4.2289, - "mean_token_accuracy": 0.28387096896767616, - "step": 1920 - }, - { - "epoch": 0.09320744693695217, - "grad_norm": 3.7432314938765665, - "learning_rate": 1.8638338966682763e-05, - "loss": 4.2934, - "mean_token_accuracy": 0.28387096971273423, - "step": 1930 - }, - { - "epoch": 0.09369038707652186, - "grad_norm": 3.8498893083454835, - "learning_rate": 1.873491067117335e-05, - "loss": 4.1922, - "mean_token_accuracy": 0.29002015963196753, - "step": 1940 - }, - { - "epoch": 0.09417332721609156, - "grad_norm": 3.8439065882660004, - "learning_rate": 1.883148237566393e-05, - "loss": 4.1785, - "mean_token_accuracy": 0.28729838579893113, - "step": 1950 - }, - { - "epoch": 0.09465626735566127, - "grad_norm": 3.8852014685722884, - "learning_rate": 1.8928054080154518e-05, - "loss": 4.2266, - "mean_token_accuracy": 0.2824596770107746, - "step": 1960 - }, - { - "epoch": 0.09513920749523097, - "grad_norm": 3.679777948886876, - "learning_rate": 1.90246257846451e-05, - "loss": 4.2246, - "mean_token_accuracy": 0.28931451588869095, - "step": 1970 - }, - { - "epoch": 0.09562214763480066, - "grad_norm": 3.729761142862288, - "learning_rate": 1.9121197489135687e-05, - "loss": 4.2203, - "mean_token_accuracy": 0.2883064493536949, - "step": 1980 - }, - { - "epoch": 0.09610508777437037, - "grad_norm": 3.981413827863695, - "learning_rate": 1.921776919362627e-05, - "loss": 4.1734, - "mean_token_accuracy": 0.29052419364452364, - "step": 1990 - }, - { - "epoch": 0.09658802791394007, - "grad_norm": 3.689671413395015, - "learning_rate": 1.9314340898116852e-05, - "loss": 4.1859, - "mean_token_accuracy": 0.28558467552065847, - "step": 2000 - }, - { - "epoch": 0.09658802791394007, - "eval_runtime": 7.8265, - "eval_samples_per_second": 377.433, - "eval_steps_per_second": 23.638, - "step": 2000 - }, - { - "epoch": 0.09707096805350977, - "grad_norm": 3.3695174549915956, - "learning_rate": 1.941091260260744e-05, - "loss": 4.1332, - "mean_token_accuracy": 0.2949596747756004, - "step": 2010 - }, - { - "epoch": 0.09755390819307946, - "grad_norm": 3.824944050520389, - "learning_rate": 1.950748430709802e-05, - "loss": 4.1762, - "mean_token_accuracy": 0.2873991928994656, - "step": 2020 - }, - { - "epoch": 0.09803684833264917, - "grad_norm": 3.770968937762511, - "learning_rate": 1.9604056011588607e-05, - "loss": 4.2035, - "mean_token_accuracy": 0.2889112919569016, - "step": 2030 - }, - { - "epoch": 0.09851978847221887, - "grad_norm": 3.6748457634050813, - "learning_rate": 1.970062771607919e-05, - "loss": 4.1648, - "mean_token_accuracy": 0.29838709682226183, - "step": 2040 - }, - { - "epoch": 0.09900272861178856, - "grad_norm": 3.4417568990801595, - "learning_rate": 1.9797199420569776e-05, - "loss": 4.1871, - "mean_token_accuracy": 0.2861895151436329, - "step": 2050 - }, - { - "epoch": 0.09948566875135827, - "grad_norm": 3.5192581295855567, - "learning_rate": 1.989377112506036e-05, - "loss": 4.1605, - "mean_token_accuracy": 0.29586693495512006, - "step": 2060 - }, - { - "epoch": 0.09996860889092797, - "grad_norm": 3.564576590092872, - "learning_rate": 1.9990342829550945e-05, - "loss": 4.0996, - "mean_token_accuracy": 0.2984879031777382, - "step": 2070 - }, - { - "epoch": 0.10045154903049767, - "grad_norm": 3.404975178767239, - "learning_rate": 1.9999988489454894e-05, - "loss": 4.216, - "mean_token_accuracy": 0.29022177383303643, - "step": 2080 - }, - { - "epoch": 0.10093448917006737, - "grad_norm": 3.496957526602745, - "learning_rate": 1.999994869995027e-05, - "loss": 4.1328, - "mean_token_accuracy": 0.2946572571992874, - "step": 2090 - }, - { - "epoch": 0.10141742930963707, - "grad_norm": 3.4605287084179563, - "learning_rate": 1.9999880489493693e-05, - "loss": 4.2566, - "mean_token_accuracy": 0.28326612934470174, - "step": 2100 - }, - { - "epoch": 0.10141742930963707, - "eval_runtime": 7.789, - "eval_samples_per_second": 379.252, - "eval_steps_per_second": 23.751, - "step": 2100 - }, - { - "epoch": 0.10190036944920677, - "grad_norm": 3.483317885353719, - "learning_rate": 1.999978385827903e-05, - "loss": 4.1437, - "mean_token_accuracy": 0.2941532261669636, - "step": 2110 - }, - { - "epoch": 0.10238330958877648, - "grad_norm": 3.5647520449079333, - "learning_rate": 1.9999658806580906e-05, - "loss": 4.2262, - "mean_token_accuracy": 0.290423384308815, - "step": 2120 - }, - { - "epoch": 0.10286624972834617, - "grad_norm": 3.7643349384881755, - "learning_rate": 1.9999505334754743e-05, - "loss": 4.157, - "mean_token_accuracy": 0.28901209533214567, - "step": 2130 - }, - { - "epoch": 0.10334918986791587, - "grad_norm": 3.2285226338348383, - "learning_rate": 1.999932344323672e-05, - "loss": 4.1895, - "mean_token_accuracy": 0.2916330635547638, - "step": 2140 - }, - { - "epoch": 0.10383213000748558, - "grad_norm": 3.5042926196337785, - "learning_rate": 1.9999113132543795e-05, - "loss": 4.1281, - "mean_token_accuracy": 0.2879032239317894, - "step": 2150 - }, - { - "epoch": 0.10431507014705527, - "grad_norm": 3.253912585958381, - "learning_rate": 1.999887440327369e-05, - "loss": 4.1699, - "mean_token_accuracy": 0.29606854915618896, - "step": 2160 - }, - { - "epoch": 0.10479801028662497, - "grad_norm": 3.472984439660169, - "learning_rate": 1.9998607256104902e-05, - "loss": 4.2035, - "mean_token_accuracy": 0.288306450843811, - "step": 2170 - }, - { - "epoch": 0.10528095042619467, - "grad_norm": 3.331946274518246, - "learning_rate": 1.9998311691796695e-05, - "loss": 4.1824, - "mean_token_accuracy": 0.2936491914093494, - "step": 2180 - }, - { - "epoch": 0.10576389056576438, - "grad_norm": 3.3809721423831878, - "learning_rate": 1.9997987711189088e-05, - "loss": 4.1719, - "mean_token_accuracy": 0.29324596375226974, - "step": 2190 - }, - { - "epoch": 0.10624683070533407, - "grad_norm": 3.4062492330108185, - "learning_rate": 1.9997635315202876e-05, - "loss": 4.2039, - "mean_token_accuracy": 0.29022177308797836, - "step": 2200 - }, - { - "epoch": 0.10624683070533407, - "eval_runtime": 7.8107, - "eval_samples_per_second": 378.2, - "eval_steps_per_second": 23.685, - "step": 2200 - }, - { - "epoch": 0.10672977084490377, - "grad_norm": 3.343433237267684, - "learning_rate": 1.999725450483961e-05, - "loss": 4.1156, - "mean_token_accuracy": 0.29495968073606493, - "step": 2210 - }, - { - "epoch": 0.10721271098447348, - "grad_norm": 3.3718022631206317, - "learning_rate": 1.9996845281181583e-05, - "loss": 4.1953, - "mean_token_accuracy": 0.28971774131059647, - "step": 2220 - }, - { - "epoch": 0.10769565112404317, - "grad_norm": 3.33376451036372, - "learning_rate": 1.9996407645391868e-05, - "loss": 4.1168, - "mean_token_accuracy": 0.29606854617595674, - "step": 2230 - }, - { - "epoch": 0.10817859126361287, - "grad_norm": 3.2845288968838577, - "learning_rate": 1.9995941598714263e-05, - "loss": 4.1551, - "mean_token_accuracy": 0.293951615691185, - "step": 2240 - }, - { - "epoch": 0.10866153140318258, - "grad_norm": 3.3637315136255124, - "learning_rate": 1.9995447142473327e-05, - "loss": 4.1141, - "mean_token_accuracy": 0.29868951439857483, - "step": 2250 - }, - { - "epoch": 0.10914447154275228, - "grad_norm": 3.4123615176518225, - "learning_rate": 1.999492427807436e-05, - "loss": 4.1992, - "mean_token_accuracy": 0.30312499701976775, - "step": 2260 - }, - { - "epoch": 0.10962741168232197, - "grad_norm": 3.3718098621564194, - "learning_rate": 1.99943730070034e-05, - "loss": 4.0992, - "mean_token_accuracy": 0.301008066534996, - "step": 2270 - }, - { - "epoch": 0.11011035182189167, - "grad_norm": 3.3948366801832646, - "learning_rate": 1.999379333082722e-05, - "loss": 4.1141, - "mean_token_accuracy": 0.30120967477560046, - "step": 2280 - }, - { - "epoch": 0.11059329196146138, - "grad_norm": 3.3914235079525676, - "learning_rate": 1.999318525119332e-05, - "loss": 4.2668, - "mean_token_accuracy": 0.29092741906642916, - "step": 2290 - }, - { - "epoch": 0.11107623210103108, - "grad_norm": 3.205971200422679, - "learning_rate": 1.9992548769829933e-05, - "loss": 4.1477, - "mean_token_accuracy": 0.2950604870915413, - "step": 2300 - }, - { - "epoch": 0.11107623210103108, - "eval_runtime": 7.8009, - "eval_samples_per_second": 378.673, - "eval_steps_per_second": 23.715, - "step": 2300 - }, - { - "epoch": 0.11155917224060077, - "grad_norm": 3.393777518949162, - "learning_rate": 1.999188388854601e-05, - "loss": 4.132, - "mean_token_accuracy": 0.29082661047577857, - "step": 2310 - }, - { - "epoch": 0.11204211238017048, - "grad_norm": 3.094440841604228, - "learning_rate": 1.9991190609231214e-05, - "loss": 4.1207, - "mean_token_accuracy": 0.2965725839138031, - "step": 2320 - }, - { - "epoch": 0.11252505251974018, - "grad_norm": 3.3479123424365915, - "learning_rate": 1.999046893385592e-05, - "loss": 4.1105, - "mean_token_accuracy": 0.2955645158886909, - "step": 2330 - }, - { - "epoch": 0.11300799265930987, - "grad_norm": 3.0668023779433766, - "learning_rate": 1.998971886447121e-05, - "loss": 4.1785, - "mean_token_accuracy": 0.28941532522439956, - "step": 2340 - }, - { - "epoch": 0.11349093279887958, - "grad_norm": 3.143968664214931, - "learning_rate": 1.998894040320886e-05, - "loss": 4.1613, - "mean_token_accuracy": 0.291633065789938, - "step": 2350 - }, - { - "epoch": 0.11397387293844928, - "grad_norm": 3.1427389765606955, - "learning_rate": 1.9988133552281348e-05, - "loss": 4.1406, - "mean_token_accuracy": 0.2945564478635788, - "step": 2360 - }, - { - "epoch": 0.11445681307801898, - "grad_norm": 3.2771465720865285, - "learning_rate": 1.998729831398183e-05, - "loss": 4.1137, - "mean_token_accuracy": 0.2974798411130905, - "step": 2370 - }, - { - "epoch": 0.11493975321758867, - "grad_norm": 3.4001197997830976, - "learning_rate": 1.998643469068415e-05, - "loss": 4.1777, - "mean_token_accuracy": 0.28921371400356294, - "step": 2380 - }, - { - "epoch": 0.11542269335715838, - "grad_norm": 3.126329774194059, - "learning_rate": 1.9985542684842813e-05, - "loss": 4.1, - "mean_token_accuracy": 0.29294354766607283, - "step": 2390 - }, - { - "epoch": 0.11590563349672808, - "grad_norm": 3.1907225391859932, - "learning_rate": 1.9984622298992996e-05, - "loss": 4.0848, - "mean_token_accuracy": 0.29758064448833466, - "step": 2400 - }, - { - "epoch": 0.11590563349672808, - "eval_runtime": 7.7913, - "eval_samples_per_second": 379.139, - "eval_steps_per_second": 23.744, - "step": 2400 - }, - { - "epoch": 0.11638857363629779, - "grad_norm": 3.134878526360008, - "learning_rate": 1.9983673535750547e-05, - "loss": 4.1465, - "mean_token_accuracy": 0.2937500014901161, - "step": 2410 - }, - { - "epoch": 0.11687151377586748, - "grad_norm": 3.236939441567008, - "learning_rate": 1.9982696397811944e-05, - "loss": 4.2105, - "mean_token_accuracy": 0.2894153207540512, - "step": 2420 - }, - { - "epoch": 0.11735445391543718, - "grad_norm": 3.084502916032764, - "learning_rate": 1.998169088795433e-05, - "loss": 4.1746, - "mean_token_accuracy": 0.2940524220466614, - "step": 2430 - }, - { - "epoch": 0.11783739405500689, - "grad_norm": 3.060352329478195, - "learning_rate": 1.998065700903547e-05, - "loss": 4.1437, - "mean_token_accuracy": 0.2989919319748878, - "step": 2440 - }, - { - "epoch": 0.11832033419457658, - "grad_norm": 3.1678451476640643, - "learning_rate": 1.997959476399376e-05, - "loss": 4.1527, - "mean_token_accuracy": 0.29425403028726577, - "step": 2450 - }, - { - "epoch": 0.11880327433414628, - "grad_norm": 3.094998419824479, - "learning_rate": 1.9978504155848217e-05, - "loss": 4.2156, - "mean_token_accuracy": 0.2966733887791634, - "step": 2460 - }, - { - "epoch": 0.11928621447371598, - "grad_norm": 3.2866241351872123, - "learning_rate": 1.997738518769847e-05, - "loss": 4.2145, - "mean_token_accuracy": 0.29163306653499604, - "step": 2470 - }, - { - "epoch": 0.11976915461328569, - "grad_norm": 3.3145211795801326, - "learning_rate": 1.9976237862724752e-05, - "loss": 4.2016, - "mean_token_accuracy": 0.28296371176838875, - "step": 2480 - }, - { - "epoch": 0.12025209475285538, - "grad_norm": 2.9865965006326918, - "learning_rate": 1.9975062184187884e-05, - "loss": 4.1645, - "mean_token_accuracy": 0.2926411278545856, - "step": 2490 - }, - { - "epoch": 0.12073503489242508, - "grad_norm": 3.149402413863351, - "learning_rate": 1.9973858155429272e-05, - "loss": 4.1547, - "mean_token_accuracy": 0.29475806280970573, - "step": 2500 - }, - { - "epoch": 0.12073503489242508, - "eval_runtime": 7.7765, - "eval_samples_per_second": 379.863, - "eval_steps_per_second": 23.79, - "step": 2500 - }, - { - "epoch": 0.12121797503199479, - "grad_norm": 3.053530624006074, - "learning_rate": 1.99726257798709e-05, - "loss": 4.1641, - "mean_token_accuracy": 0.2897177398204803, - "step": 2510 - }, - { - "epoch": 0.12170091517156448, - "grad_norm": 2.818520679482681, - "learning_rate": 1.9971365061015314e-05, - "loss": 4.1824, - "mean_token_accuracy": 0.29042338877916335, - "step": 2520 - }, - { - "epoch": 0.12218385531113418, - "grad_norm": 3.0383887925537554, - "learning_rate": 1.9970076002445616e-05, - "loss": 4.1348, - "mean_token_accuracy": 0.2960685446858406, - "step": 2530 - }, - { - "epoch": 0.12266679545070389, - "grad_norm": 3.0440103948940167, - "learning_rate": 1.9968758607825455e-05, - "loss": 4.1137, - "mean_token_accuracy": 0.29667339026927947, - "step": 2540 - }, - { - "epoch": 0.12314973559027359, - "grad_norm": 3.137336762278146, - "learning_rate": 1.9967412880899002e-05, - "loss": 4.152, - "mean_token_accuracy": 0.28568548560142515, - "step": 2550 - }, - { - "epoch": 0.12363267572984328, - "grad_norm": 3.2256735052882033, - "learning_rate": 1.996603882549097e-05, - "loss": 4.1059, - "mean_token_accuracy": 0.29092742055654525, - "step": 2560 - }, - { - "epoch": 0.12411561586941298, - "grad_norm": 2.810525058032397, - "learning_rate": 1.9964636445506567e-05, - "loss": 4.1738, - "mean_token_accuracy": 0.2917338714003563, - "step": 2570 - }, - { - "epoch": 0.12459855600898269, - "grad_norm": 3.0368595802556615, - "learning_rate": 1.996320574493152e-05, - "loss": 4.1023, - "mean_token_accuracy": 0.2969758063554764, - "step": 2580 - }, - { - "epoch": 0.12508149614855238, - "grad_norm": 3.107936710751673, - "learning_rate": 1.9961746727832035e-05, - "loss": 4.1766, - "mean_token_accuracy": 0.2967741936445236, - "step": 2590 - }, - { - "epoch": 0.12556443628812208, - "grad_norm": 3.1934944513060377, - "learning_rate": 1.996025939835479e-05, - "loss": 4.1219, - "mean_token_accuracy": 0.3009072571992874, - "step": 2600 - }, - { - "epoch": 0.12556443628812208, - "eval_runtime": 7.7742, - "eval_samples_per_second": 379.974, - "eval_steps_per_second": 23.797, - "step": 2600 - }, - { - "epoch": 0.1260473764276918, - "grad_norm": 2.9318511625725474, - "learning_rate": 1.995874376072695e-05, - "loss": 4.1039, - "mean_token_accuracy": 0.29314516112208366, - "step": 2610 - }, - { - "epoch": 0.1265303165672615, - "grad_norm": 2.9737147117589364, - "learning_rate": 1.9957199819256114e-05, - "loss": 4.1355, - "mean_token_accuracy": 0.2991935461759567, - "step": 2620 - }, - { - "epoch": 0.1270132567068312, - "grad_norm": 2.994991653253815, - "learning_rate": 1.9955627578330342e-05, - "loss": 4.1125, - "mean_token_accuracy": 0.29899193346500397, - "step": 2630 - }, - { - "epoch": 0.1274961968464009, - "grad_norm": 3.141545019029578, - "learning_rate": 1.995402704241811e-05, - "loss": 4.1816, - "mean_token_accuracy": 0.29324597045779227, - "step": 2640 - }, - { - "epoch": 0.12797913698597058, - "grad_norm": 3.1399846562592284, - "learning_rate": 1.9952398216068313e-05, - "loss": 4.1461, - "mean_token_accuracy": 0.29395161271095277, - "step": 2650 - }, - { - "epoch": 0.12846207712554028, - "grad_norm": 3.2048341512162204, - "learning_rate": 1.9950741103910266e-05, - "loss": 4.0828, - "mean_token_accuracy": 0.30231854766607286, - "step": 2660 - }, - { - "epoch": 0.12894501726510998, - "grad_norm": 3.1432823321613714, - "learning_rate": 1.9949055710653652e-05, - "loss": 4.1328, - "mean_token_accuracy": 0.3035282254219055, - "step": 2670 - }, - { - "epoch": 0.1294279574046797, - "grad_norm": 3.0971336034072174, - "learning_rate": 1.9947342041088548e-05, - "loss": 4.1203, - "mean_token_accuracy": 0.29374999925494194, - "step": 2680 - }, - { - "epoch": 0.1299108975442494, - "grad_norm": 2.8419491657941887, - "learning_rate": 1.9945600100085394e-05, - "loss": 4.2234, - "mean_token_accuracy": 0.29052419364452364, - "step": 2690 - }, - { - "epoch": 0.1303938376838191, - "grad_norm": 2.98093428485918, - "learning_rate": 1.9943829892594975e-05, - "loss": 4.0855, - "mean_token_accuracy": 0.2977822571992874, - "step": 2700 - }, - { - "epoch": 0.1303938376838191, - "eval_runtime": 7.7829, - "eval_samples_per_second": 379.55, - "eval_steps_per_second": 23.77, - "step": 2700 - }, - { - "epoch": 0.1308767778233888, - "grad_norm": 3.0655424493867027, - "learning_rate": 1.9942031423648412e-05, - "loss": 4.1461, - "mean_token_accuracy": 0.29586693495512006, - "step": 2710 - }, - { - "epoch": 0.13135971796295848, - "grad_norm": 2.917395046685627, - "learning_rate": 1.9940204698357157e-05, - "loss": 4.0898, - "mean_token_accuracy": 0.30181451588869096, - "step": 2720 - }, - { - "epoch": 0.13184265810252818, - "grad_norm": 2.780683782637958, - "learning_rate": 1.993834972191296e-05, - "loss": 4.1449, - "mean_token_accuracy": 0.29334677159786227, - "step": 2730 - }, - { - "epoch": 0.13232559824209789, - "grad_norm": 2.780131520760637, - "learning_rate": 1.9936466499587867e-05, - "loss": 4.1094, - "mean_token_accuracy": 0.2979838714003563, - "step": 2740 - }, - { - "epoch": 0.1328085383816676, - "grad_norm": 2.894335714716186, - "learning_rate": 1.9934555036734204e-05, - "loss": 4.0707, - "mean_token_accuracy": 0.29848790168762207, - "step": 2750 - }, - { - "epoch": 0.1332914785212373, - "grad_norm": 2.8123238985658383, - "learning_rate": 1.9932615338784563e-05, - "loss": 4.1219, - "mean_token_accuracy": 0.2932459652423859, - "step": 2760 - }, - { - "epoch": 0.133774418660807, - "grad_norm": 3.1669770335511807, - "learning_rate": 1.993064741125177e-05, - "loss": 4.1277, - "mean_token_accuracy": 0.28961693644523623, - "step": 2770 - }, - { - "epoch": 0.1342573588003767, - "grad_norm": 2.8788532089749053, - "learning_rate": 1.9928651259728895e-05, - "loss": 4.1324, - "mean_token_accuracy": 0.2976814493536949, - "step": 2780 - }, - { - "epoch": 0.1347402989399464, - "grad_norm": 2.902731924766266, - "learning_rate": 1.992662688988922e-05, - "loss": 4.1277, - "mean_token_accuracy": 0.29949596896767616, - "step": 2790 - }, - { - "epoch": 0.13522323907951608, - "grad_norm": 2.9026165656263068, - "learning_rate": 1.9924574307486226e-05, - "loss": 4.0316, - "mean_token_accuracy": 0.30332661271095274, - "step": 2800 - }, - { - "epoch": 0.13522323907951608, - "eval_runtime": 7.7959, - "eval_samples_per_second": 378.917, - "eval_steps_per_second": 23.73, - "step": 2800 - }, - { - "epoch": 0.1357061792190858, - "grad_norm": 2.959141394406022, - "learning_rate": 1.992249351835358e-05, - "loss": 4.116, - "mean_token_accuracy": 0.296572582423687, - "step": 2810 - }, - { - "epoch": 0.1361891193586555, - "grad_norm": 2.9565526284260835, - "learning_rate": 1.992038452840511e-05, - "loss": 4.132, - "mean_token_accuracy": 0.2971774205565453, - "step": 2820 - }, - { - "epoch": 0.1366720594982252, - "grad_norm": 3.0187909126734094, - "learning_rate": 1.9918247343634792e-05, - "loss": 4.0859, - "mean_token_accuracy": 0.30312499701976775, - "step": 2830 - }, - { - "epoch": 0.1371549996377949, - "grad_norm": 3.2696135134499875, - "learning_rate": 1.9916081970116754e-05, - "loss": 4.15, - "mean_token_accuracy": 0.2948588699102402, - "step": 2840 - }, - { - "epoch": 0.1376379397773646, - "grad_norm": 3.110202672973678, - "learning_rate": 1.991388841400521e-05, - "loss": 4.0668, - "mean_token_accuracy": 0.3077620968222618, - "step": 2850 - }, - { - "epoch": 0.1381208799169343, - "grad_norm": 3.0364024561026004, - "learning_rate": 1.9911666681534498e-05, - "loss": 4.1109, - "mean_token_accuracy": 0.30060483813285827, - "step": 2860 - }, - { - "epoch": 0.13860382005650398, - "grad_norm": 2.8116312935370122, - "learning_rate": 1.990941677901902e-05, - "loss": 4.0707, - "mean_token_accuracy": 0.29868951737880706, - "step": 2870 - }, - { - "epoch": 0.1390867601960737, - "grad_norm": 2.8181773456766335, - "learning_rate": 1.9907138712853247e-05, - "loss": 4.1582, - "mean_token_accuracy": 0.2958669364452362, - "step": 2880 - }, - { - "epoch": 0.1395697003356434, - "grad_norm": 2.72404271407935, - "learning_rate": 1.9904832489511694e-05, - "loss": 4.0938, - "mean_token_accuracy": 0.2978830635547638, - "step": 2890 - }, - { - "epoch": 0.1400526404752131, - "grad_norm": 2.8388884078244674, - "learning_rate": 1.99024981155489e-05, - "loss": 4.1098, - "mean_token_accuracy": 0.29707661271095276, - "step": 2900 - }, - { - "epoch": 0.1400526404752131, - "eval_runtime": 7.7975, - "eval_samples_per_second": 378.84, - "eval_steps_per_second": 23.726, - "step": 2900 - }, - { - "epoch": 0.1405355806147828, - "grad_norm": 2.7411410608071587, - "learning_rate": 1.9900135597599412e-05, - "loss": 4.0812, - "mean_token_accuracy": 0.30100806802511215, - "step": 2910 - }, - { - "epoch": 0.1410185207543525, - "grad_norm": 2.74663069342885, - "learning_rate": 1.989774494237777e-05, - "loss": 4.1445, - "mean_token_accuracy": 0.29445564597845075, - "step": 2920 - }, - { - "epoch": 0.1415014608939222, - "grad_norm": 2.953799847289113, - "learning_rate": 1.9895326156678466e-05, - "loss": 4.1664, - "mean_token_accuracy": 0.2919354811310768, - "step": 2930 - }, - { - "epoch": 0.14198440103349189, - "grad_norm": 2.8073107926018626, - "learning_rate": 1.989287924737597e-05, - "loss": 4.1949, - "mean_token_accuracy": 0.288306450098753, - "step": 2940 - }, - { - "epoch": 0.1424673411730616, - "grad_norm": 2.743598961280255, - "learning_rate": 1.9890404221424658e-05, - "loss": 4.109, - "mean_token_accuracy": 0.2983870953321457, - "step": 2950 - }, - { - "epoch": 0.1429502813126313, - "grad_norm": 3.0012326335675636, - "learning_rate": 1.9887901085858826e-05, - "loss": 4.0977, - "mean_token_accuracy": 0.29838709682226183, - "step": 2960 - }, - { - "epoch": 0.143433221452201, - "grad_norm": 2.8868559864713546, - "learning_rate": 1.988536984779266e-05, - "loss": 3.9898, - "mean_token_accuracy": 0.30514112710952757, - "step": 2970 - }, - { - "epoch": 0.1439161615917707, - "grad_norm": 3.2907404789514145, - "learning_rate": 1.9882810514420222e-05, - "loss": 4.1109, - "mean_token_accuracy": 0.29707661718130113, - "step": 2980 - }, - { - "epoch": 0.1443991017313404, - "grad_norm": 2.724490848885843, - "learning_rate": 1.988022309301541e-05, - "loss": 4.0551, - "mean_token_accuracy": 0.3006048396229744, - "step": 2990 - }, - { - "epoch": 0.1448820418709101, - "grad_norm": 2.7436759004887366, - "learning_rate": 1.9877607590931966e-05, - "loss": 4.1488, - "mean_token_accuracy": 0.2912298396229744, - "step": 3000 - }, - { - "epoch": 0.1448820418709101, - "eval_runtime": 7.7911, - "eval_samples_per_second": 379.152, - "eval_steps_per_second": 23.745, - "step": 3000 - }, - { - "epoch": 0.1453649820104798, - "grad_norm": 2.692265011369886, - "learning_rate": 1.987496401560343e-05, - "loss": 4.1621, - "mean_token_accuracy": 0.29082661122083664, - "step": 3010 - }, - { - "epoch": 0.1458479221500495, - "grad_norm": 2.7831114038649365, - "learning_rate": 1.9872292374543137e-05, - "loss": 4.0285, - "mean_token_accuracy": 0.30302419513463974, - "step": 3020 - }, - { - "epoch": 0.1463308622896192, - "grad_norm": 2.8814211491029686, - "learning_rate": 1.9869592675344185e-05, - "loss": 4.0773, - "mean_token_accuracy": 0.29979838877916337, - "step": 3030 - }, - { - "epoch": 0.1468138024291889, - "grad_norm": 2.884743053840664, - "learning_rate": 1.9866864925679407e-05, - "loss": 4.0977, - "mean_token_accuracy": 0.294052417576313, - "step": 3040 - }, - { - "epoch": 0.1472967425687586, - "grad_norm": 2.654526475003225, - "learning_rate": 1.9864109133301376e-05, - "loss": 4.123, - "mean_token_accuracy": 0.2966733887791634, - "step": 3050 - }, - { - "epoch": 0.1477796827083283, - "grad_norm": 2.9486897935026013, - "learning_rate": 1.9861325306042352e-05, - "loss": 4.1078, - "mean_token_accuracy": 0.2957661285996437, - "step": 3060 - }, - { - "epoch": 0.148262622847898, - "grad_norm": 2.634288213557304, - "learning_rate": 1.9858513451814278e-05, - "loss": 4.1004, - "mean_token_accuracy": 0.29687499850988386, - "step": 3070 - }, - { - "epoch": 0.14874556298746772, - "grad_norm": 2.941275199589005, - "learning_rate": 1.9855673578608755e-05, - "loss": 4.082, - "mean_token_accuracy": 0.3023185521364212, - "step": 3080 - }, - { - "epoch": 0.1492285031270374, - "grad_norm": 2.8403359075616947, - "learning_rate": 1.9852805694497012e-05, - "loss": 4.0883, - "mean_token_accuracy": 0.30131048560142515, - "step": 3090 - }, - { - "epoch": 0.1497114432666071, - "grad_norm": 2.8047399921946536, - "learning_rate": 1.9849909807629892e-05, - "loss": 4.1461, - "mean_token_accuracy": 0.29848790615797044, - "step": 3100 - }, - { - "epoch": 0.1497114432666071, - "eval_runtime": 7.7828, - "eval_samples_per_second": 379.553, - "eval_steps_per_second": 23.77, - "step": 3100 - }, - { - "epoch": 0.1501943834061768, - "grad_norm": 2.744187832623059, - "learning_rate": 1.984698592623782e-05, - "loss": 4.184, - "mean_token_accuracy": 0.28760080486536027, - "step": 3110 - }, - { - "epoch": 0.1506773235457465, - "grad_norm": 2.746691952115286, - "learning_rate": 1.9844034058630795e-05, - "loss": 4.1148, - "mean_token_accuracy": 0.2985887095332146, - "step": 3120 - }, - { - "epoch": 0.1511602636853162, - "grad_norm": 2.8818710937151355, - "learning_rate": 1.984105421319834e-05, - "loss": 4.1711, - "mean_token_accuracy": 0.29546370804309846, - "step": 3130 - }, - { - "epoch": 0.1516432038248859, - "grad_norm": 2.702875712828679, - "learning_rate": 1.9838046398409507e-05, - "loss": 4.1145, - "mean_token_accuracy": 0.29989919662475584, - "step": 3140 - }, - { - "epoch": 0.15212614396445562, - "grad_norm": 2.856644546684219, - "learning_rate": 1.983501062281284e-05, - "loss": 4.1246, - "mean_token_accuracy": 0.3035282254219055, - "step": 3150 - }, - { - "epoch": 0.1526090841040253, - "grad_norm": 2.700008163445202, - "learning_rate": 1.983194689503634e-05, - "loss": 4.0668, - "mean_token_accuracy": 0.3047379031777382, - "step": 3160 - }, - { - "epoch": 0.153092024243595, - "grad_norm": 2.8706883733850765, - "learning_rate": 1.982885522378746e-05, - "loss": 4.1195, - "mean_token_accuracy": 0.30161290168762206, - "step": 3170 - }, - { - "epoch": 0.1535749643831647, - "grad_norm": 2.6616090481057513, - "learning_rate": 1.9825735617853064e-05, - "loss": 4.0949, - "mean_token_accuracy": 0.30171371102333067, - "step": 3180 - }, - { - "epoch": 0.1540579045227344, - "grad_norm": 3.01874133105245, - "learning_rate": 1.9822588086099425e-05, - "loss": 4.0676, - "mean_token_accuracy": 0.3040322571992874, - "step": 3190 - }, - { - "epoch": 0.1545408446623041, - "grad_norm": 2.675413319261908, - "learning_rate": 1.9819412637472166e-05, - "loss": 4.1273, - "mean_token_accuracy": 0.2976814515888691, - "step": 3200 - }, - { - "epoch": 0.1545408446623041, - "eval_runtime": 7.7928, - "eval_samples_per_second": 379.069, - "eval_steps_per_second": 23.74, - "step": 3200 - }, - { - "epoch": 0.15502378480187382, - "grad_norm": 2.766899311175021, - "learning_rate": 1.9816209280996265e-05, - "loss": 4.0605, - "mean_token_accuracy": 0.30584677308797836, - "step": 3210 - }, - { - "epoch": 0.15550672494144352, - "grad_norm": 2.791158894607005, - "learning_rate": 1.981297802577601e-05, - "loss": 4.1082, - "mean_token_accuracy": 0.29828629121184347, - "step": 3220 - }, - { - "epoch": 0.1559896650810132, - "grad_norm": 3.02082827149008, - "learning_rate": 1.9809718880994984e-05, - "loss": 4.1406, - "mean_token_accuracy": 0.29637096673250196, - "step": 3230 - }, - { - "epoch": 0.1564726052205829, - "grad_norm": 2.759968414381429, - "learning_rate": 1.980643185591603e-05, - "loss": 4.0215, - "mean_token_accuracy": 0.29969757944345476, - "step": 3240 - }, - { - "epoch": 0.1569555453601526, - "grad_norm": 2.683569058322477, - "learning_rate": 1.9803116959881243e-05, - "loss": 4.066, - "mean_token_accuracy": 0.30231854915618894, - "step": 3250 - }, - { - "epoch": 0.1574384854997223, - "grad_norm": 2.5285953993153893, - "learning_rate": 1.9799774202311917e-05, - "loss": 4.1395, - "mean_token_accuracy": 0.30070564299821856, - "step": 3260 - }, - { - "epoch": 0.157921425639292, - "grad_norm": 2.580832491779241, - "learning_rate": 1.979640359270853e-05, - "loss": 4.082, - "mean_token_accuracy": 0.30030242055654527, - "step": 3270 - }, - { - "epoch": 0.15840436577886172, - "grad_norm": 2.7382317260883715, - "learning_rate": 1.9793005140650738e-05, - "loss": 4.1094, - "mean_token_accuracy": 0.29879032224416735, - "step": 3280 - }, - { - "epoch": 0.15888730591843142, - "grad_norm": 2.7128435969484563, - "learning_rate": 1.97895788557973e-05, - "loss": 4.0668, - "mean_token_accuracy": 0.3035282254219055, - "step": 3290 - }, - { - "epoch": 0.1593702460580011, - "grad_norm": 2.662083350826242, - "learning_rate": 1.97861247478861e-05, - "loss": 4.0465, - "mean_token_accuracy": 0.30756048411130904, - "step": 3300 - }, - { - "epoch": 0.1593702460580011, - "eval_runtime": 7.7695, - "eval_samples_per_second": 380.205, - "eval_steps_per_second": 23.811, - "step": 3300 - }, - { - "epoch": 0.1598531861975708, - "grad_norm": 2.6896017438928532, - "learning_rate": 1.9782642826734095e-05, - "loss": 4.125, - "mean_token_accuracy": 0.2977187469601631, - "step": 3310 - }, - { - "epoch": 0.1603361263371405, - "grad_norm": 2.6343597072414275, - "learning_rate": 1.9779133102237285e-05, - "loss": 3.9711, - "mean_token_accuracy": 0.3121975839138031, - "step": 3320 - }, - { - "epoch": 0.1608190664767102, - "grad_norm": 2.6208106907293707, - "learning_rate": 1.977559558437069e-05, - "loss": 4.0887, - "mean_token_accuracy": 0.30060483887791634, - "step": 3330 - }, - { - "epoch": 0.1613020066162799, - "grad_norm": 2.733434376395422, - "learning_rate": 1.9772030283188327e-05, - "loss": 4.032, - "mean_token_accuracy": 0.30191532224416734, - "step": 3340 - }, - { - "epoch": 0.16178494675584962, - "grad_norm": 2.528211328004999, - "learning_rate": 1.9768437208823173e-05, - "loss": 4.0785, - "mean_token_accuracy": 0.30332661122083665, - "step": 3350 - }, - { - "epoch": 0.16226788689541932, - "grad_norm": 2.722087051300455, - "learning_rate": 1.9764816371487137e-05, - "loss": 4.1055, - "mean_token_accuracy": 0.29657257795333863, - "step": 3360 - }, - { - "epoch": 0.16275082703498903, - "grad_norm": 2.5219048235861528, - "learning_rate": 1.976116778147104e-05, - "loss": 4.1, - "mean_token_accuracy": 0.29778225943446157, - "step": 3370 - }, - { - "epoch": 0.1632337671745587, - "grad_norm": 2.680587551133981, - "learning_rate": 1.975749144914457e-05, - "loss": 4.052, - "mean_token_accuracy": 0.2977822571992874, - "step": 3380 - }, - { - "epoch": 0.1637167073141284, - "grad_norm": 2.9112199550423377, - "learning_rate": 1.9753787384956276e-05, - "loss": 4.1535, - "mean_token_accuracy": 0.30362903475761416, - "step": 3390 - }, - { - "epoch": 0.1641996474536981, - "grad_norm": 2.621013954914834, - "learning_rate": 1.9750055599433503e-05, - "loss": 4.0965, - "mean_token_accuracy": 0.2966733857989311, - "step": 3400 - }, - { - "epoch": 0.1641996474536981, - "eval_runtime": 7.838, - "eval_samples_per_second": 376.884, - "eval_steps_per_second": 23.603, - "step": 3400 - }, - { - "epoch": 0.16468258759326782, - "grad_norm": 2.692676483705507, - "learning_rate": 1.9746296103182406e-05, - "loss": 4.068, - "mean_token_accuracy": 0.30221773982048034, - "step": 3410 - }, - { - "epoch": 0.16516552773283752, - "grad_norm": 2.75647521867288, - "learning_rate": 1.974250890688788e-05, - "loss": 4.0297, - "mean_token_accuracy": 0.3034274227917194, - "step": 3420 - }, - { - "epoch": 0.16564846787240722, - "grad_norm": 2.736914462919695, - "learning_rate": 1.973869402131356e-05, - "loss": 4.057, - "mean_token_accuracy": 0.30070564299821856, - "step": 3430 - }, - { - "epoch": 0.16613140801197693, - "grad_norm": 2.7217035002411603, - "learning_rate": 1.9734851457301757e-05, - "loss": 4.0801, - "mean_token_accuracy": 0.29868951588869097, - "step": 3440 - }, - { - "epoch": 0.1666143481515466, - "grad_norm": 2.5327655563603177, - "learning_rate": 1.973098122577347e-05, - "loss": 4.1125, - "mean_token_accuracy": 0.30332661271095274, - "step": 3450 - }, - { - "epoch": 0.1670972882911163, - "grad_norm": 2.655328286104727, - "learning_rate": 1.9727083337728316e-05, - "loss": 4.084, - "mean_token_accuracy": 0.30584677308797836, - "step": 3460 - }, - { - "epoch": 0.167580228430686, - "grad_norm": 2.6751770694202093, - "learning_rate": 1.9723157804244522e-05, - "loss": 4.0687, - "mean_token_accuracy": 0.30262096524238585, - "step": 3470 - }, - { - "epoch": 0.16806316857025572, - "grad_norm": 2.828274606821587, - "learning_rate": 1.9719204636478893e-05, - "loss": 4.0469, - "mean_token_accuracy": 0.3049395151436329, - "step": 3480 - }, - { - "epoch": 0.16854610870982542, - "grad_norm": 2.666808629903219, - "learning_rate": 1.9715223845666754e-05, - "loss": 4.0746, - "mean_token_accuracy": 0.2991935446858406, - "step": 3490 - }, - { - "epoch": 0.16902904884939512, - "grad_norm": 2.775065079718119, - "learning_rate": 1.9711215443121955e-05, - "loss": 4.0316, - "mean_token_accuracy": 0.3129032254219055, - "step": 3500 - }, - { - "epoch": 0.16902904884939512, - "eval_runtime": 7.8054, - "eval_samples_per_second": 378.457, - "eval_steps_per_second": 23.702, - "step": 3500 - }, - { - "epoch": 0.16951198898896483, - "grad_norm": 2.5983820917312026, - "learning_rate": 1.9707179440236815e-05, - "loss": 4.0781, - "mean_token_accuracy": 0.30453629046678543, - "step": 3510 - }, - { - "epoch": 0.1699949291285345, - "grad_norm": 2.61383874753937, - "learning_rate": 1.97031158484821e-05, - "loss": 4.0926, - "mean_token_accuracy": 0.3074596762657166, - "step": 3520 - }, - { - "epoch": 0.1704778692681042, - "grad_norm": 2.613847766199223, - "learning_rate": 1.969902467940698e-05, - "loss": 4.1414, - "mean_token_accuracy": 0.2979838714003563, - "step": 3530 - }, - { - "epoch": 0.1709608094076739, - "grad_norm": 2.4731166444967645, - "learning_rate": 1.9694905944639014e-05, - "loss": 4.0578, - "mean_token_accuracy": 0.3025201618671417, - "step": 3540 - }, - { - "epoch": 0.17144374954724362, - "grad_norm": 2.5600193940786995, - "learning_rate": 1.9690759655884085e-05, - "loss": 4.0527, - "mean_token_accuracy": 0.3074596807360649, - "step": 3550 - }, - { - "epoch": 0.17192668968681332, - "grad_norm": 2.738596210941972, - "learning_rate": 1.9686585824926412e-05, - "loss": 4.1148, - "mean_token_accuracy": 0.29213709831237794, - "step": 3560 - }, - { - "epoch": 0.17240962982638303, - "grad_norm": 2.599359635619837, - "learning_rate": 1.9682384463628477e-05, - "loss": 4.0469, - "mean_token_accuracy": 0.30080644935369494, - "step": 3570 - }, - { - "epoch": 0.17289256996595273, - "grad_norm": 2.691353570094291, - "learning_rate": 1.967815558393101e-05, - "loss": 4.0758, - "mean_token_accuracy": 0.3056451603770256, - "step": 3580 - }, - { - "epoch": 0.1733755101055224, - "grad_norm": 2.7396764808543943, - "learning_rate": 1.967389919785295e-05, - "loss": 4.0512, - "mean_token_accuracy": 0.3, - "step": 3590 - }, - { - "epoch": 0.1738584502450921, - "grad_norm": 2.7179381888390814, - "learning_rate": 1.9669615317491418e-05, - "loss": 4.068, - "mean_token_accuracy": 0.2933467745780945, - "step": 3600 - }, - { - "epoch": 0.1738584502450921, - "eval_runtime": 7.8039, - "eval_samples_per_second": 378.529, - "eval_steps_per_second": 23.706, - "step": 3600 - }, - { - "epoch": 0.17434139038466182, - "grad_norm": 2.7449641837357737, - "learning_rate": 1.966530395502167e-05, - "loss": 4.0566, - "mean_token_accuracy": 0.2998991921544075, - "step": 3610 - }, - { - "epoch": 0.17482433052423152, - "grad_norm": 2.6189015079358233, - "learning_rate": 1.9660965122697067e-05, - "loss": 4.0191, - "mean_token_accuracy": 0.3065524235367775, - "step": 3620 - }, - { - "epoch": 0.17530727066380122, - "grad_norm": 2.9319900971973567, - "learning_rate": 1.965659883284905e-05, - "loss": 4.0141, - "mean_token_accuracy": 0.3042338714003563, - "step": 3630 - }, - { - "epoch": 0.17579021080337093, - "grad_norm": 2.6196512440180566, - "learning_rate": 1.9652205097887097e-05, - "loss": 4.0523, - "mean_token_accuracy": 0.30383064299821855, - "step": 3640 - }, - { - "epoch": 0.17627315094294063, - "grad_norm": 2.618384526978317, - "learning_rate": 1.9647783930298683e-05, - "loss": 4.1137, - "mean_token_accuracy": 0.2987903207540512, - "step": 3650 - }, - { - "epoch": 0.17675609108251034, - "grad_norm": 2.5126722492749822, - "learning_rate": 1.9643335342649253e-05, - "loss": 4.0223, - "mean_token_accuracy": 0.3099798396229744, - "step": 3660 - }, - { - "epoch": 0.17723903122208, - "grad_norm": 2.494061535290806, - "learning_rate": 1.9638859347582176e-05, - "loss": 4.0906, - "mean_token_accuracy": 0.2984879031777382, - "step": 3670 - }, - { - "epoch": 0.17772197136164972, - "grad_norm": 2.550490711171001, - "learning_rate": 1.9634355957818724e-05, - "loss": 4.0457, - "mean_token_accuracy": 0.3073588699102402, - "step": 3680 - }, - { - "epoch": 0.17820491150121942, - "grad_norm": 2.4847633736880357, - "learning_rate": 1.9629825186158033e-05, - "loss": 4.0473, - "mean_token_accuracy": 0.3077620968222618, - "step": 3690 - }, - { - "epoch": 0.17868785164078912, - "grad_norm": 2.7663336779028618, - "learning_rate": 1.962526704547704e-05, - "loss": 4.0941, - "mean_token_accuracy": 0.30010080337524414, - "step": 3700 - }, - { - "epoch": 0.17868785164078912, - "eval_runtime": 7.8038, - "eval_samples_per_second": 378.533, - "eval_steps_per_second": 23.706, - "step": 3700 - }, - { - "epoch": 0.17917079178035883, - "grad_norm": 2.5166344462922665, - "learning_rate": 1.962068154873049e-05, - "loss": 4.0844, - "mean_token_accuracy": 0.3015120968222618, - "step": 3710 - }, - { - "epoch": 0.17965373191992853, - "grad_norm": 2.671642786530828, - "learning_rate": 1.9616068708950865e-05, - "loss": 4.109, - "mean_token_accuracy": 0.29949596524238586, - "step": 3720 - }, - { - "epoch": 0.18013667205949824, - "grad_norm": 2.5424801288239878, - "learning_rate": 1.9611428539248364e-05, - "loss": 4.0547, - "mean_token_accuracy": 0.30453629046678543, - "step": 3730 - }, - { - "epoch": 0.1806196121990679, - "grad_norm": 2.4744737906443475, - "learning_rate": 1.9606761052810858e-05, - "loss": 4.0434, - "mean_token_accuracy": 0.3083669379353523, - "step": 3740 - }, - { - "epoch": 0.18110255233863762, - "grad_norm": 2.661085975386548, - "learning_rate": 1.9602066262903855e-05, - "loss": 4.1684, - "mean_token_accuracy": 0.29294354766607283, - "step": 3750 - }, - { - "epoch": 0.18158549247820732, - "grad_norm": 2.551128771570501, - "learning_rate": 1.9597344182870463e-05, - "loss": 4.0281, - "mean_token_accuracy": 0.3139112889766693, - "step": 3760 - }, - { - "epoch": 0.18206843261777703, - "grad_norm": 2.4986089678252035, - "learning_rate": 1.9592594826131352e-05, - "loss": 4.1246, - "mean_token_accuracy": 0.30141129195690153, - "step": 3770 - }, - { - "epoch": 0.18255137275734673, - "grad_norm": 2.5241612996931884, - "learning_rate": 1.9587818206184718e-05, - "loss": 4.0605, - "mean_token_accuracy": 0.30463709980249404, - "step": 3780 - }, - { - "epoch": 0.18303431289691643, - "grad_norm": 2.574199584048626, - "learning_rate": 1.958301433660623e-05, - "loss": 4.0734, - "mean_token_accuracy": 0.3029233872890472, - "step": 3790 - }, - { - "epoch": 0.18351725303648614, - "grad_norm": 2.6257630018910145, - "learning_rate": 1.9578183231049028e-05, - "loss": 4.0523, - "mean_token_accuracy": 0.3030241936445236, - "step": 3800 - }, - { - "epoch": 0.18351725303648614, - "eval_runtime": 7.7827, - "eval_samples_per_second": 379.557, - "eval_steps_per_second": 23.771, - "step": 3800 - }, - { - "epoch": 0.18400019317605582, - "grad_norm": 2.8490950431431727, - "learning_rate": 1.9573324903243633e-05, - "loss": 4.098, - "mean_token_accuracy": 0.3015120938420296, - "step": 3810 - }, - { - "epoch": 0.18448313331562552, - "grad_norm": 2.6442037232679105, - "learning_rate": 1.956843936699795e-05, - "loss": 4.0773, - "mean_token_accuracy": 0.30846773982048037, - "step": 3820 - }, - { - "epoch": 0.18496607345519522, - "grad_norm": 2.440382964429585, - "learning_rate": 1.9563526636197205e-05, - "loss": 4.0621, - "mean_token_accuracy": 0.31008064597845075, - "step": 3830 - }, - { - "epoch": 0.18544901359476493, - "grad_norm": 2.506411414128234, - "learning_rate": 1.9558586724803926e-05, - "loss": 4.0563, - "mean_token_accuracy": 0.3015120968222618, - "step": 3840 - }, - { - "epoch": 0.18593195373433463, - "grad_norm": 2.4416581277853506, - "learning_rate": 1.9553619646857876e-05, - "loss": 4.05, - "mean_token_accuracy": 0.300504033267498, - "step": 3850 - }, - { - "epoch": 0.18641489387390434, - "grad_norm": 2.538610139614956, - "learning_rate": 1.9548625416476037e-05, - "loss": 4.0324, - "mean_token_accuracy": 0.3061491921544075, - "step": 3860 - }, - { - "epoch": 0.18689783401347404, - "grad_norm": 2.4205853818127934, - "learning_rate": 1.9543604047852565e-05, - "loss": 4.066, - "mean_token_accuracy": 0.3055443525314331, - "step": 3870 - }, - { - "epoch": 0.18738077415304372, - "grad_norm": 2.587017003337788, - "learning_rate": 1.9538555555258737e-05, - "loss": 4.0422, - "mean_token_accuracy": 0.29848790168762207, - "step": 3880 - }, - { - "epoch": 0.18786371429261342, - "grad_norm": 2.500367801105121, - "learning_rate": 1.9533479953042923e-05, - "loss": 4.0195, - "mean_token_accuracy": 0.305745966732502, - "step": 3890 - }, - { - "epoch": 0.18834665443218312, - "grad_norm": 2.4991761246992277, - "learning_rate": 1.9528377255630543e-05, - "loss": 4.0637, - "mean_token_accuracy": 0.3063508063554764, - "step": 3900 - }, - { - "epoch": 0.18834665443218312, - "eval_runtime": 7.7872, - "eval_samples_per_second": 379.342, - "eval_steps_per_second": 23.757, - "step": 3900 - }, - { - "epoch": 0.18882959457175283, - "grad_norm": 2.514616543709038, - "learning_rate": 1.9523247477524024e-05, - "loss": 4.0043, - "mean_token_accuracy": 0.3078629016876221, - "step": 3910 - }, - { - "epoch": 0.18931253471132253, - "grad_norm": 2.521595783266744, - "learning_rate": 1.9518090633302755e-05, - "loss": 4.0191, - "mean_token_accuracy": 0.300806450843811, - "step": 3920 - }, - { - "epoch": 0.18979547485089224, - "grad_norm": 2.4025892379321174, - "learning_rate": 1.9512906737623054e-05, - "loss": 4.0715, - "mean_token_accuracy": 0.30372984111309054, - "step": 3930 - }, - { - "epoch": 0.19027841499046194, - "grad_norm": 2.3736905652478364, - "learning_rate": 1.950769580521812e-05, - "loss": 3.9996, - "mean_token_accuracy": 0.3108870968222618, - "step": 3940 - }, - { - "epoch": 0.19076135513003165, - "grad_norm": 2.5732494964441366, - "learning_rate": 1.9502457850898007e-05, - "loss": 4.0187, - "mean_token_accuracy": 0.31018145084381105, - "step": 3950 - }, - { - "epoch": 0.19124429526960132, - "grad_norm": 2.574772993521237, - "learning_rate": 1.9497192889549544e-05, - "loss": 4.0824, - "mean_token_accuracy": 0.3025201603770256, - "step": 3960 - }, - { - "epoch": 0.19172723540917103, - "grad_norm": 2.403181142089266, - "learning_rate": 1.949190093613633e-05, - "loss": 4.0613, - "mean_token_accuracy": 0.30302419513463974, - "step": 3970 - }, - { - "epoch": 0.19221017554874073, - "grad_norm": 2.6461715859268677, - "learning_rate": 1.948658200569868e-05, - "loss": 4.1051, - "mean_token_accuracy": 0.3, - "step": 3980 - }, - { - "epoch": 0.19269311568831043, - "grad_norm": 2.5242849341583167, - "learning_rate": 1.948123611335358e-05, - "loss": 4.0645, - "mean_token_accuracy": 0.3054435446858406, - "step": 3990 - }, - { - "epoch": 0.19317605582788014, - "grad_norm": 2.534385061138649, - "learning_rate": 1.947586327429464e-05, - "loss": 4.0613, - "mean_token_accuracy": 0.30665321946144103, - "step": 4000 - }, - { - "epoch": 0.19317605582788014, - "eval_runtime": 7.8081, - "eval_samples_per_second": 378.323, - "eval_steps_per_second": 23.693, - "step": 4000 - }, - { - "epoch": 0.19365899596744984, - "grad_norm": 2.516760678379009, - "learning_rate": 1.9470463503792058e-05, - "loss": 4.0867, - "mean_token_accuracy": 0.29929435551166533, - "step": 4010 - }, - { - "epoch": 0.19414193610701955, - "grad_norm": 2.52633694862886, - "learning_rate": 1.9465036817192576e-05, - "loss": 3.966, - "mean_token_accuracy": 0.3123991906642914, - "step": 4020 - }, - { - "epoch": 0.19462487624658922, - "grad_norm": 2.694992357818292, - "learning_rate": 1.9459583229919436e-05, - "loss": 4.0941, - "mean_token_accuracy": 0.30514113008975985, - "step": 4030 - }, - { - "epoch": 0.19510781638615893, - "grad_norm": 2.42494813979197, - "learning_rate": 1.9454102757472325e-05, - "loss": 4.0844, - "mean_token_accuracy": 0.30564516186714175, - "step": 4040 - }, - { - "epoch": 0.19559075652572863, - "grad_norm": 2.7997016430800534, - "learning_rate": 1.9448595415427348e-05, - "loss": 4.0699, - "mean_token_accuracy": 0.30967741906642915, - "step": 4050 - }, - { - "epoch": 0.19607369666529834, - "grad_norm": 2.450927188900928, - "learning_rate": 1.9443061219436984e-05, - "loss": 4.0328, - "mean_token_accuracy": 0.30493951588869095, - "step": 4060 - }, - { - "epoch": 0.19655663680486804, - "grad_norm": 2.4520710916361503, - "learning_rate": 1.943750018523002e-05, - "loss": 4.0465, - "mean_token_accuracy": 0.30141129195690153, - "step": 4070 - }, - { - "epoch": 0.19703957694443774, - "grad_norm": 2.548513085821371, - "learning_rate": 1.9431912328611523e-05, - "loss": 4.0676, - "mean_token_accuracy": 0.30866935551166536, - "step": 4080 - }, - { - "epoch": 0.19752251708400745, - "grad_norm": 2.4211864889240675, - "learning_rate": 1.94262976654628e-05, - "loss": 4.0, - "mean_token_accuracy": 0.30413306653499605, - "step": 4090 - }, - { - "epoch": 0.19800545722357712, - "grad_norm": 2.534300677746175, - "learning_rate": 1.9420656211741335e-05, - "loss": 4.0172, - "mean_token_accuracy": 0.3080645173788071, - "step": 4100 - }, - { - "epoch": 0.19800545722357712, - "eval_runtime": 7.8179, - "eval_samples_per_second": 377.853, - "eval_steps_per_second": 23.664, - "step": 4100 - }, - { - "epoch": 0.19848839736314683, - "grad_norm": 2.4251826832719137, - "learning_rate": 1.9414987983480764e-05, - "loss": 4.0523, - "mean_token_accuracy": 0.30443548411130905, - "step": 4110 - }, - { - "epoch": 0.19897133750271653, - "grad_norm": 2.5359288027916804, - "learning_rate": 1.940929299679081e-05, - "loss": 4.0242, - "mean_token_accuracy": 0.3116935506463051, - "step": 4120 - }, - { - "epoch": 0.19945427764228624, - "grad_norm": 2.41459622269149, - "learning_rate": 1.940357126785725e-05, - "loss": 4.0703, - "mean_token_accuracy": 0.30262096896767615, - "step": 4130 - }, - { - "epoch": 0.19993721778185594, - "grad_norm": 2.548625471100205, - "learning_rate": 1.939782281294187e-05, - "loss": 4.0781, - "mean_token_accuracy": 0.2980846785008907, - "step": 4140 - }, - { - "epoch": 0.20042015792142565, - "grad_norm": 2.5293055980853896, - "learning_rate": 1.9392047648382405e-05, - "loss": 4.0387, - "mean_token_accuracy": 0.3074596792459488, - "step": 4150 - }, - { - "epoch": 0.20090309806099535, - "grad_norm": 2.5377151154619035, - "learning_rate": 1.9386245790592513e-05, - "loss": 4.0367, - "mean_token_accuracy": 0.3019153207540512, - "step": 4160 - }, - { - "epoch": 0.20138603820056503, - "grad_norm": 2.3948393404934385, - "learning_rate": 1.9380417256061707e-05, - "loss": 4.0332, - "mean_token_accuracy": 0.31018145233392713, - "step": 4170 - }, - { - "epoch": 0.20186897834013473, - "grad_norm": 2.4518674883472453, - "learning_rate": 1.9374562061355315e-05, - "loss": 4.0863, - "mean_token_accuracy": 0.30050402879714966, - "step": 4180 - }, - { - "epoch": 0.20235191847970443, - "grad_norm": 2.419370833469503, - "learning_rate": 1.9368680223114457e-05, - "loss": 4.0539, - "mean_token_accuracy": 0.3035282254219055, - "step": 4190 - }, - { - "epoch": 0.20283485861927414, - "grad_norm": 2.4552380224389645, - "learning_rate": 1.9362771758055952e-05, - "loss": 4.0672, - "mean_token_accuracy": 0.30221773982048034, - "step": 4200 - }, - { - "epoch": 0.20283485861927414, - "eval_runtime": 7.8164, - "eval_samples_per_second": 377.925, - "eval_steps_per_second": 23.668, - "step": 4200 - }, - { - "epoch": 0.20331779875884384, - "grad_norm": 2.5764697890208894, - "learning_rate": 1.935683668297231e-05, - "loss": 4.0445, - "mean_token_accuracy": 0.3090725839138031, - "step": 4210 - }, - { - "epoch": 0.20380073889841355, - "grad_norm": 2.5877640791755927, - "learning_rate": 1.9350875014731664e-05, - "loss": 4.0734, - "mean_token_accuracy": 0.30030241757631304, - "step": 4220 - }, - { - "epoch": 0.20428367903798325, - "grad_norm": 2.5672705433903324, - "learning_rate": 1.9344886770277735e-05, - "loss": 4.0711, - "mean_token_accuracy": 0.29818548560142516, - "step": 4230 - }, - { - "epoch": 0.20476661917755296, - "grad_norm": 2.6070097591406136, - "learning_rate": 1.9338871966629767e-05, - "loss": 4.0594, - "mean_token_accuracy": 0.2996975779533386, - "step": 4240 - }, - { - "epoch": 0.20524955931712263, - "grad_norm": 2.5056060856865385, - "learning_rate": 1.9332830620882493e-05, - "loss": 4.0156, - "mean_token_accuracy": 0.30766128599643705, - "step": 4250 - }, - { - "epoch": 0.20573249945669234, - "grad_norm": 2.536954497339716, - "learning_rate": 1.9326762750206082e-05, - "loss": 4.0355, - "mean_token_accuracy": 0.30493951588869095, - "step": 4260 - }, - { - "epoch": 0.20621543959626204, - "grad_norm": 2.483717821604835, - "learning_rate": 1.9320668371846087e-05, - "loss": 3.9477, - "mean_token_accuracy": 0.3126008063554764, - "step": 4270 - }, - { - "epoch": 0.20669837973583174, - "grad_norm": 2.439588924034685, - "learning_rate": 1.9314547503123396e-05, - "loss": 4.0891, - "mean_token_accuracy": 0.3006048366427422, - "step": 4280 - }, - { - "epoch": 0.20718131987540145, - "grad_norm": 2.512676808493711, - "learning_rate": 1.93084001614342e-05, - "loss": 4.093, - "mean_token_accuracy": 0.29627016335725787, - "step": 4290 - }, - { - "epoch": 0.20766426001497115, - "grad_norm": 2.5338270946674397, - "learning_rate": 1.930222636424991e-05, - "loss": 4.0172, - "mean_token_accuracy": 0.2998991921544075, - "step": 4300 - }, - { - "epoch": 0.20766426001497115, - "eval_runtime": 7.8133, - "eval_samples_per_second": 378.072, - "eval_steps_per_second": 23.677, - "step": 4300 - }, - { - "epoch": 0.20814720015454086, - "grad_norm": 2.3935398550737017, - "learning_rate": 1.929602612911714e-05, - "loss": 4.1211, - "mean_token_accuracy": 0.2969758063554764, - "step": 4310 - }, - { - "epoch": 0.20863014029411053, - "grad_norm": 2.5121903523760496, - "learning_rate": 1.928979947365764e-05, - "loss": 3.9719, - "mean_token_accuracy": 0.3152217730879784, - "step": 4320 - }, - { - "epoch": 0.20911308043368024, - "grad_norm": 2.3193582128852923, - "learning_rate": 1.928354641556824e-05, - "loss": 4.0297, - "mean_token_accuracy": 0.3068548396229744, - "step": 4330 - }, - { - "epoch": 0.20959602057324994, - "grad_norm": 2.3912353508871553, - "learning_rate": 1.9277266972620828e-05, - "loss": 4.0625, - "mean_token_accuracy": 0.30030241757631304, - "step": 4340 - }, - { - "epoch": 0.21007896071281965, - "grad_norm": 2.4598227549349585, - "learning_rate": 1.927096116266226e-05, - "loss": 4.0727, - "mean_token_accuracy": 0.30231854766607286, - "step": 4350 - }, - { - "epoch": 0.21056190085238935, - "grad_norm": 2.461571886061888, - "learning_rate": 1.9264629003614352e-05, - "loss": 4.034, - "mean_token_accuracy": 0.3034274160861969, - "step": 4360 - }, - { - "epoch": 0.21104484099195905, - "grad_norm": 2.3895560003156677, - "learning_rate": 1.9258270513473788e-05, - "loss": 4.0336, - "mean_token_accuracy": 0.3137096747756004, - "step": 4370 - }, - { - "epoch": 0.21152778113152876, - "grad_norm": 2.501270819608181, - "learning_rate": 1.9251885710312096e-05, - "loss": 4.0816, - "mean_token_accuracy": 0.2969758063554764, - "step": 4380 - }, - { - "epoch": 0.21201072127109843, - "grad_norm": 2.4225682062797573, - "learning_rate": 1.9245474612275583e-05, - "loss": 4.009, - "mean_token_accuracy": 0.3053427398204803, - "step": 4390 - }, - { - "epoch": 0.21249366141066814, - "grad_norm": 2.5966437690923576, - "learning_rate": 1.92390372375853e-05, - "loss": 4.0742, - "mean_token_accuracy": 0.30655242055654525, - "step": 4400 - }, - { - "epoch": 0.21249366141066814, - "eval_runtime": 7.7937, - "eval_samples_per_second": 379.026, - "eval_steps_per_second": 23.737, - "step": 4400 - }, - { - "epoch": 0.21297660155023784, - "grad_norm": 2.4000872847358217, - "learning_rate": 1.923257360453697e-05, - "loss": 4.0328, - "mean_token_accuracy": 0.30141128972172737, - "step": 4410 - }, - { - "epoch": 0.21345954168980755, - "grad_norm": 2.4318047007054773, - "learning_rate": 1.922608373150095e-05, - "loss": 3.9641, - "mean_token_accuracy": 0.31270161271095276, - "step": 4420 - }, - { - "epoch": 0.21394248182937725, - "grad_norm": 2.3537304374562993, - "learning_rate": 1.921956763692217e-05, - "loss": 4.0258, - "mean_token_accuracy": 0.30967741906642915, - "step": 4430 - }, - { - "epoch": 0.21442542196894696, - "grad_norm": 2.4907276209319975, - "learning_rate": 1.9213025339320083e-05, - "loss": 4.0141, - "mean_token_accuracy": 0.30927419364452363, - "step": 4440 - }, - { - "epoch": 0.21490836210851666, - "grad_norm": 2.3957895079994493, - "learning_rate": 1.920645685728862e-05, - "loss": 4.0098, - "mean_token_accuracy": 0.3104838699102402, - "step": 4450 - }, - { - "epoch": 0.21539130224808634, - "grad_norm": 2.544550132805664, - "learning_rate": 1.919986220949613e-05, - "loss": 4.0543, - "mean_token_accuracy": 0.3125, - "step": 4460 - }, - { - "epoch": 0.21587424238765604, - "grad_norm": 2.4014975958752403, - "learning_rate": 1.9193241414685318e-05, - "loss": 4.0, - "mean_token_accuracy": 0.30322580635547636, - "step": 4470 - }, - { - "epoch": 0.21635718252722574, - "grad_norm": 2.466015174751556, - "learning_rate": 1.9186594491673217e-05, - "loss": 3.9961, - "mean_token_accuracy": 0.31471773982048035, - "step": 4480 - }, - { - "epoch": 0.21684012266679545, - "grad_norm": 2.5329011496134886, - "learning_rate": 1.917992145935111e-05, - "loss": 4.1074, - "mean_token_accuracy": 0.30383064299821855, - "step": 4490 - }, - { - "epoch": 0.21732306280636515, - "grad_norm": 2.3974894070580572, - "learning_rate": 1.9173222336684492e-05, - "loss": 4.0496, - "mean_token_accuracy": 0.3053427398204803, - "step": 4500 - }, - { - "epoch": 0.21732306280636515, - "eval_runtime": 7.7965, - "eval_samples_per_second": 378.888, - "eval_steps_per_second": 23.729, - "step": 4500 - }, - { - "epoch": 0.21780600294593486, - "grad_norm": 2.3904189884183205, - "learning_rate": 1.9166497142712995e-05, - "loss": 4.0852, - "mean_token_accuracy": 0.302620966732502, - "step": 4510 - }, - { - "epoch": 0.21828894308550456, - "grad_norm": 2.512258121567953, - "learning_rate": 1.9159745896550367e-05, - "loss": 4.032, - "mean_token_accuracy": 0.3066532239317894, - "step": 4520 - }, - { - "epoch": 0.21877188322507427, - "grad_norm": 2.4135957188161914, - "learning_rate": 1.915296861738439e-05, - "loss": 4.073, - "mean_token_accuracy": 0.31098789870738985, - "step": 4530 - }, - { - "epoch": 0.21925482336464394, - "grad_norm": 2.4447776108764976, - "learning_rate": 1.914616532447683e-05, - "loss": 4.0187, - "mean_token_accuracy": 0.3093750014901161, - "step": 4540 - }, - { - "epoch": 0.21973776350421365, - "grad_norm": 2.494262130906879, - "learning_rate": 1.9139336037163394e-05, - "loss": 4.0305, - "mean_token_accuracy": 0.3131048381328583, - "step": 4550 - }, - { - "epoch": 0.22022070364378335, - "grad_norm": 2.318031299502848, - "learning_rate": 1.913248077485367e-05, - "loss": 4.0391, - "mean_token_accuracy": 0.30957661420106886, - "step": 4560 - }, - { - "epoch": 0.22070364378335305, - "grad_norm": 2.634787930670605, - "learning_rate": 1.912559955703106e-05, - "loss": 3.9695, - "mean_token_accuracy": 0.312298384308815, - "step": 4570 - }, - { - "epoch": 0.22118658392292276, - "grad_norm": 2.489743329988923, - "learning_rate": 1.9118692403252747e-05, - "loss": 3.9906, - "mean_token_accuracy": 0.3118951588869095, - "step": 4580 - }, - { - "epoch": 0.22166952406249246, - "grad_norm": 2.473567565951065, - "learning_rate": 1.9111759333149615e-05, - "loss": 4.0449, - "mean_token_accuracy": 0.30614919364452364, - "step": 4590 - }, - { - "epoch": 0.22215246420206217, - "grad_norm": 2.397229662717697, - "learning_rate": 1.9104800366426216e-05, - "loss": 4.0438, - "mean_token_accuracy": 0.30816532075405123, - "step": 4600 - }, - { - "epoch": 0.22215246420206217, - "eval_runtime": 7.7744, - "eval_samples_per_second": 379.963, - "eval_steps_per_second": 23.796, - "step": 4600 - }, - { - "epoch": 0.22263540434163184, - "grad_norm": 2.409335257933568, - "learning_rate": 1.9097815522860692e-05, - "loss": 3.9949, - "mean_token_accuracy": 0.31088709384202956, - "step": 4610 - }, - { - "epoch": 0.22311834448120155, - "grad_norm": 2.4997405268347372, - "learning_rate": 1.909080482230474e-05, - "loss": 4.0527, - "mean_token_accuracy": 0.3063508063554764, - "step": 4620 - }, - { - "epoch": 0.22360128462077125, - "grad_norm": 2.442181270108447, - "learning_rate": 1.9083768284683533e-05, - "loss": 4.0812, - "mean_token_accuracy": 0.3057459682226181, - "step": 4630 - }, - { - "epoch": 0.22408422476034096, - "grad_norm": 2.4272279353962065, - "learning_rate": 1.907670592999569e-05, - "loss": 4.0672, - "mean_token_accuracy": 0.3069556444883347, - "step": 4640 - }, - { - "epoch": 0.22456716489991066, - "grad_norm": 2.405488540267964, - "learning_rate": 1.9069617778313196e-05, - "loss": 4.059, - "mean_token_accuracy": 0.3022177413105965, - "step": 4650 - }, - { - "epoch": 0.22505010503948036, - "grad_norm": 2.312409121561029, - "learning_rate": 1.9062503849781356e-05, - "loss": 4.0168, - "mean_token_accuracy": 0.31098790317773817, - "step": 4660 - }, - { - "epoch": 0.22553304517905007, - "grad_norm": 2.598683537126093, - "learning_rate": 1.9055364164618738e-05, - "loss": 4.0605, - "mean_token_accuracy": 0.30705645084381106, - "step": 4670 - }, - { - "epoch": 0.22601598531861974, - "grad_norm": 2.391835886913309, - "learning_rate": 1.904819874311711e-05, - "loss": 3.9828, - "mean_token_accuracy": 0.31723790615797043, - "step": 4680 - }, - { - "epoch": 0.22649892545818945, - "grad_norm": 2.4726092766464505, - "learning_rate": 1.9041007605641387e-05, - "loss": 3.9828, - "mean_token_accuracy": 0.30987902730703354, - "step": 4690 - }, - { - "epoch": 0.22698186559775915, - "grad_norm": 2.280213411304886, - "learning_rate": 1.9033790772629566e-05, - "loss": 4.1113, - "mean_token_accuracy": 0.30614919513463973, - "step": 4700 - }, - { - "epoch": 0.22698186559775915, - "eval_runtime": 7.8102, - "eval_samples_per_second": 378.223, - "eval_steps_per_second": 23.687, - "step": 4700 - }, - { - "epoch": 0.22746480573732886, - "grad_norm": 2.3442566846349977, - "learning_rate": 1.9026548264592682e-05, - "loss": 4.0648, - "mean_token_accuracy": 0.30282258093357084, - "step": 4710 - }, - { - "epoch": 0.22794774587689856, - "grad_norm": 2.4219609056524773, - "learning_rate": 1.9019280102114743e-05, - "loss": 3.9504, - "mean_token_accuracy": 0.30967741906642915, - "step": 4720 - }, - { - "epoch": 0.22843068601646827, - "grad_norm": 2.5403332570120254, - "learning_rate": 1.9011986305852656e-05, - "loss": 4.043, - "mean_token_accuracy": 0.31088709384202956, - "step": 4730 - }, - { - "epoch": 0.22891362615603797, - "grad_norm": 2.469312361045575, - "learning_rate": 1.90046668965362e-05, - "loss": 4.1277, - "mean_token_accuracy": 0.29979838207364085, - "step": 4740 - }, - { - "epoch": 0.22939656629560765, - "grad_norm": 2.4236133149561994, - "learning_rate": 1.8997321894967927e-05, - "loss": 4.1148, - "mean_token_accuracy": 0.29919354766607287, - "step": 4750 - }, - { - "epoch": 0.22987950643517735, - "grad_norm": 2.5307277765411715, - "learning_rate": 1.898995132202315e-05, - "loss": 4.0148, - "mean_token_accuracy": 0.3024193555116653, - "step": 4760 - }, - { - "epoch": 0.23036244657474705, - "grad_norm": 2.462445980377924, - "learning_rate": 1.8982555198649843e-05, - "loss": 4.0047, - "mean_token_accuracy": 0.30796370953321456, - "step": 4770 - }, - { - "epoch": 0.23084538671431676, - "grad_norm": 2.499054204821308, - "learning_rate": 1.8975133545868595e-05, - "loss": 4.0594, - "mean_token_accuracy": 0.30181451588869096, - "step": 4780 - }, - { - "epoch": 0.23132832685388646, - "grad_norm": 2.3215590357733156, - "learning_rate": 1.8967686384772566e-05, - "loss": 3.9836, - "mean_token_accuracy": 0.3108870968222618, - "step": 4790 - }, - { - "epoch": 0.23181126699345617, - "grad_norm": 2.4739045579549193, - "learning_rate": 1.8960213736527403e-05, - "loss": 4.027, - "mean_token_accuracy": 0.306955648958683, - "step": 4800 - }, - { - "epoch": 0.23181126699345617, - "eval_runtime": 7.7829, - "eval_samples_per_second": 379.549, - "eval_steps_per_second": 23.77, - "step": 4800 - }, - { - "epoch": 0.23229420713302587, - "grad_norm": 2.42768862130812, - "learning_rate": 1.8952715622371183e-05, - "loss": 3.9781, - "mean_token_accuracy": 0.3152217775583267, - "step": 4810 - }, - { - "epoch": 0.23277714727259557, - "grad_norm": 2.3453972872455546, - "learning_rate": 1.8945192063614384e-05, - "loss": 4.1219, - "mean_token_accuracy": 0.29899193346500397, - "step": 4820 - }, - { - "epoch": 0.23326008741216525, - "grad_norm": 2.322289983616877, - "learning_rate": 1.893764308163978e-05, - "loss": 4.0668, - "mean_token_accuracy": 0.30332661271095274, - "step": 4830 - }, - { - "epoch": 0.23374302755173496, - "grad_norm": 2.3257425058685555, - "learning_rate": 1.8930068697902405e-05, - "loss": 3.9918, - "mean_token_accuracy": 0.30614919513463973, - "step": 4840 - }, - { - "epoch": 0.23422596769130466, - "grad_norm": 2.453908435447591, - "learning_rate": 1.892246893392949e-05, - "loss": 3.9914, - "mean_token_accuracy": 0.31834677755832674, - "step": 4850 - }, - { - "epoch": 0.23470890783087436, - "grad_norm": 2.304456121714858, - "learning_rate": 1.89148438113204e-05, - "loss": 3.9898, - "mean_token_accuracy": 0.3147177428007126, - "step": 4860 - }, - { - "epoch": 0.23519184797044407, - "grad_norm": 2.317812799182742, - "learning_rate": 1.8907193351746567e-05, - "loss": 4.0, - "mean_token_accuracy": 0.3084677428007126, - "step": 4870 - }, - { - "epoch": 0.23567478811001377, - "grad_norm": 2.416255377936248, - "learning_rate": 1.8899517576951438e-05, - "loss": 4.0121, - "mean_token_accuracy": 0.31360886693000795, - "step": 4880 - }, - { - "epoch": 0.23615772824958348, - "grad_norm": 2.377659782204465, - "learning_rate": 1.889181650875041e-05, - "loss": 3.9875, - "mean_token_accuracy": 0.31219758093357086, - "step": 4890 - }, - { - "epoch": 0.23664066838915315, - "grad_norm": 2.409949073682637, - "learning_rate": 1.888409016903076e-05, - "loss": 4.082, - "mean_token_accuracy": 0.30181451737880705, - "step": 4900 - }, - { - "epoch": 0.23664066838915315, - "eval_runtime": 7.7906, - "eval_samples_per_second": 379.177, - "eval_steps_per_second": 23.747, - "step": 4900 - }, - { - "epoch": 0.23712360852872286, - "grad_norm": 2.5229186101860743, - "learning_rate": 1.8876338579751604e-05, - "loss": 3.9629, - "mean_token_accuracy": 0.31350806504487994, - "step": 4910 - }, - { - "epoch": 0.23760654866829256, - "grad_norm": 2.4904309689203314, - "learning_rate": 1.8868561762943796e-05, - "loss": 4.059, - "mean_token_accuracy": 0.30635080486536026, - "step": 4920 - }, - { - "epoch": 0.23808948880786227, - "grad_norm": 2.5161866260229986, - "learning_rate": 1.886075974070991e-05, - "loss": 4.0559, - "mean_token_accuracy": 0.3110887065529823, - "step": 4930 - }, - { - "epoch": 0.23857242894743197, - "grad_norm": 2.3755953881544096, - "learning_rate": 1.8852932535224152e-05, - "loss": 4.0434, - "mean_token_accuracy": 0.30453629046678543, - "step": 4940 - }, - { - "epoch": 0.23905536908700167, - "grad_norm": 2.2664228117675336, - "learning_rate": 1.884508016873229e-05, - "loss": 3.9973, - "mean_token_accuracy": 0.31199596971273424, - "step": 4950 - }, - { - "epoch": 0.23953830922657138, - "grad_norm": 2.333137006574579, - "learning_rate": 1.8837202663551623e-05, - "loss": 4.0246, - "mean_token_accuracy": 0.3083669349551201, - "step": 4960 - }, - { - "epoch": 0.24002124936614105, - "grad_norm": 2.311318342982302, - "learning_rate": 1.882930004207088e-05, - "loss": 3.9625, - "mean_token_accuracy": 0.3132056400179863, - "step": 4970 - }, - { - "epoch": 0.24050418950571076, - "grad_norm": 2.302982732308885, - "learning_rate": 1.8821372326750175e-05, - "loss": 4.0613, - "mean_token_accuracy": 0.30715726017951966, - "step": 4980 - }, - { - "epoch": 0.24098712964528046, - "grad_norm": 2.394695239765169, - "learning_rate": 1.881341954012095e-05, - "loss": 4.0289, - "mean_token_accuracy": 0.3076612904667854, - "step": 4990 - }, - { - "epoch": 0.24147006978485017, - "grad_norm": 2.4550501235440985, - "learning_rate": 1.88054417047859e-05, - "loss": 4.0441, - "mean_token_accuracy": 0.30292338579893113, - "step": 5000 - }, - { - "epoch": 0.24147006978485017, - "eval_runtime": 7.8018, - "eval_samples_per_second": 378.632, - "eval_steps_per_second": 23.713, - "step": 5000 - }, - { - "epoch": 0.24195300992441987, - "grad_norm": 2.5326888838733743, - "learning_rate": 1.8797438843418906e-05, - "loss": 4.0234, - "mean_token_accuracy": 0.3087701633572578, - "step": 5010 - }, - { - "epoch": 0.24243595006398957, - "grad_norm": 2.363472762144466, - "learning_rate": 1.8789410978764972e-05, - "loss": 4.0305, - "mean_token_accuracy": 0.30917338728904725, - "step": 5020 - }, - { - "epoch": 0.24291889020355928, - "grad_norm": 2.5754690547802315, - "learning_rate": 1.878135813364018e-05, - "loss": 4.1043, - "mean_token_accuracy": 0.29858870804309845, - "step": 5030 - }, - { - "epoch": 0.24340183034312896, - "grad_norm": 2.473461141961251, - "learning_rate": 1.87732803309316e-05, - "loss": 4.0371, - "mean_token_accuracy": 0.3082661285996437, - "step": 5040 - }, - { - "epoch": 0.24388477048269866, - "grad_norm": 2.3109907906029945, - "learning_rate": 1.8765177593597225e-05, - "loss": 4.0055, - "mean_token_accuracy": 0.30252016335725784, - "step": 5050 - }, - { - "epoch": 0.24436771062226836, - "grad_norm": 2.490930908371213, - "learning_rate": 1.875704994466593e-05, - "loss": 4.1031, - "mean_token_accuracy": 0.30524193644523623, - "step": 5060 - }, - { - "epoch": 0.24485065076183807, - "grad_norm": 2.3890853308173745, - "learning_rate": 1.874889740723739e-05, - "loss": 4.0176, - "mean_token_accuracy": 0.3140120953321457, - "step": 5070 - }, - { - "epoch": 0.24533359090140777, - "grad_norm": 2.310256812584567, - "learning_rate": 1.8740720004482003e-05, - "loss": 3.9996, - "mean_token_accuracy": 0.31270160973072053, - "step": 5080 - }, - { - "epoch": 0.24581653104097748, - "grad_norm": 2.3601953691482374, - "learning_rate": 1.873251775964085e-05, - "loss": 4.0211, - "mean_token_accuracy": 0.3055443540215492, - "step": 5090 - }, - { - "epoch": 0.24629947118054718, - "grad_norm": 2.493184998826223, - "learning_rate": 1.8724290696025606e-05, - "loss": 4.0047, - "mean_token_accuracy": 0.3021169379353523, - "step": 5100 - }, - { - "epoch": 0.24629947118054718, - "eval_runtime": 7.7994, - "eval_samples_per_second": 378.746, - "eval_steps_per_second": 23.72, - "step": 5100 - }, - { - "epoch": 0.24678241132011688, - "grad_norm": 2.3883737233663416, - "learning_rate": 1.8716038837018496e-05, - "loss": 3.9617, - "mean_token_accuracy": 0.3137096792459488, - "step": 5110 - }, - { - "epoch": 0.24726535145968656, - "grad_norm": 2.407067649736428, - "learning_rate": 1.8707762206072203e-05, - "loss": 4.0223, - "mean_token_accuracy": 0.3102822542190552, - "step": 5120 - }, - { - "epoch": 0.24774829159925627, - "grad_norm": 2.331924706482883, - "learning_rate": 1.8699460826709828e-05, - "loss": 3.9387, - "mean_token_accuracy": 0.31340725868940356, - "step": 5130 - }, - { - "epoch": 0.24823123173882597, - "grad_norm": 2.398068312035197, - "learning_rate": 1.8691134722524794e-05, - "loss": 4.0398, - "mean_token_accuracy": 0.3116935476660728, - "step": 5140 - }, - { - "epoch": 0.24871417187839567, - "grad_norm": 2.341430158591099, - "learning_rate": 1.8682783917180808e-05, - "loss": 4.0211, - "mean_token_accuracy": 0.31653226017951963, - "step": 5150 - }, - { - "epoch": 0.24919711201796538, - "grad_norm": 2.1777730832714246, - "learning_rate": 1.8674408434411778e-05, - "loss": 3.9914, - "mean_token_accuracy": 0.31169354915618896, - "step": 5160 - }, - { - "epoch": 0.24968005215753508, - "grad_norm": 2.368915441612645, - "learning_rate": 1.8666008298021738e-05, - "loss": 3.9723, - "mean_token_accuracy": 0.31290322840213775, - "step": 5170 - }, - { - "epoch": 0.25016299229710476, - "grad_norm": 2.4409459468408214, - "learning_rate": 1.8657583531884804e-05, - "loss": 4.0809, - "mean_token_accuracy": 0.3044354856014252, - "step": 5180 - }, - { - "epoch": 0.2506459324366745, - "grad_norm": 2.2763184014052045, - "learning_rate": 1.8649134159945083e-05, - "loss": 3.9469, - "mean_token_accuracy": 0.3229838714003563, - "step": 5190 - }, - { - "epoch": 0.25112887257624417, - "grad_norm": 2.3311617057759557, - "learning_rate": 1.8640660206216622e-05, - "loss": 4.0914, - "mean_token_accuracy": 0.30272177457809446, - "step": 5200 - }, - { - "epoch": 0.25112887257624417, - "eval_runtime": 7.7925, - "eval_samples_per_second": 379.084, - "eval_steps_per_second": 23.741, - "step": 5200 - }, - { - "epoch": 0.2516118127158139, - "grad_norm": 2.309320261908768, - "learning_rate": 1.863216169478332e-05, - "loss": 4.0309, - "mean_token_accuracy": 0.3045362874865532, - "step": 5210 - }, - { - "epoch": 0.2520947528553836, - "grad_norm": 2.3046826489539063, - "learning_rate": 1.8623638649798886e-05, - "loss": 4.0602, - "mean_token_accuracy": 0.31320564448833466, - "step": 5220 - }, - { - "epoch": 0.25257769299495325, - "grad_norm": 2.3767757884245615, - "learning_rate": 1.8615091095486745e-05, - "loss": 4.1125, - "mean_token_accuracy": 0.29727822095155715, - "step": 5230 - }, - { - "epoch": 0.253060633134523, - "grad_norm": 2.3274956242869926, - "learning_rate": 1.860651905613999e-05, - "loss": 3.993, - "mean_token_accuracy": 0.30947580337524416, - "step": 5240 - }, - { - "epoch": 0.25354357327409266, - "grad_norm": 2.3604895458445903, - "learning_rate": 1.859792255612129e-05, - "loss": 4.0355, - "mean_token_accuracy": 0.305544351041317, - "step": 5250 - }, - { - "epoch": 0.2540265134136624, - "grad_norm": 2.340736150567122, - "learning_rate": 1.8589301619862852e-05, - "loss": 3.984, - "mean_token_accuracy": 0.30967742055654524, - "step": 5260 - }, - { - "epoch": 0.25450945355323207, - "grad_norm": 2.5036716573170503, - "learning_rate": 1.8580656271866317e-05, - "loss": 4.0289, - "mean_token_accuracy": 0.3067540302872658, - "step": 5270 - }, - { - "epoch": 0.2549923936928018, - "grad_norm": 2.603590775132617, - "learning_rate": 1.857198653670271e-05, - "loss": 4.0816, - "mean_token_accuracy": 0.29989919513463975, - "step": 5280 - }, - { - "epoch": 0.2554753338323715, - "grad_norm": 2.3112711829783756, - "learning_rate": 1.8563292439012376e-05, - "loss": 4.0676, - "mean_token_accuracy": 0.3093749970197678, - "step": 5290 - }, - { - "epoch": 0.25595827397194115, - "grad_norm": 2.399626459631329, - "learning_rate": 1.8554574003504893e-05, - "loss": 4.0102, - "mean_token_accuracy": 0.30700733959674836, - "step": 5300 - }, - { - "epoch": 0.25595827397194115, - "eval_runtime": 7.7952, - "eval_samples_per_second": 378.951, - "eval_steps_per_second": 23.733, - "step": 5300 - }, - { - "epoch": 0.2564412141115109, - "grad_norm": 2.3447856428896325, - "learning_rate": 1.8545831254959014e-05, - "loss": 4.0438, - "mean_token_accuracy": 0.30171371102333067, - "step": 5310 - }, - { - "epoch": 0.25692415425108056, - "grad_norm": 2.367630972367812, - "learning_rate": 1.8537064218222586e-05, - "loss": 4.0352, - "mean_token_accuracy": 0.30766129195690156, - "step": 5320 - }, - { - "epoch": 0.2574070943906503, - "grad_norm": 2.3729306020193657, - "learning_rate": 1.8528272918212487e-05, - "loss": 4.0512, - "mean_token_accuracy": 0.3099798411130905, - "step": 5330 - }, - { - "epoch": 0.25789003453021997, - "grad_norm": 2.314830670765685, - "learning_rate": 1.851945737991457e-05, - "loss": 3.9586, - "mean_token_accuracy": 0.3130040317773819, - "step": 5340 - }, - { - "epoch": 0.2583729746697897, - "grad_norm": 2.4771361605823046, - "learning_rate": 1.8510617628383544e-05, - "loss": 3.9879, - "mean_token_accuracy": 0.31411290168762207, - "step": 5350 - }, - { - "epoch": 0.2588559148093594, - "grad_norm": 2.332880643688743, - "learning_rate": 1.850175368874297e-05, - "loss": 4.0176, - "mean_token_accuracy": 0.3073588699102402, - "step": 5360 - }, - { - "epoch": 0.25933885494892905, - "grad_norm": 2.279526720713524, - "learning_rate": 1.8492865586185127e-05, - "loss": 4.0496, - "mean_token_accuracy": 0.30866935551166536, - "step": 5370 - }, - { - "epoch": 0.2598217950884988, - "grad_norm": 2.2715043499782044, - "learning_rate": 1.8483953345970983e-05, - "loss": 4.0449, - "mean_token_accuracy": 0.2966733857989311, - "step": 5380 - }, - { - "epoch": 0.26030473522806846, - "grad_norm": 2.417856469263586, - "learning_rate": 1.8475016993430102e-05, - "loss": 4.0316, - "mean_token_accuracy": 0.3053427442908287, - "step": 5390 - }, - { - "epoch": 0.2607876753676382, - "grad_norm": 2.3939657066886317, - "learning_rate": 1.8466056553960576e-05, - "loss": 3.9688, - "mean_token_accuracy": 0.31421370804309845, - "step": 5400 - }, - { - "epoch": 0.2607876753676382, - "eval_runtime": 7.7982, - "eval_samples_per_second": 378.807, - "eval_steps_per_second": 23.723, - "step": 5400 - }, - { - "epoch": 0.26127061550720787, - "grad_norm": 2.3629090392686325, - "learning_rate": 1.8457072053028962e-05, - "loss": 3.9867, - "mean_token_accuracy": 0.31522177457809447, - "step": 5410 - }, - { - "epoch": 0.2617535556467776, - "grad_norm": 2.3251406340563907, - "learning_rate": 1.84480635161702e-05, - "loss": 3.9727, - "mean_token_accuracy": 0.3161290377378464, - "step": 5420 - }, - { - "epoch": 0.2622364957863473, - "grad_norm": 2.337857664117226, - "learning_rate": 1.843903096898753e-05, - "loss": 3.9801, - "mean_token_accuracy": 0.3078629061579704, - "step": 5430 - }, - { - "epoch": 0.26271943592591696, - "grad_norm": 2.4673230321655577, - "learning_rate": 1.8429974437152454e-05, - "loss": 4.0332, - "mean_token_accuracy": 0.31491935551166533, - "step": 5440 - }, - { - "epoch": 0.2632023760654867, - "grad_norm": 2.1333952726018865, - "learning_rate": 1.8420893946404623e-05, - "loss": 4.0789, - "mean_token_accuracy": 0.3047379031777382, - "step": 5450 - }, - { - "epoch": 0.26368531620505636, - "grad_norm": 2.3968585259124864, - "learning_rate": 1.841178952255179e-05, - "loss": 3.943, - "mean_token_accuracy": 0.31602822691202165, - "step": 5460 - }, - { - "epoch": 0.2641682563446261, - "grad_norm": 2.3898328278252765, - "learning_rate": 1.840266119146973e-05, - "loss": 4.0254, - "mean_token_accuracy": 0.31471773982048035, - "step": 5470 - }, - { - "epoch": 0.26465119648419577, - "grad_norm": 2.3747369956171855, - "learning_rate": 1.8393508979102163e-05, - "loss": 3.9707, - "mean_token_accuracy": 0.31129032373428345, - "step": 5480 - }, - { - "epoch": 0.2651341366237655, - "grad_norm": 2.3128205536022657, - "learning_rate": 1.8384332911460672e-05, - "loss": 3.9594, - "mean_token_accuracy": 0.3176411300897598, - "step": 5490 - }, - { - "epoch": 0.2656170767633352, - "grad_norm": 2.379800505518838, - "learning_rate": 1.8375133014624654e-05, - "loss": 3.966, - "mean_token_accuracy": 0.3194556459784508, - "step": 5500 - }, - { - "epoch": 0.2656170767633352, - "eval_runtime": 7.7902, - "eval_samples_per_second": 379.196, - "eval_steps_per_second": 23.748, - "step": 5500 - }, - { - "epoch": 0.2661000169029049, - "grad_norm": 2.3470868818778694, - "learning_rate": 1.8365909314741232e-05, - "loss": 4.0016, - "mean_token_accuracy": 0.30826612561941147, - "step": 5510 - }, - { - "epoch": 0.2665829570424746, - "grad_norm": 2.3216580874961776, - "learning_rate": 1.8356661838025162e-05, - "loss": 4.0672, - "mean_token_accuracy": 0.3048387080430984, - "step": 5520 - }, - { - "epoch": 0.26706589718204427, - "grad_norm": 2.317526685490564, - "learning_rate": 1.8347390610758798e-05, - "loss": 4.0211, - "mean_token_accuracy": 0.3072580650448799, - "step": 5530 - }, - { - "epoch": 0.267548837321614, - "grad_norm": 2.364153300933969, - "learning_rate": 1.833809565929198e-05, - "loss": 3.9344, - "mean_token_accuracy": 0.3083669379353523, - "step": 5540 - }, - { - "epoch": 0.2680317774611837, - "grad_norm": 2.3604797273813602, - "learning_rate": 1.832877701004198e-05, - "loss": 4.0441, - "mean_token_accuracy": 0.30907257944345473, - "step": 5550 - }, - { - "epoch": 0.2685147176007534, - "grad_norm": 2.3251109589491397, - "learning_rate": 1.8319434689493424e-05, - "loss": 4.0059, - "mean_token_accuracy": 0.3059475839138031, - "step": 5560 - }, - { - "epoch": 0.2689976577403231, - "grad_norm": 2.2868448114703916, - "learning_rate": 1.8310068724198213e-05, - "loss": 3.9762, - "mean_token_accuracy": 0.3176411271095276, - "step": 5570 - }, - { - "epoch": 0.2694805978798928, - "grad_norm": 2.4192872652574082, - "learning_rate": 1.830067914077545e-05, - "loss": 4.0133, - "mean_token_accuracy": 0.30695564597845076, - "step": 5580 - }, - { - "epoch": 0.2699635380194625, - "grad_norm": 2.3705732549526077, - "learning_rate": 1.8291265965911358e-05, - "loss": 3.9937, - "mean_token_accuracy": 0.3160282254219055, - "step": 5590 - }, - { - "epoch": 0.27044647815903217, - "grad_norm": 2.329064013791676, - "learning_rate": 1.8281829226359216e-05, - "loss": 3.9742, - "mean_token_accuracy": 0.3175403237342834, - "step": 5600 - }, - { - "epoch": 0.27044647815903217, - "eval_runtime": 7.7957, - "eval_samples_per_second": 378.927, - "eval_steps_per_second": 23.731, - "step": 5600 - }, - { - "epoch": 0.2709294182986019, - "grad_norm": 2.3676441570772373, - "learning_rate": 1.827236894893927e-05, - "loss": 3.991, - "mean_token_accuracy": 0.3101814493536949, - "step": 5610 - }, - { - "epoch": 0.2714123584381716, - "grad_norm": 2.275190405273513, - "learning_rate": 1.8262885160538676e-05, - "loss": 4.0, - "mean_token_accuracy": 0.3052419304847717, - "step": 5620 - }, - { - "epoch": 0.2718952985777413, - "grad_norm": 2.3252384232163097, - "learning_rate": 1.825337788811139e-05, - "loss": 4.0281, - "mean_token_accuracy": 0.3046370968222618, - "step": 5630 - }, - { - "epoch": 0.272378238717311, - "grad_norm": 2.3927840022508207, - "learning_rate": 1.8243847158678133e-05, - "loss": 4.0023, - "mean_token_accuracy": 0.31330645009875296, - "step": 5640 - }, - { - "epoch": 0.2728611788568807, - "grad_norm": 2.334558087459042, - "learning_rate": 1.8234292999326277e-05, - "loss": 4.0418, - "mean_token_accuracy": 0.3017137125134468, - "step": 5650 - }, - { - "epoch": 0.2733441189964504, - "grad_norm": 2.2926672240841426, - "learning_rate": 1.8224715437209798e-05, - "loss": 4.0664, - "mean_token_accuracy": 0.3090725809335709, - "step": 5660 - }, - { - "epoch": 0.27382705913602007, - "grad_norm": 2.2422588275577477, - "learning_rate": 1.8215114499549176e-05, - "loss": 4.0184, - "mean_token_accuracy": 0.30625, - "step": 5670 - }, - { - "epoch": 0.2743099992755898, - "grad_norm": 2.2692370292422153, - "learning_rate": 1.8205490213631328e-05, - "loss": 4.0055, - "mean_token_accuracy": 0.30907257944345473, - "step": 5680 - }, - { - "epoch": 0.2747929394151595, - "grad_norm": 2.3184674910987595, - "learning_rate": 1.8195842606809536e-05, - "loss": 3.9398, - "mean_token_accuracy": 0.31834677755832674, - "step": 5690 - }, - { - "epoch": 0.2752758795547292, - "grad_norm": 2.4104798829582395, - "learning_rate": 1.8186171706503354e-05, - "loss": 4.0023, - "mean_token_accuracy": 0.3135080635547638, - "step": 5700 - }, - { - "epoch": 0.2752758795547292, - "eval_runtime": 7.8, - "eval_samples_per_second": 378.718, - "eval_steps_per_second": 23.718, - "step": 5700 - }, - { - "epoch": 0.2757588196942989, - "grad_norm": 2.771761647922274, - "learning_rate": 1.8176477540198547e-05, - "loss": 4.0348, - "mean_token_accuracy": 0.30937499850988387, - "step": 5710 - }, - { - "epoch": 0.2762417598338686, - "grad_norm": 2.3544110496302055, - "learning_rate": 1.816676013544699e-05, - "loss": 3.9828, - "mean_token_accuracy": 0.3074596792459488, - "step": 5720 - }, - { - "epoch": 0.2767246999734383, - "grad_norm": 2.3375642096045075, - "learning_rate": 1.815701951986662e-05, - "loss": 3.9117, - "mean_token_accuracy": 0.31118951439857484, - "step": 5730 - }, - { - "epoch": 0.27720764011300797, - "grad_norm": 2.3809022847712193, - "learning_rate": 1.814725572114134e-05, - "loss": 4.0848, - "mean_token_accuracy": 0.29979838281869886, - "step": 5740 - }, - { - "epoch": 0.2776905802525777, - "grad_norm": 2.300537379180946, - "learning_rate": 1.813746876702093e-05, - "loss": 4.025, - "mean_token_accuracy": 0.305745966732502, - "step": 5750 - }, - { - "epoch": 0.2781735203921474, - "grad_norm": 2.3679605419463248, - "learning_rate": 1.8127658685320996e-05, - "loss": 3.9797, - "mean_token_accuracy": 0.31431451737880706, - "step": 5760 - }, - { - "epoch": 0.2786564605317171, - "grad_norm": 2.219694442047168, - "learning_rate": 1.8117825503922858e-05, - "loss": 4.0605, - "mean_token_accuracy": 0.309879033267498, - "step": 5770 - }, - { - "epoch": 0.2791394006712868, - "grad_norm": 2.457312995322759, - "learning_rate": 1.81079692507735e-05, - "loss": 4.0773, - "mean_token_accuracy": 0.30645161420106887, - "step": 5780 - }, - { - "epoch": 0.2796223408108565, - "grad_norm": 2.376516180002499, - "learning_rate": 1.809808995388548e-05, - "loss": 4.1172, - "mean_token_accuracy": 0.29667338728904724, - "step": 5790 - }, - { - "epoch": 0.2801052809504262, - "grad_norm": 2.34405592895596, - "learning_rate": 1.8088187641336846e-05, - "loss": 3.9707, - "mean_token_accuracy": 0.3206653207540512, - "step": 5800 - }, - { - "epoch": 0.2801052809504262, - "eval_runtime": 7.8092, - "eval_samples_per_second": 378.27, - "eval_steps_per_second": 23.69, - "step": 5800 - }, - { - "epoch": 0.28058822108999587, - "grad_norm": 2.4583421902969524, - "learning_rate": 1.8078262341271044e-05, - "loss": 3.9945, - "mean_token_accuracy": 0.31401209980249406, - "step": 5810 - }, - { - "epoch": 0.2810711612295656, - "grad_norm": 2.3363306807246365, - "learning_rate": 1.8068314081896877e-05, - "loss": 3.9973, - "mean_token_accuracy": 0.30897177159786227, - "step": 5820 - }, - { - "epoch": 0.2815541013691353, - "grad_norm": 2.448743622770045, - "learning_rate": 1.8058342891488392e-05, - "loss": 4.0687, - "mean_token_accuracy": 0.3038306459784508, - "step": 5830 - }, - { - "epoch": 0.282037041508705, - "grad_norm": 2.3124185813096902, - "learning_rate": 1.8048348798384802e-05, - "loss": 3.9844, - "mean_token_accuracy": 0.31068548411130903, - "step": 5840 - }, - { - "epoch": 0.2825199816482747, - "grad_norm": 2.273467667922356, - "learning_rate": 1.8038331830990416e-05, - "loss": 4.0, - "mean_token_accuracy": 0.3031250014901161, - "step": 5850 - }, - { - "epoch": 0.2830029217878444, - "grad_norm": 2.3418750143560194, - "learning_rate": 1.8028292017774556e-05, - "loss": 3.9664, - "mean_token_accuracy": 0.3056451566517353, - "step": 5860 - }, - { - "epoch": 0.2834858619274141, - "grad_norm": 2.4395724436683146, - "learning_rate": 1.8018229387271472e-05, - "loss": 4.0223, - "mean_token_accuracy": 0.3108870968222618, - "step": 5870 - }, - { - "epoch": 0.28396880206698377, - "grad_norm": 2.3306774837969146, - "learning_rate": 1.8008143968080273e-05, - "loss": 3.9578, - "mean_token_accuracy": 0.31622983813285827, - "step": 5880 - }, - { - "epoch": 0.2844517422065535, - "grad_norm": 2.3061022402294635, - "learning_rate": 1.7998035788864815e-05, - "loss": 3.9645, - "mean_token_accuracy": 0.3126008063554764, - "step": 5890 - }, - { - "epoch": 0.2849346823461232, - "grad_norm": 2.350985131599099, - "learning_rate": 1.798790487835366e-05, - "loss": 3.9238, - "mean_token_accuracy": 0.3196572557091713, - "step": 5900 - }, - { - "epoch": 0.2849346823461232, - "eval_runtime": 7.7908, - "eval_samples_per_second": 379.167, - "eval_steps_per_second": 23.746, - "step": 5900 - }, - { - "epoch": 0.2854176224856929, - "grad_norm": 2.301420782412212, - "learning_rate": 1.7977751265339967e-05, - "loss": 4.068, - "mean_token_accuracy": 0.30937499850988387, - "step": 5910 - }, - { - "epoch": 0.2859005626252626, - "grad_norm": 2.2861918585040044, - "learning_rate": 1.796757497868142e-05, - "loss": 3.9934, - "mean_token_accuracy": 0.30947580486536025, - "step": 5920 - }, - { - "epoch": 0.2863835027648323, - "grad_norm": 2.3410887630206685, - "learning_rate": 1.7957376047300135e-05, - "loss": 3.9977, - "mean_token_accuracy": 0.3123991936445236, - "step": 5930 - }, - { - "epoch": 0.286866442904402, - "grad_norm": 2.3398464731768764, - "learning_rate": 1.7947154500182605e-05, - "loss": 4.0187, - "mean_token_accuracy": 0.3138104796409607, - "step": 5940 - }, - { - "epoch": 0.2873493830439717, - "grad_norm": 2.2449216061438557, - "learning_rate": 1.7936910366379587e-05, - "loss": 3.9395, - "mean_token_accuracy": 0.32268145233392714, - "step": 5950 - }, - { - "epoch": 0.2878323231835414, - "grad_norm": 2.318184758343338, - "learning_rate": 1.7926643675006027e-05, - "loss": 4.0031, - "mean_token_accuracy": 0.30221773982048034, - "step": 5960 - }, - { - "epoch": 0.2883152633231111, - "grad_norm": 2.2948538362379534, - "learning_rate": 1.7916354455241e-05, - "loss": 3.9879, - "mean_token_accuracy": 0.3088709682226181, - "step": 5970 - }, - { - "epoch": 0.2887982034626808, - "grad_norm": 2.299339809625044, - "learning_rate": 1.7906042736327583e-05, - "loss": 3.9789, - "mean_token_accuracy": 0.3131048396229744, - "step": 5980 - }, - { - "epoch": 0.2892811436022505, - "grad_norm": 2.3323406552667016, - "learning_rate": 1.7895708547572828e-05, - "loss": 4.0512, - "mean_token_accuracy": 0.3063508063554764, - "step": 5990 - }, - { - "epoch": 0.2897640837418202, - "grad_norm": 2.366436181394848, - "learning_rate": 1.7885351918347625e-05, - "loss": 3.993, - "mean_token_accuracy": 0.31149193793535235, - "step": 6000 - }, - { - "epoch": 0.2897640837418202, - "eval_runtime": 7.8014, - "eval_samples_per_second": 378.652, - "eval_steps_per_second": 23.714, - "step": 6000 - }, - { - "epoch": 0.2902470238813899, - "grad_norm": 2.3568261989143098, - "learning_rate": 1.7874972878086653e-05, - "loss": 3.9637, - "mean_token_accuracy": 0.31441532224416735, - "step": 6010 - }, - { - "epoch": 0.2907299640209596, - "grad_norm": 2.3092386997302032, - "learning_rate": 1.7864571456288286e-05, - "loss": 3.9551, - "mean_token_accuracy": 0.3186491966247559, - "step": 6020 - }, - { - "epoch": 0.2912129041605293, - "grad_norm": 2.425376774244814, - "learning_rate": 1.7854147682514505e-05, - "loss": 3.9777, - "mean_token_accuracy": 0.3087701633572578, - "step": 6030 - }, - { - "epoch": 0.291695844300099, - "grad_norm": 2.1874338268109907, - "learning_rate": 1.7843701586390815e-05, - "loss": 4.0047, - "mean_token_accuracy": 0.3167338714003563, - "step": 6040 - }, - { - "epoch": 0.2921787844396687, - "grad_norm": 2.2979702515249962, - "learning_rate": 1.783323319760618e-05, - "loss": 4.0727, - "mean_token_accuracy": 0.3062499985098839, - "step": 6050 - }, - { - "epoch": 0.2926617245792384, - "grad_norm": 2.3467958301553713, - "learning_rate": 1.7822742545912897e-05, - "loss": 3.9926, - "mean_token_accuracy": 0.31118951439857484, - "step": 6060 - }, - { - "epoch": 0.2931446647188081, - "grad_norm": 2.160488858973282, - "learning_rate": 1.7812229661126554e-05, - "loss": 4.0066, - "mean_token_accuracy": 0.3072580650448799, - "step": 6070 - }, - { - "epoch": 0.2936276048583778, - "grad_norm": 2.359186013504172, - "learning_rate": 1.7801694573125927e-05, - "loss": 4.0707, - "mean_token_accuracy": 0.3089717760682106, - "step": 6080 - }, - { - "epoch": 0.29411054499794753, - "grad_norm": 2.273310488578305, - "learning_rate": 1.779113731185289e-05, - "loss": 3.9664, - "mean_token_accuracy": 0.3099798396229744, - "step": 6090 - }, - { - "epoch": 0.2945934851375172, - "grad_norm": 2.313191289463342, - "learning_rate": 1.7780557907312338e-05, - "loss": 3.95, - "mean_token_accuracy": 0.3112903207540512, - "step": 6100 - }, - { - "epoch": 0.2945934851375172, - "eval_runtime": 7.797, - "eval_samples_per_second": 378.861, - "eval_steps_per_second": 23.727, - "step": 6100 - }, - { - "epoch": 0.2950764252770869, - "grad_norm": 2.4133379570023874, - "learning_rate": 1.7769956389572103e-05, - "loss": 3.9832, - "mean_token_accuracy": 0.31421370804309845, - "step": 6110 - }, - { - "epoch": 0.2955593654166566, - "grad_norm": 2.4454949190189743, - "learning_rate": 1.775933278876286e-05, - "loss": 3.9859, - "mean_token_accuracy": 0.31350806802511216, - "step": 6120 - }, - { - "epoch": 0.2960423055562263, - "grad_norm": 2.3296075809571026, - "learning_rate": 1.7748687135078048e-05, - "loss": 3.9578, - "mean_token_accuracy": 0.32026209533214567, - "step": 6130 - }, - { - "epoch": 0.296525245695796, - "grad_norm": 2.396511974580495, - "learning_rate": 1.773801945877378e-05, - "loss": 4.002, - "mean_token_accuracy": 0.3178427442908287, - "step": 6140 - }, - { - "epoch": 0.2970081858353657, - "grad_norm": 2.203523945577575, - "learning_rate": 1.772732979016877e-05, - "loss": 3.9719, - "mean_token_accuracy": 0.3086693525314331, - "step": 6150 - }, - { - "epoch": 0.29749112597493543, - "grad_norm": 2.360976073490464, - "learning_rate": 1.7716618159644228e-05, - "loss": 4.0012, - "mean_token_accuracy": 0.3068548411130905, - "step": 6160 - }, - { - "epoch": 0.2979740661145051, - "grad_norm": 2.3598974847464174, - "learning_rate": 1.7705884597643783e-05, - "loss": 3.9164, - "mean_token_accuracy": 0.3168346807360649, - "step": 6170 - }, - { - "epoch": 0.2984570062540748, - "grad_norm": 2.398539588723626, - "learning_rate": 1.7695129134673396e-05, - "loss": 3.9957, - "mean_token_accuracy": 0.3072580620646477, - "step": 6180 - }, - { - "epoch": 0.2989399463936445, - "grad_norm": 2.293329734876782, - "learning_rate": 1.768435180130127e-05, - "loss": 3.9508, - "mean_token_accuracy": 0.3196572601795197, - "step": 6190 - }, - { - "epoch": 0.2994228865332142, - "grad_norm": 2.174807099599974, - "learning_rate": 1.767355262815778e-05, - "loss": 3.9727, - "mean_token_accuracy": 0.3173387095332146, - "step": 6200 - }, - { - "epoch": 0.2994228865332142, - "eval_runtime": 7.8005, - "eval_samples_per_second": 378.695, - "eval_steps_per_second": 23.717, - "step": 6200 - }, - { - "epoch": 0.2999058266727839, - "grad_norm": 2.2990879033306793, - "learning_rate": 1.766273164593535e-05, - "loss": 4.0105, - "mean_token_accuracy": 0.3074596762657166, - "step": 6210 - }, - { - "epoch": 0.3003887668123536, - "grad_norm": 2.622382771294755, - "learning_rate": 1.76518888853884e-05, - "loss": 3.9035, - "mean_token_accuracy": 0.3196572601795197, - "step": 6220 - }, - { - "epoch": 0.30087170695192333, - "grad_norm": 2.2749746177018655, - "learning_rate": 1.7641024377333254e-05, - "loss": 4.025, - "mean_token_accuracy": 0.3048387080430984, - "step": 6230 - }, - { - "epoch": 0.301354647091493, - "grad_norm": 2.2327096640255313, - "learning_rate": 1.7630138152648036e-05, - "loss": 3.9844, - "mean_token_accuracy": 0.31300403326749804, - "step": 6240 - }, - { - "epoch": 0.3018375872310627, - "grad_norm": 2.227496928899117, - "learning_rate": 1.7619230242272586e-05, - "loss": 3.9461, - "mean_token_accuracy": 0.3133064553141594, - "step": 6250 - }, - { - "epoch": 0.3023205273706324, - "grad_norm": 2.3784554510990104, - "learning_rate": 1.760830067720838e-05, - "loss": 3.925, - "mean_token_accuracy": 0.3194556474685669, - "step": 6260 - }, - { - "epoch": 0.3028034675102021, - "grad_norm": 2.2579992858785003, - "learning_rate": 1.7597349488518452e-05, - "loss": 3.9316, - "mean_token_accuracy": 0.318548384308815, - "step": 6270 - }, - { - "epoch": 0.3032864076497718, - "grad_norm": 2.294492670101738, - "learning_rate": 1.7586376707327273e-05, - "loss": 4.0078, - "mean_token_accuracy": 0.3031249962747097, - "step": 6280 - }, - { - "epoch": 0.3037693477893415, - "grad_norm": 2.2467390160866145, - "learning_rate": 1.7575382364820697e-05, - "loss": 3.9383, - "mean_token_accuracy": 0.3201612919569016, - "step": 6290 - }, - { - "epoch": 0.30425228792891124, - "grad_norm": 2.301583546317634, - "learning_rate": 1.756436649224585e-05, - "loss": 4.0238, - "mean_token_accuracy": 0.30715725272893907, - "step": 6300 - }, - { - "epoch": 0.30425228792891124, - "eval_runtime": 7.8096, - "eval_samples_per_second": 378.254, - "eval_steps_per_second": 23.689, - "step": 6300 - }, - { - "epoch": 0.3047352280684809, - "grad_norm": 2.282605851373963, - "learning_rate": 1.7553329120911052e-05, - "loss": 3.9641, - "mean_token_accuracy": 0.3104838699102402, - "step": 6310 - }, - { - "epoch": 0.3052181682080506, - "grad_norm": 2.273058203677823, - "learning_rate": 1.7542270282185724e-05, - "loss": 3.9902, - "mean_token_accuracy": 0.31068548560142517, - "step": 6320 - }, - { - "epoch": 0.3057011083476203, - "grad_norm": 2.3399972661739454, - "learning_rate": 1.75311900075003e-05, - "loss": 3.8977, - "mean_token_accuracy": 0.31441532373428344, - "step": 6330 - }, - { - "epoch": 0.30618404848719, - "grad_norm": 2.2333003307684054, - "learning_rate": 1.7520088328346138e-05, - "loss": 3.9914, - "mean_token_accuracy": 0.3073588743805885, - "step": 6340 - }, - { - "epoch": 0.30666698862675973, - "grad_norm": 2.3255225647707514, - "learning_rate": 1.7508965276275424e-05, - "loss": 4.0156, - "mean_token_accuracy": 0.31048387438058855, - "step": 6350 - }, - { - "epoch": 0.3071499287663294, - "grad_norm": 2.243317344334465, - "learning_rate": 1.7497820882901098e-05, - "loss": 4.0066, - "mean_token_accuracy": 0.3132056474685669, - "step": 6360 - }, - { - "epoch": 0.30763286890589914, - "grad_norm": 2.2924556285616275, - "learning_rate": 1.7486655179896747e-05, - "loss": 4.0406, - "mean_token_accuracy": 0.30181451588869096, - "step": 6370 - }, - { - "epoch": 0.3081158090454688, - "grad_norm": 2.3063926676659676, - "learning_rate": 1.7475468198996525e-05, - "loss": 3.923, - "mean_token_accuracy": 0.3242943540215492, - "step": 6380 - }, - { - "epoch": 0.3085987491850385, - "grad_norm": 2.391693480237752, - "learning_rate": 1.746425997199506e-05, - "loss": 3.8898, - "mean_token_accuracy": 0.31885080933570864, - "step": 6390 - }, - { - "epoch": 0.3090816893246082, - "grad_norm": 2.3597982085372484, - "learning_rate": 1.7453030530747364e-05, - "loss": 4.0312, - "mean_token_accuracy": 0.30544354766607285, - "step": 6400 - }, - { - "epoch": 0.3090816893246082, - "eval_runtime": 7.8112, - "eval_samples_per_second": 378.173, - "eval_steps_per_second": 23.684, - "step": 6400 - }, - { - "epoch": 0.3095646294641779, - "grad_norm": 2.194724301086377, - "learning_rate": 1.7441779907168745e-05, - "loss": 3.9488, - "mean_token_accuracy": 0.32197580933570863, - "step": 6410 - }, - { - "epoch": 0.31004756960374763, - "grad_norm": 2.522540882686619, - "learning_rate": 1.7430508133234702e-05, - "loss": 3.9648, - "mean_token_accuracy": 0.31048387140035627, - "step": 6420 - }, - { - "epoch": 0.3105305097433173, - "grad_norm": 2.3446865957219734, - "learning_rate": 1.741921524098086e-05, - "loss": 4.0629, - "mean_token_accuracy": 0.3073588714003563, - "step": 6430 - }, - { - "epoch": 0.31101344988288704, - "grad_norm": 2.267605067107876, - "learning_rate": 1.7407901262502855e-05, - "loss": 3.9762, - "mean_token_accuracy": 0.30846773982048037, - "step": 6440 - }, - { - "epoch": 0.3114963900224567, - "grad_norm": 2.2985752161065007, - "learning_rate": 1.739656622995626e-05, - "loss": 3.9434, - "mean_token_accuracy": 0.3181451603770256, - "step": 6450 - }, - { - "epoch": 0.3119793301620264, - "grad_norm": 2.22405506620997, - "learning_rate": 1.738521017555648e-05, - "loss": 4.0066, - "mean_token_accuracy": 0.3078629031777382, - "step": 6460 - }, - { - "epoch": 0.3124622703015961, - "grad_norm": 2.4051041058811533, - "learning_rate": 1.7373833131578667e-05, - "loss": 4.048, - "mean_token_accuracy": 0.3053427435457706, - "step": 6470 - }, - { - "epoch": 0.3129452104411658, - "grad_norm": 2.290105618837381, - "learning_rate": 1.7362435130357633e-05, - "loss": 4.0113, - "mean_token_accuracy": 0.3079637110233307, - "step": 6480 - }, - { - "epoch": 0.31342815058073553, - "grad_norm": 2.3770429874571852, - "learning_rate": 1.735101620428774e-05, - "loss": 3.9371, - "mean_token_accuracy": 0.31985886842012407, - "step": 6490 - }, - { - "epoch": 0.3139110907203052, - "grad_norm": 2.1643821886932604, - "learning_rate": 1.7339576385822837e-05, - "loss": 3.9945, - "mean_token_accuracy": 0.31401209682226183, - "step": 6500 - }, - { - "epoch": 0.3139110907203052, - "eval_runtime": 7.782, - "eval_samples_per_second": 379.593, - "eval_steps_per_second": 23.773, - "step": 6500 - }, - { - "epoch": 0.31439403085987494, - "grad_norm": 2.33468782646229, - "learning_rate": 1.7328115707476143e-05, - "loss": 3.9449, - "mean_token_accuracy": 0.3156249985098839, - "step": 6510 - }, - { - "epoch": 0.3148769709994446, - "grad_norm": 2.2244561010714414, - "learning_rate": 1.731663420182016e-05, - "loss": 3.9426, - "mean_token_accuracy": 0.3172379083931446, - "step": 6520 - }, - { - "epoch": 0.3153599111390143, - "grad_norm": 2.1989794630573445, - "learning_rate": 1.7305131901486594e-05, - "loss": 3.9879, - "mean_token_accuracy": 0.3126008093357086, - "step": 6530 - }, - { - "epoch": 0.315842851278584, - "grad_norm": 2.28891198117997, - "learning_rate": 1.729360883916624e-05, - "loss": 3.9672, - "mean_token_accuracy": 0.3138104841113091, - "step": 6540 - }, - { - "epoch": 0.3163257914181537, - "grad_norm": 2.3101288529255846, - "learning_rate": 1.7282065047608906e-05, - "loss": 3.9621, - "mean_token_accuracy": 0.31703629046678544, - "step": 6550 - }, - { - "epoch": 0.31680873155772343, - "grad_norm": 2.416214398465251, - "learning_rate": 1.7270500559623315e-05, - "loss": 4.0199, - "mean_token_accuracy": 0.3084677457809448, - "step": 6560 - }, - { - "epoch": 0.3172916716972931, - "grad_norm": 2.2385735416834502, - "learning_rate": 1.7258915408077014e-05, - "loss": 3.9742, - "mean_token_accuracy": 0.30806451588869094, - "step": 6570 - }, - { - "epoch": 0.31777461183686284, - "grad_norm": 2.3290882483172166, - "learning_rate": 1.7247309625896275e-05, - "loss": 4.0238, - "mean_token_accuracy": 0.3094758078455925, - "step": 6580 - }, - { - "epoch": 0.3182575519764325, - "grad_norm": 2.3401643027405843, - "learning_rate": 1.7235683246066004e-05, - "loss": 3.9742, - "mean_token_accuracy": 0.316431450843811, - "step": 6590 - }, - { - "epoch": 0.3187404921160022, - "grad_norm": 2.1333934219974733, - "learning_rate": 1.722403630162965e-05, - "loss": 3.9773, - "mean_token_accuracy": 0.31088709384202956, - "step": 6600 - }, - { - "epoch": 0.3187404921160022, - "eval_runtime": 7.778, - "eval_samples_per_second": 379.788, - "eval_steps_per_second": 23.785, - "step": 6600 - }, - { - "epoch": 0.3192234322555719, - "grad_norm": 2.2208459993722136, - "learning_rate": 1.7212368825689103e-05, - "loss": 3.9668, - "mean_token_accuracy": 0.31512096524238586, - "step": 6610 - }, - { - "epoch": 0.3197063723951416, - "grad_norm": 2.359267365961413, - "learning_rate": 1.7200680851404618e-05, - "loss": 3.9828, - "mean_token_accuracy": 0.3094758063554764, - "step": 6620 - }, - { - "epoch": 0.32018931253471133, - "grad_norm": 2.216233674902618, - "learning_rate": 1.7188972411994692e-05, - "loss": 3.9285, - "mean_token_accuracy": 0.3195564553141594, - "step": 6630 - }, - { - "epoch": 0.320672252674281, - "grad_norm": 2.502862582329525, - "learning_rate": 1.7177243540736e-05, - "loss": 4.0441, - "mean_token_accuracy": 0.3050403207540512, - "step": 6640 - }, - { - "epoch": 0.32115519281385074, - "grad_norm": 2.437597504964185, - "learning_rate": 1.7165494270963286e-05, - "loss": 4.0027, - "mean_token_accuracy": 0.3105846792459488, - "step": 6650 - }, - { - "epoch": 0.3216381329534204, - "grad_norm": 2.313393770355039, - "learning_rate": 1.7153724636069258e-05, - "loss": 3.9859, - "mean_token_accuracy": 0.3132056459784508, - "step": 6660 - }, - { - "epoch": 0.32212107309299015, - "grad_norm": 2.3105426977274743, - "learning_rate": 1.7141934669504514e-05, - "loss": 4.0621, - "mean_token_accuracy": 0.29979838579893114, - "step": 6670 - }, - { - "epoch": 0.3226040132325598, - "grad_norm": 2.3483809214252642, - "learning_rate": 1.7130124404777428e-05, - "loss": 3.9746, - "mean_token_accuracy": 0.3112903252243996, - "step": 6680 - }, - { - "epoch": 0.3230869533721295, - "grad_norm": 2.1739337127979077, - "learning_rate": 1.7118293875454077e-05, - "loss": 3.9113, - "mean_token_accuracy": 0.318044351041317, - "step": 6690 - }, - { - "epoch": 0.32356989351169924, - "grad_norm": 2.322400227042309, - "learning_rate": 1.7106443115158114e-05, - "loss": 4.0137, - "mean_token_accuracy": 0.31391129046678545, - "step": 6700 - }, - { - "epoch": 0.32356989351169924, - "eval_runtime": 7.7776, - "eval_samples_per_second": 379.81, - "eval_steps_per_second": 23.786, - "step": 6700 - }, - { - "epoch": 0.3240528336512689, - "grad_norm": 2.352950881936604, - "learning_rate": 1.709457215757071e-05, - "loss": 3.9652, - "mean_token_accuracy": 0.309879033267498, - "step": 6710 - }, - { - "epoch": 0.32453577379083864, - "grad_norm": 2.2879967792380693, - "learning_rate": 1.7082681036430426e-05, - "loss": 4.0363, - "mean_token_accuracy": 0.30816532373428346, - "step": 6720 - }, - { - "epoch": 0.3250187139304083, - "grad_norm": 2.3073019576661684, - "learning_rate": 1.707076978553313e-05, - "loss": 4.0613, - "mean_token_accuracy": 0.30947580486536025, - "step": 6730 - }, - { - "epoch": 0.32550165406997805, - "grad_norm": 2.355077349448005, - "learning_rate": 1.705883843873191e-05, - "loss": 3.9422, - "mean_token_accuracy": 0.3134072571992874, - "step": 6740 - }, - { - "epoch": 0.32598459420954773, - "grad_norm": 2.242615298248909, - "learning_rate": 1.7046887029936962e-05, - "loss": 3.9125, - "mean_token_accuracy": 0.31693548411130906, - "step": 6750 - }, - { - "epoch": 0.3264675343491174, - "grad_norm": 2.3218418604028583, - "learning_rate": 1.7034915593115502e-05, - "loss": 4.0098, - "mean_token_accuracy": 0.3086693540215492, - "step": 6760 - }, - { - "epoch": 0.32695047448868714, - "grad_norm": 2.2112851894813974, - "learning_rate": 1.7022924162291667e-05, - "loss": 3.9285, - "mean_token_accuracy": 0.3201612919569016, - "step": 6770 - }, - { - "epoch": 0.3274334146282568, - "grad_norm": 2.1722341674024768, - "learning_rate": 1.701091277154642e-05, - "loss": 3.9543, - "mean_token_accuracy": 0.323387099802494, - "step": 6780 - }, - { - "epoch": 0.32791635476782655, - "grad_norm": 2.3522008685694122, - "learning_rate": 1.6998881455017448e-05, - "loss": 3.9836, - "mean_token_accuracy": 0.30493951588869095, - "step": 6790 - }, - { - "epoch": 0.3283992949073962, - "grad_norm": 2.172941073489028, - "learning_rate": 1.698683024689908e-05, - "loss": 3.9684, - "mean_token_accuracy": 0.31602822840213773, - "step": 6800 - }, - { - "epoch": 0.3283992949073962, - "eval_runtime": 7.778, - "eval_samples_per_second": 379.789, - "eval_steps_per_second": 23.785, - "step": 6800 - }, - { - "epoch": 0.32888223504696595, - "grad_norm": 2.2967390741857394, - "learning_rate": 1.6974759181442165e-05, - "loss": 4.0223, - "mean_token_accuracy": 0.30917338877916334, - "step": 6810 - }, - { - "epoch": 0.32936517518653563, - "grad_norm": 2.3853410377504636, - "learning_rate": 1.6962668292954004e-05, - "loss": 4.0043, - "mean_token_accuracy": 0.30957661122083663, - "step": 6820 - }, - { - "epoch": 0.3298481153261053, - "grad_norm": 2.328304698510418, - "learning_rate": 1.695055761579823e-05, - "loss": 3.9078, - "mean_token_accuracy": 0.32137096524238584, - "step": 6830 - }, - { - "epoch": 0.33033105546567504, - "grad_norm": 2.4057333311056124, - "learning_rate": 1.693842718439471e-05, - "loss": 3.9516, - "mean_token_accuracy": 0.3177419379353523, - "step": 6840 - }, - { - "epoch": 0.3308139956052447, - "grad_norm": 2.336966053167253, - "learning_rate": 1.692627703321946e-05, - "loss": 4.0426, - "mean_token_accuracy": 0.3061491891741753, - "step": 6850 - }, - { - "epoch": 0.33129693574481445, - "grad_norm": 2.3512761115617726, - "learning_rate": 1.691410719680455e-05, - "loss": 3.932, - "mean_token_accuracy": 0.3092741951346397, - "step": 6860 - }, - { - "epoch": 0.3317798758843841, - "grad_norm": 2.2427326694619465, - "learning_rate": 1.6901917709737988e-05, - "loss": 4.0445, - "mean_token_accuracy": 0.3020161330699921, - "step": 6870 - }, - { - "epoch": 0.33226281602395386, - "grad_norm": 2.2925829819531813, - "learning_rate": 1.688970860666364e-05, - "loss": 4.0129, - "mean_token_accuracy": 0.3090725839138031, - "step": 6880 - }, - { - "epoch": 0.33274575616352353, - "grad_norm": 2.238877296471795, - "learning_rate": 1.687747992228111e-05, - "loss": 4.0363, - "mean_token_accuracy": 0.307762099802494, - "step": 6890 - }, - { - "epoch": 0.3332286963030932, - "grad_norm": 2.318578708273, - "learning_rate": 1.6865231691345663e-05, - "loss": 3.9574, - "mean_token_accuracy": 0.315625, - "step": 6900 - }, - { - "epoch": 0.3332286963030932, - "eval_runtime": 7.7861, - "eval_samples_per_second": 379.393, - "eval_steps_per_second": 23.76, - "step": 6900 - }, - { - "epoch": 0.33371163644266294, - "grad_norm": 2.4347999850906823, - "learning_rate": 1.6852963948668114e-05, - "loss": 3.9707, - "mean_token_accuracy": 0.31008064597845075, - "step": 6910 - }, - { - "epoch": 0.3341945765822326, - "grad_norm": 2.2178276285514262, - "learning_rate": 1.684067672911474e-05, - "loss": 3.9242, - "mean_token_accuracy": 0.31784274578094485, - "step": 6920 - }, - { - "epoch": 0.33467751672180235, - "grad_norm": 2.3206908076372996, - "learning_rate": 1.6828370067607166e-05, - "loss": 4.043, - "mean_token_accuracy": 0.3073588669300079, - "step": 6930 - }, - { - "epoch": 0.335160456861372, - "grad_norm": 2.2500304159130926, - "learning_rate": 1.681604399912227e-05, - "loss": 3.966, - "mean_token_accuracy": 0.3155241906642914, - "step": 6940 - }, - { - "epoch": 0.33564339700094176, - "grad_norm": 2.181974160572126, - "learning_rate": 1.68036985586921e-05, - "loss": 4.0199, - "mean_token_accuracy": 0.3102822571992874, - "step": 6950 - }, - { - "epoch": 0.33612633714051143, - "grad_norm": 2.283425489086256, - "learning_rate": 1.6791333781403747e-05, - "loss": 3.9734, - "mean_token_accuracy": 0.31844757944345475, - "step": 6960 - }, - { - "epoch": 0.3366092772800811, - "grad_norm": 2.2393146725876814, - "learning_rate": 1.6778949702399266e-05, - "loss": 3.9312, - "mean_token_accuracy": 0.3215725839138031, - "step": 6970 - }, - { - "epoch": 0.33709221741965084, - "grad_norm": 2.367171414398298, - "learning_rate": 1.676654635687557e-05, - "loss": 3.9805, - "mean_token_accuracy": 0.30957661420106886, - "step": 6980 - }, - { - "epoch": 0.3375751575592205, - "grad_norm": 2.257205199857503, - "learning_rate": 1.675412378008433e-05, - "loss": 3.9688, - "mean_token_accuracy": 0.3132056429982185, - "step": 6990 - }, - { - "epoch": 0.33805809769879025, - "grad_norm": 2.3866668524460413, - "learning_rate": 1.674168200733187e-05, - "loss": 4.0309, - "mean_token_accuracy": 0.3129032254219055, - "step": 7000 - }, - { - "epoch": 0.33805809769879025, - "eval_runtime": 7.776, - "eval_samples_per_second": 379.888, - "eval_steps_per_second": 23.791, - "step": 7000 - }, - { - "epoch": 0.3385410378383599, - "grad_norm": 2.3452923804444175, - "learning_rate": 1.6729221073979078e-05, - "loss": 3.9207, - "mean_token_accuracy": 0.31895161271095274, - "step": 7010 - }, - { - "epoch": 0.33902397797792966, - "grad_norm": 2.3079760616919582, - "learning_rate": 1.671674101544129e-05, - "loss": 4.0207, - "mean_token_accuracy": 0.3066532239317894, - "step": 7020 - }, - { - "epoch": 0.33950691811749933, - "grad_norm": 2.2099399443930845, - "learning_rate": 1.6704241867188202e-05, - "loss": 3.966, - "mean_token_accuracy": 0.31401209682226183, - "step": 7030 - }, - { - "epoch": 0.339989858257069, - "grad_norm": 2.347282765374838, - "learning_rate": 1.6691723664743774e-05, - "loss": 3.993, - "mean_token_accuracy": 0.3141129046678543, - "step": 7040 - }, - { - "epoch": 0.34047279839663874, - "grad_norm": 2.41141185648113, - "learning_rate": 1.66791864436861e-05, - "loss": 3.9246, - "mean_token_accuracy": 0.32147177308797836, - "step": 7050 - }, - { - "epoch": 0.3409557385362084, - "grad_norm": 2.3865850421326535, - "learning_rate": 1.6666630239647345e-05, - "loss": 3.8871, - "mean_token_accuracy": 0.3296370983123779, - "step": 7060 - }, - { - "epoch": 0.34143867867577815, - "grad_norm": 2.3534648212718663, - "learning_rate": 1.6654055088313614e-05, - "loss": 4.1, - "mean_token_accuracy": 0.301008066534996, - "step": 7070 - }, - { - "epoch": 0.3419216188153478, - "grad_norm": 2.2459606706905575, - "learning_rate": 1.6641461025424876e-05, - "loss": 3.9703, - "mean_token_accuracy": 0.31784274280071256, - "step": 7080 - }, - { - "epoch": 0.34240455895491756, - "grad_norm": 2.174141593439425, - "learning_rate": 1.6628848086774836e-05, - "loss": 3.9152, - "mean_token_accuracy": 0.31723790243268013, - "step": 7090 - }, - { - "epoch": 0.34288749909448724, - "grad_norm": 2.3933393971085413, - "learning_rate": 1.661621630821085e-05, - "loss": 3.9453, - "mean_token_accuracy": 0.31602822840213773, - "step": 7100 - }, - { - "epoch": 0.34288749909448724, - "eval_runtime": 7.8005, - "eval_samples_per_second": 378.693, - "eval_steps_per_second": 23.716, - "step": 7100 - }, - { - "epoch": 0.3433704392340569, - "grad_norm": 2.198487978497135, - "learning_rate": 1.6603565725633816e-05, - "loss": 3.9359, - "mean_token_accuracy": 0.3143145129084587, - "step": 7110 - }, - { - "epoch": 0.34385337937362664, - "grad_norm": 2.503913516569388, - "learning_rate": 1.6590896374998084e-05, - "loss": 4.0527, - "mean_token_accuracy": 0.3056451603770256, - "step": 7120 - }, - { - "epoch": 0.3443363195131963, - "grad_norm": 2.301727766189351, - "learning_rate": 1.6578208292311336e-05, - "loss": 3.9906, - "mean_token_accuracy": 0.30383064448833463, - "step": 7130 - }, - { - "epoch": 0.34481925965276605, - "grad_norm": 2.2129990274286757, - "learning_rate": 1.65655015136345e-05, - "loss": 3.9406, - "mean_token_accuracy": 0.3174395188689232, - "step": 7140 - }, - { - "epoch": 0.34530219979233573, - "grad_norm": 2.2052477974110762, - "learning_rate": 1.655277607508163e-05, - "loss": 3.9262, - "mean_token_accuracy": 0.31542338579893114, - "step": 7150 - }, - { - "epoch": 0.34578513993190546, - "grad_norm": 2.187364445349514, - "learning_rate": 1.6540032012819822e-05, - "loss": 3.9453, - "mean_token_accuracy": 0.32237903624773023, - "step": 7160 - }, - { - "epoch": 0.34626808007147514, - "grad_norm": 2.1950098819251203, - "learning_rate": 1.6527269363069104e-05, - "loss": 3.9715, - "mean_token_accuracy": 0.3137096777558327, - "step": 7170 - }, - { - "epoch": 0.3467510202110448, - "grad_norm": 2.216177434209642, - "learning_rate": 1.651448816210232e-05, - "loss": 3.9219, - "mean_token_accuracy": 0.31844757944345475, - "step": 7180 - }, - { - "epoch": 0.34723396035061455, - "grad_norm": 2.36810281830132, - "learning_rate": 1.6501688446245046e-05, - "loss": 3.9516, - "mean_token_accuracy": 0.318951615691185, - "step": 7190 - }, - { - "epoch": 0.3477169004901842, - "grad_norm": 2.3448721178663083, - "learning_rate": 1.6488870251875488e-05, - "loss": 3.9438, - "mean_token_accuracy": 0.3159274190664291, - "step": 7200 - }, - { - "epoch": 0.3477169004901842, - "eval_runtime": 7.7785, - "eval_samples_per_second": 379.766, - "eval_steps_per_second": 23.784, - "step": 7200 - }, - { - "epoch": 0.34819984062975395, - "grad_norm": 2.3616374782903953, - "learning_rate": 1.6476033615424358e-05, - "loss": 4.009, - "mean_token_accuracy": 0.3084677428007126, - "step": 7210 - }, - { - "epoch": 0.34868278076932363, - "grad_norm": 2.1638043387752206, - "learning_rate": 1.6463178573374784e-05, - "loss": 3.9508, - "mean_token_accuracy": 0.31542338728904723, - "step": 7220 - }, - { - "epoch": 0.34916572090889336, - "grad_norm": 2.2885376591805238, - "learning_rate": 1.6450305162262203e-05, - "loss": 3.9305, - "mean_token_accuracy": 0.3120967738330364, - "step": 7230 - }, - { - "epoch": 0.34964866104846304, - "grad_norm": 2.287172788102938, - "learning_rate": 1.6437413418674273e-05, - "loss": 4.0156, - "mean_token_accuracy": 0.31602822691202165, - "step": 7240 - }, - { - "epoch": 0.35013160118803277, - "grad_norm": 2.1779051064389123, - "learning_rate": 1.6424503379250735e-05, - "loss": 3.9602, - "mean_token_accuracy": 0.3115927428007126, - "step": 7250 - }, - { - "epoch": 0.35061454132760245, - "grad_norm": 2.0871353987714745, - "learning_rate": 1.6411575080683343e-05, - "loss": 4.0141, - "mean_token_accuracy": 0.30463709831237795, - "step": 7260 - }, - { - "epoch": 0.3510974814671721, - "grad_norm": 2.2193379463158, - "learning_rate": 1.639862855971574e-05, - "loss": 4.007, - "mean_token_accuracy": 0.30635080933570863, - "step": 7270 - }, - { - "epoch": 0.35158042160674186, - "grad_norm": 2.280612581787595, - "learning_rate": 1.6385663853143355e-05, - "loss": 4.0191, - "mean_token_accuracy": 0.3054435484111309, - "step": 7280 - }, - { - "epoch": 0.35206336174631153, - "grad_norm": 2.2214063243459554, - "learning_rate": 1.6372680997813315e-05, - "loss": 3.9973, - "mean_token_accuracy": 0.31179435551166534, - "step": 7290 - }, - { - "epoch": 0.35254630188588126, - "grad_norm": 2.296752881513919, - "learning_rate": 1.6359680030624318e-05, - "loss": 3.9945, - "mean_token_accuracy": 0.30907257944345473, - "step": 7300 - }, - { - "epoch": 0.35254630188588126, - "eval_runtime": 7.7721, - "eval_samples_per_second": 380.076, - "eval_steps_per_second": 23.803, - "step": 7300 - }, - { - "epoch": 0.35302924202545094, - "grad_norm": 2.2489755150132837, - "learning_rate": 1.634666098852654e-05, - "loss": 3.9039, - "mean_token_accuracy": 0.3200604826211929, - "step": 7310 - }, - { - "epoch": 0.35351218216502067, - "grad_norm": 2.1856828168168208, - "learning_rate": 1.633362390852152e-05, - "loss": 3.9832, - "mean_token_accuracy": 0.31602822840213773, - "step": 7320 - }, - { - "epoch": 0.35399512230459035, - "grad_norm": 2.3085684332824288, - "learning_rate": 1.6320568827662083e-05, - "loss": 3.9922, - "mean_token_accuracy": 0.3146169349551201, - "step": 7330 - }, - { - "epoch": 0.35447806244416, - "grad_norm": 2.26913729212531, - "learning_rate": 1.630749578305219e-05, - "loss": 4.0238, - "mean_token_accuracy": 0.30977822840213776, - "step": 7340 - }, - { - "epoch": 0.35496100258372976, - "grad_norm": 2.3402266186887606, - "learning_rate": 1.629440481184688e-05, - "loss": 3.9488, - "mean_token_accuracy": 0.3134072557091713, - "step": 7350 - }, - { - "epoch": 0.35544394272329943, - "grad_norm": 2.209580816580286, - "learning_rate": 1.6281295951252124e-05, - "loss": 3.8887, - "mean_token_accuracy": 0.3208669349551201, - "step": 7360 - }, - { - "epoch": 0.35592688286286916, - "grad_norm": 2.252813356788115, - "learning_rate": 1.6268169238524742e-05, - "loss": 3.9789, - "mean_token_accuracy": 0.31350806206464765, - "step": 7370 - }, - { - "epoch": 0.35640982300243884, - "grad_norm": 2.3898285632577076, - "learning_rate": 1.6255024710972295e-05, - "loss": 4.0363, - "mean_token_accuracy": 0.3119959682226181, - "step": 7380 - }, - { - "epoch": 0.3568927631420086, - "grad_norm": 2.0969880498936573, - "learning_rate": 1.624186240595297e-05, - "loss": 3.9418, - "mean_token_accuracy": 0.31703629195690153, - "step": 7390 - }, - { - "epoch": 0.35737570328157825, - "grad_norm": 2.223692148575789, - "learning_rate": 1.622868236087549e-05, - "loss": 3.9727, - "mean_token_accuracy": 0.30897177308797835, - "step": 7400 - }, - { - "epoch": 0.35737570328157825, - "eval_runtime": 7.806, - "eval_samples_per_second": 378.425, - "eval_steps_per_second": 23.7, - "step": 7400 - }, - { - "epoch": 0.3578586434211479, - "grad_norm": 2.325578031025663, - "learning_rate": 1.6215484613198982e-05, - "loss": 4.0258, - "mean_token_accuracy": 0.31209677308797834, - "step": 7410 - }, - { - "epoch": 0.35834158356071766, - "grad_norm": 2.476555569341456, - "learning_rate": 1.62022692004329e-05, - "loss": 4.0598, - "mean_token_accuracy": 0.30181451588869096, - "step": 7420 - }, - { - "epoch": 0.35882452370028733, - "grad_norm": 2.266881035928201, - "learning_rate": 1.618903616013689e-05, - "loss": 4.0301, - "mean_token_accuracy": 0.3078629031777382, - "step": 7430 - }, - { - "epoch": 0.35930746383985707, - "grad_norm": 2.2966846022527636, - "learning_rate": 1.6175785529920713e-05, - "loss": 3.9391, - "mean_token_accuracy": 0.32207661122083664, - "step": 7440 - }, - { - "epoch": 0.35979040397942674, - "grad_norm": 2.349804419385055, - "learning_rate": 1.6162517347444112e-05, - "loss": 3.9059, - "mean_token_accuracy": 0.3203629031777382, - "step": 7450 - }, - { - "epoch": 0.3602733441189965, - "grad_norm": 2.20560920505012, - "learning_rate": 1.6149231650416718e-05, - "loss": 4.0027, - "mean_token_accuracy": 0.31088710129261016, - "step": 7460 - }, - { - "epoch": 0.36075628425856615, - "grad_norm": 2.2392022644333203, - "learning_rate": 1.6135928476597937e-05, - "loss": 3.9527, - "mean_token_accuracy": 0.30846774131059645, - "step": 7470 - }, - { - "epoch": 0.3612392243981358, - "grad_norm": 2.1744927379007586, - "learning_rate": 1.6122607863796854e-05, - "loss": 3.968, - "mean_token_accuracy": 0.31794354915618894, - "step": 7480 - }, - { - "epoch": 0.36172216453770556, - "grad_norm": 2.1556922176031685, - "learning_rate": 1.6109269849872117e-05, - "loss": 3.9727, - "mean_token_accuracy": 0.31461693346500397, - "step": 7490 - }, - { - "epoch": 0.36220510467727524, - "grad_norm": 2.2754901303161197, - "learning_rate": 1.6095914472731813e-05, - "loss": 4.0094, - "mean_token_accuracy": 0.3083669364452362, - "step": 7500 - }, - { - "epoch": 0.36220510467727524, - "eval_runtime": 7.7782, - "eval_samples_per_second": 379.779, - "eval_steps_per_second": 23.784, - "step": 7500 - }, - { - "epoch": 0.36268804481684497, - "grad_norm": 2.3576399930936116, - "learning_rate": 1.60825417703334e-05, - "loss": 3.9918, - "mean_token_accuracy": 0.31270161718130113, - "step": 7510 - }, - { - "epoch": 0.36317098495641464, - "grad_norm": 2.161842792240655, - "learning_rate": 1.606915178068356e-05, - "loss": 4.0711, - "mean_token_accuracy": 0.3039314493536949, - "step": 7520 - }, - { - "epoch": 0.3636539250959844, - "grad_norm": 2.224554755731731, - "learning_rate": 1.605574454183812e-05, - "loss": 3.9766, - "mean_token_accuracy": 0.318245966732502, - "step": 7530 - }, - { - "epoch": 0.36413686523555405, - "grad_norm": 2.2959269342884854, - "learning_rate": 1.6042320091901918e-05, - "loss": 3.8844, - "mean_token_accuracy": 0.32368951886892317, - "step": 7540 - }, - { - "epoch": 0.36461980537512373, - "grad_norm": 2.2806347103495135, - "learning_rate": 1.602887846902872e-05, - "loss": 3.9969, - "mean_token_accuracy": 0.3185483917593956, - "step": 7550 - }, - { - "epoch": 0.36510274551469346, - "grad_norm": 2.2406801044540394, - "learning_rate": 1.6015419711421093e-05, - "loss": 3.9875, - "mean_token_accuracy": 0.3098790317773819, - "step": 7560 - }, - { - "epoch": 0.36558568565426314, - "grad_norm": 2.3253377666358217, - "learning_rate": 1.60019438573303e-05, - "loss": 3.9105, - "mean_token_accuracy": 0.3145161300897598, - "step": 7570 - }, - { - "epoch": 0.36606862579383287, - "grad_norm": 2.1932508333597203, - "learning_rate": 1.5988450945056202e-05, - "loss": 4.0008, - "mean_token_accuracy": 0.3133064493536949, - "step": 7580 - }, - { - "epoch": 0.36655156593340255, - "grad_norm": 2.206922499068559, - "learning_rate": 1.5974941012947133e-05, - "loss": 3.975, - "mean_token_accuracy": 0.31149193495512006, - "step": 7590 - }, - { - "epoch": 0.3670345060729723, - "grad_norm": 2.2426919737010262, - "learning_rate": 1.5961414099399802e-05, - "loss": 3.984, - "mean_token_accuracy": 0.31189516335725787, - "step": 7600 - }, - { - "epoch": 0.3670345060729723, - "eval_runtime": 7.7741, - "eval_samples_per_second": 379.98, - "eval_steps_per_second": 23.797, - "step": 7600 - }, - { - "epoch": 0.36751744621254195, - "grad_norm": 2.225392847432725, - "learning_rate": 1.5947870242859188e-05, - "loss": 3.9598, - "mean_token_accuracy": 0.31330645382404326, - "step": 7610 - }, - { - "epoch": 0.36800038635211163, - "grad_norm": 2.3060781296421933, - "learning_rate": 1.5934309481818414e-05, - "loss": 3.9707, - "mean_token_accuracy": 0.31542338877916337, - "step": 7620 - }, - { - "epoch": 0.36848332649168136, - "grad_norm": 2.3585163707006824, - "learning_rate": 1.592073185481865e-05, - "loss": 3.9453, - "mean_token_accuracy": 0.315826615691185, - "step": 7630 - }, - { - "epoch": 0.36896626663125104, - "grad_norm": 2.2081498545111007, - "learning_rate": 1.590713740044901e-05, - "loss": 3.9375, - "mean_token_accuracy": 0.315625, - "step": 7640 - }, - { - "epoch": 0.36944920677082077, - "grad_norm": 2.347466197507941, - "learning_rate": 1.5893526157346416e-05, - "loss": 3.9582, - "mean_token_accuracy": 0.3138104811310768, - "step": 7650 - }, - { - "epoch": 0.36993214691039045, - "grad_norm": 2.328017784756564, - "learning_rate": 1.587989816419552e-05, - "loss": 4.0203, - "mean_token_accuracy": 0.3035282284021378, - "step": 7660 - }, - { - "epoch": 0.3704150870499602, - "grad_norm": 2.2293863274912398, - "learning_rate": 1.5866253459728574e-05, - "loss": 4.0051, - "mean_token_accuracy": 0.31129032373428345, - "step": 7670 - }, - { - "epoch": 0.37089802718952986, - "grad_norm": 2.384547800693373, - "learning_rate": 1.585259208272533e-05, - "loss": 4.0281, - "mean_token_accuracy": 0.30483870953321457, - "step": 7680 - }, - { - "epoch": 0.37138096732909953, - "grad_norm": 2.4612956238927244, - "learning_rate": 1.583891407201291e-05, - "loss": 3.9922, - "mean_token_accuracy": 0.31415562331676483, - "step": 7690 - }, - { - "epoch": 0.37186390746866926, - "grad_norm": 2.327182082327985, - "learning_rate": 1.5825219466465734e-05, - "loss": 4.0098, - "mean_token_accuracy": 0.30957661420106886, - "step": 7700 - }, - { - "epoch": 0.37186390746866926, - "eval_runtime": 7.7596, - "eval_samples_per_second": 380.691, - "eval_steps_per_second": 23.842, - "step": 7700 - }, - { - "epoch": 0.37234684760823894, - "grad_norm": 2.4641206129952424, - "learning_rate": 1.581150830500537e-05, - "loss": 4.0785, - "mean_token_accuracy": 0.30372984111309054, - "step": 7710 - }, - { - "epoch": 0.37282978774780867, - "grad_norm": 2.1559255914426236, - "learning_rate": 1.5797780626600444e-05, - "loss": 3.9988, - "mean_token_accuracy": 0.31592742055654527, - "step": 7720 - }, - { - "epoch": 0.37331272788737835, - "grad_norm": 2.2097018480771355, - "learning_rate": 1.5784036470266524e-05, - "loss": 3.9465, - "mean_token_accuracy": 0.3174395188689232, - "step": 7730 - }, - { - "epoch": 0.3737956680269481, - "grad_norm": 2.1729835502772623, - "learning_rate": 1.577027587506601e-05, - "loss": 3.8246, - "mean_token_accuracy": 0.3202620968222618, - "step": 7740 - }, - { - "epoch": 0.37427860816651776, - "grad_norm": 2.1650976320133632, - "learning_rate": 1.5756498880108027e-05, - "loss": 3.932, - "mean_token_accuracy": 0.32207661420106887, - "step": 7750 - }, - { - "epoch": 0.37476154830608743, - "grad_norm": 2.271808348234846, - "learning_rate": 1.57427055245483e-05, - "loss": 3.9785, - "mean_token_accuracy": 0.31532258093357085, - "step": 7760 - }, - { - "epoch": 0.37524448844565716, - "grad_norm": 2.4107833507934298, - "learning_rate": 1.5728895847589073e-05, - "loss": 4.0227, - "mean_token_accuracy": 0.304217104613781, - "step": 7770 - }, - { - "epoch": 0.37572742858522684, - "grad_norm": 2.235251553932183, - "learning_rate": 1.571506988847895e-05, - "loss": 4.0187, - "mean_token_accuracy": 0.30423386991024015, - "step": 7780 - }, - { - "epoch": 0.3762103687247966, - "grad_norm": 2.4843361279684637, - "learning_rate": 1.5701227686512836e-05, - "loss": 3.9539, - "mean_token_accuracy": 0.3168346777558327, - "step": 7790 - }, - { - "epoch": 0.37669330886436625, - "grad_norm": 2.191000039618457, - "learning_rate": 1.568736928103178e-05, - "loss": 4.0094, - "mean_token_accuracy": 0.31491935551166533, - "step": 7800 - }, - { - "epoch": 0.37669330886436625, - "eval_runtime": 7.7585, - "eval_samples_per_second": 380.745, - "eval_steps_per_second": 23.845, - "step": 7800 - }, - { - "epoch": 0.377176249003936, - "grad_norm": 2.1437180306225576, - "learning_rate": 1.56734947114229e-05, - "loss": 3.9578, - "mean_token_accuracy": 0.31068548262119294, - "step": 7810 - }, - { - "epoch": 0.37765918914350566, - "grad_norm": 2.381090588792243, - "learning_rate": 1.5659604017119233e-05, - "loss": 4.0047, - "mean_token_accuracy": 0.31320564448833466, - "step": 7820 - }, - { - "epoch": 0.3781421292830754, - "grad_norm": 2.3369925433015073, - "learning_rate": 1.564569723759967e-05, - "loss": 3.9406, - "mean_token_accuracy": 0.31431451588869097, - "step": 7830 - }, - { - "epoch": 0.37862506942264507, - "grad_norm": 2.2286225438782328, - "learning_rate": 1.56317744123888e-05, - "loss": 3.9727, - "mean_token_accuracy": 0.3090725809335709, - "step": 7840 - }, - { - "epoch": 0.37910800956221474, - "grad_norm": 2.196373245240866, - "learning_rate": 1.561783558105682e-05, - "loss": 3.909, - "mean_token_accuracy": 0.3228830635547638, - "step": 7850 - }, - { - "epoch": 0.3795909497017845, - "grad_norm": 2.2697919408351357, - "learning_rate": 1.560388078321942e-05, - "loss": 3.9746, - "mean_token_accuracy": 0.31582661122083666, - "step": 7860 - }, - { - "epoch": 0.38007388984135415, - "grad_norm": 2.184099157719294, - "learning_rate": 1.5589910058537666e-05, - "loss": 3.977, - "mean_token_accuracy": 0.3138373658061028, - "step": 7870 - }, - { - "epoch": 0.3805568299809239, - "grad_norm": 2.2440503338997018, - "learning_rate": 1.5575923446717893e-05, - "loss": 3.9629, - "mean_token_accuracy": 0.3138104841113091, - "step": 7880 - }, - { - "epoch": 0.38103977012049356, - "grad_norm": 2.5432355518332384, - "learning_rate": 1.556192098751158e-05, - "loss": 3.9684, - "mean_token_accuracy": 0.3126008093357086, - "step": 7890 - }, - { - "epoch": 0.3815227102600633, - "grad_norm": 2.280111413169912, - "learning_rate": 1.5547902720715265e-05, - "loss": 3.9809, - "mean_token_accuracy": 0.3133064523339272, - "step": 7900 - }, - { - "epoch": 0.3815227102600633, - "eval_runtime": 7.762, - "eval_samples_per_second": 380.572, - "eval_steps_per_second": 23.834, - "step": 7900 - }, - { - "epoch": 0.38200565039963297, - "grad_norm": 2.177443611410207, - "learning_rate": 1.5533868686170396e-05, - "loss": 4.0441, - "mean_token_accuracy": 0.3105846792459488, - "step": 7910 - }, - { - "epoch": 0.38248859053920264, - "grad_norm": 2.2264488495344397, - "learning_rate": 1.5519818923763235e-05, - "loss": 4.0105, - "mean_token_accuracy": 0.30645161122083664, - "step": 7920 - }, - { - "epoch": 0.3829715306787724, - "grad_norm": 2.4923483307127805, - "learning_rate": 1.5505753473424757e-05, - "loss": 3.957, - "mean_token_accuracy": 0.316431450843811, - "step": 7930 - }, - { - "epoch": 0.38345447081834205, - "grad_norm": 2.3275734075195946, - "learning_rate": 1.549167237513051e-05, - "loss": 3.9637, - "mean_token_accuracy": 0.3182459682226181, - "step": 7940 - }, - { - "epoch": 0.3839374109579118, - "grad_norm": 2.3410843465541618, - "learning_rate": 1.5477575668900523e-05, - "loss": 3.9215, - "mean_token_accuracy": 0.3196572601795197, - "step": 7950 - }, - { - "epoch": 0.38442035109748146, - "grad_norm": 2.2839220796747415, - "learning_rate": 1.5463463394799184e-05, - "loss": 3.9203, - "mean_token_accuracy": 0.3147177428007126, - "step": 7960 - }, - { - "epoch": 0.3849032912370512, - "grad_norm": 2.2019522922585764, - "learning_rate": 1.5449335592935125e-05, - "loss": 4.0023, - "mean_token_accuracy": 0.3155241936445236, - "step": 7970 - }, - { - "epoch": 0.38538623137662087, - "grad_norm": 2.3933252938518867, - "learning_rate": 1.543519230346111e-05, - "loss": 4.0477, - "mean_token_accuracy": 0.3072580635547638, - "step": 7980 - }, - { - "epoch": 0.38586917151619055, - "grad_norm": 2.2673013759721576, - "learning_rate": 1.542103356657391e-05, - "loss": 3.9945, - "mean_token_accuracy": 0.315625, - "step": 7990 - }, - { - "epoch": 0.3863521116557603, - "grad_norm": 2.245016953142722, - "learning_rate": 1.540685942251423e-05, - "loss": 3.9141, - "mean_token_accuracy": 0.3180443555116653, - "step": 8000 - }, - { - "epoch": 0.3863521116557603, - "eval_runtime": 7.7913, - "eval_samples_per_second": 379.142, - "eval_steps_per_second": 23.745, - "step": 8000 - }, - { - "epoch": 0.38683505179532995, - "grad_norm": 2.3171600703872017, - "learning_rate": 1.5392669911566525e-05, - "loss": 3.9766, - "mean_token_accuracy": 0.31723790168762206, - "step": 8010 - }, - { - "epoch": 0.3873179919348997, - "grad_norm": 2.2051650337965265, - "learning_rate": 1.5378465074058953e-05, - "loss": 3.9844, - "mean_token_accuracy": 0.32086693644523623, - "step": 8020 - }, - { - "epoch": 0.38780093207446936, - "grad_norm": 2.317902630186066, - "learning_rate": 1.5364244950363216e-05, - "loss": 3.9902, - "mean_token_accuracy": 0.31663306057453156, - "step": 8030 - }, - { - "epoch": 0.3882838722140391, - "grad_norm": 2.302866413568975, - "learning_rate": 1.535000958089447e-05, - "loss": 3.9898, - "mean_token_accuracy": 0.3165322616696358, - "step": 8040 - }, - { - "epoch": 0.38876681235360877, - "grad_norm": 2.2651441307335185, - "learning_rate": 1.5335759006111195e-05, - "loss": 3.9566, - "mean_token_accuracy": 0.3184475839138031, - "step": 8050 - }, - { - "epoch": 0.38924975249317845, - "grad_norm": 2.2371477945459577, - "learning_rate": 1.532149326651509e-05, - "loss": 4.0086, - "mean_token_accuracy": 0.30977822542190553, - "step": 8060 - }, - { - "epoch": 0.3897326926327482, - "grad_norm": 2.267391115586042, - "learning_rate": 1.5307212402650956e-05, - "loss": 3.973, - "mean_token_accuracy": 0.3146169349551201, - "step": 8070 - }, - { - "epoch": 0.39021563277231786, - "grad_norm": 2.221125299434015, - "learning_rate": 1.5292916455106572e-05, - "loss": 4.0359, - "mean_token_accuracy": 0.3105846747756004, - "step": 8080 - }, - { - "epoch": 0.3906985729118876, - "grad_norm": 2.2532059615380153, - "learning_rate": 1.527860546451259e-05, - "loss": 4.0367, - "mean_token_accuracy": 0.30625, - "step": 8090 - }, - { - "epoch": 0.39118151305145726, - "grad_norm": 2.249756955836091, - "learning_rate": 1.526427947154242e-05, - "loss": 3.9863, - "mean_token_accuracy": 0.31249999850988386, - "step": 8100 - }, - { - "epoch": 0.39118151305145726, - "eval_runtime": 7.7701, - "eval_samples_per_second": 380.175, - "eval_steps_per_second": 23.809, - "step": 8100 - }, - { - "epoch": 0.391664453191027, - "grad_norm": 2.3681585876910405, - "learning_rate": 1.5249938516912104e-05, - "loss": 3.9473, - "mean_token_accuracy": 0.319556450843811, - "step": 8110 - }, - { - "epoch": 0.39214739333059667, - "grad_norm": 2.273513562099052, - "learning_rate": 1.5235582641380208e-05, - "loss": 3.9297, - "mean_token_accuracy": 0.3137096747756004, - "step": 8120 - }, - { - "epoch": 0.39263033347016635, - "grad_norm": 2.434611549389301, - "learning_rate": 1.522121188574771e-05, - "loss": 3.9719, - "mean_token_accuracy": 0.31532258093357085, - "step": 8130 - }, - { - "epoch": 0.3931132736097361, - "grad_norm": 2.2711279245845137, - "learning_rate": 1.520682629085787e-05, - "loss": 3.9609, - "mean_token_accuracy": 0.31683467626571654, - "step": 8140 - }, - { - "epoch": 0.39359621374930576, - "grad_norm": 2.43464539956847, - "learning_rate": 1.5192425897596134e-05, - "loss": 3.8906, - "mean_token_accuracy": 0.32499999850988387, - "step": 8150 - }, - { - "epoch": 0.3940791538888755, - "grad_norm": 2.354420987733528, - "learning_rate": 1.5178010746889995e-05, - "loss": 3.9301, - "mean_token_accuracy": 0.3187499985098839, - "step": 8160 - }, - { - "epoch": 0.39456209402844516, - "grad_norm": 2.2094552811465746, - "learning_rate": 1.5163580879708897e-05, - "loss": 3.9504, - "mean_token_accuracy": 0.3152217760682106, - "step": 8170 - }, - { - "epoch": 0.3950450341680149, - "grad_norm": 2.2521642433158, - "learning_rate": 1.5149136337064105e-05, - "loss": 3.9172, - "mean_token_accuracy": 0.3178427413105965, - "step": 8180 - }, - { - "epoch": 0.3955279743075846, - "grad_norm": 2.3528903523414084, - "learning_rate": 1.5134677160008594e-05, - "loss": 3.927, - "mean_token_accuracy": 0.3142137140035629, - "step": 8190 - }, - { - "epoch": 0.39601091444715425, - "grad_norm": 2.3613415375312554, - "learning_rate": 1.5120203389636937e-05, - "loss": 3.9484, - "mean_token_accuracy": 0.3144153207540512, - "step": 8200 - }, - { - "epoch": 0.39601091444715425, - "eval_runtime": 7.7794, - "eval_samples_per_second": 379.722, - "eval_steps_per_second": 23.781, - "step": 8200 - }, - { - "epoch": 0.396493854586724, - "grad_norm": 2.3690149146054877, - "learning_rate": 1.5105715067085173e-05, - "loss": 3.9527, - "mean_token_accuracy": 0.3152217760682106, - "step": 8210 - }, - { - "epoch": 0.39697679472629366, - "grad_norm": 2.2257570825117794, - "learning_rate": 1.5091212233530707e-05, - "loss": 3.9906, - "mean_token_accuracy": 0.3203629061579704, - "step": 8220 - }, - { - "epoch": 0.3974597348658634, - "grad_norm": 2.4680611405500144, - "learning_rate": 1.5076694930192187e-05, - "loss": 4.0457, - "mean_token_accuracy": 0.305443549156189, - "step": 8230 - }, - { - "epoch": 0.39794267500543307, - "grad_norm": 2.644212903219562, - "learning_rate": 1.5062163198329376e-05, - "loss": 3.9098, - "mean_token_accuracy": 0.32157257944345474, - "step": 8240 - }, - { - "epoch": 0.3984256151450028, - "grad_norm": 2.1640338436490767, - "learning_rate": 1.5047617079243057e-05, - "loss": 3.9254, - "mean_token_accuracy": 0.3203629061579704, - "step": 8250 - }, - { - "epoch": 0.3989085552845725, - "grad_norm": 2.356569793571822, - "learning_rate": 1.5033056614274898e-05, - "loss": 3.9813, - "mean_token_accuracy": 0.31401209980249406, - "step": 8260 - }, - { - "epoch": 0.39939149542414215, - "grad_norm": 2.201265363950054, - "learning_rate": 1.5018481844807333e-05, - "loss": 3.9473, - "mean_token_accuracy": 0.3187500014901161, - "step": 8270 - }, - { - "epoch": 0.3998744355637119, - "grad_norm": 2.3178787761333455, - "learning_rate": 1.5003892812263461e-05, - "loss": 3.8969, - "mean_token_accuracy": 0.31955645233392715, - "step": 8280 - }, - { - "epoch": 0.40035737570328156, - "grad_norm": 2.3310885134933015, - "learning_rate": 1.4989289558106913e-05, - "loss": 3.9312, - "mean_token_accuracy": 0.3175403207540512, - "step": 8290 - }, - { - "epoch": 0.4008403158428513, - "grad_norm": 2.3309224168306426, - "learning_rate": 1.4974672123841739e-05, - "loss": 3.9328, - "mean_token_accuracy": 0.3181451618671417, - "step": 8300 - }, - { - "epoch": 0.4008403158428513, - "eval_runtime": 7.7759, - "eval_samples_per_second": 379.893, - "eval_steps_per_second": 23.792, - "step": 8300 - }, - { - "epoch": 0.40132325598242097, - "grad_norm": 2.253461363076083, - "learning_rate": 1.4960040551012293e-05, - "loss": 3.9582, - "mean_token_accuracy": 0.31522177159786224, - "step": 8310 - }, - { - "epoch": 0.4018061961219907, - "grad_norm": 2.182744451847417, - "learning_rate": 1.4945394881203115e-05, - "loss": 3.9953, - "mean_token_accuracy": 0.30826613008975984, - "step": 8320 - }, - { - "epoch": 0.4022891362615604, - "grad_norm": 2.2082836776661137, - "learning_rate": 1.4930735156038798e-05, - "loss": 3.9375, - "mean_token_accuracy": 0.3208669349551201, - "step": 8330 - }, - { - "epoch": 0.40277207640113005, - "grad_norm": 2.152139964086219, - "learning_rate": 1.4916061417183899e-05, - "loss": 3.9145, - "mean_token_accuracy": 0.3189400926232338, - "step": 8340 - }, - { - "epoch": 0.4032550165406998, - "grad_norm": 2.322393374678207, - "learning_rate": 1.4901373706342788e-05, - "loss": 4.0086, - "mean_token_accuracy": 0.3099798396229744, - "step": 8350 - }, - { - "epoch": 0.40373795668026946, - "grad_norm": 2.2454702269187448, - "learning_rate": 1.4886672065259553e-05, - "loss": 3.9633, - "mean_token_accuracy": 0.31058468073606493, - "step": 8360 - }, - { - "epoch": 0.4042208968198392, - "grad_norm": 2.2150750047492487, - "learning_rate": 1.4871956535717875e-05, - "loss": 3.966, - "mean_token_accuracy": 0.31350806504487994, - "step": 8370 - }, - { - "epoch": 0.40470383695940887, - "grad_norm": 2.3506011177671615, - "learning_rate": 1.4857227159540901e-05, - "loss": 3.9621, - "mean_token_accuracy": 0.3196572601795197, - "step": 8380 - }, - { - "epoch": 0.4051867770989786, - "grad_norm": 2.2359460578761396, - "learning_rate": 1.484248397859114e-05, - "loss": 4.009, - "mean_token_accuracy": 0.3127016112208366, - "step": 8390 - }, - { - "epoch": 0.4056697172385483, - "grad_norm": 2.2535177698933393, - "learning_rate": 1.4827727034770326e-05, - "loss": 3.95, - "mean_token_accuracy": 0.3102822571992874, - "step": 8400 - }, - { - "epoch": 0.4056697172385483, - "eval_runtime": 7.7816, - "eval_samples_per_second": 379.612, - "eval_steps_per_second": 23.774, - "step": 8400 - }, - { - "epoch": 0.406152657378118, - "grad_norm": 2.4127913079682517, - "learning_rate": 1.4812956370019311e-05, - "loss": 4.0125, - "mean_token_accuracy": 0.30453629046678543, - "step": 8410 - }, - { - "epoch": 0.4066355975176877, - "grad_norm": 2.173502998824011, - "learning_rate": 1.4798172026317949e-05, - "loss": 3.9602, - "mean_token_accuracy": 0.31548836827278137, - "step": 8420 - }, - { - "epoch": 0.40711853765725736, - "grad_norm": 2.1571546369952515, - "learning_rate": 1.4783374045684971e-05, - "loss": 4.0199, - "mean_token_accuracy": 0.30756048262119295, - "step": 8430 - }, - { - "epoch": 0.4076014777968271, - "grad_norm": 2.2908423105756834, - "learning_rate": 1.4768562470177856e-05, - "loss": 3.9863, - "mean_token_accuracy": 0.30463709533214567, - "step": 8440 - }, - { - "epoch": 0.40808441793639677, - "grad_norm": 2.303398828783417, - "learning_rate": 1.4753737341892733e-05, - "loss": 3.9504, - "mean_token_accuracy": 0.3122983857989311, - "step": 8450 - }, - { - "epoch": 0.4085673580759665, - "grad_norm": 2.1881230355524606, - "learning_rate": 1.4738898702964239e-05, - "loss": 3.932, - "mean_token_accuracy": 0.31683467477560046, - "step": 8460 - }, - { - "epoch": 0.4090502982155362, - "grad_norm": 2.098728300090589, - "learning_rate": 1.472404659556542e-05, - "loss": 3.9145, - "mean_token_accuracy": 0.32026209831237795, - "step": 8470 - }, - { - "epoch": 0.4095332383551059, - "grad_norm": 2.2391037790745467, - "learning_rate": 1.4709181061907591e-05, - "loss": 3.9684, - "mean_token_accuracy": 0.31653225868940355, - "step": 8480 - }, - { - "epoch": 0.4100161784946756, - "grad_norm": 2.4090827424703534, - "learning_rate": 1.4694302144240234e-05, - "loss": 3.991, - "mean_token_accuracy": 0.3087701603770256, - "step": 8490 - }, - { - "epoch": 0.41049911863424526, - "grad_norm": 2.427448304656656, - "learning_rate": 1.4679409884850866e-05, - "loss": 4.0211, - "mean_token_accuracy": 0.3073588699102402, - "step": 8500 - }, - { - "epoch": 0.41049911863424526, - "eval_runtime": 7.77, - "eval_samples_per_second": 380.179, - "eval_steps_per_second": 23.809, - "step": 8500 - }, - { - "epoch": 0.410982058773815, - "grad_norm": 2.2633867160742454, - "learning_rate": 1.4664504326064919e-05, - "loss": 3.9004, - "mean_token_accuracy": 0.3173387095332146, - "step": 8510 - }, - { - "epoch": 0.41146499891338467, - "grad_norm": 2.3383005703347934, - "learning_rate": 1.4649585510245632e-05, - "loss": 3.9965, - "mean_token_accuracy": 0.31421370804309845, - "step": 8520 - }, - { - "epoch": 0.4119479390529544, - "grad_norm": 2.1632549205395146, - "learning_rate": 1.4634653479793917e-05, - "loss": 4.0121, - "mean_token_accuracy": 0.3075604856014252, - "step": 8530 - }, - { - "epoch": 0.4124308791925241, - "grad_norm": 2.3189173200001245, - "learning_rate": 1.4619708277148242e-05, - "loss": 3.9887, - "mean_token_accuracy": 0.31391129046678545, - "step": 8540 - }, - { - "epoch": 0.4129138193320938, - "grad_norm": 2.1664619049782603, - "learning_rate": 1.460474994478451e-05, - "loss": 3.9121, - "mean_token_accuracy": 0.31552419662475584, - "step": 8550 - }, - { - "epoch": 0.4133967594716635, - "grad_norm": 2.1446203636972916, - "learning_rate": 1.4589778525215952e-05, - "loss": 3.9684, - "mean_token_accuracy": 0.31602822691202165, - "step": 8560 - }, - { - "epoch": 0.41387969961123316, - "grad_norm": 2.234965721040499, - "learning_rate": 1.4574794060992976e-05, - "loss": 4.007, - "mean_token_accuracy": 0.3116935506463051, - "step": 8570 - }, - { - "epoch": 0.4143626397508029, - "grad_norm": 2.3350292840778666, - "learning_rate": 1.4559796594703084e-05, - "loss": 3.9723, - "mean_token_accuracy": 0.319254033267498, - "step": 8580 - }, - { - "epoch": 0.4148455798903726, - "grad_norm": 2.2834840773216483, - "learning_rate": 1.4544786168970715e-05, - "loss": 4.0074, - "mean_token_accuracy": 0.3156250029802322, - "step": 8590 - }, - { - "epoch": 0.4153285200299423, - "grad_norm": 2.522144799497461, - "learning_rate": 1.452976282645715e-05, - "loss": 3.8715, - "mean_token_accuracy": 0.327116933465004, - "step": 8600 - }, - { - "epoch": 0.4153285200299423, - "eval_runtime": 7.7737, - "eval_samples_per_second": 379.998, - "eval_steps_per_second": 23.798, - "step": 8600 - }, - { - "epoch": 0.415811460169512, - "grad_norm": 2.26372699187666, - "learning_rate": 1.4514726609860374e-05, - "loss": 3.9582, - "mean_token_accuracy": 0.3166330620646477, - "step": 8610 - }, - { - "epoch": 0.4162944003090817, - "grad_norm": 2.359948200481088, - "learning_rate": 1.4499677561914969e-05, - "loss": 3.9809, - "mean_token_accuracy": 0.31653225868940355, - "step": 8620 - }, - { - "epoch": 0.4167773404486514, - "grad_norm": 2.2059864287873068, - "learning_rate": 1.448461572539198e-05, - "loss": 3.9836, - "mean_token_accuracy": 0.3127016142010689, - "step": 8630 - }, - { - "epoch": 0.41726028058822107, - "grad_norm": 2.2288805568568475, - "learning_rate": 1.44695411430988e-05, - "loss": 3.8973, - "mean_token_accuracy": 0.3197580650448799, - "step": 8640 - }, - { - "epoch": 0.4177432207277908, - "grad_norm": 2.4082297684586793, - "learning_rate": 1.4454453857879047e-05, - "loss": 4.0121, - "mean_token_accuracy": 0.3090725764632225, - "step": 8650 - }, - { - "epoch": 0.4182261608673605, - "grad_norm": 2.2782643806095133, - "learning_rate": 1.4439353912612441e-05, - "loss": 3.882, - "mean_token_accuracy": 0.3180443555116653, - "step": 8660 - }, - { - "epoch": 0.4187091010069302, - "grad_norm": 2.273361530716404, - "learning_rate": 1.4424241350214683e-05, - "loss": 3.9695, - "mean_token_accuracy": 0.3146169364452362, - "step": 8670 - }, - { - "epoch": 0.4191920411464999, - "grad_norm": 2.1614064668476947, - "learning_rate": 1.4409116213637335e-05, - "loss": 3.9875, - "mean_token_accuracy": 0.31622984260320663, - "step": 8680 - }, - { - "epoch": 0.4196749812860696, - "grad_norm": 2.1749090104092113, - "learning_rate": 1.4393978545867699e-05, - "loss": 3.95, - "mean_token_accuracy": 0.3173387095332146, - "step": 8690 - }, - { - "epoch": 0.4201579214256393, - "grad_norm": 2.1901106971585516, - "learning_rate": 1.4378828389928683e-05, - "loss": 3.9254, - "mean_token_accuracy": 0.3138104841113091, - "step": 8700 - }, - { - "epoch": 0.4201579214256393, - "eval_runtime": 7.7431, - "eval_samples_per_second": 381.5, - "eval_steps_per_second": 23.892, - "step": 8700 - }, - { - "epoch": 0.42064086156520897, - "grad_norm": 2.3886563997614583, - "learning_rate": 1.4363665788878698e-05, - "loss": 3.9629, - "mean_token_accuracy": 0.31784274280071256, - "step": 8710 - }, - { - "epoch": 0.4211238017047787, - "grad_norm": 2.187173443011444, - "learning_rate": 1.4348490785811516e-05, - "loss": 3.957, - "mean_token_accuracy": 0.3177419379353523, - "step": 8720 - }, - { - "epoch": 0.4216067418443484, - "grad_norm": 2.336368136735689, - "learning_rate": 1.4333303423856161e-05, - "loss": 3.9625, - "mean_token_accuracy": 0.312802417576313, - "step": 8730 - }, - { - "epoch": 0.4220896819839181, - "grad_norm": 2.1307192899137193, - "learning_rate": 1.4318103746176793e-05, - "loss": 3.9059, - "mean_token_accuracy": 0.32520160973072054, - "step": 8740 - }, - { - "epoch": 0.4225726221234878, - "grad_norm": 2.2181936867558063, - "learning_rate": 1.4302891795972553e-05, - "loss": 3.9238, - "mean_token_accuracy": 0.3177419379353523, - "step": 8750 - }, - { - "epoch": 0.4230555622630575, - "grad_norm": 2.3146108820988402, - "learning_rate": 1.428766761647748e-05, - "loss": 4.0324, - "mean_token_accuracy": 0.30786290392279625, - "step": 8760 - }, - { - "epoch": 0.4235385024026272, - "grad_norm": 2.107398900918669, - "learning_rate": 1.4272431250960362e-05, - "loss": 3.9082, - "mean_token_accuracy": 0.3193548396229744, - "step": 8770 - }, - { - "epoch": 0.42402144254219687, - "grad_norm": 2.2395181923870235, - "learning_rate": 1.425718274272462e-05, - "loss": 3.9879, - "mean_token_accuracy": 0.3102822571992874, - "step": 8780 - }, - { - "epoch": 0.4245043826817666, - "grad_norm": 2.321570759451804, - "learning_rate": 1.4241922135108188e-05, - "loss": 3.9383, - "mean_token_accuracy": 0.31693548411130906, - "step": 8790 - }, - { - "epoch": 0.4249873228213363, - "grad_norm": 2.3972951483660623, - "learning_rate": 1.4226649471483396e-05, - "loss": 3.959, - "mean_token_accuracy": 0.3133064493536949, - "step": 8800 - }, - { - "epoch": 0.4249873228213363, - "eval_runtime": 7.7675, - "eval_samples_per_second": 380.301, - "eval_steps_per_second": 23.817, - "step": 8800 - }, - { - "epoch": 0.425470262960906, - "grad_norm": 2.2281928237376114, - "learning_rate": 1.4211364795256819e-05, - "loss": 3.9672, - "mean_token_accuracy": 0.31804435700178146, - "step": 8810 - }, - { - "epoch": 0.4259532031004757, - "grad_norm": 2.330225329925326, - "learning_rate": 1.4196068149869194e-05, - "loss": 3.9516, - "mean_token_accuracy": 0.32036290466785433, - "step": 8820 - }, - { - "epoch": 0.4264361432400454, - "grad_norm": 2.213484525240926, - "learning_rate": 1.4180759578795258e-05, - "loss": 3.9992, - "mean_token_accuracy": 0.3104838728904724, - "step": 8830 - }, - { - "epoch": 0.4269190833796151, - "grad_norm": 2.3257342531686045, - "learning_rate": 1.4165439125543659e-05, - "loss": 3.9672, - "mean_token_accuracy": 0.3222782239317894, - "step": 8840 - }, - { - "epoch": 0.42740202351918477, - "grad_norm": 2.270032724911227, - "learning_rate": 1.4150106833656801e-05, - "loss": 3.9555, - "mean_token_accuracy": 0.3232862904667854, - "step": 8850 - }, - { - "epoch": 0.4278849636587545, - "grad_norm": 2.225139220617103, - "learning_rate": 1.4134762746710741e-05, - "loss": 3.9465, - "mean_token_accuracy": 0.31743951588869096, - "step": 8860 - }, - { - "epoch": 0.4283679037983242, - "grad_norm": 2.143885802204563, - "learning_rate": 1.4119406908315065e-05, - "loss": 3.9324, - "mean_token_accuracy": 0.3220766097307205, - "step": 8870 - }, - { - "epoch": 0.4288508439378939, - "grad_norm": 2.2760174263761077, - "learning_rate": 1.4104039362112745e-05, - "loss": 3.9664, - "mean_token_accuracy": 0.3151209682226181, - "step": 8880 - }, - { - "epoch": 0.4293337840774636, - "grad_norm": 2.2498136887954576, - "learning_rate": 1.4088660151780035e-05, - "loss": 3.9051, - "mean_token_accuracy": 0.3233870953321457, - "step": 8890 - }, - { - "epoch": 0.4298167242170333, - "grad_norm": 2.3821445970295216, - "learning_rate": 1.4073269321026342e-05, - "loss": 3.9484, - "mean_token_accuracy": 0.3211693540215492, - "step": 8900 - }, - { - "epoch": 0.4298167242170333, - "eval_runtime": 7.783, - "eval_samples_per_second": 379.545, - "eval_steps_per_second": 23.77, - "step": 8900 - }, - { - "epoch": 0.430299664356603, - "grad_norm": 2.401569834089016, - "learning_rate": 1.4057866913594092e-05, - "loss": 3.9688, - "mean_token_accuracy": 0.3162298396229744, - "step": 8910 - }, - { - "epoch": 0.43078260449617267, - "grad_norm": 2.154146649871768, - "learning_rate": 1.404245297325862e-05, - "loss": 3.9457, - "mean_token_accuracy": 0.31804435700178146, - "step": 8920 - }, - { - "epoch": 0.4312655446357424, - "grad_norm": 2.3684647515183177, - "learning_rate": 1.4027027543828043e-05, - "loss": 3.9621, - "mean_token_accuracy": 0.30473790168762205, - "step": 8930 - }, - { - "epoch": 0.4317484847753121, - "grad_norm": 2.3698024233540362, - "learning_rate": 1.4011590669143112e-05, - "loss": 3.957, - "mean_token_accuracy": 0.3072580635547638, - "step": 8940 - }, - { - "epoch": 0.4322314249148818, - "grad_norm": 2.214949557739117, - "learning_rate": 1.3996142393077128e-05, - "loss": 3.9004, - "mean_token_accuracy": 0.32247983664274216, - "step": 8950 - }, - { - "epoch": 0.4327143650544515, - "grad_norm": 2.2609835092620583, - "learning_rate": 1.3980682759535784e-05, - "loss": 4.0207, - "mean_token_accuracy": 0.3058467753231525, - "step": 8960 - }, - { - "epoch": 0.4331973051940212, - "grad_norm": 2.276536125550772, - "learning_rate": 1.3965211812457055e-05, - "loss": 3.9355, - "mean_token_accuracy": 0.316129033267498, - "step": 8970 - }, - { - "epoch": 0.4336802453335909, - "grad_norm": 2.2358364256812724, - "learning_rate": 1.3949729595811077e-05, - "loss": 3.982, - "mean_token_accuracy": 0.3171370968222618, - "step": 8980 - }, - { - "epoch": 0.43416318547316063, - "grad_norm": 2.1560697570679688, - "learning_rate": 1.3934236153600002e-05, - "loss": 3.9379, - "mean_token_accuracy": 0.31401209384202955, - "step": 8990 - }, - { - "epoch": 0.4346461256127303, - "grad_norm": 2.1936154624335344, - "learning_rate": 1.3918731529857904e-05, - "loss": 3.9953, - "mean_token_accuracy": 0.3138104841113091, - "step": 9000 - }, - { - "epoch": 0.4346461256127303, - "eval_runtime": 7.7834, - "eval_samples_per_second": 379.528, - "eval_steps_per_second": 23.769, - "step": 9000 - }, - { - "epoch": 0.4351290657523, - "grad_norm": 2.1575081284022177, - "learning_rate": 1.3903215768650609e-05, - "loss": 3.9266, - "mean_token_accuracy": 0.319556450843811, - "step": 9010 - }, - { - "epoch": 0.4356120058918697, - "grad_norm": 2.345633670110874, - "learning_rate": 1.3887688914075628e-05, - "loss": 3.9949, - "mean_token_accuracy": 0.3073588714003563, - "step": 9020 - }, - { - "epoch": 0.4360949460314394, - "grad_norm": 2.3863605971117625, - "learning_rate": 1.387215101026198e-05, - "loss": 4.0414, - "mean_token_accuracy": 0.3067540302872658, - "step": 9030 - }, - { - "epoch": 0.4365778861710091, - "grad_norm": 2.232717047721671, - "learning_rate": 1.3856602101370095e-05, - "loss": 3.9133, - "mean_token_accuracy": 0.3209677428007126, - "step": 9040 - }, - { - "epoch": 0.4370608263105788, - "grad_norm": 2.267300866113889, - "learning_rate": 1.384104223159168e-05, - "loss": 3.9359, - "mean_token_accuracy": 0.31844758093357084, - "step": 9050 - }, - { - "epoch": 0.43754376645014853, - "grad_norm": 2.4453508923457736, - "learning_rate": 1.3825471445149589e-05, - "loss": 4.0, - "mean_token_accuracy": 0.3066532239317894, - "step": 9060 - }, - { - "epoch": 0.4380267065897182, - "grad_norm": 2.2267880358266043, - "learning_rate": 1.380988978629771e-05, - "loss": 3.9133, - "mean_token_accuracy": 0.322379033267498, - "step": 9070 - }, - { - "epoch": 0.4385096467292879, - "grad_norm": 2.384098935035482, - "learning_rate": 1.379429729932083e-05, - "loss": 4.0133, - "mean_token_accuracy": 0.3161290273070335, - "step": 9080 - }, - { - "epoch": 0.4389925868688576, - "grad_norm": 2.303557148223578, - "learning_rate": 1.3778694028534498e-05, - "loss": 3.941, - "mean_token_accuracy": 0.3209677428007126, - "step": 9090 - }, - { - "epoch": 0.4394755270084273, - "grad_norm": 2.3744461420478533, - "learning_rate": 1.3763080018284933e-05, - "loss": 3.9652, - "mean_token_accuracy": 0.3171370968222618, - "step": 9100 - }, - { - "epoch": 0.4394755270084273, - "eval_runtime": 7.7948, - "eval_samples_per_second": 378.969, - "eval_steps_per_second": 23.734, - "step": 9100 - }, - { - "epoch": 0.439958467147997, - "grad_norm": 2.394834932677048, - "learning_rate": 1.3747455312948866e-05, - "loss": 4.0305, - "mean_token_accuracy": 0.3071572557091713, - "step": 9110 - }, - { - "epoch": 0.4404414072875667, - "grad_norm": 2.172997247257797, - "learning_rate": 1.3731819956933419e-05, - "loss": 3.8891, - "mean_token_accuracy": 0.3120967745780945, - "step": 9120 - }, - { - "epoch": 0.44092434742713643, - "grad_norm": 2.3921038051430323, - "learning_rate": 1.3716173994675995e-05, - "loss": 3.9277, - "mean_token_accuracy": 0.30856855064630506, - "step": 9130 - }, - { - "epoch": 0.4414072875667061, - "grad_norm": 2.380354175518894, - "learning_rate": 1.370051747064413e-05, - "loss": 3.9543, - "mean_token_accuracy": 0.3088709689676762, - "step": 9140 - }, - { - "epoch": 0.4418902277062758, - "grad_norm": 2.2681764924645833, - "learning_rate": 1.3684850429335392e-05, - "loss": 3.9375, - "mean_token_accuracy": 0.3178427413105965, - "step": 9150 - }, - { - "epoch": 0.4423731678458455, - "grad_norm": 2.231651451929135, - "learning_rate": 1.3669172915277228e-05, - "loss": 3.9336, - "mean_token_accuracy": 0.3242943525314331, - "step": 9160 - }, - { - "epoch": 0.4428561079854152, - "grad_norm": 2.2826170704440396, - "learning_rate": 1.3653484973026854e-05, - "loss": 3.959, - "mean_token_accuracy": 0.31703629195690153, - "step": 9170 - }, - { - "epoch": 0.4433390481249849, - "grad_norm": 2.2816822868383797, - "learning_rate": 1.3637786647171122e-05, - "loss": 3.9797, - "mean_token_accuracy": 0.3132056429982185, - "step": 9180 - }, - { - "epoch": 0.4438219882645546, - "grad_norm": 2.282689470466177, - "learning_rate": 1.3622077982326403e-05, - "loss": 3.9418, - "mean_token_accuracy": 0.3214717760682106, - "step": 9190 - }, - { - "epoch": 0.44430492840412433, - "grad_norm": 2.2627975614159492, - "learning_rate": 1.3606359023138445e-05, - "loss": 3.9309, - "mean_token_accuracy": 0.3224798381328583, - "step": 9200 - }, - { - "epoch": 0.44430492840412433, - "eval_runtime": 7.8104, - "eval_samples_per_second": 378.212, - "eval_steps_per_second": 23.686, - "step": 9200 - }, - { - "epoch": 0.444787868543694, - "grad_norm": 2.2483442824641036, - "learning_rate": 1.359062981428225e-05, - "loss": 3.9121, - "mean_token_accuracy": 0.3189516142010689, - "step": 9210 - }, - { - "epoch": 0.4452708086832637, - "grad_norm": 2.269327754254287, - "learning_rate": 1.3574890400461963e-05, - "loss": 4.0125, - "mean_token_accuracy": 0.31300403326749804, - "step": 9220 - }, - { - "epoch": 0.4457537488228334, - "grad_norm": 2.445299738412008, - "learning_rate": 1.3559140826410724e-05, - "loss": 3.9691, - "mean_token_accuracy": 0.3160282239317894, - "step": 9230 - }, - { - "epoch": 0.4462366889624031, - "grad_norm": 2.2992504773357854, - "learning_rate": 1.354338113689055e-05, - "loss": 3.9277, - "mean_token_accuracy": 0.31834677457809446, - "step": 9240 - }, - { - "epoch": 0.4467196291019728, - "grad_norm": 2.273439600994875, - "learning_rate": 1.3527611376692209e-05, - "loss": 3.9535, - "mean_token_accuracy": 0.3162298396229744, - "step": 9250 - }, - { - "epoch": 0.4472025692415425, - "grad_norm": 2.238553038082488, - "learning_rate": 1.3511831590635091e-05, - "loss": 3.9395, - "mean_token_accuracy": 0.31340725868940356, - "step": 9260 - }, - { - "epoch": 0.44768550938111223, - "grad_norm": 2.225002583957854, - "learning_rate": 1.3496041823567082e-05, - "loss": 3.8457, - "mean_token_accuracy": 0.32419354617595675, - "step": 9270 - }, - { - "epoch": 0.4481684495206819, - "grad_norm": 2.440731510413359, - "learning_rate": 1.348024212036443e-05, - "loss": 3.9398, - "mean_token_accuracy": 0.3142137110233307, - "step": 9280 - }, - { - "epoch": 0.4486513896602516, - "grad_norm": 2.338191980385382, - "learning_rate": 1.3464432525931627e-05, - "loss": 3.9293, - "mean_token_accuracy": 0.31502016335725785, - "step": 9290 - }, - { - "epoch": 0.4491343297998213, - "grad_norm": 2.371082201919232, - "learning_rate": 1.3448613085201278e-05, - "loss": 3.9152, - "mean_token_accuracy": 0.3167338728904724, - "step": 9300 - }, - { - "epoch": 0.4491343297998213, - "eval_runtime": 7.7579, - "eval_samples_per_second": 380.771, - "eval_steps_per_second": 23.847, - "step": 9300 - }, - { - "epoch": 0.449617269939391, - "grad_norm": 2.3399554490922645, - "learning_rate": 1.3432783843133965e-05, - "loss": 3.9305, - "mean_token_accuracy": 0.3200604856014252, - "step": 9310 - }, - { - "epoch": 0.4501002100789607, - "grad_norm": 2.1538933836986507, - "learning_rate": 1.341694484471814e-05, - "loss": 3.9668, - "mean_token_accuracy": 0.31754032224416734, - "step": 9320 - }, - { - "epoch": 0.4505831502185304, - "grad_norm": 2.293289133417718, - "learning_rate": 1.3401096134969969e-05, - "loss": 3.9414, - "mean_token_accuracy": 0.3149193570017815, - "step": 9330 - }, - { - "epoch": 0.45106609035810014, - "grad_norm": 2.326297513006786, - "learning_rate": 1.3385237758933225e-05, - "loss": 3.9285, - "mean_token_accuracy": 0.3254032239317894, - "step": 9340 - }, - { - "epoch": 0.4515490304976698, - "grad_norm": 2.2071577920584455, - "learning_rate": 1.3369369761679156e-05, - "loss": 3.9957, - "mean_token_accuracy": 0.3078629046678543, - "step": 9350 - }, - { - "epoch": 0.4520319706372395, - "grad_norm": 2.241496164141993, - "learning_rate": 1.3353492188306349e-05, - "loss": 3.934, - "mean_token_accuracy": 0.3123991936445236, - "step": 9360 - }, - { - "epoch": 0.4525149107768092, - "grad_norm": 2.3380920777547494, - "learning_rate": 1.3337605083940614e-05, - "loss": 3.9848, - "mean_token_accuracy": 0.31139112561941146, - "step": 9370 - }, - { - "epoch": 0.4529978509163789, - "grad_norm": 2.231851607449588, - "learning_rate": 1.3321708493734844e-05, - "loss": 3.8746, - "mean_token_accuracy": 0.3270161300897598, - "step": 9380 - }, - { - "epoch": 0.45348079105594863, - "grad_norm": 2.367890514060785, - "learning_rate": 1.3305802462868894e-05, - "loss": 3.9422, - "mean_token_accuracy": 0.31905241757631303, - "step": 9390 - }, - { - "epoch": 0.4539637311955183, - "grad_norm": 2.2455674457503187, - "learning_rate": 1.3289887036549447e-05, - "loss": 4.0, - "mean_token_accuracy": 0.312802417576313, - "step": 9400 - }, - { - "epoch": 0.4539637311955183, - "eval_runtime": 7.7648, - "eval_samples_per_second": 380.434, - "eval_steps_per_second": 23.825, - "step": 9400 - }, - { - "epoch": 0.45444667133508804, - "grad_norm": 2.323137998201826, - "learning_rate": 1.3273962260009895e-05, - "loss": 3.9586, - "mean_token_accuracy": 0.3180443555116653, - "step": 9410 - }, - { - "epoch": 0.4549296114746577, - "grad_norm": 2.398052115634056, - "learning_rate": 1.32580281785102e-05, - "loss": 3.959, - "mean_token_accuracy": 0.31754032522439957, - "step": 9420 - }, - { - "epoch": 0.4554125516142274, - "grad_norm": 2.3919912724781427, - "learning_rate": 1.3242084837336772e-05, - "loss": 3.8762, - "mean_token_accuracy": 0.3264112904667854, - "step": 9430 - }, - { - "epoch": 0.4558954917537971, - "grad_norm": 2.2675232139312205, - "learning_rate": 1.3226132281802335e-05, - "loss": 3.9656, - "mean_token_accuracy": 0.31683467477560046, - "step": 9440 - }, - { - "epoch": 0.4563784318933668, - "grad_norm": 2.228346077251232, - "learning_rate": 1.3210170557245806e-05, - "loss": 4.0035, - "mean_token_accuracy": 0.3098790317773819, - "step": 9450 - }, - { - "epoch": 0.45686137203293653, - "grad_norm": 2.2966577634723664, - "learning_rate": 1.319419970903215e-05, - "loss": 3.9625, - "mean_token_accuracy": 0.3170362889766693, - "step": 9460 - }, - { - "epoch": 0.4573443121725062, - "grad_norm": 2.215148758529441, - "learning_rate": 1.3178219782552282e-05, - "loss": 3.9332, - "mean_token_accuracy": 0.3218749985098839, - "step": 9470 - }, - { - "epoch": 0.45782725231207594, - "grad_norm": 2.2667547610082326, - "learning_rate": 1.3162230823222901e-05, - "loss": 4.0102, - "mean_token_accuracy": 0.3093750014901161, - "step": 9480 - }, - { - "epoch": 0.4583101924516456, - "grad_norm": 2.085511179511814, - "learning_rate": 1.3146232876486384e-05, - "loss": 3.9547, - "mean_token_accuracy": 0.31068548411130903, - "step": 9490 - }, - { - "epoch": 0.4587931325912153, - "grad_norm": 2.263111990078179, - "learning_rate": 1.3130225987810657e-05, - "loss": 3.9461, - "mean_token_accuracy": 0.3170362874865532, - "step": 9500 - }, - { - "epoch": 0.4587931325912153, - "eval_runtime": 7.7917, - "eval_samples_per_second": 379.123, - "eval_steps_per_second": 23.743, - "step": 9500 - }, - { - "epoch": 0.459276072730785, - "grad_norm": 2.2530238073022417, - "learning_rate": 1.3114210202689048e-05, - "loss": 3.934, - "mean_token_accuracy": 0.3177419364452362, - "step": 9510 - }, - { - "epoch": 0.4597590128703547, - "grad_norm": 2.2585304557956176, - "learning_rate": 1.309818556664018e-05, - "loss": 3.9332, - "mean_token_accuracy": 0.322379033267498, - "step": 9520 - }, - { - "epoch": 0.46024195300992443, - "grad_norm": 2.207587094785256, - "learning_rate": 1.308215212520783e-05, - "loss": 3.9406, - "mean_token_accuracy": 0.31370967626571655, - "step": 9530 - }, - { - "epoch": 0.4607248931494941, - "grad_norm": 2.2335953303518257, - "learning_rate": 1.3066109923960794e-05, - "loss": 3.9105, - "mean_token_accuracy": 0.32237903475761415, - "step": 9540 - }, - { - "epoch": 0.46120783328906384, - "grad_norm": 2.3310224784615086, - "learning_rate": 1.3050059008492771e-05, - "loss": 3.9855, - "mean_token_accuracy": 0.31451612859964373, - "step": 9550 - }, - { - "epoch": 0.4616907734286335, - "grad_norm": 2.116490389888414, - "learning_rate": 1.3033999424422228e-05, - "loss": 3.9207, - "mean_token_accuracy": 0.3198588743805885, - "step": 9560 - }, - { - "epoch": 0.46217371356820325, - "grad_norm": 2.2997641313449866, - "learning_rate": 1.3017931217392259e-05, - "loss": 4.0242, - "mean_token_accuracy": 0.3093750014901161, - "step": 9570 - }, - { - "epoch": 0.4626566537077729, - "grad_norm": 2.181443807057096, - "learning_rate": 1.3001854433070478e-05, - "loss": 3.9418, - "mean_token_accuracy": 0.3158266082406044, - "step": 9580 - }, - { - "epoch": 0.4631395938473426, - "grad_norm": 2.403186175938397, - "learning_rate": 1.2985769117148867e-05, - "loss": 3.993, - "mean_token_accuracy": 0.31713709980249405, - "step": 9590 - }, - { - "epoch": 0.46362253398691233, - "grad_norm": 2.2126270345800663, - "learning_rate": 1.2969675315343663e-05, - "loss": 3.9359, - "mean_token_accuracy": 0.3174395188689232, - "step": 9600 - }, - { - "epoch": 0.46362253398691233, - "eval_runtime": 7.7796, - "eval_samples_per_second": 379.71, - "eval_steps_per_second": 23.78, - "step": 9600 - }, - { - "epoch": 0.464105474126482, - "grad_norm": 2.45734268606812, - "learning_rate": 1.2953573073395219e-05, - "loss": 3.9441, - "mean_token_accuracy": 0.3205645173788071, - "step": 9610 - }, - { - "epoch": 0.46458841426605174, - "grad_norm": 2.3243357566422427, - "learning_rate": 1.2937462437067866e-05, - "loss": 3.9609, - "mean_token_accuracy": 0.3123991936445236, - "step": 9620 - }, - { - "epoch": 0.4650713544056214, - "grad_norm": 2.458474031772993, - "learning_rate": 1.2921343452149811e-05, - "loss": 3.8781, - "mean_token_accuracy": 0.33306451588869096, - "step": 9630 - }, - { - "epoch": 0.46555429454519115, - "grad_norm": 2.4502888914113004, - "learning_rate": 1.290521616445297e-05, - "loss": 3.9398, - "mean_token_accuracy": 0.32046370953321457, - "step": 9640 - }, - { - "epoch": 0.4660372346847608, - "grad_norm": 2.298589153856968, - "learning_rate": 1.288908061981287e-05, - "loss": 4.0453, - "mean_token_accuracy": 0.30846774131059645, - "step": 9650 - }, - { - "epoch": 0.4665201748243305, - "grad_norm": 2.278982290215792, - "learning_rate": 1.28729368640885e-05, - "loss": 3.916, - "mean_token_accuracy": 0.3150201618671417, - "step": 9660 - }, - { - "epoch": 0.46700311496390023, - "grad_norm": 2.2573462629551253, - "learning_rate": 1.2856784943162181e-05, - "loss": 3.923, - "mean_token_accuracy": 0.3208669349551201, - "step": 9670 - }, - { - "epoch": 0.4674860551034699, - "grad_norm": 2.396623800295243, - "learning_rate": 1.2840624902939452e-05, - "loss": 3.9633, - "mean_token_accuracy": 0.31512096971273423, - "step": 9680 - }, - { - "epoch": 0.46796899524303964, - "grad_norm": 2.396254030297396, - "learning_rate": 1.282445678934892e-05, - "loss": 3.9676, - "mean_token_accuracy": 0.31774193346500396, - "step": 9690 - }, - { - "epoch": 0.4684519353826093, - "grad_norm": 2.3470138750190097, - "learning_rate": 1.2808280648342134e-05, - "loss": 3.9488, - "mean_token_accuracy": 0.3152217760682106, - "step": 9700 - }, - { - "epoch": 0.4684519353826093, - "eval_runtime": 7.7998, - "eval_samples_per_second": 378.728, - "eval_steps_per_second": 23.719, - "step": 9700 - }, - { - "epoch": 0.46893487552217905, - "grad_norm": 2.3919409766177164, - "learning_rate": 1.279209652589347e-05, - "loss": 3.9918, - "mean_token_accuracy": 0.31602822467684744, - "step": 9710 - }, - { - "epoch": 0.4694178156617487, - "grad_norm": 2.32929020746566, - "learning_rate": 1.2775904467999973e-05, - "loss": 3.9418, - "mean_token_accuracy": 0.31723790615797043, - "step": 9720 - }, - { - "epoch": 0.4699007558013184, - "grad_norm": 2.2356462523592735, - "learning_rate": 1.2759704520681253e-05, - "loss": 3.957, - "mean_token_accuracy": 0.31229838728904724, - "step": 9730 - }, - { - "epoch": 0.47038369594088814, - "grad_norm": 2.275634011665633, - "learning_rate": 1.2743496729979338e-05, - "loss": 3.9527, - "mean_token_accuracy": 0.31683467924594877, - "step": 9740 - }, - { - "epoch": 0.4708666360804578, - "grad_norm": 2.3433856130443513, - "learning_rate": 1.272728114195855e-05, - "loss": 3.9996, - "mean_token_accuracy": 0.3057459682226181, - "step": 9750 - }, - { - "epoch": 0.47134957622002754, - "grad_norm": 2.2118099297941742, - "learning_rate": 1.2711057802705369e-05, - "loss": 3.9645, - "mean_token_accuracy": 0.32076613008975985, - "step": 9760 - }, - { - "epoch": 0.4718325163595972, - "grad_norm": 2.0910877221753332, - "learning_rate": 1.2694826758328303e-05, - "loss": 3.9074, - "mean_token_accuracy": 0.32127016335725783, - "step": 9770 - }, - { - "epoch": 0.47231545649916695, - "grad_norm": 2.305421910998035, - "learning_rate": 1.2678588054957766e-05, - "loss": 3.9648, - "mean_token_accuracy": 0.3132056459784508, - "step": 9780 - }, - { - "epoch": 0.47279839663873663, - "grad_norm": 2.447527686786078, - "learning_rate": 1.2662341738745934e-05, - "loss": 3.9348, - "mean_token_accuracy": 0.3136088699102402, - "step": 9790 - }, - { - "epoch": 0.4732813367783063, - "grad_norm": 2.394433415830251, - "learning_rate": 1.264608785586662e-05, - "loss": 4.0125, - "mean_token_accuracy": 0.3113911300897598, - "step": 9800 - }, - { - "epoch": 0.4732813367783063, - "eval_runtime": 7.7965, - "eval_samples_per_second": 378.886, - "eval_steps_per_second": 23.728, - "step": 9800 - }, - { - "epoch": 0.47376427691787604, - "grad_norm": 2.261396764613608, - "learning_rate": 1.2629826452515146e-05, - "loss": 3.9477, - "mean_token_accuracy": 0.31250000149011614, - "step": 9810 - }, - { - "epoch": 0.4742472170574457, - "grad_norm": 2.310636829001386, - "learning_rate": 1.2613557574908203e-05, - "loss": 3.9453, - "mean_token_accuracy": 0.31481854915618895, - "step": 9820 - }, - { - "epoch": 0.47473015719701545, - "grad_norm": 2.3013881999511248, - "learning_rate": 1.2597281269283727e-05, - "loss": 3.9102, - "mean_token_accuracy": 0.3211693540215492, - "step": 9830 - }, - { - "epoch": 0.4752130973365851, - "grad_norm": 2.1879800213534644, - "learning_rate": 1.2580997581900771e-05, - "loss": 3.9777, - "mean_token_accuracy": 0.309173384308815, - "step": 9840 - }, - { - "epoch": 0.47569603747615485, - "grad_norm": 2.3377746248218534, - "learning_rate": 1.2564706559039355e-05, - "loss": 3.9937, - "mean_token_accuracy": 0.31118951216340063, - "step": 9850 - }, - { - "epoch": 0.47617897761572453, - "grad_norm": 2.1164390877555084, - "learning_rate": 1.2548408247000356e-05, - "loss": 3.9363, - "mean_token_accuracy": 0.3177419364452362, - "step": 9860 - }, - { - "epoch": 0.4766619177552942, - "grad_norm": 2.321940221914267, - "learning_rate": 1.2532102692105368e-05, - "loss": 3.9176, - "mean_token_accuracy": 0.3163306459784508, - "step": 9870 - }, - { - "epoch": 0.47714485789486394, - "grad_norm": 2.3141071723130775, - "learning_rate": 1.2515789940696568e-05, - "loss": 3.9109, - "mean_token_accuracy": 0.322379033267498, - "step": 9880 - }, - { - "epoch": 0.4776277980344336, - "grad_norm": 2.294320812990155, - "learning_rate": 1.2499470039136586e-05, - "loss": 3.9469, - "mean_token_accuracy": 0.3117943540215492, - "step": 9890 - }, - { - "epoch": 0.47811073817400335, - "grad_norm": 2.4159952300234844, - "learning_rate": 1.248314303380837e-05, - "loss": 3.9961, - "mean_token_accuracy": 0.3142137065529823, - "step": 9900 - }, - { - "epoch": 0.47811073817400335, - "eval_runtime": 7.7835, - "eval_samples_per_second": 379.522, - "eval_steps_per_second": 23.768, - "step": 9900 - }, - { - "epoch": 0.478593678313573, - "grad_norm": 2.1712414152335473, - "learning_rate": 1.2466808971115065e-05, - "loss": 3.9656, - "mean_token_accuracy": 0.3052419349551201, - "step": 9910 - }, - { - "epoch": 0.47907661845314276, - "grad_norm": 2.3495237798146036, - "learning_rate": 1.2450467897479868e-05, - "loss": 3.8773, - "mean_token_accuracy": 0.32268145233392714, - "step": 9920 - }, - { - "epoch": 0.47955955859271243, - "grad_norm": 2.2143312103701334, - "learning_rate": 1.2434119859345908e-05, - "loss": 3.943, - "mean_token_accuracy": 0.31653226017951963, - "step": 9930 - }, - { - "epoch": 0.4800424987322821, - "grad_norm": 2.180722180897748, - "learning_rate": 1.24177649031761e-05, - "loss": 3.8539, - "mean_token_accuracy": 0.3285282239317894, - "step": 9940 - }, - { - "epoch": 0.48052543887185184, - "grad_norm": 2.2574779401170684, - "learning_rate": 1.2401403075453029e-05, - "loss": 3.9254, - "mean_token_accuracy": 0.3148185446858406, - "step": 9950 - }, - { - "epoch": 0.4810083790114215, - "grad_norm": 2.1759463002031767, - "learning_rate": 1.23850344226788e-05, - "loss": 3.9422, - "mean_token_accuracy": 0.32006048411130905, - "step": 9960 - }, - { - "epoch": 0.48149131915099125, - "grad_norm": 2.3677686123349577, - "learning_rate": 1.2368658991374926e-05, - "loss": 3.9238, - "mean_token_accuracy": 0.31522177532315254, - "step": 9970 - }, - { - "epoch": 0.4819742592905609, - "grad_norm": 2.175187690610891, - "learning_rate": 1.2352276828082177e-05, - "loss": 3.9129, - "mean_token_accuracy": 0.3162298396229744, - "step": 9980 - }, - { - "epoch": 0.48245719943013066, - "grad_norm": 2.280563514433045, - "learning_rate": 1.2335887979360462e-05, - "loss": 3.85, - "mean_token_accuracy": 0.3267137110233307, - "step": 9990 - }, - { - "epoch": 0.48294013956970033, - "grad_norm": 2.283340175136506, - "learning_rate": 1.2319492491788685e-05, - "loss": 3.9324, - "mean_token_accuracy": 0.31754032224416734, - "step": 10000 - }, - { - "epoch": 0.48294013956970033, - "eval_runtime": 7.7816, - "eval_samples_per_second": 379.615, - "eval_steps_per_second": 23.774, - "step": 10000 - }, - { - "epoch": 0.48342307970927, - "grad_norm": 2.1493249415035782, - "learning_rate": 1.2303090411964627e-05, - "loss": 3.9215, - "mean_token_accuracy": 0.3209677428007126, - "step": 10010 - }, - { - "epoch": 0.48390601984883974, - "grad_norm": 2.4424066440382037, - "learning_rate": 1.2286681786504795e-05, - "loss": 3.9117, - "mean_token_accuracy": 0.3243951588869095, - "step": 10020 - }, - { - "epoch": 0.4843889599884094, - "grad_norm": 2.2762859628611185, - "learning_rate": 1.2270266662044301e-05, - "loss": 3.8781, - "mean_token_accuracy": 0.3213709697127342, - "step": 10030 - }, - { - "epoch": 0.48487190012797915, - "grad_norm": 2.3210041214373103, - "learning_rate": 1.225384508523674e-05, - "loss": 4.0113, - "mean_token_accuracy": 0.30967741906642915, - "step": 10040 - }, - { - "epoch": 0.4853548402675488, - "grad_norm": 2.495495862554966, - "learning_rate": 1.2237417102754025e-05, - "loss": 3.9047, - "mean_token_accuracy": 0.3147177375853062, - "step": 10050 - }, - { - "epoch": 0.48583778040711856, - "grad_norm": 2.339844516174924, - "learning_rate": 1.2220982761286294e-05, - "loss": 3.9547, - "mean_token_accuracy": 0.3172379046678543, - "step": 10060 - }, - { - "epoch": 0.48632072054668823, - "grad_norm": 2.2196723011029826, - "learning_rate": 1.2204542107541746e-05, - "loss": 3.8898, - "mean_token_accuracy": 0.322177417576313, - "step": 10070 - }, - { - "epoch": 0.4868036606862579, - "grad_norm": 2.332789857876502, - "learning_rate": 1.2188095188246524e-05, - "loss": 3.9613, - "mean_token_accuracy": 0.3122983857989311, - "step": 10080 - }, - { - "epoch": 0.48728660082582764, - "grad_norm": 2.259344403341194, - "learning_rate": 1.2171642050144576e-05, - "loss": 3.9891, - "mean_token_accuracy": 0.3157258093357086, - "step": 10090 - }, - { - "epoch": 0.4877695409653973, - "grad_norm": 2.472643268483903, - "learning_rate": 1.2155182739997533e-05, - "loss": 4.0141, - "mean_token_accuracy": 0.31108870804309846, - "step": 10100 - }, - { - "epoch": 0.4877695409653973, - "eval_runtime": 7.7784, - "eval_samples_per_second": 379.772, - "eval_steps_per_second": 23.784, - "step": 10100 - }, - { - "epoch": 0.48825248110496705, - "grad_norm": 2.202889164832667, - "learning_rate": 1.2138717304584555e-05, - "loss": 3.9086, - "mean_token_accuracy": 0.32459677308797835, - "step": 10110 - }, - { - "epoch": 0.4887354212445367, - "grad_norm": 2.182021867116535, - "learning_rate": 1.212224579070222e-05, - "loss": 3.9539, - "mean_token_accuracy": 0.31733871102333067, - "step": 10120 - }, - { - "epoch": 0.48921836138410646, - "grad_norm": 2.265691829180602, - "learning_rate": 1.2105768245164377e-05, - "loss": 3.9965, - "mean_token_accuracy": 0.3166330635547638, - "step": 10130 - }, - { - "epoch": 0.48970130152367614, - "grad_norm": 2.3732455717470695, - "learning_rate": 1.2089284714802021e-05, - "loss": 3.9504, - "mean_token_accuracy": 0.3175403192639351, - "step": 10140 - }, - { - "epoch": 0.49018424166324587, - "grad_norm": 2.201656192904955, - "learning_rate": 1.2072795246463156e-05, - "loss": 3.9332, - "mean_token_accuracy": 0.3156249985098839, - "step": 10150 - }, - { - "epoch": 0.49066718180281554, - "grad_norm": 2.413824988697088, - "learning_rate": 1.2056299887012654e-05, - "loss": 3.9051, - "mean_token_accuracy": 0.3150201618671417, - "step": 10160 - }, - { - "epoch": 0.4911501219423852, - "grad_norm": 2.19345059928571, - "learning_rate": 1.2039798683332145e-05, - "loss": 3.9703, - "mean_token_accuracy": 0.30756048709154127, - "step": 10170 - }, - { - "epoch": 0.49163306208195495, - "grad_norm": 2.169462612804747, - "learning_rate": 1.2023291682319858e-05, - "loss": 3.9129, - "mean_token_accuracy": 0.32046370953321457, - "step": 10180 - }, - { - "epoch": 0.49211600222152463, - "grad_norm": 2.2622377753071885, - "learning_rate": 1.2006778930890503e-05, - "loss": 3.975, - "mean_token_accuracy": 0.31129032373428345, - "step": 10190 - }, - { - "epoch": 0.49259894236109436, - "grad_norm": 2.2386350798000385, - "learning_rate": 1.199026047597513e-05, - "loss": 3.9305, - "mean_token_accuracy": 0.32328629195690156, - "step": 10200 - }, - { - "epoch": 0.49259894236109436, - "eval_runtime": 7.7922, - "eval_samples_per_second": 379.099, - "eval_steps_per_second": 23.742, - "step": 10200 - }, - { - "epoch": 0.49308188250066404, - "grad_norm": 2.3260327986868417, - "learning_rate": 1.1973736364521005e-05, - "loss": 3.982, - "mean_token_accuracy": 0.31098790317773817, - "step": 10210 - }, - { - "epoch": 0.49356482264023377, - "grad_norm": 2.2568709878274, - "learning_rate": 1.1957206643491463e-05, - "loss": 3.9852, - "mean_token_accuracy": 0.31784273982048034, - "step": 10220 - }, - { - "epoch": 0.49404776277980345, - "grad_norm": 2.398317256605666, - "learning_rate": 1.194067135986579e-05, - "loss": 3.9937, - "mean_token_accuracy": 0.3069556444883347, - "step": 10230 - }, - { - "epoch": 0.4945307029193731, - "grad_norm": 2.238419212183823, - "learning_rate": 1.1924130560639072e-05, - "loss": 3.9172, - "mean_token_accuracy": 0.3240927442908287, - "step": 10240 - }, - { - "epoch": 0.49501364305894285, - "grad_norm": 2.26556200416511, - "learning_rate": 1.190758429282208e-05, - "loss": 3.9523, - "mean_token_accuracy": 0.31602822691202165, - "step": 10250 - }, - { - "epoch": 0.49549658319851253, - "grad_norm": 2.200531837278337, - "learning_rate": 1.1891032603441121e-05, - "loss": 3.9422, - "mean_token_accuracy": 0.31834677457809446, - "step": 10260 - }, - { - "epoch": 0.49597952333808226, - "grad_norm": 2.279611482245076, - "learning_rate": 1.1874475539537917e-05, - "loss": 3.9008, - "mean_token_accuracy": 0.32368951439857485, - "step": 10270 - }, - { - "epoch": 0.49646246347765194, - "grad_norm": 2.3156117711132564, - "learning_rate": 1.1857913148169455e-05, - "loss": 3.9438, - "mean_token_accuracy": 0.31451612859964373, - "step": 10280 - }, - { - "epoch": 0.49694540361722167, - "grad_norm": 2.1610310175257283, - "learning_rate": 1.1841345476407877e-05, - "loss": 3.9379, - "mean_token_accuracy": 0.3158266142010689, - "step": 10290 - }, - { - "epoch": 0.49742834375679135, - "grad_norm": 2.2393961935207556, - "learning_rate": 1.1824772571340319e-05, - "loss": 3.993, - "mean_token_accuracy": 0.31683467626571654, - "step": 10300 - }, - { - "epoch": 0.49742834375679135, - "eval_runtime": 7.8092, - "eval_samples_per_second": 378.272, - "eval_steps_per_second": 23.69, - "step": 10300 - }, - { - "epoch": 0.497911283896361, - "grad_norm": 2.246921590257132, - "learning_rate": 1.1808194480068798e-05, - "loss": 3.9641, - "mean_token_accuracy": 0.31693548560142515, - "step": 10310 - }, - { - "epoch": 0.49839422403593076, - "grad_norm": 2.266915245346127, - "learning_rate": 1.179161124971007e-05, - "loss": 3.9164, - "mean_token_accuracy": 0.3166330635547638, - "step": 10320 - }, - { - "epoch": 0.49887716417550043, - "grad_norm": 2.3696333857943777, - "learning_rate": 1.1775022927395496e-05, - "loss": 4.0281, - "mean_token_accuracy": 0.3110887110233307, - "step": 10330 - }, - { - "epoch": 0.49936010431507016, - "grad_norm": 2.2861514387756565, - "learning_rate": 1.175842956027091e-05, - "loss": 3.991, - "mean_token_accuracy": 0.31381048262119293, - "step": 10340 - }, - { - "epoch": 0.49984304445463984, - "grad_norm": 2.2991366720026, - "learning_rate": 1.1741831195496478e-05, - "loss": 3.8738, - "mean_token_accuracy": 0.32086693346500395, - "step": 10350 - }, - { - "epoch": 0.5003259845942095, - "grad_norm": 2.3171652950450308, - "learning_rate": 1.1725227880246581e-05, - "loss": 3.9305, - "mean_token_accuracy": 0.31784274280071256, - "step": 10360 - }, - { - "epoch": 0.5008089247337792, - "grad_norm": 2.3714053289075117, - "learning_rate": 1.1708619661709662e-05, - "loss": 3.9137, - "mean_token_accuracy": 0.31834677159786223, - "step": 10370 - }, - { - "epoch": 0.501291864873349, - "grad_norm": 2.466585158068514, - "learning_rate": 1.1692006587088098e-05, - "loss": 3.941, - "mean_token_accuracy": 0.3130040317773819, - "step": 10380 - }, - { - "epoch": 0.5017748050129186, - "grad_norm": 2.2030803067840794, - "learning_rate": 1.1675388703598074e-05, - "loss": 3.8672, - "mean_token_accuracy": 0.32167338877916335, - "step": 10390 - }, - { - "epoch": 0.5022577451524883, - "grad_norm": 2.2053148137272016, - "learning_rate": 1.1658766058469437e-05, - "loss": 3.9215, - "mean_token_accuracy": 0.32217742055654525, - "step": 10400 - }, - { - "epoch": 0.5022577451524883, - "eval_runtime": 7.7878, - "eval_samples_per_second": 379.309, - "eval_steps_per_second": 23.755, - "step": 10400 - }, - { - "epoch": 0.5027406852920581, - "grad_norm": 2.4747415131909123, - "learning_rate": 1.1642138698945573e-05, - "loss": 3.932, - "mean_token_accuracy": 0.3148185506463051, - "step": 10410 - }, - { - "epoch": 0.5032236254316278, - "grad_norm": 2.3673901546378464, - "learning_rate": 1.162550667228326e-05, - "loss": 3.8859, - "mean_token_accuracy": 0.31905242055654526, - "step": 10420 - }, - { - "epoch": 0.5037065655711974, - "grad_norm": 2.3899713683933803, - "learning_rate": 1.1608870025752544e-05, - "loss": 3.966, - "mean_token_accuracy": 0.3223790302872658, - "step": 10430 - }, - { - "epoch": 0.5041895057107671, - "grad_norm": 2.298399917558338, - "learning_rate": 1.1592228806636598e-05, - "loss": 3.9344, - "mean_token_accuracy": 0.3208669349551201, - "step": 10440 - }, - { - "epoch": 0.5046724458503369, - "grad_norm": 2.2654227165459977, - "learning_rate": 1.1575583062231599e-05, - "loss": 3.9668, - "mean_token_accuracy": 0.3087701633572578, - "step": 10450 - }, - { - "epoch": 0.5051553859899065, - "grad_norm": 2.3169942952651272, - "learning_rate": 1.1558932839846575e-05, - "loss": 3.9359, - "mean_token_accuracy": 0.31915322691202164, - "step": 10460 - }, - { - "epoch": 0.5056383261294762, - "grad_norm": 2.3090126349842515, - "learning_rate": 1.154227818680329e-05, - "loss": 3.9637, - "mean_token_accuracy": 0.3152217760682106, - "step": 10470 - }, - { - "epoch": 0.506121266269046, - "grad_norm": 2.306503415929606, - "learning_rate": 1.1525619150436095e-05, - "loss": 3.9676, - "mean_token_accuracy": 0.311491933465004, - "step": 10480 - }, - { - "epoch": 0.5066042064086157, - "grad_norm": 2.337187519533768, - "learning_rate": 1.1508955778091795e-05, - "loss": 3.9492, - "mean_token_accuracy": 0.3140120983123779, - "step": 10490 - }, - { - "epoch": 0.5070871465481853, - "grad_norm": 2.3344131919011497, - "learning_rate": 1.1492288117129531e-05, - "loss": 3.9926, - "mean_token_accuracy": 0.3085685446858406, - "step": 10500 - }, - { - "epoch": 0.5070871465481853, - "eval_runtime": 7.8283, - "eval_samples_per_second": 377.347, - "eval_steps_per_second": 23.632, - "step": 10500 - }, - { - "epoch": 0.507570086687755, - "grad_norm": 2.3624904907751696, - "learning_rate": 1.1475616214920622e-05, - "loss": 3.9664, - "mean_token_accuracy": 0.3083669364452362, - "step": 10510 - }, - { - "epoch": 0.5080530268273248, - "grad_norm": 2.2940469285090974, - "learning_rate": 1.145894011884844e-05, - "loss": 3.9941, - "mean_token_accuracy": 0.3200604856014252, - "step": 10520 - }, - { - "epoch": 0.5085359669668944, - "grad_norm": 2.2291193041702595, - "learning_rate": 1.1442259876308288e-05, - "loss": 3.943, - "mean_token_accuracy": 0.31864919513463974, - "step": 10530 - }, - { - "epoch": 0.5090189071064641, - "grad_norm": 2.3136245512748954, - "learning_rate": 1.1425575534707244e-05, - "loss": 3.9, - "mean_token_accuracy": 0.3193548396229744, - "step": 10540 - }, - { - "epoch": 0.5095018472460339, - "grad_norm": 2.3174983672935205, - "learning_rate": 1.1408887141464033e-05, - "loss": 3.9684, - "mean_token_accuracy": 0.30856855064630506, - "step": 10550 - }, - { - "epoch": 0.5099847873856036, - "grad_norm": 2.511282812333674, - "learning_rate": 1.1392194744008914e-05, - "loss": 3.9777, - "mean_token_accuracy": 0.31895161271095274, - "step": 10560 - }, - { - "epoch": 0.5104677275251732, - "grad_norm": 2.2478055922913343, - "learning_rate": 1.1375498389783498e-05, - "loss": 3.934, - "mean_token_accuracy": 0.3179435461759567, - "step": 10570 - }, - { - "epoch": 0.510950667664743, - "grad_norm": 2.3186034837060454, - "learning_rate": 1.1358798126240662e-05, - "loss": 3.9203, - "mean_token_accuracy": 0.32127016186714175, - "step": 10580 - }, - { - "epoch": 0.5114336078043127, - "grad_norm": 2.238581092322491, - "learning_rate": 1.1342094000844388e-05, - "loss": 3.9383, - "mean_token_accuracy": 0.31713709980249405, - "step": 10590 - }, - { - "epoch": 0.5119165479438823, - "grad_norm": 2.3716603791992927, - "learning_rate": 1.1325386061069639e-05, - "loss": 4.0141, - "mean_token_accuracy": 0.3174395188689232, - "step": 10600 - }, - { - "epoch": 0.5119165479438823, - "eval_runtime": 7.8217, - "eval_samples_per_second": 377.666, - "eval_steps_per_second": 23.652, - "step": 10600 - }, - { - "epoch": 0.512399488083452, - "grad_norm": 2.3212554081107797, - "learning_rate": 1.1308674354402207e-05, - "loss": 4.0004, - "mean_token_accuracy": 0.31411290168762207, - "step": 10610 - }, - { - "epoch": 0.5128824282230218, - "grad_norm": 2.303960307323921, - "learning_rate": 1.12919589283386e-05, - "loss": 3.9234, - "mean_token_accuracy": 0.3179435521364212, - "step": 10620 - }, - { - "epoch": 0.5133653683625915, - "grad_norm": 2.2702987121612144, - "learning_rate": 1.1275239830385894e-05, - "loss": 3.9254, - "mean_token_accuracy": 0.32389113008975984, - "step": 10630 - }, - { - "epoch": 0.5138483085021611, - "grad_norm": 2.395402624771453, - "learning_rate": 1.1258517108061598e-05, - "loss": 3.9223, - "mean_token_accuracy": 0.3180443525314331, - "step": 10640 - }, - { - "epoch": 0.5143312486417309, - "grad_norm": 2.4870151406189733, - "learning_rate": 1.124179080889353e-05, - "loss": 3.8977, - "mean_token_accuracy": 0.3171370968222618, - "step": 10650 - }, - { - "epoch": 0.5148141887813006, - "grad_norm": 2.1750747428090214, - "learning_rate": 1.1225060980419661e-05, - "loss": 3.9453, - "mean_token_accuracy": 0.31431451588869097, - "step": 10660 - }, - { - "epoch": 0.5152971289208702, - "grad_norm": 2.287363578196008, - "learning_rate": 1.120832767018801e-05, - "loss": 3.9469, - "mean_token_accuracy": 0.3174395188689232, - "step": 10670 - }, - { - "epoch": 0.5157800690604399, - "grad_norm": 2.4797993128254543, - "learning_rate": 1.1191590925756473e-05, - "loss": 3.984, - "mean_token_accuracy": 0.31562500447034836, - "step": 10680 - }, - { - "epoch": 0.5162630092000097, - "grad_norm": 2.362031912899414, - "learning_rate": 1.117485079469272e-05, - "loss": 3.941, - "mean_token_accuracy": 0.31391128748655317, - "step": 10690 - }, - { - "epoch": 0.5167459493395794, - "grad_norm": 2.2458500672311548, - "learning_rate": 1.1158107324574037e-05, - "loss": 3.948, - "mean_token_accuracy": 0.31280241906642914, - "step": 10700 - }, - { - "epoch": 0.5167459493395794, - "eval_runtime": 7.8, - "eval_samples_per_second": 378.718, - "eval_steps_per_second": 23.718, - "step": 10700 - }, - { - "epoch": 0.517228889479149, - "grad_norm": 2.27954095190706, - "learning_rate": 1.1141360562987206e-05, - "loss": 3.9387, - "mean_token_accuracy": 0.31985886991024015, - "step": 10710 - }, - { - "epoch": 0.5177118296187188, - "grad_norm": 2.3040922937651604, - "learning_rate": 1.112461055752836e-05, - "loss": 3.9059, - "mean_token_accuracy": 0.32076613008975985, - "step": 10720 - }, - { - "epoch": 0.5181947697582885, - "grad_norm": 2.204297655001981, - "learning_rate": 1.110785735580286e-05, - "loss": 3.9332, - "mean_token_accuracy": 0.3213709697127342, - "step": 10730 - }, - { - "epoch": 0.5186777098978581, - "grad_norm": 2.2882252399119505, - "learning_rate": 1.1091101005425135e-05, - "loss": 3.9496, - "mean_token_accuracy": 0.322379033267498, - "step": 10740 - }, - { - "epoch": 0.5191606500374278, - "grad_norm": 2.341400370606141, - "learning_rate": 1.107434155401858e-05, - "loss": 3.9656, - "mean_token_accuracy": 0.3123991906642914, - "step": 10750 - }, - { - "epoch": 0.5196435901769976, - "grad_norm": 2.372411346313528, - "learning_rate": 1.105757904921539e-05, - "loss": 3.9699, - "mean_token_accuracy": 0.3140120983123779, - "step": 10760 - }, - { - "epoch": 0.5201265303165673, - "grad_norm": 2.4067753613053418, - "learning_rate": 1.1040813538656445e-05, - "loss": 3.9688, - "mean_token_accuracy": 0.31411290168762207, - "step": 10770 - }, - { - "epoch": 0.5206094704561369, - "grad_norm": 2.3964796483430413, - "learning_rate": 1.1024045069991172e-05, - "loss": 3.9305, - "mean_token_accuracy": 0.31915322244167327, - "step": 10780 - }, - { - "epoch": 0.5210924105957067, - "grad_norm": 2.2471462878199917, - "learning_rate": 1.1007273690877392e-05, - "loss": 3.9629, - "mean_token_accuracy": 0.31411290168762207, - "step": 10790 - }, - { - "epoch": 0.5215753507352764, - "grad_norm": 2.3748495250402057, - "learning_rate": 1.099049944898121e-05, - "loss": 3.9555, - "mean_token_accuracy": 0.316129033267498, - "step": 10800 - }, - { - "epoch": 0.5215753507352764, - "eval_runtime": 7.8065, - "eval_samples_per_second": 378.402, - "eval_steps_per_second": 23.698, - "step": 10800 - }, - { - "epoch": 0.522058290874846, - "grad_norm": 2.3859897231005394, - "learning_rate": 1.097372239197686e-05, - "loss": 3.9426, - "mean_token_accuracy": 0.31491935551166533, - "step": 10810 - }, - { - "epoch": 0.5225412310144157, - "grad_norm": 2.390124828104098, - "learning_rate": 1.0956942567546583e-05, - "loss": 3.9844, - "mean_token_accuracy": 0.3099798381328583, - "step": 10820 - }, - { - "epoch": 0.5230241711539855, - "grad_norm": 2.442322466247052, - "learning_rate": 1.0940160023380482e-05, - "loss": 3.9586, - "mean_token_accuracy": 0.31955645233392715, - "step": 10830 - }, - { - "epoch": 0.5235071112935552, - "grad_norm": 2.36435218742294, - "learning_rate": 1.0923374807176386e-05, - "loss": 3.9289, - "mean_token_accuracy": 0.32429435551166536, - "step": 10840 - }, - { - "epoch": 0.5239900514331248, - "grad_norm": 2.4045762621804583, - "learning_rate": 1.0906586966639724e-05, - "loss": 3.9348, - "mean_token_accuracy": 0.3165322557091713, - "step": 10850 - }, - { - "epoch": 0.5244729915726946, - "grad_norm": 2.417811215817686, - "learning_rate": 1.0889796549483383e-05, - "loss": 3.9047, - "mean_token_accuracy": 0.3194556474685669, - "step": 10860 - }, - { - "epoch": 0.5249559317122643, - "grad_norm": 2.2478094349703475, - "learning_rate": 1.087300360342757e-05, - "loss": 3.9504, - "mean_token_accuracy": 0.3219758078455925, - "step": 10870 - }, - { - "epoch": 0.5254388718518339, - "grad_norm": 2.2546786410288324, - "learning_rate": 1.0856208176199683e-05, - "loss": 3.8969, - "mean_token_accuracy": 0.3184575974941254, - "step": 10880 - }, - { - "epoch": 0.5259218119914036, - "grad_norm": 2.5084391879776216, - "learning_rate": 1.0839410315534166e-05, - "loss": 3.9516, - "mean_token_accuracy": 0.3171370983123779, - "step": 10890 - }, - { - "epoch": 0.5264047521309734, - "grad_norm": 2.374367668553866, - "learning_rate": 1.0822610069172388e-05, - "loss": 3.9586, - "mean_token_accuracy": 0.31764113157987595, - "step": 10900 - }, - { - "epoch": 0.5264047521309734, - "eval_runtime": 7.8066, - "eval_samples_per_second": 378.396, - "eval_steps_per_second": 23.698, - "step": 10900 - }, - { - "epoch": 0.5268876922705431, - "grad_norm": 2.344279403985828, - "learning_rate": 1.0805807484862491e-05, - "loss": 3.877, - "mean_token_accuracy": 0.33064516335725785, - "step": 10910 - }, - { - "epoch": 0.5273706324101127, - "grad_norm": 2.3718471707951894, - "learning_rate": 1.0789002610359263e-05, - "loss": 3.9363, - "mean_token_accuracy": 0.3161290317773819, - "step": 10920 - }, - { - "epoch": 0.5278535725496825, - "grad_norm": 2.327565851459907, - "learning_rate": 1.0772195493424005e-05, - "loss": 3.9121, - "mean_token_accuracy": 0.32409274131059645, - "step": 10930 - }, - { - "epoch": 0.5283365126892522, - "grad_norm": 2.416244321524799, - "learning_rate": 1.0755386181824386e-05, - "loss": 3.8828, - "mean_token_accuracy": 0.32368951588869094, - "step": 10940 - }, - { - "epoch": 0.5288194528288218, - "grad_norm": 2.3202515296480866, - "learning_rate": 1.0738574723334317e-05, - "loss": 3.9805, - "mean_token_accuracy": 0.30423387289047243, - "step": 10950 - }, - { - "epoch": 0.5293023929683915, - "grad_norm": 2.3575938023043124, - "learning_rate": 1.0721761165733807e-05, - "loss": 3.8828, - "mean_token_accuracy": 0.3261088699102402, - "step": 10960 - }, - { - "epoch": 0.5297853331079613, - "grad_norm": 2.389861043165183, - "learning_rate": 1.0704945556808832e-05, - "loss": 3.9766, - "mean_token_accuracy": 0.32127015888690946, - "step": 10970 - }, - { - "epoch": 0.530268273247531, - "grad_norm": 2.5340142582520104, - "learning_rate": 1.06881279443512e-05, - "loss": 3.907, - "mean_token_accuracy": 0.3184475779533386, - "step": 10980 - }, - { - "epoch": 0.5307512133871006, - "grad_norm": 2.355080602588245, - "learning_rate": 1.0671308376158408e-05, - "loss": 4.0246, - "mean_token_accuracy": 0.30887096375226974, - "step": 10990 - }, - { - "epoch": 0.5312341535266704, - "grad_norm": 2.376495740197977, - "learning_rate": 1.0654486900033518e-05, - "loss": 3.968, - "mean_token_accuracy": 0.3168346777558327, - "step": 11000 - }, - { - "epoch": 0.5312341535266704, - "eval_runtime": 7.7917, - "eval_samples_per_second": 379.12, - "eval_steps_per_second": 23.743, - "step": 11000 - }, - { - "epoch": 0.5317170936662401, - "grad_norm": 2.3428629742516445, - "learning_rate": 1.0637663563785013e-05, - "loss": 3.9023, - "mean_token_accuracy": 0.32268145233392714, - "step": 11010 - }, - { - "epoch": 0.5322000338058098, - "grad_norm": 2.2645437045799346, - "learning_rate": 1.062083841522666e-05, - "loss": 3.884, - "mean_token_accuracy": 0.3223790317773819, - "step": 11020 - }, - { - "epoch": 0.5326829739453794, - "grad_norm": 2.319140422066057, - "learning_rate": 1.0604011502177376e-05, - "loss": 3.8789, - "mean_token_accuracy": 0.3240927457809448, - "step": 11030 - }, - { - "epoch": 0.5331659140849492, - "grad_norm": 2.3837471862106607, - "learning_rate": 1.0587182872461102e-05, - "loss": 3.9473, - "mean_token_accuracy": 0.3145161300897598, - "step": 11040 - }, - { - "epoch": 0.5336488542245189, - "grad_norm": 2.2134495943303603, - "learning_rate": 1.0570352573906641e-05, - "loss": 3.9727, - "mean_token_accuracy": 0.31471773982048035, - "step": 11050 - }, - { - "epoch": 0.5341317943640885, - "grad_norm": 2.228727056930029, - "learning_rate": 1.055352065434756e-05, - "loss": 3.9137, - "mean_token_accuracy": 0.31905242055654526, - "step": 11060 - }, - { - "epoch": 0.5346147345036583, - "grad_norm": 2.3886647426886274, - "learning_rate": 1.0536687161622012e-05, - "loss": 3.8305, - "mean_token_accuracy": 0.3269153222441673, - "step": 11070 - }, - { - "epoch": 0.535097674643228, - "grad_norm": 2.480877813492789, - "learning_rate": 1.0519852143572638e-05, - "loss": 3.9457, - "mean_token_accuracy": 0.31280242204666137, - "step": 11080 - }, - { - "epoch": 0.5355806147827977, - "grad_norm": 2.266460194605464, - "learning_rate": 1.0503015648046402e-05, - "loss": 3.9848, - "mean_token_accuracy": 0.3245967745780945, - "step": 11090 - }, - { - "epoch": 0.5360635549223673, - "grad_norm": 2.374578298133916, - "learning_rate": 1.0486177722894482e-05, - "loss": 3.8977, - "mean_token_accuracy": 0.31965725421905516, - "step": 11100 - }, - { - "epoch": 0.5360635549223673, - "eval_runtime": 7.8063, - "eval_samples_per_second": 378.412, - "eval_steps_per_second": 23.699, - "step": 11100 - }, - { - "epoch": 0.5365464950619371, - "grad_norm": 2.320960704411755, - "learning_rate": 1.0469338415972099e-05, - "loss": 3.9395, - "mean_token_accuracy": 0.32076613008975985, - "step": 11110 - }, - { - "epoch": 0.5370294352015068, - "grad_norm": 2.3530541764856405, - "learning_rate": 1.0452497775138417e-05, - "loss": 3.8898, - "mean_token_accuracy": 0.3242943540215492, - "step": 11120 - }, - { - "epoch": 0.5375123753410764, - "grad_norm": 2.223179542471163, - "learning_rate": 1.0435655848256382e-05, - "loss": 3.9848, - "mean_token_accuracy": 0.321370966732502, - "step": 11130 - }, - { - "epoch": 0.5379953154806462, - "grad_norm": 2.3937974926676833, - "learning_rate": 1.0418812683192603e-05, - "loss": 4.0148, - "mean_token_accuracy": 0.3041330650448799, - "step": 11140 - }, - { - "epoch": 0.5384782556202159, - "grad_norm": 2.375824739824623, - "learning_rate": 1.0401968327817206e-05, - "loss": 3.9188, - "mean_token_accuracy": 0.313508066534996, - "step": 11150 - }, - { - "epoch": 0.5389611957597856, - "grad_norm": 2.1913264553811067, - "learning_rate": 1.0385122830003694e-05, - "loss": 3.9766, - "mean_token_accuracy": 0.31522177457809447, - "step": 11160 - }, - { - "epoch": 0.5394441358993552, - "grad_norm": 2.353812281790984, - "learning_rate": 1.036827623762882e-05, - "loss": 3.9738, - "mean_token_accuracy": 0.31229838728904724, - "step": 11170 - }, - { - "epoch": 0.539927076038925, - "grad_norm": 2.2045320603283427, - "learning_rate": 1.0351428598572453e-05, - "loss": 3.907, - "mean_token_accuracy": 0.32389113306999207, - "step": 11180 - }, - { - "epoch": 0.5404100161784947, - "grad_norm": 2.2653742662058276, - "learning_rate": 1.0334579960717432e-05, - "loss": 3.9461, - "mean_token_accuracy": 0.3148185446858406, - "step": 11190 - }, - { - "epoch": 0.5408929563180643, - "grad_norm": 2.2064719315934798, - "learning_rate": 1.031773037194943e-05, - "loss": 3.8781, - "mean_token_accuracy": 0.3208669349551201, - "step": 11200 - }, - { - "epoch": 0.5408929563180643, - "eval_runtime": 7.8046, - "eval_samples_per_second": 378.493, - "eval_steps_per_second": 23.704, - "step": 11200 - }, - { - "epoch": 0.5413758964576341, - "grad_norm": 2.4647689060259537, - "learning_rate": 1.0300879880156836e-05, - "loss": 3.9629, - "mean_token_accuracy": 0.3076612904667854, - "step": 11210 - }, - { - "epoch": 0.5418588365972038, - "grad_norm": 2.1410016454859697, - "learning_rate": 1.0284028533230593e-05, - "loss": 3.8711, - "mean_token_accuracy": 0.3243951603770256, - "step": 11220 - }, - { - "epoch": 0.5423417767367735, - "grad_norm": 2.387697158122027, - "learning_rate": 1.0267176379064076e-05, - "loss": 3.9371, - "mean_token_accuracy": 0.3259072586894035, - "step": 11230 - }, - { - "epoch": 0.5428247168763431, - "grad_norm": 2.269711835329223, - "learning_rate": 1.0250323465552964e-05, - "loss": 3.9402, - "mean_token_accuracy": 0.32076612710952757, - "step": 11240 - }, - { - "epoch": 0.5433076570159129, - "grad_norm": 2.344636114368095, - "learning_rate": 1.0233469840595083e-05, - "loss": 3.9113, - "mean_token_accuracy": 0.3166330635547638, - "step": 11250 - }, - { - "epoch": 0.5437905971554826, - "grad_norm": 2.2553448211770966, - "learning_rate": 1.0216615552090285e-05, - "loss": 3.9902, - "mean_token_accuracy": 0.3181451603770256, - "step": 11260 - }, - { - "epoch": 0.5442735372950522, - "grad_norm": 2.240013445261615, - "learning_rate": 1.0199760647940308e-05, - "loss": 3.9777, - "mean_token_accuracy": 0.30897177308797835, - "step": 11270 - }, - { - "epoch": 0.544756477434622, - "grad_norm": 2.3231306384272377, - "learning_rate": 1.0182905176048643e-05, - "loss": 3.9523, - "mean_token_accuracy": 0.31794354915618894, - "step": 11280 - }, - { - "epoch": 0.5452394175741917, - "grad_norm": 2.341563149896763, - "learning_rate": 1.0166049184320386e-05, - "loss": 3.9375, - "mean_token_accuracy": 0.30816532075405123, - "step": 11290 - }, - { - "epoch": 0.5457223577137614, - "grad_norm": 2.3258022595342105, - "learning_rate": 1.0149192720662122e-05, - "loss": 3.9734, - "mean_token_accuracy": 0.3102822571992874, - "step": 11300 - }, - { - "epoch": 0.5457223577137614, - "eval_runtime": 7.8043, - "eval_samples_per_second": 378.507, - "eval_steps_per_second": 23.705, - "step": 11300 - }, - { - "epoch": 0.546205297853331, - "grad_norm": 2.40866319180267, - "learning_rate": 1.0132335832981765e-05, - "loss": 3.9434, - "mean_token_accuracy": 0.3118951603770256, - "step": 11310 - }, - { - "epoch": 0.5466882379929008, - "grad_norm": 2.3244668549304497, - "learning_rate": 1.0115478569188448e-05, - "loss": 3.9926, - "mean_token_accuracy": 0.31693548411130906, - "step": 11320 - }, - { - "epoch": 0.5471711781324705, - "grad_norm": 2.3252721244410735, - "learning_rate": 1.0098620977192356e-05, - "loss": 3.9047, - "mean_token_accuracy": 0.3171370968222618, - "step": 11330 - }, - { - "epoch": 0.5476541182720401, - "grad_norm": 2.3901838842617176, - "learning_rate": 1.0081763104904625e-05, - "loss": 3.898, - "mean_token_accuracy": 0.3221774220466614, - "step": 11340 - }, - { - "epoch": 0.5481370584116099, - "grad_norm": 2.2742395633915318, - "learning_rate": 1.006490500023717e-05, - "loss": 3.9441, - "mean_token_accuracy": 0.3181451603770256, - "step": 11350 - }, - { - "epoch": 0.5486199985511796, - "grad_norm": 2.345525038506473, - "learning_rate": 1.0048046711102584e-05, - "loss": 3.9562, - "mean_token_accuracy": 0.31350806057453157, - "step": 11360 - }, - { - "epoch": 0.5491029386907493, - "grad_norm": 2.3075893914563137, - "learning_rate": 1.0031188285413969e-05, - "loss": 3.9691, - "mean_token_accuracy": 0.3171370968222618, - "step": 11370 - }, - { - "epoch": 0.549585878830319, - "grad_norm": 2.4733962445569393, - "learning_rate": 1.0014329771084822e-05, - "loss": 4.0105, - "mean_token_accuracy": 0.3080645173788071, - "step": 11380 - }, - { - "epoch": 0.5500688189698887, - "grad_norm": 2.3411508446767733, - "learning_rate": 9.997471216028893e-06, - "loss": 3.9012, - "mean_token_accuracy": 0.3288306474685669, - "step": 11390 - }, - { - "epoch": 0.5505517591094584, - "grad_norm": 2.676127249800124, - "learning_rate": 9.980612668160046e-06, - "loss": 3.8801, - "mean_token_accuracy": 0.32066532373428347, - "step": 11400 - }, - { - "epoch": 0.5505517591094584, - "eval_runtime": 7.7952, - "eval_samples_per_second": 378.953, - "eval_steps_per_second": 23.733, - "step": 11400 - }, - { - "epoch": 0.551034699249028, - "grad_norm": 2.250586146003107, - "learning_rate": 9.963754175392124e-06, - "loss": 3.8809, - "mean_token_accuracy": 0.3181451603770256, - "step": 11410 - }, - { - "epoch": 0.5515176393885978, - "grad_norm": 2.3292220907663443, - "learning_rate": 9.946895785638814e-06, - "loss": 3.8988, - "mean_token_accuracy": 0.3243951603770256, - "step": 11420 - }, - { - "epoch": 0.5520005795281675, - "grad_norm": 2.217312220725214, - "learning_rate": 9.930037546813513e-06, - "loss": 3.9523, - "mean_token_accuracy": 0.3222782269120216, - "step": 11430 - }, - { - "epoch": 0.5524835196677372, - "grad_norm": 2.3205444966822455, - "learning_rate": 9.913179506829182e-06, - "loss": 3.893, - "mean_token_accuracy": 0.31622983515262604, - "step": 11440 - }, - { - "epoch": 0.5529664598073069, - "grad_norm": 2.41816032361089, - "learning_rate": 9.896321713598222e-06, - "loss": 3.9715, - "mean_token_accuracy": 0.3160282224416733, - "step": 11450 - }, - { - "epoch": 0.5534493999468766, - "grad_norm": 2.396254560045998, - "learning_rate": 9.879464215032337e-06, - "loss": 3.9297, - "mean_token_accuracy": 0.3211693525314331, - "step": 11460 - }, - { - "epoch": 0.5539323400864463, - "grad_norm": 2.2817830440751825, - "learning_rate": 9.862607059042381e-06, - "loss": 3.9281, - "mean_token_accuracy": 0.3254032269120216, - "step": 11470 - }, - { - "epoch": 0.5544152802260159, - "grad_norm": 2.325569099250059, - "learning_rate": 9.84575029353825e-06, - "loss": 3.9148, - "mean_token_accuracy": 0.3155241906642914, - "step": 11480 - }, - { - "epoch": 0.5548982203655857, - "grad_norm": 2.49117197996408, - "learning_rate": 9.828893966428712e-06, - "loss": 3.9949, - "mean_token_accuracy": 0.31209677159786225, - "step": 11490 - }, - { - "epoch": 0.5553811605051554, - "grad_norm": 2.4464685178320598, - "learning_rate": 9.812038125621308e-06, - "loss": 3.9898, - "mean_token_accuracy": 0.3132056504487991, - "step": 11500 - }, - { - "epoch": 0.5553811605051554, - "eval_runtime": 7.8159, - "eval_samples_per_second": 377.946, - "eval_steps_per_second": 23.67, - "step": 11500 - }, - { - "epoch": 0.5558641006447251, - "grad_norm": 2.4246638993009784, - "learning_rate": 9.795182819022182e-06, - "loss": 3.9281, - "mean_token_accuracy": 0.3229838728904724, - "step": 11510 - }, - { - "epoch": 0.5563470407842948, - "grad_norm": 2.459596532539133, - "learning_rate": 9.77832809453597e-06, - "loss": 3.9813, - "mean_token_accuracy": 0.30524193346500395, - "step": 11520 - }, - { - "epoch": 0.5568299809238645, - "grad_norm": 2.1643566153937845, - "learning_rate": 9.761474000065649e-06, - "loss": 3.8715, - "mean_token_accuracy": 0.32106854766607285, - "step": 11530 - }, - { - "epoch": 0.5573129210634342, - "grad_norm": 2.412486105184616, - "learning_rate": 9.744620583512403e-06, - "loss": 3.9602, - "mean_token_accuracy": 0.3075604856014252, - "step": 11540 - }, - { - "epoch": 0.5577958612030038, - "grad_norm": 2.2652121668305085, - "learning_rate": 9.727767892775491e-06, - "loss": 3.923, - "mean_token_accuracy": 0.3234879031777382, - "step": 11550 - }, - { - "epoch": 0.5582788013425736, - "grad_norm": 2.4037233821602926, - "learning_rate": 9.710915975752116e-06, - "loss": 3.9438, - "mean_token_accuracy": 0.31743951588869096, - "step": 11560 - }, - { - "epoch": 0.5587617414821433, - "grad_norm": 2.3119405511393274, - "learning_rate": 9.694064880337267e-06, - "loss": 3.9055, - "mean_token_accuracy": 0.3221774235367775, - "step": 11570 - }, - { - "epoch": 0.559244681621713, - "grad_norm": 2.3914437892968476, - "learning_rate": 9.677214654423618e-06, - "loss": 3.9312, - "mean_token_accuracy": 0.3207661285996437, - "step": 11580 - }, - { - "epoch": 0.5597276217612827, - "grad_norm": 2.600317317789711, - "learning_rate": 9.660365345901351e-06, - "loss": 3.9789, - "mean_token_accuracy": 0.31885080635547636, - "step": 11590 - }, - { - "epoch": 0.5602105619008524, - "grad_norm": 2.3190814614590214, - "learning_rate": 9.643517002658055e-06, - "loss": 3.902, - "mean_token_accuracy": 0.3232862904667854, - "step": 11600 - }, - { - "epoch": 0.5602105619008524, - "eval_runtime": 7.8061, - "eval_samples_per_second": 378.422, - "eval_steps_per_second": 23.699, - "step": 11600 - }, - { - "epoch": 0.5606935020404221, - "grad_norm": 2.48315326365775, - "learning_rate": 9.62666967257857e-06, - "loss": 3.9438, - "mean_token_accuracy": 0.3162298411130905, - "step": 11610 - }, - { - "epoch": 0.5611764421799917, - "grad_norm": 2.3032605096738896, - "learning_rate": 9.609823403544858e-06, - "loss": 3.8871, - "mean_token_accuracy": 0.3181451618671417, - "step": 11620 - }, - { - "epoch": 0.5616593823195615, - "grad_norm": 2.313723676072136, - "learning_rate": 9.592978243435867e-06, - "loss": 3.9109, - "mean_token_accuracy": 0.31915322691202164, - "step": 11630 - }, - { - "epoch": 0.5621423224591312, - "grad_norm": 2.492365331637608, - "learning_rate": 9.576134240127387e-06, - "loss": 3.9258, - "mean_token_accuracy": 0.3150201618671417, - "step": 11640 - }, - { - "epoch": 0.5626252625987009, - "grad_norm": 2.3474685174521626, - "learning_rate": 9.55929144149193e-06, - "loss": 3.9117, - "mean_token_accuracy": 0.3222782239317894, - "step": 11650 - }, - { - "epoch": 0.5631082027382706, - "grad_norm": 2.1915421526225205, - "learning_rate": 9.54244989539857e-06, - "loss": 3.9324, - "mean_token_accuracy": 0.31995967775583267, - "step": 11660 - }, - { - "epoch": 0.5635911428778403, - "grad_norm": 2.4628609017806293, - "learning_rate": 9.525609649712838e-06, - "loss": 3.993, - "mean_token_accuracy": 0.32076613008975985, - "step": 11670 - }, - { - "epoch": 0.56407408301741, - "grad_norm": 2.497022895162499, - "learning_rate": 9.508770752296557e-06, - "loss": 3.9898, - "mean_token_accuracy": 0.315322582423687, - "step": 11680 - }, - { - "epoch": 0.5645570231569796, - "grad_norm": 2.3586812171901554, - "learning_rate": 9.491933251007723e-06, - "loss": 3.916, - "mean_token_accuracy": 0.3128024205565453, - "step": 11690 - }, - { - "epoch": 0.5650399632965494, - "grad_norm": 2.18913763633722, - "learning_rate": 9.475097193700362e-06, - "loss": 3.8754, - "mean_token_accuracy": 0.33034273982048035, - "step": 11700 - }, - { - "epoch": 0.5650399632965494, - "eval_runtime": 7.8115, - "eval_samples_per_second": 378.163, - "eval_steps_per_second": 23.683, - "step": 11700 - }, - { - "epoch": 0.5655229034361191, - "grad_norm": 2.3360906122348397, - "learning_rate": 9.4582626282244e-06, - "loss": 3.9098, - "mean_token_accuracy": 0.3188508078455925, - "step": 11710 - }, - { - "epoch": 0.5660058435756888, - "grad_norm": 2.480090500671984, - "learning_rate": 9.441429602425518e-06, - "loss": 3.9766, - "mean_token_accuracy": 0.31572580337524414, - "step": 11720 - }, - { - "epoch": 0.5664887837152585, - "grad_norm": 2.3157149068680547, - "learning_rate": 9.42459816414502e-06, - "loss": 4.0152, - "mean_token_accuracy": 0.31491935551166533, - "step": 11730 - }, - { - "epoch": 0.5669717238548282, - "grad_norm": 2.4681760983235583, - "learning_rate": 9.407768361219707e-06, - "loss": 3.8789, - "mean_token_accuracy": 0.32893145084381104, - "step": 11740 - }, - { - "epoch": 0.5674546639943979, - "grad_norm": 2.163160934296099, - "learning_rate": 9.390940241481722e-06, - "loss": 3.9195, - "mean_token_accuracy": 0.32207661122083664, - "step": 11750 - }, - { - "epoch": 0.5679376041339675, - "grad_norm": 2.3419831974967527, - "learning_rate": 9.374113852758432e-06, - "loss": 3.884, - "mean_token_accuracy": 0.3201612919569016, - "step": 11760 - }, - { - "epoch": 0.5684205442735373, - "grad_norm": 2.280715914252699, - "learning_rate": 9.357289242872277e-06, - "loss": 3.8645, - "mean_token_accuracy": 0.3285282254219055, - "step": 11770 - }, - { - "epoch": 0.568903484413107, - "grad_norm": 2.3055525311385434, - "learning_rate": 9.34046645964065e-06, - "loss": 3.9184, - "mean_token_accuracy": 0.31995967775583267, - "step": 11780 - }, - { - "epoch": 0.5693864245526767, - "grad_norm": 2.4661423209753583, - "learning_rate": 9.323645550875743e-06, - "loss": 3.9766, - "mean_token_accuracy": 0.3115927428007126, - "step": 11790 - }, - { - "epoch": 0.5698693646922464, - "grad_norm": 2.548462482871013, - "learning_rate": 9.30682656438443e-06, - "loss": 3.9113, - "mean_token_accuracy": 0.3269153252243996, - "step": 11800 - }, - { - "epoch": 0.5698693646922464, - "eval_runtime": 7.7874, - "eval_samples_per_second": 379.333, - "eval_steps_per_second": 23.756, - "step": 11800 - }, - { - "epoch": 0.5703523048318161, - "grad_norm": 2.392113069324535, - "learning_rate": 9.290009547968111e-06, - "loss": 3.9367, - "mean_token_accuracy": 0.3148185461759567, - "step": 11810 - }, - { - "epoch": 0.5708352449713858, - "grad_norm": 2.3701005453591617, - "learning_rate": 9.2731945494226e-06, - "loss": 3.9137, - "mean_token_accuracy": 0.3216733857989311, - "step": 11820 - }, - { - "epoch": 0.5713181851109554, - "grad_norm": 2.4702942758554247, - "learning_rate": 9.256381616537958e-06, - "loss": 3.9223, - "mean_token_accuracy": 0.32379032373428346, - "step": 11830 - }, - { - "epoch": 0.5718011252505252, - "grad_norm": 2.325168256903362, - "learning_rate": 9.2395707970984e-06, - "loss": 3.9918, - "mean_token_accuracy": 0.3133064478635788, - "step": 11840 - }, - { - "epoch": 0.5722840653900949, - "grad_norm": 2.485310226148532, - "learning_rate": 9.222762138882113e-06, - "loss": 3.9906, - "mean_token_accuracy": 0.3068548396229744, - "step": 11850 - }, - { - "epoch": 0.5727670055296646, - "grad_norm": 2.377160008463382, - "learning_rate": 9.205955689661144e-06, - "loss": 3.9418, - "mean_token_accuracy": 0.32197580486536026, - "step": 11860 - }, - { - "epoch": 0.5732499456692343, - "grad_norm": 2.3302635537296985, - "learning_rate": 9.189151497201273e-06, - "loss": 3.8973, - "mean_token_accuracy": 0.319556450843811, - "step": 11870 - }, - { - "epoch": 0.573732885808804, - "grad_norm": 2.422412499545331, - "learning_rate": 9.172349609261859e-06, - "loss": 3.9164, - "mean_token_accuracy": 0.3197580620646477, - "step": 11880 - }, - { - "epoch": 0.5742158259483737, - "grad_norm": 2.2368465530388772, - "learning_rate": 9.155550073595712e-06, - "loss": 3.9434, - "mean_token_accuracy": 0.31703629344701767, - "step": 11890 - }, - { - "epoch": 0.5746987660879433, - "grad_norm": 2.457396731204454, - "learning_rate": 9.138752937948953e-06, - "loss": 3.9305, - "mean_token_accuracy": 0.3182459697127342, - "step": 11900 - }, - { - "epoch": 0.5746987660879433, - "eval_runtime": 7.8237, - "eval_samples_per_second": 377.572, - "eval_steps_per_second": 23.646, - "step": 11900 - }, - { - "epoch": 0.5751817062275131, - "grad_norm": 2.4210992392868467, - "learning_rate": 9.121958250060889e-06, - "loss": 3.9445, - "mean_token_accuracy": 0.32066532224416733, - "step": 11910 - }, - { - "epoch": 0.5756646463670828, - "grad_norm": 2.3403407054192464, - "learning_rate": 9.105166057663864e-06, - "loss": 3.8945, - "mean_token_accuracy": 0.31844758093357084, - "step": 11920 - }, - { - "epoch": 0.5761475865066525, - "grad_norm": 2.2856354948411397, - "learning_rate": 9.088376408483137e-06, - "loss": 3.984, - "mean_token_accuracy": 0.3168346777558327, - "step": 11930 - }, - { - "epoch": 0.5766305266462222, - "grad_norm": 2.4176027554030255, - "learning_rate": 9.071589350236727e-06, - "loss": 3.8922, - "mean_token_accuracy": 0.3244959682226181, - "step": 11940 - }, - { - "epoch": 0.5771134667857919, - "grad_norm": 2.3349999288145424, - "learning_rate": 9.054804930635304e-06, - "loss": 3.8887, - "mean_token_accuracy": 0.3199596792459488, - "step": 11950 - }, - { - "epoch": 0.5775964069253616, - "grad_norm": 2.326350755884072, - "learning_rate": 9.038023197382023e-06, - "loss": 3.8801, - "mean_token_accuracy": 0.3221451297402382, - "step": 11960 - }, - { - "epoch": 0.5780793470649312, - "grad_norm": 2.406344004202903, - "learning_rate": 9.021244198172421e-06, - "loss": 3.9602, - "mean_token_accuracy": 0.30957660973072054, - "step": 11970 - }, - { - "epoch": 0.578562287204501, - "grad_norm": 2.3392853291595004, - "learning_rate": 9.00446798069425e-06, - "loss": 3.9234, - "mean_token_accuracy": 0.3192540377378464, - "step": 11980 - }, - { - "epoch": 0.5790452273440707, - "grad_norm": 2.3558877376188363, - "learning_rate": 8.98769459262736e-06, - "loss": 3.9219, - "mean_token_accuracy": 0.3214717715978622, - "step": 11990 - }, - { - "epoch": 0.5795281674836404, - "grad_norm": 2.461191680076343, - "learning_rate": 8.970924081643566e-06, - "loss": 3.9043, - "mean_token_accuracy": 0.3205645173788071, - "step": 12000 - }, - { - "epoch": 0.5795281674836404, - "eval_runtime": 7.8191, - "eval_samples_per_second": 377.794, - "eval_steps_per_second": 23.66, - "step": 12000 - }, - { - "epoch": 0.5800111076232101, - "grad_norm": 2.4685072931996785, - "learning_rate": 8.954156495406497e-06, - "loss": 3.9227, - "mean_token_accuracy": 0.31703629046678544, - "step": 12010 - }, - { - "epoch": 0.5804940477627798, - "grad_norm": 2.333901134265098, - "learning_rate": 8.937391881571479e-06, - "loss": 3.9207, - "mean_token_accuracy": 0.32207661122083664, - "step": 12020 - }, - { - "epoch": 0.5809769879023495, - "grad_norm": 2.346218130006008, - "learning_rate": 8.920630287785377e-06, - "loss": 3.9172, - "mean_token_accuracy": 0.32479838877916334, - "step": 12030 - }, - { - "epoch": 0.5814599280419191, - "grad_norm": 2.345848427930141, - "learning_rate": 8.903871761686487e-06, - "loss": 3.8883, - "mean_token_accuracy": 0.3265120953321457, - "step": 12040 - }, - { - "epoch": 0.5819428681814889, - "grad_norm": 2.4235354075177864, - "learning_rate": 8.887116350904378e-06, - "loss": 3.8875, - "mean_token_accuracy": 0.31955645233392715, - "step": 12050 - }, - { - "epoch": 0.5824258083210586, - "grad_norm": 2.348899335064894, - "learning_rate": 8.87036410305977e-06, - "loss": 3.9398, - "mean_token_accuracy": 0.3224798396229744, - "step": 12060 - }, - { - "epoch": 0.5829087484606283, - "grad_norm": 2.4310303799046076, - "learning_rate": 8.85361506576438e-06, - "loss": 3.9242, - "mean_token_accuracy": 0.3147177428007126, - "step": 12070 - }, - { - "epoch": 0.583391688600198, - "grad_norm": 2.3188114823528476, - "learning_rate": 8.836869286620827e-06, - "loss": 3.9148, - "mean_token_accuracy": 0.31431451588869097, - "step": 12080 - }, - { - "epoch": 0.5838746287397677, - "grad_norm": 2.3406640174114153, - "learning_rate": 8.820126813222436e-06, - "loss": 3.923, - "mean_token_accuracy": 0.3131048381328583, - "step": 12090 - }, - { - "epoch": 0.5843575688793374, - "grad_norm": 2.479455770239232, - "learning_rate": 8.803387693153169e-06, - "loss": 3.9801, - "mean_token_accuracy": 0.3151209712028503, - "step": 12100 - }, - { - "epoch": 0.5843575688793374, - "eval_runtime": 7.8199, - "eval_samples_per_second": 377.752, - "eval_steps_per_second": 23.657, - "step": 12100 - }, - { - "epoch": 0.584840509018907, - "grad_norm": 2.376407361536894, - "learning_rate": 8.78665197398743e-06, - "loss": 3.9539, - "mean_token_accuracy": 0.3192540317773819, - "step": 12110 - }, - { - "epoch": 0.5853234491584768, - "grad_norm": 2.2861915670853734, - "learning_rate": 8.769919703289985e-06, - "loss": 3.9445, - "mean_token_accuracy": 0.31834677457809446, - "step": 12120 - }, - { - "epoch": 0.5858063892980465, - "grad_norm": 2.3229430651807528, - "learning_rate": 8.753190928615773e-06, - "loss": 3.8785, - "mean_token_accuracy": 0.32368951886892317, - "step": 12130 - }, - { - "epoch": 0.5862893294376162, - "grad_norm": 2.3473446680131467, - "learning_rate": 8.736465697509807e-06, - "loss": 3.9035, - "mean_token_accuracy": 0.31229838728904724, - "step": 12140 - }, - { - "epoch": 0.5867722695771859, - "grad_norm": 2.5268537210273183, - "learning_rate": 8.719744057507036e-06, - "loss": 3.9605, - "mean_token_accuracy": 0.31263890117406845, - "step": 12150 - }, - { - "epoch": 0.5872552097167556, - "grad_norm": 2.347070448429671, - "learning_rate": 8.703026056132191e-06, - "loss": 3.9918, - "mean_token_accuracy": 0.312298384308815, - "step": 12160 - }, - { - "epoch": 0.5877381498563253, - "grad_norm": 2.455137886945934, - "learning_rate": 8.686311740899673e-06, - "loss": 3.9012, - "mean_token_accuracy": 0.31794354915618894, - "step": 12170 - }, - { - "epoch": 0.5882210899958951, - "grad_norm": 2.461354202215206, - "learning_rate": 8.669601159313396e-06, - "loss": 3.9309, - "mean_token_accuracy": 0.3152217730879784, - "step": 12180 - }, - { - "epoch": 0.5887040301354647, - "grad_norm": 2.4164386323857747, - "learning_rate": 8.652894358866672e-06, - "loss": 4.0621, - "mean_token_accuracy": 0.30796370953321456, - "step": 12190 - }, - { - "epoch": 0.5891869702750344, - "grad_norm": 2.3317146862006366, - "learning_rate": 8.636191387042055e-06, - "loss": 3.9422, - "mean_token_accuracy": 0.3167338714003563, - "step": 12200 - }, - { - "epoch": 0.5891869702750344, - "eval_runtime": 7.8041, - "eval_samples_per_second": 378.517, - "eval_steps_per_second": 23.705, - "step": 12200 - }, - { - "epoch": 0.5896699104146041, - "grad_norm": 2.382845058326198, - "learning_rate": 8.619492291311232e-06, - "loss": 3.9211, - "mean_token_accuracy": 0.3157753825187683, - "step": 12210 - }, - { - "epoch": 0.5901528505541738, - "grad_norm": 2.3838188339279496, - "learning_rate": 8.602797119134857e-06, - "loss": 3.909, - "mean_token_accuracy": 0.31844758093357084, - "step": 12220 - }, - { - "epoch": 0.5906357906937435, - "grad_norm": 2.2911175279726685, - "learning_rate": 8.586105917962456e-06, - "loss": 3.898, - "mean_token_accuracy": 0.31602822691202165, - "step": 12230 - }, - { - "epoch": 0.5911187308333132, - "grad_norm": 2.5265796074142326, - "learning_rate": 8.56941873523224e-06, - "loss": 3.8953, - "mean_token_accuracy": 0.3108870953321457, - "step": 12240 - }, - { - "epoch": 0.591601670972883, - "grad_norm": 2.4085651802137846, - "learning_rate": 8.552735618371027e-06, - "loss": 3.9387, - "mean_token_accuracy": 0.31895161122083665, - "step": 12250 - }, - { - "epoch": 0.5920846111124526, - "grad_norm": 2.4501739619412195, - "learning_rate": 8.536056614794058e-06, - "loss": 3.9391, - "mean_token_accuracy": 0.31693548560142515, - "step": 12260 - }, - { - "epoch": 0.5925675512520223, - "grad_norm": 2.4531039553366107, - "learning_rate": 8.51938177190489e-06, - "loss": 3.8828, - "mean_token_accuracy": 0.31502016335725785, - "step": 12270 - }, - { - "epoch": 0.593050491391592, - "grad_norm": 2.275677901505879, - "learning_rate": 8.502711137095268e-06, - "loss": 3.948, - "mean_token_accuracy": 0.3147177442908287, - "step": 12280 - }, - { - "epoch": 0.5935334315311617, - "grad_norm": 2.7275202797910327, - "learning_rate": 8.486044757744955e-06, - "loss": 3.9289, - "mean_token_accuracy": 0.3179435461759567, - "step": 12290 - }, - { - "epoch": 0.5940163716707314, - "grad_norm": 2.308175116377804, - "learning_rate": 8.469382681221638e-06, - "loss": 3.8691, - "mean_token_accuracy": 0.32328628897666933, - "step": 12300 - }, - { - "epoch": 0.5940163716707314, - "eval_runtime": 7.8025, - "eval_samples_per_second": 378.596, - "eval_steps_per_second": 23.71, - "step": 12300 - }, - { - "epoch": 0.5944993118103011, - "grad_norm": 2.4988315147767026, - "learning_rate": 8.45272495488076e-06, - "loss": 3.9902, - "mean_token_accuracy": 0.3171370968222618, - "step": 12310 - }, - { - "epoch": 0.5949822519498709, - "grad_norm": 2.336446395859129, - "learning_rate": 8.43607162606542e-06, - "loss": 4.0223, - "mean_token_accuracy": 0.31431451588869097, - "step": 12320 - }, - { - "epoch": 0.5954651920894405, - "grad_norm": 2.3516115918214773, - "learning_rate": 8.419422742106192e-06, - "loss": 3.9191, - "mean_token_accuracy": 0.3229838714003563, - "step": 12330 - }, - { - "epoch": 0.5959481322290102, - "grad_norm": 2.3736666646366906, - "learning_rate": 8.402778350321047e-06, - "loss": 3.9344, - "mean_token_accuracy": 0.32237903475761415, - "step": 12340 - }, - { - "epoch": 0.59643107236858, - "grad_norm": 2.296403879228017, - "learning_rate": 8.386138498015157e-06, - "loss": 3.9012, - "mean_token_accuracy": 0.32056451588869095, - "step": 12350 - }, - { - "epoch": 0.5969140125081496, - "grad_norm": 2.3583058482072237, - "learning_rate": 8.369503232480825e-06, - "loss": 3.9758, - "mean_token_accuracy": 0.3149193570017815, - "step": 12360 - }, - { - "epoch": 0.5973969526477193, - "grad_norm": 2.314194923594278, - "learning_rate": 8.352872600997289e-06, - "loss": 3.9047, - "mean_token_accuracy": 0.31975806355476377, - "step": 12370 - }, - { - "epoch": 0.597879892787289, - "grad_norm": 2.4114810572442185, - "learning_rate": 8.336246650830642e-06, - "loss": 3.8652, - "mean_token_accuracy": 0.32631048560142517, - "step": 12380 - }, - { - "epoch": 0.5983628329268588, - "grad_norm": 2.3430620758953835, - "learning_rate": 8.319625429233649e-06, - "loss": 3.9531, - "mean_token_accuracy": 0.31895161271095274, - "step": 12390 - }, - { - "epoch": 0.5988457730664284, - "grad_norm": 2.3480062433608264, - "learning_rate": 8.303008983445647e-06, - "loss": 3.9781, - "mean_token_accuracy": 0.31451612859964373, - "step": 12400 - }, - { - "epoch": 0.5988457730664284, - "eval_runtime": 7.7997, - "eval_samples_per_second": 378.734, - "eval_steps_per_second": 23.719, - "step": 12400 - }, - { - "epoch": 0.5993287132059981, - "grad_norm": 2.4846036268181964, - "learning_rate": 8.286397360692403e-06, - "loss": 3.9492, - "mean_token_accuracy": 0.3157258078455925, - "step": 12410 - }, - { - "epoch": 0.5998116533455679, - "grad_norm": 2.271801156314681, - "learning_rate": 8.269790608185971e-06, - "loss": 3.9371, - "mean_token_accuracy": 0.31451612859964373, - "step": 12420 - }, - { - "epoch": 0.6002945934851375, - "grad_norm": 2.5306507378807432, - "learning_rate": 8.253188773124565e-06, - "loss": 3.8914, - "mean_token_accuracy": 0.31875, - "step": 12430 - }, - { - "epoch": 0.6007775336247072, - "grad_norm": 2.3723808856815443, - "learning_rate": 8.23659190269242e-06, - "loss": 3.9098, - "mean_token_accuracy": 0.3179435506463051, - "step": 12440 - }, - { - "epoch": 0.6012604737642769, - "grad_norm": 2.440474777149874, - "learning_rate": 8.22000004405967e-06, - "loss": 3.9199, - "mean_token_accuracy": 0.3185483902692795, - "step": 12450 - }, - { - "epoch": 0.6017434139038467, - "grad_norm": 2.4337366972057013, - "learning_rate": 8.203413244382191e-06, - "loss": 4.0355, - "mean_token_accuracy": 0.3094758056104183, - "step": 12460 - }, - { - "epoch": 0.6022263540434163, - "grad_norm": 2.4108682935467383, - "learning_rate": 8.186831550801498e-06, - "loss": 3.9516, - "mean_token_accuracy": 0.3181451618671417, - "step": 12470 - }, - { - "epoch": 0.602709294182986, - "grad_norm": 2.490215018819108, - "learning_rate": 8.17025501044457e-06, - "loss": 3.8852, - "mean_token_accuracy": 0.3260080650448799, - "step": 12480 - }, - { - "epoch": 0.6031922343225558, - "grad_norm": 2.346653145172493, - "learning_rate": 8.153683670423772e-06, - "loss": 3.9492, - "mean_token_accuracy": 0.3160282239317894, - "step": 12490 - }, - { - "epoch": 0.6036751744621254, - "grad_norm": 2.44406010865299, - "learning_rate": 8.137117577836654e-06, - "loss": 3.8703, - "mean_token_accuracy": 0.319254033267498, - "step": 12500 - }, - { - "epoch": 0.6036751744621254, - "eval_runtime": 7.8222, - "eval_samples_per_second": 377.645, - "eval_steps_per_second": 23.651, - "step": 12500 - }, - { - "epoch": 0.6041581146016951, - "grad_norm": 2.293463921830506, - "learning_rate": 8.120556779765886e-06, - "loss": 3.9297, - "mean_token_accuracy": 0.3200604826211929, - "step": 12510 - }, - { - "epoch": 0.6046410547412648, - "grad_norm": 2.3423937214795587, - "learning_rate": 8.10400132327906e-06, - "loss": 3.9344, - "mean_token_accuracy": 0.31633064448833464, - "step": 12520 - }, - { - "epoch": 0.6051239948808346, - "grad_norm": 2.2193064020173416, - "learning_rate": 8.087451255428614e-06, - "loss": 3.941, - "mean_token_accuracy": 0.31481854766607287, - "step": 12530 - }, - { - "epoch": 0.6056069350204042, - "grad_norm": 2.274125797565075, - "learning_rate": 8.070906623251646e-06, - "loss": 3.8508, - "mean_token_accuracy": 0.3283266112208366, - "step": 12540 - }, - { - "epoch": 0.6060898751599739, - "grad_norm": 2.450745719002159, - "learning_rate": 8.054367473769822e-06, - "loss": 3.9395, - "mean_token_accuracy": 0.3186491921544075, - "step": 12550 - }, - { - "epoch": 0.6065728152995437, - "grad_norm": 2.3467240545768635, - "learning_rate": 8.03783385398922e-06, - "loss": 3.9297, - "mean_token_accuracy": 0.3268145158886909, - "step": 12560 - }, - { - "epoch": 0.6070557554391133, - "grad_norm": 2.3832867287564587, - "learning_rate": 8.021305810900198e-06, - "loss": 3.8937, - "mean_token_accuracy": 0.322076615691185, - "step": 12570 - }, - { - "epoch": 0.607538695578683, - "grad_norm": 2.333120906074266, - "learning_rate": 8.004783391477281e-06, - "loss": 3.9285, - "mean_token_accuracy": 0.315625, - "step": 12580 - }, - { - "epoch": 0.6080216357182527, - "grad_norm": 2.637448853043515, - "learning_rate": 7.988266642678983e-06, - "loss": 3.9594, - "mean_token_accuracy": 0.31532258093357085, - "step": 12590 - }, - { - "epoch": 0.6085045758578225, - "grad_norm": 2.378957418572957, - "learning_rate": 7.971755611447732e-06, - "loss": 3.8285, - "mean_token_accuracy": 0.3304435461759567, - "step": 12600 - }, - { - "epoch": 0.6085045758578225, - "eval_runtime": 7.829, - "eval_samples_per_second": 377.316, - "eval_steps_per_second": 23.63, - "step": 12600 - }, - { - "epoch": 0.6089875159973921, - "grad_norm": 2.327415186219916, - "learning_rate": 7.955250344709677e-06, - "loss": 3.9434, - "mean_token_accuracy": 0.3135080635547638, - "step": 12610 - }, - { - "epoch": 0.6094704561369618, - "grad_norm": 2.3591974534040183, - "learning_rate": 7.938750889374614e-06, - "loss": 3.9398, - "mean_token_accuracy": 0.3084677442908287, - "step": 12620 - }, - { - "epoch": 0.6099533962765316, - "grad_norm": 2.385798003771275, - "learning_rate": 7.92225729233579e-06, - "loss": 3.9516, - "mean_token_accuracy": 0.3189516142010689, - "step": 12630 - }, - { - "epoch": 0.6104363364161012, - "grad_norm": 2.4359180069415722, - "learning_rate": 7.90576960046983e-06, - "loss": 3.8957, - "mean_token_accuracy": 0.32177419513463973, - "step": 12640 - }, - { - "epoch": 0.6109192765556709, - "grad_norm": 2.3362767954011403, - "learning_rate": 7.889287860636556e-06, - "loss": 3.9156, - "mean_token_accuracy": 0.3185483872890472, - "step": 12650 - }, - { - "epoch": 0.6114022166952406, - "grad_norm": 2.5328674143723178, - "learning_rate": 7.872812119678893e-06, - "loss": 4.0055, - "mean_token_accuracy": 0.3107862934470177, - "step": 12660 - }, - { - "epoch": 0.6118851568348104, - "grad_norm": 2.5996935950577473, - "learning_rate": 7.856342424422693e-06, - "loss": 3.9906, - "mean_token_accuracy": 0.3160282239317894, - "step": 12670 - }, - { - "epoch": 0.61236809697438, - "grad_norm": 2.3733602776773997, - "learning_rate": 7.839878821676642e-06, - "loss": 3.9418, - "mean_token_accuracy": 0.31743951588869096, - "step": 12680 - }, - { - "epoch": 0.6128510371139497, - "grad_norm": 2.433579799147079, - "learning_rate": 7.823421358232113e-06, - "loss": 3.9031, - "mean_token_accuracy": 0.32338709235191343, - "step": 12690 - }, - { - "epoch": 0.6133339772535195, - "grad_norm": 2.435948421081829, - "learning_rate": 7.806970080863013e-06, - "loss": 3.8687, - "mean_token_accuracy": 0.32893145084381104, - "step": 12700 - }, - { - "epoch": 0.6133339772535195, - "eval_runtime": 7.8186, - "eval_samples_per_second": 377.818, - "eval_steps_per_second": 23.662, - "step": 12700 - }, - { - "epoch": 0.6138169173930891, - "grad_norm": 2.5919961474739925, - "learning_rate": 7.790525036325688e-06, - "loss": 4.0113, - "mean_token_accuracy": 0.31008064150810244, - "step": 12710 - }, - { - "epoch": 0.6142998575326588, - "grad_norm": 2.324324256109048, - "learning_rate": 7.774086271358752e-06, - "loss": 3.9492, - "mean_token_accuracy": 0.31118951439857484, - "step": 12720 - }, - { - "epoch": 0.6147827976722285, - "grad_norm": 2.456467135348635, - "learning_rate": 7.757653832682988e-06, - "loss": 4.0051, - "mean_token_accuracy": 0.31491935402154925, - "step": 12730 - }, - { - "epoch": 0.6152657378117983, - "grad_norm": 2.4413301215839813, - "learning_rate": 7.741227767001178e-06, - "loss": 3.8988, - "mean_token_accuracy": 0.3238911271095276, - "step": 12740 - }, - { - "epoch": 0.6157486779513679, - "grad_norm": 2.4955358813010773, - "learning_rate": 7.724808120998019e-06, - "loss": 3.9805, - "mean_token_accuracy": 0.3141129031777382, - "step": 12750 - }, - { - "epoch": 0.6162316180909376, - "grad_norm": 2.506360797782655, - "learning_rate": 7.708394941339933e-06, - "loss": 3.8996, - "mean_token_accuracy": 0.3160282298922539, - "step": 12760 - }, - { - "epoch": 0.6167145582305074, - "grad_norm": 2.2751606705591367, - "learning_rate": 7.691988274674991e-06, - "loss": 3.8918, - "mean_token_accuracy": 0.3219758063554764, - "step": 12770 - }, - { - "epoch": 0.617197498370077, - "grad_norm": 2.3632981596438487, - "learning_rate": 7.67558816763273e-06, - "loss": 3.9844, - "mean_token_accuracy": 0.3179435506463051, - "step": 12780 - }, - { - "epoch": 0.6176804385096467, - "grad_norm": 2.6603198143339464, - "learning_rate": 7.659194666824065e-06, - "loss": 3.9547, - "mean_token_accuracy": 0.3224798396229744, - "step": 12790 - }, - { - "epoch": 0.6181633786492164, - "grad_norm": 2.4044952755815827, - "learning_rate": 7.642807818841117e-06, - "loss": 3.9102, - "mean_token_accuracy": 0.3218749985098839, - "step": 12800 - }, - { - "epoch": 0.6181633786492164, - "eval_runtime": 7.8228, - "eval_samples_per_second": 377.612, - "eval_steps_per_second": 23.649, - "step": 12800 - }, - { - "epoch": 0.6186463187887862, - "grad_norm": 2.568865982176715, - "learning_rate": 7.626427670257106e-06, - "loss": 3.8984, - "mean_token_accuracy": 0.31965726166963576, - "step": 12810 - }, - { - "epoch": 0.6191292589283558, - "grad_norm": 2.242642250851699, - "learning_rate": 7.610054267626221e-06, - "loss": 3.9457, - "mean_token_accuracy": 0.32066532373428347, - "step": 12820 - }, - { - "epoch": 0.6196121990679255, - "grad_norm": 2.3693172932920716, - "learning_rate": 7.593687657483459e-06, - "loss": 3.9426, - "mean_token_accuracy": 0.32066532224416733, - "step": 12830 - }, - { - "epoch": 0.6200951392074953, - "grad_norm": 2.2068181216476486, - "learning_rate": 7.577327886344532e-06, - "loss": 3.9043, - "mean_token_accuracy": 0.3173387095332146, - "step": 12840 - }, - { - "epoch": 0.6205780793470649, - "grad_norm": 2.429565571275932, - "learning_rate": 7.560975000705697e-06, - "loss": 4.0078, - "mean_token_accuracy": 0.30997983664274215, - "step": 12850 - }, - { - "epoch": 0.6210610194866346, - "grad_norm": 2.418662179857565, - "learning_rate": 7.544629047043661e-06, - "loss": 3.9344, - "mean_token_accuracy": 0.3155241936445236, - "step": 12860 - }, - { - "epoch": 0.6215439596262043, - "grad_norm": 2.3463015231804767, - "learning_rate": 7.528290071815405e-06, - "loss": 3.9633, - "mean_token_accuracy": 0.31703629046678544, - "step": 12870 - }, - { - "epoch": 0.6220268997657741, - "grad_norm": 2.317958165392629, - "learning_rate": 7.511958121458105e-06, - "loss": 3.9246, - "mean_token_accuracy": 0.3205645143985748, - "step": 12880 - }, - { - "epoch": 0.6225098399053437, - "grad_norm": 2.4637128813599634, - "learning_rate": 7.495633242388942e-06, - "loss": 3.9512, - "mean_token_accuracy": 0.3074596792459488, - "step": 12890 - }, - { - "epoch": 0.6229927800449134, - "grad_norm": 2.3577747313384916, - "learning_rate": 7.479315481005027e-06, - "loss": 3.8328, - "mean_token_accuracy": 0.33014112859964373, - "step": 12900 - }, - { - "epoch": 0.6229927800449134, - "eval_runtime": 7.8122, - "eval_samples_per_second": 378.128, - "eval_steps_per_second": 23.681, - "step": 12900 - }, - { - "epoch": 0.6234757201844832, - "grad_norm": 2.3936035472824497, - "learning_rate": 7.463004883683219e-06, - "loss": 3.8766, - "mean_token_accuracy": 0.3201612874865532, - "step": 12910 - }, - { - "epoch": 0.6239586603240528, - "grad_norm": 2.397476131748257, - "learning_rate": 7.446701496780034e-06, - "loss": 3.943, - "mean_token_accuracy": 0.3152217760682106, - "step": 12920 - }, - { - "epoch": 0.6244416004636225, - "grad_norm": 2.3223812381697777, - "learning_rate": 7.430405366631488e-06, - "loss": 3.918, - "mean_token_accuracy": 0.3207661285996437, - "step": 12930 - }, - { - "epoch": 0.6249245406031922, - "grad_norm": 2.2315053966770484, - "learning_rate": 7.41411653955296e-06, - "loss": 3.9148, - "mean_token_accuracy": 0.322379033267498, - "step": 12940 - }, - { - "epoch": 0.625407480742762, - "grad_norm": 2.3335703162067882, - "learning_rate": 7.3978350618390985e-06, - "loss": 3.9332, - "mean_token_accuracy": 0.31995967775583267, - "step": 12950 - }, - { - "epoch": 0.6258904208823316, - "grad_norm": 2.3212974620696682, - "learning_rate": 7.381560979763639e-06, - "loss": 3.9594, - "mean_token_accuracy": 0.3174395143985748, - "step": 12960 - }, - { - "epoch": 0.6263733610219013, - "grad_norm": 2.3574048410155775, - "learning_rate": 7.365294339579321e-06, - "loss": 3.9246, - "mean_token_accuracy": 0.31774193346500396, - "step": 12970 - }, - { - "epoch": 0.6268563011614711, - "grad_norm": 2.294702562502172, - "learning_rate": 7.349035187517709e-06, - "loss": 3.9031, - "mean_token_accuracy": 0.32379032373428346, - "step": 12980 - }, - { - "epoch": 0.6273392413010407, - "grad_norm": 2.321542868350157, - "learning_rate": 7.332783569789111e-06, - "loss": 3.9711, - "mean_token_accuracy": 0.31381048262119293, - "step": 12990 - }, - { - "epoch": 0.6278221814406104, - "grad_norm": 2.3054377755348927, - "learning_rate": 7.316539532582395e-06, - "loss": 3.9422, - "mean_token_accuracy": 0.32217742055654525, - "step": 13000 - }, - { - "epoch": 0.6278221814406104, - "eval_runtime": 7.7865, - "eval_samples_per_second": 379.376, - "eval_steps_per_second": 23.759, - "step": 13000 - }, - { - "epoch": 0.6283051215801801, - "grad_norm": 2.46251436645923, - "learning_rate": 7.300303122064913e-06, - "loss": 3.8961, - "mean_token_accuracy": 0.3270161271095276, - "step": 13010 - }, - { - "epoch": 0.6287880617197499, - "grad_norm": 2.5061581647684847, - "learning_rate": 7.284074384382309e-06, - "loss": 3.8523, - "mean_token_accuracy": 0.3245967745780945, - "step": 13020 - }, - { - "epoch": 0.6292710018593195, - "grad_norm": 2.409695697482411, - "learning_rate": 7.267853365658453e-06, - "loss": 3.9711, - "mean_token_accuracy": 0.3150201618671417, - "step": 13030 - }, - { - "epoch": 0.6297539419988892, - "grad_norm": 2.4323365334076623, - "learning_rate": 7.251640111995248e-06, - "loss": 3.9613, - "mean_token_accuracy": 0.318548384308815, - "step": 13040 - }, - { - "epoch": 0.630236882138459, - "grad_norm": 2.388842497691783, - "learning_rate": 7.235434669472552e-06, - "loss": 3.9395, - "mean_token_accuracy": 0.31562500149011613, - "step": 13050 - }, - { - "epoch": 0.6307198222780286, - "grad_norm": 2.3337639860673467, - "learning_rate": 7.2192370841480035e-06, - "loss": 3.8188, - "mean_token_accuracy": 0.33185483813285827, - "step": 13060 - }, - { - "epoch": 0.6312027624175983, - "grad_norm": 2.4249027758975994, - "learning_rate": 7.2030474020569216e-06, - "loss": 3.9527, - "mean_token_accuracy": 0.3166330650448799, - "step": 13070 - }, - { - "epoch": 0.631685702557168, - "grad_norm": 2.455308685176902, - "learning_rate": 7.186865669212162e-06, - "loss": 3.9477, - "mean_token_accuracy": 0.3134072571992874, - "step": 13080 - }, - { - "epoch": 0.6321686426967378, - "grad_norm": 2.498420458961923, - "learning_rate": 7.170691931603977e-06, - "loss": 3.9316, - "mean_token_accuracy": 0.32157257944345474, - "step": 13090 - }, - { - "epoch": 0.6326515828363074, - "grad_norm": 2.481104271475756, - "learning_rate": 7.154526235199917e-06, - "loss": 3.9301, - "mean_token_accuracy": 0.319556450843811, - "step": 13100 - }, - { - "epoch": 0.6326515828363074, - "eval_runtime": 7.8352, - "eval_samples_per_second": 377.015, - "eval_steps_per_second": 23.611, - "step": 13100 - }, - { - "epoch": 0.6331345229758771, - "grad_norm": 2.385737967049577, - "learning_rate": 7.138368625944652e-06, - "loss": 3.8805, - "mean_token_accuracy": 0.3213709697127342, - "step": 13110 - }, - { - "epoch": 0.6336174631154469, - "grad_norm": 2.3991427423310174, - "learning_rate": 7.1222191497598945e-06, - "loss": 3.9078, - "mean_token_accuracy": 0.32036290466785433, - "step": 13120 - }, - { - "epoch": 0.6341004032550165, - "grad_norm": 2.449850530644843, - "learning_rate": 7.106077852544218e-06, - "loss": 4.0055, - "mean_token_accuracy": 0.31118951737880707, - "step": 13130 - }, - { - "epoch": 0.6345833433945862, - "grad_norm": 2.4961170883202426, - "learning_rate": 7.089944780172971e-06, - "loss": 3.9934, - "mean_token_accuracy": 0.30423387289047243, - "step": 13140 - }, - { - "epoch": 0.635066283534156, - "grad_norm": 2.360820203905407, - "learning_rate": 7.073819978498102e-06, - "loss": 3.9316, - "mean_token_accuracy": 0.32157257944345474, - "step": 13150 - }, - { - "epoch": 0.6355492236737257, - "grad_norm": 2.2824310704213926, - "learning_rate": 7.057703493348085e-06, - "loss": 3.9227, - "mean_token_accuracy": 0.31582661122083666, - "step": 13160 - }, - { - "epoch": 0.6360321638132953, - "grad_norm": 2.441871581416318, - "learning_rate": 7.041595370527725e-06, - "loss": 3.9133, - "mean_token_accuracy": 0.3216733857989311, - "step": 13170 - }, - { - "epoch": 0.636515103952865, - "grad_norm": 2.407221197623045, - "learning_rate": 7.025495655818084e-06, - "loss": 3.9371, - "mean_token_accuracy": 0.3126008078455925, - "step": 13180 - }, - { - "epoch": 0.6369980440924348, - "grad_norm": 2.4327088299934143, - "learning_rate": 7.009404394976315e-06, - "loss": 3.973, - "mean_token_accuracy": 0.3137096777558327, - "step": 13190 - }, - { - "epoch": 0.6374809842320044, - "grad_norm": 2.491724091378324, - "learning_rate": 6.993321633735553e-06, - "loss": 3.9207, - "mean_token_accuracy": 0.32006048411130905, - "step": 13200 - }, - { - "epoch": 0.6374809842320044, - "eval_runtime": 7.8027, - "eval_samples_per_second": 378.589, - "eval_steps_per_second": 23.71, - "step": 13200 - }, - { - "epoch": 0.6379639243715741, - "grad_norm": 2.7785307670964987, - "learning_rate": 6.977247417804766e-06, - "loss": 3.9078, - "mean_token_accuracy": 0.3141129046678543, - "step": 13210 - }, - { - "epoch": 0.6384468645111439, - "grad_norm": 2.391861656939564, - "learning_rate": 6.961181792868637e-06, - "loss": 3.898, - "mean_token_accuracy": 0.3157258093357086, - "step": 13220 - }, - { - "epoch": 0.6389298046507136, - "grad_norm": 2.4026082956779273, - "learning_rate": 6.945124804587444e-06, - "loss": 3.9121, - "mean_token_accuracy": 0.3226814493536949, - "step": 13230 - }, - { - "epoch": 0.6394127447902832, - "grad_norm": 2.33323761553131, - "learning_rate": 6.9290764985969e-06, - "loss": 3.991, - "mean_token_accuracy": 0.31169354617595674, - "step": 13240 - }, - { - "epoch": 0.6398956849298529, - "grad_norm": 2.38750310086146, - "learning_rate": 6.9130369205080646e-06, - "loss": 3.9328, - "mean_token_accuracy": 0.3231854856014252, - "step": 13250 - }, - { - "epoch": 0.6403786250694227, - "grad_norm": 2.4831110194490997, - "learning_rate": 6.897006115907168e-06, - "loss": 3.9871, - "mean_token_accuracy": 0.3108870968222618, - "step": 13260 - }, - { - "epoch": 0.6408615652089924, - "grad_norm": 2.45661553413711, - "learning_rate": 6.880984130355528e-06, - "loss": 3.909, - "mean_token_accuracy": 0.32570564597845075, - "step": 13270 - }, - { - "epoch": 0.641344505348562, - "grad_norm": 2.367788164306088, - "learning_rate": 6.864971009389373e-06, - "loss": 3.8633, - "mean_token_accuracy": 0.3243951588869095, - "step": 13280 - }, - { - "epoch": 0.6418274454881318, - "grad_norm": 2.4928497571389343, - "learning_rate": 6.848966798519763e-06, - "loss": 3.9535, - "mean_token_accuracy": 0.3149193525314331, - "step": 13290 - }, - { - "epoch": 0.6423103856277015, - "grad_norm": 2.411054866056794, - "learning_rate": 6.832971543232414e-06, - "loss": 3.8816, - "mean_token_accuracy": 0.32298386842012405, - "step": 13300 - }, - { - "epoch": 0.6423103856277015, - "eval_runtime": 7.8101, - "eval_samples_per_second": 378.23, - "eval_steps_per_second": 23.687, - "step": 13300 - }, - { - "epoch": 0.6427933257672711, - "grad_norm": 2.2097641685763514, - "learning_rate": 6.816985288987603e-06, - "loss": 3.9855, - "mean_token_accuracy": 0.31542338877916337, - "step": 13310 - }, - { - "epoch": 0.6432762659068408, - "grad_norm": 2.388671729476071, - "learning_rate": 6.801008081220015e-06, - "loss": 3.9508, - "mean_token_accuracy": 0.321169351041317, - "step": 13320 - }, - { - "epoch": 0.6437592060464106, - "grad_norm": 2.462324067346631, - "learning_rate": 6.785039965338632e-06, - "loss": 3.9867, - "mean_token_accuracy": 0.30544354766607285, - "step": 13330 - }, - { - "epoch": 0.6442421461859803, - "grad_norm": 2.4048972164718423, - "learning_rate": 6.769080986726593e-06, - "loss": 3.9375, - "mean_token_accuracy": 0.32086693346500395, - "step": 13340 - }, - { - "epoch": 0.6447250863255499, - "grad_norm": 2.457099332342788, - "learning_rate": 6.753131190741058e-06, - "loss": 3.8625, - "mean_token_accuracy": 0.3244959682226181, - "step": 13350 - }, - { - "epoch": 0.6452080264651197, - "grad_norm": 2.702352747281538, - "learning_rate": 6.7371906227131125e-06, - "loss": 3.948, - "mean_token_accuracy": 0.3256048396229744, - "step": 13360 - }, - { - "epoch": 0.6456909666046894, - "grad_norm": 2.450534668684664, - "learning_rate": 6.721259327947585e-06, - "loss": 3.891, - "mean_token_accuracy": 0.3180443555116653, - "step": 13370 - }, - { - "epoch": 0.646173906744259, - "grad_norm": 2.4241048462215202, - "learning_rate": 6.705337351722978e-06, - "loss": 3.8945, - "mean_token_accuracy": 0.31875, - "step": 13380 - }, - { - "epoch": 0.6466568468838287, - "grad_norm": 2.4140939409017346, - "learning_rate": 6.689424739291284e-06, - "loss": 3.9734, - "mean_token_accuracy": 0.3208669379353523, - "step": 13390 - }, - { - "epoch": 0.6471397870233985, - "grad_norm": 2.470336158681267, - "learning_rate": 6.673521535877907e-06, - "loss": 3.9711, - "mean_token_accuracy": 0.31300403028726576, - "step": 13400 - }, - { - "epoch": 0.6471397870233985, - "eval_runtime": 7.8005, - "eval_samples_per_second": 378.695, - "eval_steps_per_second": 23.717, - "step": 13400 - }, - { - "epoch": 0.6476227271629682, - "grad_norm": 2.5353353960367, - "learning_rate": 6.657627786681484e-06, - "loss": 3.9293, - "mean_token_accuracy": 0.32026209533214567, - "step": 13410 - }, - { - "epoch": 0.6481056673025378, - "grad_norm": 2.3230612136218216, - "learning_rate": 6.641743536873804e-06, - "loss": 3.9219, - "mean_token_accuracy": 0.3149193525314331, - "step": 13420 - }, - { - "epoch": 0.6485886074421076, - "grad_norm": 2.533045045902366, - "learning_rate": 6.625868831599645e-06, - "loss": 3.9008, - "mean_token_accuracy": 0.3262096747756004, - "step": 13430 - }, - { - "epoch": 0.6490715475816773, - "grad_norm": 2.422131897303017, - "learning_rate": 6.610003715976663e-06, - "loss": 3.9062, - "mean_token_accuracy": 0.32389113008975984, - "step": 13440 - }, - { - "epoch": 0.6495544877212469, - "grad_norm": 2.4198555502996633, - "learning_rate": 6.594148235095257e-06, - "loss": 3.9398, - "mean_token_accuracy": 0.3145161300897598, - "step": 13450 - }, - { - "epoch": 0.6500374278608166, - "grad_norm": 2.607695233539751, - "learning_rate": 6.578302434018446e-06, - "loss": 3.9508, - "mean_token_accuracy": 0.31844758093357084, - "step": 13460 - }, - { - "epoch": 0.6505203680003864, - "grad_norm": 2.390740146052622, - "learning_rate": 6.562466357781738e-06, - "loss": 3.8449, - "mean_token_accuracy": 0.3236895173788071, - "step": 13470 - }, - { - "epoch": 0.6510033081399561, - "grad_norm": 2.469974288889219, - "learning_rate": 6.546640051392992e-06, - "loss": 3.8391, - "mean_token_accuracy": 0.32963709682226183, - "step": 13480 - }, - { - "epoch": 0.6514862482795257, - "grad_norm": 2.4665650951223754, - "learning_rate": 6.530823559832318e-06, - "loss": 3.9434, - "mean_token_accuracy": 0.3211693525314331, - "step": 13490 - }, - { - "epoch": 0.6519691884190955, - "grad_norm": 2.3379100061768, - "learning_rate": 6.515016928051911e-06, - "loss": 3.8563, - "mean_token_accuracy": 0.3257056474685669, - "step": 13500 - }, - { - "epoch": 0.6519691884190955, - "eval_runtime": 7.8268, - "eval_samples_per_second": 377.419, - "eval_steps_per_second": 23.637, - "step": 13500 - }, - { - "epoch": 0.6524521285586652, - "grad_norm": 2.3116877312986888, - "learning_rate": 6.499220200975967e-06, - "loss": 3.8906, - "mean_token_accuracy": 0.3240927457809448, - "step": 13510 - }, - { - "epoch": 0.6529350686982348, - "grad_norm": 2.4643670214315514, - "learning_rate": 6.483433423500503e-06, - "loss": 3.9195, - "mean_token_accuracy": 0.31935483664274217, - "step": 13520 - }, - { - "epoch": 0.6534180088378045, - "grad_norm": 2.7259989461781156, - "learning_rate": 6.467656640493285e-06, - "loss": 3.9254, - "mean_token_accuracy": 0.31733871400356295, - "step": 13530 - }, - { - "epoch": 0.6539009489773743, - "grad_norm": 2.4693318113401057, - "learning_rate": 6.451889896793657e-06, - "loss": 3.9773, - "mean_token_accuracy": 0.3107862904667854, - "step": 13540 - }, - { - "epoch": 0.654383889116944, - "grad_norm": 2.486509368633819, - "learning_rate": 6.4361332372124395e-06, - "loss": 3.868, - "mean_token_accuracy": 0.32550403028726577, - "step": 13550 - }, - { - "epoch": 0.6548668292565136, - "grad_norm": 2.416245832401473, - "learning_rate": 6.420386706531784e-06, - "loss": 3.9191, - "mean_token_accuracy": 0.31915322989225386, - "step": 13560 - }, - { - "epoch": 0.6553497693960834, - "grad_norm": 2.4438580785603117, - "learning_rate": 6.404650349505064e-06, - "loss": 3.975, - "mean_token_accuracy": 0.31018145084381105, - "step": 13570 - }, - { - "epoch": 0.6558327095356531, - "grad_norm": 2.370352554275622, - "learning_rate": 6.388924210856728e-06, - "loss": 3.9355, - "mean_token_accuracy": 0.31542338877916337, - "step": 13580 - }, - { - "epoch": 0.6563156496752227, - "grad_norm": 2.554690529825553, - "learning_rate": 6.373208335282194e-06, - "loss": 3.9961, - "mean_token_accuracy": 0.3079637125134468, - "step": 13590 - }, - { - "epoch": 0.6567985898147924, - "grad_norm": 2.393189770980954, - "learning_rate": 6.357502767447701e-06, - "loss": 3.8934, - "mean_token_accuracy": 0.3205645129084587, - "step": 13600 - }, - { - "epoch": 0.6567985898147924, - "eval_runtime": 7.8081, - "eval_samples_per_second": 378.325, - "eval_steps_per_second": 23.693, - "step": 13600 - }, - { - "epoch": 0.6572815299543622, - "grad_norm": 2.4939519544957505, - "learning_rate": 6.3418075519902e-06, - "loss": 3.8641, - "mean_token_accuracy": 0.3302419349551201, - "step": 13610 - }, - { - "epoch": 0.6577644700939319, - "grad_norm": 2.498907949016617, - "learning_rate": 6.326122733517219e-06, - "loss": 3.898, - "mean_token_accuracy": 0.3212701603770256, - "step": 13620 - }, - { - "epoch": 0.6582474102335015, - "grad_norm": 2.4503704301524225, - "learning_rate": 6.310448356606722e-06, - "loss": 3.9438, - "mean_token_accuracy": 0.3220766127109528, - "step": 13630 - }, - { - "epoch": 0.6587303503730713, - "grad_norm": 2.4254164909360627, - "learning_rate": 6.294784465807024e-06, - "loss": 3.9262, - "mean_token_accuracy": 0.31532257944345476, - "step": 13640 - }, - { - "epoch": 0.659213290512641, - "grad_norm": 2.862695016604388, - "learning_rate": 6.27913110563661e-06, - "loss": 3.9422, - "mean_token_accuracy": 0.3191532254219055, - "step": 13650 - }, - { - "epoch": 0.6596962306522106, - "grad_norm": 2.449129559009671, - "learning_rate": 6.2634883205840566e-06, - "loss": 3.9691, - "mean_token_accuracy": 0.31602822467684744, - "step": 13660 - }, - { - "epoch": 0.6601791707917803, - "grad_norm": 2.3446670577028264, - "learning_rate": 6.24785615510787e-06, - "loss": 3.9137, - "mean_token_accuracy": 0.3180443584918976, - "step": 13670 - }, - { - "epoch": 0.6606621109313501, - "grad_norm": 2.557532204979107, - "learning_rate": 6.232234653636386e-06, - "loss": 3.9473, - "mean_token_accuracy": 0.3185483872890472, - "step": 13680 - }, - { - "epoch": 0.6611450510709198, - "grad_norm": 2.420993812282275, - "learning_rate": 6.216623860567621e-06, - "loss": 3.9418, - "mean_token_accuracy": 0.31754032522439957, - "step": 13690 - }, - { - "epoch": 0.6616279912104894, - "grad_norm": 2.3094562537185346, - "learning_rate": 6.201023820269168e-06, - "loss": 3.9016, - "mean_token_accuracy": 0.3211693540215492, - "step": 13700 - }, - { - "epoch": 0.6616279912104894, - "eval_runtime": 7.8045, - "eval_samples_per_second": 378.501, - "eval_steps_per_second": 23.704, - "step": 13700 - }, - { - "epoch": 0.6621109313500592, - "grad_norm": 2.381447202179818, - "learning_rate": 6.185434577078048e-06, - "loss": 3.8926, - "mean_token_accuracy": 0.321370966732502, - "step": 13710 - }, - { - "epoch": 0.6625938714896289, - "grad_norm": 2.4072660215105306, - "learning_rate": 6.169856175300608e-06, - "loss": 3.925, - "mean_token_accuracy": 0.31673386991024016, - "step": 13720 - }, - { - "epoch": 0.6630768116291985, - "grad_norm": 2.4079167228823573, - "learning_rate": 6.15428865921237e-06, - "loss": 3.952, - "mean_token_accuracy": 0.32086693644523623, - "step": 13730 - }, - { - "epoch": 0.6635597517687682, - "grad_norm": 2.5129848355695947, - "learning_rate": 6.138732073057929e-06, - "loss": 3.9418, - "mean_token_accuracy": 0.32268145233392714, - "step": 13740 - }, - { - "epoch": 0.664042691908338, - "grad_norm": 2.5972387787368905, - "learning_rate": 6.123186461050809e-06, - "loss": 3.9574, - "mean_token_accuracy": 0.3197580650448799, - "step": 13750 - }, - { - "epoch": 0.6645256320479077, - "grad_norm": 2.5199844587444713, - "learning_rate": 6.10765186737334e-06, - "loss": 3.8922, - "mean_token_accuracy": 0.3214717715978622, - "step": 13760 - }, - { - "epoch": 0.6650085721874773, - "grad_norm": 2.4518189167491977, - "learning_rate": 6.09212833617655e-06, - "loss": 3.9418, - "mean_token_accuracy": 0.3166330650448799, - "step": 13770 - }, - { - "epoch": 0.6654915123270471, - "grad_norm": 2.638132964794942, - "learning_rate": 6.076615911580015e-06, - "loss": 3.8254, - "mean_token_accuracy": 0.32510080486536025, - "step": 13780 - }, - { - "epoch": 0.6659744524666168, - "grad_norm": 2.424275930362351, - "learning_rate": 6.061114637671752e-06, - "loss": 3.8687, - "mean_token_accuracy": 0.3265120968222618, - "step": 13790 - }, - { - "epoch": 0.6664573926061864, - "grad_norm": 2.948220453970061, - "learning_rate": 6.045624558508079e-06, - "loss": 3.9344, - "mean_token_accuracy": 0.318245966732502, - "step": 13800 - }, - { - "epoch": 0.6664573926061864, - "eval_runtime": 7.7802, - "eval_samples_per_second": 379.679, - "eval_steps_per_second": 23.778, - "step": 13800 - }, - { - "epoch": 0.6669403327457561, - "grad_norm": 2.359312713919547, - "learning_rate": 6.030145718113505e-06, - "loss": 3.9305, - "mean_token_accuracy": 0.32469757795333865, - "step": 13810 - }, - { - "epoch": 0.6674232728853259, - "grad_norm": 2.5959829401054186, - "learning_rate": 6.014678160480589e-06, - "loss": 3.9172, - "mean_token_accuracy": 0.32197580486536026, - "step": 13820 - }, - { - "epoch": 0.6679062130248956, - "grad_norm": 2.3969151206364008, - "learning_rate": 5.999221929569834e-06, - "loss": 3.9574, - "mean_token_accuracy": 0.3176411271095276, - "step": 13830 - }, - { - "epoch": 0.6683891531644652, - "grad_norm": 2.4227529562048105, - "learning_rate": 5.983777069309539e-06, - "loss": 3.9023, - "mean_token_accuracy": 0.32086693644523623, - "step": 13840 - }, - { - "epoch": 0.668872093304035, - "grad_norm": 2.418079183322335, - "learning_rate": 5.968343623595696e-06, - "loss": 3.9148, - "mean_token_accuracy": 0.3179435461759567, - "step": 13850 - }, - { - "epoch": 0.6693550334436047, - "grad_norm": 2.418766494528195, - "learning_rate": 5.952921636291851e-06, - "loss": 3.9578, - "mean_token_accuracy": 0.3129032254219055, - "step": 13860 - }, - { - "epoch": 0.6698379735831743, - "grad_norm": 2.5143601669096887, - "learning_rate": 5.937511151228984e-06, - "loss": 3.9035, - "mean_token_accuracy": 0.3157258085906506, - "step": 13870 - }, - { - "epoch": 0.670320913722744, - "grad_norm": 2.5177516821141315, - "learning_rate": 5.922112212205389e-06, - "loss": 3.9613, - "mean_token_accuracy": 0.3132056429982185, - "step": 13880 - }, - { - "epoch": 0.6708038538623138, - "grad_norm": 2.428742140891193, - "learning_rate": 5.906724862986533e-06, - "loss": 3.9398, - "mean_token_accuracy": 0.31451613157987596, - "step": 13890 - }, - { - "epoch": 0.6712867940018835, - "grad_norm": 2.4586403759398663, - "learning_rate": 5.891349147304959e-06, - "loss": 3.9664, - "mean_token_accuracy": 0.3160282254219055, - "step": 13900 - }, - { - "epoch": 0.6712867940018835, - "eval_runtime": 7.819, - "eval_samples_per_second": 377.799, - "eval_steps_per_second": 23.66, - "step": 13900 - }, - { - "epoch": 0.6717697341414531, - "grad_norm": 2.3892298855439003, - "learning_rate": 5.8759851088601365e-06, - "loss": 3.8863, - "mean_token_accuracy": 0.3189516142010689, - "step": 13910 - }, - { - "epoch": 0.6722526742810229, - "grad_norm": 2.4162785767272976, - "learning_rate": 5.860632791318349e-06, - "loss": 3.932, - "mean_token_accuracy": 0.31491935849189756, - "step": 13920 - }, - { - "epoch": 0.6727356144205926, - "grad_norm": 2.4674787625719508, - "learning_rate": 5.845292238312568e-06, - "loss": 3.9031, - "mean_token_accuracy": 0.32116935551166537, - "step": 13930 - }, - { - "epoch": 0.6732185545601622, - "grad_norm": 2.6481593182014507, - "learning_rate": 5.829963493442332e-06, - "loss": 3.9152, - "mean_token_accuracy": 0.3179435461759567, - "step": 13940 - }, - { - "epoch": 0.673701494699732, - "grad_norm": 2.4627066242099094, - "learning_rate": 5.814646600273611e-06, - "loss": 3.8961, - "mean_token_accuracy": 0.3146169364452362, - "step": 13950 - }, - { - "epoch": 0.6741844348393017, - "grad_norm": 2.5241716916536503, - "learning_rate": 5.799341602338706e-06, - "loss": 3.9008, - "mean_token_accuracy": 0.32096774131059647, - "step": 13960 - }, - { - "epoch": 0.6746673749788714, - "grad_norm": 2.4278502281401866, - "learning_rate": 5.784048543136089e-06, - "loss": 3.959, - "mean_token_accuracy": 0.3180443555116653, - "step": 13970 - }, - { - "epoch": 0.675150315118441, - "grad_norm": 2.480624354040345, - "learning_rate": 5.768767466130323e-06, - "loss": 3.8937, - "mean_token_accuracy": 0.3196572571992874, - "step": 13980 - }, - { - "epoch": 0.6756332552580108, - "grad_norm": 2.4312966451771953, - "learning_rate": 5.753498414751901e-06, - "loss": 3.9695, - "mean_token_accuracy": 0.3206653192639351, - "step": 13990 - }, - { - "epoch": 0.6761161953975805, - "grad_norm": 2.4529403596977306, - "learning_rate": 5.738241432397148e-06, - "loss": 3.852, - "mean_token_accuracy": 0.3302419364452362, - "step": 14000 - }, - { - "epoch": 0.6761161953975805, - "eval_runtime": 7.8069, - "eval_samples_per_second": 378.384, - "eval_steps_per_second": 23.697, - "step": 14000 - }, - { - "epoch": 0.6765991355371501, - "grad_norm": 2.8264414353458815, - "learning_rate": 5.722996562428073e-06, - "loss": 3.9027, - "mean_token_accuracy": 0.3213709697127342, - "step": 14010 - }, - { - "epoch": 0.6770820756767199, - "grad_norm": 2.49225939231994, - "learning_rate": 5.707763848172284e-06, - "loss": 3.8805, - "mean_token_accuracy": 0.3251008078455925, - "step": 14020 - }, - { - "epoch": 0.6775650158162896, - "grad_norm": 2.414729369619342, - "learning_rate": 5.69254333292282e-06, - "loss": 3.9215, - "mean_token_accuracy": 0.3220766082406044, - "step": 14030 - }, - { - "epoch": 0.6780479559558593, - "grad_norm": 2.482081685297335, - "learning_rate": 5.677335059938052e-06, - "loss": 3.8676, - "mean_token_accuracy": 0.32550403028726577, - "step": 14040 - }, - { - "epoch": 0.6785308960954289, - "grad_norm": 2.611387177885236, - "learning_rate": 5.662139072441566e-06, - "loss": 3.9324, - "mean_token_accuracy": 0.31108871251344683, - "step": 14050 - }, - { - "epoch": 0.6790138362349987, - "grad_norm": 2.384795669222592, - "learning_rate": 5.646955413622024e-06, - "loss": 3.9078, - "mean_token_accuracy": 0.3208669379353523, - "step": 14060 - }, - { - "epoch": 0.6794967763745684, - "grad_norm": 2.32157343155579, - "learning_rate": 5.631784126633058e-06, - "loss": 3.8996, - "mean_token_accuracy": 0.3175403267145157, - "step": 14070 - }, - { - "epoch": 0.679979716514138, - "grad_norm": 2.504607872955561, - "learning_rate": 5.616625254593122e-06, - "loss": 3.9629, - "mean_token_accuracy": 0.309677417576313, - "step": 14080 - }, - { - "epoch": 0.6804626566537078, - "grad_norm": 2.7441888026175203, - "learning_rate": 5.601478840585399e-06, - "loss": 3.8898, - "mean_token_accuracy": 0.32046370953321457, - "step": 14090 - }, - { - "epoch": 0.6809455967932775, - "grad_norm": 2.6365495571998814, - "learning_rate": 5.5863449276576595e-06, - "loss": 3.9422, - "mean_token_accuracy": 0.3156249985098839, - "step": 14100 - }, - { - "epoch": 0.6809455967932775, - "eval_runtime": 7.8276, - "eval_samples_per_second": 377.384, - "eval_steps_per_second": 23.634, - "step": 14100 - }, - { - "epoch": 0.6814285369328472, - "grad_norm": 2.371646624269115, - "learning_rate": 5.57122355882215e-06, - "loss": 3.9062, - "mean_token_accuracy": 0.3235887140035629, - "step": 14110 - }, - { - "epoch": 0.6819114770724168, - "grad_norm": 2.4822156340525194, - "learning_rate": 5.556114777055448e-06, - "loss": 3.9262, - "mean_token_accuracy": 0.3153225779533386, - "step": 14120 - }, - { - "epoch": 0.6823944172119866, - "grad_norm": 2.686293541984887, - "learning_rate": 5.541018625298387e-06, - "loss": 3.9156, - "mean_token_accuracy": 0.32127015888690946, - "step": 14130 - }, - { - "epoch": 0.6828773573515563, - "grad_norm": 2.5840822066386684, - "learning_rate": 5.5259351464558716e-06, - "loss": 3.8797, - "mean_token_accuracy": 0.32106854766607285, - "step": 14140 - }, - { - "epoch": 0.6833602974911259, - "grad_norm": 2.5815636065214167, - "learning_rate": 5.510864383396819e-06, - "loss": 3.9922, - "mean_token_accuracy": 0.3129032254219055, - "step": 14150 - }, - { - "epoch": 0.6838432376306957, - "grad_norm": 2.580527635344399, - "learning_rate": 5.4958063789539785e-06, - "loss": 3.9617, - "mean_token_accuracy": 0.3160282239317894, - "step": 14160 - }, - { - "epoch": 0.6843261777702654, - "grad_norm": 2.487363174552614, - "learning_rate": 5.480761175923858e-06, - "loss": 3.9844, - "mean_token_accuracy": 0.3126008063554764, - "step": 14170 - }, - { - "epoch": 0.6848091179098351, - "grad_norm": 2.634061485900264, - "learning_rate": 5.4657288170665826e-06, - "loss": 3.8746, - "mean_token_accuracy": 0.3256048411130905, - "step": 14180 - }, - { - "epoch": 0.6852920580494047, - "grad_norm": 2.5070766485362665, - "learning_rate": 5.450709345105753e-06, - "loss": 3.9535, - "mean_token_accuracy": 0.31542338579893114, - "step": 14190 - }, - { - "epoch": 0.6857749981889745, - "grad_norm": 2.432497984129058, - "learning_rate": 5.435702802728366e-06, - "loss": 3.9016, - "mean_token_accuracy": 0.32358870655298233, - "step": 14200 - }, - { - "epoch": 0.6857749981889745, - "eval_runtime": 7.8074, - "eval_samples_per_second": 378.357, - "eval_steps_per_second": 23.695, - "step": 14200 - }, - { - "epoch": 0.6862579383285442, - "grad_norm": 2.405571423456711, - "learning_rate": 5.42070923258466e-06, - "loss": 3.907, - "mean_token_accuracy": 0.3243951588869095, - "step": 14210 - }, - { - "epoch": 0.6867408784681138, - "grad_norm": 2.371988315484479, - "learning_rate": 5.405728677288011e-06, - "loss": 3.9602, - "mean_token_accuracy": 0.3175403237342834, - "step": 14220 - }, - { - "epoch": 0.6872238186076836, - "grad_norm": 2.3963169699532805, - "learning_rate": 5.390761179414792e-06, - "loss": 3.8766, - "mean_token_accuracy": 0.32268145233392714, - "step": 14230 - }, - { - "epoch": 0.6877067587472533, - "grad_norm": 2.304178036628061, - "learning_rate": 5.375806781504288e-06, - "loss": 3.9398, - "mean_token_accuracy": 0.32086693644523623, - "step": 14240 - }, - { - "epoch": 0.688189698886823, - "grad_norm": 2.4471313784025908, - "learning_rate": 5.3608655260585294e-06, - "loss": 3.8777, - "mean_token_accuracy": 0.32147177010774614, - "step": 14250 - }, - { - "epoch": 0.6886726390263926, - "grad_norm": 2.6521343840359113, - "learning_rate": 5.345937455542212e-06, - "loss": 3.934, - "mean_token_accuracy": 0.317237900197506, - "step": 14260 - }, - { - "epoch": 0.6891555791659624, - "grad_norm": 2.4685010593263916, - "learning_rate": 5.331022612382537e-06, - "loss": 3.9535, - "mean_token_accuracy": 0.3150201603770256, - "step": 14270 - }, - { - "epoch": 0.6896385193055321, - "grad_norm": 2.55936453226136, - "learning_rate": 5.316121038969146e-06, - "loss": 3.9203, - "mean_token_accuracy": 0.3237903222441673, - "step": 14280 - }, - { - "epoch": 0.6901214594451017, - "grad_norm": 2.4982568171465944, - "learning_rate": 5.301232777653935e-06, - "loss": 3.991, - "mean_token_accuracy": 0.3178427413105965, - "step": 14290 - }, - { - "epoch": 0.6906043995846715, - "grad_norm": 2.4175112580335574, - "learning_rate": 5.286357870750976e-06, - "loss": 3.9367, - "mean_token_accuracy": 0.3175403207540512, - "step": 14300 - }, - { - "epoch": 0.6906043995846715, - "eval_runtime": 7.8215, - "eval_samples_per_second": 377.677, - "eval_steps_per_second": 23.653, - "step": 14300 - }, - { - "epoch": 0.6910873397242412, - "grad_norm": 2.425430169136526, - "learning_rate": 5.271496360536388e-06, - "loss": 3.9793, - "mean_token_accuracy": 0.3131048381328583, - "step": 14310 - }, - { - "epoch": 0.6915702798638109, - "grad_norm": 2.314037880061397, - "learning_rate": 5.256648289248215e-06, - "loss": 3.9266, - "mean_token_accuracy": 0.31532257944345476, - "step": 14320 - }, - { - "epoch": 0.6920532200033805, - "grad_norm": 2.3724551691365363, - "learning_rate": 5.241813699086311e-06, - "loss": 3.9227, - "mean_token_accuracy": 0.32318548262119295, - "step": 14330 - }, - { - "epoch": 0.6925361601429503, - "grad_norm": 2.446218547681278, - "learning_rate": 5.2269926322122026e-06, - "loss": 3.9043, - "mean_token_accuracy": 0.3210685506463051, - "step": 14340 - }, - { - "epoch": 0.69301910028252, - "grad_norm": 2.516961842821772, - "learning_rate": 5.212185130748991e-06, - "loss": 3.9129, - "mean_token_accuracy": 0.3183467760682106, - "step": 14350 - }, - { - "epoch": 0.6935020404220896, - "grad_norm": 2.3091894518979195, - "learning_rate": 5.197391236781221e-06, - "loss": 3.8883, - "mean_token_accuracy": 0.3285282254219055, - "step": 14360 - }, - { - "epoch": 0.6939849805616594, - "grad_norm": 2.4512283398448704, - "learning_rate": 5.182610992354768e-06, - "loss": 3.9641, - "mean_token_accuracy": 0.3127016112208366, - "step": 14370 - }, - { - "epoch": 0.6944679207012291, - "grad_norm": 2.370448114144957, - "learning_rate": 5.167844439476697e-06, - "loss": 3.9059, - "mean_token_accuracy": 0.3180443555116653, - "step": 14380 - }, - { - "epoch": 0.6949508608407988, - "grad_norm": 2.358065363678817, - "learning_rate": 5.153091620115187e-06, - "loss": 3.9398, - "mean_token_accuracy": 0.32167338877916335, - "step": 14390 - }, - { - "epoch": 0.6954338009803684, - "grad_norm": 2.644788406142881, - "learning_rate": 5.138352576199359e-06, - "loss": 3.9617, - "mean_token_accuracy": 0.31965725868940353, - "step": 14400 - }, - { - "epoch": 0.6954338009803684, - "eval_runtime": 7.8204, - "eval_samples_per_second": 377.729, - "eval_steps_per_second": 23.656, - "step": 14400 - }, - { - "epoch": 0.6959167411199382, - "grad_norm": 2.489715561821747, - "learning_rate": 5.1236273496192035e-06, - "loss": 3.9211, - "mean_token_accuracy": 0.3196572601795197, - "step": 14410 - }, - { - "epoch": 0.6963996812595079, - "grad_norm": 2.6532229560459504, - "learning_rate": 5.108915982225421e-06, - "loss": 3.8781, - "mean_token_accuracy": 0.3236895129084587, - "step": 14420 - }, - { - "epoch": 0.6968826213990776, - "grad_norm": 2.425560989655647, - "learning_rate": 5.0942185158293365e-06, - "loss": 3.9344, - "mean_token_accuracy": 0.3158266142010689, - "step": 14430 - }, - { - "epoch": 0.6973655615386473, - "grad_norm": 2.498177671354803, - "learning_rate": 5.079534992202767e-06, - "loss": 3.934, - "mean_token_accuracy": 0.3134072616696358, - "step": 14440 - }, - { - "epoch": 0.697848501678217, - "grad_norm": 2.40580931359974, - "learning_rate": 5.064865453077892e-06, - "loss": 3.9203, - "mean_token_accuracy": 0.31885080859065057, - "step": 14450 - }, - { - "epoch": 0.6983314418177867, - "grad_norm": 2.3475911608267492, - "learning_rate": 5.050209940147154e-06, - "loss": 3.9828, - "mean_token_accuracy": 0.31018145233392713, - "step": 14460 - }, - { - "epoch": 0.6988143819573563, - "grad_norm": 2.3605570313582374, - "learning_rate": 5.03556849506313e-06, - "loss": 3.9059, - "mean_token_accuracy": 0.32288306653499604, - "step": 14470 - }, - { - "epoch": 0.6992973220969261, - "grad_norm": 2.295470415797191, - "learning_rate": 5.02094115943842e-06, - "loss": 3.9293, - "mean_token_accuracy": 0.32167338728904726, - "step": 14480 - }, - { - "epoch": 0.6997802622364958, - "grad_norm": 2.5862582890249928, - "learning_rate": 5.006327974845504e-06, - "loss": 3.9164, - "mean_token_accuracy": 0.3180443525314331, - "step": 14490 - }, - { - "epoch": 0.7002632023760655, - "grad_norm": 2.3676120963198635, - "learning_rate": 4.991728982816672e-06, - "loss": 3.9469, - "mean_token_accuracy": 0.3202620968222618, - "step": 14500 - }, - { - "epoch": 0.7002632023760655, - "eval_runtime": 7.8114, - "eval_samples_per_second": 378.165, - "eval_steps_per_second": 23.683, - "step": 14500 - }, - { - "epoch": 0.7007461425156352, - "grad_norm": 2.6551673981950086, - "learning_rate": 4.977144224843853e-06, - "loss": 3.9465, - "mean_token_accuracy": 0.3246975809335709, - "step": 14510 - }, - { - "epoch": 0.7012290826552049, - "grad_norm": 2.5382371856969557, - "learning_rate": 4.962573742378534e-06, - "loss": 3.9574, - "mean_token_accuracy": 0.31179435551166534, - "step": 14520 - }, - { - "epoch": 0.7017120227947746, - "grad_norm": 2.532309626583744, - "learning_rate": 4.948017576831617e-06, - "loss": 3.9258, - "mean_token_accuracy": 0.31592742204666135, - "step": 14530 - }, - { - "epoch": 0.7021949629343442, - "grad_norm": 2.491087270974582, - "learning_rate": 4.933475769573337e-06, - "loss": 3.8625, - "mean_token_accuracy": 0.33125, - "step": 14540 - }, - { - "epoch": 0.702677903073914, - "grad_norm": 2.456895204287448, - "learning_rate": 4.918948361933096e-06, - "loss": 3.891, - "mean_token_accuracy": 0.3257056474685669, - "step": 14550 - }, - { - "epoch": 0.7031608432134837, - "grad_norm": 2.590229869518269, - "learning_rate": 4.904435395199386e-06, - "loss": 3.9656, - "mean_token_accuracy": 0.30957661122083663, - "step": 14560 - }, - { - "epoch": 0.7036437833530534, - "grad_norm": 2.316206424021573, - "learning_rate": 4.889936910619647e-06, - "loss": 3.8652, - "mean_token_accuracy": 0.3272177428007126, - "step": 14570 - }, - { - "epoch": 0.7041267234926231, - "grad_norm": 2.6372734864838874, - "learning_rate": 4.875452949400166e-06, - "loss": 3.8891, - "mean_token_accuracy": 0.31864919513463974, - "step": 14580 - }, - { - "epoch": 0.7046096636321928, - "grad_norm": 2.4704041222080932, - "learning_rate": 4.860983552705955e-06, - "loss": 3.9645, - "mean_token_accuracy": 0.31340725868940356, - "step": 14590 - }, - { - "epoch": 0.7050926037717625, - "grad_norm": 2.4519715493817986, - "learning_rate": 4.846528761660616e-06, - "loss": 3.9297, - "mean_token_accuracy": 0.31885080486536027, - "step": 14600 - }, - { - "epoch": 0.7050926037717625, - "eval_runtime": 7.8098, - "eval_samples_per_second": 378.242, - "eval_steps_per_second": 23.688, - "step": 14600 - }, - { - "epoch": 0.7055755439113321, - "grad_norm": 2.4573889374384668, - "learning_rate": 4.832088617346269e-06, - "loss": 3.9016, - "mean_token_accuracy": 0.32328628897666933, - "step": 14610 - }, - { - "epoch": 0.7060584840509019, - "grad_norm": 2.522003340352901, - "learning_rate": 4.817663160803375e-06, - "loss": 3.923, - "mean_token_accuracy": 0.321068549156189, - "step": 14620 - }, - { - "epoch": 0.7065414241904716, - "grad_norm": 2.582041652621467, - "learning_rate": 4.803252433030675e-06, - "loss": 3.8738, - "mean_token_accuracy": 0.31965725868940353, - "step": 14630 - }, - { - "epoch": 0.7070243643300413, - "grad_norm": 2.4274847926799024, - "learning_rate": 4.788856474985027e-06, - "loss": 3.952, - "mean_token_accuracy": 0.31673386991024016, - "step": 14640 - }, - { - "epoch": 0.707507304469611, - "grad_norm": 2.555341376776363, - "learning_rate": 4.774475327581338e-06, - "loss": 3.8648, - "mean_token_accuracy": 0.3256048411130905, - "step": 14650 - }, - { - "epoch": 0.7079902446091807, - "grad_norm": 2.614824409703627, - "learning_rate": 4.760109031692398e-06, - "loss": 3.9508, - "mean_token_accuracy": 0.31592742055654527, - "step": 14660 - }, - { - "epoch": 0.7084731847487504, - "grad_norm": 2.2531611451556093, - "learning_rate": 4.745757628148804e-06, - "loss": 3.8855, - "mean_token_accuracy": 0.3239919379353523, - "step": 14670 - }, - { - "epoch": 0.70895612488832, - "grad_norm": 2.49962981853155, - "learning_rate": 4.731421157738809e-06, - "loss": 3.8937, - "mean_token_accuracy": 0.3175403207540512, - "step": 14680 - }, - { - "epoch": 0.7094390650278898, - "grad_norm": 2.403220615500529, - "learning_rate": 4.717099661208251e-06, - "loss": 3.907, - "mean_token_accuracy": 0.3205645173788071, - "step": 14690 - }, - { - "epoch": 0.7099220051674595, - "grad_norm": 2.4963327525832333, - "learning_rate": 4.702793179260387e-06, - "loss": 3.9488, - "mean_token_accuracy": 0.3166330650448799, - "step": 14700 - }, - { - "epoch": 0.7099220051674595, - "eval_runtime": 7.8138, - "eval_samples_per_second": 378.047, - "eval_steps_per_second": 23.676, - "step": 14700 - }, - { - "epoch": 0.7104049453070292, - "grad_norm": 2.5411854440802295, - "learning_rate": 4.6885017525558074e-06, - "loss": 3.8969, - "mean_token_accuracy": 0.3195564538240433, - "step": 14710 - }, - { - "epoch": 0.7108878854465989, - "grad_norm": 2.612826539910147, - "learning_rate": 4.674225421712317e-06, - "loss": 3.9684, - "mean_token_accuracy": 0.3085685521364212, - "step": 14720 - }, - { - "epoch": 0.7113708255861686, - "grad_norm": 2.323685904255609, - "learning_rate": 4.659964227304816e-06, - "loss": 3.9621, - "mean_token_accuracy": 0.31058467775583265, - "step": 14730 - }, - { - "epoch": 0.7118537657257383, - "grad_norm": 2.4355541715215208, - "learning_rate": 4.645718209865189e-06, - "loss": 3.9695, - "mean_token_accuracy": 0.3129032239317894, - "step": 14740 - }, - { - "epoch": 0.712336705865308, - "grad_norm": 2.46237779541236, - "learning_rate": 4.6314874098821696e-06, - "loss": 3.9191, - "mean_token_accuracy": 0.32358870804309847, - "step": 14750 - }, - { - "epoch": 0.7128196460048777, - "grad_norm": 2.8244521530754603, - "learning_rate": 4.617271867801268e-06, - "loss": 3.9863, - "mean_token_accuracy": 0.31370967626571655, - "step": 14760 - }, - { - "epoch": 0.7133025861444474, - "grad_norm": 2.342530027987054, - "learning_rate": 4.603071624024605e-06, - "loss": 3.9066, - "mean_token_accuracy": 0.3244959697127342, - "step": 14770 - }, - { - "epoch": 0.7137855262840171, - "grad_norm": 2.552251880164965, - "learning_rate": 4.58888671891084e-06, - "loss": 3.9418, - "mean_token_accuracy": 0.31875, - "step": 14780 - }, - { - "epoch": 0.7142684664235868, - "grad_norm": 2.339784549556923, - "learning_rate": 4.5747171927750175e-06, - "loss": 3.9496, - "mean_token_accuracy": 0.3142137140035629, - "step": 14790 - }, - { - "epoch": 0.7147514065631565, - "grad_norm": 2.3843810229299995, - "learning_rate": 4.560563085888503e-06, - "loss": 3.9184, - "mean_token_accuracy": 0.31905242055654526, - "step": 14800 - }, - { - "epoch": 0.7147514065631565, - "eval_runtime": 7.8314, - "eval_samples_per_second": 377.198, - "eval_steps_per_second": 23.623, - "step": 14800 - }, - { - "epoch": 0.7152343467027262, - "grad_norm": 2.428513651369846, - "learning_rate": 4.54642443847881e-06, - "loss": 3.9254, - "mean_token_accuracy": 0.31824596524238585, - "step": 14810 - }, - { - "epoch": 0.7157172868422959, - "grad_norm": 2.6030870042909204, - "learning_rate": 4.532301290729535e-06, - "loss": 3.9707, - "mean_token_accuracy": 0.32258064597845076, - "step": 14820 - }, - { - "epoch": 0.7162002269818656, - "grad_norm": 2.5543406619630935, - "learning_rate": 4.518193682780205e-06, - "loss": 3.9523, - "mean_token_accuracy": 0.31491935551166533, - "step": 14830 - }, - { - "epoch": 0.7166831671214353, - "grad_norm": 2.605715236873571, - "learning_rate": 4.504101654726195e-06, - "loss": 3.943, - "mean_token_accuracy": 0.31542338579893114, - "step": 14840 - }, - { - "epoch": 0.717166107261005, - "grad_norm": 2.509142932962603, - "learning_rate": 4.4900252466186e-06, - "loss": 3.9023, - "mean_token_accuracy": 0.3197351634502411, - "step": 14850 - }, - { - "epoch": 0.7176490474005747, - "grad_norm": 2.499479951166928, - "learning_rate": 4.475964498464106e-06, - "loss": 3.8676, - "mean_token_accuracy": 0.3274193570017815, - "step": 14860 - }, - { - "epoch": 0.7181319875401444, - "grad_norm": 2.44542708774872, - "learning_rate": 4.4619194502249165e-06, - "loss": 3.9789, - "mean_token_accuracy": 0.31340223997831346, - "step": 14870 - }, - { - "epoch": 0.7186149276797141, - "grad_norm": 2.4438661074715426, - "learning_rate": 4.44789014181859e-06, - "loss": 3.9961, - "mean_token_accuracy": 0.3086693525314331, - "step": 14880 - }, - { - "epoch": 0.7190978678192838, - "grad_norm": 2.3923380712423414, - "learning_rate": 4.433876613117968e-06, - "loss": 3.902, - "mean_token_accuracy": 0.31733871102333067, - "step": 14890 - }, - { - "epoch": 0.7195808079588535, - "grad_norm": 2.5628356632297313, - "learning_rate": 4.419878903951027e-06, - "loss": 3.9184, - "mean_token_accuracy": 0.3243951633572578, - "step": 14900 - }, - { - "epoch": 0.7195808079588535, - "eval_runtime": 7.812, - "eval_samples_per_second": 378.138, - "eval_steps_per_second": 23.682, - "step": 14900 - }, - { - "epoch": 0.7200637480984232, - "grad_norm": 2.391454109510873, - "learning_rate": 4.405897054100808e-06, - "loss": 3.9438, - "mean_token_accuracy": 0.32056451588869095, - "step": 14910 - }, - { - "epoch": 0.720546688237993, - "grad_norm": 2.4146831039124046, - "learning_rate": 4.391931103305251e-06, - "loss": 3.8344, - "mean_token_accuracy": 0.322076615691185, - "step": 14920 - }, - { - "epoch": 0.7210296283775626, - "grad_norm": 2.2980539186703433, - "learning_rate": 4.37798109125713e-06, - "loss": 4.002, - "mean_token_accuracy": 0.31108870804309846, - "step": 14930 - }, - { - "epoch": 0.7215125685171323, - "grad_norm": 2.673315473006604, - "learning_rate": 4.364047057603897e-06, - "loss": 3.9367, - "mean_token_accuracy": 0.31885080635547636, - "step": 14940 - }, - { - "epoch": 0.721995508656702, - "grad_norm": 2.3936242046831175, - "learning_rate": 4.350129041947623e-06, - "loss": 3.9316, - "mean_token_accuracy": 0.31300403028726576, - "step": 14950 - }, - { - "epoch": 0.7224784487962717, - "grad_norm": 2.590915431177032, - "learning_rate": 4.3362270838448275e-06, - "loss": 3.8977, - "mean_token_accuracy": 0.31985886991024015, - "step": 14960 - }, - { - "epoch": 0.7229613889358414, - "grad_norm": 2.375467613791358, - "learning_rate": 4.322341222806394e-06, - "loss": 3.9246, - "mean_token_accuracy": 0.30735886842012405, - "step": 14970 - }, - { - "epoch": 0.7234443290754111, - "grad_norm": 2.507571735217888, - "learning_rate": 4.30847149829748e-06, - "loss": 3.9066, - "mean_token_accuracy": 0.3207661330699921, - "step": 14980 - }, - { - "epoch": 0.7239272692149809, - "grad_norm": 2.4644818301505262, - "learning_rate": 4.294617949737353e-06, - "loss": 4.0098, - "mean_token_accuracy": 0.31330645084381104, - "step": 14990 - }, - { - "epoch": 0.7244102093545505, - "grad_norm": 2.408721104937917, - "learning_rate": 4.280780616499325e-06, - "loss": 3.873, - "mean_token_accuracy": 0.3273185506463051, - "step": 15000 - }, - { - "epoch": 0.7244102093545505, - "eval_runtime": 7.7943, - "eval_samples_per_second": 378.994, - "eval_steps_per_second": 23.735, - "step": 15000 - }, - { - "epoch": 0.7248931494941202, - "grad_norm": 2.593428874376227, - "learning_rate": 4.266959537910608e-06, - "loss": 3.9258, - "mean_token_accuracy": 0.31784274280071256, - "step": 15010 - }, - { - "epoch": 0.7253760896336899, - "grad_norm": 2.6411816877271255, - "learning_rate": 4.253154753252235e-06, - "loss": 3.902, - "mean_token_accuracy": 0.31370967626571655, - "step": 15020 - }, - { - "epoch": 0.7258590297732596, - "grad_norm": 2.542398653412803, - "learning_rate": 4.239366301758914e-06, - "loss": 3.9395, - "mean_token_accuracy": 0.3205645129084587, - "step": 15030 - }, - { - "epoch": 0.7263419699128293, - "grad_norm": 2.521715865718538, - "learning_rate": 4.225594222618939e-06, - "loss": 3.8879, - "mean_token_accuracy": 0.32651209831237793, - "step": 15040 - }, - { - "epoch": 0.726824910052399, - "grad_norm": 2.4930738109906505, - "learning_rate": 4.211838554974065e-06, - "loss": 3.923, - "mean_token_accuracy": 0.3111895158886909, - "step": 15050 - }, - { - "epoch": 0.7273078501919688, - "grad_norm": 2.4315326034226166, - "learning_rate": 4.198099337919421e-06, - "loss": 3.9227, - "mean_token_accuracy": 0.3218749985098839, - "step": 15060 - }, - { - "epoch": 0.7277907903315384, - "grad_norm": 2.4449574507272636, - "learning_rate": 4.18437661050336e-06, - "loss": 3.9441, - "mean_token_accuracy": 0.3107862897217274, - "step": 15070 - }, - { - "epoch": 0.7282737304711081, - "grad_norm": 2.370351988418548, - "learning_rate": 4.1706704117273845e-06, - "loss": 3.9156, - "mean_token_accuracy": 0.3192540317773819, - "step": 15080 - }, - { - "epoch": 0.7287566706106778, - "grad_norm": 2.591991883355936, - "learning_rate": 4.156980780546014e-06, - "loss": 3.9797, - "mean_token_accuracy": 0.31673386693000793, - "step": 15090 - }, - { - "epoch": 0.7292396107502475, - "grad_norm": 2.4510274646424866, - "learning_rate": 4.1433077558666876e-06, - "loss": 3.8547, - "mean_token_accuracy": 0.32661290317773817, - "step": 15100 - }, - { - "epoch": 0.7292396107502475, - "eval_runtime": 7.8426, - "eval_samples_per_second": 376.663, - "eval_steps_per_second": 23.589, - "step": 15100 - }, - { - "epoch": 0.7297225508898172, - "grad_norm": 2.5027839013131756, - "learning_rate": 4.129651376549639e-06, - "loss": 3.8727, - "mean_token_accuracy": 0.316129033267498, - "step": 15110 - }, - { - "epoch": 0.7302054910293869, - "grad_norm": 2.5042297187716707, - "learning_rate": 4.11601168140779e-06, - "loss": 3.807, - "mean_token_accuracy": 0.334072582423687, - "step": 15120 - }, - { - "epoch": 0.7306884311689567, - "grad_norm": 2.4037398800644545, - "learning_rate": 4.102388709206666e-06, - "loss": 3.8957, - "mean_token_accuracy": 0.3201612919569016, - "step": 15130 - }, - { - "epoch": 0.7311713713085263, - "grad_norm": 2.5212822272654987, - "learning_rate": 4.088782498664238e-06, - "loss": 3.9094, - "mean_token_accuracy": 0.3208669349551201, - "step": 15140 - }, - { - "epoch": 0.731654311448096, - "grad_norm": 2.6670973617578593, - "learning_rate": 4.0751930884508586e-06, - "loss": 3.9637, - "mean_token_accuracy": 0.3102822571992874, - "step": 15150 - }, - { - "epoch": 0.7321372515876657, - "grad_norm": 2.354301800171696, - "learning_rate": 4.061620517189111e-06, - "loss": 3.9531, - "mean_token_accuracy": 0.3113911300897598, - "step": 15160 - }, - { - "epoch": 0.7326201917272354, - "grad_norm": 2.5479479795334816, - "learning_rate": 4.048064823453748e-06, - "loss": 3.9254, - "mean_token_accuracy": 0.31854838579893113, - "step": 15170 - }, - { - "epoch": 0.7331031318668051, - "grad_norm": 2.4479654094931718, - "learning_rate": 4.034526045771529e-06, - "loss": 3.8668, - "mean_token_accuracy": 0.3244959697127342, - "step": 15180 - }, - { - "epoch": 0.7335860720063748, - "grad_norm": 2.44521805835857, - "learning_rate": 4.021004222621151e-06, - "loss": 3.9937, - "mean_token_accuracy": 0.3016129031777382, - "step": 15190 - }, - { - "epoch": 0.7340690121459446, - "grad_norm": 2.5390031389189214, - "learning_rate": 4.007499392433113e-06, - "loss": 3.9332, - "mean_token_accuracy": 0.317237900197506, - "step": 15200 - }, - { - "epoch": 0.7340690121459446, - "eval_runtime": 7.8288, - "eval_samples_per_second": 377.324, - "eval_steps_per_second": 23.631, - "step": 15200 - }, - { - "epoch": 0.7345519522855142, - "grad_norm": 2.6208555190571587, - "learning_rate": 3.994011593589635e-06, - "loss": 3.9801, - "mean_token_accuracy": 0.31562500149011613, - "step": 15210 - }, - { - "epoch": 0.7350348924250839, - "grad_norm": 2.367527893585157, - "learning_rate": 3.9805408644245145e-06, - "loss": 3.9496, - "mean_token_accuracy": 0.31784273982048034, - "step": 15220 - }, - { - "epoch": 0.7355178325646536, - "grad_norm": 2.707198956784325, - "learning_rate": 3.967087243223046e-06, - "loss": 3.9797, - "mean_token_accuracy": 0.3088709652423859, - "step": 15230 - }, - { - "epoch": 0.7360007727042233, - "grad_norm": 2.4441958593750677, - "learning_rate": 3.953650768221898e-06, - "loss": 3.8805, - "mean_token_accuracy": 0.3265120953321457, - "step": 15240 - }, - { - "epoch": 0.736483712843793, - "grad_norm": 2.33734175337742, - "learning_rate": 3.940231477609004e-06, - "loss": 3.9145, - "mean_token_accuracy": 0.31653225868940355, - "step": 15250 - }, - { - "epoch": 0.7369666529833627, - "grad_norm": 2.468068568695171, - "learning_rate": 3.926829409523466e-06, - "loss": 3.9555, - "mean_token_accuracy": 0.3209677398204803, - "step": 15260 - }, - { - "epoch": 0.7374495931229325, - "grad_norm": 2.5370706100855136, - "learning_rate": 3.913444602055424e-06, - "loss": 3.893, - "mean_token_accuracy": 0.32237903475761415, - "step": 15270 - }, - { - "epoch": 0.7379325332625021, - "grad_norm": 2.4632435230577596, - "learning_rate": 3.900077093245982e-06, - "loss": 3.9699, - "mean_token_accuracy": 0.3167338714003563, - "step": 15280 - }, - { - "epoch": 0.7384154734020718, - "grad_norm": 2.561719935990509, - "learning_rate": 3.886726921087058e-06, - "loss": 3.8734, - "mean_token_accuracy": 0.3237903222441673, - "step": 15290 - }, - { - "epoch": 0.7388984135416415, - "grad_norm": 2.790950404677061, - "learning_rate": 3.873394123521315e-06, - "loss": 3.9156, - "mean_token_accuracy": 0.31804435700178146, - "step": 15300 - }, - { - "epoch": 0.7388984135416415, - "eval_runtime": 7.8425, - "eval_samples_per_second": 376.665, - "eval_steps_per_second": 23.589, - "step": 15300 - }, - { - "epoch": 0.7393813536812112, - "grad_norm": 2.4446765590082546, - "learning_rate": 3.860078738442014e-06, - "loss": 3.9148, - "mean_token_accuracy": 0.3146169364452362, - "step": 15310 - }, - { - "epoch": 0.7398642938207809, - "grad_norm": 2.65967736413874, - "learning_rate": 3.846780803692958e-06, - "loss": 3.941, - "mean_token_accuracy": 0.31804435700178146, - "step": 15320 - }, - { - "epoch": 0.7403472339603506, - "grad_norm": 2.491670541254839, - "learning_rate": 3.833500357068325e-06, - "loss": 3.9438, - "mean_token_accuracy": 0.31441531926393507, - "step": 15330 - }, - { - "epoch": 0.7408301740999204, - "grad_norm": 2.3460682734120524, - "learning_rate": 3.820237436312606e-06, - "loss": 3.9336, - "mean_token_accuracy": 0.3200604826211929, - "step": 15340 - }, - { - "epoch": 0.74131311423949, - "grad_norm": 2.388014872853918, - "learning_rate": 3.806992079120477e-06, - "loss": 3.8742, - "mean_token_accuracy": 0.32610887438058855, - "step": 15350 - }, - { - "epoch": 0.7417960543790597, - "grad_norm": 2.4530352255650296, - "learning_rate": 3.7937643231367038e-06, - "loss": 3.9059, - "mean_token_accuracy": 0.31794354766607286, - "step": 15360 - }, - { - "epoch": 0.7422789945186294, - "grad_norm": 2.664514435291622, - "learning_rate": 3.780554205956014e-06, - "loss": 3.9906, - "mean_token_accuracy": 0.3133064493536949, - "step": 15370 - }, - { - "epoch": 0.7427619346581991, - "grad_norm": 2.3191899016908537, - "learning_rate": 3.7673617651230055e-06, - "loss": 3.8566, - "mean_token_accuracy": 0.32510080486536025, - "step": 15380 - }, - { - "epoch": 0.7432448747977688, - "grad_norm": 2.474256875227795, - "learning_rate": 3.7541870381320564e-06, - "loss": 3.941, - "mean_token_accuracy": 0.31542338579893114, - "step": 15390 - }, - { - "epoch": 0.7437278149373385, - "grad_norm": 2.555223276304514, - "learning_rate": 3.7410300624271768e-06, - "loss": 3.902, - "mean_token_accuracy": 0.3140120983123779, - "step": 15400 - }, - { - "epoch": 0.7437278149373385, - "eval_runtime": 7.7795, - "eval_samples_per_second": 379.716, - "eval_steps_per_second": 23.78, - "step": 15400 - }, - { - "epoch": 0.7442107550769083, - "grad_norm": 2.537189244982377, - "learning_rate": 3.7278908754019438e-06, - "loss": 3.9645, - "mean_token_accuracy": 0.31733870804309844, - "step": 15410 - }, - { - "epoch": 0.7446936952164779, - "grad_norm": 2.442429163563041, - "learning_rate": 3.714769514399358e-06, - "loss": 3.9188, - "mean_token_accuracy": 0.3183467760682106, - "step": 15420 - }, - { - "epoch": 0.7451766353560476, - "grad_norm": 2.430822312159094, - "learning_rate": 3.7016660167117826e-06, - "loss": 3.902, - "mean_token_accuracy": 0.32631048411130903, - "step": 15430 - }, - { - "epoch": 0.7456595754956173, - "grad_norm": 2.4044508435609604, - "learning_rate": 3.688580419580785e-06, - "loss": 3.8777, - "mean_token_accuracy": 0.3193548396229744, - "step": 15440 - }, - { - "epoch": 0.746142515635187, - "grad_norm": 2.4335772557300683, - "learning_rate": 3.675512760197072e-06, - "loss": 3.9574, - "mean_token_accuracy": 0.3194556415081024, - "step": 15450 - }, - { - "epoch": 0.7466254557747567, - "grad_norm": 2.7082274819114507, - "learning_rate": 3.6624630757003676e-06, - "loss": 3.9371, - "mean_token_accuracy": 0.31512096971273423, - "step": 15460 - }, - { - "epoch": 0.7471083959143264, - "grad_norm": 2.5587950414668255, - "learning_rate": 3.6494314031793087e-06, - "loss": 3.9238, - "mean_token_accuracy": 0.3152217730879784, - "step": 15470 - }, - { - "epoch": 0.7475913360538962, - "grad_norm": 2.664794005054441, - "learning_rate": 3.6364177796713353e-06, - "loss": 3.9676, - "mean_token_accuracy": 0.3129032254219055, - "step": 15480 - }, - { - "epoch": 0.7480742761934658, - "grad_norm": 2.531862524210361, - "learning_rate": 3.6234222421625955e-06, - "loss": 3.9008, - "mean_token_accuracy": 0.3285282254219055, - "step": 15490 - }, - { - "epoch": 0.7485572163330355, - "grad_norm": 2.6051289402257902, - "learning_rate": 3.6104448275878335e-06, - "loss": 3.9234, - "mean_token_accuracy": 0.325, - "step": 15500 - }, - { - "epoch": 0.7485572163330355, - "eval_runtime": 7.8145, - "eval_samples_per_second": 378.017, - "eval_steps_per_second": 23.674, - "step": 15500 - }, - { - "epoch": 0.7490401564726052, - "grad_norm": 2.5638234621717126, - "learning_rate": 3.5974855728302893e-06, - "loss": 3.9395, - "mean_token_accuracy": 0.31935483664274217, - "step": 15510 - }, - { - "epoch": 0.7495230966121749, - "grad_norm": 2.7391427381924123, - "learning_rate": 3.5845445147215853e-06, - "loss": 3.9414, - "mean_token_accuracy": 0.316431450843811, - "step": 15520 - }, - { - "epoch": 0.7500060367517446, - "grad_norm": 2.425258497299364, - "learning_rate": 3.5716216900416223e-06, - "loss": 3.9055, - "mean_token_accuracy": 0.3233870953321457, - "step": 15530 - }, - { - "epoch": 0.7504889768913143, - "grad_norm": 2.366059174869697, - "learning_rate": 3.5587171355184993e-06, - "loss": 3.9266, - "mean_token_accuracy": 0.3136088728904724, - "step": 15540 - }, - { - "epoch": 0.7509719170308841, - "grad_norm": 2.636300062613671, - "learning_rate": 3.5458308878283664e-06, - "loss": 4.0047, - "mean_token_accuracy": 0.318044351041317, - "step": 15550 - }, - { - "epoch": 0.7514548571704537, - "grad_norm": 2.6542044032252843, - "learning_rate": 3.532962983595363e-06, - "loss": 3.8215, - "mean_token_accuracy": 0.3284274205565453, - "step": 15560 - }, - { - "epoch": 0.7519377973100234, - "grad_norm": 2.526497619039477, - "learning_rate": 3.520113459391473e-06, - "loss": 3.9012, - "mean_token_accuracy": 0.3254032239317894, - "step": 15570 - }, - { - "epoch": 0.7524207374495931, - "grad_norm": 2.5230094856956358, - "learning_rate": 3.5072823517364696e-06, - "loss": 3.9469, - "mean_token_accuracy": 0.31905241757631303, - "step": 15580 - }, - { - "epoch": 0.7529036775891629, - "grad_norm": 2.58256065884894, - "learning_rate": 3.4944696970977597e-06, - "loss": 3.9438, - "mean_token_accuracy": 0.3193548426032066, - "step": 15590 - }, - { - "epoch": 0.7533866177287325, - "grad_norm": 2.4598479476918675, - "learning_rate": 3.4816755318903162e-06, - "loss": 3.9102, - "mean_token_accuracy": 0.32409273982048037, - "step": 15600 - }, - { - "epoch": 0.7533866177287325, - "eval_runtime": 7.8083, - "eval_samples_per_second": 378.317, - "eval_steps_per_second": 23.693, - "step": 15600 - }, - { - "epoch": 0.7538695578683022, - "grad_norm": 2.415970107077365, - "learning_rate": 3.4688998924765615e-06, - "loss": 3.9352, - "mean_token_accuracy": 0.31794354766607286, - "step": 15610 - }, - { - "epoch": 0.754352498007872, - "grad_norm": 2.4321884051867384, - "learning_rate": 3.456142815166269e-06, - "loss": 3.943, - "mean_token_accuracy": 0.31955645233392715, - "step": 15620 - }, - { - "epoch": 0.7548354381474416, - "grad_norm": 2.392784198660163, - "learning_rate": 3.443404336216446e-06, - "loss": 3.9535, - "mean_token_accuracy": 0.31491935402154925, - "step": 15630 - }, - { - "epoch": 0.7553183782870113, - "grad_norm": 2.2782457512809136, - "learning_rate": 3.430684491831251e-06, - "loss": 3.9594, - "mean_token_accuracy": 0.3191532254219055, - "step": 15640 - }, - { - "epoch": 0.755801318426581, - "grad_norm": 2.473267520440249, - "learning_rate": 3.4179833181618815e-06, - "loss": 3.834, - "mean_token_accuracy": 0.3323588728904724, - "step": 15650 - }, - { - "epoch": 0.7562842585661508, - "grad_norm": 2.4249260049491, - "learning_rate": 3.405300851306462e-06, - "loss": 3.9762, - "mean_token_accuracy": 0.3129032239317894, - "step": 15660 - }, - { - "epoch": 0.7567671987057204, - "grad_norm": 2.6116856637486876, - "learning_rate": 3.39263712730996e-06, - "loss": 3.9023, - "mean_token_accuracy": 0.3192540302872658, - "step": 15670 - }, - { - "epoch": 0.7572501388452901, - "grad_norm": 2.7517678667929193, - "learning_rate": 3.3799921821640614e-06, - "loss": 3.9215, - "mean_token_accuracy": 0.32026209831237795, - "step": 15680 - }, - { - "epoch": 0.7577330789848599, - "grad_norm": 2.5161610627735715, - "learning_rate": 3.3673660518071004e-06, - "loss": 3.859, - "mean_token_accuracy": 0.3259072557091713, - "step": 15690 - }, - { - "epoch": 0.7582160191244295, - "grad_norm": 2.6017300673422197, - "learning_rate": 3.3547587721239163e-06, - "loss": 3.9094, - "mean_token_accuracy": 0.31885080635547636, - "step": 15700 - }, - { - "epoch": 0.7582160191244295, - "eval_runtime": 7.819, - "eval_samples_per_second": 377.8, - "eval_steps_per_second": 23.66, - "step": 15700 - }, - { - "epoch": 0.7586989592639992, - "grad_norm": 2.767250808746583, - "learning_rate": 3.3421703789457837e-06, - "loss": 3.9621, - "mean_token_accuracy": 0.3126008063554764, - "step": 15710 - }, - { - "epoch": 0.759181899403569, - "grad_norm": 2.329018495591996, - "learning_rate": 3.3296009080502987e-06, - "loss": 3.9062, - "mean_token_accuracy": 0.3284274205565453, - "step": 15720 - }, - { - "epoch": 0.7596648395431387, - "grad_norm": 2.5092994722659188, - "learning_rate": 3.317050395161282e-06, - "loss": 3.9113, - "mean_token_accuracy": 0.31330645084381104, - "step": 15730 - }, - { - "epoch": 0.7601477796827083, - "grad_norm": 2.6546587795161884, - "learning_rate": 3.304518875948661e-06, - "loss": 3.9152, - "mean_token_accuracy": 0.3172379046678543, - "step": 15740 - }, - { - "epoch": 0.760630719822278, - "grad_norm": 2.582949496137021, - "learning_rate": 3.2920063860283935e-06, - "loss": 3.9598, - "mean_token_accuracy": 0.31431451737880706, - "step": 15750 - }, - { - "epoch": 0.7611136599618478, - "grad_norm": 2.461486046197451, - "learning_rate": 3.2795129609623477e-06, - "loss": 3.9254, - "mean_token_accuracy": 0.31915322691202164, - "step": 15760 - }, - { - "epoch": 0.7615966001014174, - "grad_norm": 2.6381647773561756, - "learning_rate": 3.267038636258213e-06, - "loss": 3.9652, - "mean_token_accuracy": 0.3188508078455925, - "step": 15770 - }, - { - "epoch": 0.7620795402409871, - "grad_norm": 2.5098004155536167, - "learning_rate": 3.254583447369387e-06, - "loss": 3.9238, - "mean_token_accuracy": 0.31854838579893113, - "step": 15780 - }, - { - "epoch": 0.7625624803805569, - "grad_norm": 2.415709958900494, - "learning_rate": 3.2421474296948764e-06, - "loss": 3.9754, - "mean_token_accuracy": 0.3189516082406044, - "step": 15790 - }, - { - "epoch": 0.7630454205201266, - "grad_norm": 2.4976840406476124, - "learning_rate": 3.229730618579222e-06, - "loss": 3.9324, - "mean_token_accuracy": 0.3132056474685669, - "step": 15800 - }, - { - "epoch": 0.7630454205201266, - "eval_runtime": 7.7948, - "eval_samples_per_second": 378.971, - "eval_steps_per_second": 23.734, - "step": 15800 - }, - { - "epoch": 0.7635283606596962, - "grad_norm": 2.493601075255913, - "learning_rate": 3.2173330493123557e-06, - "loss": 3.9641, - "mean_token_accuracy": 0.32227822542190554, - "step": 15810 - }, - { - "epoch": 0.7640113007992659, - "grad_norm": 2.510355193627828, - "learning_rate": 3.2049547571295303e-06, - "loss": 3.9555, - "mean_token_accuracy": 0.31774193197488787, - "step": 15820 - }, - { - "epoch": 0.7644942409388357, - "grad_norm": 2.7045016229318986, - "learning_rate": 3.1925957772112136e-06, - "loss": 3.9082, - "mean_token_accuracy": 0.32147177010774614, - "step": 15830 - }, - { - "epoch": 0.7649771810784053, - "grad_norm": 2.3423531235386252, - "learning_rate": 3.180256144682986e-06, - "loss": 3.9355, - "mean_token_accuracy": 0.31350806504487994, - "step": 15840 - }, - { - "epoch": 0.765460121217975, - "grad_norm": 2.4603035111451534, - "learning_rate": 3.1679358946154306e-06, - "loss": 3.9418, - "mean_token_accuracy": 0.31945564299821855, - "step": 15850 - }, - { - "epoch": 0.7659430613575448, - "grad_norm": 2.366092766507224, - "learning_rate": 3.155635062024055e-06, - "loss": 3.9059, - "mean_token_accuracy": 0.3220766097307205, - "step": 15860 - }, - { - "epoch": 0.7664260014971145, - "grad_norm": 2.3563125980306787, - "learning_rate": 3.1433536818691734e-06, - "loss": 3.8848, - "mean_token_accuracy": 0.3233870968222618, - "step": 15870 - }, - { - "epoch": 0.7669089416366841, - "grad_norm": 2.472913696601303, - "learning_rate": 3.131091789055822e-06, - "loss": 3.8574, - "mean_token_accuracy": 0.3309475839138031, - "step": 15880 - }, - { - "epoch": 0.7673918817762538, - "grad_norm": 2.416212989032887, - "learning_rate": 3.1188494184336394e-06, - "loss": 3.927, - "mean_token_accuracy": 0.32026209831237795, - "step": 15890 - }, - { - "epoch": 0.7678748219158236, - "grad_norm": 2.4954487608961102, - "learning_rate": 3.1066266047967895e-06, - "loss": 3.8809, - "mean_token_accuracy": 0.3209677442908287, - "step": 15900 - }, - { - "epoch": 0.7678748219158236, - "eval_runtime": 7.8189, - "eval_samples_per_second": 377.803, - "eval_steps_per_second": 23.661, - "step": 15900 - }, - { - "epoch": 0.7683577620553932, - "grad_norm": 2.5377497456994824, - "learning_rate": 3.0944233828838533e-06, - "loss": 3.9172, - "mean_token_accuracy": 0.31885080635547636, - "step": 15910 - }, - { - "epoch": 0.7688407021949629, - "grad_norm": 2.3940917924875667, - "learning_rate": 3.082239787377721e-06, - "loss": 3.8754, - "mean_token_accuracy": 0.31844757944345475, - "step": 15920 - }, - { - "epoch": 0.7693236423345327, - "grad_norm": 2.512979259567937, - "learning_rate": 3.070075852905512e-06, - "loss": 3.9828, - "mean_token_accuracy": 0.30735886842012405, - "step": 15930 - }, - { - "epoch": 0.7698065824741024, - "grad_norm": 2.5069447000299054, - "learning_rate": 3.057931614038463e-06, - "loss": 3.9281, - "mean_token_accuracy": 0.32368951886892317, - "step": 15940 - }, - { - "epoch": 0.770289522613672, - "grad_norm": 2.4178966533910344, - "learning_rate": 3.045807105291838e-06, - "loss": 3.9266, - "mean_token_accuracy": 0.3143145129084587, - "step": 15950 - }, - { - "epoch": 0.7707724627532417, - "grad_norm": 2.4809741475818923, - "learning_rate": 3.033702361124814e-06, - "loss": 3.968, - "mean_token_accuracy": 0.3160282254219055, - "step": 15960 - }, - { - "epoch": 0.7712554028928115, - "grad_norm": 2.6846141184788004, - "learning_rate": 3.0216174159404067e-06, - "loss": 3.9156, - "mean_token_accuracy": 0.3222782269120216, - "step": 15970 - }, - { - "epoch": 0.7717383430323811, - "grad_norm": 2.489939738950206, - "learning_rate": 3.009552304085358e-06, - "loss": 3.9672, - "mean_token_accuracy": 0.3148185521364212, - "step": 15980 - }, - { - "epoch": 0.7722212831719508, - "grad_norm": 2.4379455939343044, - "learning_rate": 2.997507059850041e-06, - "loss": 3.8504, - "mean_token_accuracy": 0.32560483664274215, - "step": 15990 - }, - { - "epoch": 0.7727042233115206, - "grad_norm": 2.4669427582835954, - "learning_rate": 2.985481717468358e-06, - "loss": 3.9441, - "mean_token_accuracy": 0.31250000149011614, - "step": 16000 - }, - { - "epoch": 0.7727042233115206, - "eval_runtime": 7.7928, - "eval_samples_per_second": 379.068, - "eval_steps_per_second": 23.74, - "step": 16000 - }, - { - "epoch": 0.7731871634510903, - "grad_norm": 2.519576489907158, - "learning_rate": 2.9734763111176534e-06, - "loss": 3.9273, - "mean_token_accuracy": 0.3140120983123779, - "step": 16010 - }, - { - "epoch": 0.7736701035906599, - "grad_norm": 2.42330825198855, - "learning_rate": 2.9614908749186124e-06, - "loss": 3.909, - "mean_token_accuracy": 0.3224798396229744, - "step": 16020 - }, - { - "epoch": 0.7741530437302296, - "grad_norm": 2.4422860351800817, - "learning_rate": 2.9495254429351604e-06, - "loss": 3.8449, - "mean_token_accuracy": 0.3275201603770256, - "step": 16030 - }, - { - "epoch": 0.7746359838697994, - "grad_norm": 2.633996798829478, - "learning_rate": 2.9375800491743655e-06, - "loss": 3.8957, - "mean_token_accuracy": 0.32459677308797835, - "step": 16040 - }, - { - "epoch": 0.775118924009369, - "grad_norm": 2.4868630036775476, - "learning_rate": 2.925654727586349e-06, - "loss": 3.9402, - "mean_token_accuracy": 0.32258064597845076, - "step": 16050 - }, - { - "epoch": 0.7756018641489387, - "grad_norm": 2.5262480686013413, - "learning_rate": 2.913749512064188e-06, - "loss": 3.9461, - "mean_token_accuracy": 0.3170362889766693, - "step": 16060 - }, - { - "epoch": 0.7760848042885085, - "grad_norm": 2.39883548222417, - "learning_rate": 2.9018644364438053e-06, - "loss": 3.9383, - "mean_token_accuracy": 0.31804435700178146, - "step": 16070 - }, - { - "epoch": 0.7765677444280782, - "grad_norm": 2.6351213080563927, - "learning_rate": 2.889999534503893e-06, - "loss": 3.9387, - "mean_token_accuracy": 0.31834677457809446, - "step": 16080 - }, - { - "epoch": 0.7770506845676478, - "grad_norm": 2.5557718779587892, - "learning_rate": 2.8781548399658065e-06, - "loss": 3.9621, - "mean_token_accuracy": 0.3150201618671417, - "step": 16090 - }, - { - "epoch": 0.7775336247072175, - "grad_norm": 2.5652311494524644, - "learning_rate": 2.866330386493471e-06, - "loss": 3.9215, - "mean_token_accuracy": 0.3173387095332146, - "step": 16100 - }, - { - "epoch": 0.7775336247072175, - "eval_runtime": 7.7936, - "eval_samples_per_second": 379.027, - "eval_steps_per_second": 23.737, - "step": 16100 - }, - { - "epoch": 0.7780165648467873, - "grad_norm": 2.6079461085726923, - "learning_rate": 2.8545262076932734e-06, - "loss": 3.9406, - "mean_token_accuracy": 0.31572580635547637, - "step": 16110 - }, - { - "epoch": 0.7784995049863569, - "grad_norm": 2.5430824277773207, - "learning_rate": 2.8427423371139895e-06, - "loss": 3.9145, - "mean_token_accuracy": 0.3162298411130905, - "step": 16120 - }, - { - "epoch": 0.7789824451259266, - "grad_norm": 2.4719514016389494, - "learning_rate": 2.8309788082466736e-06, - "loss": 3.9121, - "mean_token_accuracy": 0.3245967745780945, - "step": 16130 - }, - { - "epoch": 0.7794653852654964, - "grad_norm": 2.4676565661863927, - "learning_rate": 2.819235654524568e-06, - "loss": 3.8949, - "mean_token_accuracy": 0.32106854766607285, - "step": 16140 - }, - { - "epoch": 0.7799483254050661, - "grad_norm": 2.6278193950196167, - "learning_rate": 2.8075129093230003e-06, - "loss": 3.9059, - "mean_token_accuracy": 0.3162298366427422, - "step": 16150 - }, - { - "epoch": 0.7804312655446357, - "grad_norm": 2.690742344407508, - "learning_rate": 2.7958106059593015e-06, - "loss": 3.8832, - "mean_token_accuracy": 0.3235887110233307, - "step": 16160 - }, - { - "epoch": 0.7809142056842054, - "grad_norm": 2.6272651864157175, - "learning_rate": 2.7841287776927017e-06, - "loss": 3.9227, - "mean_token_accuracy": 0.3182459682226181, - "step": 16170 - }, - { - "epoch": 0.7813971458237752, - "grad_norm": 2.858965289263669, - "learning_rate": 2.772467457724243e-06, - "loss": 3.8965, - "mean_token_accuracy": 0.327822582423687, - "step": 16180 - }, - { - "epoch": 0.7818800859633448, - "grad_norm": 2.730948147511468, - "learning_rate": 2.76082667919667e-06, - "loss": 3.9512, - "mean_token_accuracy": 0.3166330620646477, - "step": 16190 - }, - { - "epoch": 0.7823630261029145, - "grad_norm": 2.4100917419409793, - "learning_rate": 2.749206475194357e-06, - "loss": 3.9395, - "mean_token_accuracy": 0.31108870878815653, - "step": 16200 - }, - { - "epoch": 0.7823630261029145, - "eval_runtime": 7.7859, - "eval_samples_per_second": 379.404, - "eval_steps_per_second": 23.761, - "step": 16200 - }, - { - "epoch": 0.7828459662424843, - "grad_norm": 2.5261171713649153, - "learning_rate": 2.737606878743202e-06, - "loss": 3.9016, - "mean_token_accuracy": 0.32429435551166536, - "step": 16210 - }, - { - "epoch": 0.783328906382054, - "grad_norm": 2.5556710826864326, - "learning_rate": 2.726027922810526e-06, - "loss": 3.9312, - "mean_token_accuracy": 0.3151209682226181, - "step": 16220 - }, - { - "epoch": 0.7838118465216236, - "grad_norm": 2.375014949832733, - "learning_rate": 2.714469640304994e-06, - "loss": 3.8926, - "mean_token_accuracy": 0.32681451439857484, - "step": 16230 - }, - { - "epoch": 0.7842947866611933, - "grad_norm": 2.5494710127341933, - "learning_rate": 2.7029320640765166e-06, - "loss": 3.9289, - "mean_token_accuracy": 0.3190524145960808, - "step": 16240 - }, - { - "epoch": 0.7847777268007631, - "grad_norm": 2.3762506462613633, - "learning_rate": 2.691415226916153e-06, - "loss": 3.8652, - "mean_token_accuracy": 0.32026209533214567, - "step": 16250 - }, - { - "epoch": 0.7852606669403327, - "grad_norm": 2.4240461691185216, - "learning_rate": 2.679919161556014e-06, - "loss": 3.9391, - "mean_token_accuracy": 0.32258064597845076, - "step": 16260 - }, - { - "epoch": 0.7857436070799024, - "grad_norm": 2.3379108180916286, - "learning_rate": 2.6684439006691807e-06, - "loss": 3.8594, - "mean_token_accuracy": 0.32389113008975984, - "step": 16270 - }, - { - "epoch": 0.7862265472194722, - "grad_norm": 2.5169286063457013, - "learning_rate": 2.6569894768696048e-06, - "loss": 3.9629, - "mean_token_accuracy": 0.3142137110233307, - "step": 16280 - }, - { - "epoch": 0.7867094873590419, - "grad_norm": 2.4211109867696003, - "learning_rate": 2.64555592271202e-06, - "loss": 3.9145, - "mean_token_accuracy": 0.31733870804309844, - "step": 16290 - }, - { - "epoch": 0.7871924274986115, - "grad_norm": 2.4890569253235735, - "learning_rate": 2.6341432706918357e-06, - "loss": 3.9414, - "mean_token_accuracy": 0.32227823138237, - "step": 16300 - }, - { - "epoch": 0.7871924274986115, - "eval_runtime": 7.8277, - "eval_samples_per_second": 377.377, - "eval_steps_per_second": 23.634, - "step": 16300 - }, - { - "epoch": 0.7876753676381812, - "grad_norm": 2.414208603695168, - "learning_rate": 2.6227515532450644e-06, - "loss": 3.9043, - "mean_token_accuracy": 0.32449596375226974, - "step": 16310 - }, - { - "epoch": 0.788158307777751, - "grad_norm": 2.842053128996133, - "learning_rate": 2.611380802748221e-06, - "loss": 3.9184, - "mean_token_accuracy": 0.32157257944345474, - "step": 16320 - }, - { - "epoch": 0.7886412479173206, - "grad_norm": 2.511953260541321, - "learning_rate": 2.6000310515182213e-06, - "loss": 3.9387, - "mean_token_accuracy": 0.32399193197488785, - "step": 16330 - }, - { - "epoch": 0.7891241880568903, - "grad_norm": 2.4278924456945976, - "learning_rate": 2.5887023318123063e-06, - "loss": 3.9059, - "mean_token_accuracy": 0.3213709682226181, - "step": 16340 - }, - { - "epoch": 0.7896071281964601, - "grad_norm": 2.4623932720379536, - "learning_rate": 2.57739467582794e-06, - "loss": 3.9242, - "mean_token_accuracy": 0.3230846807360649, - "step": 16350 - }, - { - "epoch": 0.7900900683360298, - "grad_norm": 2.543508384283926, - "learning_rate": 2.566108115702728e-06, - "loss": 3.9363, - "mean_token_accuracy": 0.31834677457809446, - "step": 16360 - }, - { - "epoch": 0.7905730084755994, - "grad_norm": 2.4619000045994666, - "learning_rate": 2.554842683514305e-06, - "loss": 3.9219, - "mean_token_accuracy": 0.321169351041317, - "step": 16370 - }, - { - "epoch": 0.7910559486151691, - "grad_norm": 2.5424361809339002, - "learning_rate": 2.543598411280269e-06, - "loss": 3.8965, - "mean_token_accuracy": 0.32368951886892317, - "step": 16380 - }, - { - "epoch": 0.7915388887547389, - "grad_norm": 2.67327142330878, - "learning_rate": 2.5323753309580766e-06, - "loss": 3.898, - "mean_token_accuracy": 0.3201612889766693, - "step": 16390 - }, - { - "epoch": 0.7920218288943085, - "grad_norm": 2.433649543912036, - "learning_rate": 2.521173474444957e-06, - "loss": 3.9027, - "mean_token_accuracy": 0.32409273982048037, - "step": 16400 - }, - { - "epoch": 0.7920218288943085, - "eval_runtime": 7.8437, - "eval_samples_per_second": 376.608, - "eval_steps_per_second": 23.586, - "step": 16400 - }, - { - "epoch": 0.7925047690338782, - "grad_norm": 2.5224384469736822, - "learning_rate": 2.5099928735778103e-06, - "loss": 3.8809, - "mean_token_accuracy": 0.32469758242368696, - "step": 16410 - }, - { - "epoch": 0.792987709173448, - "grad_norm": 2.504768179751752, - "learning_rate": 2.4988335601331336e-06, - "loss": 3.9047, - "mean_token_accuracy": 0.31784273982048034, - "step": 16420 - }, - { - "epoch": 0.7934706493130177, - "grad_norm": 2.5360401475869305, - "learning_rate": 2.487695565826922e-06, - "loss": 4.0004, - "mean_token_accuracy": 0.31068548411130903, - "step": 16430 - }, - { - "epoch": 0.7939535894525873, - "grad_norm": 2.4926819515576226, - "learning_rate": 2.4765789223145797e-06, - "loss": 3.982, - "mean_token_accuracy": 0.31643145233392717, - "step": 16440 - }, - { - "epoch": 0.794436529592157, - "grad_norm": 2.6332858795459964, - "learning_rate": 2.465483661190824e-06, - "loss": 3.9148, - "mean_token_accuracy": 0.31461693346500397, - "step": 16450 - }, - { - "epoch": 0.7949194697317268, - "grad_norm": 2.483125440054413, - "learning_rate": 2.4544098139896056e-06, - "loss": 4.0039, - "mean_token_accuracy": 0.3110887110233307, - "step": 16460 - }, - { - "epoch": 0.7954024098712964, - "grad_norm": 2.6720920517350173, - "learning_rate": 2.4433574121840178e-06, - "loss": 3.9012, - "mean_token_accuracy": 0.31733870804309844, - "step": 16470 - }, - { - "epoch": 0.7958853500108661, - "grad_norm": 2.6393270847156165, - "learning_rate": 2.4323264871861927e-06, - "loss": 3.8641, - "mean_token_accuracy": 0.32358870804309847, - "step": 16480 - }, - { - "epoch": 0.7963682901504359, - "grad_norm": 2.4294413949842917, - "learning_rate": 2.4213170703472355e-06, - "loss": 3.9723, - "mean_token_accuracy": 0.31794354915618894, - "step": 16490 - }, - { - "epoch": 0.7968512302900056, - "grad_norm": 2.6723150672222635, - "learning_rate": 2.410329192957116e-06, - "loss": 3.8648, - "mean_token_accuracy": 0.32752016335725787, - "step": 16500 - }, - { - "epoch": 0.7968512302900056, - "eval_runtime": 7.8122, - "eval_samples_per_second": 378.128, - "eval_steps_per_second": 23.681, - "step": 16500 - }, - { - "epoch": 0.7973341704295752, - "grad_norm": 2.675929868974572, - "learning_rate": 2.399362886244592e-06, - "loss": 3.8508, - "mean_token_accuracy": 0.32782258093357086, - "step": 16510 - }, - { - "epoch": 0.797817110569145, - "grad_norm": 2.3894708340546615, - "learning_rate": 2.3884181813771025e-06, - "loss": 4.0023, - "mean_token_accuracy": 0.31844758093357084, - "step": 16520 - }, - { - "epoch": 0.7983000507087147, - "grad_norm": 2.6234629143490076, - "learning_rate": 2.377495109460706e-06, - "loss": 3.9281, - "mean_token_accuracy": 0.32520161420106886, - "step": 16530 - }, - { - "epoch": 0.7987829908482843, - "grad_norm": 2.451513525214137, - "learning_rate": 2.36659370153997e-06, - "loss": 3.8617, - "mean_token_accuracy": 0.32923386842012403, - "step": 16540 - }, - { - "epoch": 0.799265930987854, - "grad_norm": 2.499150543671022, - "learning_rate": 2.3557139885978965e-06, - "loss": 3.9324, - "mean_token_accuracy": 0.3196572571992874, - "step": 16550 - }, - { - "epoch": 0.7997488711274238, - "grad_norm": 2.3862913153766336, - "learning_rate": 2.3448560015558177e-06, - "loss": 3.9543, - "mean_token_accuracy": 0.3138104841113091, - "step": 16560 - }, - { - "epoch": 0.8002318112669935, - "grad_norm": 2.41113089862968, - "learning_rate": 2.334019771273327e-06, - "loss": 3.8668, - "mean_token_accuracy": 0.324193549156189, - "step": 16570 - }, - { - "epoch": 0.8007147514065631, - "grad_norm": 2.735495230570863, - "learning_rate": 2.3232053285481814e-06, - "loss": 3.9336, - "mean_token_accuracy": 0.31481854543089866, - "step": 16580 - }, - { - "epoch": 0.8011976915461329, - "grad_norm": 2.5973844807381727, - "learning_rate": 2.312412704116218e-06, - "loss": 3.9051, - "mean_token_accuracy": 0.3264112889766693, - "step": 16590 - }, - { - "epoch": 0.8016806316857026, - "grad_norm": 2.439730137959072, - "learning_rate": 2.3016419286512525e-06, - "loss": 3.8828, - "mean_token_accuracy": 0.3177419364452362, - "step": 16600 - }, - { - "epoch": 0.8016806316857026, - "eval_runtime": 7.7873, - "eval_samples_per_second": 379.336, - "eval_steps_per_second": 23.757, - "step": 16600 - }, - { - "epoch": 0.8021635718252722, - "grad_norm": 2.5416858470207355, - "learning_rate": 2.2908930327650157e-06, - "loss": 3.9117, - "mean_token_accuracy": 0.3216733857989311, - "step": 16610 - }, - { - "epoch": 0.8026465119648419, - "grad_norm": 2.6011642180344268, - "learning_rate": 2.2801660470070507e-06, - "loss": 3.8793, - "mean_token_accuracy": 0.3247262641787529, - "step": 16620 - }, - { - "epoch": 0.8031294521044117, - "grad_norm": 2.534611112505, - "learning_rate": 2.2694610018646245e-06, - "loss": 3.9574, - "mean_token_accuracy": 0.3204637080430984, - "step": 16630 - }, - { - "epoch": 0.8036123922439814, - "grad_norm": 2.486281908750067, - "learning_rate": 2.258777927762653e-06, - "loss": 3.9102, - "mean_token_accuracy": 0.32405434995889665, - "step": 16640 - }, - { - "epoch": 0.804095332383551, - "grad_norm": 2.6224260913507416, - "learning_rate": 2.2481168550636045e-06, - "loss": 3.8547, - "mean_token_accuracy": 0.3225806444883347, - "step": 16650 - }, - { - "epoch": 0.8045782725231208, - "grad_norm": 2.3806781619885204, - "learning_rate": 2.2374778140674225e-06, - "loss": 3.8582, - "mean_token_accuracy": 0.3275201618671417, - "step": 16660 - }, - { - "epoch": 0.8050612126626905, - "grad_norm": 2.4876755798307157, - "learning_rate": 2.226860835011423e-06, - "loss": 3.9195, - "mean_token_accuracy": 0.3277217760682106, - "step": 16670 - }, - { - "epoch": 0.8055441528022601, - "grad_norm": 2.4324733612217715, - "learning_rate": 2.2162659480702275e-06, - "loss": 3.9031, - "mean_token_accuracy": 0.3170362874865532, - "step": 16680 - }, - { - "epoch": 0.8060270929418298, - "grad_norm": 2.5046025756743977, - "learning_rate": 2.2056931833556694e-06, - "loss": 3.8676, - "mean_token_accuracy": 0.3243951588869095, - "step": 16690 - }, - { - "epoch": 0.8065100330813996, - "grad_norm": 2.687273634169363, - "learning_rate": 2.1951425709167095e-06, - "loss": 3.8457, - "mean_token_accuracy": 0.32963709682226183, - "step": 16700 - }, - { - "epoch": 0.8065100330813996, - "eval_runtime": 7.7601, - "eval_samples_per_second": 380.664, - "eval_steps_per_second": 23.84, - "step": 16700 - }, - { - "epoch": 0.8069929732209693, - "grad_norm": 2.6150983680182054, - "learning_rate": 2.1846141407393393e-06, - "loss": 3.9645, - "mean_token_accuracy": 0.3110887110233307, - "step": 16710 - }, - { - "epoch": 0.8074759133605389, - "grad_norm": 2.436027318869901, - "learning_rate": 2.174107922746518e-06, - "loss": 3.8863, - "mean_token_accuracy": 0.31512096524238586, - "step": 16720 - }, - { - "epoch": 0.8079588535001087, - "grad_norm": 2.5461376475714674, - "learning_rate": 2.1636239467980725e-06, - "loss": 3.925, - "mean_token_accuracy": 0.3215725839138031, - "step": 16730 - }, - { - "epoch": 0.8084417936396784, - "grad_norm": 2.4314482623233844, - "learning_rate": 2.1531622426906063e-06, - "loss": 3.8543, - "mean_token_accuracy": 0.32379032075405123, - "step": 16740 - }, - { - "epoch": 0.8089247337792481, - "grad_norm": 2.379302785226099, - "learning_rate": 2.142722840157435e-06, - "loss": 3.9109, - "mean_token_accuracy": 0.32328629195690156, - "step": 16750 - }, - { - "epoch": 0.8094076739188177, - "grad_norm": 2.611400788057758, - "learning_rate": 2.132305768868488e-06, - "loss": 3.95, - "mean_token_accuracy": 0.31945564448833463, - "step": 16760 - }, - { - "epoch": 0.8098906140583875, - "grad_norm": 2.392008594223068, - "learning_rate": 2.121911058430225e-06, - "loss": 3.9102, - "mean_token_accuracy": 0.3190524220466614, - "step": 16770 - }, - { - "epoch": 0.8103735541979572, - "grad_norm": 2.650320084641724, - "learning_rate": 2.1115387383855513e-06, - "loss": 3.9527, - "mean_token_accuracy": 0.31875000298023226, - "step": 16780 - }, - { - "epoch": 0.8108564943375268, - "grad_norm": 2.3822462082754767, - "learning_rate": 2.10118883821374e-06, - "loss": 3.9215, - "mean_token_accuracy": 0.315625, - "step": 16790 - }, - { - "epoch": 0.8113394344770966, - "grad_norm": 2.6889518708907922, - "learning_rate": 2.0908613873303454e-06, - "loss": 3.9277, - "mean_token_accuracy": 0.3151209712028503, - "step": 16800 - }, - { - "epoch": 0.8113394344770966, - "eval_runtime": 7.8206, - "eval_samples_per_second": 377.72, - "eval_steps_per_second": 23.655, - "step": 16800 - }, - { - "epoch": 0.8118223746166663, - "grad_norm": 2.550458750695889, - "learning_rate": 2.0805564150871173e-06, - "loss": 3.873, - "mean_token_accuracy": 0.3179435506463051, - "step": 16810 - }, - { - "epoch": 0.812305314756236, - "grad_norm": 2.6128267204973405, - "learning_rate": 2.070273950771915e-06, - "loss": 3.8691, - "mean_token_accuracy": 0.3272177383303642, - "step": 16820 - }, - { - "epoch": 0.8127882548958056, - "grad_norm": 2.587908446299981, - "learning_rate": 2.0600140236086308e-06, - "loss": 3.8918, - "mean_token_accuracy": 0.3230846762657166, - "step": 16830 - }, - { - "epoch": 0.8132711950353754, - "grad_norm": 2.5807468063594294, - "learning_rate": 2.0497766627571057e-06, - "loss": 3.9773, - "mean_token_accuracy": 0.3130040317773819, - "step": 16840 - }, - { - "epoch": 0.8137541351749451, - "grad_norm": 2.6755023367814896, - "learning_rate": 2.039561897313046e-06, - "loss": 3.9195, - "mean_token_accuracy": 0.3190524190664291, - "step": 16850 - }, - { - "epoch": 0.8142370753145147, - "grad_norm": 2.6044266489922934, - "learning_rate": 2.0293697563079308e-06, - "loss": 3.9348, - "mean_token_accuracy": 0.318245966732502, - "step": 16860 - }, - { - "epoch": 0.8147200154540845, - "grad_norm": 2.4927386975261454, - "learning_rate": 2.019200268708945e-06, - "loss": 3.9402, - "mean_token_accuracy": 0.31340725868940356, - "step": 16870 - }, - { - "epoch": 0.8152029555936542, - "grad_norm": 2.5621006819858887, - "learning_rate": 2.0090534634188928e-06, - "loss": 3.9223, - "mean_token_accuracy": 0.3196572601795197, - "step": 16880 - }, - { - "epoch": 0.8156858957332239, - "grad_norm": 2.4242295628336055, - "learning_rate": 1.9989293692761024e-06, - "loss": 3.9035, - "mean_token_accuracy": 0.3220766127109528, - "step": 16890 - }, - { - "epoch": 0.8161688358727935, - "grad_norm": 2.519424020713278, - "learning_rate": 1.9888280150543647e-06, - "loss": 3.868, - "mean_token_accuracy": 0.31885080337524413, - "step": 16900 - }, - { - "epoch": 0.8161688358727935, - "eval_runtime": 7.8301, - "eval_samples_per_second": 377.261, - "eval_steps_per_second": 23.627, - "step": 16900 - }, - { - "epoch": 0.8166517760123633, - "grad_norm": 2.535137419412962, - "learning_rate": 1.9787494294628373e-06, - "loss": 3.8527, - "mean_token_accuracy": 0.3292338728904724, - "step": 16910 - }, - { - "epoch": 0.817134716151933, - "grad_norm": 2.596490646650906, - "learning_rate": 1.968693641145968e-06, - "loss": 3.9609, - "mean_token_accuracy": 0.3181451603770256, - "step": 16920 - }, - { - "epoch": 0.8176176562915026, - "grad_norm": 2.4151702777889925, - "learning_rate": 1.958660678683406e-06, - "loss": 3.8828, - "mean_token_accuracy": 0.32177419364452364, - "step": 16930 - }, - { - "epoch": 0.8181005964310724, - "grad_norm": 2.573406277489698, - "learning_rate": 1.948650570589936e-06, - "loss": 3.8855, - "mean_token_accuracy": 0.3219758063554764, - "step": 16940 - }, - { - "epoch": 0.8185835365706421, - "grad_norm": 2.57344482841107, - "learning_rate": 1.9386633453153826e-06, - "loss": 3.9406, - "mean_token_accuracy": 0.3123991906642914, - "step": 16950 - }, - { - "epoch": 0.8190664767102118, - "grad_norm": 2.4151219582823202, - "learning_rate": 1.92869903124454e-06, - "loss": 3.8496, - "mean_token_accuracy": 0.32540322542190553, - "step": 16960 - }, - { - "epoch": 0.8195494168497814, - "grad_norm": 2.5543777691813956, - "learning_rate": 1.9187576566970766e-06, - "loss": 3.85, - "mean_token_accuracy": 0.3222782269120216, - "step": 16970 - }, - { - "epoch": 0.8200323569893512, - "grad_norm": 2.381780186947048, - "learning_rate": 1.9088392499274734e-06, - "loss": 3.8914, - "mean_token_accuracy": 0.3225806444883347, - "step": 16980 - }, - { - "epoch": 0.8205152971289209, - "grad_norm": 2.6101620136163994, - "learning_rate": 1.8989438391249315e-06, - "loss": 3.8863, - "mean_token_accuracy": 0.3214717760682106, - "step": 16990 - }, - { - "epoch": 0.8209982372684905, - "grad_norm": 2.502577854036647, - "learning_rate": 1.8890714524132958e-06, - "loss": 3.9422, - "mean_token_accuracy": 0.3118951618671417, - "step": 17000 - }, - { - "epoch": 0.8209982372684905, - "eval_runtime": 7.8065, - "eval_samples_per_second": 378.402, - "eval_steps_per_second": 23.698, - "step": 17000 - }, - { - "epoch": 0.8214811774080603, - "grad_norm": 2.5690215396224483, - "learning_rate": 1.8792221178509696e-06, - "loss": 3.8648, - "mean_token_accuracy": 0.3270161271095276, - "step": 17010 - }, - { - "epoch": 0.82196411754763, - "grad_norm": 2.502535822958402, - "learning_rate": 1.8693958634308452e-06, - "loss": 3.9094, - "mean_token_accuracy": 0.321370966732502, - "step": 17020 - }, - { - "epoch": 0.8224470576871997, - "grad_norm": 2.668974892618196, - "learning_rate": 1.8595927170802175e-06, - "loss": 3.8594, - "mean_token_accuracy": 0.3273185506463051, - "step": 17030 - }, - { - "epoch": 0.8229299978267693, - "grad_norm": 2.643450235908905, - "learning_rate": 1.849812706660702e-06, - "loss": 3.909, - "mean_token_accuracy": 0.3168697118759155, - "step": 17040 - }, - { - "epoch": 0.8234129379663391, - "grad_norm": 2.4585571603267833, - "learning_rate": 1.8400558599681617e-06, - "loss": 3.8957, - "mean_token_accuracy": 0.31673386991024016, - "step": 17050 - }, - { - "epoch": 0.8238958781059088, - "grad_norm": 2.6591025669212005, - "learning_rate": 1.8303222047326275e-06, - "loss": 4.0102, - "mean_token_accuracy": 0.3132056459784508, - "step": 17060 - }, - { - "epoch": 0.8243788182454784, - "grad_norm": 2.4330721163796944, - "learning_rate": 1.820611768618218e-06, - "loss": 3.9203, - "mean_token_accuracy": 0.3226814538240433, - "step": 17070 - }, - { - "epoch": 0.8248617583850482, - "grad_norm": 2.535287842660409, - "learning_rate": 1.8109245792230545e-06, - "loss": 3.8875, - "mean_token_accuracy": 0.32096774131059647, - "step": 17080 - }, - { - "epoch": 0.8253446985246179, - "grad_norm": 2.609166515058401, - "learning_rate": 1.801260664079194e-06, - "loss": 3.8504, - "mean_token_accuracy": 0.3214717745780945, - "step": 17090 - }, - { - "epoch": 0.8258276386641876, - "grad_norm": 2.449982066803611, - "learning_rate": 1.7916200506525462e-06, - "loss": 3.8836, - "mean_token_accuracy": 0.3186491936445236, - "step": 17100 - }, - { - "epoch": 0.8258276386641876, - "eval_runtime": 7.8054, - "eval_samples_per_second": 378.454, - "eval_steps_per_second": 23.701, - "step": 17100 - }, - { - "epoch": 0.8263105788037572, - "grad_norm": 2.6535672183544214, - "learning_rate": 1.7820027663427918e-06, - "loss": 3.9543, - "mean_token_accuracy": 0.3135080635547638, - "step": 17110 - }, - { - "epoch": 0.826793518943327, - "grad_norm": 2.786276944579817, - "learning_rate": 1.772408838483307e-06, - "loss": 3.9449, - "mean_token_accuracy": 0.31219757795333863, - "step": 17120 - }, - { - "epoch": 0.8272764590828967, - "grad_norm": 2.6499136566142107, - "learning_rate": 1.7628382943410882e-06, - "loss": 3.9191, - "mean_token_accuracy": 0.3180443525314331, - "step": 17130 - }, - { - "epoch": 0.8277593992224663, - "grad_norm": 2.5238124344804853, - "learning_rate": 1.753291161116677e-06, - "loss": 3.9758, - "mean_token_accuracy": 0.31703629195690153, - "step": 17140 - }, - { - "epoch": 0.8282423393620361, - "grad_norm": 2.4099653479491594, - "learning_rate": 1.7437674659440663e-06, - "loss": 3.8711, - "mean_token_accuracy": 0.3224798426032066, - "step": 17150 - }, - { - "epoch": 0.8287252795016058, - "grad_norm": 2.5444956940139303, - "learning_rate": 1.7342672358906487e-06, - "loss": 3.882, - "mean_token_accuracy": 0.3190524160861969, - "step": 17160 - }, - { - "epoch": 0.8292082196411755, - "grad_norm": 2.524613062472469, - "learning_rate": 1.7247904979571184e-06, - "loss": 3.8875, - "mean_token_accuracy": 0.32076612710952757, - "step": 17170 - }, - { - "epoch": 0.8296911597807451, - "grad_norm": 2.5925078281534244, - "learning_rate": 1.7153372790774092e-06, - "loss": 3.9406, - "mean_token_accuracy": 0.31905242055654526, - "step": 17180 - }, - { - "epoch": 0.8301740999203149, - "grad_norm": 2.4780013122019193, - "learning_rate": 1.7059076061186008e-06, - "loss": 3.8977, - "mean_token_accuracy": 0.32116935700178145, - "step": 17190 - }, - { - "epoch": 0.8306570400598846, - "grad_norm": 2.574221523746746, - "learning_rate": 1.696501505880863e-06, - "loss": 3.9051, - "mean_token_accuracy": 0.32671370804309846, - "step": 17200 - }, - { - "epoch": 0.8306570400598846, - "eval_runtime": 7.7964, - "eval_samples_per_second": 378.891, - "eval_steps_per_second": 23.729, - "step": 17200 - }, - { - "epoch": 0.8311399801994542, - "grad_norm": 2.7117294620270824, - "learning_rate": 1.687119005097363e-06, - "loss": 3.9316, - "mean_token_accuracy": 0.3150201618671417, - "step": 17210 - }, - { - "epoch": 0.831622920339024, - "grad_norm": 2.4010674996449586, - "learning_rate": 1.6777601304342016e-06, - "loss": 3.9254, - "mean_token_accuracy": 0.321673384308815, - "step": 17220 - }, - { - "epoch": 0.8321058604785937, - "grad_norm": 2.4882554764900915, - "learning_rate": 1.6684249084903225e-06, - "loss": 3.9473, - "mean_token_accuracy": 0.3157258048653603, - "step": 17230 - }, - { - "epoch": 0.8325888006181634, - "grad_norm": 2.4635578182484803, - "learning_rate": 1.6591133657974557e-06, - "loss": 3.8875, - "mean_token_accuracy": 0.32046370655298234, - "step": 17240 - }, - { - "epoch": 0.833071740757733, - "grad_norm": 2.57611782415298, - "learning_rate": 1.6498255288200248e-06, - "loss": 3.9301, - "mean_token_accuracy": 0.31350806802511216, - "step": 17250 - }, - { - "epoch": 0.8335546808973028, - "grad_norm": 2.6001614071423464, - "learning_rate": 1.640561423955086e-06, - "loss": 3.875, - "mean_token_accuracy": 0.3285282239317894, - "step": 17260 - }, - { - "epoch": 0.8340376210368725, - "grad_norm": 2.584795999520936, - "learning_rate": 1.6313210775322375e-06, - "loss": 3.882, - "mean_token_accuracy": 0.32520161122083663, - "step": 17270 - }, - { - "epoch": 0.8345205611764421, - "grad_norm": 2.580457378985107, - "learning_rate": 1.6221045158135606e-06, - "loss": 3.9844, - "mean_token_accuracy": 0.31834677159786223, - "step": 17280 - }, - { - "epoch": 0.8350035013160119, - "grad_norm": 2.5138319706121486, - "learning_rate": 1.6129117649935378e-06, - "loss": 3.923, - "mean_token_accuracy": 0.31743952035903933, - "step": 17290 - }, - { - "epoch": 0.8354864414555816, - "grad_norm": 2.3423205888685645, - "learning_rate": 1.6037428511989705e-06, - "loss": 3.9813, - "mean_token_accuracy": 0.31441532671451566, - "step": 17300 - }, - { - "epoch": 0.8354864414555816, - "eval_runtime": 7.8065, - "eval_samples_per_second": 378.401, - "eval_steps_per_second": 23.698, - "step": 17300 - }, - { - "epoch": 0.8359693815951513, - "grad_norm": 2.5436061994772556, - "learning_rate": 1.5945978004889218e-06, - "loss": 3.9258, - "mean_token_accuracy": 0.318548384308815, - "step": 17310 - }, - { - "epoch": 0.836452321734721, - "grad_norm": 2.741691426941505, - "learning_rate": 1.5854766388546284e-06, - "loss": 3.9387, - "mean_token_accuracy": 0.315120966732502, - "step": 17320 - }, - { - "epoch": 0.8369352618742907, - "grad_norm": 2.686510441187782, - "learning_rate": 1.576379392219437e-06, - "loss": 3.941, - "mean_token_accuracy": 0.31542338579893114, - "step": 17330 - }, - { - "epoch": 0.8374182020138604, - "grad_norm": 2.4551701738351253, - "learning_rate": 1.5673060864387146e-06, - "loss": 3.8379, - "mean_token_accuracy": 0.33397177010774615, - "step": 17340 - }, - { - "epoch": 0.83790114215343, - "grad_norm": 2.6245231178723167, - "learning_rate": 1.5582567472997966e-06, - "loss": 3.8723, - "mean_token_accuracy": 0.3239919349551201, - "step": 17350 - }, - { - "epoch": 0.8383840822929998, - "grad_norm": 2.592285760307571, - "learning_rate": 1.5492314005218967e-06, - "loss": 3.9418, - "mean_token_accuracy": 0.3153225839138031, - "step": 17360 - }, - { - "epoch": 0.8388670224325695, - "grad_norm": 2.486787451717872, - "learning_rate": 1.5402300717560436e-06, - "loss": 3.8746, - "mean_token_accuracy": 0.32590726017951965, - "step": 17370 - }, - { - "epoch": 0.8393499625721392, - "grad_norm": 2.5401410402396567, - "learning_rate": 1.531252786584998e-06, - "loss": 3.9711, - "mean_token_accuracy": 0.3130040317773819, - "step": 17380 - }, - { - "epoch": 0.8398329027117089, - "grad_norm": 2.550211841189419, - "learning_rate": 1.5222995705231912e-06, - "loss": 3.9887, - "mean_token_accuracy": 0.3118951618671417, - "step": 17390 - }, - { - "epoch": 0.8403158428512786, - "grad_norm": 2.503064967082286, - "learning_rate": 1.5133704490166502e-06, - "loss": 3.9324, - "mean_token_accuracy": 0.32066532373428347, - "step": 17400 - }, - { - "epoch": 0.8403158428512786, - "eval_runtime": 7.8477, - "eval_samples_per_second": 376.417, - "eval_steps_per_second": 23.574, - "step": 17400 - }, - { - "epoch": 0.8407987829908483, - "grad_norm": 2.472698593343891, - "learning_rate": 1.504465447442911e-06, - "loss": 3.8543, - "mean_token_accuracy": 0.3254032239317894, - "step": 17410 - }, - { - "epoch": 0.8412817231304179, - "grad_norm": 2.4917730540245064, - "learning_rate": 1.4955845911109713e-06, - "loss": 3.9602, - "mean_token_accuracy": 0.3097782269120216, - "step": 17420 - }, - { - "epoch": 0.8417646632699877, - "grad_norm": 2.5462475749657068, - "learning_rate": 1.486727905261196e-06, - "loss": 3.9543, - "mean_token_accuracy": 0.3137096792459488, - "step": 17430 - }, - { - "epoch": 0.8422476034095574, - "grad_norm": 2.5142472951474737, - "learning_rate": 1.4778954150652646e-06, - "loss": 3.9203, - "mean_token_accuracy": 0.3162298396229744, - "step": 17440 - }, - { - "epoch": 0.8427305435491271, - "grad_norm": 2.5390040650290473, - "learning_rate": 1.4690871456260758e-06, - "loss": 3.9176, - "mean_token_accuracy": 0.31582661271095275, - "step": 17450 - }, - { - "epoch": 0.8432134836886968, - "grad_norm": 2.585182230094246, - "learning_rate": 1.460303121977703e-06, - "loss": 3.8492, - "mean_token_accuracy": 0.3227822586894035, - "step": 17460 - }, - { - "epoch": 0.8436964238282665, - "grad_norm": 2.4832673111880776, - "learning_rate": 1.4515433690853054e-06, - "loss": 3.9402, - "mean_token_accuracy": 0.3161290347576141, - "step": 17470 - }, - { - "epoch": 0.8441793639678362, - "grad_norm": 2.6491581535177966, - "learning_rate": 1.4428079118450634e-06, - "loss": 3.9246, - "mean_token_accuracy": 0.3138104841113091, - "step": 17480 - }, - { - "epoch": 0.8446623041074058, - "grad_norm": 2.6871477807734956, - "learning_rate": 1.4340967750841006e-06, - "loss": 3.9051, - "mean_token_accuracy": 0.3205645173788071, - "step": 17490 - }, - { - "epoch": 0.8451452442469756, - "grad_norm": 2.6754167986754487, - "learning_rate": 1.4254099835604284e-06, - "loss": 4.0109, - "mean_token_accuracy": 0.3129032254219055, - "step": 17500 - }, - { - "epoch": 0.8451452442469756, - "eval_runtime": 8.0421, - "eval_samples_per_second": 367.319, - "eval_steps_per_second": 23.004, - "step": 17500 - }, - { - "epoch": 0.8456281843865453, - "grad_norm": 2.5573298098627895, - "learning_rate": 1.41674756196286e-06, - "loss": 3.8926, - "mean_token_accuracy": 0.3280241936445236, - "step": 17510 - }, - { - "epoch": 0.846111124526115, - "grad_norm": 2.443762118092534, - "learning_rate": 1.408109534910952e-06, - "loss": 3.923, - "mean_token_accuracy": 0.3142137095332146, - "step": 17520 - }, - { - "epoch": 0.8465940646656847, - "grad_norm": 2.512827526384991, - "learning_rate": 1.399495926954919e-06, - "loss": 3.9184, - "mean_token_accuracy": 0.3180443584918976, - "step": 17530 - }, - { - "epoch": 0.8470770048052544, - "grad_norm": 2.492337351741429, - "learning_rate": 1.3909067625755856e-06, - "loss": 3.8785, - "mean_token_accuracy": 0.32893145084381104, - "step": 17540 - }, - { - "epoch": 0.8475599449448241, - "grad_norm": 2.595506866876267, - "learning_rate": 1.382342066184299e-06, - "loss": 3.9031, - "mean_token_accuracy": 0.3213709704577923, - "step": 17550 - }, - { - "epoch": 0.8480428850843937, - "grad_norm": 2.6035833153749395, - "learning_rate": 1.3738018621228643e-06, - "loss": 3.9512, - "mean_token_accuracy": 0.31381049007177353, - "step": 17560 - }, - { - "epoch": 0.8485258252239635, - "grad_norm": 2.7177819832696555, - "learning_rate": 1.3652861746634817e-06, - "loss": 3.9652, - "mean_token_accuracy": 0.31159274131059644, - "step": 17570 - }, - { - "epoch": 0.8490087653635332, - "grad_norm": 2.474075091366069, - "learning_rate": 1.3567950280086673e-06, - "loss": 3.9066, - "mean_token_accuracy": 0.3237903222441673, - "step": 17580 - }, - { - "epoch": 0.8494917055031029, - "grad_norm": 2.5098231785926752, - "learning_rate": 1.348328446291195e-06, - "loss": 3.8488, - "mean_token_accuracy": 0.3262096747756004, - "step": 17590 - }, - { - "epoch": 0.8499746456426726, - "grad_norm": 2.478735544185196, - "learning_rate": 1.3398864535740164e-06, - "loss": 3.9203, - "mean_token_accuracy": 0.3163306474685669, - "step": 17600 - }, - { - "epoch": 0.8499746456426726, - "eval_runtime": 7.7963, - "eval_samples_per_second": 378.896, - "eval_steps_per_second": 23.729, - "step": 17600 - }, - { - "epoch": 0.8504575857822423, - "grad_norm": 2.411240439693436, - "learning_rate": 1.3314690738502024e-06, - "loss": 3.9797, - "mean_token_accuracy": 0.3172379031777382, - "step": 17610 - }, - { - "epoch": 0.850940525921812, - "grad_norm": 2.6229706648875752, - "learning_rate": 1.3230763310428695e-06, - "loss": 3.9004, - "mean_token_accuracy": 0.3230846717953682, - "step": 17620 - }, - { - "epoch": 0.8514234660613816, - "grad_norm": 2.5796257342013367, - "learning_rate": 1.3147082490051189e-06, - "loss": 3.868, - "mean_token_accuracy": 0.322076615691185, - "step": 17630 - }, - { - "epoch": 0.8519064062009514, - "grad_norm": 2.5354239984706286, - "learning_rate": 1.3063648515199522e-06, - "loss": 3.9684, - "mean_token_accuracy": 0.3152217730879784, - "step": 17640 - }, - { - "epoch": 0.8523893463405211, - "grad_norm": 2.716743495005877, - "learning_rate": 1.2980461623002226e-06, - "loss": 3.8937, - "mean_token_accuracy": 0.318951615691185, - "step": 17650 - }, - { - "epoch": 0.8528722864800908, - "grad_norm": 2.5504605263514093, - "learning_rate": 1.2897522049885603e-06, - "loss": 3.8477, - "mean_token_accuracy": 0.32671370804309846, - "step": 17660 - }, - { - "epoch": 0.8533552266196605, - "grad_norm": 2.629616945834561, - "learning_rate": 1.2814830031573023e-06, - "loss": 3.852, - "mean_token_accuracy": 0.3342741921544075, - "step": 17670 - }, - { - "epoch": 0.8538381667592302, - "grad_norm": 2.477653300619644, - "learning_rate": 1.2732385803084235e-06, - "loss": 3.9539, - "mean_token_accuracy": 0.31804435700178146, - "step": 17680 - }, - { - "epoch": 0.8543211068987999, - "grad_norm": 2.663303737764512, - "learning_rate": 1.265018959873483e-06, - "loss": 3.9129, - "mean_token_accuracy": 0.31874999701976775, - "step": 17690 - }, - { - "epoch": 0.8548040470383695, - "grad_norm": 2.42834151952541, - "learning_rate": 1.256824165213545e-06, - "loss": 3.8531, - "mean_token_accuracy": 0.32620967626571656, - "step": 17700 - }, - { - "epoch": 0.8548040470383695, - "eval_runtime": 7.8267, - "eval_samples_per_second": 377.427, - "eval_steps_per_second": 23.637, - "step": 17700 - }, - { - "epoch": 0.8552869871779393, - "grad_norm": 2.545214620236476, - "learning_rate": 1.2486542196191121e-06, - "loss": 3.9281, - "mean_token_accuracy": 0.31491935402154925, - "step": 17710 - }, - { - "epoch": 0.855769927317509, - "grad_norm": 2.6035612715879766, - "learning_rate": 1.2405091463100672e-06, - "loss": 3.902, - "mean_token_accuracy": 0.3232862874865532, - "step": 17720 - }, - { - "epoch": 0.8562528674570787, - "grad_norm": 2.659506168088208, - "learning_rate": 1.232388968435605e-06, - "loss": 3.8754, - "mean_token_accuracy": 0.3211693525314331, - "step": 17730 - }, - { - "epoch": 0.8567358075966484, - "grad_norm": 2.4774107184479877, - "learning_rate": 1.2242937090741624e-06, - "loss": 3.8656, - "mean_token_accuracy": 0.32137097120285035, - "step": 17740 - }, - { - "epoch": 0.8572187477362181, - "grad_norm": 2.520561799141872, - "learning_rate": 1.216223391233352e-06, - "loss": 3.8668, - "mean_token_accuracy": 0.3209677442908287, - "step": 17750 - }, - { - "epoch": 0.8577016878757878, - "grad_norm": 2.558043094812153, - "learning_rate": 1.2081780378499042e-06, - "loss": 3.9691, - "mean_token_accuracy": 0.3159274160861969, - "step": 17760 - }, - { - "epoch": 0.8581846280153574, - "grad_norm": 2.6220234175553068, - "learning_rate": 1.200157671789598e-06, - "loss": 3.8973, - "mean_token_accuracy": 0.31360886842012403, - "step": 17770 - }, - { - "epoch": 0.8586675681549272, - "grad_norm": 2.6051951492583125, - "learning_rate": 1.1921623158471951e-06, - "loss": 3.9566, - "mean_token_accuracy": 0.31481854766607287, - "step": 17780 - }, - { - "epoch": 0.8591505082944969, - "grad_norm": 2.6987052748218874, - "learning_rate": 1.184191992746372e-06, - "loss": 3.9297, - "mean_token_accuracy": 0.31542338579893114, - "step": 17790 - }, - { - "epoch": 0.8596334484340666, - "grad_norm": 2.478355542795563, - "learning_rate": 1.1762467251396614e-06, - "loss": 3.8891, - "mean_token_accuracy": 0.3191532239317894, - "step": 17800 - }, - { - "epoch": 0.8596334484340666, - "eval_runtime": 7.7942, - "eval_samples_per_second": 378.999, - "eval_steps_per_second": 23.736, - "step": 17800 - }, - { - "epoch": 0.8601163885736363, - "grad_norm": 2.5359644602852156, - "learning_rate": 1.1683265356083906e-06, - "loss": 3.9629, - "mean_token_accuracy": 0.31512096524238586, - "step": 17810 - }, - { - "epoch": 0.860599328713206, - "grad_norm": 2.3923709132791346, - "learning_rate": 1.1604314466626032e-06, - "loss": 3.9309, - "mean_token_accuracy": 0.3182459682226181, - "step": 17820 - }, - { - "epoch": 0.8610822688527757, - "grad_norm": 2.521639770013068, - "learning_rate": 1.152561480741009e-06, - "loss": 3.982, - "mean_token_accuracy": 0.3186491966247559, - "step": 17830 - }, - { - "epoch": 0.8615652089923453, - "grad_norm": 2.5079137372187725, - "learning_rate": 1.1447166602109162e-06, - "loss": 3.9332, - "mean_token_accuracy": 0.32096774131059647, - "step": 17840 - }, - { - "epoch": 0.8620481491319151, - "grad_norm": 2.3796392948960956, - "learning_rate": 1.1368970073681685e-06, - "loss": 3.9062, - "mean_token_accuracy": 0.3177419379353523, - "step": 17850 - }, - { - "epoch": 0.8625310892714848, - "grad_norm": 2.510990104182163, - "learning_rate": 1.129102544437074e-06, - "loss": 3.9914, - "mean_token_accuracy": 0.31209677308797834, - "step": 17860 - }, - { - "epoch": 0.8630140294110545, - "grad_norm": 2.4555247578595156, - "learning_rate": 1.1213332935703515e-06, - "loss": 3.9203, - "mean_token_accuracy": 0.31229838728904724, - "step": 17870 - }, - { - "epoch": 0.8634969695506242, - "grad_norm": 2.613419644383833, - "learning_rate": 1.1135892768490674e-06, - "loss": 3.8676, - "mean_token_accuracy": 0.3239919364452362, - "step": 17880 - }, - { - "epoch": 0.8639799096901939, - "grad_norm": 2.5738251401194954, - "learning_rate": 1.1058705162825677e-06, - "loss": 3.9164, - "mean_token_accuracy": 0.311794351041317, - "step": 17890 - }, - { - "epoch": 0.8644628498297636, - "grad_norm": 2.6197753140478475, - "learning_rate": 1.0981770338084152e-06, - "loss": 3.9641, - "mean_token_accuracy": 0.31804435402154924, - "step": 17900 - }, - { - "epoch": 0.8644628498297636, - "eval_runtime": 7.8205, - "eval_samples_per_second": 377.727, - "eval_steps_per_second": 23.656, - "step": 17900 - }, - { - "epoch": 0.8649457899693334, - "grad_norm": 2.5329537127157917, - "learning_rate": 1.0905088512923312e-06, - "loss": 3.9344, - "mean_token_accuracy": 0.32227822542190554, - "step": 17910 - }, - { - "epoch": 0.865428730108903, - "grad_norm": 2.5629187963373985, - "learning_rate": 1.0828659905281347e-06, - "loss": 3.8914, - "mean_token_accuracy": 0.3215725809335709, - "step": 17920 - }, - { - "epoch": 0.8659116702484727, - "grad_norm": 2.469573236383982, - "learning_rate": 1.0752484732376745e-06, - "loss": 3.8172, - "mean_token_accuracy": 0.3268145158886909, - "step": 17930 - }, - { - "epoch": 0.8663946103880424, - "grad_norm": 2.5116301701563466, - "learning_rate": 1.0676563210707658e-06, - "loss": 3.9629, - "mean_token_accuracy": 0.3157258048653603, - "step": 17940 - }, - { - "epoch": 0.8668775505276121, - "grad_norm": 2.5356796089718596, - "learning_rate": 1.0600895556051482e-06, - "loss": 3.9445, - "mean_token_accuracy": 0.3138104841113091, - "step": 17950 - }, - { - "epoch": 0.8673604906671818, - "grad_norm": 2.5825999856145465, - "learning_rate": 1.0525481983463937e-06, - "loss": 3.8207, - "mean_token_accuracy": 0.32570564895868304, - "step": 17960 - }, - { - "epoch": 0.8678434308067515, - "grad_norm": 2.6857045559811072, - "learning_rate": 1.045032270727866e-06, - "loss": 3.8469, - "mean_token_accuracy": 0.3246975809335709, - "step": 17970 - }, - { - "epoch": 0.8683263709463213, - "grad_norm": 2.497678004820524, - "learning_rate": 1.037541794110658e-06, - "loss": 3.9027, - "mean_token_accuracy": 0.3253024145960808, - "step": 17980 - }, - { - "epoch": 0.8688093110858909, - "grad_norm": 2.4785066488440477, - "learning_rate": 1.0300767897835284e-06, - "loss": 3.8891, - "mean_token_accuracy": 0.32933467626571655, - "step": 17990 - }, - { - "epoch": 0.8692922512254606, - "grad_norm": 2.466491452351288, - "learning_rate": 1.0226372789628392e-06, - "loss": 3.9184, - "mean_token_accuracy": 0.3203629061579704, - "step": 18000 - }, - { - "epoch": 0.8692922512254606, - "eval_runtime": 7.8099, - "eval_samples_per_second": 378.237, - "eval_steps_per_second": 23.688, - "step": 18000 - }, - { - "epoch": 0.8697751913650303, - "grad_norm": 2.6259082682652184, - "learning_rate": 1.0152232827924936e-06, - "loss": 4.0012, - "mean_token_accuracy": 0.3131048396229744, - "step": 18010 - }, - { - "epoch": 0.8702581315046, - "grad_norm": 2.782506770476591, - "learning_rate": 1.007834822343884e-06, - "loss": 3.9152, - "mean_token_accuracy": 0.3187499985098839, - "step": 18020 - }, - { - "epoch": 0.8707410716441697, - "grad_norm": 2.442090461501015, - "learning_rate": 1.0004719186158262e-06, - "loss": 3.9586, - "mean_token_accuracy": 0.3195564538240433, - "step": 18030 - }, - { - "epoch": 0.8712240117837394, - "grad_norm": 2.60213599038238, - "learning_rate": 9.931345925345038e-07, - "loss": 3.8836, - "mean_token_accuracy": 0.3230846792459488, - "step": 18040 - }, - { - "epoch": 0.8717069519233092, - "grad_norm": 2.6158348626524375, - "learning_rate": 9.858228649533975e-07, - "loss": 3.8551, - "mean_token_accuracy": 0.32207661420106887, - "step": 18050 - }, - { - "epoch": 0.8721898920628788, - "grad_norm": 2.6223838385893874, - "learning_rate": 9.785367566532434e-07, - "loss": 3.9312, - "mean_token_accuracy": 0.32258064597845076, - "step": 18060 - }, - { - "epoch": 0.8726728322024485, - "grad_norm": 2.379448928779309, - "learning_rate": 9.712762883419591e-07, - "loss": 3.8617, - "mean_token_accuracy": 0.32217741906642916, - "step": 18070 - }, - { - "epoch": 0.8731557723420182, - "grad_norm": 2.5482583997864547, - "learning_rate": 9.64041480654596e-07, - "loss": 3.968, - "mean_token_accuracy": 0.312197582423687, - "step": 18080 - }, - { - "epoch": 0.8736387124815879, - "grad_norm": 2.5431610039802774, - "learning_rate": 9.568323541532686e-07, - "loss": 3.9152, - "mean_token_accuracy": 0.3186491936445236, - "step": 18090 - }, - { - "epoch": 0.8741216526211576, - "grad_norm": 2.528358798814819, - "learning_rate": 9.49648929327105e-07, - "loss": 3.9676, - "mean_token_accuracy": 0.31340725868940356, - "step": 18100 - }, - { - "epoch": 0.8741216526211576, - "eval_runtime": 7.8035, - "eval_samples_per_second": 378.546, - "eval_steps_per_second": 23.707, - "step": 18100 - }, - { - "epoch": 0.8746045927607273, - "grad_norm": 2.5167682618470213, - "learning_rate": 9.424912265921915e-07, - "loss": 4.0215, - "mean_token_accuracy": 0.30443548411130905, - "step": 18110 - }, - { - "epoch": 0.8750875329002971, - "grad_norm": 2.5407010875098934, - "learning_rate": 9.353592662915e-07, - "loss": 3.9145, - "mean_token_accuracy": 0.32812499850988386, - "step": 18120 - }, - { - "epoch": 0.8755704730398667, - "grad_norm": 2.5827157374986025, - "learning_rate": 9.282530686948477e-07, - "loss": 3.9234, - "mean_token_accuracy": 0.31915322244167327, - "step": 18130 - }, - { - "epoch": 0.8760534131794364, - "grad_norm": 2.4776944374572167, - "learning_rate": 9.211726539988264e-07, - "loss": 3.8676, - "mean_token_accuracy": 0.3299395188689232, - "step": 18140 - }, - { - "epoch": 0.8765363533190061, - "grad_norm": 2.4300257563844583, - "learning_rate": 9.141180423267548e-07, - "loss": 3.8188, - "mean_token_accuracy": 0.33286290168762206, - "step": 18150 - }, - { - "epoch": 0.8770192934585758, - "grad_norm": 2.6459751674727943, - "learning_rate": 9.070892537286103e-07, - "loss": 3.9262, - "mean_token_accuracy": 0.31885080635547636, - "step": 18160 - }, - { - "epoch": 0.8775022335981455, - "grad_norm": 2.386959199458145, - "learning_rate": 9.000863081809841e-07, - "loss": 3.9355, - "mean_token_accuracy": 0.32268145233392714, - "step": 18170 - }, - { - "epoch": 0.8779851737377152, - "grad_norm": 2.518960737670266, - "learning_rate": 8.931092255870133e-07, - "loss": 3.8563, - "mean_token_accuracy": 0.32489919662475586, - "step": 18180 - }, - { - "epoch": 0.878468113877285, - "grad_norm": 2.556639602148005, - "learning_rate": 8.861580257763381e-07, - "loss": 3.9148, - "mean_token_accuracy": 0.3240927428007126, - "step": 18190 - }, - { - "epoch": 0.8789510540168546, - "grad_norm": 2.497428041319776, - "learning_rate": 8.792327285050229e-07, - "loss": 3.9277, - "mean_token_accuracy": 0.321068549156189, - "step": 18200 - }, - { - "epoch": 0.8789510540168546, - "eval_runtime": 7.8119, - "eval_samples_per_second": 378.141, - "eval_steps_per_second": 23.682, - "step": 18200 - }, - { - "epoch": 0.8794339941564243, - "grad_norm": 2.8076426527684197, - "learning_rate": 8.723333534555323e-07, - "loss": 3.8789, - "mean_token_accuracy": 0.3240927442908287, - "step": 18210 - }, - { - "epoch": 0.879916934295994, - "grad_norm": 2.5320265775086233, - "learning_rate": 8.654599202366431e-07, - "loss": 3.9105, - "mean_token_accuracy": 0.31925403103232386, - "step": 18220 - }, - { - "epoch": 0.8803998744355637, - "grad_norm": 2.436542761284518, - "learning_rate": 8.586124483834047e-07, - "loss": 3.8496, - "mean_token_accuracy": 0.32157258242368697, - "step": 18230 - }, - { - "epoch": 0.8808828145751334, - "grad_norm": 2.5187499498107475, - "learning_rate": 8.51790957357086e-07, - "loss": 3.9852, - "mean_token_accuracy": 0.3115927383303642, - "step": 18240 - }, - { - "epoch": 0.8813657547147031, - "grad_norm": 2.5069267484758146, - "learning_rate": 8.449954665451133e-07, - "loss": 3.9414, - "mean_token_accuracy": 0.3117943570017815, - "step": 18250 - }, - { - "epoch": 0.8818486948542729, - "grad_norm": 2.5843232539545644, - "learning_rate": 8.382259952610195e-07, - "loss": 3.9438, - "mean_token_accuracy": 0.3169354870915413, - "step": 18260 - }, - { - "epoch": 0.8823316349938425, - "grad_norm": 2.3856609815925465, - "learning_rate": 8.314825627443801e-07, - "loss": 3.8996, - "mean_token_accuracy": 0.3181451618671417, - "step": 18270 - }, - { - "epoch": 0.8828145751334122, - "grad_norm": 2.6416341033003, - "learning_rate": 8.247651881607755e-07, - "loss": 3.9152, - "mean_token_accuracy": 0.3177419364452362, - "step": 18280 - }, - { - "epoch": 0.883297515272982, - "grad_norm": 2.6640006965890657, - "learning_rate": 8.180738906017182e-07, - "loss": 3.8773, - "mean_token_accuracy": 0.3157258078455925, - "step": 18290 - }, - { - "epoch": 0.8837804554125516, - "grad_norm": 2.5647265541526534, - "learning_rate": 8.114086890846151e-07, - "loss": 3.9621, - "mean_token_accuracy": 0.31784274280071256, - "step": 18300 - }, - { - "epoch": 0.8837804554125516, - "eval_runtime": 7.8238, - "eval_samples_per_second": 377.565, - "eval_steps_per_second": 23.646, - "step": 18300 - }, - { - "epoch": 0.8842633955521213, - "grad_norm": 2.4336383373323858, - "learning_rate": 8.04769602552693e-07, - "loss": 3.891, - "mean_token_accuracy": 0.3234879031777382, - "step": 18310 - }, - { - "epoch": 0.884746335691691, - "grad_norm": 2.5498922039186795, - "learning_rate": 7.981566498749737e-07, - "loss": 3.9012, - "mean_token_accuracy": 0.32258064299821854, - "step": 18320 - }, - { - "epoch": 0.8852292758312608, - "grad_norm": 2.6650794974443217, - "learning_rate": 7.915698498461877e-07, - "loss": 3.8953, - "mean_token_accuracy": 0.3231854811310768, - "step": 18330 - }, - { - "epoch": 0.8857122159708304, - "grad_norm": 2.5875047933394644, - "learning_rate": 7.850092211867477e-07, - "loss": 3.9746, - "mean_token_accuracy": 0.3092741951346397, - "step": 18340 - }, - { - "epoch": 0.8861951561104001, - "grad_norm": 2.6501223033817336, - "learning_rate": 7.784747825426764e-07, - "loss": 3.916, - "mean_token_accuracy": 0.32167338877916335, - "step": 18350 - }, - { - "epoch": 0.8866780962499698, - "grad_norm": 2.4253460384541614, - "learning_rate": 7.71966552485569e-07, - "loss": 3.9891, - "mean_token_accuracy": 0.3151209682226181, - "step": 18360 - }, - { - "epoch": 0.8871610363895395, - "grad_norm": 2.7215885550299, - "learning_rate": 7.654845495125318e-07, - "loss": 3.9012, - "mean_token_accuracy": 0.3282258078455925, - "step": 18370 - }, - { - "epoch": 0.8876439765291092, - "grad_norm": 2.540776778450628, - "learning_rate": 7.590287920461225e-07, - "loss": 3.9289, - "mean_token_accuracy": 0.318951615691185, - "step": 18380 - }, - { - "epoch": 0.8881269166686789, - "grad_norm": 2.506801425692361, - "learning_rate": 7.525992984343178e-07, - "loss": 3.957, - "mean_token_accuracy": 0.3157258048653603, - "step": 18390 - }, - { - "epoch": 0.8886098568082487, - "grad_norm": 2.514043619789244, - "learning_rate": 7.461960869504414e-07, - "loss": 3.9254, - "mean_token_accuracy": 0.3174395188689232, - "step": 18400 - }, - { - "epoch": 0.8886098568082487, - "eval_runtime": 7.7965, - "eval_samples_per_second": 378.888, - "eval_steps_per_second": 23.729, - "step": 18400 - }, - { - "epoch": 0.8890927969478183, - "grad_norm": 2.449564853084911, - "learning_rate": 7.398191757931262e-07, - "loss": 3.9117, - "mean_token_accuracy": 0.325, - "step": 18410 - }, - { - "epoch": 0.889575737087388, - "grad_norm": 2.372405229356179, - "learning_rate": 7.334685830862509e-07, - "loss": 3.9176, - "mean_token_accuracy": 0.3192540302872658, - "step": 18420 - }, - { - "epoch": 0.8900586772269578, - "grad_norm": 2.5881527126395363, - "learning_rate": 7.271443268788981e-07, - "loss": 3.9031, - "mean_token_accuracy": 0.323991933465004, - "step": 18430 - }, - { - "epoch": 0.8905416173665274, - "grad_norm": 2.5115084852944047, - "learning_rate": 7.208464251452984e-07, - "loss": 3.9301, - "mean_token_accuracy": 0.31885080933570864, - "step": 18440 - }, - { - "epoch": 0.8910245575060971, - "grad_norm": 2.645389897017754, - "learning_rate": 7.145748957847809e-07, - "loss": 3.9539, - "mean_token_accuracy": 0.3170362874865532, - "step": 18450 - }, - { - "epoch": 0.8915074976456668, - "grad_norm": 2.5105823782977335, - "learning_rate": 7.083297566217163e-07, - "loss": 3.918, - "mean_token_accuracy": 0.3204637080430984, - "step": 18460 - }, - { - "epoch": 0.8919904377852366, - "grad_norm": 2.762280756865794, - "learning_rate": 7.02111025405482e-07, - "loss": 3.9438, - "mean_token_accuracy": 0.31895161271095274, - "step": 18470 - }, - { - "epoch": 0.8924733779248062, - "grad_norm": 2.6178122291116552, - "learning_rate": 6.959187198103901e-07, - "loss": 3.8445, - "mean_token_accuracy": 0.32600805908441544, - "step": 18480 - }, - { - "epoch": 0.8929563180643759, - "grad_norm": 2.675038901268123, - "learning_rate": 6.897528574356549e-07, - "loss": 3.9227, - "mean_token_accuracy": 0.3201612919569016, - "step": 18490 - }, - { - "epoch": 0.8934392582039457, - "grad_norm": 2.424588133790083, - "learning_rate": 6.836134558053331e-07, - "loss": 3.9367, - "mean_token_accuracy": 0.31300402879714967, - "step": 18500 - }, - { - "epoch": 0.8934392582039457, - "eval_runtime": 7.8421, - "eval_samples_per_second": 376.684, - "eval_steps_per_second": 23.591, - "step": 18500 - }, - { - "epoch": 0.8939221983435153, - "grad_norm": 2.5838555063615942, - "learning_rate": 6.775005323682782e-07, - "loss": 3.9551, - "mean_token_accuracy": 0.31431451588869097, - "step": 18510 - }, - { - "epoch": 0.894405138483085, - "grad_norm": 2.4958291171493356, - "learning_rate": 6.714141044980915e-07, - "loss": 3.8871, - "mean_token_accuracy": 0.32177419364452364, - "step": 18520 - }, - { - "epoch": 0.8948880786226547, - "grad_norm": 2.472250140068182, - "learning_rate": 6.65354189493066e-07, - "loss": 3.9043, - "mean_token_accuracy": 0.3266129046678543, - "step": 18530 - }, - { - "epoch": 0.8953710187622245, - "grad_norm": 2.563549558316242, - "learning_rate": 6.593208045761468e-07, - "loss": 3.8895, - "mean_token_accuracy": 0.3219758033752441, - "step": 18540 - }, - { - "epoch": 0.8958539589017941, - "grad_norm": 2.563429011031045, - "learning_rate": 6.533139668948762e-07, - "loss": 3.9164, - "mean_token_accuracy": 0.31854838877916336, - "step": 18550 - }, - { - "epoch": 0.8963368990413638, - "grad_norm": 2.5335162983722888, - "learning_rate": 6.473336935213481e-07, - "loss": 3.8594, - "mean_token_accuracy": 0.3230846792459488, - "step": 18560 - }, - { - "epoch": 0.8968198391809336, - "grad_norm": 2.4202524603323643, - "learning_rate": 6.413800014521521e-07, - "loss": 3.8156, - "mean_token_accuracy": 0.32479838877916334, - "step": 18570 - }, - { - "epoch": 0.8973027793205032, - "grad_norm": 2.500669415268221, - "learning_rate": 6.354529076083383e-07, - "loss": 3.9023, - "mean_token_accuracy": 0.31572580635547637, - "step": 18580 - }, - { - "epoch": 0.8977857194600729, - "grad_norm": 2.689120553609442, - "learning_rate": 6.295524288353561e-07, - "loss": 3.8383, - "mean_token_accuracy": 0.33094758093357085, - "step": 18590 - }, - { - "epoch": 0.8982686595996426, - "grad_norm": 2.515004864037933, - "learning_rate": 6.236785819030155e-07, - "loss": 3.9242, - "mean_token_accuracy": 0.32116935700178145, - "step": 18600 - }, - { - "epoch": 0.8982686595996426, - "eval_runtime": 7.8046, - "eval_samples_per_second": 378.497, - "eval_steps_per_second": 23.704, - "step": 18600 - }, - { - "epoch": 0.8987515997392124, - "grad_norm": 2.436796793368009, - "learning_rate": 6.178313835054295e-07, - "loss": 3.9789, - "mean_token_accuracy": 0.3178427442908287, - "step": 18610 - }, - { - "epoch": 0.899234539878782, - "grad_norm": 2.741369435917347, - "learning_rate": 6.120108502609845e-07, - "loss": 3.8687, - "mean_token_accuracy": 0.32439515739679337, - "step": 18620 - }, - { - "epoch": 0.8997174800183517, - "grad_norm": 2.5219506860062633, - "learning_rate": 6.062169987122724e-07, - "loss": 3.8992, - "mean_token_accuracy": 0.3222782269120216, - "step": 18630 - }, - { - "epoch": 0.9002004201579215, - "grad_norm": 2.7278126051330323, - "learning_rate": 6.004498453260532e-07, - "loss": 3.8949, - "mean_token_accuracy": 0.3157258078455925, - "step": 18640 - }, - { - "epoch": 0.9006833602974911, - "grad_norm": 2.6415796007207444, - "learning_rate": 5.947094064932113e-07, - "loss": 3.9766, - "mean_token_accuracy": 0.3200604811310768, - "step": 18650 - }, - { - "epoch": 0.9011663004370608, - "grad_norm": 2.4746303635471545, - "learning_rate": 5.889956985287049e-07, - "loss": 3.8977, - "mean_token_accuracy": 0.3229838728904724, - "step": 18660 - }, - { - "epoch": 0.9016492405766305, - "grad_norm": 2.7078937827890077, - "learning_rate": 5.833087376715185e-07, - "loss": 3.891, - "mean_token_accuracy": 0.3146169394254684, - "step": 18670 - }, - { - "epoch": 0.9021321807162003, - "grad_norm": 2.381055839751177, - "learning_rate": 5.776485400846177e-07, - "loss": 3.9328, - "mean_token_accuracy": 0.3219758063554764, - "step": 18680 - }, - { - "epoch": 0.9026151208557699, - "grad_norm": 2.444002346736162, - "learning_rate": 5.720151218549097e-07, - "loss": 3.8453, - "mean_token_accuracy": 0.3275201618671417, - "step": 18690 - }, - { - "epoch": 0.9030980609953396, - "grad_norm": 2.72492144695871, - "learning_rate": 5.664084989931829e-07, - "loss": 3.9266, - "mean_token_accuracy": 0.3153225839138031, - "step": 18700 - }, - { - "epoch": 0.9030980609953396, - "eval_runtime": 7.8284, - "eval_samples_per_second": 377.346, - "eval_steps_per_second": 23.632, - "step": 18700 - }, - { - "epoch": 0.9035810011349094, - "grad_norm": 2.6411950085154, - "learning_rate": 5.608286874340774e-07, - "loss": 3.9137, - "mean_token_accuracy": 0.3228830635547638, - "step": 18710 - }, - { - "epoch": 0.904063941274479, - "grad_norm": 2.611648832232795, - "learning_rate": 5.552757030360279e-07, - "loss": 3.9133, - "mean_token_accuracy": 0.3105846792459488, - "step": 18720 - }, - { - "epoch": 0.9045468814140487, - "grad_norm": 2.5337979197329634, - "learning_rate": 5.497495615812298e-07, - "loss": 3.8461, - "mean_token_accuracy": 0.3324596777558327, - "step": 18730 - }, - { - "epoch": 0.9050298215536184, - "grad_norm": 2.4826600829370196, - "learning_rate": 5.442502787755788e-07, - "loss": 3.9656, - "mean_token_accuracy": 0.3231854856014252, - "step": 18740 - }, - { - "epoch": 0.9055127616931882, - "grad_norm": 2.5017046847777564, - "learning_rate": 5.387778702486457e-07, - "loss": 3.9, - "mean_token_accuracy": 0.3176411300897598, - "step": 18750 - }, - { - "epoch": 0.9059957018327578, - "grad_norm": 2.4882045700749456, - "learning_rate": 5.333323515536104e-07, - "loss": 3.9445, - "mean_token_accuracy": 0.316129033267498, - "step": 18760 - }, - { - "epoch": 0.9064786419723275, - "grad_norm": 2.549440181267693, - "learning_rate": 5.279137381672395e-07, - "loss": 3.927, - "mean_token_accuracy": 0.31360886842012403, - "step": 18770 - }, - { - "epoch": 0.9069615821118973, - "grad_norm": 2.5172067210049796, - "learning_rate": 5.22522045489825e-07, - "loss": 3.8648, - "mean_token_accuracy": 0.3209677428007126, - "step": 18780 - }, - { - "epoch": 0.9074445222514669, - "grad_norm": 2.7296527838127735, - "learning_rate": 5.171572888451482e-07, - "loss": 3.8293, - "mean_token_accuracy": 0.32973790615797044, - "step": 18790 - }, - { - "epoch": 0.9079274623910366, - "grad_norm": 2.4374331740842092, - "learning_rate": 5.118194834804391e-07, - "loss": 3.9012, - "mean_token_accuracy": 0.32137097269296644, - "step": 18800 - }, - { - "epoch": 0.9079274623910366, - "eval_runtime": 7.7904, - "eval_samples_per_second": 379.186, - "eval_steps_per_second": 23.747, - "step": 18800 - }, - { - "epoch": 0.9084104025306063, - "grad_norm": 2.4773505001982508, - "learning_rate": 5.065086445663248e-07, - "loss": 3.8863, - "mean_token_accuracy": 0.32620967626571656, - "step": 18810 - }, - { - "epoch": 0.9088933426701761, - "grad_norm": 2.4670580941703752, - "learning_rate": 5.012247871967945e-07, - "loss": 3.85, - "mean_token_accuracy": 0.324798384308815, - "step": 18820 - }, - { - "epoch": 0.9093762828097457, - "grad_norm": 2.684457024602209, - "learning_rate": 4.959679263891471e-07, - "loss": 3.9586, - "mean_token_accuracy": 0.3166330635547638, - "step": 18830 - }, - { - "epoch": 0.9098592229493154, - "grad_norm": 2.488340701069753, - "learning_rate": 4.90738077083962e-07, - "loss": 3.9383, - "mean_token_accuracy": 0.318245966732502, - "step": 18840 - }, - { - "epoch": 0.9103421630888852, - "grad_norm": 2.5644966863299303, - "learning_rate": 4.85535254145042e-07, - "loss": 3.9203, - "mean_token_accuracy": 0.31169354617595674, - "step": 18850 - }, - { - "epoch": 0.9108251032284548, - "grad_norm": 2.5045931433189375, - "learning_rate": 4.80359472359384e-07, - "loss": 3.9109, - "mean_token_accuracy": 0.3205645173788071, - "step": 18860 - }, - { - "epoch": 0.9113080433680245, - "grad_norm": 2.690638461618957, - "learning_rate": 4.7521074643712473e-07, - "loss": 3.8918, - "mean_token_accuracy": 0.3225806415081024, - "step": 18870 - }, - { - "epoch": 0.9117909835075942, - "grad_norm": 2.587146130473699, - "learning_rate": 4.700890910115119e-07, - "loss": 3.9676, - "mean_token_accuracy": 0.31552419513463975, - "step": 18880 - }, - { - "epoch": 0.912273923647164, - "grad_norm": 2.485785065387238, - "learning_rate": 4.6499452063885064e-07, - "loss": 3.9277, - "mean_token_accuracy": 0.3143145129084587, - "step": 18890 - }, - { - "epoch": 0.9127568637867336, - "grad_norm": 2.6035654302146107, - "learning_rate": 4.599270497984676e-07, - "loss": 3.8852, - "mean_token_accuracy": 0.32338709831237794, - "step": 18900 - }, - { - "epoch": 0.9127568637867336, - "eval_runtime": 7.8079, - "eval_samples_per_second": 378.337, - "eval_steps_per_second": 23.694, - "step": 18900 - }, - { - "epoch": 0.9132398039263033, - "grad_norm": 2.6154486306538867, - "learning_rate": 4.548866928926732e-07, - "loss": 3.8309, - "mean_token_accuracy": 0.3325604870915413, - "step": 18910 - }, - { - "epoch": 0.9137227440658731, - "grad_norm": 2.6372359708400093, - "learning_rate": 4.498734642467151e-07, - "loss": 3.9453, - "mean_token_accuracy": 0.32308468222618103, - "step": 18920 - }, - { - "epoch": 0.9142056842054427, - "grad_norm": 2.7690705503226543, - "learning_rate": 4.4488737810874037e-07, - "loss": 3.893, - "mean_token_accuracy": 0.321370966732502, - "step": 18930 - }, - { - "epoch": 0.9146886243450124, - "grad_norm": 2.5262319425991637, - "learning_rate": 4.3992844864974905e-07, - "loss": 3.8953, - "mean_token_accuracy": 0.32600806653499603, - "step": 18940 - }, - { - "epoch": 0.9151715644845821, - "grad_norm": 2.537055947158476, - "learning_rate": 4.3499668996356824e-07, - "loss": 3.8543, - "mean_token_accuracy": 0.32278225719928744, - "step": 18950 - }, - { - "epoch": 0.9156545046241519, - "grad_norm": 2.7171126783994657, - "learning_rate": 4.300921160667937e-07, - "loss": 3.9355, - "mean_token_accuracy": 0.3167338714003563, - "step": 18960 - }, - { - "epoch": 0.9161374447637215, - "grad_norm": 2.602038936275718, - "learning_rate": 4.2521474089876614e-07, - "loss": 3.8523, - "mean_token_accuracy": 0.3256048381328583, - "step": 18970 - }, - { - "epoch": 0.9166203849032912, - "grad_norm": 2.619720114968416, - "learning_rate": 4.20364578321516e-07, - "loss": 3.8934, - "mean_token_accuracy": 0.3287298396229744, - "step": 18980 - }, - { - "epoch": 0.917103325042861, - "grad_norm": 2.5304704087644145, - "learning_rate": 4.1554164211974447e-07, - "loss": 3.9309, - "mean_token_accuracy": 0.32721774131059644, - "step": 18990 - }, - { - "epoch": 0.9175862651824306, - "grad_norm": 2.698019022324581, - "learning_rate": 4.107459460007601e-07, - "loss": 3.8473, - "mean_token_accuracy": 0.3297379031777382, - "step": 19000 - }, - { - "epoch": 0.9175862651824306, - "eval_runtime": 7.8214, - "eval_samples_per_second": 377.683, - "eval_steps_per_second": 23.653, - "step": 19000 - }, - { - "epoch": 0.9180692053220003, - "grad_norm": 2.8081133944101726, - "learning_rate": 4.059775035944613e-07, - "loss": 3.9379, - "mean_token_accuracy": 0.3162298336625099, - "step": 19010 - }, - { - "epoch": 0.91855214546157, - "grad_norm": 2.505467578492602, - "learning_rate": 4.0123632845328167e-07, - "loss": 3.8598, - "mean_token_accuracy": 0.3239919364452362, - "step": 19020 - }, - { - "epoch": 0.9190350856011398, - "grad_norm": 2.5358846827134105, - "learning_rate": 3.965224340521645e-07, - "loss": 3.8695, - "mean_token_accuracy": 0.32641129195690155, - "step": 19030 - }, - { - "epoch": 0.9195180257407094, - "grad_norm": 2.5024701792955395, - "learning_rate": 3.918358337885153e-07, - "loss": 3.8687, - "mean_token_accuracy": 0.32268145233392714, - "step": 19040 - }, - { - "epoch": 0.9200009658802791, - "grad_norm": 2.5629357644973654, - "learning_rate": 3.871765409821615e-07, - "loss": 3.8844, - "mean_token_accuracy": 0.32227822840213777, - "step": 19050 - }, - { - "epoch": 0.9204839060198489, - "grad_norm": 2.6066892342153416, - "learning_rate": 3.8254456887533156e-07, - "loss": 3.8828, - "mean_token_accuracy": 0.3198588714003563, - "step": 19060 - }, - { - "epoch": 0.9209668461594186, - "grad_norm": 2.488094364056554, - "learning_rate": 3.779399306325937e-07, - "loss": 3.9316, - "mean_token_accuracy": 0.31784273833036425, - "step": 19070 - }, - { - "epoch": 0.9214497862989882, - "grad_norm": 2.7034090356412888, - "learning_rate": 3.7336263934083737e-07, - "loss": 3.8559, - "mean_token_accuracy": 0.3318548396229744, - "step": 19080 - }, - { - "epoch": 0.921932726438558, - "grad_norm": 2.62611611703325, - "learning_rate": 3.688127080092252e-07, - "loss": 3.8559, - "mean_token_accuracy": 0.3225806444883347, - "step": 19090 - }, - { - "epoch": 0.9224156665781277, - "grad_norm": 2.5928103916030656, - "learning_rate": 3.642901495691642e-07, - "loss": 3.8402, - "mean_token_accuracy": 0.3228830650448799, - "step": 19100 - }, - { - "epoch": 0.9224156665781277, - "eval_runtime": 7.8233, - "eval_samples_per_second": 377.588, - "eval_steps_per_second": 23.647, - "step": 19100 - }, - { - "epoch": 0.9228986067176973, - "grad_norm": 2.601053724006797, - "learning_rate": 3.5979497687426036e-07, - "loss": 3.9699, - "mean_token_accuracy": 0.3127016142010689, - "step": 19110 - }, - { - "epoch": 0.923381546857267, - "grad_norm": 2.6358294746115805, - "learning_rate": 3.553272027002885e-07, - "loss": 3.9188, - "mean_token_accuracy": 0.3264112904667854, - "step": 19120 - }, - { - "epoch": 0.9238644869968368, - "grad_norm": 2.770921342024172, - "learning_rate": 3.5088683974515146e-07, - "loss": 3.9043, - "mean_token_accuracy": 0.32701613157987597, - "step": 19130 - }, - { - "epoch": 0.9243474271364065, - "grad_norm": 2.614000634361156, - "learning_rate": 3.464739006288509e-07, - "loss": 3.934, - "mean_token_accuracy": 0.31250000149011614, - "step": 19140 - }, - { - "epoch": 0.9248303672759761, - "grad_norm": 2.4745174672234787, - "learning_rate": 3.4208839789344196e-07, - "loss": 3.9582, - "mean_token_accuracy": 0.31965726166963576, - "step": 19150 - }, - { - "epoch": 0.9253133074155458, - "grad_norm": 2.610724419573157, - "learning_rate": 3.377303440030066e-07, - "loss": 3.9254, - "mean_token_accuracy": 0.3127016112208366, - "step": 19160 - }, - { - "epoch": 0.9257962475551156, - "grad_norm": 2.619733190776688, - "learning_rate": 3.3339975134361157e-07, - "loss": 3.8824, - "mean_token_accuracy": 0.3195564478635788, - "step": 19170 - }, - { - "epoch": 0.9262791876946852, - "grad_norm": 2.783267576792982, - "learning_rate": 3.2909663222327583e-07, - "loss": 3.9414, - "mean_token_accuracy": 0.32368951886892317, - "step": 19180 - }, - { - "epoch": 0.9267621278342549, - "grad_norm": 2.493079950658083, - "learning_rate": 3.248209988719386e-07, - "loss": 3.8648, - "mean_token_accuracy": 0.3292338714003563, - "step": 19190 - }, - { - "epoch": 0.9272450679738247, - "grad_norm": 2.7837353515841983, - "learning_rate": 3.2057286344141515e-07, - "loss": 3.923, - "mean_token_accuracy": 0.31411290615797044, - "step": 19200 - }, - { - "epoch": 0.9272450679738247, - "eval_runtime": 7.828, - "eval_samples_per_second": 377.366, - "eval_steps_per_second": 23.633, - "step": 19200 - }, - { - "epoch": 0.9277280081133944, - "grad_norm": 2.522543368816119, - "learning_rate": 3.163522380053785e-07, - "loss": 3.9141, - "mean_token_accuracy": 0.3188508078455925, - "step": 19210 - }, - { - "epoch": 0.928210948252964, - "grad_norm": 2.4282524879481957, - "learning_rate": 3.1215913455930337e-07, - "loss": 3.8691, - "mean_token_accuracy": 0.32217742055654525, - "step": 19220 - }, - { - "epoch": 0.9286938883925338, - "grad_norm": 2.709508618019017, - "learning_rate": 3.0799356502045464e-07, - "loss": 3.9027, - "mean_token_accuracy": 0.3201612919569016, - "step": 19230 - }, - { - "epoch": 0.9291768285321035, - "grad_norm": 2.543570879721227, - "learning_rate": 3.0385554122783545e-07, - "loss": 3.9539, - "mean_token_accuracy": 0.32227822542190554, - "step": 19240 - }, - { - "epoch": 0.9296597686716731, - "grad_norm": 2.62279741090973, - "learning_rate": 2.9974507494216596e-07, - "loss": 3.952, - "mean_token_accuracy": 0.31522177457809447, - "step": 19250 - }, - { - "epoch": 0.9301427088112428, - "grad_norm": 2.528492635895777, - "learning_rate": 2.9566217784584016e-07, - "loss": 4.0156, - "mean_token_accuracy": 0.3119959682226181, - "step": 19260 - }, - { - "epoch": 0.9306256489508126, - "grad_norm": 2.9014657624963727, - "learning_rate": 2.916068615429013e-07, - "loss": 3.9234, - "mean_token_accuracy": 0.31633064448833464, - "step": 19270 - }, - { - "epoch": 0.9311085890903823, - "grad_norm": 2.403591101207481, - "learning_rate": 2.87579137559002e-07, - "loss": 3.9937, - "mean_token_accuracy": 0.31340725868940356, - "step": 19280 - }, - { - "epoch": 0.9315915292299519, - "grad_norm": 2.6196081554643693, - "learning_rate": 2.835790173413788e-07, - "loss": 3.925, - "mean_token_accuracy": 0.3213709697127342, - "step": 19290 - }, - { - "epoch": 0.9320744693695217, - "grad_norm": 2.724996916497667, - "learning_rate": 2.7960651225881097e-07, - "loss": 3.9227, - "mean_token_accuracy": 0.3177419379353523, - "step": 19300 - }, - { - "epoch": 0.9320744693695217, - "eval_runtime": 7.8211, - "eval_samples_per_second": 377.697, - "eval_steps_per_second": 23.654, - "step": 19300 - }, - { - "epoch": 0.9325574095090914, - "grad_norm": 2.659378795088702, - "learning_rate": 2.756616336015916e-07, - "loss": 3.9969, - "mean_token_accuracy": 0.312197582423687, - "step": 19310 - }, - { - "epoch": 0.933040349648661, - "grad_norm": 2.7999484834304678, - "learning_rate": 2.7174439258150444e-07, - "loss": 3.9086, - "mean_token_accuracy": 0.323387099802494, - "step": 19320 - }, - { - "epoch": 0.9335232897882307, - "grad_norm": 3.156615195210052, - "learning_rate": 2.678548003317727e-07, - "loss": 3.8504, - "mean_token_accuracy": 0.3265120968222618, - "step": 19330 - }, - { - "epoch": 0.9340062299278005, - "grad_norm": 2.5308370104062194, - "learning_rate": 2.6399286790704803e-07, - "loss": 3.9227, - "mean_token_accuracy": 0.3246975809335709, - "step": 19340 - }, - { - "epoch": 0.9344891700673702, - "grad_norm": 2.4821951746772486, - "learning_rate": 2.6015860628336386e-07, - "loss": 3.8996, - "mean_token_accuracy": 0.3173387095332146, - "step": 19350 - }, - { - "epoch": 0.9349721102069398, - "grad_norm": 2.5257136886558467, - "learning_rate": 2.563520263581165e-07, - "loss": 3.907, - "mean_token_accuracy": 0.31915322691202164, - "step": 19360 - }, - { - "epoch": 0.9354550503465096, - "grad_norm": 2.672617688757303, - "learning_rate": 2.5257313895001965e-07, - "loss": 3.9016, - "mean_token_accuracy": 0.3269153222441673, - "step": 19370 - }, - { - "epoch": 0.9359379904860793, - "grad_norm": 2.561879482730617, - "learning_rate": 2.488219547990889e-07, - "loss": 3.9656, - "mean_token_accuracy": 0.3177419349551201, - "step": 19380 - }, - { - "epoch": 0.9364209306256489, - "grad_norm": 2.6147173633418728, - "learning_rate": 2.4509848456659934e-07, - "loss": 3.877, - "mean_token_accuracy": 0.32116935700178145, - "step": 19390 - }, - { - "epoch": 0.9369038707652186, - "grad_norm": 2.8378090427501927, - "learning_rate": 2.414027388350648e-07, - "loss": 3.9926, - "mean_token_accuracy": 0.31532258093357085, - "step": 19400 - }, - { - "epoch": 0.9369038707652186, - "eval_runtime": 7.7969, - "eval_samples_per_second": 378.869, - "eval_steps_per_second": 23.727, - "step": 19400 - }, - { - "epoch": 0.9373868109047884, - "grad_norm": 2.5140470070157894, - "learning_rate": 2.3773472810819874e-07, - "loss": 3.9332, - "mean_token_accuracy": 0.31502016335725785, - "step": 19410 - }, - { - "epoch": 0.9378697510443581, - "grad_norm": 2.593453842238223, - "learning_rate": 2.3409446281088988e-07, - "loss": 3.9145, - "mean_token_accuracy": 0.3166330650448799, - "step": 19420 - }, - { - "epoch": 0.9383526911839277, - "grad_norm": 2.461249695321873, - "learning_rate": 2.3048195328917223e-07, - "loss": 3.8922, - "mean_token_accuracy": 0.31441532373428344, - "step": 19430 - }, - { - "epoch": 0.9388356313234975, - "grad_norm": 2.4725622446835183, - "learning_rate": 2.2689720981019513e-07, - "loss": 3.9277, - "mean_token_accuracy": 0.3158266097307205, - "step": 19440 - }, - { - "epoch": 0.9393185714630672, - "grad_norm": 2.574068805859843, - "learning_rate": 2.2334024256219333e-07, - "loss": 3.816, - "mean_token_accuracy": 0.32711693793535235, - "step": 19450 - }, - { - "epoch": 0.9398015116026368, - "grad_norm": 2.560525805446135, - "learning_rate": 2.1981106165445465e-07, - "loss": 3.9574, - "mean_token_accuracy": 0.32116935551166537, - "step": 19460 - }, - { - "epoch": 0.9402844517422065, - "grad_norm": 2.5105187541810414, - "learning_rate": 2.1630967711730345e-07, - "loss": 3.934, - "mean_token_accuracy": 0.31633064448833464, - "step": 19470 - }, - { - "epoch": 0.9407673918817763, - "grad_norm": 2.4622664871562265, - "learning_rate": 2.1283609890205615e-07, - "loss": 3.8578, - "mean_token_accuracy": 0.3212701603770256, - "step": 19480 - }, - { - "epoch": 0.941250332021346, - "grad_norm": 2.5721867320714416, - "learning_rate": 2.0939033688100574e-07, - "loss": 3.9355, - "mean_token_accuracy": 0.3192540302872658, - "step": 19490 - }, - { - "epoch": 0.9417332721609156, - "grad_norm": 2.564343503918378, - "learning_rate": 2.059724008473818e-07, - "loss": 3.9234, - "mean_token_accuracy": 0.31411290019750593, - "step": 19500 - }, - { - "epoch": 0.9417332721609156, - "eval_runtime": 7.7798, - "eval_samples_per_second": 379.704, - "eval_steps_per_second": 23.78, - "step": 19500 - }, - { - "epoch": 0.9422162123004854, - "grad_norm": 2.6502418226088613, - "learning_rate": 2.0258230051533822e-07, - "loss": 3.9457, - "mean_token_accuracy": 0.31471773982048035, - "step": 19510 - }, - { - "epoch": 0.9426991524400551, - "grad_norm": 2.5163455055308597, - "learning_rate": 1.9922004551990891e-07, - "loss": 3.9641, - "mean_token_accuracy": 0.31905242055654526, - "step": 19520 - }, - { - "epoch": 0.9431820925796247, - "grad_norm": 2.4206413048422912, - "learning_rate": 1.958856454169944e-07, - "loss": 3.9332, - "mean_token_accuracy": 0.32137096524238584, - "step": 19530 - }, - { - "epoch": 0.9436650327191944, - "grad_norm": 2.6683290778560336, - "learning_rate": 1.9257910968332405e-07, - "loss": 3.9551, - "mean_token_accuracy": 0.31995967626571653, - "step": 19540 - }, - { - "epoch": 0.9441479728587642, - "grad_norm": 2.462647437956805, - "learning_rate": 1.893004477164373e-07, - "loss": 3.8719, - "mean_token_accuracy": 0.33014112561941145, - "step": 19550 - }, - { - "epoch": 0.9446309129983339, - "grad_norm": 2.5511477711346804, - "learning_rate": 1.8604966883464804e-07, - "loss": 3.8703, - "mean_token_accuracy": 0.31743951588869096, - "step": 19560 - }, - { - "epoch": 0.9451138531379035, - "grad_norm": 2.5828642009651, - "learning_rate": 1.828267822770302e-07, - "loss": 3.968, - "mean_token_accuracy": 0.31864919066429137, - "step": 19570 - }, - { - "epoch": 0.9455967932774733, - "grad_norm": 2.5446709674868075, - "learning_rate": 1.7963179720338008e-07, - "loss": 3.9684, - "mean_token_accuracy": 0.3165322571992874, - "step": 19580 - }, - { - "epoch": 0.946079733417043, - "grad_norm": 2.6205788486703927, - "learning_rate": 1.7646472269419401e-07, - "loss": 3.9262, - "mean_token_accuracy": 0.31784273982048034, - "step": 19590 - }, - { - "epoch": 0.9465626735566126, - "grad_norm": 2.701050907610122, - "learning_rate": 1.7332556775064845e-07, - "loss": 3.8879, - "mean_token_accuracy": 0.32268145233392714, - "step": 19600 - }, - { - "epoch": 0.9465626735566126, - "eval_runtime": 7.7806, - "eval_samples_per_second": 379.665, - "eval_steps_per_second": 23.777, - "step": 19600 - }, - { - "epoch": 0.9470456136961823, - "grad_norm": 2.4837191107950627, - "learning_rate": 1.7021434129456337e-07, - "loss": 3.8816, - "mean_token_accuracy": 0.3280241906642914, - "step": 19610 - }, - { - "epoch": 0.9475285538357521, - "grad_norm": 2.633241501741633, - "learning_rate": 1.6713105216838887e-07, - "loss": 3.923, - "mean_token_accuracy": 0.3229838743805885, - "step": 19620 - }, - { - "epoch": 0.9480114939753218, - "grad_norm": 2.555522130350023, - "learning_rate": 1.6407570913516967e-07, - "loss": 3.9082, - "mean_token_accuracy": 0.3270161300897598, - "step": 19630 - }, - { - "epoch": 0.9484944341148914, - "grad_norm": 2.6328896730196147, - "learning_rate": 1.6104832087852518e-07, - "loss": 3.8902, - "mean_token_accuracy": 0.3254032269120216, - "step": 19640 - }, - { - "epoch": 0.9489773742544612, - "grad_norm": 2.482082345541495, - "learning_rate": 1.5804889600262607e-07, - "loss": 3.882, - "mean_token_accuracy": 0.3225806444883347, - "step": 19650 - }, - { - "epoch": 0.9494603143940309, - "grad_norm": 2.685352604061562, - "learning_rate": 1.5507744303216777e-07, - "loss": 3.9352, - "mean_token_accuracy": 0.3145161300897598, - "step": 19660 - }, - { - "epoch": 0.9499432545336005, - "grad_norm": 2.5224640241860836, - "learning_rate": 1.521339704123448e-07, - "loss": 3.9273, - "mean_token_accuracy": 0.31985886842012407, - "step": 19670 - }, - { - "epoch": 0.9504261946731702, - "grad_norm": 2.505164318961583, - "learning_rate": 1.4921848650882976e-07, - "loss": 3.8691, - "mean_token_accuracy": 0.32822580337524415, - "step": 19680 - }, - { - "epoch": 0.95090913481274, - "grad_norm": 2.4491558120453267, - "learning_rate": 1.4633099960774777e-07, - "loss": 3.8699, - "mean_token_accuracy": 0.3232862904667854, - "step": 19690 - }, - { - "epoch": 0.9513920749523097, - "grad_norm": 2.67599030505745, - "learning_rate": 1.434715179156554e-07, - "loss": 3.9168, - "mean_token_accuracy": 0.31572581082582474, - "step": 19700 - }, - { - "epoch": 0.9513920749523097, - "eval_runtime": 7.7953, - "eval_samples_per_second": 378.946, - "eval_steps_per_second": 23.732, - "step": 19700 - }, - { - "epoch": 0.9518750150918793, - "grad_norm": 2.511813279636735, - "learning_rate": 1.4064004955951062e-07, - "loss": 3.9086, - "mean_token_accuracy": 0.3171370968222618, - "step": 19710 - }, - { - "epoch": 0.9523579552314491, - "grad_norm": 2.6119050450332453, - "learning_rate": 1.3783660258665733e-07, - "loss": 3.8668, - "mean_token_accuracy": 0.32016129046678543, - "step": 19720 - }, - { - "epoch": 0.9528408953710188, - "grad_norm": 2.7940854989777444, - "learning_rate": 1.3506118496480314e-07, - "loss": 4.0207, - "mean_token_accuracy": 0.31159274131059644, - "step": 19730 - }, - { - "epoch": 0.9533238355105884, - "grad_norm": 2.5316011212641656, - "learning_rate": 1.3231380458198605e-07, - "loss": 3.941, - "mean_token_accuracy": 0.32197580933570863, - "step": 19740 - }, - { - "epoch": 0.9538067756501581, - "grad_norm": 2.5339227212393194, - "learning_rate": 1.2959446924656448e-07, - "loss": 3.9297, - "mean_token_accuracy": 0.3189516142010689, - "step": 19750 - }, - { - "epoch": 0.9542897157897279, - "grad_norm": 2.617854696222853, - "learning_rate": 1.2690318668718726e-07, - "loss": 3.852, - "mean_token_accuracy": 0.31693548411130906, - "step": 19760 - }, - { - "epoch": 0.9547726559292976, - "grad_norm": 2.6508015106758016, - "learning_rate": 1.2423996455277477e-07, - "loss": 3.8594, - "mean_token_accuracy": 0.3305443570017815, - "step": 19770 - }, - { - "epoch": 0.9552555960688672, - "grad_norm": 2.4739673459249607, - "learning_rate": 1.2160481041249783e-07, - "loss": 3.8656, - "mean_token_accuracy": 0.3322580635547638, - "step": 19780 - }, - { - "epoch": 0.955738536208437, - "grad_norm": 2.7077860756096976, - "learning_rate": 1.1899773175575224e-07, - "loss": 3.8406, - "mean_token_accuracy": 0.32540322542190553, - "step": 19790 - }, - { - "epoch": 0.9562214763480067, - "grad_norm": 2.5397501129750637, - "learning_rate": 1.1641873599214204e-07, - "loss": 3.9133, - "mean_token_accuracy": 0.3172379031777382, - "step": 19800 - }, - { - "epoch": 0.9562214763480067, - "eval_runtime": 7.7919, - "eval_samples_per_second": 379.111, - "eval_steps_per_second": 23.743, - "step": 19800 - }, - { - "epoch": 0.9567044164875763, - "grad_norm": 2.6037360027328926, - "learning_rate": 1.1386783045145733e-07, - "loss": 3.95, - "mean_token_accuracy": 0.3125, - "step": 19810 - }, - { - "epoch": 0.957187356627146, - "grad_norm": 2.610495064130943, - "learning_rate": 1.1134502238365097e-07, - "loss": 3.9094, - "mean_token_accuracy": 0.31673387438058853, - "step": 19820 - }, - { - "epoch": 0.9576702967667158, - "grad_norm": 2.5182987998687927, - "learning_rate": 1.0885031895882081e-07, - "loss": 3.9223, - "mean_token_accuracy": 0.3259072557091713, - "step": 19830 - }, - { - "epoch": 0.9581532369062855, - "grad_norm": 2.6583038229965372, - "learning_rate": 1.0638372726718749e-07, - "loss": 3.8641, - "mean_token_accuracy": 0.3239577829837799, - "step": 19840 - }, - { - "epoch": 0.9586361770458551, - "grad_norm": 2.5819519431756266, - "learning_rate": 1.0394525431907443e-07, - "loss": 3.9074, - "mean_token_accuracy": 0.3214717760682106, - "step": 19850 - }, - { - "epoch": 0.9591191171854249, - "grad_norm": 2.815563560391342, - "learning_rate": 1.0153490704489233e-07, - "loss": 3.9242, - "mean_token_accuracy": 0.3147177442908287, - "step": 19860 - }, - { - "epoch": 0.9596020573249946, - "grad_norm": 2.5672375656625035, - "learning_rate": 9.915269229510805e-08, - "loss": 3.902, - "mean_token_accuracy": 0.3168346807360649, - "step": 19870 - }, - { - "epoch": 0.9600849974645642, - "grad_norm": 2.58054879091667, - "learning_rate": 9.679861684024239e-08, - "loss": 3.8988, - "mean_token_accuracy": 0.32056451588869095, - "step": 19880 - }, - { - "epoch": 0.960567937604134, - "grad_norm": 2.7407947990544574, - "learning_rate": 9.447268737083348e-08, - "loss": 3.9449, - "mean_token_accuracy": 0.3187499985098839, - "step": 19890 - }, - { - "epoch": 0.9610508777437037, - "grad_norm": 2.509597303413779, - "learning_rate": 9.217491049742789e-08, - "loss": 4.009, - "mean_token_accuracy": 0.32247983664274216, - "step": 19900 - }, - { - "epoch": 0.9610508777437037, - "eval_runtime": 7.8001, - "eval_samples_per_second": 378.711, - "eval_steps_per_second": 23.718, - "step": 19900 - }, - { - "epoch": 0.9615338178832734, - "grad_norm": 2.5509201174210783, - "learning_rate": 8.990529275056059e-08, - "loss": 3.9504, - "mean_token_accuracy": 0.32177419364452364, - "step": 19910 - }, - { - "epoch": 0.962016758022843, - "grad_norm": 2.809475091884003, - "learning_rate": 8.766384058073618e-08, - "loss": 3.9937, - "mean_token_accuracy": 0.31249999850988386, - "step": 19920 - }, - { - "epoch": 0.9624996981624128, - "grad_norm": 2.5517006156063413, - "learning_rate": 8.545056035840438e-08, - "loss": 3.9203, - "mean_token_accuracy": 0.3196572601795197, - "step": 19930 - }, - { - "epoch": 0.9629826383019825, - "grad_norm": 2.535404226207725, - "learning_rate": 8.326545837395228e-08, - "loss": 3.8465, - "mean_token_accuracy": 0.3216733902692795, - "step": 19940 - }, - { - "epoch": 0.9634655784415521, - "grad_norm": 2.7718206828584284, - "learning_rate": 8.110854083767883e-08, - "loss": 3.9039, - "mean_token_accuracy": 0.32268145084381106, - "step": 19950 - }, - { - "epoch": 0.9639485185811218, - "grad_norm": 2.5763078063963527, - "learning_rate": 7.89798138797826e-08, - "loss": 3.959, - "mean_token_accuracy": 0.3169354826211929, - "step": 19960 - }, - { - "epoch": 0.9644314587206916, - "grad_norm": 2.5595050443811984, - "learning_rate": 7.687928355033736e-08, - "loss": 3.9348, - "mean_token_accuracy": 0.324193549156189, - "step": 19970 - }, - { - "epoch": 0.9649143988602613, - "grad_norm": 2.68061210541419, - "learning_rate": 7.480695581927988e-08, - "loss": 3.8996, - "mean_token_accuracy": 0.3186491936445236, - "step": 19980 - }, - { - "epoch": 0.9653973389998309, - "grad_norm": 2.5632036100421622, - "learning_rate": 7.27628365763966e-08, - "loss": 3.9617, - "mean_token_accuracy": 0.32429435551166536, - "step": 19990 - }, - { - "epoch": 0.9658802791394007, - "grad_norm": 2.543367807101631, - "learning_rate": 7.074693163129476e-08, - "loss": 3.8383, - "mean_token_accuracy": 0.3331653192639351, - "step": 20000 - }, - { - "epoch": 0.9658802791394007, - "eval_runtime": 7.7868, - "eval_samples_per_second": 379.362, - "eval_steps_per_second": 23.758, - "step": 20000 - }, - { - "epoch": 0.9663632192789704, - "grad_norm": 2.6551108398559085, - "learning_rate": 6.875924671340018e-08, - "loss": 3.8672, - "mean_token_accuracy": 0.3329637095332146, - "step": 20010 - }, - { - "epoch": 0.96684615941854, - "grad_norm": 2.585869453538405, - "learning_rate": 6.679978747193061e-08, - "loss": 3.8949, - "mean_token_accuracy": 0.31774193346500396, - "step": 20020 - }, - { - "epoch": 0.9673290995581098, - "grad_norm": 2.6161156467423305, - "learning_rate": 6.486855947588467e-08, - "loss": 3.9406, - "mean_token_accuracy": 0.3146169319748878, - "step": 20030 - }, - { - "epoch": 0.9678120396976795, - "grad_norm": 2.733466732480941, - "learning_rate": 6.2965568214024e-08, - "loss": 3.8789, - "mean_token_accuracy": 0.3240927383303642, - "step": 20040 - }, - { - "epoch": 0.9682949798372492, - "grad_norm": 2.4510361763570763, - "learning_rate": 6.109081909485892e-08, - "loss": 3.884, - "mean_token_accuracy": 0.31975806653499605, - "step": 20050 - }, - { - "epoch": 0.9687779199768188, - "grad_norm": 2.5076379400532827, - "learning_rate": 5.924431744663173e-08, - "loss": 3.8844, - "mean_token_accuracy": 0.315625, - "step": 20060 - }, - { - "epoch": 0.9692608601163886, - "grad_norm": 2.5052522951514105, - "learning_rate": 5.7426068517303366e-08, - "loss": 3.9762, - "mean_token_accuracy": 0.319556450843811, - "step": 20070 - }, - { - "epoch": 0.9697438002559583, - "grad_norm": 2.6687676448832596, - "learning_rate": 5.563607747453681e-08, - "loss": 3.9125, - "mean_token_accuracy": 0.31995967775583267, - "step": 20080 - }, - { - "epoch": 0.9702267403955279, - "grad_norm": 2.649350649940303, - "learning_rate": 5.38743494056837e-08, - "loss": 3.9082, - "mean_token_accuracy": 0.3239919379353523, - "step": 20090 - }, - { - "epoch": 0.9707096805350977, - "grad_norm": 2.67673862199842, - "learning_rate": 5.214088931776662e-08, - "loss": 3.9301, - "mean_token_accuracy": 0.3192540317773819, - "step": 20100 - }, - { - "epoch": 0.9707096805350977, - "eval_runtime": 7.7901, - "eval_samples_per_second": 379.2, - "eval_steps_per_second": 23.748, - "step": 20100 - }, - { - "epoch": 0.9711926206746674, - "grad_norm": 2.6627407389383864, - "learning_rate": 5.0435702137472395e-08, - "loss": 3.9082, - "mean_token_accuracy": 0.3205645143985748, - "step": 20110 - }, - { - "epoch": 0.9716755608142371, - "grad_norm": 2.474276940486427, - "learning_rate": 4.875879271112771e-08, - "loss": 3.9426, - "mean_token_accuracy": 0.31673386991024016, - "step": 20120 - }, - { - "epoch": 0.9721585009538067, - "grad_norm": 2.746346971736028, - "learning_rate": 4.711016580469352e-08, - "loss": 3.925, - "mean_token_accuracy": 0.31592742055654527, - "step": 20130 - }, - { - "epoch": 0.9726414410933765, - "grad_norm": 2.4538207448738385, - "learning_rate": 4.548982610374952e-08, - "loss": 3.9438, - "mean_token_accuracy": 0.31532258689403536, - "step": 20140 - }, - { - "epoch": 0.9731243812329462, - "grad_norm": 2.5715442891879015, - "learning_rate": 4.389777821347862e-08, - "loss": 3.9867, - "mean_token_accuracy": 0.3145161300897598, - "step": 20150 - }, - { - "epoch": 0.9736073213725158, - "grad_norm": 2.5076641603491696, - "learning_rate": 4.2334026658655826e-08, - "loss": 3.8574, - "mean_token_accuracy": 0.32620967775583265, - "step": 20160 - }, - { - "epoch": 0.9740902615120856, - "grad_norm": 2.5807542888545534, - "learning_rate": 4.0798575883633784e-08, - "loss": 3.8766, - "mean_token_accuracy": 0.3172379031777382, - "step": 20170 - }, - { - "epoch": 0.9745732016516553, - "grad_norm": 2.5993565386051665, - "learning_rate": 3.929143025233395e-08, - "loss": 3.9453, - "mean_token_accuracy": 0.31824597120285036, - "step": 20180 - }, - { - "epoch": 0.975056141791225, - "grad_norm": 2.6279745872767406, - "learning_rate": 3.781259404822657e-08, - "loss": 3.8629, - "mean_token_accuracy": 0.32741935551166534, - "step": 20190 - }, - { - "epoch": 0.9755390819307946, - "grad_norm": 2.5852149492915517, - "learning_rate": 3.6362071474329574e-08, - "loss": 3.9547, - "mean_token_accuracy": 0.3145161345601082, - "step": 20200 - }, - { - "epoch": 0.9755390819307946, - "eval_runtime": 7.81, - "eval_samples_per_second": 378.233, - "eval_steps_per_second": 23.688, - "step": 20200 - }, - { - "epoch": 0.9760220220703644, - "grad_norm": 2.754839568962273, - "learning_rate": 3.4939866653186384e-08, - "loss": 3.9133, - "mean_token_accuracy": 0.3183467760682106, - "step": 20210 - }, - { - "epoch": 0.9765049622099341, - "grad_norm": 2.499466225306501, - "learning_rate": 3.354598362685923e-08, - "loss": 3.8711, - "mean_token_accuracy": 0.3232862904667854, - "step": 20220 - }, - { - "epoch": 0.9769879023495038, - "grad_norm": 2.6713748643221082, - "learning_rate": 3.218042635691587e-08, - "loss": 3.9184, - "mean_token_accuracy": 0.32106854766607285, - "step": 20230 - }, - { - "epoch": 0.9774708424890735, - "grad_norm": 2.326302461823539, - "learning_rate": 3.084319872442176e-08, - "loss": 3.9219, - "mean_token_accuracy": 0.31905242055654526, - "step": 20240 - }, - { - "epoch": 0.9779537826286432, - "grad_norm": 2.4955060925733896, - "learning_rate": 2.9534304529922343e-08, - "loss": 3.9242, - "mean_token_accuracy": 0.3198588714003563, - "step": 20250 - }, - { - "epoch": 0.9784367227682129, - "grad_norm": 2.6329725140740394, - "learning_rate": 2.8253747493439677e-08, - "loss": 3.9996, - "mean_token_accuracy": 0.3151209682226181, - "step": 20260 - }, - { - "epoch": 0.9789196629077825, - "grad_norm": 2.7774990851805583, - "learning_rate": 2.7001531254458036e-08, - "loss": 3.9141, - "mean_token_accuracy": 0.32288306802511213, - "step": 20270 - }, - { - "epoch": 0.9794026030473523, - "grad_norm": 2.7290209636423746, - "learning_rate": 2.577765937191279e-08, - "loss": 4.0113, - "mean_token_accuracy": 0.3159274160861969, - "step": 20280 - }, - { - "epoch": 0.979885543186922, - "grad_norm": 2.452912457697491, - "learning_rate": 2.458213532418263e-08, - "loss": 3.8773, - "mean_token_accuracy": 0.3323588684201241, - "step": 20290 - }, - { - "epoch": 0.9803684833264917, - "grad_norm": 2.7315987149614824, - "learning_rate": 2.3414962509077375e-08, - "loss": 3.9172, - "mean_token_accuracy": 0.32036290168762205, - "step": 20300 - }, - { - "epoch": 0.9803684833264917, - "eval_runtime": 7.7919, - "eval_samples_per_second": 379.11, - "eval_steps_per_second": 23.743, - "step": 20300 - }, - { - "epoch": 0.9808514234660614, - "grad_norm": 2.5893200792442834, - "learning_rate": 2.2276144243830177e-08, - "loss": 3.9375, - "mean_token_accuracy": 0.3176411300897598, - "step": 20310 - }, - { - "epoch": 0.9813343636056311, - "grad_norm": 2.5841617000924124, - "learning_rate": 2.116568376508865e-08, - "loss": 3.8, - "mean_token_accuracy": 0.3359879031777382, - "step": 20320 - }, - { - "epoch": 0.9818173037452008, - "grad_norm": 2.6170419293834932, - "learning_rate": 2.0083584228903775e-08, - "loss": 3.8832, - "mean_token_accuracy": 0.32762096971273424, - "step": 20330 - }, - { - "epoch": 0.9823002438847704, - "grad_norm": 2.526407636386915, - "learning_rate": 1.902984871071878e-08, - "loss": 3.9312, - "mean_token_accuracy": 0.3182459697127342, - "step": 20340 - }, - { - "epoch": 0.9827831840243402, - "grad_norm": 2.58792876005009, - "learning_rate": 1.8004480205368046e-08, - "loss": 3.891, - "mean_token_accuracy": 0.322177417576313, - "step": 20350 - }, - { - "epoch": 0.9832661241639099, - "grad_norm": 2.7570470226238784, - "learning_rate": 1.700748162705934e-08, - "loss": 3.8445, - "mean_token_accuracy": 0.32540322542190553, - "step": 20360 - }, - { - "epoch": 0.9837490643034796, - "grad_norm": 2.6759940011610506, - "learning_rate": 1.603885580937492e-08, - "loss": 3.9059, - "mean_token_accuracy": 0.32358870804309847, - "step": 20370 - }, - { - "epoch": 0.9842320044430493, - "grad_norm": 2.55852422932362, - "learning_rate": 1.509860550525266e-08, - "loss": 3.9617, - "mean_token_accuracy": 0.31633064672350886, - "step": 20380 - }, - { - "epoch": 0.984714944582619, - "grad_norm": 2.626404616788194, - "learning_rate": 1.4186733386989393e-08, - "loss": 3.9055, - "mean_token_accuracy": 0.3150201618671417, - "step": 20390 - }, - { - "epoch": 0.9851978847221887, - "grad_norm": 2.592649879971373, - "learning_rate": 1.330324204622424e-08, - "loss": 3.9805, - "mean_token_accuracy": 0.31542338877916337, - "step": 20400 - }, - { - "epoch": 0.9851978847221887, - "eval_runtime": 7.8227, - "eval_samples_per_second": 377.618, - "eval_steps_per_second": 23.649, - "step": 20400 - }, - { - "epoch": 0.9856808248617583, - "grad_norm": 2.7257211880867103, - "learning_rate": 1.2448133993938627e-08, - "loss": 3.8711, - "mean_token_accuracy": 0.3280241936445236, - "step": 20410 - }, - { - "epoch": 0.9861637650013281, - "grad_norm": 2.582473900644627, - "learning_rate": 1.1621411660440728e-08, - "loss": 3.8676, - "mean_token_accuracy": 0.328125, - "step": 20420 - }, - { - "epoch": 0.9866467051408978, - "grad_norm": 2.6080631228828404, - "learning_rate": 1.0823077395367698e-08, - "loss": 3.8563, - "mean_token_accuracy": 0.3216733828186989, - "step": 20430 - }, - { - "epoch": 0.9871296452804675, - "grad_norm": 2.7425221460032576, - "learning_rate": 1.0053133467673448e-08, - "loss": 3.9625, - "mean_token_accuracy": 0.31270160973072053, - "step": 20440 - }, - { - "epoch": 0.9876125854200372, - "grad_norm": 2.4726696547150446, - "learning_rate": 9.311582065623103e-09, - "loss": 3.891, - "mean_token_accuracy": 0.3218750014901161, - "step": 20450 - }, - { - "epoch": 0.9880955255596069, - "grad_norm": 2.4848261646937377, - "learning_rate": 8.598425296786339e-09, - "loss": 3.8535, - "mean_token_accuracy": 0.32731855362653733, - "step": 20460 - }, - { - "epoch": 0.9885784656991766, - "grad_norm": 2.6259157070343924, - "learning_rate": 7.913665188032938e-09, - "loss": 3.9988, - "mean_token_accuracy": 0.3130040317773819, - "step": 20470 - }, - { - "epoch": 0.9890614058387462, - "grad_norm": 2.5753441068460274, - "learning_rate": 7.2573036855272395e-09, - "loss": 3.9273, - "mean_token_accuracy": 0.31693548411130906, - "step": 20480 - }, - { - "epoch": 0.989544345978316, - "grad_norm": 2.532870554202357, - "learning_rate": 6.629342654720372e-09, - "loss": 3.9184, - "mean_token_accuracy": 0.3182459682226181, - "step": 20490 - }, - { - "epoch": 0.9900272861178857, - "grad_norm": 2.5959249282217107, - "learning_rate": 6.029783880345807e-09, - "loss": 3.8594, - "mean_token_accuracy": 0.3258064538240433, - "step": 20500 - }, - { - "epoch": 0.9900272861178857, - "eval_runtime": 7.7915, - "eval_samples_per_second": 379.13, - "eval_steps_per_second": 23.744, - "step": 20500 - }, - { - "epoch": 0.9905102262574554, - "grad_norm": 2.5909835121554745, - "learning_rate": 5.458629066416032e-09, - "loss": 3.898, - "mean_token_accuracy": 0.31935484111309054, - "step": 20510 - }, - { - "epoch": 0.9909931663970251, - "grad_norm": 2.551611851195527, - "learning_rate": 4.915879836216997e-09, - "loss": 3.9621, - "mean_token_accuracy": 0.31602822691202165, - "step": 20520 - }, - { - "epoch": 0.9914761065365948, - "grad_norm": 2.495172896407294, - "learning_rate": 4.4015377322981225e-09, - "loss": 3.9445, - "mean_token_accuracy": 0.31975806355476377, - "step": 20530 - }, - { - "epoch": 0.9919590466761645, - "grad_norm": 2.5717490943703707, - "learning_rate": 3.915604216480074e-09, - "loss": 3.9211, - "mean_token_accuracy": 0.3230846762657166, - "step": 20540 - }, - { - "epoch": 0.9924419868157341, - "grad_norm": 2.530971109670067, - "learning_rate": 3.458080669836994e-09, - "loss": 3.9359, - "mean_token_accuracy": 0.3204637125134468, - "step": 20550 - }, - { - "epoch": 0.9929249269553039, - "grad_norm": 2.6131099234680475, - "learning_rate": 3.0289683927009484e-09, - "loss": 3.9355, - "mean_token_accuracy": 0.3195564538240433, - "step": 20560 - }, - { - "epoch": 0.9934078670948736, - "grad_norm": 2.4800998635135802, - "learning_rate": 2.62826860465637e-09, - "loss": 3.9109, - "mean_token_accuracy": 0.3160282254219055, - "step": 20570 - }, - { - "epoch": 0.9938908072344433, - "grad_norm": 2.6540648843105674, - "learning_rate": 2.255982444536731e-09, - "loss": 3.9586, - "mean_token_accuracy": 0.31905242055654526, - "step": 20580 - }, - { - "epoch": 0.994373747374013, - "grad_norm": 2.727633761771066, - "learning_rate": 1.9121109704201e-09, - "loss": 3.8969, - "mean_token_accuracy": 0.31925403475761416, - "step": 20590 - }, - { - "epoch": 0.9948566875135827, - "grad_norm": 2.5216381608616687, - "learning_rate": 1.596655159625815e-09, - "loss": 3.9207, - "mean_token_accuracy": 0.31844758093357084, - "step": 20600 - }, - { - "epoch": 0.9948566875135827, - "eval_runtime": 7.7835, - "eval_samples_per_second": 379.521, - "eval_steps_per_second": 23.768, - "step": 20600 - }, - { - "epoch": 0.9953396276531524, - "grad_norm": 2.557566970702623, - "learning_rate": 1.3096159087155892e-09, - "loss": 3.9066, - "mean_token_accuracy": 0.3253024220466614, - "step": 20610 - }, - { - "epoch": 0.995822567792722, - "grad_norm": 2.46116757551048, - "learning_rate": 1.0509940334857417e-09, - "loss": 3.8754, - "mean_token_accuracy": 0.32429435551166536, - "step": 20620 - }, - { - "epoch": 0.9963055079322918, - "grad_norm": 2.479608506712216, - "learning_rate": 8.207902689671976e-10, - "loss": 3.8945, - "mean_token_accuracy": 0.32217741906642916, - "step": 20630 - }, - { - "epoch": 0.9967884480718615, - "grad_norm": 2.5148816980131254, - "learning_rate": 6.190052694254877e-10, - "loss": 3.918, - "mean_token_accuracy": 0.3203629031777382, - "step": 20640 - }, - { - "epoch": 0.9972713882114312, - "grad_norm": 2.6897319142489486, - "learning_rate": 4.4563960835519725e-10, - "loss": 3.9133, - "mean_token_accuracy": 0.3217741921544075, - "step": 20650 - }, - { - "epoch": 0.9977543283510009, - "grad_norm": 2.668387661313725, - "learning_rate": 3.0069377847996573e-10, - "loss": 3.9289, - "mean_token_accuracy": 0.3209677442908287, - "step": 20660 - }, - { - "epoch": 0.9982372684905706, - "grad_norm": 2.6502085298332934, - "learning_rate": 1.8416819175359756e-10, - "loss": 3.909, - "mean_token_accuracy": 0.3197580650448799, - "step": 20670 - }, - { - "epoch": 0.9987202086301403, - "grad_norm": 2.6602355596713547, - "learning_rate": 9.606317935229036e-11, - "loss": 3.8676, - "mean_token_accuracy": 0.324193549156189, - "step": 20680 - }, - { - "epoch": 0.99920314876971, - "grad_norm": 2.6061338286329367, - "learning_rate": 3.637899168240644e-11, - "loss": 3.9465, - "mean_token_accuracy": 0.31794354766607286, - "step": 20690 - }, - { - "epoch": 0.9996860889092797, - "grad_norm": 2.588158409191739, - "learning_rate": 5.115798370480818e-12, - "loss": 3.9426, - "mean_token_accuracy": 0.3201612919569016, - "step": 20700 - }, - { - "epoch": 0.9996860889092797, - "eval_runtime": 7.7842, - "eval_samples_per_second": 379.484, - "eval_steps_per_second": 23.766, - "step": 20700 - }, - { - "epoch": 0.9999758529930215, - "mean_token_accuracy": 0.32610887040694553, - "step": 20706, - "total_flos": 5419008396361728.0, - "train_loss": 4.076150745858688, - "train_runtime": 7573.4415, - "train_samples_per_second": 87.49, - "train_steps_per_second": 2.734 + "epoch": 0.9992277992277993, + "mean_token_accuracy": 0.3896016627550125, + "step": 647, + "total_flos": 5418484972388352.0, + "train_loss": 3.606253622488408, + "train_runtime": 424.9732, + "train_samples_per_second": 48.742, + "train_steps_per_second": 1.522 } ], "logging_steps": 10, - "max_steps": 20706, + "max_steps": 647, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, @@ -18045,7 +590,7 @@ "attributes": {} } }, - "total_flos": 5419008396361728.0, + "total_flos": 5418484972388352.0, "train_batch_size": 8, "trial_name": null, "trial_params": null